In [1]:
import codecs
import pandas as pd
import itertools as it

from pandas import DataFrame
import os

import spacy
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

import en_core_web_sm
nlp = spacy.load('en')

In [2]:
stars_filepath = 'review_stars_rest_subset.txt'
review_txt_filepath = 'review_text_rest_subset.txt'
business_filepath = 'review_business_rest_subset.txt'
user_filepath = 'review_user_rest_subset.txt'

In [3]:
def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')

In [4]:
def get_data(filepath,review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(filepath),
                          review_number, review_number+1))[0]

In [5]:
with open(stars_filepath) as f:
    stars = f.readlines()
# remove whitespace characters like `\n` at the end of each line
stars = [x.strip() for x in stars]

In [6]:
with open(review_txt_filepath) as f:
    texts = f.readlines()
# remove whitespace characters like `\n` at the end of each line
texts = [x.strip() for x in texts]

In [7]:
with open(business_filepath) as f:
    business = f.readlines()
# remove whitespace characters like `\n` at the end of each line
business = [x.strip() for x in business]

In [8]:
with open(user_filepath) as f:
    user = f.readlines()
# remove whitespace characters like `\n` at the end of each line
user = [x.strip() for x in user]

In [9]:
#test lengths, all should be the same
print(len(stars), len(texts), len(business), len(user))

1570963 1570963 1570963 1570963


In [10]:
bus_set = frozenset(business)
print(len(bus_set))

52810


In [11]:
user_set = frozenset(user)
print(len(user_set))

169373


In [12]:
test_set = frozenset(business[1:20])
print(test_set)

frozenset({'ZnxudK5ExgpfXs4bicS4IA', 'hjk3ox7w1akbEuOgTJ03Bw', 'Xy74meQwdTnloAAyRC-4cg', '0W4lkclzZThpx3V65bVgig', '4_GIJk0tX3k0x0FcUv4sNA', 'PFPUMF38-lraKzLcTiz5gQ', 'XWTPNfskXoUL-Lf32wSk0Q', 'a9aW5e731lplWGHUZ02-zQ', '5r6-G9C4YLbC7Ziz57l3rQ', '28adZ4lsuUeVB2aWzohK9g', 'Eox_Qq74oaFZ-YjthpHhBw', 'N93EYZy9R0sdlEvubu94ig', 'oWTn2IzrprsRkPfULtjZtQ', 'Aov96CM4FZAXeZvKtsStdA', 'zxJlg4XCHNoFy78WZPv89w', 'RtUvSWO_UZ8V3Wpj0n077w', 'z8oIoCT1cXz7gZP5GeU5OA', 'I8rveLd-dl81u6c8YqAxmw', 'zgQHtqX0gqMw1nlBZl2VnQ'})


Make collased business text file

In [13]:
all_bus_filepath = "collapsed_business_rest_subset.txt"

In [14]:
all_test_filepath = "collapsed_business_rest_subset_test.txt"

In [17]:
%%time
#function to loop through all business ids, collapse all reviews for that business into one string
if 1 == 0:
    with codecs.open(all_bus_filepath, 'w', encoding='utf_8') as f:
        for x in bus_set:
            review_index = [i for i,j in enumerate(business) if j == x]
            review_sub = [texts[ind] for ind in review_index]
            review_out = ''.join(map(str, review_sub))
            f.write(review_out + '\n')
            #print(x)
    

CPU times: user 1h 51min 35s, sys: 3.08 s, total: 1h 51min 38s
Wall time: 1h 51min 51s


In [18]:
collapsed_business = LineSentence(all_bus_filepath)

In [25]:
collapsed_business

<gensim.models.word2vec.LineSentence at 0x7fa49e78e4a8>

In [15]:
with open(all_bus_filepath) as f:
    all_bus = f.readlines()
# remove whitespace characters like `\n` at the end of each line
all_bus = [x.strip() for x in all_bus]

In [16]:
len(all_bus)

52810

In [17]:
with open(all_test_filepath) as f:
    test_bus = f.readlines()
# remove whitespace characters like `\n` at the end of each line
test_bus = [x.strip() for x in test_bus]

In [18]:
len(test_bus)

19

Make collapsed user text file

In [19]:
all_user_filepath = "collapsed_user_rest_subset.txt"

In [20]:
%%time
#function to loop through all business ids, collapse all reviews for that business into one string
if 1 == 0:
    with codecs.open(all_user_filepath, 'w', encoding='utf_8') as f:
        for x in user_set:
            review_index = [i for i,j in enumerate(user) if j == x]
            review_sub = [texts[ind] for ind in review_index]
            review_out = ''.join(map(str, review_sub))
            f.write(review_out + '\n')

CPU times: user 5h 44min 44s, sys: 5.56 s, total: 5h 44min 49s
Wall time: 5h 45min 12s


Setup and define function for NLP

In [20]:
#helper functions from modern nlp in python
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [21]:
def get_sample_review(review_txt_filepath, review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(review_txt_filepath),
                          review_number, review_number+1))[0]

In [22]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import _pickle as pickle

In [23]:
lda_model_filepath = 'lda_model_eat_30'
trigram_dictionary_filepath = 'trigram_dict_eat_30.dict'
trigram_model_filepath = 'trigram_model_all_eat_30'
bigram_model_filepath = 'bigram_model_all_eat_30'

In [24]:
lda = LdaMulticore.load(lda_model_filepath)
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)
trigram_model = Phrases.load(trigram_model_filepath)
bigram_model = Phrases.load(bigram_model_filepath)

In [25]:
def lda_description(review_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-proccessing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation
    """
    
    # parse the review text with spaCy
    parsed_review = nlp(review_text)
    
    # lemmatize the text and remove punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
    # apply the first-order and secord-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]
    
    # remove any remaining stopwords
    trigram_review = [term for term in trigram_review
                      if not term in spacy.lang.en.stop_words.STOP_WORDS]
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    # sort with the most highly related topics first
    review_lda = sorted(review_lda, key=lambda review_lda: -review_lda[1])
                       
    for topic_number, freq in review_lda:
#        if freq < min_topic_freq:
#            break
            
        # print the most highly related topic names and frequencies
        print('{:25} {}'.format(topic_names[topic_number],
                                round(freq, 50)))

In [26]:
topic_names = {0: u'chinese',
               1: u'thai',
               2: u'healthy',
               3: u'smell',
               4: u'japanese',
               5: u'toronto',
               6: u'service',
               7: u'experience',
               8: u'asian soup',
               9: u'grocery',
               10: u'parking',
               11: u'bar ambiance',
               12: u'uk',
               13: u'good service',
               14: u'fun ambiance',
               15: u'young',
               16: u'comfort food',
               17: u'greek',
               18: u'high end',
               19: u'hotwing',
               20: u'breakfast',
               21: u'sweet',
               22: u'wine & dine',
               23: u'pubs',
               24: u'good taste',
               25: u'na drinks',
               26: u'desserts',
               27: u'coffee shop',
               28: u'mexican',
               29: u'reviews',
               30: u'new york',
               31: u'general restaurant',
               32: u'beach',
               33: u'location',
               34: u'happy hour',
               35: u'amazing',
               36: u'vietnamese',
               37: u'time',
               38: u'vas legas',
               39: u'montreal',
               40: u'deli',
               41: u'buffet',
               42: u'bbq',
               43: u'french',
               44: u'money',
               45: u'street taco',
               46: u'pizza',
               47: u'airport & delivery',
               48: u'burger & fries',
               49: u'italian'}

In [36]:
#topic_names_filepath = 'topic_names_eats_30.pkl'

#with open(topic_names_filepath, 'wb') as f:
#    pickle.dump(topic_names, f)

In [27]:
sample_review = get_sample_review(all_test_filepath, 6)
#print(sample_review)

In [29]:
#lda_description(sample_review)

In [89]:
all_numbers = list(range(0,49))
df_topics = pd.DataFrame(columns =["topic_name"])

for topic_number in all_numbers:
    df_topics = df_topics.append({
     "topic_name": topic_names[topic_number]
      }, ignore_index=True)
#print(df_topics)

            topic_name
0              chinese
1                 thai
2              healthy
3                smell
4             japanese
5              toronto
6              service
7           experience
8           asian soup
9              grocery
10             parking
11        bar ambiance
12                  uk
13        good service
14        fun ambiance
15               young
16        comfort food
17               greek
18            high end
19             hotwing
20           breakfast
21               sweet
22         wine & dine
23                pubs
24          good taste
25           na drinks
26            desserts
27         coffee shop
28             mexican
29             reviews
30            new york
31  general restaurant
32               beach
33            location
34          happy hour
35             amazing
36          vietnamese
37                time
38           vas legas
39            montreal
40                deli
41              buffet
42         

In [94]:
df_full = pd.merge(df_topics, df, how='left', on=['topic_name'])
df_full = df_full.fillna(0)
#print(df_full)

            topic_name    freq
0              chinese  0.3742
1                 thai  0.0000
2              healthy  0.0000
3                smell  0.0115
4             japanese  0.0132
5              toronto  0.0000
6              service  0.0624
7           experience  0.0000
8           asian soup  0.0000
9              grocery  0.0156
10             parking  0.0169
11        bar ambiance  0.0000
12                  uk  0.0000
13        good service  0.0347
14        fun ambiance  0.0000
15               young  0.0000
16        comfort food  0.0000
17               greek  0.0336
18            high end  0.0000
19             hotwing  0.0000
20           breakfast  0.0000
21               sweet  0.0000
22         wine & dine  0.0000
23                pubs  0.0105
24          good taste  0.0124
25           na drinks  0.0143
26            desserts  0.0000
27         coffee shop  0.0197
28             mexican  0.0000
29             reviews  0.0616
30            new york  0.0000
31  gene

In [95]:
dfList = df['freq'].tolist()

In [63]:
 for topic_number, freq in review_lda:
#        if freq < min_topic_freq:
#            break
            
        # print the most highly related topic names and frequencies
    print('{:25} {}'.format(topic_names[topic_number],
                                round(freq, 4)))

chinese                   0.3741999864578247
vietnamese                0.07109999656677246
service                   0.06239999830722809
reviews                   0.06159999966621399
amazing                   0.04450000077486038
good service              0.034699998795986176
greek                     0.03359999880194664
time                      0.031099999323487282
general restaurant        0.02710000053048134
money                     0.02449999935925007
montreal                  0.0203000009059906
coffee shop               0.019700000062584877
airport & delivery        0.01810000091791153
vas legas                 0.017899999395012856
parking                   0.016899999231100082
grocery                   0.015599999576807022
na drinks                 0.014299999922513962
japanese                  0.013199999928474426
good taste                0.012400000356137753
smell                     0.011500000022351742
pubs                      0.010499999858438969


In [105]:
reviews_filepath = all_test_filepath

test_df = pd.DataFrame(columns=["topic_name", "freq", "bus"])

In [None]:
%%time
#function to loop through all businesses and convert review to 
if 1 == 1:
    for busi in list(range(1,len(all_bus))):
        # parse the review text with spaCy
        parsed_review = nlp(all_bus[busi])
    
        # lemmatize the text and remove punctuation and whitespace
        unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
        # apply the first-order and secord-order phrase models
        bigram_review = bigram_model[unigram_review]
        trigram_review = trigram_model[bigram_review]
    
        # remove any remaining stopwords
        trigram_review = [term for term in trigram_review
                      if not term in spacy.lang.en.stop_words.STOP_WORDS]
    
        # create a bag-of-words representation
        review_bow = trigram_dictionary.doc2bow(trigram_review)
    
        # create an LDA representation
        review_lda = lda[review_bow]
    
        # sort with the most highly related topics first
        review_lda = sorted(review_lda, key=lambda review_lda: -review_lda[1])
        
        #convert lda output to dataframe and get topic names
        df = pd.DataFrame(columns=["topic_name", "freq", "bus"])
        for topic_number, freq in review_lda:
            df = df.append({
             "topic_name": topic_names[topic_number],
             "freq":  round(freq, 4),
             "bus": ''.join([list(x) for x in bus_set][busi])
              }, ignore_index=True)
        
        #merge with complete topic list and replace na with zero
        df_full = pd.merge(df_topics, df, how='left', on=['topic_name'])
        df_full = df_full.fillna(0)
        
        test_df = pd.concat([test_df,df_full])
                       



In [98]:
all_bus[1]

"This place is so good. I work nearby and come here often for lunch.  Their menu is 7.49 for one veg & one meat & rice or noodles, 7.99 for two veg & one meat & rice or noodles, 8.49 for one veg & two meat & rice or noodles. They also offer salads (7.49 I believe).  The rice is amazing. They have a few different kinds, but I have only tried the Jambalayan kind. It's sort of spicy but SO delicious. It is, in fact, so good, I am not willing to try the other kinds. Instead of rice, you can also get noodles.  Their vegetables include home fries, corn, mixed veggies, zucchini, mashed potatoes, and a few others. I have tried the home fries, mixed veggies, and zucchini and they are all good, but I personally like the home fries the most.  Their meats change on a daily basis, but they normally have bourbon chicken, blackened chicken, honey glazed chicken, spicy beef, rainbow shrimp, a type of fish (I forget), and a few other chicken flavours. The bourbon chicken is amazing. Ask for extra sauce

In [101]:
bus_set

frozenset({'RynlRSJ7cXHe1ckIuxbMag',
           'DGwDXazeFcD7DByweszpFA',
           'XrpWb4vB7-1z26OreeCD2w',
           'p4YvOzp42g3JzlXJny8F0g',
           '7skIQa_9nrcDU1A5KPJwCQ',
           'hloS23Thw57IITw7_DLLOQ',
           'xeSE3ct_V9U3_ibcrpb3_g',
           '5SI7EJB7QaUjuh7OnFkVUQ',
           'rOPj68rUjK9tv84ShWaetg',
           'Lij0OcvPD4LAnmklHQpDCg',
           'zaotMgByfU9_RnxpTeqVeg',
           'gdWy-x5KbCbMrwIwI87mMg',
           'eOl6WS3oAjtGZpcXfv8tlA',
           'gchI4BdZ76teMX9BsI2H-w',
           'p1MZyPUMflxMARQ6I8X3Kw',
           'M4Bi2eyelIP9YQuwD7a3JA',
           'g03AjjsVjSxfBgzf04FrFg',
           'NKx2a3_PQ9c7Ts_A6JxKWQ',
           '8RJYiN6asLcEWWysZKEyVA',
           'DVJkxZkLKS76Khb2y6XA6A',
           'dvbcUnKv2awsIxog7dO4vw',
           'PoqEm7BKF_kf73Q1zORnMQ',
           'RruVXr3L6G5RplZ_PtDNuA',
           'arGo97Zk7YoHbFGKDFUFKg',
           'bqMEml5HIFSMo4CPfjlG0A',
           'UlI0TksGFiIXtcbtg2KalQ',
           'ZyGpQ9k1D0c4xd8k7I_RXw',
 

In [103]:
''.join([list(x) for x in bus_set][1])

'DGwDXazeFcD7DByweszpFA'

In [104]:
df = pd.DataFrame(review_lda)
df.columns = ['topic_number', 'freq']
df = df.assign(topic=topic_names[df['topic_number']])
print(df)

TypeError: 'Series' objects are mutable, thus they cannot be hashed