In [1]:
import codecs
import pandas as pd
import itertools as it

from pandas import DataFrame
import os

import spacy
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

import en_core_web_sm
nlp = spacy.load('en')

In [2]:
stars_filepath = 'review_stars_rest_subset.txt'
review_txt_filepath = 'review_text_rest_subset.txt'
business_filepath = 'review_business_rest_subset.txt'
user_filepath = 'review_user_rest_subset.txt'

In [3]:
def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')

In [4]:
def get_data(filepath,review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(filepath),
                          review_number, review_number+1))[0]

In [5]:
with open(stars_filepath) as f:
    stars = f.readlines()
# remove whitespace characters like `\n` at the end of each line
stars = [x.strip() for x in stars]

In [6]:
with open(review_txt_filepath) as f:
    texts = f.readlines()
# remove whitespace characters like `\n` at the end of each line
texts = [x.strip() for x in texts]

In [7]:
with open(business_filepath) as f:
    business = f.readlines()
# remove whitespace characters like `\n` at the end of each line
business = [x.strip() for x in business]

In [8]:
with open(user_filepath) as f:
    user = f.readlines()
# remove whitespace characters like `\n` at the end of each line
user = [x.strip() for x in user]

In [9]:
#test lengths, all should be the same
print(len(stars), len(texts), len(business), len(user))

1570963 1570963 1570963 1570963


In [10]:
bus_set = frozenset(business)
print(len(bus_set))

52810


In [11]:
user_set = frozenset(user)
print(len(user_set))

169373


In [13]:
test_set = frozenset(business[1:20])
print(test_set)

frozenset({'Eox_Qq74oaFZ-YjthpHhBw', 'Aov96CM4FZAXeZvKtsStdA', 'N93EYZy9R0sdlEvubu94ig', 'I8rveLd-dl81u6c8YqAxmw', 'ZnxudK5ExgpfXs4bicS4IA', 'hjk3ox7w1akbEuOgTJ03Bw', 'a9aW5e731lplWGHUZ02-zQ', 'zgQHtqX0gqMw1nlBZl2VnQ', 'RtUvSWO_UZ8V3Wpj0n077w', '5r6-G9C4YLbC7Ziz57l3rQ', 'z8oIoCT1cXz7gZP5GeU5OA', '0W4lkclzZThpx3V65bVgig', 'oWTn2IzrprsRkPfULtjZtQ', '4_GIJk0tX3k0x0FcUv4sNA', '28adZ4lsuUeVB2aWzohK9g', 'PFPUMF38-lraKzLcTiz5gQ', 'zxJlg4XCHNoFy78WZPv89w', 'Xy74meQwdTnloAAyRC-4cg', 'XWTPNfskXoUL-Lf32wSk0Q'})


Make collased business text file

In [14]:
all_bus_filepath = "collapsed_business_rest_subset.txt"

In [15]:
all_test_filepath = "collapsed_business_rest_subset_test.txt"

In [17]:
%%time
#function to loop through all business ids, collapse all reviews for that business into one string
if 1 == 0:
    with codecs.open(all_bus_filepath, 'w', encoding='utf_8') as f:
        for x in bus_set:
            review_index = [i for i,j in enumerate(business) if j == x]
            review_sub = [texts[ind] for ind in review_index]
            review_out = ''.join(map(str, review_sub))
            f.write(review_out + '\n')
            #print(x)
    

CPU times: user 1h 51min 35s, sys: 3.08 s, total: 1h 51min 38s
Wall time: 1h 51min 51s


In [18]:
collapsed_business = LineSentence(all_bus_filepath)

In [25]:
collapsed_business

<gensim.models.word2vec.LineSentence at 0x7fa49e78e4a8>

In [16]:
with open(all_bus_filepath) as f:
    all_bus = f.readlines()
# remove whitespace characters like `\n` at the end of each line
all_bus = [x.strip() for x in all_bus]

In [17]:
len(all_bus)

52810

In [18]:
with open(all_test_filepath) as f:
    test_bus = f.readlines()
# remove whitespace characters like `\n` at the end of each line
test_bus = [x.strip() for x in test_bus]

In [19]:
len(test_bus)

19

Make collapsed user text file

In [20]:
all_user_filepath = "collapsed_user_rest_subset.txt"

In [20]:
%%time
#function to loop through all business ids, collapse all reviews for that business into one string
if 1 == 0:
    with codecs.open(all_user_filepath, 'w', encoding='utf_8') as f:
        for x in user_set:
            review_index = [i for i,j in enumerate(user) if j == x]
            review_sub = [texts[ind] for ind in review_index]
            review_out = ''.join(map(str, review_sub))
            f.write(review_out + '\n')

CPU times: user 5h 44min 44s, sys: 5.56 s, total: 5h 44min 49s
Wall time: 5h 45min 12s


Setup and define function for NLP

In [21]:
#helper functions from modern nlp in python
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [22]:
def get_sample_review(review_txt_filepath, review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(review_txt_filepath),
                          review_number, review_number+1))[0]

In [23]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import _pickle as pickle

In [24]:
lda_model_filepath = 'lda_model_eat_30'
trigram_dictionary_filepath = 'trigram_dict_eat_30.dict'
trigram_model_filepath = 'trigram_model_all_eat_30'
bigram_model_filepath = 'bigram_model_all_eat_30'

In [25]:
lda = LdaMulticore.load(lda_model_filepath)
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)
trigram_model = Phrases.load(trigram_model_filepath)
bigram_model = Phrases.load(bigram_model_filepath)

In [26]:
topic_names = {0: u'chinese',
               1: u'thai',
               2: u'healthy',
               3: u'smell',
               4: u'japanese',
               5: u'toronto',
               6: u'service',
               7: u'experience',
               8: u'asian soup',
               9: u'grocery',
               10: u'parking',
               11: u'bar ambiance',
               12: u'uk',
               13: u'good service',
               14: u'fun ambiance',
               15: u'young',
               16: u'comfort food',
               17: u'greek',
               18: u'high end',
               19: u'hotwing',
               20: u'breakfast',
               21: u'sweet',
               22: u'wine & dine',
               23: u'pubs',
               24: u'good taste',
               25: u'na drinks',
               26: u'desserts',
               27: u'coffee shop',
               28: u'mexican',
               29: u'reviews',
               30: u'new york',
               31: u'general restaurant',
               32: u'beach',
               33: u'location',
               34: u'happy hour',
               35: u'amazing',
               36: u'vietnamese',
               37: u'time',
               38: u'vas legas',
               39: u'montreal',
               40: u'deli',
               41: u'buffet',
               42: u'bbq',
               43: u'french',
               44: u'money',
               45: u'street taco',
               46: u'pizza',
               47: u'airport & delivery',
               48: u'burger & fries',
               49: u'italian'}

In [36]:
#topic_names_filepath = 'topic_names_eats_30.pkl'

#with open(topic_names_filepath, 'wb') as f:
#    pickle.dump(topic_names, f)

In [27]:
sample_review = get_sample_review(all_test_filepath, 6)
#print(sample_review)

In [29]:
#lda_description(sample_review)

In [52]:
all_numbers = list(range(0,50))
df_all_numbers = pd.DataFrame(columns =["topic_number"])
for topic_number in all_numbers:
    df_all_numbers = df_all_numbers.append({
     "topic_number": topic_number
      }, ignore_index=True)


df_topics = pd.DataFrame(columns =["topic_name"])

for topic_number in all_numbers:
    df_topics = df_topics.append({
     "topic_name": topic_names[topic_number]
      }, ignore_index=True)
#print(df_topics)

In [35]:
reviews_filepath = all_test_filepath

test_df = pd.DataFrame(columns=["topic_name", "freq", "bus"])

In [86]:
all_bus_lda_filepath = "review_bus_lda.csv"

In [101]:
%%time
#function to loop through all businesses and convert review to 
if 1 == 1:
    with open(all_bus_lda_filepath, 'w') as f:
        for busi in list(range(0,3)):
            # parse the review text with spaCy
            parsed_review = nlp(test_bus[busi])
    
            # lemmatize the text and remove punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
            # apply the first-order and secord-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
    
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                      if not term in spacy.lang.en.stop_words.STOP_WORDS]
    
            # create a bag-of-words representation
            review_bow = trigram_dictionary.doc2bow(trigram_review)
    
            # create an LDA representation
            review_lda = lda[review_bow]
    
            # sort with the most highly related topics first
            review_lda = sorted(review_lda, key=lambda review_lda: -review_lda[1])
        
            bus = ''.join([list(x) for x in bus_set][busi])

            df = pd.DataFrame(columns=["topic_number", "freq"])

            for topic_number, freq in review_lda:
                df = df.append({
                "topic_number": topic_number,
                "freq":  round(freq, 4)
                }, ignore_index=True)
            #merge with complete topic list and replace na with zero
            df_full = pd.merge(df_all_numbers, df, how='left', on=['topic_number'])
            df_full = df_full.fillna(0)
        
            df_full['bus'] = bus

            pivoted = df_full.pivot('bus', 'topic_number')
        
            one_row = pivoted.iloc[0]
        
            out = one_row.values.tolist()
            out.insert(0, bus)
            out2 = str(out)
            out3 = re.sub(r"[\[ | \]]", "", out2)
            
            print(busi)
           
            f.write(out3)
            f.write('\n')



0
1
2
CPU times: user 24.5 s, sys: 12.1 s, total: 36.5 s
Wall time: 7.11 s


In [103]:
test_read = pd.io.parsers.read_csv(all_bus_lda_filepath,sep=",")
test_read

Unnamed: 0,['1ylA7yyrMMUX1zcu5EqO4Q',0.0,0.0.1,0.0430000014603138,0.012299999594688416,0.0.2,0.018699999898672104,0.0.3,0.01209999993443489,0.0.4,...,0.0.21,0.0.22,0.0284000001847744,0.0.23,0.034699998795986176,0.0.24,0.0.25,0.016499999910593033,0.0.26,0.0]
0,['-OEIW0dO96-492qa_luxaw',0.0,0.0155,0.0,0.0117,0.0,0.0,0.0671,0.0279,0.0,...,0.0109,0.0,0.0203,0.0,0.0129,0.0,0.0,0.0168,0.0475,0.011500000022351742]
1,['ToNd6fEn_SvcQc1Fulsidg',0.0979,0.0,0.0,0.0,0.2166,0.0,0.1061,0.0163,0.0,...,0.0,0.1569,0.0,0.0,0.0161,0.0,0.0,0.0192,0.0,0.0]


In [124]:
import re
out3 = re.sub(r"[\[ | \]]", "", out2)

In [125]:
out3



"'ToNd6fEn_SvcQc1Fulsidg',0.09790000319480896,0.0,0.0,0.0,0.21660000085830688,0.0,0.10610000044107437,0.016300000250339508,0.0,0.011699999682605267,0.014100000262260437,0.0,0.0,0.0,0.014700000174343586,0.0,0.01209999993443489,0.032999999821186066,0.0,0.0,0.0,0.0,0.0,0.020999999716877937,0.010400000028312206,0.0,0.021900000050663948,0.0,0.0,0.052000001072883606,0.0,0.03269999846816063,0.0,0.013899999670684338,0.0,0.0,0.0,0.04740000143647194,0.0,0.0,0.0,0.15690000355243683,0.0,0.0,0.016100000590085983,0.0,0.0,0.019200000911951065,0.0,0.0"

In [96]:
out2 = str(out)
''.join(out2)

"['1ylA7yyrMMUX1zcu5EqO4Q', 0.0, 0.0, 0.0430000014603138, 0.012299999594688416, 0.0, 0.018699999898672104, 0.0, 0.01209999993443489, 0.0, 0.017799999564886093, 0.0, 0.01889999955892563, 0.0, 0.010200000368058681, 0.0560000017285347, 0.0, 0.30070000886917114, 0.0, 0.0, 0.0, 0.02810000069439411, 0.0, 0.0, 0.03150000050663948, 0.0348999984562397, 0.0, 0.0, 0.0, 0.0, 0.07020000368356705, 0.0, 0.027400000020861626, 0.0, 0.02290000021457672, 0.0, 0.035100001841783524, 0.0, 0.05079999938607216, 0.025299999862909317, 0.0203000009059906, 0.0, 0.0, 0.0284000001847744, 0.0, 0.034699998795986176, 0.0, 0.0, 0.016499999910593033, 0.0, 0.0]"

In [81]:
busi =0
parsed_review = nlp(all_bus[busi])
unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
bigram_review = bigram_model[unigram_review]
trigram_review = trigram_model[bigram_review]
trigram_review = [term for term in trigram_review
    if not term in spacy.lang.en.stop_words.STOP_WORDS]
review_bow = trigram_dictionary.doc2bow(trigram_review)
review_lda = lda[review_bow]

bus = ''.join([list(x) for x in bus_set][busi])

df = pd.DataFrame(columns=["topic_number", "freq"])

for topic_number, freq in review_lda:
    df = df.append({
    "topic_number": topic_number,
    "freq":  round(freq, 4)
    }, ignore_index=True)
#merge with complete topic list and replace na with zero
df_full = pd.merge(df_all_numbers, df, how='left', on=['topic_number'])
df_full = df_full.fillna(0)
        
df_full['bus'] = bus

pivoted = df_full.pivot('bus', 'topic_number')



In [82]:
pivoted

Unnamed: 0_level_0,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq,freq
topic_number,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
bus,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1ylA7yyrMMUX1zcu5EqO4Q,0.0,0.0,0.0,0.0624,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0171,0.0269,0.0,0.0592,0.0886,0.0,0.1216


In [83]:
one_row = pivoted.iloc[0]

In [84]:
out = one_row.values.tolist()
out.insert(0, bus)
print(out)

['1ylA7yyrMMUX1zcu5EqO4Q', 0.0, 0.0, 0.0, 0.06239999830722809, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.02630000002682209, 0.0674000009894371, 0.0, 0.023099999874830246, 0.0, 0.0, 0.0, 0.0, 0.0, 0.11969999969005585, 0.0, 0.21979999542236328, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06069999933242798, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.022199999541044235, 0.07370000332593918, 0.0, 0.0, 0.0, 0.017100000753998756, 0.026900000870227814, 0.0, 0.05920000001788139, 0.08860000222921371, 0.0, 0.12160000205039978]


In [85]:
all_bus[1]

"This place is so good. I work nearby and come here often for lunch.  Their menu is 7.49 for one veg & one meat & rice or noodles, 7.99 for two veg & one meat & rice or noodles, 8.49 for one veg & two meat & rice or noodles. They also offer salads (7.49 I believe).  The rice is amazing. They have a few different kinds, but I have only tried the Jambalayan kind. It's sort of spicy but SO delicious. It is, in fact, so good, I am not willing to try the other kinds. Instead of rice, you can also get noodles.  Their vegetables include home fries, corn, mixed veggies, zucchini, mashed potatoes, and a few others. I have tried the home fries, mixed veggies, and zucchini and they are all good, but I personally like the home fries the most.  Their meats change on a daily basis, but they normally have bourbon chicken, blackened chicken, honey glazed chicken, spicy beef, rainbow shrimp, a type of fish (I forget), and a few other chicken flavours. The bourbon chicken is amazing. Ask for extra sauce

In [103]:
''.join([list(x) for x in bus_set][1])

'DGwDXazeFcD7DByweszpFA'