In [1]:
import codecs
import pandas as pd
import itertools as it

from pandas import DataFrame
import os

import spacy
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

import en_core_web_sm
nlp = spacy.load('en')

In [2]:
stars_filepath = 'review_stars_rest_subset.txt'
review_txt_filepath = 'review_text_rest_subset.txt'
business_filepath = 'review_business_rest_subset.txt'
user_filepath = 'review_user_rest_subset.txt'

In [3]:
def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')

In [4]:
def get_data(filepath,review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(filepath),
                          review_number, review_number+1))[0]

In [5]:
with open(stars_filepath) as f:
    stars = f.readlines()
# remove whitespace characters like `\n` at the end of each line
stars = [x.strip() for x in stars]

In [6]:
with open(review_txt_filepath) as f:
    texts = f.readlines()
# remove whitespace characters like `\n` at the end of each line
texts = [x.strip() for x in texts]

In [7]:
with open(business_filepath) as f:
    business = f.readlines()
# remove whitespace characters like `\n` at the end of each line
business = [x.strip() for x in business]

In [8]:
with open(user_filepath) as f:
    user = f.readlines()
# remove whitespace characters like `\n` at the end of each line
user = [x.strip() for x in user]

In [9]:
#test lengths, all should be the same
print(len(stars), len(texts), len(business), len(user))

1570963 1570963 1570963 1570963


In [10]:
bus_set = frozenset(business)
print(len(bus_set))

52810


In [11]:
user_set = frozenset(user)
print(len(user_set))

169373


In [13]:
test_set = frozenset(business[1:20])
print(test_set)

frozenset({'5r6-G9C4YLbC7Ziz57l3rQ', 'XWTPNfskXoUL-Lf32wSk0Q', 'I8rveLd-dl81u6c8YqAxmw', 'Eox_Qq74oaFZ-YjthpHhBw', '0W4lkclzZThpx3V65bVgig', 'zxJlg4XCHNoFy78WZPv89w', 'Aov96CM4FZAXeZvKtsStdA', 'oWTn2IzrprsRkPfULtjZtQ', 'z8oIoCT1cXz7gZP5GeU5OA', '4_GIJk0tX3k0x0FcUv4sNA', 'a9aW5e731lplWGHUZ02-zQ', 'Xy74meQwdTnloAAyRC-4cg', 'PFPUMF38-lraKzLcTiz5gQ', 'N93EYZy9R0sdlEvubu94ig', 'ZnxudK5ExgpfXs4bicS4IA', 'zgQHtqX0gqMw1nlBZl2VnQ', '28adZ4lsuUeVB2aWzohK9g', 'hjk3ox7w1akbEuOgTJ03Bw', 'RtUvSWO_UZ8V3Wpj0n077w'})


Make collased business text file

In [16]:
all_bus_filepath = "collapsed_business_rest_subset.txt"

In [14]:
all_test_filepath = "collapsed_business_rest_subset_test.txt"

In [17]:
%%time
#function to loop through all business ids, collapse all reviews for that business into one string
if 1 == 0:
    with codecs.open(all_bus_filepath, 'w', encoding='utf_8') as f:
        for x in bus_set:
            review_index = [i for i,j in enumerate(business) if j == x]
            review_sub = [texts[ind] for ind in review_index]
            review_out = ''.join(map(str, review_sub))
            f.write(review_out + '\n')
            #print(x)
    

CPU times: user 1h 51min 35s, sys: 3.08 s, total: 1h 51min 38s
Wall time: 1h 51min 51s


In [18]:
collapsed_business = LineSentence(all_bus_filepath)

In [25]:
collapsed_business

<gensim.models.word2vec.LineSentence at 0x7fa49e78e4a8>

Make collapsed user text file

In [19]:
all_user_filepath = "collapsed_user_rest_subset.txt"

In [20]:
%%time
#function to loop through all business ids, collapse all reviews for that business into one string
if 1 == 0:
    with codecs.open(all_user_filepath, 'w', encoding='utf_8') as f:
        for x in user_set:
            review_index = [i for i,j in enumerate(user) if j == x]
            review_sub = [texts[ind] for ind in review_index]
            review_out = ''.join(map(str, review_sub))
            f.write(review_out + '\n')

CPU times: user 5h 44min 44s, sys: 5.56 s, total: 5h 44min 49s
Wall time: 5h 45min 12s


Setup and define function for NLP

In [27]:
#helper functions from modern nlp in python
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [28]:
def get_sample_review(review_txt_filepath, review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(review_txt_filepath),
                          review_number, review_number+1))[0]

In [31]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import _pickle as pickle

In [29]:
lda_model_filepath = 'lda_model_eat_30'
trigram_dictionary_filepath = 'trigram_dict_eat_30.dict'
trigram_model_filepath = 'trigram_model_all_eat_30'
bigram_model_filepath = 'bigram_model_all_eat_30'

In [32]:
lda = LdaMulticore.load(lda_model_filepath)
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)
trigram_model = Phrases.load(trigram_model_filepath)
bigram_model = Phrases.load(bigram_model_filepath)

In [55]:
def lda_description(review_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-proccessing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation
    """
    
    # parse the review text with spaCy
    parsed_review = nlp(review_text)
    
    # lemmatize the text and remove punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
    # apply the first-order and secord-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]
    
    # remove any remaining stopwords
    trigram_review = [term for term in trigram_review
                      if not term in spacy.lang.en.stop_words.STOP_WORDS]
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    # sort with the most highly related topics first
    review_lda = sorted(review_lda, key=lambda review_lda: -review_lda[1])
                       
    for topic_number, freq in review_lda:
#        if freq < min_topic_freq:
#            break
            
        # print the most highly related topic names and frequencies
        print('{:25} {}'.format(topic_names[topic_number],
                                round(freq, 50)))

In [35]:
topic_names = {0: u'chinese',
               1: u'thai',
               2: u'healthy',
               3: u'smell',
               4: u'japanese',
               5: u'toronto',
               6: u'service',
               7: u'experience',
               8: u'asian soup',
               9: u'grocery',
               10: u'parking',
               11: u'bar ambiance',
               12: u'uk',
               13: u'good service',
               14: u'fun ambiance',
               15: u'young',
               16: u'comfort food',
               17: u'greek',
               18: u'high end',
               19: u'hotwing',
               20: u'breakfast',
               21: u'sweet',
               22: u'wine & dine',
               23: u'pubs',
               24: u'good taste',
               25: u'na drinks',
               26: u'desserts',
               27: u'coffee shop',
               28: u'mexican',
               29: u'reviews',
               30: u'new york',
               31: u'general restaurant',
               32: u'beach',
               33: u'location',
               34: u'happy hour',
               35: u'amazing',
               36: u'vietnamese',
               37: u'time',
               38: u'vas legas',
               39: u'montreal',
               40: u'deli',
               41: u'buffet',
               42: u'bbq',
               43: u'french',
               44: u'money',
               45: u'street taco',
               46: u'pizza',
               47: u'airport & delivery',
               48: u'burger & fries',
               49: u'italian'}

In [36]:
topic_names_filepath = 'topic_names_eats_30.pkl'

with open(topic_names_filepath, 'wb') as f:
    pickle.dump(topic_names, f)

In [None]:
all_test_filepath

In [41]:
sample_review = get_sample_review(all_test_filepath, 6)
print(sample_review)

This place is awesome! Definitely authentic!!!  My two favourite dishes are the rice flour rolls and the chicken pho. The rice flour rolls are always fresh whenever I'm there! And the chicken pho is always flavourful!! mmmm....just thinking of it makes me want some!  My boyfriend is Vietnamese and he agrees that this place is authentic and one of the best Vietnamese restaurants he has ever eaten at.  Prices are very reasonable too!i love this joint.  really down to earth people.  the pho and the broth is minimal but good. less is more.  no funny dry mouth feeling because of the the msg.  tons of bean sprouts and the herbal lettuce stuff.  fast service, most of the times.  the prices are very reasonable.  it's a small place but it's big enough.  just go after lunch.  the menu is not that big which is a good thing.I think this is the best sit down meal you can get in the city for $10, all in. The Banh Cuon, soups and noodle bows are tasty and comforting. It's around the corner from our h

In [56]:
lda_description(sample_review)

chinese                   inf
vietnamese                inf
service                   inf
reviews                   inf
amazing                   inf
good service              inf
greek                     inf
time                      inf
general restaurant        inf
money                     inf
montreal                  inf
coffee shop               inf
airport & delivery        inf
vas legas                 inf
parking                   inf
grocery                   inf
na drinks                 inf
japanese                  inf
good taste                inf
smell                     inf
pubs                      inf




In [44]:
parsed_review = nlp(sample_review)
print(parsed_review)

This place is awesome! Definitely authentic!!!  My two favourite dishes are the rice flour rolls and the chicken pho. The rice flour rolls are always fresh whenever I'm there! And the chicken pho is always flavourful!! mmmm....just thinking of it makes me want some!  My boyfriend is Vietnamese and he agrees that this place is authentic and one of the best Vietnamese restaurants he has ever eaten at.  Prices are very reasonable too!i love this joint.  really down to earth people.  the pho and the broth is minimal but good. less is more.  no funny dry mouth feeling because of the the msg.  tons of bean sprouts and the herbal lettuce stuff.  fast service, most of the times.  the prices are very reasonable.  it's a small place but it's big enough.  just go after lunch.  the menu is not that big which is a good thing.I think this is the best sit down meal you can get in the city for $10, all in. The Banh Cuon, soups and noodle bows are tasty and comforting. It's around the corner from our h

In [46]:
unigram_review = [token.lemma_ for token in parsed_review
        if not punct_space(token)]
print(unigram_review)

['this', 'place', 'be', 'awesome', 'definitely', 'authentic', '-PRON-', 'two', 'favourite', 'dish', 'be', 'the', 'rice', 'flour', 'roll', 'and', 'the', 'chicken', 'pho', 'the', 'rice', 'flour', 'roll', 'be', 'always', 'fresh', 'whenever', '-PRON-', 'be', 'there', 'and', 'the', 'chicken', 'pho', 'be', 'always', 'flavourful', 'mmmm', 'just', 'think', 'of', '-PRON-', 'make', '-PRON-', 'want', 'some', '-PRON-', 'boyfriend', 'be', 'vietnamese', 'and', '-PRON-', 'agree', 'that', 'this', 'place', 'be', 'authentic', 'and', 'one', 'of', 'the', 'good', 'vietnamese', 'restaurant', '-PRON-', 'have', 'ever', 'eat', 'at', 'price', 'be', 'very', 'reasonable', 'too!i', 'love', 'this', 'joint', 'really', 'down', 'to', 'earth', 'people', 'the', 'pho', 'and', 'the', 'broth', 'be', 'minimal', 'but', 'good', 'less', 'be', 'more', 'no', 'funny', 'dry', 'mouth', 'feeling', 'because', 'of', 'the', 'the', 'msg', 'ton', 'of', 'bean', 'sprout', 'and', 'the', 'herbal', 'lettuce', 'stuff', 'fast', 'service', 'most

In [47]:
bigram_review = bigram_model[unigram_review]
trigram_review = trigram_model[bigram_review]
print(trigram_review)

['this', 'place', 'be', 'awesome', 'definitely', 'authentic', '-PRON-', 'two', 'favourite', 'dish', 'be', 'the', 'rice', 'flour', 'roll', 'and', 'the', 'chicken', 'pho', 'the', 'rice', 'flour', 'roll', 'be', 'always', 'fresh', 'whenever', '-PRON-', 'be', 'there', 'and', 'the', 'chicken', 'pho', 'be', 'always', 'flavourful', 'mmmm', 'just', 'think', 'of', '-PRON-', 'make', '-PRON-', 'want', 'some', '-PRON-', 'boyfriend', 'be', 'vietnamese', 'and', '-PRON-', 'agree', 'that', 'this', 'place', 'be', 'authentic', 'and', 'one', 'of', 'the', 'good', 'vietnamese', 'restaurant', '-PRON-', 'have', 'ever', 'eat', 'at', 'price', 'be', 'very', 'reasonable', 'too!i', 'love', 'this', 'joint', 'really', 'down', 'to', 'earth', 'people', 'the', 'pho', 'and', 'the', 'broth', 'be', 'minimal', 'but', 'good', 'less', 'be', 'more', 'no', 'funny', 'dry', 'mouth', 'feeling', 'because', 'of', 'the', 'the', 'msg', 'ton', 'of', 'bean_sprout', 'and', 'the', 'herbal', 'lettuce', 'stuff', 'fast', 'service', 'most', 



In [48]:
trigram_review = [term for term in trigram_review
                 if not term in spacy.lang.en.stop_words.STOP_WORDS]
print(trigram_review)

['place', 'awesome', 'definitely', 'authentic', '-PRON-', 'favourite', 'dish', 'rice', 'flour', 'roll', 'chicken', 'pho', 'rice', 'flour', 'roll', 'fresh', '-PRON-', 'chicken', 'pho', 'flavourful', 'mmmm', 'think', '-PRON-', '-PRON-', 'want', '-PRON-', 'boyfriend', 'vietnamese', '-PRON-', 'agree', 'place', 'authentic', 'good', 'vietnamese', 'restaurant', '-PRON-', 'eat', 'price', 'reasonable', 'too!i', 'love', 'joint', 'earth', 'people', 'pho', 'broth', 'minimal', 'good', 'funny', 'dry', 'mouth', 'feeling', 'msg', 'ton', 'bean_sprout', 'herbal', 'lettuce', 'stuff', 'fast', 'service', 'time', 'price', 'reasonable', '-PRON-', 'small', 'place', '-PRON-', 'big', 'lunch', 'menu', 'big', 'good', 'thing', '-PRON-', 'think', 'good', 'sit', 'meal', '-PRON-', 'city', '$', '10', 'banh_cuon', 'soup', 'noodle', 'bow', 'tasty', 'comforting', '-PRON-', 'corner', '-PRON-', 'house', '-PRON-', 'husband', '-PRON-', 'like', 'duck', 'quick', 'lunch', '-PRON-', 'service', 'usually', 'speedy', 'restaurant', 

In [50]:
review_bow = trigram_dictionary.doc2bow(trigram_review)
print(review_bow)

[(1, 7), (2, 4), (3, 2), (4, 2), (5, 7), (6, 1), (8, 3), (9, 2), (10, 2), (12, 1), (13, 2), (14, 1), (15, 16), (17, 37), (18, 1), (20, 14), (21, 22), (23, 3), (25, 2), (26, 9), (27, 8), (29, 1), (30, 17), (32, 4), (34, 2), (36, 1), (37, 1), (38, 2), (39, 1), (41, 7), (42, 13), (45, 1), (46, 1), (47, 3), (48, 1), (53, 7), (57, 11), (60, 1), (64, 3), (66, 9), (69, 1), (70, 11), (72, 3), (73, 2), (75, 1), (76, 1), (77, 2), (78, 1), (79, 1), (80, 3), (81, 1), (82, 3), (87, 6), (89, 1), (91, 1), (93, 2), (94, 6), (97, 2), (104, 1), (111, 1), (113, 9), (115, 4), (120, 4), (121, 2), (123, 1), (127, 2), (129, 1), (130, 9), (131, 1), (132, 2), (133, 2), (134, 3), (135, 1), (136, 10), (137, 1), (138, 1), (139, 8), (140, 7), (141, 7), (142, 6), (143, 1), (144, 11), (145, 4), (146, 17), (147, 7), (148, 1), (151, 1), (152, 1), (153, 1), (155, 2), (157, 12), (160, 1), (161, 2), (162, 2), (164, 1), (168, 1), (170, 1), (172, 3), (173, 1), (175, 2), (176, 4), (179, 4), (181, 4), (184, 1), (186, 5), (18

In [51]:
review_lda = lda[review_bow]
print(review_lda)

[(0, 0.3742013), (3, 0.011497187), (4, 0.013167743), (6, 0.062449753), (9, 0.015615614), (10, 0.016892921), (13, 0.03469188), (17, 0.0336184), (23, 0.010528261), (24, 0.01236545), (25, 0.014293573), (27, 0.019657997), (29, 0.06155967), (31, 0.027089786), (35, 0.044541087), (36, 0.07107436), (37, 0.031105291), (38, 0.017945733), (39, 0.02027938), (44, 0.024539562), (47, 0.018142685)]


In [52]:
review_lda = sorted(review_lda, key=lambda review_lda: -review_lda[1])
print(review_lda)

[(0, 0.3742013), (36, 0.07107436), (6, 0.062449753), (29, 0.06155967), (35, 0.044541087), (13, 0.03469188), (17, 0.0336184), (37, 0.031105291), (31, 0.027089786), (44, 0.024539562), (39, 0.02027938), (27, 0.019657997), (47, 0.018142685), (38, 0.017945733), (10, 0.016892921), (9, 0.015615614), (25, 0.014293573), (4, 0.013167743), (24, 0.01236545), (3, 0.011497187), (23, 0.010528261)]


In [75]:
df = pd.DataFrame(review_lda)
df.columns = ['topic_number', 'freq']
df = df.assign(topic=topic_names[df['topic_number']])
print(df)

TypeError: 'Series' objects are mutable, thus they cannot be hashed

In [81]:
df = pd.DataFrame(columns=["topic_name", "freq"])
for topic_number, freq in review_lda:
    df = df.append({
     "topic_name": topic_names[topic_number],
     "freq":  round(freq, 4)
      }, ignore_index=True)
print(df)

            topic_name    freq
0              chinese  0.3742
1           vietnamese  0.0711
2              service  0.0624
3              reviews  0.0616
4              amazing  0.0445
5         good service  0.0347
6                greek  0.0336
7                 time  0.0311
8   general restaurant  0.0271
9                money  0.0245
10            montreal  0.0203
11         coffee shop  0.0197
12  airport & delivery  0.0181
13           vas legas  0.0179
14             parking  0.0169
15             grocery  0.0156
16           na drinks  0.0143
17            japanese  0.0132
18          good taste  0.0124
19               smell  0.0115
20                pubs  0.0105


In [63]:
 for topic_number, freq in review_lda:
#        if freq < min_topic_freq:
#            break
            
        # print the most highly related topic names and frequencies
    print('{:25} {}'.format(topic_names[topic_number],
                                round(freq, 4)))

chinese                   0.3741999864578247
vietnamese                0.07109999656677246
service                   0.06239999830722809
reviews                   0.06159999966621399
amazing                   0.04450000077486038
good service              0.034699998795986176
greek                     0.03359999880194664
time                      0.031099999323487282
general restaurant        0.02710000053048134
money                     0.02449999935925007
montreal                  0.0203000009059906
coffee shop               0.019700000062584877
airport & delivery        0.01810000091791153
vas legas                 0.017899999395012856
parking                   0.016899999231100082
grocery                   0.015599999576807022
na drinks                 0.014299999922513962
japanese                  0.013199999928474426
good taste                0.012400000356137753
smell                     0.011500000022351742
pubs                      0.010499999858438969
