In [13]:
import os
import codecs

data_directory = os.path.join(os.getcwd(), 'dataset',
                              'dataset_yelp')

businesses_filepath = os.path.join(data_directory,
                                   'business.json')

with codecs.open(businesses_filepath, encoding='utf_8') as f:
    first_business_record = f.readline() 

print (first_business_record)

{"business_id": "FYWN1wneV18bWNgQjJ2GNg", "name": "Dental by Design", "neighborhood": "", "address": "4855 E Warner Rd, Ste B9", "city": "Ahwatukee", "state": "AZ", "postal_code": "85044", "latitude": 33.3306902, "longitude": -111.9785992, "stars": 4.0, "review_count": 22, "is_open": 1, "attributes": {"AcceptsInsurance": true, "ByAppointmentOnly": true, "BusinessAcceptsCreditCards": true}, "categories": ["Dentists", "General Dentistry", "Health & Medical", "Oral Surgeons", "Cosmetic Dentists", "Orthodontists"], "hours": {"Friday": "7:30-17:00", "Tuesday": "7:30-17:00", "Thursday": "7:30-17:00", "Wednesday": "7:30-17:00", "Monday": "7:30-17:00"}}



In [14]:
review_json_filepath = os.path.join(data_directory,
                                    'review.json')

with codecs.open(review_json_filepath, encoding='utf_8') as f:
    first_review_record = f.readline()
    
print (first_review_record)

{"review_id":"v0i_UHJMo_hPBq9bxWvW4w","user_id":"bv2nCi5Qv5vroFiqKGopiw","business_id":"0W4lkclzZThpx3V65bVgig","stars":5,"date":"2016-05-28","text":"Love the staff, love the meat, love the place. Prepare for a long line around lunch or dinner hours. \n\nThey ask you how you want you meat, lean or something maybe, I can't remember. Just say you don't want it too fatty. \n\nGet a half sour pickle and a hot pepper. Hand cut french fries too.","useful":0,"funny":0,"cool":0}



In [15]:
import json

restaurant_ids = set()

# open the businesses file
with codecs.open(businesses_filepath, encoding='utf_8') as f:
    
    # iterate through each line (json record) in the file
    for business_json in f:
        
        # convert the json record to a Python dict
        business = json.loads(business_json)
        
        # if this business is not a restaurant, skip to the next one
        if u'Restaurants' not in business[u'categories']:
            continue
            
        # add the restaurant business id to our restaurant_ids set
        restaurant_ids.add(business[u'business_id'])

# turn restaurant_ids into a frozenset, as we don't need to change it anymore
restaurant_ids = frozenset(restaurant_ids)

# print the number of unique restaurant ids in the dataset
print ('{:,}'.format(len(restaurant_ids)), u'restaurants in the dataset.')

54,618 restaurants in the dataset.


In [16]:
intermediate_directory = os.path.join(data_directory, 'intermediate')
if not os.path.exists(intermediate_directory):
    os.makedirs(intermediate_directory)
print(intermediate_directory)
review_txt_filepath = os.path.join(intermediate_directory,
                                   'review_text_all.txt')

/home/ubuntu/saurav/statistical-inferences/dataset/dataset_yelp/intermediate


In [17]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:
    
    review_count = 0

    # create & open a new file in write mode
    with codecs.open(review_txt_filepath, 'w', encoding='utf_8') as review_txt_file:

        # open the existing review json file
        with codecs.open(review_json_filepath, encoding='utf_8') as review_json_file:

            # loop through all reviews in the existing file and convert to dict
            for review_json in review_json_file:
                review = json.loads(review_json)

                # if this review is not about a restaurant, skip to the next one
                if review[u'business_id'] not in restaurant_ids:
                    continue

                # write the restaurant review as a line in the new file
                # escape newline characters in the original review text
                review_txt_file.write(review[u'text'].replace('\n', '\\n') + '\n')
                review_count += 1

    print (u'''Text from {:,} restaurant reviews
              written to the new txt file.'''.format(review_count))
    
else:
    
    with codecs.open(review_txt_filepath, encoding='utf_8') as review_txt_file:
        for review_count, line in enumerate(review_txt_file):
            pass
        
    print (u'Text from {:,} restaurant reviews in the txt file.'.format(review_count + 1))

Text from 3,223,214 restaurant reviews in the txt file.
CPU times: user 30.5 s, sys: 1.21 s, total: 31.7 s
Wall time: 30.7 s


In [18]:
import spacy
import pandas as pd
import itertools as it
nlp = spacy.load('en')

In [19]:
with codecs.open(review_txt_filepath, encoding='utf_8') as f:
    sample_review = list(it.islice(f, 8, 9))[0]
    sample_review = sample_review.replace('\\n', '\n')
        
print (sample_review)

This is currently my parents new favourite restaurant. 

We come here in the morning for dim sum. They are not the cart pushing type of dim sum, it is order off of the sheet. Dim sum is not bad and not expensive either.

We also frequent the dinner scene. Their set dinner menu is not bad. We typically order a 6 dish menu and it's big enough to feed a family of 9 with leftovers. 

Overall, food is pretty tasty!



In [20]:
%%time
parsed_review = nlp(sample_review)

CPU times: user 723 ms, sys: 5.77 ms, total: 729 ms
Wall time: 26.5 ms


In [21]:
print (parsed_review)

This is currently my parents new favourite restaurant. 

We come here in the morning for dim sum. They are not the cart pushing type of dim sum, it is order off of the sheet. Dim sum is not bad and not expensive either.

We also frequent the dinner scene. Their set dinner menu is not bad. We typically order a 6 dish menu and it's big enough to feed a family of 9 with leftovers. 

Overall, food is pretty tasty!



In [22]:
for num, sentence in enumerate(parsed_review.sents):
    print ('Sentence {}:'.format(num + 1))
    print (sentence)
    print ('')

Sentence 1:
This is currently my parents new favourite restaurant. 



Sentence 2:
We come here in the morning for dim sum.

Sentence 3:
They are not the cart pushing type of dim sum, it is order off of the sheet.

Sentence 4:
Dim sum is not bad and not expensive either.



Sentence 5:
We also frequent the dinner scene.

Sentence 6:
Their set dinner menu is not bad.

Sentence 7:
We typically order a 6 dish menu and it's big enough to feed a family of 9 with leftovers. 



Sentence 8:
Overall, food is pretty tasty!




In [23]:
for num, entity in enumerate(parsed_review.ents):
    print ('Entity {}:'.format(num + 1), entity, '-', entity.label_)
    print ('')

Entity 1: 6 - CARDINAL

Entity 2: 9 - CARDINAL

Entity 3: 
 - GPE



In [24]:
token_text = [token.orth_ for token in parsed_review]
token_pos = [token.pos_ for token in parsed_review]

pd.DataFrame(list(zip(token_text, token_pos)),
             columns=['token_text', 'part_of_speech'])

Unnamed: 0,token_text,part_of_speech
0,This,DET
1,is,VERB
2,currently,ADV
3,my,ADJ
4,parents,NOUN
5,new,ADJ
6,favourite,ADJ
7,restaurant,NOUN
8,.,PUNCT
9,\n\n,SPACE


In [25]:
token_lemma = [token.lemma_ for token in parsed_review]
token_shape = [token.shape_ for token in parsed_review]

pd.DataFrame(list(zip(token_text, token_lemma, token_shape)),
             columns=['token_text', 'token_lemma', 'token_shape'])

Unnamed: 0,token_text,token_lemma,token_shape
0,This,this,Xxxx
1,is,be,xx
2,currently,currently,xxxx
3,my,-PRON-,xx
4,parents,parent,xxxx
5,new,new,xxx
6,favourite,favourite,xxxx
7,restaurant,restaurant,xxxx
8,.,.,.
9,\n\n,\n\n,\n\n


In [14]:
token_entity_type = [token.ent_type_ for token in parsed_review]
token_entity_iob = [token.ent_iob_ for token in parsed_review]

pd.DataFrame(list(zip(token_text, token_entity_type, token_entity_iob)),
             columns=['token_text', 'entity_type', 'inside_outside_begin'])

Unnamed: 0,token_text,entity_type,inside_outside_begin
0,This,,O
1,is,,O
2,currently,,O
3,my,,O
4,parents,,O
5,new,,O
6,favourite,,O
7,restaurant,,O
8,.,,O
9,\n\n,,O


In [26]:
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in parsed_review]

df = pd.DataFrame(token_attributes,
                  columns=['text',
                           'log_probability',
                           'stop?',
                           'punctuation?',
                           'whitespace?',
                           'number?',
                           'out of vocab.?'])

df.loc[:, 'stop?':'out of vocab.?'] = (df.loc[:, 'stop?':'out of vocab.?']
                                       .applymap(lambda x: u'Yes' if x else u''))
                                               
df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,This,-20.0,,,,,Yes
1,is,-20.0,Yes,,,,Yes
2,currently,-20.0,,,,,Yes
3,my,-20.0,Yes,,,,Yes
4,parents,-20.0,,,,,Yes
5,new,-20.0,,,,,Yes
6,favourite,-20.0,,,,,Yes
7,restaurant,-20.0,,,,,Yes
8,.,-20.0,,Yes,,,Yes
9,\n\n,-20.0,,,Yes,,Yes


In [27]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [28]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [29]:
unigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'unigram_sentences_all.txt')

In [30]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(review_txt_filepath):
            f.write(sentence + '\n')

CPU times: user 0 ns, sys: 8 µs, total: 8 µs
Wall time: 15.7 µs


In [31]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [32]:
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print (u' '.join(unigram_sentence))
    print (u'')

-PRON- be good but after 4 dish already this be just a little too rich

-PRON- come with banana bread that be really good

overall definitely worth the try

just make sure -PRON- bring cash =)

omg come here with some friend for brunch

and wow

-PRON- do have to wait for about 30 min for a table for 4 but -PRON- be later in the day

look at -PRON- menu -PRON- want to try everything

the french toast the strawberry shortcake donut -PRON- be sell out by the time -PRON- get there

but -PRON- wind up only order two thing the classic 2 egg any style house cure bacon or sage and onion sausage crispy griddle red skin potato with sea salt and thyme



In [33]:
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')

In [34]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 1:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

CPU times: user 9.65 s, sys: 1.54 s, total: 11.2 s
Wall time: 11 s


In [35]:
bigram_sentences_filepath = os.path.join(intermediate_directory,
                                         'bigram_sentences_all.txt')

In [36]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for unigram_sentence in unigram_sentences:
            
            bigram_sentence = u' '.join(bigram_model[unigram_sentence])
            
            f.write(bigram_sentence + '\n')

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 15.3 µs


In [37]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [38]:
for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print (u' '.join(bigram_sentence))
    print (u'')

-PRON- be good but after 4 dish already this be just a little too rich

-PRON- come with banana bread that be really good

overall definitely worth the try

just make sure -PRON- bring cash =)

omg come here with some friend for brunch

and wow

-PRON- do have to wait for about 30_min for a table for 4 but -PRON- be later in the day

look at -PRON- menu -PRON- want to try everything

the french_toast the strawberry_shortcake donut -PRON- be sell out by the time -PRON- get there

but -PRON- wind up only order two thing the classic 2 egg any style house cure bacon or sage and onion sausage crispy griddle red skin potato with sea_salt and thyme



In [39]:
trigram_model_filepath = os.path.join(intermediate_directory,
                                      'trigram_model_all')

In [40]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 0 == 1:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

CPU times: user 11.7 s, sys: 1.76 s, total: 13.5 s
Wall time: 13.2 s


In [41]:
trigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'trigram_sentences_all.txt')

In [42]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for bigram_sentence in bigram_sentences:
            
            trigram_sentence = u' '.join(trigram_model[bigram_sentence])
            
            f.write(trigram_sentence + '\n')

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 15.3 µs


In [43]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [44]:
for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print (u' '.join(trigram_sentence))
    print (u'')

-PRON- be good but after 4 dish already this be just a little too rich

-PRON- come with banana bread that be really good

overall definitely worth the try

just make sure -PRON- bring cash =)

omg come here with some friend for brunch

and wow

-PRON- do have to wait for about 30_min for a table for 4 but -PRON- be later in the day

look at -PRON- menu -PRON- want to try everything

the french_toast the strawberry_shortcake donut -PRON- be sell out by the time -PRON- get there

but -PRON- wind up only order two thing the classic 2 egg any style house cure bacon or sage and onion sausage crispy griddle red skin potato with sea_salt and thyme



In [45]:
trigram_reviews_filepath = os.path.join(intermediate_directory,
                                        'trigram_transformed_reviews_all.txt')

In [46]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    with codecs.open(trigram_reviews_filepath, 'w', encoding='utf_8') as f:
        
        for parsed_review in nlp.pipe(line_review(review_txt_filepath),
                                      batch_size=10000, n_threads=4):
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                              if not punct_space(token)]
            
            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]
            
            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review
                              if term not in spacy.lang.en.STOP_WORDS]
            
            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')

CPU times: user 9 µs, sys: 1 µs, total: 10 µs
Wall time: 16.5 µs


In [47]:
print (u'Original:' + u'\n')

for review in it.islice(line_review(review_txt_filepath), 1, 3):
    print (review)

print (u'----' + u'\n')
print (u'Transformed:' + u'\n')

with codecs.open(trigram_reviews_filepath, encoding='utf_8') as f:
    for review in it.islice(f, 1, 3):
        print (review)

Original:

Super simple place but amazing nonetheless. It's been around since the 30's and they still serve the same thing they started with: a bologna and salami sandwich with mustard. 

Staff was very helpful and friendly.

Small unassuming place that changes their menu every so often. Cool decor and vibe inside their 30 seat restaurant. Call for a reservation. 

We had their beef tartar and pork belly to start and a salmon dish and lamb meal for mains. Everything was incredible! I could go on at length about how all the listed ingredients really make their dishes amazing but honestly you just need to go. 

A bit outside of downtown montreal but take the metro out and it's less than a 10 minute walk from the station.

----

Transformed:

super simple place amazing nonetheless -PRON- 30 's -PRON- serve thing -PRON- start bologna salami sandwich mustard staff helpful friendly

small unassum place change -PRON- menu cool decor vibe inside -PRON- 30 seat restaurant reservation -PRON- -PR

In [48]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle as pickle

In [49]:
trigram_dictionary_filepath  = os.path.join(intermediate_directory,
                                           'trigram_dict_all.dict')

In [50]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if 0 == 1:

    trigram_reviews = LineSentence(trigram_reviews_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_reviews)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

CPU times: user 44.3 ms, sys: 12.1 ms, total: 56.4 ms
Wall time: 53.7 ms


In [51]:
trigram_bow_filepath = os.path.join(intermediate_directory,
                                    'trigram_bow_corpus_all.mm')
if not os.path.exists(trigram_bow_filepath):
    os.mknod(trigram_bow_filepath)

In [52]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for review in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(review)

In [53]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if 0 == 1:

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath,
                       trigram_bow_generator(trigram_reviews_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

CPU times: user 281 ms, sys: 64.8 ms, total: 346 ms
Wall time: 314 ms


In [54]:
lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')

In [55]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if 0 == 1:

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=50,
                           id2word=trigram_dictionary,
                           workers=3)
    
    lda.save(lda_model_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

CPU times: user 139 ms, sys: 72.5 ms, total: 212 ms
Wall time: 210 ms


In [93]:
def explore_topic(topic_number, topn=25):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
    print (u'{:20} {}'.format(u'term', u'frequency') + u'\n')

    for term, frequency in lda.show_topic(topic_number, topn=25):
        print (u'{:20} {:.3f}'.format(term, round(frequency, 3)))

In [94]:
explore_topic(topic_number=20)

term                 frequency

wing                 0.144
game                 0.064
watch                0.048
tv                   0.038
bar                  0.020
's                   0.017
play                 0.017
buffalo              0.015
special              0.013
hot                  0.013
sport                0.012
great                0.012
sport_bar            0.012
night                0.012
ranch                0.009
above_average        0.008
big                  0.008
wednesday            0.007
tuesday              0.007
lot                  0.007
pretty               0.007
monday               0.006
fan                  0.006
sunday               0.006
honey                0.006


In [88]:
topic_names = {0: u'mexican',
               1: u'menu',
               2: u'thai',
               3: u'steak',
               4: u'donuts & appetizers',
               5: u'specials',
               6: u'soup',
               7: u'wings, sports bar',
               8: u'foreign language',
               9: u'las vegas',
               10: u'chicken',
               11: u'aria buffet',
               12: u'noodles',
               13: u'ambience & seating',
               14: u'sushi',
               15: u'arizona',
               16: u'family',
               17: u'price',
               18: u'sweet',
               19: u'waiting',
               20: u'general',
               21: u'tapas',
               22: u'dirty',
               23: u'customer service',
               24: u'restrooms',
               25: u'chinese',
               26: u'gluten free',
               27: u'pizza',
               28: u'seafood',
               29: u'amazing',
               30: u'eat, like, know, want',
               31: u'bars',
               32: u'breakfast',
               33: u'location & time',
               34: u'italian',
               35: u'barbecue',
               36: u'arizona',
               37: u'indian',
               38: u'latin & cajun',
               39: u'burger & fries',
               40: u'vegetarian',
               41: u'lunch buffet',
               42: u'customer service',
               43: u'taco, ice cream',
               44: u'high cuisine',
               45: u'healthy',
               46: u'salad & sandwich',
               47: u'greek',
               48: u'experience',
               49: u'wine & dine'}

In [89]:
topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')

with open(topic_names_filepath, 'wb') as f:
    pickle.dump(topic_names, f)

In [90]:
LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared')

In [91]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 0 == 1:

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary)

    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath,'rb') as f:
    LDAvis_prepared = pickle.load(f)

CPU times: user 8.38 ms, sys: 24 µs, total: 8.4 ms
Wall time: 7.13 ms


In [92]:
pyLDAvis.display(LDAvis_prepared)

In [73]:
def get_sample_review(review_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(review_txt_filepath),
                          review_number, review_number+1))[0]

In [83]:
def lda_description(review_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-proccessing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation
    """
    
    # parse the review text with spaCy
    parsed_review = nlp(review_text)
    
    # lemmatize the text and remove punctuation and whitespace
    unigram_review = [token.lemma_ for token in parsed_review
                      if not punct_space(token)]
    
    # apply the first-order and secord-order phrase models
    bigram_review = bigram_model[unigram_review]
    trigram_review = trigram_model[bigram_review]
    
    # remove any remaining stopwords
    trigram_review = [term for term in trigram_review
                      if not term in spacy.lang.en.STOP_WORDS]
    
    # create a bag-of-words representation
    review_bow = trigram_dictionary.doc2bow(trigram_review)
    
    # create an LDA representation
    review_lda = lda[review_bow]
    
    # sort with the most highly related topics first
    #review_lda = sorted(review_lda, key=lambda args: -args[1])
    review_lda = sorted(review_lda, key=lambda args: -args[1])
    
    for topic_number, freq in review_lda:
        if freq < min_topic_freq:
            break
            
        # print the most highly related topic names and frequencies
        print ('{:25} {}'.format(topic_names[topic_number],
                                round(freq, 3)))

In [95]:
sample_review = get_sample_review(50)
print (sample_review)

Small little Japanese restaurant in the Don Mills neighbourhood. Lots of different rolls to pick from. 

Great lunch special selection.

Service is a little slow though. So if you only have 1 hour for lunch, may not be the best place. Take out is pretty quick though.



In [96]:
lda_description(sample_review)

experience                0.7099999785423279
price                     0.0949999988079071
specials                  0.06599999964237213




In [97]:
sample_review = get_sample_review(100)
print (sample_review)

This place is a bit of an adventure...at least for me. I've never had charcuterie before and I'm not generally a big meat eater, so this was a little out of the box. That said, I like trying to be adventurous and trying new things, so when are visiting friends suggested we give it a go, we went for it. Dinner service is open at 6pm, so we agreed we'd arrive at 6pm. We got there at about 6:15....there was already a HUGE wait for a table, but we managed to wrangle seats at the bar. 

The four of us ordered a bunch of dishes to split between ourselves. We tried the charcuterie (sort of a platter of assorted sliced meats and pâté), horse tar tar, roasted bone marrow, pork carnitas tacos, tongue on brioche and the foie gras and nutella for dessert. 

The dishes came out staggered, so we got to sit around chatting and enjoying ourselves and our cocktails. The charcuterie came out first. It was assorted sliced meats arranged on a wood plank and served with bread. It was actually really nice. 

In [98]:
lda_description(sample_review)

tapas                     0.2709999978542328
pizza                     0.1379999965429306
greek                     0.12700000405311584
restrooms                 0.10999999940395355
steak                     0.08299999684095383
wings, sports bar         0.07000000029802322
location & time           0.05400000140070915




In [99]:
from gensim.models import Word2Vec

trigram_sentences = LineSentence(trigram_sentences_filepath)
word2vec_filepath = os.path.join(intermediate_directory, 'word2vec_model_all')

In [104]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the word2vec model yourself.
if 0 == 1:

    # initiate the model and perform the first epoch of training
    food2vec = Word2Vec(trigram_sentences, size=100, window=5,
                        min_count=20, sg=1, workers=20)
    
    food2vec.save(word2vec_filepath)

    # perform another 11 epochs of training
    #for i in range(1,12):

        #food2vec.train(trigram_sentences)
        #food2vec.save(word2vec_filepath)
        
# load the finished model from disk
food2vec = Word2Vec.load(word2vec_filepath)
#food2vec.init_sims()

print (u'{} training epochs so far.'.format(food2vec.train_count))

1 training epochs so far.
CPU times: user 770 ms, sys: 136 ms, total: 907 ms
Wall time: 905 ms


  setattr(self, attrib, None)


In [107]:
print (u'{:,} terms in the food2vec vocabulary.'.format(len(food2vec.wv.vocab)))

92,757 terms in the food2vec vocabulary.


In [115]:
# build a list of the terms, integer indices,
# and term counts from the food2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count)
                 for term, voc in food2vec.wv.vocab.items()]

# sort by the term counts, so the most common terms appear first
#ordered_vocab = sorted(ordered_vocab, key=lambda (term, index, count): -count)
ordered_vocab = sorted(ordered_vocab, key=lambda args: -args[2])

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# create a DataFrame with the food2vec vectors as data,
# and the terms as row labels

print (type (food2vec.wv.syn0norm))
word_vectors = pd.DataFrame(food2vec.wv.syn0norm[term_indices, :],
                            index=ordered_terms)
word_vectors

<class 'NoneType'>


  app.launch_new_instance()


NameError: name 'word_vectors' is not defined

In [116]:
def get_related_terms(token, topn=10):
    """
    look up the topn most similar terms to token
    and print them as a formatted list
    """

    for word, similarity in food2vec.most_similar(positive=[token], topn=topn):

        print (u'{:20} {}'.format(word, round(similarity, 3)))

In [117]:
get_related_terms(u'burger_king')

  import sys


KeyError: "word 'burger_king' not in vocabulary"