In [14]:
import os
import codecs

# The Yelp dataset is expected under ./dataset/dataset_yelp, relative to
# the directory the notebook is run from.
data_directory = os.path.join(os.getcwd(), 'dataset', 'dataset_yelp')
businesses_filepath = os.path.join(data_directory, 'business.json')

# Peek at the first line of the businesses file to see what a record looks like.
with codecs.open(businesses_filepath, encoding='utf_8') as businesses_file:
    first_business_record = businesses_file.readline()

print(first_business_record)

{"business_id": "FYWN1wneV18bWNgQjJ2GNg", "name": "Dental by Design", "neighborhood": "", "address": "4855 E Warner Rd, Ste B9", "city": "Ahwatukee", "state": "AZ", "postal_code": "85044", "latitude": 33.3306902, "longitude": -111.9785992, "stars": 4.0, "review_count": 22, "is_open": 1, "attributes": {"AcceptsInsurance": true, "ByAppointmentOnly": true, "BusinessAcceptsCreditCards": true}, "categories": ["Dentists", "General Dentistry", "Health & Medical", "Oral Surgeons", "Cosmetic Dentists", "Orthodontists"], "hours": {"Friday": "7:30-17:00", "Tuesday": "7:30-17:00", "Thursday": "7:30-17:00", "Wednesday": "7:30-17:00", "Monday": "7:30-17:00"}}



In [16]:
review_json_filepath = os.path.join(data_directory, 'review.json')

# Peek at the first line of the reviews file to see what a record looks like.
with codecs.open(review_json_filepath, encoding='utf_8') as reviews_file:
    first_review_record = reviews_file.readline()

print(first_review_record)

{"review_id":"v0i_UHJMo_hPBq9bxWvW4w","user_id":"bv2nCi5Qv5vroFiqKGopiw","business_id":"0W4lkclzZThpx3V65bVgig","stars":5,"date":"2016-05-28","text":"Love the staff, love the meat, love the place. Prepare for a long line around lunch or dinner hours. \n\nThey ask you how you want you meat, lean or something maybe, I can't remember. Just say you don't want it too fatty. \n\nGet a half sour pickle and a hot pepper. Hand cut french fries too.","useful":0,"funny":0,"cool":0}



In [17]:
import json

# Scan the businesses file (one json record per line) and collect the id of
# every business whose category list contains 'Restaurants'. The result is
# built directly as a frozenset because it is never modified afterwards.
with codecs.open(businesses_filepath, encoding='utf_8') as businesses_file:
    restaurant_ids = frozenset(
        record[u'business_id']
        for record in map(json.loads, businesses_file)
        if u'Restaurants' in record[u'categories']
    )

# report how many unique restaurant ids were found
print('{:,}'.format(len(restaurant_ids)), u'restaurants in the dataset.')

54,618 restaurants in the dataset.


In [24]:
# Derived artifacts (extracted review text, sentence files, models) are kept
# in an 'intermediate' folder next to the raw dataset files.
intermediate_directory = os.path.join(data_directory, 'intermediate')
print(intermediate_directory)

review_txt_filepath = os.path.join(intermediate_directory, 'review_text_all.txt')

C:\dev\DataScience_handon\statistical-inferences\dataset\dataset_yelp\intermediate


In [27]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:

    # the output file lives in the intermediate directory, which may not
    # exist yet on a fresh checkout - create it so opening for write succeeds
    os.makedirs(intermediate_directory, exist_ok=True)

    review_count = 0

    # open the existing review json file first, so that a missing input
    # does not truncate a previously generated txt file
    with codecs.open(review_json_filepath, encoding='utf_8') as review_json_file:

        # create & open a new file in write mode
        with codecs.open(review_txt_filepath, 'w', encoding='utf_8') as review_txt_file:

            # loop through all reviews in the existing file and convert to dict
            for review_json in review_json_file:
                review = json.loads(review_json)

                # if this review is not about a restaurant, skip to the next one
                if review[u'business_id'] not in restaurant_ids:
                    continue

                # write the restaurant review as a line in the new file
                # escape newline characters in the original review text
                review_txt_file.write(review[u'text'].replace('\n', '\\n') + '\n')
                review_count += 1

    print (u'''Text from {:,} restaurant reviews
              written to the new txt file.'''.format(review_count))

else:

    # count the lines already present in the txt file; summing over the
    # iterator gives 0 for an empty file instead of leaving review_count
    # unbound (or stale from an earlier run of this cell)
    with codecs.open(review_txt_filepath, encoding='utf_8') as review_txt_file:
        review_count = sum(1 for _ in review_txt_file)

    print (u'Text from {:,} restaurant reviews in the txt file.'.format(review_count))

Text from 3,221,419 restaurant reviews
              written to the new txt file.
Wall time: 3min 35s


In [28]:
import spacy
import pandas as pd
import itertools as it
# Load spaCy's 'en' pipeline once; later cells reuse `nlp` to parse review text.
nlp = spacy.load('en')

In [30]:
# Pull the single line at index 8 (the ninth review) out of the txt file.
with codecs.open(review_txt_filepath, encoding='utf_8') as review_txt_file:
    escaped_review = list(it.islice(review_txt_file, 8, 9))[0]

# restore the line breaks that were escaped when the file was written
sample_review = escaped_review.replace('\\n', '\n')

print(sample_review)

This is currently my parents new favourite restaurant. 

We come here in the morning for dim sum. They are not the cart pushing type of dim sum, it is order off of the sheet. Dim sum is not bad and not expensive either.

We also frequent the dinner scene. Their set dinner menu is not bad. We typically order a 6 dish menu and it's big enough to feed a family of 9 with leftovers. 

Overall, food is pretty tasty!



In [31]:
%%time
# Run the loaded spaCy pipeline over the sample review; the cells below
# inspect the result (sentences, entities, per-token attributes).
parsed_review = nlp(sample_review)

Wall time: 680 ms


In [32]:
# Printing the parsed result reproduces the review text (see the cell output).
print (parsed_review)

This is currently my parents new favourite restaurant. 

We come here in the morning for dim sum. They are not the cart pushing type of dim sum, it is order off of the sheet. Dim sum is not bad and not expensive either.

We also frequent the dinner scene. Their set dinner menu is not bad. We typically order a 6 dish menu and it's big enough to feed a family of 9 with leftovers. 

Overall, food is pretty tasty!



In [35]:
# List the sentences spaCy detected in the review, numbered from 1.
for sentence_number, sentence in enumerate(parsed_review.sents, start=1):
    print('Sentence {}:'.format(sentence_number))
    print(sentence)
    print('')

Sentence 1:
This is currently my parents new favourite restaurant. 



Sentence 2:
We come here in the morning for dim sum.

Sentence 3:
They are not the cart pushing type of dim sum, it is order off of the sheet.

Sentence 4:
Dim sum is not bad and not expensive either.



Sentence 5:
We also frequent the dinner scene.

Sentence 6:
Their set dinner menu is not bad.

Sentence 7:
We typically order a 6 dish menu and it's big enough to feed a family of 9 with leftovers. 



Sentence 8:
Overall, food is pretty tasty!




In [36]:
# List the named entities spaCy detected, numbered from 1, with their labels.
for entity_number, entity in enumerate(parsed_review.ents, start=1):
    print('Entity {}:'.format(entity_number), entity, '-', entity.label_)
    print('')

Entity 1: 6 - CARDINAL

Entity 2: 9 - CARDINAL

Entity 3: 
 - GPE



In [41]:
token_text = [token.orth_ for token in parsed_review]
token_pos = [token.pos_ for token in parsed_review]

# one row per token: its surface text next to its part-of-speech tag
pos_rows = list(zip(token_text, token_pos))
pd.DataFrame(pos_rows, columns=['token_text', 'part_of_speech'])

Unnamed: 0,token_text,part_of_speech
0,This,DET
1,is,VERB
2,currently,ADV
3,my,ADJ
4,parents,NOUN
5,new,ADJ
6,favourite,ADJ
7,restaurant,NOUN
8,.,PUNCT
9,\n\n,SPACE


In [42]:
token_lemma = [token.lemma_ for token in parsed_review]
token_shape = [token.shape_ for token in parsed_review]

# one row per token: surface text (collected in the earlier cell), lemma, shape
lemma_shape_rows = list(zip(token_text, token_lemma, token_shape))
pd.DataFrame(lemma_shape_rows,
             columns=['token_text', 'token_lemma', 'token_shape'])

Unnamed: 0,token_text,token_lemma,token_shape
0,This,this,Xxxx
1,is,be,xx
2,currently,currently,xxxx
3,my,-PRON-,xx
4,parents,parent,xxxx
5,new,new,xxx
6,favourite,favourite,xxxx
7,restaurant,restaurant,xxxx
8,.,.,.
9,\n\n,\n\n,\n\n


In [43]:
token_entity_type = [token.ent_type_ for token in parsed_review]
token_entity_iob = [token.ent_iob_ for token in parsed_review]

# one row per token: surface text (collected in the earlier cell), entity
# type, and the token's inside/outside/begin tag
entity_rows = list(zip(token_text, token_entity_type, token_entity_iob))
pd.DataFrame(entity_rows,
             columns=['token_text', 'entity_type', 'inside_outside_begin'])

Unnamed: 0,token_text,entity_type,inside_outside_begin
0,This,,O
1,is,,O
2,currently,,O
3,my,,O
4,parents,,O
5,new,,O
6,favourite,,O
7,restaurant,,O
8,.,,O
9,\n\n,,O


In [44]:
# Collect a handful of per-token attributes from the parsed sample review.
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in parsed_review]

# the yes/no columns, in display order
flag_columns = ['stop?',
                'punctuation?',
                'whitespace?',
                'number?',
                'out of vocab.?']

df = pd.DataFrame(token_attributes,
                  columns=['text', 'log_probability'] + flag_columns)

# Show the flags as 'Yes' / '' for readability. Each column is replaced
# outright instead of being written in place through .loc, so the change
# from booleans to strings does not rely on implicit dtype upcasting, and
# Series.map is used because DataFrame.applymap is deprecated in recent
# pandas releases.
for column in flag_columns:
    df[column] = df[column].map(lambda flag: u'Yes' if flag else u'')

df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,This,-20.0,,,,,Yes
1,is,-20.0,Yes,,,,Yes
2,currently,-20.0,,,,,Yes
3,my,-20.0,Yes,,,,Yes
4,parents,-20.0,,,,,Yes
5,new,-20.0,,,,,Yes
6,favourite,-20.0,,,,,Yes
7,restaurant,-20.0,,,,,Yes
8,.,-20.0,,Yes,,,Yes
9,\n\n,-20.0,,,Yes,,Yes


In [45]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence



In [46]:
def punct_space(token):
    """
    report whether a token is flagged as punctuation or as whitespace,
    so that callers can drop such tokens
    """

    is_punctuation = token.is_punct
    return is_punctuation if is_punctuation else token.is_space

def line_review(filename):
    """
    generator function that streams reviews from the file one line
    at a time, turning each escaped '\\n' sequence back into a real
    line break
    """

    with codecs.open(filename, encoding='utf_8') as review_file:
        yield from (line.replace('\\n', '\n') for line in review_file)
            
def lemmatized_sentence_corpus(filename):
    """
    generator function that parses every review in the file with
    spaCy and yields one string per sentence: the sentence's lemmas
    joined by single spaces, with punctuation and whitespace tokens
    left out
    """

    parsed_reviews = nlp.pipe(line_review(filename),
                              batch_size=10000, n_threads=4)

    for parsed_review in parsed_reviews:
        for sent in parsed_review.sents:
            lemmas = [token.lemma_ for token in sent
                      if not punct_space(token)]
            yield u' '.join(lemmas)

In [47]:
# file that receives one lemmatized sentence per line
unigram_sentences_filepath = os.path.join(
    intermediate_directory, 'unigram_sentences_all.txt')

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if False:

    # stream every lemmatized sentence to disk, one sentence per line
    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as sentences_file:
        for lemmatized_sentence in lemmatized_sentence_corpus(review_txt_filepath):
            sentences_file.write(lemmatized_sentence + '\n')

In [None]:
# Stream the lemmatized sentences back from disk. LineSentence reads the
# file lazily and yields each line as a list of whitespace-separated tokens.
# This name was previously used without ever being defined in the notebook.
unigram_sentences = LineSentence(unigram_sentences_filepath)

# show a small slice of the corpus (sentences 230 through 239)
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print (u' '.join(unigram_sentence))
    print (u'')

In [None]:
# Path under the intermediate directory where the trained bigram Phrases model is saved.
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if True:

    # learn the phrase model from the unigram sentences and persist it
    Phrases(unigram_sentences).save(bigram_model_filepath)

# always work with the copy stored on disk
bigram_model = Phrases.load(bigram_model_filepath)