# Parse Yelp dataset 
- Select only comments of business=Restaurants whit size between 100 and 300 chars

In [1]:
import os
import codecs
import json
import itertools as it

data_directory = '/home/ubuntu/data/training/text_mining/dataset'



In [2]:
businesses_filepath = os.path.join(data_directory, 'business.json')

with codecs.open(businesses_filepath, encoding='utf_8') as f:
    first_business_record = f.readline() 

print(first_business_record)

{"business_id": "FYWN1wneV18bWNgQjJ2GNg", "name": "Dental by Design", "neighborhood": "", "address": "4855 E Warner Rd, Ste B9", "city": "Ahwatukee", "state": "AZ", "postal_code": "85044", "latitude": 33.3306902, "longitude": -111.9785992, "stars": 4.0, "review_count": 22, "is_open": 1, "attributes": {"AcceptsInsurance": true, "ByAppointmentOnly": true, "BusinessAcceptsCreditCards": true}, "categories": ["Dentists", "General Dentistry", "Health & Medical", "Oral Surgeons", "Cosmetic Dentists", "Orthodontists"], "hours": {"Friday": "7:30-17:00", "Tuesday": "7:30-17:00", "Thursday": "7:30-17:00", "Wednesday": "7:30-17:00", "Monday": "7:30-17:00"}}



In [3]:
review_json_filepath = os.path.join(data_directory, 'review.json')

with codecs.open(review_json_filepath, encoding='utf_8') as f:
    first_review_record = f.readline()
    
print(first_review_record)

{"review_id":"v0i_UHJMo_hPBq9bxWvW4w","user_id":"bv2nCi5Qv5vroFiqKGopiw","business_id":"0W4lkclzZThpx3V65bVgig","stars":5,"date":"2016-05-28","text":"Love the staff, love the meat, love the place. Prepare for a long line around lunch or dinner hours. \n\nThey ask you how you want you meat, lean or something maybe, I can't remember. Just say you don't want it too fatty. \n\nGet a half sour pickle and a hot pepper. Hand cut french fries too.","useful":0,"funny":0,"cool":0}



In [4]:

# Select the business ids of the restaurants
restaurant_ids = set()
with codecs.open(businesses_filepath, encoding='utf_8') as f:
    for business_json in f:
        business = json.loads(business_json)
        if u'Restaurants' not in business[u'categories']:
            continue
        restaurant_ids.add(business[u'business_id'])

print(len(restaurant_ids), 'restaurants in the dataset.')

54618 restaurants in the dataset.


In [5]:
%%time

review_count = 0

# create & open a new file in write mode
review_txt_filepath = os.path.join(data_directory, 'review_text_all.txt')

with codecs.open(review_txt_filepath, 'w', encoding='utf_8') as review_txt_file:

    # open the existing review json file
    with codecs.open(review_json_filepath, encoding='utf_8') as review_json_file:

        # loop through all reviews in the existing file and convert to dict
        for review_json in review_json_file:
            review = json.loads(review_json)

            # if this review is not about a restaurant, skip to the next one
            if review[u'business_id'] not in restaurant_ids:
                continue

            if len(review[u'text'])>600 or len(review[u'text'])<300:
                continue
                
            # write the restaurant review as a line in the new file
            # escape newline characters in the original review text
            review_txt_file.write(review[u'text'].replace('\n', '\\n') + '\n')
            review_count += 1

print( u'''Text from {:,} restaurant reviews written to the new txt file.'''.format(review_count))
# 1024739    


Text from 974,066 restaurant reviews written to the new txt file.
CPU times: user 2min 47s, sys: 2.32 s, total: 2min 49s
Wall time: 2min 51s


# Spacy NLP process

In [6]:
import spacy

nlp = spacy.load('en')

In [7]:
def punct_space(token):
    """
    helper function to eliminate tokens
    that are pure punctuation or whitespace
    """
    
    return token.is_punct or token.is_space

def line_review(filename):
    """
    generator function to read in reviews from the file
    and un-escape the original line breaks in the text
    """
    
    with codecs.open(filename, encoding='utf_8') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse reviews,
    lemmatize the text, and yield sentences
    """
    
    for parsed_review in nlp.pipe(line_review(filename),
                                  batch_size=10000, n_threads=4):
        
        for sent in parsed_review.sents:
            yield u' '.join([token.lemma_ for token in sent
                             if not punct_space(token)])

In [None]:
%%time

# segmenting the reviews into individual sentences and normalizing the text
unigram_sentences_filepath = os.path.join(data_directory, 'unigram_sentences_all.txt')
with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
    for sentence in lemmatized_sentence_corpus(review_txt_filepath):
        f.write(sentence + '\n')

In [11]:
# Check the content

# LineSentence: iterator over a file with one sentence by line.
from gensim.models.word2vec import LineSentence

unigram_sentences = LineSentence(unigram_sentences_filepath)
for unigram_sentence in it.islice(unigram_sentences, 230, 240):
    print(u' '.join(unigram_sentence), '\n')


-PRON- advice be avoid this place if -PRON- can 

unfortunately -PRON- will probably eat here again as -PRON- be right across the street from -PRON- 

and there be few other choice in the wee hour of the morning 

when -PRON- be look for a nice quiet drink in a clean bar this be always the place -PRON- choose 

this be a place that -PRON- would refer to as a bar for adult as oppose to those place where all those kid hang out 

chance be -PRON- be not go to find a bunch of kid act like -PRON- be have there first beer 

if someone decide to have a shot -PRON- will not have to listen to all there friend yell shot shot shot shot good food clean good service 

a class place 

-PRON- have eat here several time as -PRON- work in the area 

as usual for a food court area in a casino -PRON- be always very loud here 



## Phrase Modeling

In [18]:
%%time

# Learn a phrase model that will link individual words into two-word phrases
from gensim.models import Phrases

bigram_model_filepath = os.path.join(data_directory, 'bigram_model_all')

bigram_model = Phrases(unigram_sentences)
bigram_model.save(bigram_model_filepath)

CPU times: user 3min 13s, sys: 10.4 s, total: 3min 23s
Wall time: 3min 24s


In [19]:
%%time

# With the trained phrase model for word pairs, let's apply it to the review sentences data 
bigram_sentences_filepath = os.path.join(data_directory, 'bigram_sentences_all.txt')

with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
    for unigram_sentence in unigram_sentences:
        bigram_sentence = u' '.join(bigram_model[unigram_sentence])
        f.write(bigram_sentence + '\n')



CPU times: user 9min 3s, sys: 29.7 s, total: 9min 32s
Wall time: 9min 37s


In [20]:
# Explore the results
bigram_sentences = LineSentence(bigram_sentences_filepath)
for bigram_sentence in it.islice(bigram_sentences, 230, 240):
    print(u' '.join(bigram_sentence), '\n')


-PRON- advice be avoid this place if -PRON- can 

unfortunately -PRON- will probably eat here again as -PRON- be right across the street from -PRON- 

and there be few other choice in the wee hour of the morning 

when -PRON- be look for a nice quiet drink in a clean bar this be always the place -PRON- choose 

this be a place that -PRON- would refer to as a bar for adult as oppose to those place where all those kid hang out 

chance be -PRON- be not go to find a bunch of kid act like -PRON- be have there first beer 

if someone decide to have a shot -PRON- will not have to listen to all there friend yell shot shot shot shot good food clean good service 

a class place 

-PRON- have eat here several time as -PRON- work in the area 

as usual for a food court area in a casino -PRON- be always very loud here 



In [21]:
%%time

# Learn a new phrase model over the previous results to obtain trigrams.
trigram_model_filepath = os.path.join(data_directory, 'trigram_model_all')

trigram_model = Phrases(bigram_sentences)
trigram_model.save(trigram_model_filepath)

CPU times: user 3min 11s, sys: 11.7 s, total: 3min 22s
Wall time: 3min 22s


In [22]:
%%time

# Apply over the corpus
trigram_sentences_filepath = os.path.join(data_directory, 'trigram_sentences_all.txt')

with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
    for bigram_sentence in bigram_sentences:
        trigram_sentence = u' '.join(trigram_model[bigram_sentence])
        f.write(trigram_sentence + '\n')



CPU times: user 8min 59s, sys: 29.2 s, total: 9min 28s
Wall time: 9min 32s


In [23]:
# Explore the results
trigram_sentences = LineSentence(trigram_sentences_filepath)
for trigram_sentence in it.islice(trigram_sentences, 230, 240):
    print(u' '.join(trigram_sentence), '\n')


-PRON- advice be avoid this place if -PRON- can 

unfortunately -PRON- will probably eat here again as -PRON- be right across the street from -PRON- 

and there be few other choice in the wee_hour of the morning 

when -PRON- be look for a nice quiet drink in a clean bar this be always the place -PRON- choose 

this be a place that -PRON- would refer to as a bar for adult as_oppose to those place where all those kid hang_out 

chance be -PRON- be not go to find a bunch of kid act like -PRON- be have there first beer 

if someone decide to have a shot -PRON- will not have to listen to all there friend yell shot shot shot shot good food clean good service 

a class place 

-PRON- have eat here several time as -PRON- work in the area 

as usual for a food court area in a casino -PRON- be always very loud here 



In [24]:
# Check results of transformation
print(u'Original:' + u'\n')

for review in it.islice(line_review(review_txt_filepath), 0, 3):
    print(review)

print(u'----' + u'\n')
print(u'Transformed:' + u'\n')

with codecs.open(trigram_sentences_filepath, encoding='utf_8') as f:
    for review in it.islice(f, 0, 14):
        print(review)

Original:

Small unassuming place that changes their menu every so often. Cool decor and vibe inside their 30 seat restaurant. Call for a reservation. 

We had their beef tartar and pork belly to start and a salmon dish and lamb meal for mains. Everything was incredible! I could go on at length about how all the listed ingredients really make their dishes amazing but honestly you just need to go. 

A bit outside of downtown montreal but take the metro out and it's less than a 10 minute walk from the station.

Lester's is located in a beautiful neighborhood and has been there since 1951. They are known for smoked meat which most deli's have but their brisket sandwich is what I come to montreal for. They've got about 12 seats outside to go along with the inside. 

The smoked meat is up there in quality and taste with Schwartz's and you'll find less tourists at Lester's as well.

Love coming here. Yes the place always needs the floor swept but when you give out  peanuts in the shell how w

# Complete pipeline for new texts

In [24]:
%%time

# Final step:
# - pipeline that applies our text normalization and phrase models
# - remove stopwords
# - write the transformed text out to a new file
def apply_transformation(review_txt_filepath_in, trigram_reviews_filepath_out):

    with codecs.open(trigram_reviews_filepath_out, 'w', encoding='utf_8') as f:

        for parsed_review in nlp.pipe(line_review(review_txt_filepath_in), batch_size=10000, n_threads=4):

            # lemmatize the text, removing punctuation and whitespace
            unigram_review = [token.lemma_ for token in parsed_review
                                  if not punct_space(token)]

            # apply the first-order and second-order phrase models
            bigram_review = bigram_model[unigram_review]
            trigram_review = trigram_model[bigram_review]

            # remove any remaining stopwords
            trigram_review = [term for term in trigram_review if term not in spacy.en.STOPWORDS]

            # write the transformed review as a line in the new file
            trigram_review = u' '.join(trigram_review)
            f.write(trigram_review + '\n')
            
            




AttributeError: module 'spacy' has no attribute 'en'