#Working with Yelp Review Text using NLTK

In [2]:
import pandas as pd
import numpy as np
import time

In [139]:
#Read in review data

## Commented this out; created a smaller csv to work with because my computer can't handle 
## reviews and business data simultaneously.

## To get the full dataset, uncomment the below: 
#rvws = pd.read_csv("data\yelp_academic_dataset_review.csv")

## Below csv was created previously using the first 5,000 reviews. Once this is a working model we can expand it to a fuller dataset.
#rvws = reviews.iloc[:5000,:]
#rvws.to_csv("yelp_dataset_5k_review.csv")
rvws = pd.read_csv("data\yelp_dataset_5k_review.csv")

In [140]:
## To make things easier when trying to analyze the text, let's just look at restaurants. For that, we'll have to join the review
## and business data. I'm also puling the city and state so I can add location later on if I desire to.

headers = ['categories','business_id','city','state']
business = pd.read_csv("data\yelp_academic_dataset_business.csv", usecols = headers)

In [141]:
## Now that we have the data joined with the business information, we can narrow the dataset to just restaurants so that
## the language we are looking for is more consistent. 
## (e.g. a good doctor review will have differnt language than a good restaurant review)

reviews = pd.merge(rvws, business, on = 'business_id')
cat = reviews['categories']
rest = []

for x in range(len(cat)):
    if "Restaurants" in cat[x]:
        rest.append(1)
    elif "Food" in cat[x]:
        rest.append(1)
    else:
        rest.append(0)

reviews['restaurant'] = rest

## This will modify our reviews DataFrame to only include business that fall into the "restaurants" category.

reviews = reviews[reviews['restaurant'] == 1]
reviews = reviews.reset_index(drop=True)

In [144]:
text = reviews['text']
stars = reviews['stars']
len(text)

3735

In [43]:
## Let's create some data based on the text so we can run some regressions on characteristics of the text.

text_length = []
for x in range(len(reviews['text'])):
    text_length.append(len(text[x]))

reviews['text_length'] = text_length
reviews.head()
#reviews['text_length'] = len(reviews['text'])
#reviews

Unnamed: 0.1,Unnamed: 0,user_id,review_id,text,votes.cool,business_id,votes.funny,stars,date,type,votes.useful,categories,state,city,restaurant,text_length
0,22,LWbYpcangjBMm4KPxZGOKg,6w6gMZ3iBLGcUM4RBIuifQ,This place was DELICIOUS!! My parents saw a r...,0,mVHrayjG3uZ_RLHkLj-AMg,0,5,2012-12-01,review,5,"[u'Bars', u'American (New)', u'Nightlife', u'L...",PA,Braddock,1,1038
1,23,m1FpV3EAeggaAdfPx0hBRQ,jVVv_DA5mCDB6mediuwHAw,Can't miss stop for the best Fish Sandwich in ...,0,mVHrayjG3uZ_RLHkLj-AMg,0,5,2013-03-15,review,0,"[u'Bars', u'American (New)', u'Nightlife', u'L...",PA,Braddock,1,57
2,24,8fApIAMHn2MZJFUiCQto5Q,3Es8GsjkssusYgeU6_ZVpQ,This place should have a lot more reviews - bu...,1,mVHrayjG3uZ_RLHkLj-AMg,0,5,2013-03-30,review,2,"[u'Bars', u'American (New)', u'Nightlife', u'L...",PA,Braddock,1,1216
3,25,uK8tzraOp4M5u3uYrqIBXg,KAkcn7oQP1xX8KsZ-XmktA,This place was very good. I found out about Em...,0,mVHrayjG3uZ_RLHkLj-AMg,0,4,2013-10-20,review,1,"[u'Bars', u'American (New)', u'Nightlife', u'L...",PA,Braddock,1,401
4,26,6wvlM5L4_EroGXbnb_92xQ,BZNJkkP0bXnwQ2-sCqat2Q,"Old school.....traditional ""mom 'n pop"" qualit...",0,mVHrayjG3uZ_RLHkLj-AMg,0,5,2013-11-07,review,0,"[u'Bars', u'American (New)', u'Nightlife', u'L...",PA,Braddock,1,217


In [54]:
## Downloads from here: http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html
## Grab a generic words of positive and negative words; we are going to use these to get a sentiment score out of the text.

positives = open('data/positive-words.txt').read()
negatives = open('data/negative-words.txt').read()
pos_word_list = positives.split('\n')
neg_word_list = negatives.split('\n')

In [55]:
## Create function to output list of words in given list from a sentence.

def review_to_wordlist(review, remove_stopwords=False):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.

    review_text = review   
    
    # 1. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)

    # 2. Convert words to lower case and split them
    words = review_text.lower().split()

    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    # 5. Return a list of words
    return(words)

def word_count(sentence, word_list):
    count = 0
    for x in review_to_wordlist(sentence):
        if x in word_list:
            count += 1
    
    return count

In [46]:
## In an initial run of this test, two separate values were created, which counted the positive and negative words in the corpus.
## Upon further review, this was deleted and replace with a sentiment score which was 1 for all positive words
## and -1 for all negative words

#pos_word_count = []
#neg_word_count = []
#pos_neg_ratio = []
#for x in text:
#    positive = word_count(x, pos_word_list)
#    negative = word_count(x, neg_word_list)
#    if (negative > 0):
#        ratio = float(positive)/negative
#    else:
#        ratio = positive
#    
#    if ((positive + negative) == 0):
#        sent = 0
#    else:
#        sent = ((positive-negative)/(float(positive)+negative))
#        
#    pos_word_count.append(positive)
#    neg_word_count.append(negative)
#    pos_neg_ratio.append(ratio)

#reviews['pos_word_count'] = pos_word_count
#reviews['neg_word_count'] = neg_word_count
#reviews['sentiment'] = sentiment
#reviews['pos_neg_ratio'] = pos_neg_ratio

In [56]:
## Use positive and negative words to give each piece of a text a sentiment score

sentiment = []
pos_neg_ratio = []
count = 0
for x in text:
    positive = word_count(x, pos_word_list)
    negative = word_count(x, neg_word_list)
    if (negative > 0):
        ratio = float(positive)/negative
    else:
        ratio = positive
    
    if ((positive + negative) == 0):
        sent = 1
    else:
        sent = ((positive-negative)/(float(positive)+negative))+1
        
    sentiment.append(sent)
    pos_neg_ratio.append(ratio)
    if count % 1000 == 0:
        print "Scoring text #{}".format(count)
    count += 1

reviews['sentiment'] = sentiment
reviews['pos_neg_ratio'] = pos_neg_ratio

In [57]:
## Features used for initial pass: text length, ratio of positive to negative words, and sentiment.

feature_list = ['text_length', 'pos_neg_ratio', 'sentiment']
features = reviews[feature_list]
stars = reviews[['stars']]

In [58]:
## Test using Various models. Set up the train test split first, then fit various models and see how they perform.

from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_score

In [59]:
## Fit various models

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

x_train, x_test, y_train, y_test = cross_validation.train_test_split(features, stars, test_size=0.3, random_state=12)
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
mnbayes = MultinomialNB()

tree.fit(x_train, y_train)
forest.fit(x_train, y_train)
mnbayes.fit(x_train, y_train)

print "Decision Tree score: {}".format(tree.score(x_test, y_test))
print "Random Forest score: {}".format(forest.score(x_test, y_test))
print "Multinomial Naive Bayes score: {}".format(mnbayes.score(x_test, y_test))

  y = column_or_1d(y, warn=True)


Decision Tree score: 0.321141837645
Random Forest score: 0.316681534344
Multinomial Naive Bayes score: 0.351471900089


In [60]:
## Before trying to move on to a new method, let's make this a binary predictor by
## separating reviews into "good" (4-5 stars) and "bad" (1-3 stars)

# Playing with different star thresholds for what we consider a "good" or "bad" review.
star_threshold = 4

reviews['good'] = (reviews['stars'] >= star_threshold)
good = reviews['good']
good = good.map({True: 1, False: 0})

In [61]:
x_train, x_test, y_train, y_test = cross_validation.train_test_split(features, good, test_size=0.3, random_state=12)
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
mnbayes = MultinomialNB()

tree.fit(x_train, y_train)
forest.fit(x_train, y_train)
mnbayes.fit(x_train, y_train)

print "Decision Tree score: {}".format(tree.score(x_test, y_test))
print "Random Forest score: {}".format(forest.score(x_test, y_test))
print "Multinomial Naive Bayes score: {} \n".format(mnbayes.score(x_test, y_test))

print "Decision Tree AUC: {}".format(cross_val_score(tree, features, good, cv=10, scoring='roc_auc').mean())
print "Random Forest AUC: {}".format(cross_val_score(forest, features, good, cv=10, scoring='roc_auc').mean())
print "Multinomial Naive Bayes AUC: {}".format(cross_val_score(mnbayes, features, good, cv=10, scoring='roc_auc').mean())

Decision Tree score: 0.648528099911
Random Forest score: 0.680642283675
Multinomial Naive Bayes score: 0.696699375558 

Decision Tree AUC: 0.637084747433
Random Forest AUC: 0.699280111998
Multinomial Naive Bayes AUC: 0.727446155714


##Let's manipulate the data a little more, then vectorize the text and fit it

In [124]:
text = reviews['text']
text = text[:10]
text

0    [This, place, DELICIOUS, My, parents, saw, rec...
1    [Can, miss, stop, best, Fish, Sandwich, Pittsb...
2    [This, place, lot, reviews, I, m, glad, doesn,...
3    [This, place, good, I, found, Emil, watching, ...
4    [Old, school, traditional, mom, n, pop, qualit...
5    [Seen, restaurant, best, places, Pittsburgh, R...
6    [Wonderful, reuben, Map, shown, Yelp, page, in...
7                               [Good, fish, sandwich]
8    [After, morning, Thrift, Store, hunting, frien...
9    [This, hidden, gem, really, It, took, us, fore...
Name: text, dtype: object

In [145]:
## Let's work with the text before vectorizing. We're going to get rid of stop words and non-letters.

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

stop_words = set(stopwords.words("english"))
count = 0
text_clean = text

for x in range(len(text_clean)):
    count +=1
    if count % 500 == 0:
        print "Cleaning text #{}...".format(count)
    w = text[x]
    w = re.sub("[^a-zA-Z]"," ", w)
    w = word_tokenize(w)
    w = [z for z in w if not z in stop_words]
    text_clean[x] = w

text_clean = [' , '.join(z).strip() for z in text]

Cleaning text #500...
Cleaning text #1000...
Cleaning text #1500...
Cleaning text #2000...
Cleaning text #2500...
Cleaning text #3000...
Cleaning text #3500...


A value is trying to be set on a copy of a slice from a DataFrame

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [96]:
## Lemmatize the text... eventually we want to insert this into the box above so that it loops through the list
## only once and works through the text in one pass.

#from nltk.stem import WordNetLemmatizer
#
#lemmatizer=WordNetLemmatizer()
#
#for w in text:
#    w = lemmatizer.lemmatize(w)

#traindf['ingredients_clean_string'] = [' , '.join(z).strip() for z in traindf['ingredients']]  

SyntaxError: invalid syntax (<ipython-input-96-aafa76c33039>, line 13)

In [148]:
## Vectorize the text with sklearn's CountVectorizer.

from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(decode_error = 'ignore')
vect.fit(text_clean)
text_dtm = vect.transform(text_clean)
text_array = text_dtm.toarray()
text_df = pd.DataFrame(text_array, columns = vect.get_feature_names())

In [149]:
## Initial test by running the test on the vectorization of the text.

x_train, x_test, y_train, y_test = cross_validation.train_test_split(text_array, stars, test_size=0.3, random_state=12)
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
mnbayes = MultinomialNB()

tree.fit(x_train, y_train)
forest.fit(x_train, y_train)
mnbayes.fit(x_train, y_train)

print "Decision Tree score: {}".format(tree.score(x_test, y_test))
print "Random Forest score: {}".format(forest.score(x_test, y_test))
print "Multinomial Naive Bayes score: {}".format(mnbayes.score(x_test, y_test))

Decision Tree score: 0.363068688671
Random Forest score: 0.348795718109
Multinomial Naive Bayes score: 0.448706512043


Per the above results, this performed slightly better than the model generated with the sentiment score, positive/negative ratio and length of text. Multinomial Naive Bayes performed the best so far as well.

In [150]:
x_train, x_test, y_train, y_test = cross_validation.train_test_split(text_array, good, test_size=0.3, random_state=12)
tree = DecisionTreeClassifier()
forest = RandomForestClassifier()
mnbayes = MultinomialNB()

tree.fit(x_train, y_train)
forest.fit(x_train, y_train)
mnbayes.fit(x_train, y_train)

print "Decision Tree score: {}".format(tree.score(x_test, y_test))
print "Random Forest score: {}".format(forest.score(x_test, y_test))
print "Multinomial Naive Bayes score: {} \n".format(mnbayes.score(x_test, y_test))

print "Decision Tree AUC: {}".format(cross_val_score(tree, text_array, good, cv=10, scoring='roc_auc').mean())
print "Random Forest AUC: {}".format(cross_val_score(forest, text_array, good, cv=10, scoring='roc_auc').mean())
print "Multinomial Naive Bayes AUC: {}".format(cross_val_score(mnbayes, text_array, good, cv=10, scoring='roc_auc').mean())

Decision Tree score: 0.693131132917
Random Forest score: 0.743086529884
Multinomial Naive Bayes score: 0.817127564674 

Decision Tree AUC: 0.674537784076
Random Forest AUC: 0.766679499683
Multinomial Naive Bayes AUC: 0.790659426578


### Using the vectorization of the words, we get a respectable AUC for MN Bayes (0.817) when testing for good / bad reviews. Let's continue to try to improve this.