#Working with Yelp Review Text with nltk & manual data manipulation

In [22]:
import pandas as pd
import sklearn as sk
import numpy as np
import time

In [23]:
#Read in review data

## Commented this out; created a smaller csv to work with because my computer can't handle 
## reviews and business data simultaneously.

## To get the full dataset, uncomment the below: 
#rvws = pd.read_csv("data\yelp_academic_dataset_review.csv")

## Below csv was created previously using the first 5,000 reviews. Once this is a working model we can expand it to a fuller dataset.
#rvws = reviews.iloc[:5000,:]
#rvws.to_csv("yelp_dataset_5k_review.csv")
rvws = pd.read_csv("data\yelp_dataset_5k_review.csv")

In [25]:
## To make things easier when trying to analyze the text, let's just look at restaurants. For that, we'll have to join the review
## and business data. I'm also puling the city and state so I can add location later on if I desire to.
headers = ['categories','business_id','city','state']
business = pd.read_csv("data\yelp_academic_dataset_business.csv", usecols = headers)


In [26]:
## Now that we have the data joined with the business information, we can narrow the dataset to just restaurants so that
## the language we are looking for is more consistent. 
## (e.g. a good doctor review will have differnt language than a good restaurant review)

reviews = pd.merge(rvws, business, on = 'business_id')
cat = reviews['categories']
rest = []

for x in range(len(cat)):
    if "Restaurants" in cat[x]:
        rest.append(1)
    else:
        rest.append(0)

reviews['restaurant'] = rest

## This will modify our reviews DataFrame to only include business that fall into the "restaurants" category.
reviews = reviews[reviews['restaurant'] == 1]
reviews = reviews.reset_index(drop=True)

In [35]:
text = reviews['text']
stars = reviews['stars']

In [36]:
text_length = []
for x in range(len(reviews['text'])):
    text_length.append(len(text[x]))

reviews['text_length'] = text_length
reviews.head()
#reviews['text_length'] = len(reviews['text'])
#reviews

Unnamed: 0.1,Unnamed: 0,user_id,review_id,text,votes.cool,business_id,votes.funny,stars,date,type,votes.useful,categories,state,city,restaurant,text_length
0,22,LWbYpcangjBMm4KPxZGOKg,6w6gMZ3iBLGcUM4RBIuifQ,This place was DELICIOUS!! My parents saw a r...,0,mVHrayjG3uZ_RLHkLj-AMg,0,5,2012-12-01,review,5,"[u'Bars', u'American (New)', u'Nightlife', u'L...",PA,Braddock,1,1038
1,23,m1FpV3EAeggaAdfPx0hBRQ,jVVv_DA5mCDB6mediuwHAw,Can't miss stop for the best Fish Sandwich in ...,0,mVHrayjG3uZ_RLHkLj-AMg,0,5,2013-03-15,review,0,"[u'Bars', u'American (New)', u'Nightlife', u'L...",PA,Braddock,1,57
2,24,8fApIAMHn2MZJFUiCQto5Q,3Es8GsjkssusYgeU6_ZVpQ,This place should have a lot more reviews - bu...,1,mVHrayjG3uZ_RLHkLj-AMg,0,5,2013-03-30,review,2,"[u'Bars', u'American (New)', u'Nightlife', u'L...",PA,Braddock,1,1216
3,25,uK8tzraOp4M5u3uYrqIBXg,KAkcn7oQP1xX8KsZ-XmktA,This place was very good. I found out about Em...,0,mVHrayjG3uZ_RLHkLj-AMg,0,4,2013-10-20,review,1,"[u'Bars', u'American (New)', u'Nightlife', u'L...",PA,Braddock,1,401
4,26,6wvlM5L4_EroGXbnb_92xQ,BZNJkkP0bXnwQ2-sCqat2Q,"Old school.....traditional ""mom 'n pop"" qualit...",0,mVHrayjG3uZ_RLHkLj-AMg,0,5,2013-11-07,review,0,"[u'Bars', u'American (New)', u'Nightlife', u'L...",PA,Braddock,1,217


In [78]:
## Downloads from here: http://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html

positives = open('data/positive-words.txt').read()
negatives = open('data/negative-words.txt').read()
pos_word_list = positives.split('\n')
neg_word_list = negatives.split('\n')

In [112]:
import re
from nltk.corpus import stopwords

def review_to_wordlist(review, remove_stopwords=True):
    # Function to convert a document to a sequence of words,
    # optionally removing stop words.  Returns a list of words.

    review_text = review   
    
    # 1. Remove non-letters
    review_text = re.sub("[^a-zA-Z]"," ", review_text)

    # 2. Convert words to lower case and split them
    words = review_text.lower().split()

    # 4. Optionally remove stop words (false by default)
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        words = [w for w in words if not w in stops]

    # 5. Return a list of words
    return(words)

## Create function to output list of words in given list from a sentence.
def word_count(sentence, word_list):
    count = 0
    for x in review_to_wordlist(sentence):
        if x in word_list:
            count += 1
    
    return count

In [113]:
test = text[0]
print word_count(test, pos_word_list)
print word_count(test, neg_word_list)

14
1


In [114]:
pos_word_count = []
neg_word_count = []
pos_neg_ratio = []
for x in text:
    positive = word_count(x, pos_word_list)
    negative = word_count(x, neg_word_list)
    if (negative > 0):
        ratio = float(positive)/negative
    else:
        ratio = positive
    pos_word_count.append(positive)
    neg_word_count.append(negative)
    pos_neg_ratio.append(ratio)

reviews['pos_word_count'] = pos_word_count
reviews['neg_word_count'] = neg_word_count
reviews['pos_neg_ratio'] = pos_neg_ratio

In [115]:
feature_list = ['text_length','pos_word_count','neg_word_count','pos_neg_ratio']
features = reviews[feature_list]
stars = reviews[['stars']]

In [116]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import cross_validation

x_train, x_test, y_train, y_test = cross_validation.train_test_split(features, stars, test_size=0.3, random_state=12)
tree = DecisionTreeClassifier()
tree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            random_state=None, splitter='best')

In [117]:
pred = tree.predict(x_test)
#scores = cross_validation.cross_val_score(tree, x_test, )
tree.score(x_test, y_test)

0.28902316213494461