# Yelp Data Challenge - NLP (Classification)

BitTiger DS501

Mar 2017

In [45]:
import pandas as pd
import numpy as np

In [46]:
df = pd.read_csv('last_2yr_restaurant_reviews.csv')

In [47]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,type,useful,user_id,count
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2015-06-26,0,nCqdz-NW64KazpxqnDr0sQ,1,I mainly went for the ceasar salad prepared ta...,review,0,0XVzm4kVIAaH4eQAxWbhvw,1
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2015-06-29,0,iwx6s6yQxc7yjS7NFANZig,4,Nice atmosphere and wonderful service. I had t...,review,0,2aeNFntqY2QDZLADNo8iQQ,1
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2015-04-05,0,2HrBENXZTiitcCJfzkELgA,2,To be honest it really quit aweful. First the ...,review,0,WFhv5pMJRDPWSyLnKiWFXA,1
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2016-02-16,0,6YNPXoq41qTMZ2TEi0BYUA,2,"The food was decent, but the service was defin...",review,0,2S6gWE-K3DHNcKYYSgN7xA,1
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2016-02-08,1,4bQrVUiRZ642odcKCS0OhQ,2,If you're looking for craptastic service and m...,review,1,rCTVWx_Tws2jWi-K89iEyw,1


### Define your feature variables, here is the text of the review

In [48]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents=df['text'].values

In [49]:
# inspect your documents, e.g. check the size, take a peek at elements of the numpy array
documents[0:3]

array([ 'I mainly went for the ceasar salad prepared tableside.  I ate in the bar, the bartender was very nice and helpful.  I got the grilled cheese with tomato soup.  Grilled cheese was very good but the soup was nothing special.  Now the salad that i read one reviewer said the best in vegas, which is the only reason i came.  Knowing that they put anchovies in it when they prepare tableside, i was going to tell them to hold off on that once they get started.  So as im waiting for them to come up and make it, they bring it already prepared.  What is that?  The whole point of getting it is to watch it being done and see that its made fresh.  So obviously the anchovies were already in it, and since i explained i didnt want them, they made another.   I was told its a fire hazard to prepare it in the bar area so they made it on the side when i wasnt looking.  The few bites i took werent that good.  So i watch them make the 2nd salad in the hallway.  Needless to say, it was totally flavorl

### Define your target variable (any categorical variable that may be meaningful)

#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) rating

In [50]:
# Make a column and take the values, save to a variable named "target"
df['perfectscore']=(df['stars']>4)
df['perfectscore']=df['perfectscore'] * 1
target=df['perfectscore'].values
target

array([0, 0, 0, ..., 1, 1, 0])

#### You may want to look at the statistic of the target variable

In [51]:
# To be implemented
target[0:10]

array([0, 0, 0, 0, 0, 1, 1, 1, 1, 0])

## Let's create training dataset and test dataset

In [52]:
from sklearn.cross_validation import train_test_split

In [53]:
# Documents is your X, target is your y
# Now split the data to training set and test set

In [54]:
# Split to documents_train, documents_test, target_train, target_test
documents_train, documents_test, target_train, target_test = train_test_split(documents, target, test_size=0.05, random_state=42)


## Let's get NLP representation of the documents

In [74]:
documents_train.shape
target_train.shape

(330238,)

In [55]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [65]:
# Create TfidfVectorizer, and name it vectorizer
vectorizer = TfidfVectorizer(stop_words='english', max_features=200)
vectorizer

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=200, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [94]:
# Train the model with your training data
train_vectors=vectorizer.fit_transform(documents_train).toarray()

In [68]:
train_vectors.shape

(330238, 200)

In [70]:
# Get the vocab of your tfidf
vocab=vectorizer.get_feature_names()
vocab

[u'10',
 u'20',
 u'actually',
 u'amazing',
 u'area',
 u'ask',
 u'asked',
 u'atmosphere',
 u'attentive',
 u'away',
 u'awesome',
 u'bacon',
 u'bad',
 u'bar',
 u'beef',
 u'best',
 u'better',
 u'big',
 u'bit',
 u'bread',
 u'breakfast',
 u'buffet',
 u'burger',
 u'busy',
 u'came',
 u'check',
 u'cheese',
 u'chicken',
 u'clean',
 u'coffee',
 u'come',
 u'coming',
 u'cooked',
 u'crab',
 u'cream',
 u'customer',
 u'day',
 u'decided',
 u'definitely',
 u'delicious',
 u'dessert',
 u'did',
 u'didn',
 u'different',
 u'dining',
 u'dinner',
 u'disappointed',
 u'dish',
 u'dishes',
 u'don',
 u'drink',
 u'drinks',
 u'eat',
 u'eating',
 u'enjoyed',
 u'excellent',
 u'experience',
 u'family',
 u'far',
 u'fast',
 u'favorite',
 u'feel',
 u'fish',
 u'flavor',
 u'food',
 u'free',
 u'fresh',
 u'fried',
 u'friend',
 u'friendly',
 u'friends',
 u'fries',
 u'gave',
 u'going',
 u'good',
 u'got',
 u'great',
 u'half',
 u'happy',
 u'high',
 u'highly',
 u'home',
 u'hot',
 u'hour',
 u'house',
 u'huge',
 u'husband',
 u'inside

In [76]:
# Use the trained model to transform your test data
y_test=target_test
x_test=vectorizer.transform(documents_test)

In [78]:
x_test.shape

(17381, 200)

## Similar review search engine

In [87]:
import numpy as np

# We will need these helper methods pretty soon

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the highest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    return [labels[i] for i in np.argsort(lst)[::-1][:n]]  # np.argsort by default sorts values in ascending order
                                                            #[::-1]is descending order

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Given a list of values, find the indices with the lowest n values.
    Return the labels for each of these indices.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["mouse", "rabbit"]
    '''
    return [labels[i] for i in np.argsort(lst)[:n]]


In [88]:
lst = [7, 3, 2, 4, 1]
n = 2
labels = ["cat", "dog", "mouse", "pig", "rabbit"]
get_bottom_values(lst, n, labels)

['rabbit', 'mouse']

In [89]:
# Let's use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [124]:
# Draw an arbitrary review from test (unseen in training) documents
arbreview=documents_test[42]
queried_rev=[arbreview]

In [131]:
# Transform the drawn review(s) to vector(s)
vector_rev=vectorizer.transform(queried_rev).toarray()
vector_rev.shape

(1, 200)

In [132]:
# Calculate the similarity score(s) between vector(s) and training vectors
cossimilarity=cosine_similarity(vector_rev , train_vectors)

In [133]:
cossimilarity

array([[ 0.        ,  0.07793682,  0.        , ...,  0.07240568,
         0.        ,  0.        ]])

In [134]:
# Let's find top 5 similar reviews
n = 5
lst=cossimilarity[0]
labels=documents_train
top_5=get_top_values(lst, n, labels)

In [135]:
print 'Our search query:'
print  queried_rev

Our search query:
['Very tasty food! Atmosphere on the patio is amazing. Service was fantastic. Nothing we requested was forgotten. More bread, done! More wine, done! Food was delivered exactly as ordered. We were very impressed with our first visit to Salute. Have to give a shout out to our server, Kevin. Thanks for being so  good at your job! ;)']


In [146]:
print 'Most %s similar reviews:' % n
for i, review in enumerate(top_5):
    print "Top_%s:"%(i+1)
    print review

Most 5 similar reviews:
Top_1:
The service was great and the food was definately above average but the two Kale salads that were ordered literally had no Kale in them so the server had to get 2 bowls of Kale to add. Lots of good beers on tap and a fun atmosphere.
Top_2:
Great food, atmosphere and service. 

Competitive pricing and food very tasty!
Top_3:
Good authentic Mexican food with a fun atmosphere. great service from our server Tillana too !
Top_4:
The service is supreme. And the food divine. The pumpkin bread is amazing and a delight.
Top_5:
Very tasty food. Owner is Italiano & she has a great sense of humor. The food was outstanding. My first visit and I was completely impressed. Mangia Mangia


#### Q: Does the result make sense to you?

A: (insert your comments here)

## Classifying positive/negative review

#### Naive-Bayes Classifier

In [147]:
# Build a Naive-Bayes Classifier

from sklearn.naive_bayes import MultinomialNB
model=MultinomialNB()
model.fit(train_vectors,target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [148]:
# Get score for training set
model.score(train_vectors,target_train)

0.75585486830709969

In [149]:
# Get score for test set
model.score(x_test, y_test)

0.75835682641965363

#### Logistic Regression Classifier

In [150]:
# Build a Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(train_vectors,target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [151]:
# Get score for training set
model.score(train_vectors,target_train)

0.77437181668978128

In [152]:
# Get score for test set
model.score(x_test, y_test)

0.77400609861342851

In [154]:
model.coef_.shape


(1, 200)

#### Q: What are the key features(words) that make the positive prediction?

In [158]:
# Let's find it out by ranking
n = 20
get_top_values(model.coef_[0][model.coef_[0]>0], n, vocab)

[u'20',
 u'attentive',
 u'ask',
 u'bread',
 u'favorite',
 u'different',
 u'clean',
 u'did',
 u'crab',
 u'eating',
 u'enjoyed',
 u'didn',
 u'bit',
 u'know',
 u'delicious',
 u'day',
 u'come',
 u'inside',
 u'happy',
 u'area']

A: (insert your comments here)

#### Q: What are the key features(words) that make the negative prediction?

In [159]:
# Let's find it out by ranking
n = 20
get_top_values(model.coef_[0][model.coef_[0]<0], n, vocab)

[u'friendly',
 u'eating',
 u'friend',
 u'don',
 u'hour',
 u'friends',
 u'big',
 u'bit',
 u'decided',
 u'll',
 u'beef',
 u'didn',
 u'love',
 u'busy',
 u'family',
 u'fast',
 u'coming',
 u'enjoyed',
 u'highly',
 u'cream']

A: (insert your comments here)

#### Random Forest Classifier

In [160]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier
model=RandomForestClassifier()
model.fit(train_vectors,target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [161]:
# Get score for training set
model.score(train_vectors,target_train)

0.98497750107498228

In [162]:
# Get score for test set
model.score(x_test, y_test)

0.73948564524480753

#### Q: What do you see from the training score and the test score?

A: (insert your comments here)

In [164]:
model.feature_importances_

array([ 0.00230931,  0.00165563,  0.00196741,  0.04325571,  0.0025802 ,
        0.00196957,  0.00773813,  0.00389005,  0.00338643,  0.00217985,
        0.01896679,  0.00192376,  0.00902757,  0.00374597,  0.00297405,
        0.03675618,  0.0085297 ,  0.00298646,  0.00564775,  0.00258422,
        0.00350173,  0.00301431,  0.00356806,  0.00257577,  0.00495175,
        0.00210423,  0.00384645,  0.00600469,  0.0031197 ,  0.00228699,
        0.00609589,  0.003561  ,  0.00267077,  0.0017848 ,  0.00204217,
        0.00246356,  0.00327188,  0.00182417,  0.01213006,  0.02558394,
        0.00272921,  0.00375353,  0.00988651,  0.00271709,  0.00192361,
        0.00384839,  0.00266671,  0.00221089,  0.00253114,  0.00607826,
        0.00252514,  0.00353612,  0.00528039,  0.00222228,  0.00302327,
        0.01083605,  0.00460564,  0.00329241,  0.00209532,  0.0036564 ,
        0.01066111,  0.00224618,  0.00217429,  0.00329014,  0.01621671,
        0.00255354,  0.00728046,  0.00327906,  0.00202719,  0.01

#### Q: Can you tell what features (words) are important by inspecting the RFC model?

In [165]:
n = 20
get_top_values(model.feature_importances_, n, vocab)

[u'amazing',
 u'great',
 u'best',
 u'delicious',
 u'love',
 u'awesome',
 u'good',
 u'food',
 u'vegas',
 u'ok',
 u'place',
 u'service',
 u'definitely',
 u'just',
 u'friendly',
 u'excellent',
 u'favorite',
 u'pretty',
 u'didn',
 u'like']

## Extra Credit #1: Use cross validation to evaluate your classifiers

[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)

In [None]:
# To be implemented
pass

## Extra Credit #2: Use grid search to find best predictable classifier


[sklearn grid search tutorial (with cross validation)](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)

[sklearn grid search documentation (with cross validation)](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)

In [None]:
# To be implemented
pass