# Yelp Data Challenge - NLP

Bo Shen

Sep 2017

In [2]:
%qtconsole

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('data/last_2_years_restaurant_reviews.csv')

In [3]:
df.head()

Unnamed: 0,business_id,name,categories,avg_stars,cool,date,funny,review_id,stars,text,type,useful,user_id
0,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2015-06-26,0,nCqdz-NW64KazpxqnDr0sQ,1,I mainly went for the ceasar salad prepared ta...,review,0,0XVzm4kVIAaH4eQAxWbhvw
1,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2015-06-29,0,iwx6s6yQxc7yjS7NFANZig,4,Nice atmosphere and wonderful service. I had t...,review,0,2aeNFntqY2QDZLADNo8iQQ
2,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2015-04-05,0,2HrBENXZTiitcCJfzkELgA,2,To be honest it really quit aweful. First the ...,review,0,WFhv5pMJRDPWSyLnKiWFXA
3,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2016-02-16,0,6YNPXoq41qTMZ2TEi0BYUA,2,"The food was decent, but the service was defin...",review,0,2S6gWE-K3DHNcKYYSgN7xA
4,--9e1ONYQuAa-CB_Rrw7Tw,Delmonico Steakhouse,"[Steakhouses, Restaurants, Cajun/Creole]",4.0,0,2016-02-08,1,4bQrVUiRZ642odcKCS0OhQ,2,If you're looking for craptastic service and m...,review,1,rCTVWx_Tws2jWi-K89iEyw


### Define the feature variable: here, the text of the review

In [4]:
# Take the values of the column that contains review text data, save to a variable named "documents"
documents = df['text'].values

In [5]:
# inspect documents, e.g. check the size, take a peek at elements of the numpy array
documents.dtype, documents.shape

(dtype('O'), (347619L,))

### Define target variable

#### For example, I am interested in perfect (5 stars) and imperfect (1-4 stars) ratings

In [6]:
# Make a column and take the values, save to a variable named "target"
df['favorable'] = (df['stars'] > 4)

In [7]:
target = df['favorable'].values

In [8]:
print target[:10]

[False False False False False  True  True  True  True False]


In [9]:
# Class-balance check: the mean of the boolean target is the share of favorable (stars > 4) reviews
target.mean(), target.std()

(0.46076595352958266, 0.49845831279812869)

## Let's create training dataset and test dataset

In [10]:
# sklearn.cross_validation is deprecated; train_test_split lives in sklearn.model_selection
from sklearn.model_selection import train_test_split



In [None]:
# Documents is X, target is y
# Now split the data to training set and test set

In [11]:
# Split to documents_train, documents_test, target_train, target_test
# NOTE(review): test_size = 0.8 sends 80% of the reviews to the test set, leaving only 20% for training — confirm this is intentional
# random_state is fixed so the split is reproducible across runs
documents_train, documents_test, target_train, target_test = train_test_split( documents, target, test_size = 0.8, random_state = 42)

## Let's get NLP representation of the documents

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Create TfidfVectorizer, and name it vectorizer
vectorizer = TfidfVectorizer(stop_words = 'english', max_features = 5000)

In [14]:
# Train the model with training data
vectors_train = vectorizer.fit_transform(documents_train).toarray()

In [15]:
# Get the vocab of tfidf
words = vectorizer.get_feature_names()

In [16]:
vectors_train.shape

(69523L, 5000L)

In [17]:
# Use the trained model to transform test data
vectors_test = vectorizer.transform(documents_test).toarray()

## Similar review search engine

In [18]:
import numpy as np

# We will need these helper methods pretty soon

def get_top_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Rank the entries of lst from largest to smallest and return the
    labels found at the positions of the n largest entries, in that
    (descending) order.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["cat", "pig"]
    '''
    # np.argsort ranks in ascending order, so flip the index order to get descending
    descending_idx = np.argsort(lst)[::-1]
    top_labels = []
    for idx in descending_idx[:n]:
        top_labels.append(labels[idx])
    return top_labels

def get_bottom_values(lst, n, labels):
    '''
    INPUT: LIST, INTEGER, LIST
    OUTPUT: LIST

    Rank the entries of lst from smallest to largest and return the
    labels found at the positions of the n smallest entries, in that
    (ascending) order.

    e.g.
    lst = [7, 3, 2, 4, 1]
    n = 2
    labels = ["cat", "dog", "mouse", "pig", "rabbit"]
    output: ["rabbit", "mouse"]
    '''
    # np.argsort already ranks in ascending order, so the first n indices are the smallest
    ascending_idx = np.argsort(lst)
    bottom_labels = []
    for idx in ascending_idx[:n]:
        bottom_labels.append(labels[idx])
    return bottom_labels


In [19]:
# Let's use cosine similarity
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
# Draw an arbitrary review from test (unseen in training) documents
query = [documents_test[42]]
print query


['Very tasty food! Atmosphere on the patio is amazing. Service was fantastic. Nothing we requested was forgotten. More bread, done! More wine, done! Food was delivered exactly as ordered. We were very impressed with our first visit to Salute. Have to give a shout out to our server, Kevin. Thanks for being so  good at your job! ;)']


In [21]:
# Transform the drawn review(s) to vector(s)
query_vect = vectorizer.transform(query).toarray()

In [22]:
# Calculate the similarity score(s) between vector(s) and training vectors
similarity = cosine_similarity(query_vect, vectors_train)

In [23]:
# Let's find top 5 similar reviews
n = 5

top5_related = get_top_values(similarity[0], n, documents_train)

In [24]:
print 'Our search query:'
print  query[0]

Our search query:
Very tasty food! Atmosphere on the patio is amazing. Service was fantastic. Nothing we requested was forgotten. More bread, done! More wine, done! Food was delivered exactly as ordered. We were very impressed with our first visit to Salute. Have to give a shout out to our server, Kevin. Thanks for being so  good at your job! ;)


In [25]:
print 'Most %s similar reviews:' % n
for i, review in enumerate(top5_related):
    print '# %s:' % str(i+1)
    print review

Most 5 similar reviews:
# 1:
Have a very good food, and Kevin is a very nice person also in side service is very good too.
# 2:
Great service, food, and atmosphere. Shout out to Mark. Thanks for making our special night great.
# 3:
You must go to Kevin haha he's a funny guy and great bartender. He def made my night lol thanks Kevin
# 4:
Server Kevin was awesome. I love his service. He's been so friendly all the time during the dinner. Definitely will come back again.
# 5:
Another wine dinner at Flemings. Incredible food and wine pairings. Outstanding, knowledgable staff. Kevin, the wine manager is amazing!


#### The returned results make sense to me since they contain positive words like "good", "great", "awesome", "incredible" and "outstanding". However, it seems that the server's name "Kevin" also plays an important role in the returned results, and these are probably different Kevins, since we are not looking at reviews of a specific restaurant. In this case, it might be better to remove "Kevin" when we process the query.

## Classifying positive/negative review

#### Naive-Bayes Classifier

In [20]:
# Build a Naive-Bayes Classifier

from sklearn.naive_bayes import MultinomialNB

model_nb = MultinomialNB()

model_nb.fit(vectors_train, target_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [23]:
# Get score for training set
model_nb.score(vectors_train, target_train)

0.8095306589186313

In [24]:
# Get score for test set
model_nb.score(vectors_test, target_test)

0.80080979230193894

#### Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

model_GB = GradientBoostingClassifier(n_estimators = 100, learning_rate = 1.0,
                                     max_depth = 3, random_state = 0)
model_GB.fit(vectors_train, target_train)

In [None]:
# Get score for training set
# (score the gradient boosting model fitted above, not the Naive-Bayes model)
print(model_GB.score(vectors_train, target_train))

# Get score for test set
print(model_GB.score(vectors_test, target_test))

#### Logistic Regression Classifier

In [23]:
# Build a Logistic Regression Classifier

from sklearn.linear_model import LogisticRegression

model_lrc = LogisticRegression()
model_lrc.fit(vectors_train, target_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [41]:
# Get score for training set
model_lrc.score(vectors_train, target_train)

0.84262761963666699

In [42]:
# Get score for test set
model_lrc.score(vectors_test, target_test)

0.82183850181232376

#### Q: What are the key features(words) that make the positive prediction?

In [43]:
# Let's find it out by ranking
n = 20
get_top_values(model_lrc.coef_[0], n, words)

[u'amazing',
 u'best',
 u'awesome',
 u'perfect',
 u'thank',
 u'delicious',
 u'highly',
 u'fantastic',
 u'great',
 u'incredible',
 u'phenomenal',
 u'heaven',
 u'favorite',
 u'love',
 u'gem',
 u'wow',
 u'notch',
 u'excellent',
 u'soooo',
 u'outstanding']

A:  Key words that make the positive prediction are as shown above

#### Q: What are the key features(words) that make the negative prediction?

In [44]:
# Let's find it out by ranking
n = 20
get_bottom_values(model_lrc.coef_[0], n, words)

[u'worst',
 u'ok',
 u'rude',
 u'horrible',
 u'bland',
 u'slow',
 u'terrible',
 u'disappointing',
 u'okay',
 u'mediocre',
 u'average',
 u'overpriced',
 u'decent',
 u'poor',
 u'lacking',
 u'dry',
 u'awful',
 u'meh',
 u'lacked',
 u'reason']

A: Key words that make the negative prediction are as shown above

#### Random Forest Classifier

In [26]:
# Build a Random Forest Classifier

from sklearn.ensemble import RandomForestClassifier

model_rfc = RandomForestClassifier(max_depth = None, n_estimators = 5, min_samples_leaf = 10)
model_rfc.fit(vectors_train, target_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=10, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=5, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [47]:
# Get score for training set
model_rfc.score(vectors_train, target_train)

0.80976079858464101

In [48]:
# Get score for test set
model_rfc.score(vectors_test, target_test)

0.76224037742362349

#### The training scores and the test scores are not that different, which suggests that the variance of the models is not large for our dataset.

### Identify which features (words) are important by inspecting the RFC model

In [50]:
n = 20
get_top_values(model_rfc.feature_importances_, n, words)

[u'amazing',
 u'great',
 u'delicious',
 u'best',
 u'didn',
 u'awesome',
 u'love',
 u'vegas',
 u'ok',
 u'perfect',
 u'decent',
 u'like',
 u'wasn',
 u'terrible',
 u'favorite',
 u'friendly',
 u'highly',
 u'definitely',
 u'place',
 u'average']

### Use cross validation to evaluate your classifiers

[sklearn cross validation](http://scikit-learn.org/stable/modules/cross_validation.html)

#### Naive Bayes

In [21]:
from sklearn.model_selection import cross_val_score

cv_scores = cross_val_score(model_nb,
                            vectors_train,
                            target_train,
                            cv = 5,
                            scoring="accuracy")

In [22]:
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

Accuracy: 0.80 (+/- 0.00)


#### Logistic Regression

In [24]:
cv_scores = cross_val_score(model_lrc,
                            vectors_train,
                            target_train,
                            cv = 5,
                            scoring="accuracy")

In [25]:
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

Accuracy: 0.82 (+/- 0.00)


#### Random Forest

In [27]:
cv_scores = cross_val_score(model_rfc,
                            vectors_train,
                            target_train,
                            cv = 5,
                            scoring="accuracy")

In [28]:
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

Accuracy: 0.76 (+/- 0.01)


#### AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score

model_Ada = AdaBoostClassifier(n_estimators = 100)
cv_scores = cross_val_score(model_Ada,
                            vectors_train,
                            target_train,
                            cv = 5,
                            scoring="accuracy")

In [None]:
print("Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

### Use grid search to find best classifier


[sklearn grid search tutorial (with cross validation)](http://scikit-learn.org/stable/modules/grid_search.html#grid-search)

[sklearn grid search documentation (with cross validation)](http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV)

#### Logistic Regression

In [30]:
# Grid-search LogisticRegression over penalty type and C, once per scoring metric,
# then print the per-setting CV scores and a classification report on the test set.

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Two sub-grids: l1 and l2 penalties, each tried with a small and a large C
param_grid = [{'penalty':['l1'], 'C':[0.1, 100]},
              {'penalty':['l2'], 'C':[0.1, 100]}]

# Metrics to optimise; the whole search is repeated for each one
scores = ['accuracy', 'precision']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score + "\n\n")
    clf = GridSearchCV(LogisticRegression(),
                       param_grid,
                       cv=5,
                       scoring=score)
    # NOTE(review): only the first 500 training rows are used for the search,
    # although the message printed below says "full development set"
    clf.fit(vectors_train[:500,:], target_train[:500])
    print("Best parameters set found on development set:\n\n")
    print(clf.best_params_)
    print("\nGrid scores on development set:\n\n")
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    # Report mean CV score with a +/- 2 standard deviation band for every parameter setting
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    
    print("\nDetailed classification report:\n")
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print("\n")
    # Evaluate the fitted search object on the held-out test vectors
    y_true, y_pred = target_test, clf.predict(vectors_test)
    print(classification_report(y_true, y_pred))
    print("\n")

# Tuning hyper-parameters for accuracy


Best parameters set found on development set:


{'penalty': 'l1', 'C': 100}

Grid scores on development set:


0.522 (+/-0.005) for {'penalty': 'l1', 'C': 0.1}
0.730 (+/-0.063) for {'penalty': 'l1', 'C': 100}
0.644 (+/-0.097) for {'penalty': 'l2', 'C': 0.1}
0.722 (+/-0.116) for {'penalty': 'l2', 'C': 100}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.


             precision    recall  f1-score   support

      False       0.72      0.77      0.75    149887
       True       0.71      0.65      0.68    128209

avg / total       0.71      0.72      0.71    278096



# Tuning hyper-parameters for precision




  'precision', 'predicted', average, warn_for)


Best parameters set found on development set:


{'penalty': 'l2', 'C': 0.1}

Grid scores on development set:


0.000 (+/-0.000) for {'penalty': 'l1', 'C': 0.1}
0.726 (+/-0.097) for {'penalty': 'l1', 'C': 100}
0.938 (+/-0.194) for {'penalty': 'l2', 'C': 0.1}
0.730 (+/-0.122) for {'penalty': 'l2', 'C': 100}

Detailed classification report:

The model is trained on the full development set.
The scores are computed on the full evaluation set.


             precision    recall  f1-score   support

      False       0.61      0.96      0.74    149887
       True       0.86      0.28      0.42    128209

avg / total       0.72      0.64      0.59    278096





#### Gradient Tree Boosting

In [None]:
# Grid-search a GradientBoostingClassifier on a 500-row slice of the training data.
from sklearn.ensemble import GradientBoostingClassifier
# sklearn.grid_search is deprecated; use sklearn.model_selection like the LogisticRegression search
from sklearn.model_selection import GridSearchCV

# 4 * 3 * 4 * 3 = 144 parameter combinations are evaluated
param_grid = {'learning_rate': [0.1, 0.05, 0.02, 0.01],
              'max_depth': [3, 4, 6],
              'min_samples_leaf': [3, 5, 9, 18],
              'max_features': [1.0, 0.3, 0.1]
              }

# Fixed typo: the class name is GradientBoostingClassifier (the original raised NameError)
est = GradientBoostingClassifier(n_estimators=3000)
# It takes too long to run this
# Only the first 500 training rows are used, to keep the search tractable
gs_cv = GridSearchCV(est, param_grid, n_jobs=4).fit(vectors_train[:500,:], target_train[:500])

# best hyperparameter setting
print(gs_cv.best_params_)
