#Yelp Restaurant Review Project

###Classification model with numeric data

In [1]:
%matplotlib inline  
import nltk
import pandas as pd
from pandas import Series, DataFrame
from collections import Counter
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfVectorizer
import random as rd
import numpy as np
from sklearn.metrics import classification_report


yelp = pd.read_csv('Yelp Data Restaurant Reviews Ratings.csv')

yelp['Ratings'] = 0

mask = yelp['stars'] > 3
yelp.ix[mask, 'Ratings'] = 1
yelp = yelp.drop('stars', 1)

Y, X = dmatrices('Ratings ~ 0+ votes_cool+ votes_funny+ votes_useful+ Cheap+ Moderate+ Expensive+ VeryExpensive+ American+ Chinese+ French+ Japanese+ Indian+ Italian + Greek+ Mediterranean+ Mexican+ Thai+ Vietnamese', yelp, return_type = "dataframe")
y = Y['Ratings']

#### Lets fit a tree to this data and check how accurately it is able to classify based on the numeric data.

In [3]:
from sklearn import tree
model = tree.DecisionTreeClassifier(criterion='entropy', max_depth = 2)


####Splitting the data into Train and Test data in the ratio of 7:3

In [2]:
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)


####Run the model, get the predicted values and then compare with the actual data in the test to find the accuracy of our tree classification model

In [5]:
from sklearn import metrics

model = LogisticRegression()
model.fit(X_train, y_train)
y_predicted = model.predict(X_train)

print 'Train Accuracy - ', metrics.accuracy_score(y_train, y_predicted)
y_predicted = model.predict(X_test)

print "MODEL: Trees\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_predicted))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_predicted))

print '\nHere is the classification report:'
print classification_report(y_test, y_predicted)

print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_predicted)

Train Accuracy -  0.684977498393
MODEL: Trees

The precision for this classifier is 0.687370956641
The recall for this classifier is 0.979406717333
The f1 for this classifier is 0.807805075321
The accuracy for this classifier is 0.683166666667

Here is the classification report:
             precision    recall  f1-score   support

        0.0       0.55      0.05      0.10      1921
        1.0       0.69      0.98      0.81      4079

avg / total       0.64      0.68      0.58      6000


Here is the confusion matrix:
[[ 104 1817]
 [  84 3995]]


####The Tree Classification model gives us the accuracy of 67.96%

####Applying Logistic Regression Model and check if it gives better accuracy than the Tree Model

In [5]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()

model.fit(X_train, y_train)
y_predicted = model.predict(X_train)

print 'Logistic Regression Model Accuracy - ', metrics.accuracy_score(y_train, y_predicted)
y_predicted = model.predict(X_test)

print "MODEL: Logistic Regression\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_predicted))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_predicted))

print '\nHere is the classification report:'
print classification_report(y_test, y_predicted)

print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_predicted)


Train Accuracy -  0.684977498393
MODEL: Trees

The precision for this classifier is 0.687370956641
The recall for this classifier is 0.979406717333
The f1 for this classifier is 0.807805075321
The accuracy for this classifier is 0.683166666667

Here is the classification report:
             precision    recall  f1-score   support

        0.0       0.55      0.05      0.10      1921
        1.0       0.69      0.98      0.81      4079

avg / total       0.64      0.68      0.58      6000


Here is the confusion matrix:
[[ 104 1817]
 [  84 3995]]


####We see that Logistic Regression Model gives us better Accuracy of 68.49% than the Tree Model(67.96%) in this classifcation problem

### Classification model with reviews (text) data without stop words

In [3]:
yelp_review = yelp[['Ratings', 'Review']]

X_train, X_test, y_train, y_test = train_test_split(yelp_review['Review'], yelp_review['Ratings'], test_size=0.4, random_state=1)

Counter(y_train)

Counter({1: 8104, 0: 3895})

####Before running the Multinomial Naive Bayes Model on the Training Data, we make the proportions of High and Low Rating Reviews in the Training Data Set to be equal to remove the skewness from our Training Data and hence obtain better results on the Test Data Set.

In [4]:
import random as rd
import numpy as np
from sklearn.metrics import classification_report

y_train_df = pd.DataFrame(y_train)

y_train_df_0 = y_train_df[y_train_df['Ratings']==1]

sampleset = rd.sample(y_train_df_0.index,3895)

y_train_0 = y_train.ix[sampleset]

X_train_0 = X_train.ix[sampleset]

y1index = y_train_df[y_train_df['Ratings']==0].index

y_train_1 = y_train.ix[y1index]

X_train_1 = X_train.ix[y1index]

X_train = pd.concat([X_train_1,X_train_0], axis = 0)
y_train = pd.concat([y_train_1,y_train_0], axis = 0)

X_train = np.array(X_train)
y_train = np.array(y_train)

X_test = np.array(X_test)
y_test = np.array(y_test)

vectorizer = TfidfVectorizer(min_df=2, 
 ngram_range=(1, 1), 
 strip_accents='unicode',
 stop_words = 'english', 
 norm='l2')

X_train_Orig = X_train
X_test_Orig = X_test

X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)


#### Run the Multinomial Naive Bayes Model

In [8]:
nb_classifier = MultinomialNB().fit(X_train, y_train)

y_nb_predicted = nb_classifier.predict(X_test)

print "MODEL: Multinomial Naive Bayes\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_nb_predicted))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_nb_predicted))

print '\nHere is the classification report:'
print classification_report(y_test, y_nb_predicted)

print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_nb_predicted)

MODEL: Multinomial Naive Bayes

The precision for this classifier is 0.90224325993
The recall for this classifier is 0.804551293815
The f1 for this classifier is 0.850601474583
The accuracy for this classifier is 0.8075

Here is the classification report:
             precision    recall  f1-score   support

          0       0.66      0.81      0.73      2551
          1       0.90      0.80      0.85      5449

avg / total       0.83      0.81      0.81      8000


Here is the confusion matrix:
[[2076  475]
 [1065 4384]]


####The accuracy obtained using Multinomial naive bayes is 80.75%

###Classification model with reviews (text) data without stop words and using Numeric data

In [11]:
from sklearn.preprocessing import normalize

xn = yelp[['votes_cool', 'votes_funny', 'votes_useful', 'Cheap', 'Moderate', 'Expensive', 'VeryExpensive', 'American', 'Chinese', 'French', 'Japanese', 'Indian', 'Italian', 'Greek', 'Mediterranean', 'Mexican', 'Thai', 'Vietnamese', 'Review']]
yn = yelp['Ratings']

Xn_train, Xn_test, yn_train, yn_test = train_test_split(yelp[['votes_cool', 'votes_funny', 'votes_useful', 'Cheap', 'Moderate', 'Expensive', 
                                                              'VeryExpensive', 'American', 'Chinese', 'French', 'Japanese', 'Indian', 'Italian', 
                                                              'Greek', 'Mediterranean', 'Mexican', 'Thai', 'Vietnamese', 'Review']], yelp['Ratings'], 
                                                        test_size=0.4, random_state=1)

Counter(yn_train)

Counter({1: 8104, 0: 3895})

####Before running the Multinomial Naive Bayes Model on the Training Data, we make the proportions of High and Low Rating Reviews in the Training Data Set to be equal to remove the skewness from our Training Data and hence obtain better results on the Test Data Set.

In [6]:
yn_train_df = pd.DataFrame(yn_train)

yn_train_df_0 = yn_train_df[yn_train_df['Ratings']==1]

yn_train_df = pd.DataFrame(yn_train)

yn_train_df_0 = yn_train_df[yn_train_df['Ratings']==1]

sampleset = rd.sample(yn_train_df_0.index,3895)

yn_train_0 = yn_train.ix[sampleset]

Xn_train_0 = Xn_train.ix[sampleset]

y1index = yn_train_df[yn_train_df['Ratings']==0].index

yn_train_1 = yn_train.ix[y1index]

Xn_train_1 = Xn_train.ix[y1index]

Xn_train = pd.concat([Xn_train_1,Xn_train_0], axis = 0)
yn_train = pd.concat([yn_train_1,yn_train_0], axis = 0)

vectorizer = TfidfVectorizer(min_df=2, 
 ngram_range=(1, 1), 
 strip_accents='unicode', 
 norm='l2')

X_train = vectorizer.fit_transform(X_train_Orig)
X_test = vectorizer.transform(X_test_Orig)

In [7]:
import warnings
warnings.filterwarnings('ignore')

Xn_train = np.array(Xn_train[['votes_cool', 'votes_funny', 'votes_useful', 'Cheap', 'Moderate', 'Expensive', 'VeryExpensive', 'American',
                              'Chinese', 'French', 'Japanese', 'Indian', 'Italian', 'Greek', 'Mediterranean', 'Mexican', 'Thai', 'Vietnamese']])
yn_train = np.array(yn_train)

Xn_test = np.array(Xn_test[['votes_cool', 'votes_funny', 'votes_useful', 'Cheap', 'Moderate', 'Expensive', 'VeryExpensive', 'American', 'Chinese',
                            'French', 'Japanese', 'Indian', 'Italian', 'Greek', 'Mediterranean', 'Mexican', 'Thai', 'Vietnamese']])
yn_test = np.array(yn_test)



####Normalized the numeric data to make it perform more accurately during classification

In [8]:
Xn_train = normalize(Xn_train, axis=1, norm='l1')
Xn_test = normalize(Xn_test, axis=1, norm='l1')
Xt_train = np.concatenate([X_train.toarray(), Xn_train], axis = 1)
Xt_test = np.concatenate([X_test.toarray(), Xn_test], axis = 1)

#### Run the Multinomial Naive Bayes Model

In [10]:
from sklearn import metrics
nb_classifier = MultinomialNB().fit(Xt_train, yn_train)

y_nb_predicted = nb_classifier.predict(Xt_test)

print "MODEL: Multinomial Naive Bayes\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_nb_predicted))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_nb_predicted))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_nb_predicted))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_nb_predicted))

print '\nHere is the classification report:'
print classification_report(yn_test, y_nb_predicted)

print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(yn_test, y_nb_predicted)

MODEL: Multinomial Naive Bayes

The precision for this classifier is 0.906171809842
The recall for this classifier is 0.797577537163
The f1 for this classifier is 0.84841386042
The accuracy for this classifier is 0.805875

Here is the classification report:
             precision    recall  f1-score   support

          0       0.66      0.82      0.73      2551
          1       0.91      0.80      0.85      5449

avg / total       0.83      0.81      0.81      8000


Here is the confusion matrix:
[[2101  450]
 [1103 4346]]


#### The accuracy of the model is very similar to the model with just the text data. The text data which was a better predictor of the model earlier still the dominating feature for prediction and hence there is minimal change in the accuracy even after the inclusion of numeric data

###Using SentiStrength to analyze the sentiments from the reviews in the dataset and building a Logit model for calculating the ratings

####After running the Senti-Strength software, we obtain the positive and negative sentiments for each review.To this, we add another column of 'NetSentiment' which is the sum of 'Positive' and 'Negative' Sentiment. We use NetSentiment as the single independent variable in our classification problem.

In [15]:
yelp_senti_data = pd.read_csv('net_sentiment.csv')
                             
# yelp_senti_data[:20]
yelp_senti_data["Class"]=1
mask = (yelp_senti_data['stars']<=3)
yelp_senti_data['Class'][mask] = 0

formula = 'Class ~ 0 + NetSentiment'        
Y, X = dmatrices(formula, yelp_senti_data, return_type='dataframe')
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=1)

####Applying the Logistic Regression Model using 'NetSentiment' as the predictor variable. 

In [16]:
model = LogisticRegression()

result = model.fit(X_train, y_train)

prediction_train = model.predict(X_train)

y_prediction  = model.predict(X_test)

print "MODEL: Logical Regression using SentStrength Data\n"

print 'The precision for this classifier is ' + str(metrics.precision_score(y_test, y_prediction))
print 'The recall for this classifier is ' + str(metrics.recall_score(y_test, y_prediction))
print 'The f1 for this classifier is ' + str(metrics.f1_score(y_test, y_prediction))
print 'The accuracy for this classifier is ' + str(metrics.accuracy_score(y_test, y_prediction))

print '\nHere is the classification report:'
print classification_report(y_test, y_prediction)

print '\nHere is the confusion matrix:'
print metrics.confusion_matrix(y_test, y_prediction)

MODEL: Logical Regression using SentStrength Data

The precision for this classifier is 0.732625482625
The recall for this classifier is 0.930375091934
The f1 for this classifier is 0.819742952803
The accuracy for this classifier is 0.721833333333

Here is the classification report:
             precision    recall  f1-score   support

        0.0       0.65      0.28      0.39      1921
        1.0       0.73      0.93      0.82      4079

avg / total       0.71      0.72      0.68      6000


Here is the confusion matrix:
[[ 536 1385]
 [ 284 3795]]


### Top 5 “attributes” of a restaurant that are associated with high and low ratings

In [16]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

####Binning the Dataset into Low and High Rating Categories

In [17]:
yelp=pd.read_csv('Yelp Data Restaurant Reviews Ratings.csv', encoding='utf-8')
yelp.columns.values
yelp['rating']=0
yelp.ix[yelp['stars']>=4,'rating']=1

####Applying the Multinomial Naive Bayes Model to find out the top 5 attributes associated with High and Low rating reviews

In [20]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import naive_bayes

vectorizer = TfidfVectorizer(min_df=2, 
 ngram_range=(1, 1), 
 stop_words = 'english', 
 strip_accents='unicode', 
 norm='l2')
X = vectorizer.fit_transform(yelp['Review'])

model = naive_bayes.MultinomialNB()
model.fit(X, yelp['rating'])

feature_importances = model.feature_log_prob_[1] - model.feature_log_prob_[0]
feature_importance_series = Series(feature_importances, index=vectorizer.get_feature_names())  

print 'high rating reviews words', feature_importance_series.order(ascending=False)[:5].index.values
print feature_importance_series.order(ascending=False)[:5]
print 'low review rating words', feature_importance_series.order(ascending=True)[:5].index.values
print feature_importance_series.order(ascending=True)[:5]

high rating reviews words [u'gem' u'amazing' u'highly' u'perfect' u'die']
gem        1.965407
amazing    1.707398
highly     1.697840
perfect    1.673197
die        1.628526
dtype: float64
low review rating words [u'tasteless' u'worse' u'worst' u'awful' u'terrible']
tasteless   -2.737525
worse       -2.317327
worst       -2.277860
awful       -2.209369
terrible    -2.194620
dtype: float64


#### Top 5 Attributes associated with high rating reviews are 'gem' , 'amazing' , 'highly' , 'perfect' and  'die' 
#### Top 5 Attributes associated with low rating reviews are 'tasteless' , 'worse' , 'worst' , 'awful' and  'terrible'