# Extended Modeling - Binary Naive Bayes & Logistic Regression
To reduce the class imbalance, we recast the task as a binary classification problem.  We now try to classify each review as positive (4 or 5 stars) or not positive (1, 2, or 3 stars).

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.functions import col, split

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import json

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

import pickle

In [2]:
 def report(fitted):
    """Print train/test classification metrics for a fitted binary classifier.

    Reads the notebook-level globals ``Xtrain``, ``ytrain``, ``Xtest`` and
    ``ytest``; they must already be defined and must be the same feature
    matrices/labels that ``fitted`` was trained and is to be evaluated on.

    Parameters
    ----------
    fitted : object
        A fitted estimator (or fitted search object) exposing ``predict`` and
        ``predict_proba``.

    Returns
    -------
    ypred
        Predicted labels for ``Xtest``.
    yprobs
        Column 1 of ``fitted.predict_proba(Xtest)``, used as the score of the
        positive class for the ROC curve and the log loss.
    """
    # Predict each split once and reuse the result for both the report and
    # the accuracy (the training matrix is large, so predicting twice is wasteful).
    train_pred = fitted.predict(Xtrain)
    ypred = fitted.predict(Xtest)

    print("[Training Classification Report:]")
    print(classification_report(ytrain, train_pred))
    # Metric arguments follow the (y_true, y_pred) convention throughout.
    print('Training Accuracy: ', accuracy_score(ytrain, train_pred))
    print('')
    print("[Test Classification Report:]")
    print(classification_report(ytest, ypred))
    print('Test Accuracy: ', accuracy_score(ytest, ypred))

    yprobs = fitted.predict_proba(Xtest)[:,1]
    # The thresholds returned by roc_curve are not needed for the AUC.
    fpr, tpr, _ = roc_curve(ytest,  yprobs)
    roc_auc = auc(fpr, tpr)
    print('')
    print('AUC: ', roc_auc)
    print('')
    print('Log loss: ', log_loss(ytest, yprobs))

    return ypred, yprobs

In [3]:
# Load the pre-processed review frame saved by an earlier notebook.
# NOTE: pickle.load can execute arbitrary code, so only load files you trust.
# The context manager closes the file; the original `infile.close` (no
# parentheses) only referenced the method and never closed the handle.
with open('../SavedFiles/fastFood_eng.pkl', 'rb') as infile:
    fastFood = pickle.load(infile)

<function BufferedReader.close>

## Binary Classification
We will define 4- and 5-star reviews as "positive" and 1-, 2-, and 3-star reviews as "not positive."

In [4]:
def ispositive(stars):
    """Return 1 when ``stars`` equals 4 or 5, and 0 for any other value."""
    return int(stars in (4, 5))

# Binary target: 1 for 4-/5-star reviews, 0 otherwise.
fastFood['pos'] = fastFood['stars'].map(ispositive)

In [5]:
# Preview the first rows to confirm the new `pos` column sits alongside `stars`.
fastFood.head()

Unnamed: 0,business_id,city,date,name,review_id,stars,state,text,useful,word_count,text_tok,not_eng,num_non_eng,pos
0,lYv2-FaGQBhZnVxTb3Qc5Q,Las Vegas,2012-04-05 01:45:36,Wendy's,Ieq-XKnp5BrK95VuvXNouQ,1.0,NV,PLEASE AVOID THIS PLACE.\nstupid dude cant cou...,1,28,please avoid this place . stupid dude cant cou...,[],0,0
1,aZOoc-87ESqy8apJO3J-Yw,Gilbert,2017-05-28 23:01:26,Slim Chickens,ZbFJnkkPQ_fiUw0FJKzLbg,5.0,AZ,What a welcome concept in fast food. Cute pla...,0,83,what a welcome concept in fast food . cute pla...,[jalapeño],1,1
2,dKdApYVFDSNYsNOso6NYlA,Las Vegas,2017-01-07 05:43:47,White Castle,xYmddD9LEtZSoPK0x7u52w,3.0,NV,Been wanting to try this place since everyone ...,1,116,been wanting to try this place since everyone ...,[],0,0
3,KpfCj839-MPcxM8FKkW7GA,Pittsburgh,2012-10-18 19:31:19,Taco Bell,wVoXYLxYIO_JsiPQFl7E7w,3.0,PA,Stopped here because I finally had to try the ...,1,153,stopped here because i finally had to try the ...,[],0,0
4,KCIVWrtbeIlLpzRJWxJz4g,Toronto,2017-07-21 17:22:22,McDonalds,ZaAVeAOF2S1PYIsg7sjJyQ,1.0,ON,"Everytime I go to this spot for ice cream, I w...",0,81,"everytime i go to this spot for ice cream , i ...",[],0,0


In [6]:
# Build the stop-word list: NLTK's English stop words, re-tokenized with
# word_tokenize (presumably so they match how `text_tok` was tokenized --
# TODO confirm), plus punctuation tokens that should never become features.
sw = word_tokenize(' '.join(stopwords.words('english')))
sw.extend(['.', ',', '...', '\'\'', '\"', '``', '¡',  '{','|','||','}', '(',')'])

## Make X and y 

In [7]:
# Hold out a test split of the tokenized reviews and the binary target;
# the fixed seed keeps the split reproducible across runs.
REVtrain, REVtest, ytrain, ytest = train_test_split(
    fastFood['text_tok'],
    fastFood['pos'],
    random_state=123,
)

In [8]:
# Peek at a few held-out reviews (tokenized text, indexed by original row).
REVtest.head()

172202    my wife and i picked up dinner at el pollo loc...
57202     visited the restaurant recently and was quite ...
31914     worst experience ever not the first time but m...
1441      blaze pizza is easily one of my favorite place...
184107    people made this place seem like the best plac...
Name: text_tok, dtype: object

In [9]:
# Number of reviews in the test split.
REVtest.shape

(54916,)

In [10]:
# Build bag-of-words feature matrices for the train and test reviews.
def make_x(REVtrain, REVtest, vectorizer=None, min_df = 1, max_df = 1.0):
    """Vectorize the train and test reviews into sparse CSC matrices.

    The vectorizer is always fitted on ``REVtrain`` (a vectorizer passed in by
    the caller is re-fitted as well) and then applied to ``REVtest``.

    Parameters
    ----------
    REVtrain, REVtest : iterables of str
        Training and test documents.
    vectorizer : optional
        Object with ``fit_transform`` and ``transform``. When omitted, a
        ``CountVectorizer`` is built with ASCII accent stripping, the
        notebook-level stop-word list ``sw``, and the given ``min_df`` /
        ``max_df``.
    min_df, max_df :
        Forwarded to ``CountVectorizer``; ignored when ``vectorizer`` is given.

    Returns
    -------
    (Xtrain, Xtest, vectorizer)
        The two feature matrices in CSC format and the fitted vectorizer.
    """
    if vectorizer is None:
        print('The value of vectorizer is None ... using CountVectorizer')
        vectorizer = CountVectorizer(strip_accents = 'ascii', stop_words = sw,
                                     min_df = min_df, max_df = max_df)
    else:
        print('The value of vectorizer is NOT None ... using the provided vectorizer')

    # Learn the vocabulary from the training reviews only, then reuse it
    # unchanged for the test reviews.
    train_matrix = vectorizer.fit_transform(REVtrain).tocsc()
    test_matrix = vectorizer.transform(REVtest).tocsc()

    return train_matrix, test_matrix, vectorizer


In [11]:
# Baseline features: default CountVectorizer with no document-frequency pruning.
Xtrain, Xtest, vectorizer = make_x(REVtrain=REVtrain, REVtest=REVtest)

The value of vectorizer is None ... using CountVectorizer


In [12]:
# Size of the training matrix: one row per review, one column per vocabulary term.
n_rows = Xtrain.shape[0]
n_cols = Xtrain.shape[1]
for label, count in (('Number of reviews: ', n_rows), ('Number of terms: ', n_cols)):
    print(label, count)

Number of reviews:  164748
Number of terms:  63298


In [13]:
# Tune the Naive Bayes smoothing parameter `alpha` with 5-fold cross-validation
# on the baseline features, then report train/test metrics for the best model.
param_grid = {'alpha': [0.1, 1, 5, 10, 50]}
nb = MultinomialNB()
nb_cv = GridSearchCV(nb, param_grid, cv = 5)
fitted = nb_cv.fit(Xtrain, ytrain)
# The label previously said "Logistic Regression" although this search tunes
# MultinomialNB.
print("Tuned Naive Bayes Parameters: {}".format(nb_cv.best_params_))
print("Best score is {}".format(nb_cv.best_score_))
print('')

ypred, yprobs = report(fitted)


Tuned Logistic Regression Parameters: {'alpha': 5}
Best score is 0.8430815548595431

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.88      0.81      0.84     82065
           1       0.82      0.89      0.86     82683

   micro avg       0.85      0.85      0.85    164748
   macro avg       0.85      0.85      0.85    164748
weighted avg       0.85      0.85      0.85    164748

Training Accuracy:  0.8490361036249302

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.87      0.80      0.84     27281
           1       0.82      0.89      0.85     27635

   micro avg       0.84      0.84      0.84     54916
   macro avg       0.85      0.84      0.84     54916
weighted avg       0.85      0.84      0.84     54916

Test Accuracy:  0.8441619928618254

AUC:  0.9034599806275398

Log loss:  0.9138543338405616


In [14]:
# Tune the inverse regularization strength `C` of logistic regression with
# 5-fold cross-validation on the baseline features.
param_grid = {'C': [0.001, 0.1, 1, 10, 100]}
logistic = LogisticRegression(solver = 'lbfgs')
logreg_cv = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=5)
fitted = logreg_cv.fit(Xtrain, ytrain)
for template, value in (
    ("Tuned Logistic Regression Parameters: {}", logreg_cv.best_params_),
    ("Best score is {}", logreg_cv.best_score_),
):
    print(template.format(value))



Tuned Logistic Regression Parameters: {'C': 0.1}
Best score is 0.8823840046616651




In [15]:
# Evaluate the logistic-regression search fitted in the previous cell
# (`fitted` must still refer to it when this cell runs).
ypred, yprobs = report(fitted)

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     82065
           1       0.89      0.91      0.90     82683

   micro avg       0.90      0.90      0.90    164748
   macro avg       0.90      0.90      0.90    164748
weighted avg       0.90      0.90      0.90    164748

Training Accuracy:  0.9026938111540049

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.89      0.88      0.88     27281
           1       0.88      0.89      0.89     27635

   micro avg       0.88      0.88      0.88     54916
   macro avg       0.88      0.88      0.88     54916
weighted avg       0.88      0.88      0.88     54916

Test Accuracy:  0.8844599023963872

AUC:  0.9460221266203857

Log loss:  0.3006074259656983


## Hyperparameter Tuning

In [16]:
# Rebuild the features with document-frequency pruning. Integer thresholds are
# absolute document counts: keep terms found in at least 3 and at most 70000
# training reviews. This rebinds Xtrain/Xtest/vectorizer for all later cells.
Xtrain, Xtest, vectorizer = make_x(
    REVtrain,
    REVtest,
    min_df=3,
    max_df=70000,
)

The value of vectorizer is None ... using CountVectorizer


In [17]:
# Size of the pruned training matrix: rows are reviews, columns are retained terms.
n_rows = Xtrain.shape[0]
n_cols = Xtrain.shape[1]
for label, count in (('Number of reviews: ', n_rows), ('Number of terms: ', n_cols)):
    print(label, count)

Number of reviews:  164748
Number of terms:  27621


In [18]:
# Vocabulary terms in column order. `get_feature_names()` was removed in
# scikit-learn 1.2, so use its replacement when the installed version has it.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Document frequency of each term: the number of training reviews with a
# nonzero count in that column. One vectorized pass over the sparse matrix
# replaces a Python loop that sliced every column individually.
word_frequencies = np.asarray((Xtrain != 0).sum(axis=0)).ravel()
max_doc_freq = np.max(word_frequencies)
x_values = range(max_doc_freq)
x_values

range(0, 51703)

In [19]:
# Column index of the first term that reaches the highest document frequency,
# and the term itself.
most_freq_idx = int(np.argmax(word_frequencies))
feature_names[most_freq_idx]

'good'

## Naive Bayes

In [20]:
# Tune the Naive Bayes smoothing parameter `alpha` with 5-fold cross-validation
# on the pruned features, then report train/test metrics for the best model.
param_grid = {'alpha': [0.1, 1, 5, 10, 50]}
nb = MultinomialNB()
nb_cv = GridSearchCV(nb, param_grid, cv = 5)
fitted = nb_cv.fit(Xtrain, ytrain)
# The label previously said "Logistic Regression" although this search tunes
# MultinomialNB.
print("Tuned Naive Bayes Parameters: {}".format(nb_cv.best_params_))
print("Best score is {}".format(nb_cv.best_score_))
print('')

ypred, yprobs = report(fitted)

Tuned Logistic Regression Parameters: {'alpha': 1}
Best score is 0.8425959647461577

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.89      0.80      0.84     82065
           1       0.82      0.90      0.86     82683

   micro avg       0.85      0.85      0.85    164748
   macro avg       0.85      0.85      0.85    164748
weighted avg       0.85      0.85      0.85    164748

Training Accuracy:  0.8498798164469371

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.88      0.79      0.83     27281
           1       0.81      0.89      0.85     27635

   micro avg       0.84      0.84      0.84     54916
   macro avg       0.85      0.84      0.84     54916
weighted avg       0.85      0.84      0.84     54916

Test Accuracy:  0.843943477310802

AUC:  0.9027604485936052

Log loss:  0.9544517849886597


In [21]:
# Test-set confusion matrix for the most recent `report` call:
# rows are the true labels (0, 1), columns are the predicted labels.
confusion_matrix(ytest, ypred)

array([[21646,  5635],
       [ 2935, 24700]])

In [22]:
#identify predictive words
# NOTE: this cell relies on `fitted` still being the Naive Bayes search from
# the Naive Bayes cell above and on `feature_names` matching the current Xtest.
from scipy import sparse  # only needed for the identity-matrix probe below

words = np.array(feature_names)

# One synthetic "review" per vocabulary term, each containing that term once.
# A sparse identity is used because a dense np.eye(n_terms) allocates
# n_terms x n_terms floats (several GB for a ~27k-term vocabulary).
x = sparse.identity(Xtest.shape[1], format='csr')

# Column 0 is the log-probability of class 0 ("not positive"), so sorting in
# ascending order puts the most positive words first.
probs = fitted.predict_log_proba(x)[:, 0]
ind = np.argsort(probs)

#10 most positive words
good_words = words[ind[:10]]

#10 most negative words
bad_words = words[ind[-10:]]

good_prob = probs[ind[:10]]
bad_prob = probs[ind[-10:]]

# 1 - exp(log P(not positive | word)) converts back to P(positive | word).
print("Positive words\t     P(positive | word)")
for w, p in zip(good_words, good_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))

print("Bad words\t     P(positive | word)")
for w, p in zip(bad_words, bad_prob):
    print("{:>20}".format(w), "{:.2f}".format(1 - np.exp(p)))

Positive words	     P(positive | word)
               pulpo 0.98
              karved 0.97
            completo 0.97
              mmmmmm 0.96
             harmony 0.96
              xavier 0.96
             bajamar 0.96
               yummo 0.96
                quad 0.96
                 pbb 0.96
Bad words	     P(positive | word)
            diarrhea 0.03
         inexcusable 0.03
            dirtiest 0.03
              rudest 0.02
             slowest 0.02
           retrained 0.02
           cockroach 0.02
        unacceptable 0.02
        disrespected 0.02
             replies 0.02


Some of the "positive" words seem strange and the negative words seem more logical.  A Google search shows Karved is a popular restaurant in Las Vegas that gets many good reviews, so it is appearing here. Perhaps words like "xavier" appear in few reviews that happen to be all positive.  This may be a good reason to further increase min_df later. 

## Logistic Regression

In [23]:
# Cross-validation: tune the inverse regularization strength `C` of logistic
# regression with 5 folds on the pruned features.
param_grid = {'C': [0.001, 0.1, 1, 10, 100]}
logistic = LogisticRegression(solver = 'lbfgs')
logreg_cv = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=5)
fitted = logreg_cv.fit(Xtrain, ytrain)
for template, value in (
    ("Tuned Logistic Regression Parameters: {}", logreg_cv.best_params_),
    ("Best score is {}", logreg_cv.best_score_),
):
    print(template.format(value))



Tuned Logistic Regression Parameters: {'C': 0.1}
Best score is 0.882402214290917




In [24]:
# Evaluate the logistic-regression search fitted in the previous cell
# (`fitted` must still refer to it when this cell runs).
ypred, yprobs = report(fitted)

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     82065
           1       0.89      0.91      0.90     82683

   micro avg       0.90      0.90      0.90    164748
   macro avg       0.90      0.90      0.90    164748
weighted avg       0.90      0.90      0.90    164748

Training Accuracy:  0.9017529803093209

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.89      0.88      0.88     27281
           1       0.88      0.89      0.89     27635

   micro avg       0.88      0.88      0.88     54916
   macro avg       0.88      0.88      0.88     54916
weighted avg       0.88      0.88      0.88     54916

Test Accuracy:  0.8847330468351664

AUC:  0.9459625240496904

Log loss:  0.3008038527214057


In [25]:
# Test-set confusion matrix for the most recent `report` call:
# rows are the true labels (0, 1), columns are the predicted labels.
confusion_matrix(ytest, ypred)

array([[23896,  3385],
       [ 2945, 24690]])

In [26]:
# Persist the review frame, now including the binary `pos` column.
# The context manager closes the file even if pickle.dump raises.
with open('../SavedFiles/fastFood_eng_binary.pkl', 'wb') as outfile:
    pickle.dump(fastFood, outfile)