# Logistic Regression - Tfidf Vectorizer

In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.functions import col, split

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

import json

import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import wordpunct_tokenize
from nltk.corpus import stopwords


from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import log_loss
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

import pickle

In [2]:
 def report(fitted):
    """Print train/test classification metrics for a fitted classifier.

    Reads the notebook-level globals ``Xtrain``, ``ytrain``, ``Xtest`` and
    ``ytest``; they must describe the same feature space ``fitted`` was
    trained on when this function is called.

    Parameters
    ----------
    fitted : object exposing ``predict`` and ``predict_proba``
        In this notebook, the fitted ``GridSearchCV`` instance.

    Returns
    -------
    ypred : predicted labels for ``Xtest``.
    yprobs : second column of ``fitted.predict_proba(Xtest)``, used as the
        score for the ROC curve and the log loss.
    """
    # Predict once per split and reuse the result for every metric below.
    ytrain_pred = fitted.predict(Xtrain)
    ypred = fitted.predict(Xtest)

    print("[Training Classification Report:]")
    print(classification_report(ytrain, ytrain_pred))
    # sklearn metrics are documented as (y_true, y_pred), in that order.
    print('Training Accuracy: ', accuracy_score(ytrain, ytrain_pred))
    print('')
    print("[Test Classification Report:]")
    print(classification_report(ytest, ypred))
    print('Test Accuracy: ', accuracy_score(ytest, ypred))

    yprobs = fitted.predict_proba(Xtest)[:, 1]
    # The thresholds returned by roc_curve are not needed here.
    fpr, tpr, _ = roc_curve(ytest, yprobs)
    roc_auc = auc(fpr, tpr)
    print('')
    print('AUC: ', roc_auc)
    print('')
    print('Log loss: ', log_loss(ytest, yprobs))

    return ypred, yprobs
    
def make_x(REVtrain, REVtest, vectorizer=None, min_df = 1, max_df = 1.0):
    """Vectorise the train and test reviews into sparse CSC feature matrices.

    When ``vectorizer`` is None, a ``CountVectorizer`` is built from the
    notebook-level stop-word list ``sw`` together with ``min_df`` / ``max_df``.
    Otherwise the supplied vectorizer is used as-is and ``min_df`` / ``max_df``
    are ignored.

    The vectorizer is fitted on ``REVtrain`` only and then applied to
    ``REVtest``.

    Returns
    -------
    (Xtrain, Xtest, vectorizer) : the two CSC matrices and the fitted vectorizer.
    """
    if vectorizer is not None:
        print('The value of vectorizer is NOT None ... using the provided vectorizer')
    else:
        print('The value of vectorizer is None ... using CountVectorizer')
        vectorizer = CountVectorizer(strip_accents = 'ascii', stop_words = sw,
                                     min_df = min_df, max_df = max_df)

    # Learn the vocabulary from the training documents, then reuse it for test.
    train_features = vectorizer.fit_transform(REVtrain).tocsc()
    test_features = vectorizer.transform(REVtest).tocsc()

    return train_features, test_features, vectorizer

 

In [3]:
# Load the pickled review data. The ``with`` block guarantees the file handle
# is closed; a bare ``infile.close`` only references the method without calling it.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
with open('../SavedFiles/fastFood_eng_binary.pkl', 'rb') as infile:
    fastFood = pickle.load(infile)

In [4]:
# NLTK's English stop words, passed through word_tokenize
# (presumably so they match the tokenisation of the reviews -- TODO confirm),
# plus extra punctuation tokens to drop.
sw = word_tokenize(' '.join(stopwords.words('english')))
sw += ['.', ',', '...', '\'\'', '\"', '``', '¡',  '{','|','||','}', '(',')']

## Make X and y 

In [5]:
# Hold out a test set with a fixed seed so the split is reproducible.
REVtrain, REVtest, ytrain, ytest = train_test_split(
    fastFood.text_tok,  # review text (presumably pre-tokenised -- confirm upstream)
    fastFood.pos,       # target labels
    random_state=123,
)

In [6]:
# TF-IDF vectorizer with the library's default min_df / max_df.
tfidf = TfidfVectorizer(
    strip_accents='ascii',
    stop_words=sw,
)

In [7]:
# Fit the TF-IDF vocabulary on the training reviews, then transform both splits.
Xtrain, Xtest, vectorizer = make_x(REVtrain, REVtest, vectorizer=tfidf)

The value of vectorizer is NOT None ... using the provided vectorizer


In [8]:
# Rows of the feature matrix are reviews, columns are vocabulary terms.
n_rows = Xtrain.shape[0]
n_cols = Xtrain.shape[1]
print('Number of reviews: ', n_rows)
print('Number of terms: ', n_cols)

Number of reviews:  164748
Number of terms:  63298


## Logistic Regression - min_df = 1, max_df = 1.0 (defaults)

In [9]:

# 5-fold cross-validated grid search over the inverse regularisation strength C.
param_grid = {'C': [0.001, 0.1, 1, 10, 100]}
logistic = LogisticRegression(solver='lbfgs')
logreg_cv = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=5)

# fit() returns the search object itself, so ``fitted`` aliases ``logreg_cv``.
fitted = logreg_cv.fit(Xtrain, ytrain)

print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Best score is {}".format(logreg_cv.best_score_))



Tuned Logistic Regression Parameters: {'C': 1}
Best score is 0.8847755359700876




In [10]:
# Print train/test metrics; keep the test predictions and class-1 probabilities.
(ypred, yprobs) = report(fitted)

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.91      0.89      0.90     82065
           1       0.90      0.91      0.90     82683

   micro avg       0.90      0.90      0.90    164748
   macro avg       0.90      0.90      0.90    164748
weighted avg       0.90      0.90      0.90    164748

Training Accuracy:  0.9012491805666837

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     27281
           1       0.88      0.89      0.89     27635

   micro avg       0.89      0.89      0.89     54916
   macro avg       0.89      0.89      0.89     54916
weighted avg       0.89      0.89      0.89     54916

Test Accuracy:  0.8864629616141015

AUC:  0.9537106499925285

Log loss:  0.27704278107091845


## Hyperparameter Tuning

In [11]:
# Prune the vocabulary by document frequency (integers are absolute counts).
tfidf = TfidfVectorizer(
    strip_accents='ascii',
    stop_words=sw,
    min_df=3,
    max_df=70000,
)

In [12]:
# Rebuild the train/test feature matrices with the pruned vocabulary.
Xtrain, Xtest, vectorizer = make_x(REVtrain, REVtest, vectorizer=tfidf)

The value of vectorizer is NOT None ... using the provided vectorizer


In [13]:
# Matrix dimensions after pruning: reviews x retained terms.
n_rows = Xtrain.shape[0]
n_cols = Xtrain.shape[1]
print('Number of reviews: ', n_rows)
print('Number of terms: ', n_cols)

Number of reviews:  164748
Number of terms:  27621


In [14]:
# Vocabulary terms, index-aligned with the columns of Xtrain.
# scikit-learn >= 1.0 provides get_feature_names_out(); get_feature_names()
# was removed in 1.2, so prefer the new accessor and fall back on old releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Document frequency per term = number of rows with a non-zero entry in that
# column. Counting the column indices of all non-zero entries in one pass
# avoids slicing the sparse matrix once per column.
word_frequencies = np.bincount(Xtrain.nonzero()[1], minlength=n_cols)
max_doc_freq = np.max(word_frequencies)
x_values = range(max_doc_freq)
x_values

range(0, 51703)

In [15]:
# Position of the first term whose document frequency equals the maximum.
most_freq_idx = word_frequencies.tolist().index(max_doc_freq)
feature_names[most_freq_idx]

'good'

## Logistic Regression - min_df = 3, max_df = 70000

In [16]:

# Cross-validation: 5-fold grid search over C on the pruned feature matrix.
param_grid = {'C': [0.001, 0.1, 1, 10, 100]}
logistic = LogisticRegression(solver='lbfgs')
logreg_cv = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=5)

# fit() returns the search object itself, so ``fitted`` aliases ``logreg_cv``.
fitted = logreg_cv.fit(Xtrain, ytrain)

print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Best score is {}".format(logreg_cv.best_score_))



Tuned Logistic Regression Parameters: {'C': 1}
Best score is 0.8850911695437881


In [17]:
# Metrics for the min_df = 3 model; ypred is reused by the confusion matrix.
(ypred, yprobs) = report(fitted)

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.90      0.89      0.90     82065
           1       0.90      0.91      0.90     82683

   micro avg       0.90      0.90      0.90    164748
   macro avg       0.90      0.90      0.90    164748
weighted avg       0.90      0.90      0.90    164748

Training Accuracy:  0.8998834583727875

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.89      0.88      0.88     27281
           1       0.88      0.89      0.89     27635

   micro avg       0.89      0.89      0.89     54916
   macro avg       0.89      0.89      0.89     54916
weighted avg       0.89      0.89      0.89     54916

Test Accuracy:  0.8860441401413067

AUC:  0.9536927088693234

Log loss:  0.27715603404230615


In [18]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=ytest, y_pred=ypred)

array([[24054,  3227],
       [ 3031, 24604]])

## Logistic Regression - min_df = 6, max_df = 70000

In [19]:
# Stricter pruning: a term must appear in at least 6 training reviews.
tfidf = TfidfVectorizer(
    strip_accents='ascii',
    stop_words=sw,
    min_df=6,
    max_df=70000,
)

In [20]:
# Rebuild the train/test feature matrices with the min_df = 6 vocabulary.
Xtrain, Xtest, vectorizer = make_x(REVtrain, REVtest, vectorizer=tfidf)

The value of vectorizer is NOT None ... using the provided vectorizer


In [21]:
# Matrix dimensions for the min_df = 6 vocabulary: reviews x retained terms.
n_rows = Xtrain.shape[0]
n_cols = Xtrain.shape[1]
print('Number of reviews: ', n_rows)
print('Number of terms: ', n_cols)

Number of reviews:  164748
Number of terms:  19382


In [22]:
# Vocabulary terms, index-aligned with the columns of Xtrain.
# scikit-learn >= 1.0 provides get_feature_names_out(); get_feature_names()
# was removed in 1.2, so prefer the new accessor and fall back on old releases.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# Document frequency per term = number of rows with a non-zero entry in that
# column. Counting the column indices of all non-zero entries in one pass
# avoids slicing the sparse matrix once per column.
word_frequencies = np.bincount(Xtrain.nonzero()[1], minlength=n_cols)
max_doc_freq = np.max(word_frequencies)
x_values = range(max_doc_freq)
x_values

range(0, 51703)

In [23]:
# Position of the first term whose document frequency equals the maximum.
most_freq_idx = word_frequencies.tolist().index(max_doc_freq)
feature_names[most_freq_idx]

'good'

In [24]:


# 5-fold grid search over C on the min_df = 6 feature matrix.
param_grid = {'C': [0.001, 0.1, 1, 10, 100]}
logistic = LogisticRegression(solver='lbfgs')
logreg_cv = GridSearchCV(estimator=logistic, param_grid=param_grid, cv=5)

# fit() returns the search object itself, so ``fitted`` aliases ``logreg_cv``.
fitted = logreg_cv.fit(Xtrain, ytrain)

print("Tuned Logistic Regression Parameters: {}".format(logreg_cv.best_params_))
print("Best score is {}".format(logreg_cv.best_score_))



Tuned Logistic Regression Parameters: {'C': 1}
Best score is 0.8849090732512686


In [25]:
# Metrics for the min_df = 6 model; ypred is reused by the confusion matrix.
(ypred, yprobs) = report(fitted)

[Training Classification Report:]
              precision    recall  f1-score   support

           0       0.90      0.89      0.90     82065
           1       0.89      0.91      0.90     82683

   micro avg       0.90      0.90      0.90    164748
   macro avg       0.90      0.90      0.90    164748
weighted avg       0.90      0.90      0.90    164748

Training Accuracy:  0.8991368635734577

[Test Classification Report:]
              precision    recall  f1-score   support

           0       0.89      0.88      0.89     27281
           1       0.88      0.89      0.89     27635

   micro avg       0.89      0.89      0.89     54916
   macro avg       0.89      0.89      0.89     54916
weighted avg       0.89      0.89      0.89     54916

Test Accuracy:  0.886299074950834

AUC:  0.9537289452692083

Log loss:  0.27698474299011816


In [26]:
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=ytest, y_pred=ypred)

array([[24056,  3225],
       [ 3019, 24616]])