In [2]:
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
# import all models
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline

# Older scikit-learn releases ship a bundled joblib under sklearn.externals;
# newer releases removed it, so fall back to the standalone package.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

In [3]:
# Load the tab-separated restaurant reviews into a DataFrame.
df = pd.read_csv("./Restaurant_Reviews.tsv", sep="\t")

In [4]:
# Inspect the dtype pandas inferred for each column of the loaded frame.
df.dtypes

Review    object
Liked      int64
dtype: object

In [6]:
# Bag-of-words features with stop-word removal.
# The vocabulary is capped at 1500 terms and scikit-learn's built-in English
# stop-word list is dropped. Note: no stemming is performed in this cell.
cv = CountVectorizer(stop_words='english', max_features=1500)

# Learn the vocabulary from the reviews and encode them in a single call,
# then densify the sparse count matrix into a NumPy array.
df_CountVectorized = cv.fit_transform(df.Review).toarray()

# Alias for the fitted vectorizer (the same object as `cv`).
trainedCountVectorizer = cv

In [7]:
# Dimensions of the dense count matrix as (rows, columns).
df_CountVectorized.shape # expected: (1000, 1500)

(1000, 1500)

In [8]:
# Hold out 33% of the rows for evaluation; the fixed seed makes the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df_CountVectorized,
    df["Liked"].values,
    random_state=42,
    test_size=0.33,
)

In [10]:
# Candidate classifiers as (short name, estimator) pairs. The short name is used
# as the prefix of the prediction column each model adds to df_test below.
models = [
    ('GLM', LogisticRegression()),
    ('KNN', KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)),
    ('TREE', DecisionTreeClassifier(criterion='entropy', random_state=0)),
    ('NB', GaussianNB()),
    ('RF', RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)),
]

# One row per held-out sample; every model contributes a "<name>_pred" column.
df_test = pd.DataFrame({"test_id": range(len(y_test))})

# Fit each model on the training split, predict on the held-out split, and
# report accuracy, precision and recall for the positive class (label 1).
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    df_test[name + '_pred'] = y_pred

    # Pin the label order so the matrix is always 2x2 (rows = actual 0/1,
    # columns = predicted 0/1), even if one class is missing from both
    # y_test and y_pred; without it the indexing below could raise IndexError.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    TN, FP, FN, TP = cm.ravel()

    # Guard the denominators: a model that never predicts the positive class
    # (or a split with no positive samples) would otherwise produce nan and a
    # RuntimeWarning. Report 0.0 in that case.
    predicted_positive = TP + FP
    actual_positive = TP + FN
    precision = TP / predicted_positive if predicted_positive > 0 else 0.0
    recall = TP / actual_positive if actual_positive > 0 else 0.0

    print("---------- model: " + name + '   performance as following --------->')
    print("Acurracy: " + str(accuracy_score(y_test, y_pred)))
    print("Precision: " + str(precision))
    print("Recall: " + str(recall))

---------- model: GLM   performance as following --------->
Acurracy: 0.781818181818
Precision: 0.774193548387
Recall: 0.764331210191
---------- model: KNN   performance as following --------->
Acurracy: 0.645454545455
Precision: 0.785714285714
Recall: 0.350318471338
---------- model: TREE   performance as following --------->
Acurracy: 0.715151515152
Precision: 0.703225806452
Recall: 0.694267515924
---------- model: NB   performance as following --------->
Acurracy: 0.678787878788
Precision: 0.618604651163
Recall: 0.847133757962
---------- model: RF   performance as following --------->
Acurracy: 0.748484848485
Precision: 0.824561403509
Recall: 0.59872611465


In [11]:
# Preview the first rows of the per-model predictions on the held-out split.
df_test.head()

Unnamed: 0,test_id,GLM_pred,KNN_pred,TREE_pred,NB_pred,RF_pred
0,0,0,0,0,0,0
1,1,1,1,1,1,1
2,2,1,1,1,1,0
3,3,1,0,1,1,1
4,4,1,0,1,1,1


In [12]:
# Pull the second (name, estimator) pair out of `models` and show that
# estimator's predictions for the held-out features.
knn_name, knn_model = models[1]
knn_model.predict(X_test)

array([0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       1, 0,

In [13]:
# Bundle the objects to be serialized into a single dict: the count vectorizer
# under "cv" and the list of (name, model) pairs under "models".
model_objects = dict(cv=cv, models=models)

In [14]:
# Persist the bundle with joblib. Create the target directory first so the cell
# does not fail with FileNotFoundError when ./outputs does not exist yet.
os.makedirs('./outputs', exist_ok=True)
with open('./outputs/model_object_dict.pkl', 'wb') as f:
    joblib.dump(model_objects, f)