#  04 - Modeling

In [57]:
# Import libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, StackingClassifier

In [2]:
#  Read vectorized data

X_train = pd.read_csv('../data/X_train_tvec.csv')
X_test = pd.read_csv('../data/X_test_tvec.csv')

In [3]:
#  Read target data 

y_train = pd.read_csv('../data/y_train.csv')
y_test = pd.read_csv('../data/y_test.csv')

In [4]:
#  Reshape targets into vectors

y_train = y_train.values.reshape(len(y_train),)
y_test = y_test.values.reshape(len(y_test),)

In [5]:
#  Check shapes

X_train.shape,X_test.shape,y_train.shape,y_test.shape

((27291, 2000), (9097, 2000), (27291,), (9097,))

##  Baseline Model

In [6]:
1 - y_test.mean()

0.720677146311971

A baseline model predicting the most common class (vegan/target = 0) will be correct approximately 72 % of the time.  

##  Model Fitting

In [66]:
#  Dictionary of models and their params for GridSearch
#  Patterned after a dictionary format designed by Chris Joyce

model_dict = {
    'logr':
    {'model':
     ('logr', LogisticRegression()),
     'params':
     {"C": np.logspace(-5,5,num = 10)}},
    
    'mnb':
    {'model':
     ('mnb', MultinomialNB()),
     'params':
     {"alpha": np.linspace(0,1,num = 10)}},
    
    'gnb':
    {'model':
     ('gnb', GaussianNB()),
     'params':
     {"var_smoothing": np.logspace(1e-6,1e-9,num = 10)}},
    
    'bnb':
    {'model':
     ('bnb', BernoulliNB()),
     'params':
     {"alpha": np.linspace(0,1,num = 10)}},
        
    'knn':
    {'model':
     ('knn', KNeighborsClassifier()),
     'params':
     {"n_neighbors": [3,5,7,10,15]}},
        
    'dt':
    {'model':
     ('dt', DecisionTreeClassifier()),
     'params':
     {"max_depth": [3,5,7,10],
     "min_samples_leaf": [1,3,5]}},

    'rf':
    {'model':
     ('rf', RandomForestClassifier()),
     'params':
     {"n_estimators": [30,50,100,200],
     "max_depth": [3,5,7,10]}},
    
    'abc':
    {'model':
     ('abc', AdaBoostClassifier()),
     'params':
     {"n_estimators": [30,50,100,200],
     }}
}
    

In [None]:
# set dictionary to house best estimators and params
best_models = {}

# loop through models and their params
for key, value in model_dict.items():
    gs = GridSearchCV(value['model'][1], # classifier name
                  value['params'], # parameters
                      cv=5,
                      verbose = 2, # view status during grid fit
                      n_jobs=-1) # apply all available processor cores
    
    # fit the current iteration of GridSearchCV
    gs.fit(X_train, y_train)
    
    # save best model and best params for scoring 
    best_models[key] = {'model': gs.best_estimator_,
                        'params': gs.best_params_}
    print(best_models)

In [45]:
#  Create list to store model scores
model_scores = []

In [46]:
#  Fit and score models
logr = LogisticRegression(C = 0.2782559402207126)
logr.fit(X_train, y_train)
model_scores.append(['logr',logr.score(X_train,y_train),\
                   logr.score(X_test,y_test),\
                    cross_val_score(logr,X_test,y_test,cv=5).mean()])

#  Write preds and pred probabilities to preds folder
pd.DataFrame(logr.predict(X_test)).to_csv('../preds/logr.csv',index=False)
pd.DataFrame(logr.predict_proba(X_test)).to_csv('../preds/logr_proba.csv',index=False)

In [48]:
mnb = MultinomialNB(alpha = 1)
mnb.fit(X_train, y_train)
model_scores.append(['mnb',mnb.score(X_train,y_train),\
                   mnb.score(X_test,y_test),\
                    cross_val_score(mnb,X_test,y_test,cv=5).mean()])

pd.DataFrame(mnb.predict(X_test)).to_csv('../preds/mnb.csv',index=False)
pd.DataFrame(mnb.predict_proba(X_test)).to_csv('../preds/mnb_proba.csv',index=False)

In [49]:
gnb = GaussianNB(var_smoothing = 1.000002302587744)
gnb.fit(X_train, y_train)
model_scores.append(['gnb',gnb.score(X_train,y_train),\
                   gnb.score(X_test,y_test),\
                    cross_val_score(gnb,X_test,y_test,cv=5).mean()])

pd.DataFrame(gnb.predict(X_test)).to_csv('../preds/gnb.csv',index=False)
pd.DataFrame(gnb.predict_proba(X_test)).to_csv('../preds/gnb_proba.csv',index=False)

In [50]:
bnb = BernoulliNB(alpha = 0.7777777777777777)
bnb.fit(X_train, y_train)
model_scores.append(['bnb',bnb.score(X_train,y_train),\
                   bnb.score(X_test,y_test),\
                    cross_val_score(bnb,X_test,y_test,cv=5).mean()])

pd.DataFrame(bnb.predict(X_test)).to_csv('../preds/bnb.csv',index=False)
pd.DataFrame(bnb.predict_proba(X_test)).to_csv('../preds/bnb_proba.csv',index=False)

In [51]:
knn = KNeighborsClassifier(n_neighbors = 15)
knn.fit(X_train, y_train)
model_scores.append(['knn',knn.score(X_train,y_train),\
                   knn.score(X_test,y_test),\
                    cross_val_score(knn,X_test,y_test,cv=5).mean()])

pd.DataFrame(knn.predict(X_test)).to_csv('../preds/knn.csv',index=False)
pd.DataFrame(knn.predict_proba(X_test)).to_csv('../preds/knn_proba.csv',index=False)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [52]:
dt = DecisionTreeClassifier(max_depth = 10, min_samples_leaf = 3)
dt.fit(X_train, y_train)
model_scores.append(['dt',dt.score(X_train,y_train),\
                   dt.score(X_test,y_test),\
                    cross_val_score(dt,X_test,y_test,cv=5).mean()])

pd.DataFrame(dt.predict(X_test)).to_csv('../preds/dt.csv',index=False)
pd.DataFrame(dt.predict_proba(X_test)).to_csv('../preds/dt_proba.csv',index=False)

In [53]:
rf = RandomForestClassifier(max_depth = 10, n_estimators = 50)
rf.fit(X_train, y_train)
model_scores.append(['rf',rf.score(X_train,y_train),\
                     rf.score(X_test,y_test),\
                    cross_val_score(rf,X_test,y_test,cv=5).mean()])

pd.DataFrame(rf.predict(X_test)).to_csv('../preds/rf.csv',index=False)
pd.DataFrame(rf.predict_proba(X_test)).to_csv('../preds/rf_proba.csv',index=False)

In [62]:
abc = AdaBoostClassifier(n_estimators = 200)
abc.fit(X_train, y_train)
model_scores.append(['abc',abc.score(X_train,y_train),\
                     abc.score(X_test,y_test),\
                    cross_val_score(abc,X_test,y_test,cv=5).mean()])

pd.DataFrame(abc.predict(X_test)).to_csv('../preds/abc.csv',index=False)
pd.DataFrame(abc.predict_proba(X_test)).to_csv('../preds/abc_proba.csv',index=False)

In [65]:
model_scores

[['logr', 0.9076252244329632, 0.8938111465318237, 0.880289133626132],
 ['mnb', 0.8646073797222528, 0.8548972188633616, 0.85599538449456],
 ['gnb', 0.8477520061558755, 0.8366494448719358, 0.8386270079563098],
 ['bnb', 0.863031768714961, 0.8554468506100913, 0.8533585051561963],
 ['knn', 0.8492176908138214, 0.8169726283390129, 0.8096077424499635],
 ['dt', 0.8599538309332747, 0.8347806969330549, 0.8390660850968713],
 ['rf', 0.8251438203070609, 0.8124656480158294, 0.8041114849965867],
 ['abc', 0.8956798944707046, 0.8783115312740464, 0.8662187894568323]]

In [60]:
#  Write target data to preds folder
pd.DataFrame(y_test).to_csv('../preds/y_test.csv',index=False)