<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Project 3: Part 3 - Modeling & Evaluation

_Author: Sharnique Beck (BOS)_

---


In [1]:
# Vectorizer and Model imports:
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.base import clone
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.pipeline import Pipeline

%matplotlib inline

In [2]:
# Load the pre-processed data set produced earlier in the project
data = pd.read_csv(filepath_or_buffer='../data/clean_data.csv')

In [3]:
# Peek at the first five rows to sanity-check the load
data.head(n=5)

Unnamed: 0,title,y
0,doom patrol trailer released,0
1,justice league fight future 6th dimension,0
2,themiscyra atlantis join un,0
3,aquaman movie spoiler brief history aquaman oc...,0
4,anyone else hate comic change match look esthe...,0


## Train/Test Split

In [5]:
# Feature: the title text; target: the label column
X, y = data['title'], data['y']

### Baseline accuracy score

In [6]:
# Share of each class in y; the larger share is the accuracy of a baseline
# that always predicts the most frequent class.
y.value_counts(normalize=True, sort=True)

0    0.501003
1    0.498997
Name: y, dtype: float64

In [9]:
# Hold out a test set (default split size). Stratifying on y keeps the class
# proportions the same in both parts despite the slight class imbalance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    stratify=y,
    random_state=42,
)

## Pipeline Models

In [10]:
# Text vectorizers, default settings
cv, tvect = CountVectorizer(), TfidfVectorizer()

# Classifiers, default settings
lr = LogisticRegression()
mnb, bnb = MultinomialNB(), BernoulliNB()

In [11]:
# Every pipeline step is a clone of the shared vectorizer/classifier template.
# Placing the very same object in several pipelines would let a direct fit()
# or set_params() on one pipeline silently change the others.

# Two Logistic Regression pipelines: count vectorizer vs. tf-idf vectorizer
cv_lr_pipe = Pipeline([
    ('cv', clone(cv)),
    ('lr', clone(lr))
])
tfidf_lr_pipe = Pipeline([
    ('tvect', clone(tvect)),
    ('lr', clone(lr))
])

# Two Naive Bayes pipelines: Multinomial NB on counts, Bernoulli NB on tf-idf
m_nv_pipe = Pipeline([
    ('cv', clone(cv)),
    ('mnb', clone(mnb))
])
# NOTE: BernoulliNB binarizes its input by default, so the tf-idf weights
# effectively act as word presence/absence flags here.
b_nv_pipe = Pipeline([
    ('tvect', clone(tvect)),
    ('bnb', clone(bnb))
])

In [12]:
# Hyper-parameter grids; keys follow the "<pipeline step>__<parameter>" convention

# Grid for pipelines whose vectorizer step is named 'cv'
params_1 = dict(
    cv__max_features=[None, 5000, 10000],
    cv__ngram_range=[(1, 1), (1, 2)],
)

# Grid for pipelines whose vectorizer step is named 'tvect'
params_2 = dict(
    tvect__max_features=[None, 4000, 5000, 10000],
    tvect__ngram_range=[(1, 1), (1, 2)],
)

In [27]:
def gs(pipe, params, X_tr, y_tr, X, y, cv=None):
    """Grid-search a pipeline, print a short report and return the fitted search.

    Parameters
    ----------
    pipe : estimator
        Pipeline (or any estimator) handed to ``GridSearchCV``.
    params : dict
        Parameter grid to search over.
    X_tr, y_tr
        Training features and labels the search is fitted on.
    X, y
        Held-out features and labels used for the reported test score.
    cv : optional
        Cross-validation strategy forwarded to ``GridSearchCV``. The default
        ``None`` keeps ``GridSearchCV``'s own default.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    # Named `search` rather than `gs` so the local does not shadow this function.
    search = GridSearchCV(pipe, param_grid=params, cv=cv)
    search.fit(X_tr, y_tr)

    print('Best parameters:', search.best_params_)
    # best_score_ is the cross-validated score of the winning parameter set
    print('Best Score:', search.best_score_)
    print('Test Score:', search.score(X, y))

    return search
    

## Logistic Regression Models

In [14]:
# Logistic Regression on count-vectorized titles
gs(
    cv_lr_pipe, params_1,
    X_train, y_train,
    X_test, y_test,
)

Best parameters: {'cv__max_features': None, 'cv__ngram_range': (1, 2)}
Best Score: 0.8927664126220083
Test Score: 0.892498997192138


In [15]:
# Logistic Regression on tf-idf-vectorized titles
gs(
    tfidf_lr_pipe, params_2,
    X_train, y_train,
    X_test, y_test,
)

Best parameters: {'tvect__max_features': 5000, 'tvect__ngram_range': (1, 2)}
Best Score: 0.8887551811739537
Test Score: 0.8953068592057761


## Naive Bayes Models

In [16]:
# Multinomial Naive Bayes on count-vectorized titles
gs(
    m_nv_pipe, params_1,
    X_train, y_train,
    X_test, y_test,
)

Best parameters: {'cv__max_features': None, 'cv__ngram_range': (1, 2)}
Best Score: 0.8937023666265543
Test Score: 0.8977135980746089


In [29]:
# Bernoulli Naive Bayes on tf-idf-vectorized titles; the fitted search is kept
# in `best` because the evaluation section predicts with it.
best = gs(
    b_nv_pipe, params_2,
    X_train, y_train,
    X_test, y_test,
)

Best parameters: {'tvect__max_features': None, 'tvect__ngram_range': (1, 2)}
Best Score: 0.8967776440700629
Test Score: 0.9037304452466908


## Random Forest vs. Extra Trees Models

In [18]:
# Both ensembles are randomized; a fixed seed (the same value used for the
# train/test split) makes their cross-validation scores reproducible.
rf = RandomForestClassifier(random_state=42)
et = ExtraTreesClassifier(random_state=42)

In [19]:
# Unigram + bigram counts: the vocabulary is learned from the training titles
# only and then reused, unchanged, to encode the test titles.
cvect = CountVectorizer(ngram_range=(1, 2))
X_train_cv, X_test_cv = cvect.fit_transform(X_train), cvect.transform(X_test)

In [20]:
# Mean cross-validated score of the Random Forest on the training data
cross_val_score(estimator=rf, X=X_train_cv, y=y_train).mean()

0.8612113918973124

In [21]:
# Mean cross-validated score of Extra Trees on the training data
cross_val_score(estimator=et, X=X_train_cv, y=y_train).mean()

0.869634977938227

In [23]:
# Extra Trees scored slightly better than Random Forest in cross-validation,
# so it is the model tuned with a grid search.
# Seeded so that repeated runs of the search give the same result.
et = ExtraTreesClassifier(random_state=42)
et_params = {
    'n_estimators': [10, 20, 30],
    'max_depth': [None, 1, 2, 3, 4, 5],
    'min_samples_split': [2, 3, 4],
    # 'max_features' is deliberately not searched: it took too long to run
}
et_gs = GridSearchCV(et, param_grid=et_params)
et_gs.fit(X_train_cv, y_train)
print(et_gs.best_score_)
et_gs.best_params_

0.8814012568525204


{'max_depth': None, 'min_samples_split': 4, 'n_estimators': 30}

In [25]:
# Compare fit on seen vs. unseen data; a large gap between the two scores
# points to overfitting.
print(
    'train score:', et_gs.score(X_train_cv, y_train),
)
print(
    'test score:', et_gs.score(X_test_cv, y_test),
)

train score: 0.9949191068324642
test score: 0.8728439630966707


## Evaluation

In [34]:
# Predict the held-out titles with the best-scoring model and tabulate
# correct vs. incorrect predictions per class.
predictions = best.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=predictions)

In [35]:
# Wrap the confusion matrix in a labelled DataFrame:
# rows are the actual classes, columns the predicted ones.
cm_df = pd.DataFrame(
    data=cm,
    index=['actual neg', 'actual pos'],
    columns=['predicted neg', 'predicted pos'],
)
cm_df

Unnamed: 0,predicted neg,predicted pos
actual neg,1096,153
actual pos,87,1157


In [39]:
# Calculate model accuracy directly from the confusion matrix instead of
# copying counts from a previous run's output, which would go stale whenever
# the data or model changes.
# Accuracy = correct predictions (the diagonal) / all predictions.
accuracy = float(cm.trace() / cm.sum())
accuracy*100

90.37304452466908