In [46]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.grid_search import GridSearchCV

In [2]:
faves = pd.read_csv('../Assets/A/faves.csv')
faves_svd_df = pd.read_csv('../Assets/A/faves_SVD.csv')

In [3]:
faves = faves.as_matrix()
faves_svd = faves_svd_df.as_matrix()

In [18]:
okc = pd.read_csv('../Assets/A/train.csv')

In [20]:
y = LabelEncoder().fit_transform(okc.sex)

### Evaluate Baseline NB Classifier for sex from favorites

In [27]:
# Can you get a single crossval to report multiple scores for each model?
cv = cross_val_score(MultinomialNB(), faves, y=y, scoring='roc_auc',  cv = 5)

In [28]:
cv

array([ 0.71122189,  0.71223197,  0.71396987,  0.71515186,  0.71127318])

In [29]:
cv = cross_val_score(MultinomialNB(), faves, y=y, scoring='f1',  cv = 5)

In [30]:
cv

array([ 0.73204956,  0.7330202 ,  0.74480234,  0.74814707,  0.74335252])

In [31]:
# Can you get a single grid search to report multiple scores for each model?
cv = cross_val_score(MultinomialNB(), faves, y=y, scoring='accuracy',  cv = 5)
cv

array([ 0.66731536,  0.66814938,  0.67695302,  0.67874687,  0.67346371])

### Can I use Truncated SVD to produce a stronger model?
I cannot build a multinomial naive bayesian classifier on results of SVD.  Need positive integers.
I CAN run a Gaussian naive bayes, but recall that features are not normally distributed (the first feature of the SVD, which explains about 97% of the variance appears to be uniformly distributed)

In [33]:
cv = cross_val_score(GaussianNB(), faves_svd, y=y, scoring='roc_auc',  cv = 5)
cv

array([ 0.52060533,  0.51405004,  0.52245366,  0.52203028,  0.51625753])

In [35]:
cv = cross_val_score(GaussianNB(), faves_svd, y=y, scoring='f1',  cv = 5)
cv

array([ 0.74898504,  0.74825905,  0.74606506,  0.7267183 ,  0.72641452])

In [36]:
cv = cross_val_score(GaussianNB(), faves_svd, y=y, scoring='accuracy',  cv = 5)
cv

array([ 0.59892503,  0.59799833,  0.59633028,  0.58466957,  0.58633794])

GaussianNB on faves_SVD performs worse than multinomialNB on faves.  While f1 scores are comparable, roc_auc and accuracy lag behind.  WHAT DOES THAT MEAN?

### How about using a Random Forest Classifier instead of Bayesian?
#### Try on non-SVD first

In [41]:
cv = cross_val_score(RandomForestClassifier(), faves, y=y, scoring='roc_auc',  cv = 5)
cv

array([ 0.54463369,  0.6061449 ,  0.5953767 ,  0.58808907,  0.58173771])

This takes forever and does not perform as well as MultinomialNB

### Try to fit and tune RFC on faves_SVD

In [43]:
cv = cross_val_score(RandomForestClassifier(), faves_svd, y=y, scoring='roc_auc',  cv = 5)
cv

array([ 0.49984657,  0.49459035,  0.47103527,  0.47018652,  0.50048467])

In [44]:
cv = cross_val_score(RandomForestClassifier(), faves_svd, y=y, scoring='f1',  cv = 5)
cv

  'precision', 'predicted', average, warn_for)


array([ 0.        ,  0.53312998,  0.5146846 ,  0.51869277,  0.74888386])

In [45]:
params = {'n_estimators':[5, 10, 15, 30], 'max_depth':[3,8,20,None], 'min_samples_split':[2,8]}

In [48]:
grid = GridSearchCV(RandomForestClassifier(), params, cv=3, scoring = 'roc_auc')

In [49]:
grid.fit(faves_svd, y)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'n_estimators': [5, 10, 15, 30], 'min_samples_split': [2, 8], 'max_depth': [3, 8, 20, None]},
       pre_dispatch='2*n_jobs', refit=True, scoring='roc_auc', verbose=0)

In [50]:
grid.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=8,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [52]:
grid.best_score_

0.4968595887350532

### Try Textacy's text classification model

### Use spaCy to calculate "reading level" of essays and compare that to favorite books/movies, etc.
Compare to income?