Modeling done on the most important features from the total dataset as determined by a random forest regressor, evaluated against random noise

In [121]:
### Importing all of the cool things I'll need...

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import random
from sklearn import cross_validation
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve
from sklearn.learning_curve import learning_curve
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

%matplotlib inline

In [None]:
### Function to calculate cross-validation scores across models

def cv_score_means(model, X, y, score_type):
    scores = cross_val_score(model, X, y, cv=30, scoring=score_type)
    return scores.mean()

In [111]:
### Increases width of Jupyter notebook display

from IPython.display import display, HTML

display(HTML(data="""
<style>
    div#notebook-container    { width: 95%; }
    div#menubar-container     { width: 65%; }
    div#maintoolbar-container { width: 99%; }
</style>
"""))

In [112]:
### Reads dataset to pandas dataframe

df = pd.read_csv('mcnulty.csv')

In [126]:
### Curated list of important features as determined by the random forest regressor 

feature_list = ['% Children in Poverty', 'Chlamydia Rates per 100,000', 
                 '% Single-Parent Households', 'Avg Freshman Graduation Rate', 'Total % uninsured women in need of publicly funded contraceptive services,', 
                 'Adherents %', '% Obese']

Splitting into initial test-train sets

In [114]:
tp_tgt = df['High Risk']
tp_vars = df[feature_list]

In [115]:
for i in feature_list:
    tp_vars[i] = tp_vars[i].replace(np.nan, tp_vars[i].median())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [116]:
X_train, X_test, y_train, y_test = cross_validation.train_test_split(tp_vars, tp_tgt, test_size=0.3)

### Random Forest Model

In [117]:
max_features = len(feature_list)

rf =RandomForestClassifier(n_estimators = 500, max_features = max_features )
rf.fit(X_train, y_train)

a  = tp_vars.columns.values

ftr_imp = list(zip(a, rf.feature_importances_))

ftr_imp = sorted(ftr_imp, key=(lambda x: x[1]), reverse = True)

print(rf.score(X_test, y_test))
for i in ftr_imp:
    print(i)

0.839957035446
('% Children in Poverty', 0.36570218982759872)
('Chlamydia Rates per 100,000', 0.11579992855776972)
('Adherents %', 0.085677973521554637)
('Avg Freshman Graduation Rate', 0.077384259262161262)
('Total % uninsured women in need of publicly funded contraceptive services,', 0.064134422093231544)
('% Single-Parent Households', 0.05976243376084045)
('Congregations Per 10K People', 0.05556911934534145)
('% Binge Drinking', 0.050342710115757805)
('% Smokers', 0.045447008919974295)
('% Obese', 0.041024607621511402)
('Violent Crime Rate', 0.039155346974258695)


### KNN (K-Nearest Neighbors) Model

In [None]:
knn_acc = []
rng = [x for x in range(1, 51)]

for n in rng:
    knn = KNeighborsClassifier(n_neighbors=n)
    knn.fit(X_train, y_train)
    acc = accuracy_score(y_test, knn.predict(X_test))
    knn_acc.append(acc)
    print(n, acc)

### Logistic Regression Model

In [118]:
log_reg = LogisticRegression()
log_reg.fit(X_train,y_train)
acc_lr = accuracy_score(y_test, log_reg.predict(X_test))

a  = tp_vars.columns.values
lr_coef = list(zip(a, log_reg.coef_[0]))
lr_coef = sorted(lr_coef, key=(lambda x: x[1]), reverse = True)

print('Logistic Regression accuracy: ', acc_lr)
for i in lr_coef:
    print(i)

Logistic Regression accuracy:  0.832438238453
('% Single-Parent Households', 0.23238908298980213)
('% Children in Poverty', 0.13457739621782397)
('% Smokers', 0.06764208889705968)
('Adherents %', 0.028757001543396835)
('Total % uninsured women in need of publicly funded contraceptive services,', 0.024654917666771949)
('Violent Crime Rate', 0.0004971160070419931)
('Chlamydia Rates per 100,000', 0.00022468468714539124)
('Congregations Per 10K People', -0.012146550696874396)
('% Obese', -0.036372421018075765)
('Avg Freshman Graduation Rate', -0.046268644603896271)
('% Binge Drinking', -0.11430239400565388)


In [131]:
df['predicted_vals'] = log_reg.predict(tp_vars)

In [132]:
df_pred = df[['Locale County FIPS Code', 'predicted_vals']]

In [134]:
df_pred.to_csv('predicted.csv')

In [122]:
### Quick and dirty assessment of model performance

models = {}

models['KNN'] = KNeighborsClassifier(n_neighbors=20)
models['LogReg'] = LogisticRegression()
models['GNB'] = GaussianNB()
models['DecTree'] = DecisionTreeClassifier()
models['RandForest'] = RandomForestClassifier(n_estimators = 100)
models['Gradient Trees'] = GradientBoostingClassifier()

score_list = ['accuracy', 'precision', 'recall', 'f1']

In [123]:
for name, model in models.items():
    print(name, ':')
    for m in score_list:
        scores = cv_score_means(model, tp_vars, tp_tgt, m)
        print(m, scores.mean())
    print('')

LogReg :
accuracy 0.832413640836
precision 0.795802787334
recall 0.694117647059
f1 0.723371802713

GNB :
accuracy 0.815958337779
precision 0.759811443118
recall 0.664824717766
f1 0.694615029381

Gradient Trees :
accuracy 0.829871202153
precision 0.77543139587
recall 0.717379679144
f1 0.728885501884

DecTree :
accuracy 0.763733181597
precision 0.651003158652
recall 0.649792038027
f1 0.626120675623

RandForest :
accuracy 0.827627493865
precision 0.776933615264
recall 0.713339275104
f1 0.725476723182

KNN :
accuracy 0.757170803846
precision 0.707424014158
recall 0.426648841355
f1 0.507036019694



## ROC Curves
Calculates and plots ROC curves for models

In [None]:
KNN = KNeighborsClassifier(n_neighbors=23).fit(X_train, y_train)
GNB = GaussianNB().fit(X_train, y_train)
# SVM = SVC(probability = True).fit(X_train, y_train)
RandForest = RandomForestClassifier(n_estimators = 5).fit(X_train, y_train)
DecTree = DecisionTreeClassifier().fit(X_train, y_train)
LogReg = LogisticRegression().fit(X_train, y_train)

KNN_pred = KNN.predict_proba(X_test)
GNB_pred = GNB.predict_proba(X_test)
# SVM_pred = SVM.predict_proba(X_test)
RandForest_pred = RandForest.predict_proba(X_test)
DecTree_pred = DecTree.predict_proba(X_test)
LogReg_pred = LogReg.predict_proba(X_test)

In [None]:
fpr_knn, tpr_knn, thresholds_knn = roc_curve(y_test, KNN_pred[:,1])
fpr_gnb, tpr_gnb, thresholds_gmb = roc_curve(y_test, GNB_pred[:,1])
# fpr_svm, tpr_svm, thresholds_svm = roc_curve(y_test, SVM_pred[:,1])
fpr_randforest, tpr_randforest, thresholds_randforest = roc_curve(y_test, RandForest_pred[:,1])
fpr_dectree, tpr_dectree, thresholds_dectree = roc_curve(y_test, DecTree_pred[:,1])
fpr_logreg, tpr_logreg, thresholds_logreg = roc_curve(y_test, LogReg_pred[:,1])

In [None]:
# row and column sharing
f, ((ax1, ax2), (ax3, ax4), (ax5, ax6)) = plt.subplots(3, 2, sharex='col')
f.set_size_inches(18, 12)
ax1.plot(fpr_knn, tpr_knn);
ax1.plot(fpr_knn, fpr_knn, 'r--');
ax1.set_title('KNN');
ax2.plot(fpr_gnb, tpr_gnb);
ax2.plot(fpr_gnb, fpr_gnb, 'r--');
ax2.set_title('GNB');
ax3.plot(fpr_randforest, tpr_randforest);
ax3.plot(fpr_randforest, fpr_randforest, 'r--');
ax3.set_title('Random Forest');
ax4.plot(fpr_dectree, tpr_dectree);
ax4.plot(fpr_dectree, fpr_dectree, 'r--');
ax4.set_title('Decision Tree');
ax5.plot(fpr_logreg, tpr_logreg);
ax5.plot(fpr_logreg, fpr_logreg, 'r--');
ax5.set_title('Logistic Regression');
ax6.plot(fpr_svm, tpr_svm);
ax6.plot(fpr_svm, fpr_svm, 'r--');
ax6.set_title('SVM');