In [None]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cross_validation import KFold, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import r2_score
from sklearn.cross_validation import cross_val_score
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO

from IPython.display import Image
import pydot
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the training table from CSV.
# NOTE(review): the file name suggests the trap records are already joined
# with the weather data -- confirm against the script that produced it.
df = pd.read_csv('../assets/trainWeather.csv')

In [None]:
# Separate the feature matrix from the target column.  The target itself and
# the two coordinate columns are excluded from the features.
X = df.drop(labels=['WnvPresent', 'Latitude', 'Longitude'], axis=1)
y = df['WnvPresent']

In [None]:
# These columns carry the text marker '  T' (presumably "trace amount" --
# TODO confirm) mixed in with their other values.  Series.map() with a dict
# converts every value that is NOT a key of the dict into NaN, which would
# discard all the real readings.  Substitute only the marker, then coerce the
# column to numbers; anything still non-numeric becomes NaN.
TRACE_SUBSTITUTES = [('SnowFall_x', 1), ('PrecipTotal_x', 0), ('PrecipTotal_y', 0)]
for column, substitute in TRACE_SUBSTITUTES:
    X[column] = pd.to_numeric(X[column].replace('  T', substitute), errors='coerce')

In [None]:
# Impute missing values with each column's own mean.  The result is assigned
# back to X rather than calling fillna(inplace=True) on X[col]: the in-place
# form operates on an intermediate Series and is not guaranteed to write
# through to the parent frame.
for column in ['PrecipTotal_y', 'PrecipTotal_x', 'SnowFall_x']:
    X[column] = X[column].fillna(X[column].mean())

In [None]:
# Standardise every column after the first one to zero mean and unit variance.
# (StandardScaler does not squeeze values into the 0-1 range; that would be
# MinMaxScaler.)
# NOTE(review): the scaler is fitted on all rows, before the train/test split
# is made -- confirm that this is intended.

scale = StandardScaler()

# Select the columns by position and assign them back by name; this replaces
# the deprecated .ix / .as_matrix() accessors with equivalent calls.
scaled_columns = X.columns[1:]
X[scaled_columns] = scale.fit_transform(X[scaled_columns].values)

In [None]:
def create_month(x):
    """Return the second '-'-separated field of ``x`` (the month of a 'YYYY-MM-DD' string), as a string."""
    fields = x.split('-')
    return fields[1]

def create_day(x):
    """Return the third '-'-separated field of ``x`` (the day of a 'YYYY-MM-DD' string), as a string."""
    fields = x.split('-')
    return fields[2]

# Derive 'month' and 'day' feature columns from the raw date strings.
for feature, extractor in (('month', create_month), ('day', create_day)):
    X[feature] = X['Date'].apply(extractor)
# test['month'] = test.Date.apply(create_month)
# test['day'] = test.Date.apply(create_day)

In [None]:
# Parse the date strings into timestamps, then move them from a column into
# the row index.
X['Date'] = pd.to_datetime(X.Date)
X = X.set_index('Date')

In [None]:
# Encode the target labels as integer class ids; `le` keeps the mapping.
le = LabelEncoder()
le.fit(y)
y = le.transform(y)

In [None]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Hyper-parameter grid for the grid search.  Combinations are searched because
# max_depth on its own may not be what yields the best model.
# min_samples_split starts at 2: a node needs at least two samples to be
# split, so 1 adds nothing and is rejected by newer scikit-learn releases.

PARAMETERS = {'max_depth': [1, 2, 3, 4, 5, 6], 'max_features': [1, 2, 3, 4],
              'max_leaf_nodes': [5, 6, 7, 8, 9, 10], 'min_samples_leaf': [1, 2, 3, 4],
              'min_samples_split': [2, 3, 4]}
# NOTE(review): this is a regression metric applied to a classifier.  With two
# classes encoded as 0/1 the squared error equals the misclassification rate,
# so the ranking is still meaningful -- confirm that is the intended criterion.
SCORING = 'mean_squared_error'

In [None]:
# Use grid search to find the best value for each parameter.  The search runs
# on the training split only, so the held-out test rows never influence which
# hyper-parameters are selected.

clfModel = DecisionTreeClassifier()
clf = GridSearchCV(clfModel, PARAMETERS, scoring=SCORING, n_jobs=-1)
clf.fit(X_train, y_train)

# After completion, show the best estimator and its score.  The score is
# negated before the square root is taken, i.e. the last line is an RMSE.
print(clf.best_estimator_)
print(clf.best_score_)
print(np.sqrt(-clf.best_score_))

In [None]:
# Fit a single decision tree on the training split, using hard-coded
# hyper-parameters taken from the grid-search result.
tree_settings = dict(
    class_weight=None, criterion='gini', splitter='best',
    max_depth=1, max_features=1, max_leaf_nodes=5,
    min_samples_leaf=1, min_samples_split=2,
    min_weight_fraction_leaf=0.0, presort=False, random_state=8
)
treeclf = DecisionTreeClassifier(**tree_settings)
treeclf.fit(X_train, y_train)

In [None]:
# Render the fitted single tree as a PNG via Graphviz.  This lone tree is the
# reference point for the ensemble models built in the cells that follow.
dot_data = StringIO()
export_graphviz(
    treeclf,
    out_file=dot_data,
    feature_names=X.columns,
    filled=True,
    rounded=True,
    special_characters=True
)
dot_source = dot_data.getvalue()
graph = pydot.graph_from_dot_data(dot_source)
Image(graph[0].create_png())

In [None]:
# Hard class labels from the single tree for the held-out rows.
predicted = treeclf.predict(X_test)
print(predicted)

In [None]:
# Class-membership probabilities from the single tree for the test set.
# Column 1 is what the AUC score and the ROC curve further down are computed from.
probs = treeclf.predict_proba(X_test)

In [None]:
# rfc = RandomForestClassifier(n_jobs=-1, max_features= 'sqrt', n_estimators=50, oob_score = True) 

# CV_rfc = GridSearchCV(rfc, PARAMETERS, scoring=SCORING, cv= 5)
# CV_rfc.fit(X, y)
# print CV_rfc.best_params_
# print CV_rfc.best_estimator_

In [None]:
# Random forest of 50 depth-1 trees, fitted on the training split.
# min_samples_split is 2 rather than 1: a node cannot be split with fewer than
# two samples, and newer scikit-learn releases raise an error for 1.
rfClf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=1, max_features=1, max_leaf_nodes=5,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
            oob_score=True, random_state=63, verbose=0, warm_start=False)

rfClf.fit(X_train, y_train)

In [None]:
# Random-forest output for the held-out rows: hard labels first (printed),
# then the per-class probabilities.
rfPreds = rfClf.predict(X_test)
print(rfPreds)

rfProbs = rfClf.predict_proba(X_test)

In [None]:
# Build an AdaBoost ensemble of 200 depth-1 decision trees.
# Note that this model is fitted on the full data set (X, y), not only on the
# training split.
stump = DecisionTreeClassifier(max_depth=1)
bdt = AdaBoostClassifier(stump, algorithm="SAMME", n_estimators=200)

bdt.fit(X, y)


In [None]:
# AdaBoost (SAMME) over 200 depth-1 trees, fitted on the training split only.
ada_base = DecisionTreeClassifier(
    class_weight=None, criterion='gini', max_depth=1,
    max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
    min_samples_split=2, min_weight_fraction_leaf=0.0,
    presort=False, random_state=5, splitter='best'
)
ada = AdaBoostClassifier(
    algorithm='SAMME',
    base_estimator=ada_base,
    learning_rate=1.0,
    n_estimators=200,
    random_state=None
)

ada.fit(X_train, y_train)

In [None]:
# AdaBoost output for the held-out rows: hard labels and class probabilities,
# followed by accuracy and ROC AUC on the test split.
adaPreds = ada.predict(X_test)
# Show the AdaBoost predictions themselves (this cell previously printed the
# random-forest predictions by mistake).
print(adaPreds)

adaProbs = ada.predict_proba(X_test)

print("%s %s" % ("The AdaBoost Classifier's accuracy score is",
                 metrics.accuracy_score(y_test, adaPreds)))
print("%s %s" % ("The AdaBoost Classifier's area under the curve is",
                 metrics.roc_auc_score(y_test, adaProbs[:, 1])))

In [None]:
# Accuracy and ROC AUC of the three fitted models on the test split.
# Each entry is (message, value); an entry whose value is None is printed as-is.
summary_lines = [
    ("The Decision Tree Classifier's accuracy score is", metrics.accuracy_score(y_test, predicted)),
    ("The Decision Tree Classifier's area under the curve is", metrics.roc_auc_score(y_test, probs[:, 1])),
    (' ', None),
    ("The Random Forest Classifier's accuracy score is", metrics.accuracy_score(y_test, rfPreds)),
    ("The Random Forest Classifier's area under the curve is", metrics.roc_auc_score(y_test, rfProbs[:, 1])),
    ("The AdaBoost Classifier's accuracy score is", metrics.accuracy_score(y_test, adaPreds)),
    ("The AdaBoost Classifier's area under the curve is", metrics.roc_auc_score(y_test, adaProbs[:, 1])),
]
for message, value in summary_lines:
    print(message if value is None else "%s %s" % (message, value))

In [None]:
# Confusion matrix of the single decision tree on the test split.
tree_cm = confusion_matrix(y_test, predicted)
print(tree_cm)

In [None]:
# Confusion matrix of the random forest on the test split.
rf_cm = confusion_matrix(y_test, rfPreds)
print(rf_cm)

In [None]:
# Confusion matrix of the AdaBoost model on the test split.
ada_cm = confusion_matrix(y_test, adaPreds)
print(ada_cm)

In [None]:
# ROC curve points and area under the curve for each model, computed from the
# probability each model assigns to class 1 on the test split.
adaFalse_positive_rate, adaTrue_positive_rate, adaThresholds = metrics.roc_curve(y_test, adaProbs[:, 1])
adaRoc_auc = metrics.auc(adaFalse_positive_rate, adaTrue_positive_rate)

rfFalse_positive_rate, rfTrue_positive_rate, rfThresholds = metrics.roc_curve(y_test, rfProbs[:, 1])
rfRoc_auc = metrics.auc(rfFalse_positive_rate, rfTrue_positive_rate)

false_positive_rate, true_positive_rate, thresholds = metrics.roc_curve(y_test, probs[:, 1])
roc_auc = metrics.auc(false_positive_rate, true_positive_rate)

# One entry per curve: (x values, y values, line colour, legend text).
roc_series = [
    (adaFalse_positive_rate, adaTrue_positive_rate, 'b', 'AdaBoost AUC = %0.2f' % adaRoc_auc),
    (rfFalse_positive_rate, rfTrue_positive_rate, 'm', 'RandomForest AUC = %0.2f' % rfRoc_auc),
    (false_positive_rate, true_positive_rate, 'g', 'DecisionTree AUC = %0.2f' % roc_auc),
]

plt.rcParams['figure.figsize'] = (17, 8)
plt.title('Receiver Operating Characteristic\n', fontsize=20)

for fpr, tpr, colour, caption in roc_series:
    plt.plot(fpr, tpr, colour, label=caption)

plt.legend(loc=7, fontsize='x-large')
# Dashed diagonal from (0, 0) to (1, 1) for reference.
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([-0.1, 1.2])
plt.ylim([-0.1, 1.2])
plt.ylabel('True Positive Rate\n', fontsize=14)
plt.xlabel('\nFalse Positive Rate', fontsize=14)
plt.show()

In [None]:
# Re-classify the test rows from the random forest's probabilities with a
# custom decision threshold instead of the default class prediction.
probDf = pd.DataFrame(rfProbs, columns=['noWN', 'WN'])

probDf['actual'] = y_test

# .copy() gives cmDf its own data, so adding a column below does not trigger
# SettingWithCopyWarning (assignment on a slice of probDf).
cmDf = probDf[['actual', 'WN']].copy()
threshold = 0.08

# 1 where the 'WN' probability reaches the threshold, otherwise 0.
cmDf['predicted'] = (cmDf['WN'] >= threshold).astype(int)

In [None]:
# Cross-tabulate actual labels against the thresholded random-forest predictions.
rf_crosstab = pd.crosstab(cmDf['actual'], cmDf['predicted'], rownames=['actual'])
print(rf_crosstab)

In [None]:
# Re-classify the test rows from the AdaBoost probabilities with a custom
# decision threshold instead of the default class prediction.
adaProbDf = pd.DataFrame(adaProbs, columns=['noWN', 'WN'])

adaProbDf['actual'] = y_test

# .copy() gives adaCmDf its own data, so adding a column below does not
# trigger SettingWithCopyWarning (assignment on a slice of adaProbDf).
adaCmDf = adaProbDf[['actual', 'WN']].copy()
adaThreshold = 0.45

# 1 where the 'WN' probability reaches the threshold, otherwise 0.
adaCmDf['predicted'] = (adaCmDf['WN'] >= adaThreshold).astype(int)

In [None]:
# Cross-tabulate actual labels against the thresholded AdaBoost predictions.
ada_crosstab = pd.crosstab(adaCmDf['actual'], adaCmDf['predicted'], rownames=['actual'])
print(ada_crosstab)