In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt

import seaborn as sns
sns.set()


import my_resample as ms
import my_functions as mf

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score



from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

# from sklearn.ensemble.partial_dependence import plot_partial_dependence
# from sklearn.model_selection import GridSearchCV

# from sklearn.model_selection import KFold
# from sklearn.metrics import roc_curve, auc
# from scipy import interp
# from random import *
# import matplotlib.pyplot as plt
# %matplotlib inline

# from sklearn.ensemble.partial_dependence import plot_partial_dependence
# from sklearn.ensemble.partial_dependence import partial_dependence

from importlib import reload

import warnings
warnings.filterwarnings('ignore')

# READ IN DATA

In [None]:
# Load the pickled feature data (X) and the two target objects (y1, y2).
# NOTE(review): these are absolute, machine-specific paths; pickle files should
# only ever be loaded from a trusted source.
pickle_files = (
    '/Users/gandalf/Documents/coding/data_do_not_commit/X.pkl',
    '/Users/gandalf/Documents/coding/data_do_not_commit/y1.pkl',
    '/Users/gandalf/Documents/coding/data_do_not_commit/y2.pkl',
)
X, y1, y2 = (pd.read_pickle(pickle_file) for pickle_file in pickle_files)

In [None]:
# Preview the first rows of the feature data X.
X.head()

In [None]:
# Preview the first rows of y1, the target passed to train_test_split below.
y1.head()

In [None]:
# Preview the first rows of y2 (loaded alongside y1; not used by the cells visible here).
y2.head()

# TRAIN TEST SPLIT

In [None]:
# Hold out a test set with a fixed seed so the split is reproducible.
# `.values` replaces `.as_matrix()`, which was deprecated and then removed in
# pandas 1.0; it returns the same NumPy arrays on old and new pandas alike.
X_train, X_test, y_train, y_test = train_test_split(
    X.values, y1.values, random_state=17
)

# Resample the training split only; the test split is left untouched.
# NOTE(review): the meaning of the 0.5 argument is defined in my_resample — confirm.
X_train, y_train = ms.oversample(X_train, y_train, .5)

# Standardise features: the scaler is fitted on the training data alone and the
# same fitted transform is then applied to the test data.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def display_importances_trees(model, X):
    """Rank the features of a fitted model by importance.

    Parameters
    ----------
    model : fitted estimator exposing ``feature_importances_``.
    X : object with a ``columns`` attribute naming the features, in the same
        order as ``model.feature_importances_``.

    Returns
    -------
    pandas.DataFrame
        Columns ``'feature'`` and ``'coefficient'`` (the importance value),
        sorted by ``'coefficient'`` in descending order.

    Side effect: sets the global pandas ``display.float_format`` option so
    floats render with thousands separators and two decimals.
    """
    pd.set_option('display.float_format', '{:,.2f}'.format)

    # Stack names and importances as two rows, then flip so each feature is a row.
    two_rows = pd.DataFrame([X.columns, model.feature_importances_])
    ranking = two_rows.transpose()
    ranking.columns = ['feature', 'coefficient']
    return ranking.sort_values(by='coefficient', ascending=False)

# DECISION TREE CLASSIFIER

In [None]:
%%time

# Fit a decision tree on the training data prepared above.
# random_state pins the estimator's internal randomness so reruns report the
# same metrics; 17 matches the seed used for the train/test split.
model = DecisionTreeClassifier(random_state=17)
model.fit(X_train, y_train)

# Predict once on the held-out test set and reuse the predictions below.
y_pred = model.predict(X_test)

print("\nMETRICS")
print("Model recall: {}".format(recall_score(y_test, y_pred)))
print("Model precision: {}".format(precision_score(y_test, y_pred)))
print("Model accuracy: {}".format(model.score(X_test, y_test)))

# confusion_matrix puts actual classes on rows and predicted classes on
# columns; the key below assumes binary labels with the negative class first.
print("\nCONFUSION MATRIX")
print(confusion_matrix(y_test, y_pred))
print("\nkey:")
print(" TN   FP ")
print(" FN   TP ")

# Ten highest-ranked features according to the fitted model.
print(display_importances_trees(model, X).head(10))

# RANDOM FOREST CLASSIFIER

In [None]:
%%time

# Fit a random forest on the training data prepared above.
# random_state makes the bootstrap sampling and feature selection repeatable;
# 17 matches the seed used for the train/test split.
model = RandomForestClassifier(random_state=17)
model.fit(X_train, y_train)

# Predict once on the held-out test set (the original cell ran this twice).
y_pred = model.predict(X_test)

print("\nMETRICS")
print("Model recall: {}".format(recall_score(y_test, y_pred)))
print("Model precision: {}".format(precision_score(y_test, y_pred)))
print("Model accuracy: {}".format(model.score(X_test, y_test)))

# confusion_matrix puts actual classes on rows and predicted classes on
# columns; the key below assumes binary labels with the negative class first.
print("\nCONFUSION MATRIX")
print(confusion_matrix(y_test, y_pred))
print("\nkey:")
print(" TN   FP ")
print(" FN   TP ")

# Ten highest-ranked features according to the fitted model.
print(display_importances_trees(model, X).head(10))

# GRADIENT BOOSTING CLASSIFIER

In [None]:
%%time

# Fit a gradient-boosting model with hand-chosen hyperparameters.
# random_state pins the estimator's internal randomness so reruns report the
# same metrics; 17 matches the seed used for the train/test split.
# subsample is written as a float (1.0 = use every training row per stage).
model = GradientBoostingClassifier(
    learning_rate=0.4,
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=3,
    n_estimators=100,
    subsample=1.0,
    random_state=17,
)
model.fit(X_train, y_train)

# Predict once on the held-out test set and reuse the predictions below.
y_pred = model.predict(X_test)

print("\nMETRICS")
print("Model recall: {}".format(recall_score(y_test, y_pred)))
print("Model precision: {}".format(precision_score(y_test, y_pred)))
print("Model accuracy: {}".format(model.score(X_test, y_test)))

# confusion_matrix puts actual classes on rows and predicted classes on
# columns; the key below assumes binary labels with the negative class first.
print("\nCONFUSION MATRIX")
print(confusion_matrix(y_test, y_pred))
print("\nkey:")
print(" TN   FP ")
print(" FN   TP ")

# Ten highest-ranked features according to the fitted model.
print(display_importances_trees(model, X).head(10))

# ADABOOST CLASSIFIER

In [None]:
%%time

# Fit an AdaBoost ensemble (library defaults) on the training data prepared above.
# random_state pins the estimator's internal randomness so reruns report the
# same metrics; 17 matches the seed used for the train/test split.
model = AdaBoostClassifier(random_state=17)
model.fit(X_train, y_train)

# Predict once on the held-out test set and reuse the predictions below.
y_pred = model.predict(X_test)

print("\nMETRICS")
print("Model recall: {}".format(recall_score(y_test, y_pred)))
print("Model precision: {}".format(precision_score(y_test, y_pred)))
print("Model accuracy: {}".format(model.score(X_test, y_test)))

# confusion_matrix puts actual classes on rows and predicted classes on
# columns; the key below assumes binary labels with the negative class first.
print("\nCONFUSION MATRIX")
print(confusion_matrix(y_test, y_pred))
print("\nkey:")
print(" TN   FP ")
print(" FN   TP ")

# Ten highest-ranked features according to the fitted model.
print(display_importances_trees(model, X).head(10))