In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from collections import Counter

%matplotlib inline

  from ._conv import register_converters as _register_converters


# Imports

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
# Selects which data set the notebook works on: the value is used as the
# sub-directory name under ../Data/ when loading X.npy / Y2.npy, and by
# feature_importance_rank() to pick the matching code table (only "white" and
# "hispanic" have one).
race = "hispanic"
#race = "white"
#race = "mixed"

In [4]:
# X is stored as an object-dtype array (see its repr below), and NumPy only
# loads object arrays when pickling is explicitly allowed.
# NOTE: unpickling can execute arbitrary code -- only point this at trusted,
# locally generated files.
X = np.load('../Data/' + race + '/X.npy', allow_pickle=True)
Y2 = np.load('../Data/' + race + '/Y2.npy', allow_pickle=True)

In [5]:
X

array([[0.0, 0.0, 0.0, ..., 0, 4, 30],
       [0.0, 0.0, 0.0, ..., 0, 2, 30],
       [0.0, 1.0, 0.0, ..., 0, 3, 24.65625],
       ...,
       [0.0, 1.0, 0.0, ..., 0, 1, 55],
       [0.0, 0.0, 0.0, ..., 0.031280517578125, 1, 76.30078125],
       [0.0, 1.0, 0.0, ..., 0.569580078125, 1, 21]], dtype=object)

In [None]:
# X_hispanic = np.load('../Data/hispanic/X.npy')
# Y_hispanic = np.load('../Data/hispanic/Y2.npy')
# X_white = np.load('../Data/white/X.npy')
# Y_white = np.load('../Data/white/Y2.npy')
#X_mixed = np.load('../Data/mixed/X.npy')
#Y_mixed = np.load('../Data/mixed/Y2.npy')

In [5]:
def split_train_test(X, Y, test_fraction=0.1, seed=97):
    """Shuffle X and Y together and hold out the trailing rows as a test set.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    Y : array-like
        Labels, same length as X. They are cast to int in the result.
    test_fraction : float, optional
        Share of the samples to hold out. The test size is
        ``int(test_fraction * len(Y))``. Defaults to 0.1.
    seed : int, optional
        Seed used for the shuffle. Defaults to 97.

    Returns
    -------
    X_train, X_test, Y_train, Y_test : numpy.ndarray

    Raises
    ------
    ValueError
        If X and Y do not contain the same number of samples.
    """
    X = np.asarray(X)
    Y = np.asarray(Y)
    if len(X) != len(Y):
        raise ValueError(
            "X and Y must have the same number of samples, got %d and %d"
            % (len(X), len(Y))
        )

    # Shuffle. This seeds NumPy's *global* generator, as the notebook always
    # did, so any later code that draws from np.random stays reproducible.
    np.random.seed(seed)
    idx = np.random.permutation(len(X))
    X = X[idx]
    Y = Y[idx]

    # Split on a non-negative index. With negative slicing a test size of 0
    # turns X[:-0] into an empty training set and X[-0:] into the whole data.
    test_size = int(test_fraction * len(Y))
    split_at = len(Y) - test_size
    X_train, X_test = X[:split_at], X[split_at:]
    Y_train, Y_test = Y[:split_at].astype(int), Y[split_at:].astype(int)
    return X_train, X_test, Y_train, Y_test

In [6]:
X_train, X_test, Y_train, Y_test = split_train_test(X, Y2)

In [7]:
#X_train_h, X_test_h, Y_train_h, Y_test_h = split_train_test(X_hispanic, Y_hispanic)
#X_train_w, X_test_w, Y_train_w, Y_test_w = split_train_test(X_white, Y_white)
#X_train_m, X_test_m, Y_train_m, Y_test_m = split_train_test(X_mixed, Y_mixed)

# Feature Scaling
Fit the scalers on the training data only, then apply the fitted transforms to both the training and the test data, so that no test-set statistics leak into preprocessing.

In [8]:
# feature scaling: scale features based on training data only
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def feature_scale(X_train, X_test, n_continuous=4):
    """Scale the features using statistics from the training data only.

    The last ``n_continuous`` columns are standardised with ``StandardScaler``;
    all preceding columns are min-max scaled to [-1, 1]. Both scalers are
    fitted on ``X_train`` and then applied to ``X_test``.

    Parameters
    ----------
    X_train, X_test : array-like of numeric values
        Feature matrices with identical column layout.
    n_continuous : int, optional
        Number of trailing columns to standardise. Defaults to 4.
        (feature_importance_rank() in this notebook labels the last four
        columns age_dx, eod10_pn, maligcount and tumsiz.)

    Returns
    -------
    X_train_scaled, X_test_scaled : numpy.ndarray of float
        New arrays; the inputs are left untouched.
    """
    # Work on float copies: the caller's arrays are not modified, so calling
    # this function twice on the same inputs gives the same result, and the
    # object dtype of the stored data does not propagate downstream.
    X_train = np.array(X_train, dtype=float)
    X_test = np.array(X_test, dtype=float)

    # Explicit column index instead of negative slicing, so the boundary is
    # computed once and n_continuous=0 cannot flip the meaning of the slices.
    split_col = X_train.shape[1] - n_continuous

    mm_scaler = MinMaxScaler(feature_range=(-1, 1))
    X_train[:, :split_col] = mm_scaler.fit_transform(X_train[:, :split_col])
    X_test[:, :split_col] = mm_scaler.transform(X_test[:, :split_col])

    std_scaler = StandardScaler()
    X_train[:, split_col:] = std_scaler.fit_transform(X_train[:, split_col:])
    X_test[:, split_col:] = std_scaler.transform(X_test[:, split_col:])
    return X_train, X_test

X_train, X_test = feature_scale(X_train, X_test)
#X_train_h, X_test_h = feature_scale(X_train_h, X_test_h)
#X_train_w, X_test_w = feature_scale(X_train_w, X_test_w)
#X_train_m, X_test_m = feature_scale(X_train_m, X_test_m)



# Results Function

In [9]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def results(classifier, X_eval=None, Y_eval=None):
    """Print evaluation metrics for a fitted binary (0/1) classifier.

    Prints accuracy, ROC AUC, PPV, NPV, sensitivity, specificity, G-mean and
    the confusion matrix.

    Parameters
    ----------
    classifier : fitted estimator
        Must provide ``predict`` and either ``predict_proba`` or
        ``decision_function``.
    X_eval, Y_eval : array-like, optional
        Data to evaluate on. When omitted, the notebook-level ``X_test`` /
        ``Y_test`` are used, so both ``results(clf)`` and
        ``results(clf, X_test, Y_test)`` work.
    """
    # Fall back to the notebook's hold-out split when no data is passed.
    if X_eval is None:
        X_eval = X_test
    if Y_eval is None:
        Y_eval = Y_test
    Y_true = np.asarray(Y_eval).astype(int)

    Y_pred = classifier.predict(X_eval)
    print("Test accuracy score: " + str(accuracy_score(Y_true, Y_pred)))

    # AUC needs a continuous score. Use the positive-class probability when
    # the estimator offers it; margin classifiers such as LinearSVC only
    # expose decision_function.
    if hasattr(classifier, "predict_proba"):
        scores = classifier.predict_proba(X_eval)[:, 1]
    else:
        scores = classifier.decision_function(X_eval)
    print("ROC: " + str(roc_auc_score(Y_true, scores)))

    # Fixing the label order guarantees a 2x2 matrix even if one class is
    # missing from the evaluation data, so the unpacking below cannot fail.
    matrix = confusion_matrix(Y_true, Y_pred, labels=[0, 1])
    tn, fp, fn, tp = matrix.ravel()
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    g_mean = np.sqrt(sensitivity*specificity)
    print("PPV: " + str(ppv))
    print("NPV: " + str(npv))
    print("Sensitivity: " + str(sensitivity))
    print("Specificity: " + str(specificity))
    print("G-Mean: " + str(g_mean))
    print("Confusion matrix:\n" + str(matrix))
    

In [None]:
import pickle
# Load the per-race code tables that feature_importance_rank() looks up as
# white_codes / hispanic_codes.
# NOTE: pickle.load can execute arbitrary code -- only load files that were
# produced locally by a trusted process.
with open('white_codes.pkl', 'rb') as f_w:
    white_codes = pickle.load(f_w)
with open('hispanic_codes.pkl', 'rb') as f_h:
    hispanic_codes = pickle.load(f_h)

In [None]:
hispanic_codes

In [None]:
def feature_importance_rank(array, race_name=None, codes=None):
    """Print variables ranked by total absolute importance, largest first.

    The last four entries of ``array`` are reported individually as age_dx,
    eod10_pn, maligcount and tumsiz. Every entry of ``codes`` contributes one
    variable whose score is the sum of the absolute values in its column
    range (presumably the one-hot columns of that variable -- verify against
    the code that built the tables).

    Parameters
    ----------
    array : array-like
        One importance / coefficient value per feature column.
    race_name : str, optional
        "white" or "hispanic"; selects the notebook-level code table.
        Defaults to the notebook-level ``race`` setting.
    codes : dict, optional
        Explicit code table. Each key is indexable with ``key[1]`` the
        variable name and ``key[2]`` the first column index; ``len(value)``
        is the number of columns. Overrides ``race_name`` when given.

    Raises
    ------
    ValueError
        If no code table exists for the requested race.
    """
    if codes is None:
        if race_name is None:
            race_name = race
        if race_name == "white":
            codes = white_codes
        elif race_name == "hispanic":
            codes = hispanic_codes
        else:
            # Previously this fell through with codes=None and crashed later
            # with an opaque AttributeError on codes.items().
            raise ValueError(
                "no code table available for race %r "
                "(expected 'white' or 'hispanic')" % (race_name,)
            )

    # Sign is irrelevant for ranking (coefficients may be negative).
    array = np.abs(array)

    ranking = {
        'tumsiz': array[-1],
        'maligcount': array[-2],
        'eod10_pn': array[-3],
        'age_dx': array[-4],
    }

    for key, val in codes.items():
        varname = key[1]
        start_idx = key[2]
        end_idx = start_idx + len(val)
        ranking[varname] = np.sum(array[start_idx:end_idx])

    ordered = sorted(ranking.items(), key=lambda pair: pair[1], reverse=True)
    for rank, pair in enumerate(ordered, 1):
        print(rank, pair)
    

In [None]:
X_train.shape

## Decision Tree

In [None]:

# Values explored in earlier runs:
# min_sample_split: 300,400
# min_samples_leaf: 200
# max_depth: 130
# min_weight_fraction_leaf: .01
param_grid = [{'max_depth':[40,50,60], 'min_samples_leaf':[250,260,270,280,290]}]
tree_clf_reg = DecisionTreeClassifier()
# scoring="roc_auc" ranks candidates by AUC computed from continuous scores
# (predict_proba / decision_function), the same quantity results() reports.
# make_scorer(roc_auc_score) would instead score the hard 0/1 output of
# predict(), which is a different and much coarser metric.
dt_grid_search = GridSearchCV(tree_clf_reg, param_grid, cv=3, scoring="roc_auc", verbose=3)
dt_grid_search.fit(X_train, Y_train.astype(int))

In [None]:
# Show the mean cross-validated score of every candidate, then the winner.
cvres = dt_grid_search.cv_results_
scored_candidates = zip(cvres["mean_test_score"], cvres["params"])
for mean_score, params in scored_candidates:
    print(mean_score, params)
print("Best: " + str(dt_grid_search.best_params_))

In [None]:
results(dt_grid_search, X_test, Y_test)

In [None]:
tree_clf_h = DecisionTreeClassifier(max_features=200, min_samples_leaf=150)
tree_clf_h.fit(X_train, Y_train)


In [None]:
results(tree_clf_h, X_test, Y_test)

In [None]:
feature_importance_rank(tree_clf_h.feature_importances_)

In [None]:
sorted(enumerate(tree_clf_h.feature_importances_), key=lambda x:x[1], reverse=True)

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, Y_train.astype(int))

In [None]:
results(knn_clf)

## Logistic Regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
param_grid = [{'C':[.01, .1, .75, 1, 1.5, 2]}]
# scoring="roc_auc" evaluates each candidate on its continuous decision
# scores, matching the AUC that results() prints. make_scorer(roc_auc_score)
# would compute the AUC of the thresholded predict() labels instead.
lr_grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=3,
                              scoring="roc_auc", verbose=5
                             )
lr_grid_search.fit(X_train, Y_train.astype(int))

In [None]:
# Show the mean cross-validated score of every candidate, then the winner.
cvres = lr_grid_search.cv_results_
scored_candidates = zip(cvres["mean_test_score"], cvres["params"])
for mean_score, params in scored_candidates:
    print(mean_score, params)
print("Best: " + str(lr_grid_search.best_params_))

In [None]:
results(lr_grid_search)

In [None]:
lr = LogisticRegression(C=1, class_weight={1:1, 0:2})
lr.fit(X_train, Y_train.astype(int))
results(lr)

In [None]:
for key, val in enumerate(lr.coef_[0]):
    print(key, val)

In [None]:
feature_importance_rank(lr.coef_[0])

In [None]:
sorted(enumerate(lr.coef_[0]), key=lambda x:x[1], reverse=True)

## Random Forest

In [None]:
# NOTE: X_train_res / Y_train_res are created by the resampling cells (SMOTE,
# Undersampling, Oversampling) at the end of this notebook -- run one of them
# before this cell.
param_grid = [{'max_features':[175,200], 'n_estimators':[10,15,20], 'min_samples_leaf':[150,175,200]}]
# scoring="roc_auc" evaluates each candidate on predicted probabilities,
# matching the AUC that results() prints. make_scorer(roc_auc_score) would
# compute the AUC of the thresholded predict() labels instead.
rf_grid_search = GridSearchCV(RandomForestClassifier(),
            param_grid, cv=3, scoring="roc_auc",
            verbose=5
)
rf_grid_search.fit(X_train_res, Y_train_res)

In [None]:
# Show the mean cross-validated score of every candidate, then the winner.
cvres = rf_grid_search.cv_results_
scored_candidates = zip(cvres["mean_test_score"], cvres["params"])
for mean_score, params in scored_candidates:
    print(mean_score, params)
print("Best: " + str(rf_grid_search.best_params_))

In [None]:
Counter(Y_train)

In [None]:
# Fit a random forest on the resampled training data and report test metrics.
# NOTE: X_train_res / Y_train_res are created by the resampling cells (SMOTE,
# Undersampling, Oversampling) at the end of this notebook -- run one of them
# before this cell.
# NOTE(review): class_weight is applied on top of whatever class balance the
# chosen resampling cell produced; confirm that combination is intended.
rf_clf = RandomForestClassifier(n_estimators=200, min_samples_leaf=150, max_features=200, random_state=42, 
                               class_weight={1:1, 0:3})
rf_clf.fit(X_train_res, Y_train_res)
results(rf_clf)

In [None]:
feature_importance_rank(rf_clf.feature_importances_)

In [None]:
sorted(enumerate(rf_clf.feature_importances_), key=lambda x:x[1], reverse=True)

In [None]:
results(rf_grid_search, X_test, Y_test)

## Linear SVM

In [None]:
from sklearn.svm import LinearSVC
svm_clf = LinearSVC(C=.01)
svm_clf.fit(X_train, Y_train)

In [None]:
results(svm_clf, X_test, Y_test)

In [None]:
feature_importance_rank(svm_clf.coef_[0], "white")

In [None]:
sorted(enumerate(svm_clf.coef_[0]), key=lambda x:x[1], reverse=True)

# Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.mixture import GaussianMixture
clf1 = RandomForestClassifier(n_estimators=20, min_samples_leaf=300, max_features=225, class_weight={1:1, 0:5})
clf2 = LogisticRegression(C=1)
clf3 = GaussianNB()
clf4 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, 
                           algorithm="SAMME.R", learning_rate=1)
#clf4 = GaussianMixture()

eclf = VotingClassifier(estimators=[
    ('rf', clf1), ('lr', clf2), ('nb', clf3), ('ab', clf4)
], voting='soft')
eclf.fit(X_train, Y_train)

In [None]:
results(eclf)

# Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
# Bagged decision trees: 50 trees, each fitted on a bootstrap sample of 3000 rows.
# NOTE: X_train_res / Y_train_res are created by the resampling cells (SMOTE,
# Undersampling, Oversampling) at the end of this notebook -- run one of them
# before this cell.
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, max_samples=3000,
                           bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train_res, Y_train_res)

In [None]:
results(bag_clf)

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier
bbc = BalancedBaggingClassifier(DecisionTreeClassifier(), n_estimators=200, max_samples=3000,
                           bootstrap=True, n_jobs=-1)
bbc.fit(X_train, Y_train)

In [None]:
results(bbc)

# AdaBoost

In [None]:
param_grid = [{"n_estimators":[100, 115, 130, 145], "learning_rate":[.6, .7, .8, .9, 1]}]
# scoring="roc_auc" evaluates each candidate on its continuous scores,
# matching the AUC that results() prints. make_scorer(roc_auc_score) would
# compute the AUC of the thresholded predict() labels instead.
ab_grid_search = GridSearchCV(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME.R"),
            param_grid, cv=3, scoring="roc_auc",
            verbose=5
)
ab_grid_search.fit(X_train, Y_train)

In [None]:
# Show the mean cross-validated score of every candidate, then the winner.
cvres = ab_grid_search.cv_results_
scored_candidates = zip(cvres["mean_test_score"], cvres["params"])
for mean_score, params in scored_candidates:
    print(mean_score, params)
print("Best: " + str(ab_grid_search.best_params_))

In [None]:
results(ab_grid_search)

In [None]:
ab_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, 
                           algorithm="SAMME.R", learning_rate=1)
ab_clf.fit(X_train, Y_train)

In [None]:
results(ab_clf)

In [None]:
feature_importance_rank(ab_clf.feature_importances_)

## Gradient Boosted Classifier

In [24]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 400 stages, fitted on the resampled training data.
# NOTE: X_train_res / Y_train_res are created by the resampling cells (SMOTE,
# Undersampling, Oversampling) at the end of this notebook -- run one of them
# before this cell. The execution counts suggest the Undersampling cell was
# the one used for the output shown below.
gb_clf = GradientBoostingClassifier(n_estimators=400)
gb_clf.fit(X_train_res, Y_train_res)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=400,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

In [25]:
results(gb_clf)

Test accuracy score: 0.7822730902315678
ROC: 0.8611901843439805
PPV: 0.943921568627451
NPV: 0.4407622203811102
Sensitivity: 0.780986372485399
Specificity: 0.7881481481481482
G-Mean: 0.7845590884078184
Confusion matrix:
[[ 532  143]
 [ 675 2407]]


# SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
# Resample the training set with SMOTE; the dict maps class label -> target
# number of samples after resampling.
# NOTE(review): the counts are hard-coded -- presumably 27731 is the class-1
# count of Y_train for one specific `race` setting. Verify with
# Counter(Y_train) before running this on another data set.
sm = SMOTE(ratio={0: 20000, 1:27731},random_state=42)
X_train_res, Y_train_res = sm.fit_sample(X_train,Y_train)

In [None]:
Counter(Y_train_res)

In [None]:
Counter(Y_train)

# Undersampling

In [14]:
from imblearn.under_sampling import RandomUnderSampler
# Down-sample so that both classes end up with as many rows as class 0 has
# in the training set (class 0 is the smaller class in the run shown below).
n_class0 = Counter(Y_train)[0]
# random_state pins which rows are kept, so every model trained on
# X_train_res sees the same data after a kernel restart.
# NOTE(review): newer imbalanced-learn releases rename `ratio` to
# `sampling_strategy` and `fit_sample` to `fit_resample` -- confirm against
# the installed version before upgrading.
us = RandomUnderSampler(ratio={0: n_class0, 1: n_class0}, random_state=42)
X_train_res, Y_train_res = us.fit_sample(X_train, Y_train)

In [15]:
Counter(Y_train_res)

Counter({0: 6087, 1: 6087})

# Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler
# Randomly duplicate rows until the classes reach the given sizes; the dict
# maps class label -> target number of samples.
# The sampler is named `ros` rather than `os` so it cannot shadow the
# standard-library module, and random_state makes the draw reproducible.
# NOTE(review): the counts are hard-coded -- presumably 27731 is the class-1
# count of Y_train for one specific `race` setting; verify with
# Counter(Y_train) before running this on another data set.
ros = RandomOverSampler(ratio={0:20000, 1:27731}, random_state=42)
X_train_res, Y_train_res = ros.fit_sample(X_train, Y_train)