In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from collections import Counter

%matplotlib inline

  from ._conv import register_converters as _register_converters


# Imports

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
# Select the cohort to train on: leave exactly one assignment uncommented.
# `race` is used below to build the data directory path and to choose the
# feature-code table in feature_importance_rank.
#race = "hispanic"
#race = "white"
race = "mixed"

In [5]:
# Load the feature matrix (X) and the Y2 target array for the selected cohort.
X = np.load('../Data/' + race + '/X.npy')
Y2 = np.load('../Data/' + race + '/Y2.npy')

# Ethnicity labels are only loaded for the mixed cohort; for any other value of
# `race` the name `ethnicity_labels` is left undefined.
# presumably one label per row of X (it is column-stacked onto X below) — confirm
if race == 'mixed':
    ethnicity_labels = np.load('../Data/mixed/ethnicity_labels.npy')

In [6]:
Counter(ethnicity_labels)

Counter({0: 275547, 1: 37575})

In [8]:
# Append the ethnicity label as the last feature column.
# The labels are only loaded when race == 'mixed', so the append is guarded the
# same way; otherwise this cell raises NameError for the other cohorts.
# NOTE: re-running this cell appends the column again — re-run the load cell first.
if race == 'mixed':
    X = np.column_stack((X, ethnicity_labels))

In [9]:
X.shape

(313122, 373)

In [10]:
def split_train_test(X, Y, test_fraction=0.1, seed=42):
    """Shuffle X and Y with the same permutation and hold out the tail as a test set.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix, one sample per row.
    Y : np.ndarray
        Targets aligned with the rows of X.
    test_fraction : float, optional
        Fraction of the rows placed in the test set (rounded down). Default 0.1.
    seed : int, optional
        Seed passed to ``np.random.seed`` before shuffling. Default 42.

    Returns
    -------
    X_train, X_test, Y_train, Y_test
        The shuffled split; both Y arrays are cast to int.

    Raises
    ------
    ValueError
        If X and Y have different lengths or test_fraction is outside [0, 1].
    """
    if len(X) != len(Y):
        raise ValueError("X and Y must have the same number of rows "
                         "(got %d and %d)" % (len(X), len(Y)))
    if not 0 <= test_fraction <= 1:
        raise ValueError("test_fraction must be in [0, 1], got %r" % (test_fraction,))

    # Seeding the global generator is kept on purpose: code that runs after this
    # call and draws from np.random sees the same state as before this change.
    np.random.seed(seed)
    idx = np.random.permutation(len(X))
    X = X[idx]
    Y = Y[idx]

    # Slice on an explicit split index. Slicing with a negative test size breaks
    # when the size rounds down to 0: X[:-0] is empty and X[-0:] is the whole
    # array, which would swap the train and test sets.
    n_test = int(test_fraction * len(Y))
    n_train = len(Y) - n_test
    X_train, X_test = X[:n_train], X[n_train:]
    Y_train, Y_test = Y[:n_train].astype(int), Y[n_train:].astype(int)
    return X_train, X_test, Y_train, Y_test
X_train, X_test, Y_train, Y_test = split_train_test(X, Y2)

In [11]:
X_train.shape

(281810, 373)

# Feature Scaling
Fit the scalers on the training data only, then apply the fitted scalers to both the training and the test data, so that no test-set statistics leak into training.

In [12]:
# feature scaling: scale features based on training data only
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def feature_scale(X_train, X_test):
    """Scale both matrices in place using statistics fitted on X_train only.

    All columns except the last four are min-max scaled to [-1, 1]; the last
    four columns are standardised. Each scaler is fitted on X_train and then
    applied to X_test. Both arrays are modified in place and returned.
    """
    # NOTE(review): an ethnicity column is appended to X before this is called,
    # so "last four" then includes that column — confirm this is intended.
    head = (slice(None), slice(None, -4))
    tail = (slice(None), slice(-4, None))

    # Min-max scaling for every column before the trailing four.
    range_scaler = MinMaxScaler(feature_range=(-1,1))
    range_scaler.fit(X_train[head])
    X_train[head] = range_scaler.transform(X_train[head])
    X_test[head] = range_scaler.transform(X_test[head])

    # Standardisation for the trailing four columns.
    z_scaler = StandardScaler()
    z_scaler.fit(X_train[tail])
    X_train[tail] = z_scaler.transform(X_train[tail])
    X_test[tail] = z_scaler.transform(X_test[tail])

    return X_train, X_test

X_train, X_test = feature_scale(X_train, X_test)



# Results Function

In [13]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def results(classifier, X_test=X_test, Y_test=Y_test):
    """Print hold-out metrics for a fitted binary (0/1) classifier.

    Prints accuracy, ROC AUC, PPV, NPV, sensitivity, specificity, G-mean and
    the confusion matrix. Nothing is returned.

    Parameters
    ----------
    classifier : fitted estimator
        Must implement ``predict`` and either ``predict_proba`` or
        ``decision_function``.
    X_test : array-like, optional
        Features to evaluate on. Defaults to the notebook-level ``X_test``.
    Y_test : array-like, optional
        True 0/1 labels for ``X_test``. Defaults to the notebook-level ``Y_test``.

    Notes
    -----
    Both defaults are captured when this cell runs, so re-run it after
    re-creating the train/test split. A ratio with a zero denominator (for
    example PPV when nothing is predicted positive) is printed as nan.
    """
    y_true = np.asarray(Y_test).astype(int)
    Y_pred_test = classifier.predict(X_test)

    # ROC AUC needs a continuous score. Use the positive-class probability when
    # the model exposes it, otherwise fall back to the decision function
    # (e.g. LinearSVC has no predict_proba).
    if hasattr(classifier, "predict_proba"):
        y_score = classifier.predict_proba(X_test)[:, 1]
    else:
        y_score = classifier.decision_function(X_test)

    print("Test accuracy score: " + str(accuracy_score(y_true, Y_pred_test)))
    print("ROC: " + str(roc_auc_score(y_true, y_score)))

    # Fix the label order so the matrix is always 2x2, even if one class is
    # missing from both the labels and the predictions.
    matrix = confusion_matrix(y_true, Y_pred_test, labels=[0, 1])
    tn, fp, fn, tp = matrix.ravel()
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    g_mean = np.sqrt(sensitivity*specificity)
    print("PPV: " + str(ppv))
    print("NPV: " + str(npv))
    print("Sensitivity: " + str(sensitivity))
    print("Specificity: " + str(specificity))
    print("G-Mean: " + str(g_mean))
    print("Confusion matrix:\n" + str(matrix))
    

In [19]:
import pickle
# Load the feature-code table of every cohort; feature_importance_rank picks
# the one matching `race`.
# NOTE: pickle.load can execute arbitrary code — only load trusted files here.
with open('ethnicity_codes/white_codes.pkl', 'rb') as f_w:
    white_codes = pickle.load(f_w)
with open('ethnicity_codes/hispanic_codes.pkl', 'rb') as f_h:
    hispanic_codes = pickle.load(f_h)
with open('ethnicity_codes/mixed_codes.pkl', 'rb')  as f_m:
    mixed_codes = pickle.load(f_m)

In [20]:
def feature_importance_rank(array, ethnicity_label=None):
    """Print the features ranked by aggregated absolute importance.

    Parameters
    ----------
    array : array-like
        One weight per model input column (e.g. ``coef_[0]`` or
        ``feature_importances_``). Absolute values are used.
    ethnicity_label : bool or None, optional
        True when the last column is the appended ethnicity label ('race').
        None (default) infers it from the notebook-level ``race``: the label
        column is only appended for the "mixed" cohort.

    Raises
    ------
    ValueError
        If the notebook-level ``race`` is not "white", "hispanic" or "mixed".

    Notes
    -----
    The code table of the current cohort maps keys to value lists; ``key[1]``
    is read as the variable name, ``key[2]`` as the first column index, and
    ``len(value)`` as the number of columns summed for that variable.
    """
    if race == "white":
        codes = white_codes
    elif race == "hispanic":
        codes = hispanic_codes
    elif race == "mixed":
        codes = mixed_codes
    else:
        # Previously this fell through with codes=None and failed later with an
        # opaque AttributeError on None.items().
        raise ValueError("unknown race %r; expected 'white', 'hispanic' or 'mixed'" % (race,))

    if ethnicity_label is None:
        ethnicity_label = (race == "mixed")

    array = np.abs(array)

    # Trailing single-column features, listed in column order. They are
    # inserted last-column-first so ties keep the same order as before
    # (the sort below is stable).
    trailing = ['age_dx', 'eod10_pn', 'maligcount', 'tumsiz']
    if ethnicity_label:
        trailing.append('race')

    ranking = {}
    for offset, name in enumerate(reversed(trailing), 1):
        ranking[name] = array[-offset]

    # Multi-column variables: sum the weights over each variable's column span.
    for key, val in codes.items():
        varname = key[1]
        start_idx = key[2]
        end_idx = start_idx + len(val)
        ranking[varname] = np.sum(array[start_idx:end_idx])

    ordered = sorted(ranking.items(), key=lambda pair: pair[1], reverse=True)
    for rank, pair in enumerate(ordered, 1):
        print(rank, pair)
    

## Decision Tree

In [None]:

# Notes kept from earlier tuning (presumably values tried previously — confirm):
# min_sample_split: 300,400
# min_samples_leaf: 200
# max_depth: 130
# min_weight_fraction_leaf: .01
param_grid = [{'max_depth':[40,50,60], 'min_samples_leaf':[250,260,270,280,290]}]
tree_clf_reg = DecisionTreeClassifier()
# Use the built-in 'roc_auc' scorer: it ranks candidates on continuous scores,
# whereas make_scorer(roc_auc_score) computes the AUC on hard 0/1 predictions.
dt_grid_search = GridSearchCV(tree_clf_reg, param_grid, cv=3, scoring='roc_auc', verbose=3)
dt_grid_search.fit(X_train, Y_train.astype(int))

In [None]:
cvres = dt_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(dt_grid_search.best_params_))

In [None]:
results(dt_grid_search, X_test, Y_test)

In [None]:
tree_clf_h = DecisionTreeClassifier(max_features=200, min_samples_leaf=150)
tree_clf_h.fit(X_train, Y_train)


In [None]:
results(tree_clf_h, X_test, Y_test)

In [None]:
feature_importance_rank(tree_clf_h.feature_importances_)

In [None]:
sorted(enumerate(tree_clf_h.feature_importances_), key=lambda x:x[1], reverse=True)

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, Y_train.astype(int))

In [None]:
results(knn_clf)

## Logistic Regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# Search over the inverse regularisation strength C.
param_grid = [{'C':[.01, .1, .75, 1, 1.5, 2]}]
# Use the built-in 'roc_auc' scorer: it ranks candidates on continuous scores,
# whereas make_scorer(roc_auc_score) computes the AUC on hard 0/1 predictions.
lr_grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=3,
                              scoring='roc_auc', verbose=5
                             )
lr_grid_search.fit(X_train, Y_train.astype(int))

In [None]:
cvres = lr_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(lr_grid_search.best_params_))

In [None]:
results(lr_grid_search)

In [21]:
# class_weight={1:1, 0:2}
lr = LogisticRegression(C=1)
lr.fit(X_train, Y_train.astype(int))
#lr.fit(X_train, Y_train.astype(int))
results(lr)

Test accuracy score: 0.855103474706183
ROC: 0.8718733215434405
PPV: 0.8837131616064138
NPV: 0.6787185354691075
Sensitivity: 0.9443144409629953
Specificity: 0.48630923102147894
G-Mean: 0.6776642454986038
Confusion matrix:
[[ 2966  3133]
 [ 1404 23809]]


In [22]:
feature_importance_rank(lr.coef_[0], True)

1 ('histo3v', 81.22515203470718)
2 ('cslymphn', 19.747983364643506)
3 ('csexten', 17.36882872274285)
4 ('surgprif', 5.902625863164482)
5 ('csmetsdx', 5.864142874717367)
6 ('reg', 3.5583889647114817)
7 ('age_dx', 1.983152159998009)
8 ('primsite', 1.4472213224181907)
9 ('no_surg', 1.106293304229804)
10 ('summ2k', 0.9045600682571576)
11 ('dx_conf', 0.822995423731435)
12 ('csmteval', 0.7554901126371617)
13 ('csrgeval', 0.6793666993749904)
14 ('grade', 0.5661703113367789)
15 ('mar_stat', 0.4106388661168552)
16 ('cstseval', 0.3981151837811313)
17 ('beho3v', 0.35956860352786124)
18 ('eod10_pn', 0.2788893717979616)
19 ('sex', 0.0907788678222121)
20 ('maligcount', 0.07134211915646624)
21 ('tumsiz', 0.06339024311122199)
22 ('race', 0.006509939863246353)


In [None]:
for key, val in enumerate(lr.coef_[0]):
    print(key, val)

In [None]:
sorted(enumerate(lr.coef_[0]), key=lambda x:x[1], reverse=True)

## Random Forest

In [None]:
# Use the built-in 'roc_auc' scorer: it ranks candidates on continuous scores,
# whereas make_scorer(roc_auc_score) computes the AUC on hard 0/1 predictions.
param_grid = [{'max_features':[175,200], 'n_estimators':[10,15,20], 'min_samples_leaf':[150,175,200]}]
rf_grid_search = GridSearchCV(RandomForestClassifier(),
            param_grid, cv=3, scoring='roc_auc',
            verbose=5
)
# NOTE: X_train_res / Y_train_res are created by the resampling cells further
# down (SMOTE / under- / over-sampling); one of them must be run before this cell.
rf_grid_search.fit(X_train_res, Y_train_res)

In [None]:
cvres = rf_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(rf_grid_search.best_params_))

In [None]:
Counter(Y_train_res)

In [23]:
# class_weight={1:1,0:3}
rf_clf = RandomForestClassifier(n_estimators=20, min_samples_leaf=150, max_features=200, random_state=42)
rf_clf.fit(X_train, Y_train)
results(rf_clf)

Test accuracy score: 0.8534427695452222
ROC: 0.8731099180579104
PPV: 0.8873779113448534
NPV: 0.6609121909633419
Sensitivity: 0.9368976321738786
Specificity: 0.5084440072142974
G-Mean: 0.6901883702671856
Confusion matrix:
[[ 3101  2998]
 [ 1591 23622]]


In [24]:
feature_importance_rank(rf_clf.feature_importances_, True)

1 ('csmetsdx', 0.3724755773627602)
2 ('summ2k', 0.1870926635503274)
3 ('no_surg', 0.10790548042915481)
4 ('age_dx', 0.08759983570483185)
5 ('surgprif', 0.06683252563393596)
6 ('eod10_pn', 0.06653326035483957)
7 ('tumsiz', 0.033703507303795135)
8 ('grade', 0.016875652890442405)
9 ('cslymphn', 0.012439229464724132)
10 ('csexten', 0.009074781383398177)
11 ('primsite', 0.008652183626317292)
12 ('dx_conf', 0.007333646445526323)
13 ('mar_stat', 0.006122074376357596)
14 ('histo3v', 0.0059149450132651585)
15 ('maligcount', 0.003814146359879019)
16 ('sex', 0.0021769235803424)
17 ('reg', 0.0018854777886249514)
18 ('csmteval', 0.0014160279795975826)
19 ('csrgeval', 0.0011862608359948848)
20 ('cstseval', 0.0011392800356132648)
21 ('beho3v', 0.001119489155326443)
22 ('race', 0.00010659330564055625)


In [None]:
sorted(enumerate(rf_clf.feature_importances_), key=lambda x:x[1], reverse=True)

In [None]:
results(rf_grid_search, X_test, Y_test)

## Linear SVM

In [None]:
from sklearn.svm import LinearSVC
svm_clf = LinearSVC(C=.01)
svm_clf.fit(X_train, Y_train)

In [None]:
results(svm_clf)

In [None]:
feature_importance_rank(svm_clf.coef_[0])

In [None]:
sorted(enumerate(svm_clf.coef_[0]), key=lambda x:x[1], reverse=True)

# Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.mixture import GaussianMixture
# Candidate base models for the soft-voting ensemble.
clf1 = RandomForestClassifier(n_estimators=20, min_samples_leaf=300, max_features=225)
clf2 = LogisticRegression(C=1)
# NOTE: clf3 is constructed but not included in the estimators list below.
clf3 = GaussianNB()
clf4 = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=20, 
                           algorithm="SAMME.R", learning_rate=1)

# voting='soft' is passed, so every listed estimator needs predict_proba.
eclf = VotingClassifier(estimators=[
    ('rf', clf1), ('lr', clf2), ('ab', clf4)
], voting='soft')
eclf.fit(X_train, Y_train)

In [None]:
results(eclf)

# Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=50, max_samples=3000,
                           bootstrap=True, n_jobs=-1)
bag_clf.fit(X_train_res, Y_train_res)

In [None]:
results(bag_clf)

In [None]:
from imblearn.ensemble import BalancedBaggingClassifier
bbc = BalancedBaggingClassifier(DecisionTreeClassifier(), n_estimators=200, max_samples=3000,
                           bootstrap=True, n_jobs=-1)
bbc.fit(X_train, Y_train)

In [None]:
results(bbc)

# AdaBoost

In [None]:
# Use the built-in 'roc_auc' scorer: it ranks candidates on continuous scores,
# whereas make_scorer(roc_auc_score) computes the AUC on hard 0/1 predictions.
param_grid = [{"n_estimators":[100, 115, 130, 145], "learning_rate":[.6, .7, .8, .9, 1]}]
ab_grid_search = GridSearchCV(AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), algorithm="SAMME.R"),
            param_grid, cv=3, scoring='roc_auc',
            verbose=5
)
ab_grid_search.fit(X_train, Y_train)

In [None]:
cvres = ab_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
print("Best: " + str(ab_grid_search.best_params_))

In [None]:
results(ab_grid_search)

In [25]:
ab_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=200, 
                           algorithm="SAMME.R", learning_rate=1)
ab_clf.fit(X_train, Y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=1, n_estimators=200, random_state=None)

In [None]:
results(ab_clf)

In [26]:
feature_importance_rank(ab_clf.feature_importances_, True)

1 ('csexten', 0.195)
2 ('age_dx', 0.12)
3 ('histo3v', 0.10500000000000001)
4 ('csmetsdx', 0.095)
5 ('tumsiz', 0.075)
6 ('eod10_pn', 0.075)
7 ('surgprif', 0.065)
8 ('primsite', 0.04)
9 ('summ2k', 0.04)
10 ('reg', 0.039999999999999994)
11 ('grade', 0.025)
12 ('no_surg', 0.025)
13 ('mar_stat', 0.02)
14 ('cslymphn', 0.02)
15 ('csrgeval', 0.02)
16 ('dx_conf', 0.015)
17 ('cstseval', 0.01)
18 ('maligcount', 0.005)
19 ('sex', 0.005)
20 ('beho3v', 0.005)
21 ('csmteval', 0.005)
22 ('race', 0.0)


## Gradient Boosted Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(n_estimators=400)
gb_clf.fit(X_train_res, Y_train_res)

In [None]:
results(gb_clf)

# SMOTE

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(ratio={0: 27723, 1:27723},random_state=42)
X_train_res, Y_train_res = sm.fit_sample(X_train,Y_train)

In [None]:
Counter(Y_train_res)

In [None]:
Counter(Y_train)

# Undersampling

In [None]:
from imblearn.under_sampling import RandomUnderSampler
us = RandomUnderSampler(ratio={0:Counter(Y_train)[0], 1:Counter(Y_train)[0]})
X_train_res, Y_train_res = us.fit_sample(X_train, Y_train)

In [None]:
Counter(Y_train_res)

# Oversampling

In [None]:
from imblearn.over_sampling import RandomOverSampler
os = RandomOverSampler(ratio={0:20000, 1:27731})
X_train_res, Y_train_res = os.fit_sample(X_train, Y_train)

## t-SNE

In [None]:
from sklearn.manifold import TSNE

In [None]:
tsne = TSNE(n_components=2, init='pca', random_state=97, verbose=5)
size = 2000
X_tsne = tsne.fit_transform(X[:size])

In [None]:
Y_exp = Y2[:size]
idx_not_survive = np.where(Y_exp==0)[0]
idx_survive = np.where(Y_exp==1)[0]


plt.xlim((-100,200))
plt.ylim((-50,50))
plt.scatter(X_tsne[idx_not_survive][:,0], X_tsne[idx_not_survive][:,1], color='red')
plt.scatter(X_tsne[idx_survive][-200:,0], X_tsne[idx_survive][-200:,1], color='blue')
plt.show()

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter([1,2,3],[4,5,6])
plt.show()

In [None]:
tsne = TSNE(n_components=3, init='pca', random_state=97, verbose=5)
size = 5000
X_tsne = tsne.fit_transform(X[:size])

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Split the embedded points by outcome (0 = did not survive, 1 = survived).
Y_exp = Y2[:size]
idx_not_survive = np.where(Y_exp==0)[0]
idx_survive = np.where(Y_exp==1)[0]

# `begin` was never defined in this notebook, so the cell failed on a fresh kernel.
# -200 mirrors the 2D plot, which shows only the last 200 survivors.
# TODO confirm the intended sub-sample for the 3D view.
begin = -200

fig = plt.figure()

# Left panel: survivors.
ax1 = fig.add_subplot(221, projection='3d')
ax1.set_title('Survived')
ax1.set_zlim(-3,3)
ax1.set_ylim(-20,20)
ax1.set_xlim(-100,100)

# Right panel: non-survivors, drawn with the same axis limits for comparison.
ax2 = fig.add_subplot(222, projection='3d')
ax2.set_title('Did not survive')
ax2.set_zlim(-3,3)
ax2.set_ylim(-20,20)
ax2.set_xlim(-100,100)

ax2.scatter(xs=X_tsne[idx_not_survive][:,0], ys=X_tsne[idx_not_survive][:,1], zs=X_tsne[idx_not_survive][:,2], color='red')

ax1.scatter(xs=X_tsne[idx_survive][begin:,0], ys=X_tsne[idx_survive][begin:,1], zs=X_tsne[idx_survive][begin:,2],color='blue')
plt.tight_layout()
plt.show()
