In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sklearn
import tensorflow as tf
from collections import Counter

%matplotlib inline

  from ._conv import register_converters as _register_converters


# Imports

In [2]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [3]:
# Load the feature matrix (X.npy) and label vector (Y2.npy) for the hispanic
# cohort (*_h) and the white cohort (*_w). Paths are relative to the notebook's
# working directory.
X_h = np.load('../Data_Standardized/hispanic/X.npy')
Y_h = np.load('../Data_Standardized/hispanic/Y2.npy')
X_w = np.load('../Data_Standardized/white/X.npy')
Y_w = np.load('../Data_Standardized/white/Y2.npy')

In [4]:
# Append a cohort indicator as the last feature column: +1 for hispanic rows,
# -1 for white rows.
# NOTE: X_h / X_w are rebound here, so re-running this cell without reloading
# the data appends yet another column.
X_h = np.concatenate((X_h, np.ones((len(Y_h), 1))), axis=1)
X_w = np.concatenate((X_w, -np.ones((len(Y_w), 1))), axis=1)

In [5]:
# Pool both cohorts into one dataset; hispanic rows come first in both X and Y
# so features and labels stay aligned.
X = np.concatenate((X_h, X_w), axis=0)
Y = np.concatenate((Y_h, Y_w), axis=0)

In [6]:
def split_train_test(X, Y):
    """Shuffle X and Y together and hold out the last 10% of rows as a test set.

    The shuffle uses NumPy's global random generator seeded with 97, so the
    split is the same on every call. Note that this reseeds the global
    generator as a side effect.

    Parameters
    ----------
    X : numpy.ndarray
        Samples along the first axis.
    Y : numpy.ndarray
        Labels aligned with the rows of ``X``.

    Returns
    -------
    X_train, X_test, Y_train, Y_test
        The shuffled split. Labels are cast to ``int``. The test set holds
        ``int(0.1 * len(Y))`` rows, which is zero for fewer than 10 samples.
    """
    # Shuffle both arrays with one permutation so rows and labels stay aligned.
    np.random.seed(97)
    idx = np.random.permutation(len(X))
    X = X[idx]
    Y = Y[idx]

    # Split at an explicit index rather than slicing with a negative size:
    # when the test size rounds down to 0, X[:-0] is empty and X[-0:] is the
    # whole array, which would silently put every sample in the test set.
    test_set_size = int(0.1*len(Y))
    split_at = len(Y) - test_set_size
    X_train, X_test = X[:split_at], X[split_at:]
    Y_train, Y_test = Y[:split_at].astype(int), Y[split_at:].astype(int)
    return X_train, X_test, Y_train, Y_test

In [7]:
X_train, X_test, Y_train, Y_test = split_train_test(X, Y)

# Feature Scaling
Fit the scalers on the training data only, then apply the fitted transforms to both the training and test data.

In [8]:
# feature scaling: scale features based on training data only
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def feature_scale(X_train, X_test):
    """Rescale the last five feature columns using statistics from X_train only.

    Two passes are applied; each scaler is fitted on the training rows and
    then reused unchanged for the test rows:

    1. ``MinMaxScaler`` to the range (-1, 1) on the last five columns.
    2. ``StandardScaler`` on the first four of those five columns, i.e. all
       of them except the very last column.

    Both arrays are modified in place and are also returned.
    """
    tail_cols = slice(-5, None)   # the last five columns
    std_cols = slice(-5, -1)      # the same block without the final column

    # Pass 1: min-max scaling fitted on the training data.
    range_scaler = MinMaxScaler(feature_range=(-1,1))
    range_scaler.fit(X_train[:, tail_cols])
    for block in (X_train, X_test):
        block[:, tail_cols] = range_scaler.transform(block[:, tail_cols])

    # Pass 2: standardization fitted on the (already min-max scaled) training data.
    z_scaler = StandardScaler()
    z_scaler.fit(X_train[:, std_cols])
    for block in (X_train, X_test):
        block[:, std_cols] = z_scaler.transform(block[:, std_cols])

    return X_train, X_test

X_train, X_test = feature_scale(X_train, X_test)



# Results Function

In [9]:
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.metrics import confusion_matrix

def results(classifier, X_eval=None, Y_eval=None):
    """Print hold-out metrics for a fitted binary classifier.

    Prints accuracy, ROC AUC, PPV, NPV, sensitivity, specificity, G-mean and
    the confusion matrix. Nothing is returned.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``.
    X_eval, Y_eval : optional evaluation features and labels. When omitted,
        the notebook-level ``X_test`` / ``Y_test`` are used, so both
        ``results(clf)`` and ``results(clf, X_test, Y_test)`` work.

    Notes
    -----
    The confusion matrix is unpacked as ``tn, fp, fn, tp``, which requires a
    2x2 matrix (both classes present in the labels or predictions).
    """
    # Fall back to the notebook's hold-out split when no evaluation set is given.
    if X_eval is None:
        X_eval = X_test
    if Y_eval is None:
        Y_eval = Y_test

    y_true = np.asarray(Y_eval).astype(int)
    y_pred = classifier.predict(X_eval)

    print("Test accuracy score: " + str(accuracy_score(y_true, y_pred)))
    # NOTE: the AUC is computed from hard label predictions, not from
    # probabilities or decision scores.
    print("ROC: " + str(roc_auc_score(y_true, y_pred)))

    matrix = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = matrix.ravel()
    ppv = tp/(tp+fp)
    npv = tn/(tn+fn)
    sensitivity = tp/(tp+fn)
    specificity = tn/(tn+fp)
    g_mean = np.sqrt(sensitivity*specificity)
    print("PPV: " + str(ppv))
    print("NPV: " + str(npv))
    print("Sensitivity: " + str(sensitivity))
    print("Specificity: " + str(specificity))
    print("G-Mean: " + str(g_mean))
    print("Confusion matrix:\n" + str(matrix))
    

In [11]:
import pickle
# Load the three code tables (white / hispanic / mixed) that
# feature_importance_rank uses to group feature columns by variable.
# SECURITY: pickle.load can execute arbitrary code contained in the file;
# only load .pkl files from a trusted source.
with open('white_codes.pkl', 'rb') as f_w:
    white_codes = pickle.load(f_w)
with open('hispanic_codes.pkl', 'rb') as f_h:
    hispanic_codes = pickle.load(f_h)
with open('mixed_codes.pkl', 'rb') as f_m:
    mixed_codes = pickle.load(f_m)

In [23]:
X_train.shape

(281810, 375)

In [12]:
mixed_codes

{(0, 'mar_stat', 0): array(['1', '2', '3', '4', '5', '6', '9'], dtype='<U1'),
 (1, 'sex', 7): array(['1', '2'], dtype='<U1'),
 (2,
  'primsite',
  8): array(['C180', 'C181', 'C182', 'C183', 'C184', 'C185', 'C186', 'C187',
        'C188', 'C189', 'C199', 'C209', 'C260'], dtype='<U4'),
 (3,
  'histo3v',
  21): array(['8000', '8001', '8002', '8003', '8004', '8010', '8011', '8012',
        '8013', '8015', '8020', '8021', '8022', '8030', '8031', '8032',
        '8033', '8041', '8045', '8046', '8050', '8051', '8052', '8070',
        '8071', '8072', '8073', '8076', '8077', '8081', '8082', '8083',
        '8094', '8123', '8124', '8130', '8140', '8141', '8142', '8143',
        '8144', '8145', '8147', '8160', '8201', '8210', '8211', '8213',
        '8220', '8221', '8230', '8240', '8241', '8243', '8244', '8245',
        '8246', '8249', '8255', '8260', '8261', '8262', '8263', '8310',
        '8320', '8323', '8341', '8380', '8430', '8440', '8441', '8460',
        '8461', '8470', '8471', '8472', '84

In [22]:
X_train.shape

(281810, 375)

In [None]:
def feature_importance_rank(array, race="mixed", codes=None):
    """Aggregate per-column scores into per-variable scores and print them ranked.

    Absolute values of ``array`` are used. The last five entries are reported
    individually (``age_dx``, ``eod10_pn``, ``maligcount``, ``tumsiz``,
    ``race``, in column order). Every entry of the code table contributes the
    sum of the columns ``start : start + number_of_codes``.

    Parameters
    ----------
    array : 1-D array-like
        One score per feature column, e.g. ``feature_importances_`` or
        ``coef_[0]``.
    race : {"white", "hispanic", "mixed"}, default "mixed"
        Selects the notebook-level code table (``white_codes``,
        ``hispanic_codes`` or ``mixed_codes``). Ignored when ``codes`` is given.
    codes : dict, optional
        Code table to use instead of the notebook-level ones. Keys are tuples
        whose item 1 is the variable name and item 2 the first column index;
        values are sequences whose length is the number of columns.

    Raises
    ------
    ValueError
        If ``codes`` is not given and ``race`` is not a known cohort name.

    Prints one ``(name, score)`` tuple per line, highest score first.
    """
    if codes is None:
        # Resolve the table lazily so an explicit `codes` never touches globals.
        if race == "white":
            codes = white_codes
        elif race == "hispanic":
            codes = hispanic_codes
        elif race == "mixed":
            codes = mixed_codes
        else:
            raise ValueError(
                "race must be 'white', 'hispanic' or 'mixed', got %r" % (race,)
            )

    array = np.abs(array)

    # The trailing five columns each stand for a single variable.
    ranking = {}
    ranking['race'] = array[-1]
    ranking['tumsiz'] = array[-2]
    ranking['maligcount'] = array[-3]
    ranking['eod10_pn'] = array[-4]
    ranking['age_dx'] = array[-5]

    # Coded variables span several columns; sum their scores per variable.
    for key, val in codes.items():
        varname = key[1]
        start_idx = key[2]
        end_idx = start_idx + len(val)
        ranking[varname] = np.sum(array[start_idx:end_idx])

    for pair in sorted(ranking.items(), key=lambda x: x[1], reverse=True):
        print(pair)
    

## Decision Tree

In [None]:

# Notes on individual hyperparameter values (presumably from earlier manual
# tuning — TODO confirm):
# min_sample_split: 300,400
# min_samples_leaf: 200
# max_depth: 130
# min_weight_fraction_leaf: .01
# 3-fold grid search over max_depth x min_samples_leaf (15 combinations).
# NOTE(review): make_scorer(roc_auc_score) is given no score/probability
# options here, so the AUC is presumably computed from hard predictions —
# confirm this is intended.
param_grid = [{'max_depth':[40,50,60], 'min_samples_leaf':[250,260,270,280,290]}]
tree_clf_reg = DecisionTreeClassifier()
dt_grid_search = GridSearchCV(tree_clf_reg, param_grid, cv=3, scoring=make_scorer(roc_auc_score), verbose=3)
dt_grid_search.fit(X_train, Y_train.astype(int))

In [None]:
# Print the mean cross-validated score of every grid point, then the winner.
cvres = dt_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
best_dt_params = str(dt_grid_search.best_params_)
print("Best: " + best_dt_params)

In [None]:
results(dt_grid_search, X_test, Y_test)

In [None]:
# Single decision tree with fixed hyperparameters.
# max_features=200 makes every split consider a random subset of the features,
# so random_state is pinned (same seed the notebook uses elsewhere) to make the
# fitted tree and its reported metrics reproducible.
tree_clf_h = DecisionTreeClassifier(max_features=200, min_samples_leaf=150, random_state=97)
tree_clf_h.fit(X_train, Y_train)


In [None]:
results(tree_clf_h, X_test, Y_test)

In [None]:
feature_importance_rank(tree_clf_h.feature_importances_)

In [None]:
sorted(enumerate(tree_clf_h.feature_importances_), key=lambda x:x[1], reverse=True)

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, Y_train.astype(int))

In [None]:
results(knn_clf)

## Logistic Regression

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
# 3-fold grid search over the logistic-regression parameter C.
# NOTE(review): make_scorer(roc_auc_score) is given no score/probability
# options here, so the AUC is presumably computed from hard predictions —
# confirm this is intended.
param_grid = [{'C':[.01, .1, .75, 1, 1.5, 2]}]
lr_grid_search = GridSearchCV(LogisticRegression(), param_grid, cv=3, 
                              scoring=make_scorer(roc_auc_score), verbose=5
                             )
lr_grid_search.fit(X_train, Y_train.astype(int))

In [None]:
# Print the mean cross-validated score of every grid point, then the winner.
cvres = lr_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
best_lr_params = str(lr_grid_search.best_params_)
print("Best: " + best_lr_params)

In [None]:
results(lr_grid_search)

In [15]:
lr = LogisticRegression(C=1)
lr.fit(X_train, Y_train.astype(int))
results(lr)

Test accuracy score: 0.854943791517629
ROC: 0.708248205947369
PPV: 0.8820798172237166
NPV: 0.678562874251497
Sensitivity: 0.9469124569801021
Specificity: 0.46958395491463617
G-Mean: 0.6668244870328711
Confusion matrix:
[[ 2833  3200]
 [ 1342 23937]]


In [None]:
feature_importance_rank(lr.coef_[0])

In [19]:
for key, val in enumerate(lr.coef_[0]):
    print(key, val)

0 -0.05921002491757239
1 0.4400121407717491
2 -0.07727917526832771
3 -0.15110454729288086
4 -0.17343587167325278
5 -0.09212470587922382
6 -0.045966895005032544
7 0.0402837571667663
8 -0.2243146398519068
9 0.13672838592436806
10 0.015549776076402712
11 0.006784764739845687
12 0.8097705622692519
13 -0.024456217728319515
14 0.3105801391656702
15 -0.011756896543885354
16 0.0816764410337821
17 0.07137002598472987
18 -0.08287803282807099
19 0.2811580677109233
20 0.28662706188327924
21 0.3390664646919436
22 -0.7512164474149851
23 -0.07464683877085512
24 0.5063027696766356
25 0.1399605441615723
26 0.22672983031639116
27 -0.5228619406344504
28 0.18552806125480403
29 -0.16594785991769603
30 -1.2317409903844865
31 0.6648383594381505
32 -0.9068503596322302
33 -1.020417016036565
34 0.15859071454112258
35 0.09677475711534174
36 -0.43636014204040163
37 -0.5971576404394431
38 -1.0179615783970868
39 -0.6035996563165054
40 -0.2064402823023212
41 -0.9515392760563707
42 -0.8385108472339524
43 0.3388338834

In [16]:
sorted(enumerate(lr.coef_[0]), key=lambda x:x[1], reverse=True)

[(171, 5.545576197584324),
 (90, 1.855060585403153),
 (143, 1.7670062740741215),
 (73, 1.6445307470542287),
 (44, 1.6130061192163132),
 (181, 1.4708550919748555),
 (268, 1.2375719767256415),
 (180, 1.0394224310844746),
 (155, 0.9456280624615608),
 (150, 0.9426809174808319),
 (160, 0.9391945761568811),
 (92, 0.9279210480546587),
 (47, 0.861083455865009),
 (130, 0.8332949358632442),
 (12, 0.8097705622692519),
 (312, 0.7896122712896763),
 (339, 0.779023736638092),
 (66, 0.7417908660869947),
 (139, 0.7097964585120132),
 (185, 0.7029549472039432),
 (133, 0.6666871453969091),
 (31, 0.6648383594381505),
 (182, 0.6611459963763591),
 (119, 0.6562851260102018),
 (54, 0.6473653329616427),
 (216, 0.6452373396777009),
 (127, 0.6173331419357722),
 (60, 0.6053592214565553),
 (267, 0.6010317079485119),
 (96, 0.593357816461438),
 (89, 0.5858386102665574),
 (161, 0.5822424045709917),
 (347, 0.5822424045709917),
 (320, 0.5813489932444474),
 (131, 0.578935903088852),
 (157, 0.575308969008762),
 (53, 0.575

## Random Forest

In [None]:
# 3-fold grid search over per-split feature count, forest size and leaf size.
# random_state is pinned (same seed as the single-forest cell below) because a
# random forest is stochastic: without it, repeated runs give different CV
# scores and possibly a different "best" parameter set.
param_grid = [{'max_features':[175,200], 'n_estimators':[10,15,20], 'min_samples_leaf':[150,175,200]}]
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=97), 
            param_grid, cv=3, scoring=make_scorer(roc_auc_score), 
            verbose=5
)
rf_grid_search.fit(X_train, Y_train)

In [None]:
# Print the mean cross-validated score of every grid point, then the winner.
cvres = rf_grid_search.cv_results_
for mean_score, params in zip(cvres["mean_test_score"], cvres["params"]):
    print(mean_score, params)
best_rf_params = str(rf_grid_search.best_params_)
print("Best: " + best_rf_params)

In [None]:
rf_clf = RandomForestClassifier(n_estimators=20, min_samples_leaf=300, max_features=225, random_state=97)
rf_clf.fit(X_train, Y_train)
results(rf_clf)

In [None]:
feature_importance_rank(rf_clf.feature_importances_)

In [None]:
sorted(enumerate(rf_clf.feature_importances_), key=lambda x:x[1], reverse=True)

In [None]:
results(rf_grid_search, X_test, Y_test)

## Linear SVM

In [None]:
from sklearn.svm import LinearSVC
svm_clf = LinearSVC(C=.01)
svm_clf.fit(X_train, Y_train)

In [None]:
results(svm_clf, X_test, Y_test)

In [None]:
feature_importance_rank(svm_clf.coef_[0], "white")

In [None]:
sorted(enumerate(svm_clf.coef_[0]), key=lambda x:x[1], reverse=True)

# Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.mixture import GaussianMixture
# Soft-voting ensemble: averages the class probabilities of the member classifiers.
clf1 = RandomForestClassifier(n_estimators=20, min_samples_leaf=300, max_features=225)
clf2 = LogisticRegression(C=1)
clf3 = GaussianNB()
# GaussianMixture is an unsupervised mixture model, not a classifier: it ignores
# the labels and its predict_proba returns component memberships (one column
# with the default single component), which cannot be averaged with the class
# probabilities of the other members. It is therefore kept out of the vote;
# the object is still created in case later cells refer to clf4.
clf4 = GaussianMixture()

eclf = VotingClassifier(estimators=[
    ('rf', clf1), ('lr', clf2), ('nb', clf3)
], voting='soft')
eclf.fit(X_train, Y_train)

In [None]:
eclf.predict(X_test)

In [None]:
results(eclf)

In [None]:
gm_clf = GaussianMixture().fit(X_train, Y_train)

In [None]:
gm_clf.predict(X_test)