In [1]:
# Working directory set
import sys
sys.path.append("../scripts/")

In [2]:
import pandas as pd
import numpy as np
import json
import seaborn as sns

from clean_data import load_and_clean_data

pd.set_option('display.max_rows', 400)

In [3]:
data = load_and_clean_data()

  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [4]:
data.shape

(222647, 298)

In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

X = data.drop('target', axis = 1)
y = data['target']

# 70 20 10 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .1, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = .25, random_state = 42)

def run_baseline_dt(X_train, y_train, X_val, y_val):
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    y_pred = dt.predict_proba(X_val)[:,1]
    
    fpr, tpr, thresholds = roc_curve(y_val, y_pred)
    roc_auc = auc(fpr, tpr)
    
    importances = {}
    for name, importance in zip(X_val.columns, dt.feature_importances_):
        importances[name] = importance
        
    imp = pd.DataFrame.from_dict(importances, orient = 'index', columns = ['importance'])
    imp.sort_values(by = 'importance', ascending = False, inplace = True)
    
    return fpr, tpr, roc_auc, imp

In [6]:
fpr, tpr, roc_auc, imp = run_baseline_dt(data)

In [7]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(150286, 297)
(50096, 297)
(22265, 297)


# Feature Selection

## Drop 0 feature importance features

In [8]:
#drop columns that have 0 feature importance
imp.reset_index(inplace = True)

In [9]:
imp.drop(np.where(imp.importance == 0)[0], inplace = True)

In [10]:
cols_to_keep = np.array(imp['index'])

In [11]:
len(cols_to_keep)

263

In [12]:
X2_train = X_train[cols_to_keep]

In [13]:
X2_train.shape

(150286, 263)

## Recursive Feature Elimination (RFE)

In [14]:
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier

In [15]:
estimator = DecisionTreeClassifier()
num_features = 125
step = 5
rfe_dt = RFE(estimator, n_features_to_select = num_features, step = step).fit(X2_train, y_train)

In [16]:
dt_feat = X2_train.columns[rfe_dt.support_]
dt_feat

Index(['OTHBLEED', 'DOPTODIS', 'OTHSYSEP', 'MORBPROB', 'OPTIME', 'PRHCT',
       'bmi', 'PRPLATE', 'MORTPROB', 'PRWBC',
       ...
       'COL_MALIGNANCYN_6', 'COL_NODESEVAL_9', 'TRANST_1', 'FNSTATUS2_1',
       'DISCHDEST_6', 'DYSPNEA_2', 'COL_NODESEVAL_20', 'RACE_NEW_1',
       'COL_MALIGNANCYN_7', 'COL_MALIGNANCYT_5'],
      dtype='object', length=125)

In [17]:
estimator = XGBClassifier()
rfe_xgb = RFE(estimator, n_features_to_select = num_features, step = step).fit(X2_train, y_train)





















































































































In [18]:
xgb_feat = X2_train.columns[rfe_xgb.support_]

#see how many features these two have in common by using set
all_feat_rfe = set().union(dt_feat, xgb_feat)
print(f"There are {len(all_feat_rfe)} features in the union of the 2 RFEs")

There are 162 features in the union of the 2 RFEs


In [19]:
rfe_feat = list(all_feat_rfe)
rfe_feat

['TOTHLOS',
 'COL_MALIGNANCYM_2',
 'COL_NODESEVAL_13',
 'DISCHDEST_8',
 'COL_MALIGNANCYM_1',
 'TRANSFUS',
 'COL_NODESEVAL_18',
 'ASACLAS_2',
 'COL_NODESEVAL_9',
 'COL_MECH_BOWEL_PREP_2',
 'SUPINFEC',
 'COL_MALIGNANCYT_3',
 'BLEEDIS_2',
 'COL_MALIGNANCYM_4',
 'COL_NODESEVAL_15',
 'DOPTODIS',
 'COL_NODESEVAL_5',
 'RACE_NEW_1',
 'BLEEDIS_1',
 'OPRENAFL',
 'PRNCPTX_3',
 'COL_MECH_BOWEL_PREP_1',
 'PRPLATE',
 'COL_MALIGNANCYM_5',
 'COL_NODESEVAL_16',
 'COL_NODESEVAL_20',
 'OTHCDIFF_1',
 'COL_NODESEVAL_17',
 'COL_ANASTOMOTIC_2',
 'COL_NODESEVAL_8',
 'COL_ANASTOMOTIC_3',
 'OTHCDIFF_2',
 'COL_ORAL_ANTIBIOTIC_1',
 'SSSIPATOS',
 'COL_MALIGNANCYN_4',
 'COL_NODESEVAL_36',
 'HXCOPD',
 'ETHNICITY_HISPANIC_1',
 'num_concurr_procs',
 'COL_ORAL_ANTIBIOTIC_2',
 'PULEMBOL',
 'COL_ANASTOMOTIC_5',
 'PRSODM',
 'PRNCPTX_2',
 'ASACLAS_3',
 'COL_NODESEVAL_10',
 'COL_NODESEVAL_26',
 'COL_MALIGNANCYN_2',
 'ANESTHES_5',
 'REINTUB',
 'COL_NODESEVAL_2',
 'ADMQTR',
 'COL_NODESEVAL_69',
 'URNINFEC',
 'female',
 'diabe

In [20]:
X3_train = X2_train[rfe_feat]
X3_train.shape

(150286, 162)

In [21]:
X2_val = X_val[cols_to_keep]
X3_val = X2_val[rfe_feat]
X3_val.shape

(50096, 162)

In [22]:
dt = DecisionTreeClassifier()
dt.fit(X3_train, y_train)
y_pred = dt.predict_proba(X3_val)[:,1]

fpr, tpr, thresholds = roc_curve(y_val, y_pred)
roc_auc = auc(fpr, tpr)
roc_auc

0.7119848303217398

## Select K Best

In [44]:
from sklearn.feature_selection import SelectKBest, f_classif
from collections import defaultdict

In [None]:
k_best = defaultdict(list)
k_vals = [125, 100, 90, 75, 65, 50]
cols = X3_train.columns
for k in k_vals:
    train = SelectKBest(chi2, k = k).fit(X3_train, y_train)
    mask = train.get_support()
    columns = cols[mask]
    val = X3_val[columns]
    run_baseline_dt(train, y_train, val, y_val)

In [38]:
train = SelectKBest(f_classif, k = 50).fit(X3_train, y_train)

In [40]:
train.get_support()

array([False, False, False,  True, False, False, False,  True, False,
       False,  True, False, False, False, False, False, False, False,
       False,  True,  True, False, False, False, False, False,  True,
       False,  True, False,  True,  True, False, False, False, False,
       False, False, False, False,  True,  True, False,  True,  True,
       False, False, False, False,  True, False, False, False,  True,
       False,  True, False, False, False,  True, False,  True, False,
       False, False,  True, False, False,  True, False, False,  True,
        True, False, False, False, False,  True, False, False,  True,
       False, False, False,  True, False,  True, False, False, False,
       False, False,  True,  True,  True,  True,  True, False,  True,
       False, False,  True, False, False,  True, False, False, False,
        True,  True,  True, False,  True, False, False, False, False,
        True, False, False, False,  True,  True, False, False, False,
        True,  True,

In [43]:
X3_train.columns[mask]

Index(['DISCHDEST_8', 'ASACLAS_2', 'SUPINFEC', 'OPRENAFL', 'PRNCPTX_3',
       'OTHCDIFF_1', 'COL_ANASTOMOTIC_2', 'COL_ANASTOMOTIC_3', 'OTHCDIFF_2',
       'PULEMBOL', 'COL_ANASTOMOTIC_5', 'PRNCPTX_2', 'ASACLAS_3', 'REINTUB',
       'URNINFEC', 'diabetes', 'COL_ILEUS_1', 'OTHDVT', 'ORGSPCSSI',
       'ASACLAS_1', 'STEROID', 'SEPSISPATOS', 'RETURNOR', 'ELECTSURG_1', 'CPT',
       'REOPERATION1', 'PRHCT', 'CDMI', 'DEHIS', 'PRNCPTX_5', 'FAILWEAN',
       'OTHSYSEP', 'COL_APPROACH_2', 'COL_CHEMO_1', 'RETORRELATED_2', 'OPTIME',
       'num_other_procs', 'MORBPROB', 'OSSIPATOS', 'CNSCVA',
       'COL_ANASTOMOTIC_1', 'REOPERATION2', 'RENAINSF', 'DISCHDEST_1',
       'DISCANCR', 'OTHBLEED', 'OUPNEUMO', 'WNDINFD', 'OTHSESHOCK', 'insulin'],
      dtype='object')

## Remove Highly Correlated Features

In [96]:
#correlation matrix
corr = X4_train.corr().abs()
#upper triangle
upper_tri = corr.where(np.triu(np.ones(corr.shape),k=1).astype(np.bool))

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  upper_tri = corr.where(np.triu(np.ones(corr.shape),k=1).astype(np.bool))


In [99]:
correlation = .85
to_drop = [c for c in upper_tri.columns if any(upper_tri[c] > correlation)]

In [100]:
to_drop

['TOTHLOS',
 'ADMYR',
 'DYSPNEA_1',
 'COL_MALIGNANCYM_5',
 'PUFYEAR',
 'WOUND_CLOSURE_3',
 'BLEEDIS_2',
 'COL_MALIGNANCYT_9',
 'REOPERATION1']

In [105]:
corr['REOPERATION1'].sort_values(ascending = False)

REOPERATION1             1.000000
RETORRELATED_2           0.973509
RETURNOR                 0.778066
COL_ANASTOMOTIC_3        0.362728
DOPTODIS                 0.235411
TOTHLOS                  0.201349
COL_ANASTOMOTIC_8        0.198129
COL_ANASTOMOTIC_5        0.186951
COL_ILEUS_1              0.174729
DISCHDEST_3              0.110311
ORGSPCSSI                0.105135
MORBPROB                 0.097193
STILLINHOSP              0.094063
DEHIS                    0.092742
REOPERATION2             0.091668
BLEEDIS_2                0.086678
OTHCDIFF_2               0.085973
OSSIPATOS                0.084908
OTHBLEED                 0.082583
DISCHDEST_1              0.076188
COL_NODESEVAL_102        0.068068
WOUND_CLOSURE_3          0.068068
OTHSYSEP                 0.066596
MORTPROB                 0.062967
OTHSESHOCK               0.060110
ELECTSURG_1              0.059499
EMERGNCY                 0.058867
CPT                      0.057512
DISCHDEST_8              0.057504
COL_APPROACH_2