In [1]:
# Working directory set
import sys
sys.path.append("../scripts/")

In [2]:
import pandas as pd
import numpy as np
import json
import seaborn as sns

from clean_data import load_and_clean_data

pd.set_option('display.max_rows', 400)

In [3]:
data = load_and_clean_data()

  if (await self.run_code(code, result,  async_=asy)):
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [4]:
data.shape

(222647, 298)

In [47]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc

X = data.drop('target', axis = 1)
y = data['target']

# 70 20 10 split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .1, random_state = 42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = .25, random_state = 42)

def run_baseline_dt(X_train, y_train, X_val, y_val):
    dt = DecisionTreeClassifier()
    dt.fit(X_train, y_train)
    y_pred = dt.predict_proba(X_val)[:,1]
    
    fpr, tpr, thresholds = roc_curve(y_val, y_pred)
    roc_auc = auc(fpr, tpr)
    
    importances = {}
    for name, importance in zip(X_val.columns, dt.feature_importances_):
        importances[name] = importance
        
    imp = pd.DataFrame.from_dict(importances, orient = 'index', columns = ['importance'])
    imp.sort_values(by = 'importance', ascending = False, inplace = True)
    
    return fpr, tpr, roc_auc, imp

In [6]:
fpr, tpr, roc_auc, imp = run_baseline_dt(data)

In [7]:
print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(150286, 297)
(50096, 297)
(22265, 297)


# Feature Selection

## Drop 0 feature importance features

In [8]:
#drop columns that have 0 feature importance
imp.reset_index(inplace = True)

In [9]:
imp.drop(np.where(imp.importance == 0)[0], inplace = True)

In [10]:
cols_to_keep = np.array(imp['index'])

In [11]:
len(cols_to_keep)

263

In [12]:
X2_train = X_train[cols_to_keep]

In [13]:
X2_train.shape

(150286, 263)

## Recursive Feature Elimination (RFE)

In [14]:
from sklearn.feature_selection import RFE
from xgboost import XGBClassifier

In [15]:
estimator = DecisionTreeClassifier()
num_features = 125
step = 5
rfe_dt = RFE(estimator, n_features_to_select = num_features, step = step).fit(X2_train, y_train)

In [16]:
dt_feat = X2_train.columns[rfe_dt.support_]
dt_feat

Index(['OTHBLEED', 'DOPTODIS', 'OTHSYSEP', 'MORBPROB', 'OPTIME', 'PRHCT',
       'bmi', 'PRPLATE', 'MORTPROB', 'PRWBC',
       ...
       'COL_MALIGNANCYN_6', 'COL_NODESEVAL_9', 'TRANST_1', 'FNSTATUS2_1',
       'DISCHDEST_6', 'DYSPNEA_2', 'COL_NODESEVAL_20', 'RACE_NEW_1',
       'COL_MALIGNANCYN_7', 'COL_MALIGNANCYT_5'],
      dtype='object', length=125)

In [17]:
estimator = XGBClassifier()
rfe_xgb = RFE(estimator, n_features_to_select = num_features, step = step).fit(X2_train, y_train)





















































































































In [18]:
xgb_feat = X2_train.columns[rfe_xgb.support_]

#see how many features these two have in common by using set
all_feat_rfe = set().union(dt_feat, xgb_feat)
print(f"There are {len(all_feat_rfe)} features in the union of the 2 RFEs")

There are 162 features in the union of the 2 RFEs


In [19]:
rfe_feat = list(all_feat_rfe)
rfe_feat

['TOTHLOS',
 'COL_MALIGNANCYM_2',
 'COL_NODESEVAL_13',
 'DISCHDEST_8',
 'COL_MALIGNANCYM_1',
 'TRANSFUS',
 'COL_NODESEVAL_18',
 'ASACLAS_2',
 'COL_NODESEVAL_9',
 'COL_MECH_BOWEL_PREP_2',
 'SUPINFEC',
 'COL_MALIGNANCYT_3',
 'BLEEDIS_2',
 'COL_MALIGNANCYM_4',
 'COL_NODESEVAL_15',
 'DOPTODIS',
 'COL_NODESEVAL_5',
 'RACE_NEW_1',
 'BLEEDIS_1',
 'OPRENAFL',
 'PRNCPTX_3',
 'COL_MECH_BOWEL_PREP_1',
 'PRPLATE',
 'COL_MALIGNANCYM_5',
 'COL_NODESEVAL_16',
 'COL_NODESEVAL_20',
 'OTHCDIFF_1',
 'COL_NODESEVAL_17',
 'COL_ANASTOMOTIC_2',
 'COL_NODESEVAL_8',
 'COL_ANASTOMOTIC_3',
 'OTHCDIFF_2',
 'COL_ORAL_ANTIBIOTIC_1',
 'SSSIPATOS',
 'COL_MALIGNANCYN_4',
 'COL_NODESEVAL_36',
 'HXCOPD',
 'ETHNICITY_HISPANIC_1',
 'num_concurr_procs',
 'COL_ORAL_ANTIBIOTIC_2',
 'PULEMBOL',
 'COL_ANASTOMOTIC_5',
 'PRSODM',
 'PRNCPTX_2',
 'ASACLAS_3',
 'COL_NODESEVAL_10',
 'COL_NODESEVAL_26',
 'COL_MALIGNANCYN_2',
 'ANESTHES_5',
 'REINTUB',
 'COL_NODESEVAL_2',
 'ADMQTR',
 'COL_NODESEVAL_69',
 'URNINFEC',
 'female',
 'diabe

In [20]:
X3_train = X2_train[rfe_feat]
X3_train.shape

(150286, 162)

In [21]:
X2_val = X_val[cols_to_keep]
X3_val = X2_val[rfe_feat]
X3_val.shape

(50096, 162)

In [22]:
dt = DecisionTreeClassifier()
dt.fit(X3_train, y_train)
y_pred = dt.predict_proba(X3_val)[:,1]

fpr, tpr, thresholds = roc_curve(y_val, y_pred)
roc_auc = auc(fpr, tpr)
roc_auc

0.7119848303217398

## Select K Best

In [44]:
from sklearn.feature_selection import SelectKBest, f_classif
from collections import defaultdict

In [56]:
k_best = defaultdict(list)
k_vals = [125, 110, 100, 90, 85, 75, 65, 50]
cols = X3_train.columns
for k in k_vals:
    train = SelectKBest(f_classif, k).fit(X3_train, y_train)
    mask = train.get_support()
    columns = cols[mask]
    train = X3_train[columns]
    val = X3_val[columns]
    fpr, tpr, roc_auc, imp = run_baseline_dt(train, y_train, val, y_val)
    roc_auc = auc(fpr, tpr)
    k_best[k] = [roc_auc, columns]



In [70]:
best_cols = k_best[125][1]
X4_train = X3_train[best_cols]

In [71]:
X4_train.shape

(150286, 125)

## Remove Highly Correlated Features

In [72]:
from sklearn.feature_selection import mutual_info_classif

def mutual_information_ranking(X, y):
    """
    Use mutual information to rank features by importance
    """
    
    mi_scores = mutual_info_classif(X, y)

    mi_features = pd.Series(dict(zip(X.columns, mi_scores)))

    return mi_features.sort_values(ascending=False)

In [73]:
def pairwise_correlation_ranking(X, threshold):
    """
    Returns pairwise correlation of features ranked by absolute value, if above a certain threshold
    """
    
    corr = X.corr()
    
    corrdict = {}
    for i in range(len(corr)):
        for j in range(len(corr.columns)):
            if i != j and np.abs(corr.iloc[i,j] > threshold):
                corrdict[tuple(sorted([corr.columns[i], corr.columns[j]]))] = corr.iloc[i,j]
    return np.array(sorted(corrdict.items(), key=lambda x: np.abs(x[1]), reverse=True), dtype=object)

In [74]:
def filter_by_correlation(X, y, hp=None):
    """
    Greedy filtering of features by correlation
    """
    threshold = 0.85
    if hp is not None:
        threshold = hp['threshold']
    correlations = pairwise_correlation_ranking(X, threshold)
    while len(correlations) > 0:
        worst_feature = mutual_information_ranking(X[list(correlations[0][0])], y).idxmin()
        X.drop(worst_feature, axis=1, inplace=True)
        correlations = pairwise_correlation_ranking(X, threshold)
    return X

In [75]:
X5_train = filter_by_correlation(X4_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [77]:
X5_train.columns

Index(['COL_MALIGNANCYM_2', 'DISCHDEST_8', 'COL_MALIGNANCYM_1', 'TRANSFUS',
       'ASACLAS_2', 'COL_MECH_BOWEL_PREP_2', 'SUPINFEC', 'COL_MALIGNANCYT_3',
       'COL_MALIGNANCYM_4', 'DOPTODIS',
       ...
       'HYPERMED', 'OUPNEUMO', 'DSSIPATOS', 'COL_MALIGNANCYN_9',
       'COL_NODESEVAL_57', 'SMOKE', 'COL_ANASTOMOTIC_4', 'WNDINFD', 'PRBUN',
       'OTHSESHOCK'],
      dtype='object', length=122)