## Importing Data and Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, PowerTransformer, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
import scipy.stats as stats
from sklearn.feature_selection import mutual_info_classif
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# Lift pandas' display truncation so wide and long frames are rendered in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Read the diabetes extract (a CSV expected in the working directory) and preview it.
DATA_FILE = 'diabetes_50k_for_feature_selection.csv'
df = pd.read_csv(DATA_FILE)
df.head()

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,glyburide-metformin,diabetesMed,readmitted,preceding_year_visits,number_changes,insulin_treatment
0,Caucasian,1,85,Transferred to another medical facility,Transferred from another health care facility,4,56,0,4,Genitourinary,Infectious and Parasitic,"Endocrine, Nutritional, Metabolic, Immunity",9,,,-2,-2,-2,0,-2,-2,-2,-2,-2,-2,1,0,1,0,other_meds
1,Caucasian,1,75,Discharged to home,Referral,1,46,3,13,Circulatory,Circulatory,Circulatory,8,,,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,0,0,0,0,no_med
2,AfricanAmerican,1,55,Not Available,Emergency,5,52,1,23,Musculoskeletal System and Connective Tissue,Infectious and Parasitic,Mental Disorders,5,,,-2,-2,-2,-2,-2,-2,-2,-2,0,-2,1,0,2,0,insulin_only
3,Caucasian,1,65,Discharged to home,Referral,5,27,2,28,Neoplasms,Neoplasms,Respiratory,8,,,-2,-2,-2,-2,-2,1,-2,-2,0,-2,1,0,0,1,insulin_combo
4,Caucasian,0,85,Transferred to another medical facility,Referral,11,73,0,23,Circulatory,Circulatory,Circulatory,9,,>8,-2,-2,-2,-2,-2,-2,-2,-2,1,-2,1,0,3,1,insulin_only


In [3]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(50000, 30)

## Preparing Data

In [4]:
# Split the frame into predictors (X) and the target column (y).
# The column is dropped via the `columns=` keyword: passing the axis positionally
# was deprecated in pandas 1.3 and raises a TypeError from pandas 2.0.
# NOTE(review): X still contains the 'Unnamed: 0' column read from the CSV, which looks
# like a saved row index rather than a real feature — confirm whether it should be kept.
X = df.drop(columns = 'readmitted')

# One-hot encode the non-numeric predictors; drop_first omits the first level of each
# encoded variable.
X_dum = pd.get_dummies(X, drop_first = True)
y = df['readmitted']

# Sanity check: raw predictors, encoded predictors and target should share the row count
X.shape, X_dum.shape, y.shape

((50000, 29), (50000, 97), (50000,))

In [5]:
# Replace every non-alphanumeric character in the encoded column names with "_".
# NOTE(review): presumably needed because some of the boosting libraries reject
# special characters (e.g. '>', ',' or spaces) in feature names — confirm.
sanitized_names = []
for column_name in X_dum.columns:
    safe_chars = [ch if ch.isalnum() else "_" for ch in str(column_name)]
    sanitized_names.append("".join(safe_chars))
X_dum.columns = sanitized_names

In [6]:
# Candidate classifiers as (short name, estimator) pairs.
# LR and KNN are wrapped in a pipeline that applies a power transform followed by
# standardisation before the estimator; the remaining models receive the features as-is.
models = [
    ('LGB', LGBMClassifier(random_state = 0, n_jobs = -1)),
    # n_jobs is deliberately not passed to LogisticRegression: scikit-learn ignores it
    # for the 'liblinear' solver and emits a warning at fit time when it is set.
    ('LR', Pipeline([("Transformer", PowerTransformer()),
                     ("Scaler", StandardScaler()),
                     ("LogReg", LogisticRegression(random_state = 0, solver = 'liblinear'))])),
    ('KNN', Pipeline([("Transformer", PowerTransformer()),
                      ("Scaler", StandardScaler()),
                      ("KNN", KNeighborsClassifier(n_jobs = -1))])),
    ('DT', DecisionTreeClassifier(random_state = 0)),
    ('BC', BaggingClassifier(random_state = 0, n_jobs = -1)),
    ('ET', ExtraTreesClassifier(random_state = 0, n_jobs = -1)),
    ('RF', RandomForestClassifier(random_state = 0, n_jobs = -1)),
    ('ADA', AdaBoostClassifier(random_state = 0)),
    ('GB', GradientBoostingClassifier(random_state = 0)),
    ('XGB', XGBClassifier(random_state = 0, n_jobs = -1)),
]

## Sequential Forward Selection

In [7]:
# Size of the encoded feature matrix (rows, columns) that the selectors below search over
X_dum.shape

(50000, 97)

In [None]:
# Base estimator used to score each candidate feature subset
model = ExtraTreesClassifier(random_state = 0, n_jobs = -1)

# Create the forward sequential selector.
# The upper bound of k_features is read from the data instead of being hard-coded, so
# the search still spans every subset size if the number of encoded columns changes.
sfs = SFS(estimator = model,
          k_features = (1, X_dum.shape[1]),
          forward = True,       # True = forward selection
          scoring = 'recall',   # subsets are compared on recall
          cv = 5)               # 5-fold cross-validation for every candidate subset

# Run the forward search on the encoded features
sfs = sfs.fit(X_dum, y)

In [None]:
# Show the performance of each subset of features considered by SFS
sfs_results = pd.DataFrame.from_dict(sfs.subsets_).T.sort_values(by = 'avg_score', ascending = False) 
sfs_results

In [None]:
# Persist the forward-selection table; the frame's index is not written to the file
sfs_results.to_csv(
    path_or_buf = '50K_ExtraTrees_Forward_Results_Saurabh.csv',
    index = False,
)

## Sequential Backward Selection

In [None]:
# Base estimator used to score each candidate feature subset
model = ExtraTreesClassifier(random_state = 0, n_jobs = -1)

# Create the backward sequential selector (same mlxtend class, with forward = False).
# The upper bound of k_features is read from the data instead of being hard-coded, so
# the search still spans every subset size if the number of encoded columns changes.
sbs = SFS(estimator = model,
          k_features = (1, X_dum.shape[1]),
          forward = False,      # False = backward selection
          scoring = 'recall',   # subsets are compared on recall
          cv = 5)               # 5-fold cross-validation for every candidate subset

# Run the backward search on the encoded features
sbs = sbs.fit(X_dum, y)

In [None]:
# Show the performance of each subset of features considered by SFS
sbs_results = pd.DataFrame.from_dict(sbs.subsets_).T.sort_values(by = 'avg_score', ascending = False) 
sbs_results

In [None]:
# Persist the backward-selection table; the frame's index is not written to the file
sbs_results.to_csv(
    path_or_buf = '50K_ExtraTrees_Backward_Results_Saurabh.csv',
    index = False,
)