## Importing Data and Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, PowerTransformer, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
import scipy.stats as stats
from sklearn.feature_selection import mutual_info_classif
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# Disable pandas' display truncation: None removes the column and row limits,
# so any DataFrame shown in a cell output is rendered in full.
# NOTE(review): with max_rows=None, displaying a large frame without .head()
# will print every row — keep previews explicit.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

  import pandas.util.testing as tm


In [None]:
# Mount Google Drive at /content/drive; the CSV paths used by later cells live
# under '/content/drive/My Drive/...'.
# NOTE(review): `google.colab` is presumably only importable inside a Colab
# runtime, so this cell will fail elsewhere — confirm before running locally.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the diabetes extract prepared for feature selection, then preview the first rows
source_csv = '/content/drive/My Drive/PGPDSE/Capstone/CSV Files/diabetes_20k_for_feature_selection.csv'
df = pd.read_csv(source_csv)
df.head()

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,glyburide-metformin,diabetesMed,readmitted,preceding_year_visits,number_changes,insulin_treatment
0,Caucasian,1,85,Transferred to another medical facility,Transferred from another health care facility,4,56,0,4,Genitourinary,Infectious and Parasitic,"Endocrine, Nutritional, Metabolic, Immunity",9,,,-2,-2,-2,0,-2,-2,-2,-2,-2,-2,1,0,1,0,other_meds
1,Caucasian,1,75,Discharged to home,Referral,1,46,3,13,Circulatory,Circulatory,Circulatory,8,,,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,0,0,0,0,no_med
2,AfricanAmerican,1,55,Not Available,Emergency,5,52,1,23,Musculoskeletal System and Connective Tissue,Infectious and Parasitic,Mental Disorders,5,,,-2,-2,-2,-2,-2,-2,-2,-2,0,-2,1,0,2,0,insulin_only
3,Caucasian,1,65,Discharged to home,Referral,5,27,2,28,Neoplasms,Neoplasms,Respiratory,8,,,-2,-2,-2,-2,-2,1,-2,-2,0,-2,1,0,0,1,insulin_combo
4,Caucasian,0,85,Transferred to another medical facility,Referral,11,73,0,23,Circulatory,Circulatory,Circulatory,9,,>8,-2,-2,-2,-2,-2,-2,-2,-2,1,-2,1,0,3,1,insulin_only


In [None]:
# Sanity check: (rows, columns) of the loaded frame.
df.shape

(20000, 30)

## Preparing Data

In [None]:
# Split the frame into predictors (X) and target (y).
# `columns=` replaces the positional axis argument (`df.drop('readmitted', 1)`):
# passing axis positionally was deprecated in pandas 1.3 and raises a TypeError
# from pandas 2.0 onward.
X = df.drop(columns = 'readmitted')
# One-hot encode the non-numeric columns; drop_first=True omits the first level of
# each encoded column so the dummy columns are not perfectly collinear.
X_dum = pd.get_dummies(X, drop_first = True)
y = df['readmitted']
# Display the three shapes to confirm the row counts agree.
X.shape, X_dum.shape, y.shape

((20000, 29), (20000, 97), (20000,))

In [None]:
# Rewrite each dummy column name so that every non-alphanumeric character
# (spaces, commas, slashes, hyphens, ...) becomes an underscore.
# NOTE(review): presumably needed because LightGBM rejects special characters in
# feature names; the saved SFS outputs in this notebook still show the
# unsanitised names, so confirm this cell runs before the SFS cells.
X_dum.columns = [
    "".join(map(lambda ch: ch if ch.isalnum() else "_", str(col)))
    for col in X_dum.columns
]

In [None]:
# Candidate classifiers as (short_name, estimator) pairs, in evaluation order.
# LR and KNN are wrapped in a Pipeline that applies PowerTransformer and then
# StandardScaler before the estimator; the tree-based models are used directly.
# Every estimator that takes a seed uses random_state = 0.
models = [
    ('LGB', LGBMClassifier(random_state = 0, n_jobs = -1)),
    # n_jobs is deliberately not passed to LogisticRegression: scikit-learn ignores
    # it for solver='liblinear' and warns when it resolves to more than one core.
    ('LR', Pipeline([("Transformer", PowerTransformer()),
                     ("Scaler", StandardScaler()),
                     ("LogReg", LogisticRegression(random_state = 0, solver = 'liblinear'))])),
    ('KNN', Pipeline([("Transformer", PowerTransformer()),
                      ("Scaler", StandardScaler()),
                      ("KNN", KNeighborsClassifier(n_jobs = -1))])),
    ('DT', DecisionTreeClassifier(random_state = 0)),
    ('BC', BaggingClassifier(random_state = 0, n_jobs = -1)),
    ('ET', ExtraTreesClassifier(random_state = 0, n_jobs = -1)),
    ('RF', RandomForestClassifier(random_state = 0, n_jobs = -1)),
    ('ADA', AdaBoostClassifier(random_state = 0)),
    ('GB', GradientBoostingClassifier(random_state = 0)),
    ('XGB', XGBClassifier(random_state = 0, n_jobs = -1)),
]

## Sequential Forward Selection (scoring: recall)

In [None]:
# Confirm the number of candidate features (columns of X_dum) before running SFS.
X_dum.shape

(20000, 97)

In [None]:
# Base estimator for the wrapper search; class_weight = 'balanced' asks LightGBM to
# weight the classes inversely to their frequency in y.
model = LGBMClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced')

# Create an SFS object.
# The upper bound of k_features is read from the data instead of the literal 97,
# so the cell keeps working if the number of dummy columns changes.
sfs = SFS(estimator = model,
          k_features = (1, X_dum.shape[1]),
          forward = True,       # forward = True -> sequential *forward* selection
          scoring = 'recall',
          cv = 3)               # 3-fold cross-validation for every candidate subset

# NOTE(review): recall on its own can be maximised by predicting the positive class
# for nearly every row; the saved results show ~1.0 recall from a single dummy
# feature, so confirm that recall alone is the intended selection objective.

# Train SFS with our dataset (one model fit per candidate feature per fold — slow).
sfs = sfs.fit(X_dum, y)

In [None]:
# One row per subset size evaluated by SFS, ordered from best to worst mean CV score
recall_subsets = pd.DataFrame.from_dict(sfs.subsets_).T
sfs_results = recall_subsets.sort_values(by = 'avg_score', ascending = False)
sfs_results

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names
1,"(84,)","[1.0, 1.0, 0.998730964467005]",0.999577,"(diag_3_Pregnancy, Childbirth,)"
3,"(51, 67, 84)","[1.0, 1.0, 0.998730964467005]",0.999577,"(diag_2_Congenital Anomalies, diag_2_Sense Org..."
2,"(51, 84)","[1.0, 1.0, 0.998730964467005]",0.999577,"(diag_2_Congenital Anomalies, diag_3_Pregnancy..."
4,"(51, 65, 67, 84)","[0.9974651457541192, 1.0, 0.998730964467005]",0.998732,"(diag_2_Congenital Anomalies, diag_2_Pregnancy..."
5,"(46, 51, 65, 67, 84)","[0.9974651457541192, 0.9987325728770595, 0.996...",0.997464,"(diag_1_Pregnancy, Childbirth, diag_2_Congenit..."
6,"(25, 46, 51, 65, 67, 84)","[0.9974651457541192, 0.9987325728770595, 0.996...",0.997464,"(discharge_disposition_id_Left AMA, diag_1_Pre..."
7,"(25, 46, 48, 51, 65, 67, 84)","[0.9961977186311787, 0.9961977186311787, 0.994...",0.995773,"(discharge_disposition_id_Left AMA, diag_1_Pre..."
8,"(25, 46, 48, 51, 63, 65, 67, 84)","[0.9949302915082383, 0.9949302915082383, 0.992...",0.994082,"(discharge_disposition_id_Left AMA, diag_1_Pre..."
9,"(25, 46, 48, 51, 63, 65, 67, 84, 86)","[0.991128010139417, 0.9949302915082383, 0.9898...",0.991969,"(discharge_disposition_id_Left AMA, diag_1_Pre..."
10,"(20, 25, 46, 48, 51, 63, 65, 67, 84, 86)","[0.9873257287705957, 0.9923954372623575, 0.980...",0.986895,"(race_Asian, discharge_disposition_id_Left AMA..."


In [None]:
# Feature names of the top-ranked subset (first row after sorting by avg_score);
# 'feature_names' is the column at position 3 of the results table.
sfs_results['feature_names'].iloc[0]

('diag_3_Pregnancy, Childbirth',)

In [None]:
# Persist the recall-based SFS table to Drive.
# index = False: the row labels (the subset sizes) are not written to the file.
recall_results_csv = '/content/drive/My Drive/PGPDSE/Capstone/CSV Files/20K_LGBM_Forward_Results_Saurabh.csv'
sfs_results.to_csv(recall_results_csv, index = False)

## Sequential Forward Selection (scoring: F1)

In [None]:
# Re-check the shapes of the predictors and the target before the second SFS run.
X_dum.shape, y.shape

((20000, 97), (20000,))

In [None]:
# Base estimator for the F1-scored search; class_weight = 'balanced' asks LightGBM to
# weight the classes inversely to their frequency in y.
model1 = LGBMClassifier(random_state = 0, n_jobs = -1, class_weight = 'balanced')

# Create an SFS object.
# The upper bound of k_features is read from the data instead of the literal 97,
# so the cell keeps working if the number of dummy columns changes.
sfs1 = SFS(estimator = model1,
          k_features = (1, X_dum.shape[1]),
          forward = True,       # forward = True -> sequential *forward* selection
          scoring = 'f1',
          cv = 3)               # 3-fold cross-validation (the old comment said 5; the code uses 3)

# Train SFS with our dataset (one model fit per candidate feature per fold — slow).
sfs1 = sfs1.fit(X_dum, y)

In [None]:
# One row per subset size evaluated by the F1-scored SFS, best mean CV score first
f1_subsets = pd.DataFrame.from_dict(sfs1.subsets_).T
sfs1_results = f1_subsets.sort_values(by = 'avg_score', ascending = False)
sfs1_results

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names
24,"(16, 18, 22, 23, 25, 27, 28, 41, 43, 46, 47, 4...","[0.2824644549763033, 0.2667087011349306, 0.267...",0.2721,"(glyburide-metformin, preceding_year_visits, r..."
26,"(16, 18, 22, 23, 25, 27, 28, 41, 43, 46, 47, 4...","[0.2824117463292721, 0.2698059179128221, 0.263...",0.271929,"(glyburide-metformin, preceding_year_visits, r..."
22,"(16, 18, 22, 23, 25, 27, 28, 43, 46, 47, 49, 5...","[0.2807680201447907, 0.26803482587064675, 0.26...",0.271837,"(glyburide-metformin, preceding_year_visits, r..."
21,"(16, 18, 22, 23, 25, 27, 28, 43, 46, 47, 49, 5...","[0.2813089993706734, 0.26762320648783533, 0.26...",0.271685,"(glyburide-metformin, preceding_year_visits, r..."
23,"(16, 18, 22, 23, 25, 27, 28, 43, 46, 47, 48, 4...","[0.2802627463246794, 0.26824703680598877, 0.26...",0.271614,"(glyburide-metformin, preceding_year_visits, r..."
19,"(16, 18, 22, 25, 27, 28, 43, 46, 47, 49, 51, 5...","[0.2807017543859649, 0.2663769015833592, 0.267...",0.271597,"(glyburide-metformin, preceding_year_visits, r..."
25,"(16, 18, 22, 23, 25, 27, 28, 41, 43, 46, 47, 4...","[0.2810071495181847, 0.26756066411238827, 0.26...",0.27159,"(glyburide-metformin, preceding_year_visits, r..."
20,"(16, 18, 22, 25, 27, 28, 43, 46, 47, 49, 51, 5...","[0.27972465581977474, 0.26712328767123283, 0.2...",0.271282,"(glyburide-metformin, preceding_year_visits, r..."
18,"(16, 18, 22, 25, 27, 28, 43, 46, 47, 49, 56, 5...","[0.2802507836990596, 0.26645962732919254, 0.26...",0.271278,"(glyburide-metformin, preceding_year_visits, r..."
16,"(16, 18, 22, 25, 27, 28, 43, 46, 47, 49, 59, 6...","[0.28201970443349755, 0.26634086744043983, 0.2...",0.271079,"(glyburide-metformin, preceding_year_visits, r..."


In [None]:
# Feature names of the top-ranked subset (first row after sorting by avg_score);
# 'feature_names' is the column at position 3 of the results table.
sfs1_results['feature_names'].iloc[0]

('glyburide-metformin',
 'preceding_year_visits',
 'race_Hispanic',
 'race_Other',
 'discharge_disposition_id_Left AMA',
 'discharge_disposition_id_Still patient/referred to this institution',
 'discharge_disposition_id_Transferred to another medical facility',
 'diag_1_Mental Disorders',
 'diag_1_Neoplasms',
 'diag_1_Pregnancy, Childbirth',
 'diag_1_Respiratory',
 'diag_1_Sense Organs',
 'diag_1_Skin and Subcutaneous Tissue',
 'diag_2_Congenital Anomalies',
 'diag_2_Genitourinary',
 'diag_2_Mental Disorders',
 'diag_2_Musculoskeletal System and Connective Tissue',
 'diag_2_Nervous',
 'diag_2_Not Required',
 'diag_2_Other Symptoms',
 'diag_2_Pregnancy, Childbirth',
 'diag_2_Sense Organs',
 'diag_3_Pregnancy, Childbirth',
 'diag_3_Sense Organs')

In [None]:
# Persist the F1-based SFS table to Drive.
# index = False: the row labels (the subset sizes) are not written to the file.
f1_results_csv = '/content/drive/My Drive/PGPDSE/Capstone/CSV Files/20K_LGBM_ForwardF1_Results_Saurabh.csv'
sfs1_results.to_csv(f1_results_csv, index = False)