## Importing Data and Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, PowerTransformer, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
import scipy.stats as stats
from sklearn.feature_selection import mutual_info_classif
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# Remove pandas' display limits so every column and every row of a frame is shown.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

  import pandas.util.testing as tm


In [None]:
# Mount Google Drive in the Colab runtime; the CSV files used by this notebook
# are read from and written to paths under this mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Read the feature-selection dataset from the mounted Drive folder and preview
# its first rows.
source_csv = '/content/drive/My Drive/PGPDSE/Capstone/CSV Files/diabetes_25k_for_feature_selection.csv'
df = pd.read_csv(source_csv)
df.head()

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,glyburide-metformin,diabetesMed,readmitted,preceding_year_visits,number_changes,insulin_treatment
0,Caucasian,1,85,Transferred to another medical facility,Transferred from another health care facility,4,56,0,4,Genitourinary,Infectious and Parasitic,"Endocrine, Nutritional, Metabolic, Immunity",9,,,-2,-2,-2,0,-2,-2,-2,-2,-2,-2,1,0,1,0,other_meds
1,Caucasian,1,75,Discharged to home,Referral,1,46,3,13,Circulatory,Circulatory,Circulatory,8,,,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,0,0,0,0,no_med
2,AfricanAmerican,1,55,Not Available,Emergency,5,52,1,23,Musculoskeletal System and Connective Tissue,Infectious and Parasitic,Mental Disorders,5,,,-2,-2,-2,-2,-2,-2,-2,-2,0,-2,1,0,2,0,insulin_only
3,Caucasian,1,65,Discharged to home,Referral,5,27,2,28,Neoplasms,Neoplasms,Respiratory,8,,,-2,-2,-2,-2,-2,1,-2,-2,0,-2,1,0,0,1,insulin_combo
4,Caucasian,0,85,Transferred to another medical facility,Referral,11,73,0,23,Circulatory,Circulatory,Circulatory,9,,>8,-2,-2,-2,-2,-2,-2,-2,-2,1,-2,1,0,3,1,insulin_only


In [None]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(25000, 30)

## Preparing Data

In [None]:
# Predictors: every column except the target.
# `columns=` replaces the positional axis argument (`df.drop('readmitted', 1)`),
# which pandas deprecated and no longer accepts from version 2.0 onwards.
X = df.drop(columns = 'readmitted')

# One-hot encode the categorical predictors; drop_first = True omits the first
# level of each encoded column.
X_dum = pd.get_dummies(X, drop_first = True)

# Target: the `readmitted` column.
y = df['readmitted']

# Shapes of the raw predictors, the encoded predictors and the target.
X.shape, X_dum.shape, y.shape

((25000, 29), (25000, 97), (25000,))

In [None]:
# Rewrite every column name so it contains only alphanumeric characters and
# underscores: any other character (space, comma, hyphen, ...) becomes "_".
# NOTE(review): presumably needed because some of the boosted-tree libraries
# imported above reject special characters in feature names — confirm.
sanitized_names = []
for column_name in X_dum.columns:
    safe_chars = [ch if ch.isalnum() else "_" for ch in str(column_name)]
    sanitized_names.append("".join(safe_chars))
X_dum.columns = sanitized_names

In [None]:
# Candidate classifiers as (short name, estimator) pairs, built in one literal
# instead of repeated append calls.
# Logistic regression and KNN are wrapped in a pipeline that applies a
# PowerTransformer followed by a StandardScaler before the estimator; the
# tree-based models are used on the features as they are.
models = [
    ('LGB', LGBMClassifier(random_state = 0, n_jobs = -1)),
    ('LR', Pipeline([("Transformer", PowerTransformer()),
                     ("Scaler", StandardScaler()),
                     # n_jobs is deliberately not passed: scikit-learn ignores it
                     # for the liblinear solver (and warns), and the LogReg
                     # pipelines in the selection cells below do not set it either.
                     ("LogReg", LogisticRegression(random_state = 0, solver = 'liblinear'))])),
    ('KNN', Pipeline([("Transformer", PowerTransformer()),
                      ("Scaler", StandardScaler()),
                      ("KNN", KNeighborsClassifier(n_jobs = -1))])),
    ('DT', DecisionTreeClassifier(random_state = 0)),
    ('BC', BaggingClassifier(random_state = 0, n_jobs = -1)),
    ('ET', ExtraTreesClassifier(random_state = 0, n_jobs = -1)),
    ('RF', RandomForestClassifier(random_state = 0, n_jobs = -1)),
    ('ADA', AdaBoostClassifier(random_state = 0)),
    ('GB', GradientBoostingClassifier(random_state = 0)),
    ('XGB', XGBClassifier(random_state = 0, n_jobs = -1)),
]

## Sequential Forward Selection

In [None]:
# (rows, encoded feature columns) handed to the feature selector below.
X_dum.shape

(25000, 97)

In [None]:
# Estimator evaluated by the selector: power-transform and standardise the
# features, then fit a liblinear logistic regression.
model = Pipeline([("Transformer", PowerTransformer()),
                  ("Scaler", StandardScaler()),
                  ("LogReg", LogisticRegression(random_state = 0, solver = 'liblinear'))])

# Sequential forward selection over every subset size from 1 up to the number
# of encoded columns (taken from X_dum instead of the hard-coded 97, so the
# cell keeps working if the encoding produces a different column count).
# NOTE(review): the recorded run of this cell produced an average recall of
# about 0.0003 for the top-ranked subsets, i.e. almost no positives are being
# predicted, so the ranking carries little information. Consider
# class_weight = 'balanced' or a different scoring metric — confirm.
sfs = SFS(estimator = model,
          k_features = (1, X_dum.shape[1]),
          forward = True,        # True = forward selection
          scoring = 'recall',    # metric used to score each candidate subset
          cv = 3,                # number of cross-validation folds
          verbose = 1)           # print progress while fitting

# Train SFS with our dataset
sfs = sfs.fit(X_dum, y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  97 out of  97 | elapsed:   14.1s finished
Features: 1/97[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  96 out of  96 | elapsed:   35.1s finished
Features: 2/97[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  95 out of  95 | elapsed:   44.0s finished
Features: 3/97[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  94 out of  94 | elapsed:   53.7s finished
Features: 4/97[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  93 out of  93 | elapsed:  1.0min finished
Features: 5/97[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  92 out of  92 | elapsed:  1.2min finished
Features: 6/97[Parallel(

In [None]:
# Show the performance of each subset of features considered by SFS
# One row per subset recorded in sfs.subsets_, ordered from the highest to the
# lowest average cross-validation score.
sfs_results = (
    pd.DataFrame.from_dict(sfs.subsets_)
    .transpose()
    .sort_values(by = 'avg_score', ascending = False)
)
sfs_results

Unnamed: 0,feature_idx,cv_scores,avg_score,feature_names
49,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.0010224948875255625, 0.0, 0.0]",0.000340832,"(gender, age, time_in_hospital, num_lab_proced..."
50,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.0010224948875255625, 0.0, 0.0]",0.000340832,"(gender, age, time_in_hospital, num_lab_proced..."
72,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.0010224948875255625, 0.0, 0.0]",0.000340832,"(gender, age, time_in_hospital, num_lab_proced..."
71,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.0010224948875255625, 0.0, 0.0]",0.000340832,"(gender, age, time_in_hospital, num_lab_proced..."
70,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.0010224948875255625, 0.0, 0.0]",0.000340832,"(gender, age, time_in_hospital, num_lab_proced..."
69,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.0010224948875255625, 0.0, 0.0]",0.000340832,"(gender, age, time_in_hospital, num_lab_proced..."
68,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.0010224948875255625, 0.0, 0.0]",0.000340832,"(gender, age, time_in_hospital, num_lab_proced..."
67,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.0010224948875255625, 0.0, 0.0]",0.000340832,"(gender, age, time_in_hospital, num_lab_proced..."
66,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.0010224948875255625, 0.0, 0.0]",0.000340832,"(gender, age, time_in_hospital, num_lab_proced..."
65,"(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...","[0.0010224948875255625, 0.0, 0.0]",0.000340832,"(gender, age, time_in_hospital, num_lab_proced..."


In [None]:
# Save the ranked forward-selection table to Drive.
# index = False leaves the frame's index (the keys of sfs.subsets_) out of the file.
forward_results_csv = '/content/drive/My Drive/PGPDSE/Capstone/CSV Files/25K_LogReg_Forward_Results_Saurabh.csv'
sfs_results.to_csv(forward_results_csv, index = False)

In [None]:
# Feature names of the top-ranked subset (column position 3 is `feature_names`)
# together with the number of features in it.
top_feature_names = sfs_results.iloc[0, 3]
top_feature_names, len(top_feature_names)

(('gender',
  'age',
  'time_in_hospital',
  'num_lab_procedures',
  'num_procedures',
  'num_medications',
  'number_diagnoses',
  'metformin',
  'repaglinide',
  'nateglinide',
  'glimepiride',
  'glipizide',
  'glyburide',
  'pioglitazone',
  'rosiglitazone',
  'insulin',
  'glyburide_metformin',
  'diabetesMed',
  'preceding_year_visits',
  'number_changes',
  'race_Asian',
  'race_Caucasian',
  'race_Hispanic',
  'race_Other',
  'discharge_disposition_id_Discharged_to_home_with_home_health_service',
  'discharge_disposition_id_Left_AMA',
  'discharge_disposition_id_Not_Available',
  'discharge_disposition_id_Still_patient_referred_to_this_institution',
  'admission_source_id_Not_Available',
  'admission_source_id_Referral',
  'admission_source_id_Transferred_from_another_health_care_facility',
  'diag_1_Circulatory',
  'diag_1_Congenital_Anomalies',
  'diag_1_Diabetes',
  'diag_1_Digestive',
  'diag_1_Endocrine__Nutritional__Metabolic__Immunity',
  'diag_1_External_causes_of_injur

## Sequential Backward Selection

In [None]:
# Estimator evaluated by the selector: power-transform and standardise the
# features, then fit a liblinear logistic regression.
model = Pipeline([("Transformer", PowerTransformer()),
                  ("Scaler", StandardScaler()),
                  ("LogReg", LogisticRegression(random_state = 0, solver = 'liblinear'))])

# Sequential backward selection: the same SFS class with forward = False.
# The upper bound of k_features is taken from X_dum instead of the hard-coded 97.
# NOTE(review): the saved output of this cell is "AttributeError: ignored"; the
# cause is not visible in the notebook, so this fit still needs to be verified.
sbs = SFS(estimator = model,
          k_features = (1, X_dum.shape[1]),
          forward = False,       # False = backward selection
          scoring = 'recall',    # metric used to score each candidate subset
          cv = 3,                # number of cross-validation folds
          verbose = 2)           # more detailed progress output than verbose = 1

# Train the backward selector with our dataset
sbs = sbs.fit(X_dum, y)

AttributeError: ignored

In [None]:
# Show the performance of each subset of features considered by SFS
# One row per subset recorded in sbs.subsets_, ordered from the highest to the
# lowest average cross-validation score.
sbs_results = (
    pd.DataFrame.from_dict(sbs.subsets_)
    .transpose()
    .sort_values(by = 'avg_score', ascending = False)
)
sbs_results

In [None]:
# Save the ranked backward-selection table to Drive.
# index = False leaves the frame's index (the keys of sbs.subsets_) out of the file.
backward_results_csv = '/content/drive/My Drive/PGPDSE/Capstone/CSV Files/25K_LogReg_Backward_Results_Saurabh.csv'
sbs_results.to_csv(backward_results_csv, index = False)