## Importing Data and Libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, PowerTransformer, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
import scipy.stats as stats
from sklearn.feature_selection import mutual_info_classif
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

  import pandas.util.testing as tm


In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = pd.read_csv('/content/drive/My Drive/PGPDSE/Capstone/CSV Files/Diabetes_Preprocessed_Before_Feature_Selection.csv')
df.head()

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,glyburide-metformin,diabetesMed,readmitted,preceding_year_visits,number_changes,insulin_treatment
0,Caucasian,0,5,Not Available,Referral,1,41,0,1,Diabetes,Not Required,Not Required,1,,,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,0,0,0,0,no_med
1,Caucasian,0,15,Discharged to home,Emergency,3,59,0,18,"Endocrine, Nutritional, Metabolic, Immunity",Diabetes,"Endocrine, Nutritional, Metabolic, Immunity",9,,,-2,-2,-2,-2,-2,-2,-2,-2,1,-2,1,0,0,1,insulin_only
2,AfricanAmerican,0,25,Discharged to home,Emergency,2,11,5,13,"Pregnancy, Childbirth",Diabetes,External causes of injury,6,,,-2,-2,-2,-2,0,-2,-2,-2,-2,-2,1,0,3,0,other_meds
3,Caucasian,1,35,Discharged to home,Emergency,2,44,1,16,Infectious and Parasitic,Diabetes,Circulatory,7,,,-2,-2,-2,-2,-2,-2,-2,-2,1,-2,1,0,0,1,insulin_only
4,Caucasian,1,45,Discharged to home,Emergency,1,51,0,8,Neoplasms,Neoplasms,Diabetes,5,,,-2,-2,-2,-2,0,-2,-2,-2,0,-2,1,0,0,0,insulin_combo


## Reducing Sample Size

In [None]:
df['readmitted'].value_counts(1)

0    0.885443
1    0.114557
Name: readmitted, dtype: float64

In [None]:
df['age'].mean()

65.72967961265067

In [None]:
df['age'].value_counts(1)

75    0.254785
65    0.222128
55    0.172020
85    0.165262
45    0.096693
35    0.037993
95    0.025971
25    0.016493
15    0.007026
5     0.001628
Name: age, dtype: float64

In [5]:
df_1 = df.sample(n = 10000, random_state = 0)

In [6]:
df_1['readmitted'].value_counts(1)

0    0.8805
1    0.1195
Name: readmitted, dtype: float64

In [7]:
df_1['age'].mean()

65.622

In [8]:
df_1['age'].value_counts(1)

75    0.2577
65    0.2277
55    0.1783
85    0.1578
45    0.0937
35    0.0368
95    0.0239
25    0.0155
15    0.0069
5     0.0017
Name: age, dtype: float64

In [None]:
df_2 = df.sample(n = 50000, weights = 'age')

In [None]:
df_2['age'].mean()

68.5908

In [None]:
df_2['age'].value_counts(1)

75    0.28264
65    0.22300
85    0.19774
55    0.15368
45    0.07554
95    0.03338
35    0.02422
25    0.00744
15    0.00216
5     0.00020
Name: age, dtype: float64

In [None]:
df_2['readmitted'].value_counts(1)

0    0.8837
1    0.1163
Name: readmitted, dtype: float64

In [None]:
df_3 = df.sample(n = 25000)
df_3['readmitted'].value_counts(1)

0    0.88468
1    0.11532
Name: readmitted, dtype: float64

In [None]:
df_3.shape

(25000, 30)

In [None]:
from imblearn.under_sampling import TomekLinks
print('Original dataset shape:', df.shape)
tl = TomekLinks()
X_res, y_res = tl.fit_resample(X_dum, y)
print('Resampled dataset shape:', X_res.shape, y_res.shape)

Original dataset shape: (97070, 30)
Resampled dataset shape: (93242, 97) (93242,)




In [None]:
y_res.sum()

11120

In [None]:
df['readmitted'].value_counts()

0    85950
1    11120
Name: readmitted, dtype: int64

In [None]:
85950 - (93242 - 11120)

3828

In [None]:
df.columns

Index(['race', 'gender', 'age', 'discharge_disposition_id',
       'admission_source_id', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'glimepiride', 'glipizide', 'glyburide',
       'pioglitazone', 'rosiglitazone', 'insulin', 'glyburide-metformin',
       'diabetesMed', 'readmitted', 'preceding_year_visits', 'number_changes',
       'insulin_treatment'],
      dtype='object')

In [9]:
num_cols = ['age', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'number_diagnoses', 'preceding_year_visits', 'number_changes']

In [36]:
df_1 = df.sample(n = 20000, random_state = 0)

In [37]:
df_1['readmitted'].value_counts(1)

0    0.8817
1    0.1183
Name: readmitted, dtype: float64

In [38]:
for i in num_cols:
    mu = df[i].mean()
    st, p = stats.ttest_1samp(df_1[i], mu)
    print(i, p)

age 0.29894553566016163
time_in_hospital 0.9489992871582211
num_lab_procedures 0.7061670659004424
num_procedures 0.35452684071838136
num_medications 0.6100210866421576
number_diagnoses 0.7289820336454846
preceding_year_visits 0.6676726589937962
number_changes 0.7578002857255146


In [39]:
for i in df.columns:
    if i not in num_cols:
        vc = df_1[i].value_counts().values
        vc_exp = df[i].value_counts(1).values
        #print(i, vc)
        n = df_1[i].value_counts().sum()
        #print(n)
        exp = []
        for j in vc_exp:
            exp.append(n * j)
        #print(exp)
        print(i)
        st, p = stats.chisquare(vc, exp)
        print(p)
        print()

race
0.33788160696438035

gender
0.43481196209730444

discharge_disposition_id
0.11939909658145319

admission_source_id
0.6713966404818751

diag_1
0.9995623438124036

diag_2
0.920096112639961

diag_3
0.9932943951010952

max_glu_serum
0.19550879528674175

A1Cresult
0.6378686211330085

metformin
0.40062384256029115

repaglinide
0.8109278741076068

nateglinide
0.16822541934447635

glimepiride
0.49999643471138444

glipizide
0.22448897257557276

glyburide
0.18127807319120515

pioglitazone
0.9945102576551399

rosiglitazone
0.6505164205061031

insulin
0.5474684793280375

glyburide-metformin
0.8156840148738375

diabetesMed
0.38347346790507486

readmitted
0.09645841044557923

insulin_treatment
0.6358504464492563



In [None]:
df_1.shape

(25000, 30)

In [40]:
df_1.to_csv('/content/drive/My Drive/PGPDSE/Capstone/CSV Files/diabetes_20k_for_feature_selection.csv', index = False)

In [41]:
df_1['readmitted'].value_counts(1)

0    0.8817
1    0.1183
Name: readmitted, dtype: float64

In [42]:
df_1['readmitted'].value_counts()

0    17634
1     2366
Name: readmitted, dtype: int64

## Preparing Data

In [None]:
X = df_1.drop('readmitted', 1)
X_dum = pd.get_dummies(X, drop_first = True)
y = df_1['readmitted']
X.shape, X_dum.shape, y.shape

((25000, 29), (25000, 97), (25000,))

In [None]:
X_dum.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in X_dum.columns]

## Cross Validation Scores

In [None]:
models = []

models.append(('LGB',LGBMClassifier(random_state = 0, n_jobs = -1)))
models.append(('LR', Pipeline([("Transformer", PowerTransformer()),
                               ("Scaler", StandardScaler()),
                               ("LogReg", LogisticRegression(random_state = 0, solver = 'liblinear', n_jobs = -1))])))
models.append(('KNN', Pipeline([("Transformer", PowerTransformer()),
                               ("Scaler", StandardScaler()),
                               ("KNN", KNeighborsClassifier(n_jobs = -1))])))
models.append(('DT', DecisionTreeClassifier(random_state = 0)))
models.append(('BC',BaggingClassifier(random_state = 0, n_jobs = -1)))
models.append(('ET',ExtraTreesClassifier(random_state = 0, n_jobs = -1)))
models.append(('RF', RandomForestClassifier(random_state = 0, n_jobs = -1)))
models.append(('ADA',AdaBoostClassifier(random_state = 0)))
models.append(('GB',GradientBoostingClassifier(random_state = 0)))
models.append(('XGB', XGBClassifier(random_state = 0, n_jobs = -1)))

cv_score_mean = []
cv_score_std = []
names = []

In [None]:
models = []

models.append(('LGB',LGBMClassifier(random_state = 0, n_jobs = -1)))
models.append(('LR', Pipeline([("Transformer", PowerTransformer()),
                               ("Scaler", StandardScaler()),
                               ("LogReg", LogisticRegression(random_state = 0, solver = 'liblinear', n_jobs = -1))])))
models.append(('KNN', Pipeline([("Transformer", PowerTransformer()),
                               ("Scaler", StandardScaler()),
                               ("KNN", KNeighborsClassifier(n_jobs = -1))])))
models.append(('DT', DecisionTreeClassifier(random_state = 0)))
models.append(('BC',BaggingClassifier(random_state = 0, n_jobs = -1)))
models.append(('ET',ExtraTreesClassifier(random_state = 0, n_jobs = -1)))
models.append(('RF', RandomForestClassifier(random_state = 0, n_jobs = -1)))
models.append(('ADA',AdaBoostClassifier(random_state = 0)))
models.append(('GB',GradientBoostingClassifier(random_state = 0)))
models.append(('XGB', XGBClassifier(random_state = 0, n_jobs = -1)))

cv_score_mean = []
cv_score_std = []
names = []

In [None]:
cv_scores_df = pd.DataFrame({'Model' : names, 'Mean Cross Val Recall Score' : cv_score_mean,
                            'Cross Val Score STD' : cv_score_std})
cv_scores_df.sort_values(by = 'Mean Cross Val Recall Score')

Unnamed: 0,Model,Mean Cross Val Recall Score,Cross Val Score STD
1,LR,0.000341,0.000482
9,XGB,0.000682,0.000964
6,RF,0.000682,0.000482
7,ADA,0.001363,0.001928
5,ET,0.002046,0.000836
8,GB,0.003751,0.001929
0,LGB,0.005798,0.000481
2,KNN,0.021828,0.000488
4,BC,0.022167,0.005096
3,DT,0.169508,0.002658


## Validation Set Scores

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_dum, y, random_state = 0, test_size = 1/3, stratify = y)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((16666, 97), (16666,), (8334, 97), (8334,))

In [None]:
precision = []
recall = []
names = []
f1 = []
f1_weighted = []
roc_auc = []

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]
    precision.append(precision_score(y_test, y_pred))
    recall.append(recall_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred))
    f1_weighted.append(f1_score(y_test, y_pred, average = 'weighted'))
    roc_auc.append(roc_auc_score(y_test, y_proba))
    names.append(name)


  " = {}.".format(effective_n_jobs(self.n_jobs)))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
models_df = pd.DataFrame({'Model' : names, 'Precision' : precision, 'Recall' : recall,
                          'F1' : f1, 'F1-Weighted' : f1_weighted, 'ROC_AUC' : roc_auc})
models_df

Unnamed: 0,Model,Precision,Recall,F1,F1-Weighted,ROC_AUC
0,LGB,0.571429,0.008188,0.016145,0.82976,0.621121
1,LR,0.0,0.0,0.0,0.827804,0.627035
2,KNN,0.138889,0.020471,0.035682,0.825573,0.516031
3,DT,0.140721,0.163767,0.151372,0.7917,0.515484
4,BC,0.252632,0.024565,0.044776,0.830058,0.566816
5,ET,0.2,0.003071,0.006048,0.827953,0.603376
6,RF,0.25,0.001024,0.002039,0.827916,0.611769
7,ADA,1.0,0.001024,0.002045,0.828096,0.629536
8,GB,0.384615,0.005118,0.010101,0.828773,0.63308
9,XGB,0.666667,0.002047,0.004082,0.828328,0.634056


In [None]:
MI_score = mutual_info_classif(df.drop(), , random_state = 0)
MI_df = pd.DataFrame({'Score' : MI_score, "Columns" : X_dum.columns}).sort_values(by = 'Score', ascending = False)
MI_df

## Sequential Forward Selection

In [None]:
X_dum.shape

(25000, 97)

In [None]:
model = Pipeline([("Transformer", PowerTransformer()),
                               ("Scaler", StandardScaler()),
                               ("KNN", KNeighborsClassifier(n_jobs = -1))])

# Create an SFS object
sfs = SFS(estimator = model,
          k_features = (1, 97),
          forward = True,       # Set forward to True when we want to perform SFS
          scoring = 'recall',
          cv = 3)               # The number of cross-validations to perform is 5

# Train SFS with our dataset
sfs = sfs.fit(X_dum, y)

In [None]:
# Show the performance of each subset of features considered by SFS
sfs_results = pd.DataFrame.from_dict(sfs.subsets_).T.sort_values(by = 'avg_score', ascending = False) 
sfs_results

In [None]:
sfs_results.to_csv('/content/drive/My Drive/PGPDSE/Capstone/CSV Files/25K_KNN_Forward_Results_Saurabh.csv', index = False)

## Sequential Backward Selector

In [None]:
model = Pipeline([("Transformer", PowerTransformer()),
                               ("Scaler", StandardScaler()),
                               ("KNN", KNeighborsClassifier(n_jobs = -1))])

# Create an SFS object
sbs = SFS(estimator = model,
          k_features = (1, 97),
          forward = False,       # Set forward to True when we want to perform SFS
          scoring = 'recall',
          cv = 3)               # The number of cross-validations to perform is 5

# Train SFS with our dataset
sbs = sbs.fit(X_dum, y)

In [None]:
# Show the performance of each subset of features considered by SFS
sbs_results = pd.DataFrame.from_dict(sbs.subsets_).T.sort_values(by = 'avg_score', ascending = False) 
sbs_results

In [None]:
sbs_results.to_csv('/content/drive/My Drive/PGPDSE/Capstone/CSV Files/25K_KNN_Backward_Results_Saurabh.csv', index = False)