## Importing Data and Libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.preprocessing import StandardScaler, PowerTransformer, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
import scipy.stats as stats
from sklearn.feature_selection import mutual_info_classif
from mlxtend.feature_selection import ExhaustiveFeatureSelector as EFS
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
# Lift pandas' display limits so wide/long frames are shown without truncation.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

  import pandas.util.testing as tm


In [4]:
# Mount Google Drive inside the Colab runtime; the CSV paths used below live
# under /content/drive. This cell only works when run on Google Colab.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
# Load the preprocessed dataset (state before feature selection) from Drive
# and preview its first rows.
source_csv = '/content/drive/My Drive/PGPDSE/Capstone/CSV Files/Diabetes_Preprocessed_Before_Feature_Selection.csv'
df = pd.read_csv(source_csv)
df.head()

Unnamed: 0,race,gender,age,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,diag_1,diag_2,diag_3,number_diagnoses,max_glu_serum,A1Cresult,metformin,repaglinide,nateglinide,glimepiride,glipizide,glyburide,pioglitazone,rosiglitazone,insulin,glyburide-metformin,diabetesMed,readmitted,preceding_year_visits,number_changes,insulin_treatment
0,Caucasian,0,5,Not Available,Referral,1,41,0,1,Diabetes,Not Required,Not Required,1,,,-2,-2,-2,-2,-2,-2,-2,-2,-2,-2,0,0,0,0,no_med
1,Caucasian,0,15,Discharged to home,Emergency,3,59,0,18,"Endocrine, Nutritional, Metabolic, Immunity",Diabetes,"Endocrine, Nutritional, Metabolic, Immunity",9,,,-2,-2,-2,-2,-2,-2,-2,-2,1,-2,1,0,0,1,insulin_only
2,AfricanAmerican,0,25,Discharged to home,Emergency,2,11,5,13,"Pregnancy, Childbirth",Diabetes,External causes of injury,6,,,-2,-2,-2,-2,0,-2,-2,-2,-2,-2,1,0,3,0,other_meds
3,Caucasian,1,35,Discharged to home,Emergency,2,44,1,16,Infectious and Parasitic,Diabetes,Circulatory,7,,,-2,-2,-2,-2,-2,-2,-2,-2,1,-2,1,0,0,1,insulin_only
4,Caucasian,1,45,Discharged to home,Emergency,1,51,0,8,Neoplasms,Neoplasms,Diabetes,5,,,-2,-2,-2,-2,0,-2,-2,-2,0,-2,1,0,0,0,insulin_combo


## Reducing Sample Size

In [23]:
# Class balance of the target in the loaded frame, as proportions.
df['readmitted'].value_counts(normalize=True)

0    0.885443
1    0.114557
Name: readmitted, dtype: float64

In [18]:
# Mean of the `age` column in the loaded frame; used as the reference value
# when comparing the samples drawn below.
df['age'].mean()

65.72967961265067

In [27]:
# Proportion of rows per `age` value in the loaded frame.
df['age'].value_counts(normalize=True)

75    0.254785
65    0.222128
55    0.172020
85    0.165262
45    0.096693
35    0.037993
95    0.025971
25    0.016493
15    0.007026
5     0.001628
Name: age, dtype: float64

In [72]:
# Simple random sample of 50,000 rows; the fixed seed makes the draw repeatable.
df_1 = df.sample(random_state=0, n=50_000)

In [73]:
# Target proportions in the seeded 50k sample, to compare with the full frame.
df_1['readmitted'].value_counts(normalize=True)

0    0.88536
1    0.11464
Name: readmitted, dtype: float64

In [15]:
# Mean `age` in the seeded 50k sample, for comparison with the full frame.
df_1['age'].mean()

65.7406

In [28]:
# Proportion of rows per `age` value in the seeded 50k sample.
df_1['age'].value_counts(normalize=True)

75    0.25386
65    0.22266
55    0.17292
85    0.16584
45    0.09576
35    0.03728
95    0.02604
25    0.01656
15    0.00740
5     0.00168
Name: age, dtype: float64

In [14]:
# Sample 50,000 rows with selection probability proportional to `age`
# (pandas normalises the weights column). A fixed seed is supplied so that
# the draw -- and every statistic computed from it below -- is reproducible
# on Restart & Run All, consistent with how `df_1` is sampled.
df_2 = df.sample(n=50000, weights='age', random_state=0)

In [19]:
# Mean `age` in the age-weighted sample, for comparison with the full frame.
df_2['age'].mean()

68.5908

In [30]:
# Proportion of rows per `age` value in the age-weighted sample.
df_2['age'].value_counts(normalize=True)

75    0.28264
65    0.22300
85    0.19774
55    0.15368
45    0.07554
95    0.03338
35    0.02422
25    0.00744
15    0.00216
5     0.00020
Name: age, dtype: float64

In [25]:
# Target proportions in the age-weighted sample.
df_2['readmitted'].value_counts(normalize=True)

0    0.8837
1    0.1163
Name: readmitted, dtype: float64

In [37]:
# Smaller simple random sample (25,000 rows). The seed is fixed so the draw
# is reproducible, matching the way `df_1` is sampled.
df_3 = df.sample(n=25000, random_state=0)

# Target proportions in this sample, to compare with the full frame.
df_3['readmitted'].value_counts(normalize=True)

0    0.88468
1    0.11532
Name: readmitted, dtype: float64

In [38]:
# (rows, columns) of the 25k sample.
df_3.shape

(25000, 30)

In [41]:
from imblearn.under_sampling import TomekLinks

# Build the inputs for resampling from the full frame inside this cell.
# The original cell relied on `X_dum` / `y` left over in the kernel, but in
# notebook order those names are only created later (and from the 50k
# sample), so the cell failed on a fresh kernel.
X_full_dum = pd.get_dummies(df.drop(columns='readmitted'), drop_first=True)
y_full = df['readmitted']

print('Original dataset shape:', df.shape)

# Under-sample with TomekLinks using the library defaults.
tl = TomekLinks()
X_res, y_res = tl.fit_resample(X_full_dum, y_full)
print('Resampled dataset shape:', X_res.shape, y_res.shape)

Original dataset shape: (97070, 30)
Resampled dataset shape: (93242, 97) (93242,)




In [43]:
# Sum of the resampled target. `readmitted` only shows the values 0 and 1 in
# the outputs above, so this is presumably the number of positive rows kept.
y_res.sum()

11120

In [44]:
# Absolute class counts of the target in the loaded frame.
df['readmitted'].value_counts()

0    85950
1    11120
Name: readmitted, dtype: int64

In [46]:
# Number of class-0 rows removed by the TomekLinks resampling:
# (class-0 rows in the full frame) - (class-0 rows left after resampling).
# Derived from the data instead of numbers pasted from earlier outputs, so
# the figure stays correct if the input data or the resampling changes.
(df['readmitted'] == 0).sum() - (len(y_res) - y_res.sum())

3828

In [47]:
# List the column labels of the loaded frame.
df.columns

Index(['race', 'gender', 'age', 'discharge_disposition_id',
       'admission_source_id', 'time_in_hospital', 'num_lab_procedures',
       'num_procedures', 'num_medications', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin',
       'repaglinide', 'nateglinide', 'glimepiride', 'glipizide', 'glyburide',
       'pioglitazone', 'rosiglitazone', 'insulin', 'glyburide-metformin',
       'diabetesMed', 'readmitted', 'preceding_year_visits', 'number_changes',
       'insulin_treatment'],
      dtype='object')

In [48]:
# Columns compared via their mean (one-sample t-test) in the checks below;
# every other column is compared via category frequencies.
num_cols = [
    'age',
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'preceding_year_visits',
    'number_changes',
]

In [74]:
# One-sample t-test per numeric column: is the mean of the 50k sample
# compatible with the mean of the full frame? Prints the column and p-value.
for col in num_cols:
    population_mean = df[col].mean()
    ttest_result = stats.ttest_1samp(df_1[col], population_mean)
    p_value = ttest_result[1]
    print(col, p_value)

age 0.8365344102164889
time_in_hospital 0.20713160512488285
num_lab_procedures 0.9997755690942407
num_procedures 0.7134598423037106
num_medications 0.6319388249538531
preceding_year_visits 0.7917138725925171
number_changes 0.20594959920488543


In [69]:
# Print, for every column not listed in `num_cols`, the proportion of each
# distinct value in the full frame (ordered as returned by value_counts).
non_numeric_cols = [col for col in df.columns if col not in num_cols]
for col in non_numeric_cols:
    print(col, df[col].value_counts(normalize=True).values)

race [0.76440713 0.19325229 0.02072731 0.01515401 0.00645926]
gender [0.53900278 0.46099722]
discharge_disposition_id [0.60524364 0.20775729 0.1318327  0.04784176 0.00628412 0.00104049]
admission_source_id [0.57684145 0.30608839 0.05875142 0.05831874]
diag_1 [0.29879468 0.14035232 0.09430308 0.08726692 0.06901205 0.05060266
 0.04945915 0.02980323 0.02630061 0.02612548 0.02562069 0.02548676
 0.02239621 0.01643144 0.01110539 0.00929226 0.00814876 0.00688163
 0.00261667]
diag_2 [0.31382507 0.12762955 0.10471824 0.08242505 0.08134336 0.04114557
 0.03604615 0.02885547 0.02647574 0.02557948 0.02383847 0.02370454
 0.02313794 0.01869785 0.01762646 0.01079633 0.00492428 0.00418255
 0.00344082 0.00160709]
diag_3 [0.29820748 0.1703719  0.08989389 0.07063974 0.06476769 0.0508602
 0.03895127 0.03118368 0.0257031  0.02474503 0.02335428 0.0190172
 0.0190069  0.01826517 0.01624601 0.01440198 0.01385598 0.00481096
 0.00310086 0.00261667]
number_diagnoses [4.84660554e-01 1.09199547e-01 1.04883074e-01 1.

In [75]:
# Chi-square goodness-of-fit per non-numeric column: do the category counts
# in the 50k sample match the category proportions of the full frame?
for i in df.columns:
    if i in num_cols:
        continue

    # Category proportions in the full frame (NaN is excluded by value_counts).
    expected_share = df[i].value_counts(normalize=True)

    # Observed counts in the sample, aligned to the SAME category order as
    # the expected proportions. The two value_counts results are each sorted
    # by their own frequencies, so comparing them positionally can pair
    # counts of different categories when two categories swap rank; a
    # category absent from the sample would also make the lengths differ.
    observed = df_1[i].value_counts().reindex(expected_share.index, fill_value=0)

    # Scale the proportions to the sample's non-null row count so that the
    # observed and expected totals agree, as chisquare requires.
    expected = expected_share * observed.sum()

    st, p = stats.chisquare(observed.values, expected.values)
    print(i, p)
    print()

race 0.8249628601720129

gender 0.7544640766188596

discharge_disposition_id 0.7856002826927017

admission_source_id 0.9202847806503914

diag_1 0.9682032010464862

diag_2 0.997571795248338

diag_3 0.9827576856017731

number_diagnoses 0.9999532663424706

max_glu_serum 0.8655485297189018

A1Cresult 0.16529089093208113

metformin 0.8601142724300386

repaglinide 0.5908199656689885

nateglinide 0.8038478940710672

glimepiride 0.8938733594598232

glipizide 0.23777955888038302

glyburide 0.7369974585305459

pioglitazone 0.9013380525381108

rosiglitazone 0.29590469526119817

insulin 0.4420303450258616

glyburide-metformin 0.920099240640449

diabetesMed 0.7940258386507028

readmitted 0.9532541185022003

insulin_treatment 0.8156805538886772



In [76]:
# Persist the seeded 50k sample (without the index column) for the
# feature-selection work.
sample_csv = '/content/drive/My Drive/PGPDSE/Capstone/CSV Files/diabetes_50k_for_feature_selection.csv'
df_1.to_csv(sample_csv, index=False)

In [78]:
# (rows, columns) of the saved 50k sample.
df_1.shape

(50000, 30)

## Preparing Data

In [80]:
# Features: every column of the 50k sample except the target.
# `columns=` replaces the positional axis argument (`drop('readmitted', 1)`),
# which was deprecated and no longer works on pandas 2.x.
X = df_1.drop(columns='readmitted')

# One-hot encode the non-numeric columns, dropping the first level of each.
X_dum = pd.get_dummies(X, drop_first=True)

# Target column.
y = df_1['readmitted']

X.shape, X_dum.shape, y.shape

((50000, 29), (50000, 97), (50000,))

In [84]:
def _alnum_only(label):
    """Return str(label) with every non-alphanumeric character replaced by '_'."""
    return "".join(ch if ch.isalnum() else "_" for ch in str(label))


# Replace punctuation/whitespace in the dummy column names with underscores.
X_dum.columns = [_alnum_only(label) for label in X_dum.columns]

## Cross Validation Scores

In [83]:
# Candidate classifiers as (short name, estimator) pairs. The distance- and
# linear-based models are wrapped in a pipeline that power-transforms and
# standardises the features first; the tree-based models take raw features.
models = []

models.append(('LGB', LGBMClassifier(random_state = 0, n_jobs = -1)))
# No n_jobs for LogisticRegression: with solver='liblinear' the parameter has
# no effect and scikit-learn only emits a warning about it.
models.append(('LR', Pipeline([("Transformer", PowerTransformer()),
                               ("Scaler", StandardScaler()),
                               ("LogReg", LogisticRegression(random_state = 0, solver = 'liblinear'))])))
models.append(('KNN', Pipeline([("Transformer", PowerTransformer()),
                                ("Scaler", StandardScaler()),
                                ("KNN", KNeighborsClassifier(n_jobs = -1))])))
models.append(('DT', DecisionTreeClassifier(random_state = 0)))
models.append(('BC', BaggingClassifier(random_state = 0, n_jobs = -1)))
models.append(('ET', ExtraTreesClassifier(random_state = 0, n_jobs = -1)))
models.append(('RF', RandomForestClassifier(random_state = 0, n_jobs = -1)))
models.append(('ADA', AdaBoostClassifier(random_state = 0)))
models.append(('GB', GradientBoostingClassifier(random_state = 0)))
models.append(('XGB', XGBClassifier(random_state = 0, n_jobs = -1)))

# Accumulators filled by the cross-validation loop.
cv_score_mean = []
cv_score_std = []
names = []

In [86]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every model's scores (the lists are otherwise only reset
# in the cell that defines `models`).
cv_score_mean = []
cv_score_std = []
names = []

# 10-fold cross-validated recall for every candidate model; keep the mean
# and standard deviation across folds.
for name, model in models:
    cv_results = cross_val_score(model, X_dum, y, cv = 10, scoring = 'recall', n_jobs = -1)
    cv_score_mean.append(cv_results.mean())
    cv_score_std.append(cv_results.std())
    names.append(name)

In [89]:
# Tabulate the cross-validation summary, one row per model, and display it
# ordered by mean recall (ascending).
cv_summary_columns = {
    'Model': names,
    'Mean Cross Val Recall Score': cv_score_mean,
    'Cross Val Score STD': cv_score_std,
}
cv_scores_df = pd.DataFrame(cv_summary_columns)
cv_scores_df.sort_values(by='Mean Cross Val Recall Score')

Unnamed: 0,Model,Mean Cross Val Recall Score,Cross Val Score STD
0,LR,0.000349,0.000698
6,ADA,0.000698,0.000855
9,XGB,0.001396,0.001521
5,RF,0.002966,0.001754
7,GB,0.004187,0.002367
8,LGB,0.004884,0.002314
4,ET,0.005582,0.002034
3,BC,0.021283,0.003468
1,KNN,0.0253,0.006264
2,DT,0.163468,0.013705


## Validation Set Scores

In [90]:
# Stratified hold-out split: 10% of the rows go to the test set, with a fixed
# seed so the split is repeatable.
split_parts = train_test_split(X_dum, y, test_size=0.1, stratify=y, random_state=0)
X_train, X_test, y_train, y_test = split_parts
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((45000, 97), (45000,), (5000, 97), (5000,))

In [91]:
# Hold-out metrics, one entry per model, in the same order as `names`.
precision = []
recall = []
names = []
f1 = []
f1_weighted = []
roc_auc = []

for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # Probability of the positive class, needed for ROC AUC.
    y_proba = model.predict_proba(X_test)[:, 1]

    # zero_division=0: a model that never predicts the positive class has an
    # undefined precision/F1. Reporting 0 is what scikit-learn already did by
    # default, but stating it explicitly avoids the UndefinedMetricWarning
    # noise and documents the choice.
    precision.append(precision_score(y_test, y_pred, zero_division = 0))
    recall.append(recall_score(y_test, y_pred))
    f1.append(f1_score(y_test, y_pred, zero_division = 0))
    f1_weighted.append(f1_score(y_test, y_pred, average = 'weighted', zero_division = 0))
    roc_auc.append(roc_auc_score(y_test, y_proba))
    names.append(name)


  " = {}.".format(effective_n_jobs(self.n_jobs)))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [92]:
# Assemble the hold-out metrics into one table, one row per model.
metric_columns = {
    'Model': names,
    'Precision': precision,
    'Recall': recall,
    'F1': f1,
    'F1-Weighted': f1_weighted,
    'ROC_AUC': roc_auc,
}
models_df = pd.DataFrame(metric_columns)
models_df

Unnamed: 0,Model,Precision,Recall,F1,F1-Weighted,ROC_AUC
0,LR,0.0,0.0,0.0,0.831583,0.661485
1,KNN,0.243902,0.034904,0.061069,0.83413,0.535625
2,DT,0.184496,0.207679,0.195402,0.80899,0.544431
3,BC,0.27907,0.020942,0.038961,0.834008,0.599511
4,ET,0.181818,0.00349,0.006849,0.831647,0.626325
5,RF,0.5,0.001745,0.003478,0.83197,0.630084
6,ADA,0.0,0.0,0.0,0.831583,0.65712
7,GB,0.428571,0.005236,0.010345,0.832634,0.667151
8,LGB,0.4,0.00349,0.00692,0.832253,0.666549
9,XGB,1.0,0.00349,0.006957,0.832557,0.6688


In [None]:
# Mutual information between each encoded feature and the target.
# The original call (`mutual_info_classif(df.drop(), , random_state = 0)`)
# was a syntax error. `X_dum` / `y` are used because the scores are paired
# with `X_dum.columns` on the next line, so the feature matrix must have
# exactly those columns.
MI_score = mutual_info_classif(X_dum, y, random_state = 0)

# Rank the features from most to least informative.
MI_df = pd.DataFrame({'Score' : MI_score, "Columns" : X_dum.columns}).sort_values(by = 'Score', ascending = False)
MI_df

## Sequential Forward Selection

In [95]:
# (rows, columns) of the encoded feature matrix; the column count is the
# upper bound on how many features the selectors below can keep.
X_dum.shape

(50000, 97)

In [None]:
# Estimator evaluated on every candidate feature subset: power transform,
# standardise, then k-nearest neighbours.
model = Pipeline([("Transformer", PowerTransformer()),
                  ("Scaler", StandardScaler()),
                  ("KNN", KNeighborsClassifier(n_jobs = -1))])

# Sequential FORWARD selection (forward = True).
# k_features is given as a (min, max) range; the upper bound is taken from
# the data instead of a hard-coded 97 so the cell keeps working if the
# number of encoded columns changes.
sfs = SFS(estimator = model,
          k_features = (1, X_dum.shape[1]),
          forward = True,
          scoring = 'recall',
          cv = 5)               # 5-fold cross-validation per candidate subset

# Run the selection on the encoded features and the target.
sfs = sfs.fit(X_dum, y)

In [None]:
# Show the performance of each subset of features considered by SFS
# One row per feature subset evaluated by the forward selector, best average
# cross-validation score first.
sfs_subsets = pd.DataFrame.from_dict(sfs.subsets_).T
sfs_results = sfs_subsets.sort_values(by='avg_score', ascending=False)
sfs_results

## Sequential Backward Selection

In [None]:
# Estimator evaluated on every candidate feature subset: power transform,
# standardise, then k-nearest neighbours.
model = Pipeline([("Transformer", PowerTransformer()),
                  ("Scaler", StandardScaler()),
                  ("KNN", KNeighborsClassifier(n_jobs = -1))])

# Sequential BACKWARD selection: the same mlxtend selector class, run with
# forward = False. (The previous comments were copied from the forward cell
# and described the opposite direction.)
# k_features is given as a (min, max) range; the upper bound is taken from
# the data instead of a hard-coded 97 so the cell keeps working if the
# number of encoded columns changes.
sbs = SFS(estimator = model,
          k_features = (1, X_dum.shape[1]),
          forward = False,
          scoring = 'recall',
          cv = 5)               # 5-fold cross-validation per candidate subset

# Run the selection on the encoded features and the target.
sbs = sbs.fit(X_dum, y)

In [None]:
# Show the performance of each subset of features considered by SFS
# One row per feature subset evaluated by the backward selector, best average
# cross-validation score first.
sbs_subsets = pd.DataFrame.from_dict(sbs.subsets_).T
sbs_results = sbs_subsets.sort_values(by='avg_score', ascending=False)
sbs_results