In [82]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bokeh.io import show, output_notebook
from bokeh.plotting import figure
from bokeh.transform import factor_cmap

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import classification_report, roc_auc_score, auc
from sklearn.impute import SimpleImputer

import xgboost 
from sklearn.linear_model import LogisticRegression, Lasso, RidgeCV
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

output_notebook()
sns.set(color_codes=True)
sns.set_style('white')

In [2]:
df = pd.read_csv('data/TrainingWiDS2021.csv')
# for col in df.columns:
#     if 'id' in col:
#         df = df.drop(col, axis=1)
df.head()

Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
0,1,214826,118,68.0,22.732803,0,Caucasian,M,180.3,Floor,...,,,0,0,0,0,0,0,0,1
1,2,246060,81,77.0,27.421875,0,Caucasian,F,160.0,Floor,...,51.0,51.0,0,0,0,0,0,0,0,1
2,3,276985,118,25.0,31.952749,0,Caucasian,F,172.7,Emergency Department,...,,,0,0,0,0,0,0,0,0
3,4,262220,118,81.0,22.635548,1,Caucasian,F,165.1,Operating Room,...,337.0,337.0,0,0,0,0,0,0,0,0
4,5,201746,33,19.0,,0,Caucasian,M,188.0,,...,,,0,0,0,0,0,0,0,0


In [3]:
def print_cols_types(df):
    for col_name, col_type in zip(df.columns, df.dtypes):
        spaces = 30
        print(col_name, (spaces-len(col_name))*' ', col_type)

print_cols_types(df)

Unnamed: 0                      int64
encounter_id                    int64
hospital_id                     int64
age                             float64
bmi                             float64
elective_surgery                int64
ethnicity                       object
gender                          object
height                          float64
hospital_admit_source           object
icu_admit_source                object
icu_id                          int64
icu_stay_type                   object
icu_type                        object
pre_icu_los_days                float64
readmission_status              int64
weight                          float64
albumin_apache                  float64
apache_2_diagnosis              float64
apache_3j_diagnosis             float64
apache_post_operative           int64
arf_apache                      int64
bilirubin_apache                float64
bun_apache                      float64
creatinine_apache               float64
fio2_apache           

In [4]:
def print_nulls(df):
    for col_name, nulls in zip(df.columns, df.isna().sum()):
        spaces = 50
        print(col_name, (spaces-len(col_name))*' ', nulls)

print_nulls(df)

Unnamed: 0                                          0
encounter_id                                        0
hospital_id                                         0
age                                                 4988
bmi                                                 4490
elective_surgery                                    0
ethnicity                                           1587
gender                                              66
height                                              2077
hospital_admit_source                               33198
icu_admit_source                                    240
icu_id                                              0
icu_stay_type                                       0
icu_type                                            0
pre_icu_los_days                                    0
readmission_status                                  0
weight                                              3463
albumin_apache                                      78163
ap

In [5]:
df.describe()

Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,readmission_status,...,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
count,130157.0,130157.0,130157.0,125169.0,125667.0,130157.0,128080.0,130157.0,130157.0,130157.0,...,16760.0,16760.0,130157.0,130157.0,130157.0,130157.0,130157.0,130157.0,130157.0,130157.0
mean,65079.0,213000.856519,106.102131,61.995103,29.11026,0.18984,169.607219,662.428344,0.839933,0.0,...,247.525419,239.617358,0.00103,0.016081,0.013599,0.025669,0.007307,0.004187,0.020852,0.216285
std,37573.233831,38109.828146,63.482277,16.82288,8.262776,0.392176,10.833085,304.259843,2.485337,0.0,...,131.440167,128.562211,0.03207,0.125786,0.115819,0.158146,0.085166,0.064574,0.142888,0.411712
min,1.0,147000.0,1.0,0.0,14.844926,0.0,137.2,82.0,-0.25,0.0,...,42.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,32540.0,180001.0,49.0,52.0,23.598006,0.0,162.5,427.0,0.045833,0.0,...,144.0,138.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,65079.0,213014.0,112.0,64.0,27.564749,0.0,170.1,653.0,0.155556,0.0,...,228.125,218.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,97618.0,246002.0,165.0,75.0,32.803127,0.0,177.8,969.0,0.423611,0.0,...,333.0,324.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,130157.0,279000.0,204.0,89.0,67.81499,1.0,195.59,1111.0,175.627778,0.0,...,720.0,654.813793,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [6]:
categ_cols = [c for c in df.columns if (1<df[c].nunique()) & (df[c].dtype != np.number)& (df[c].dtype != int) ]

In [7]:
df[categ_cols]

Unnamed: 0,ethnicity,gender,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type
0,Caucasian,M,Floor,Floor,admit,CTICU
1,Caucasian,F,Floor,Floor,admit,Med-Surg ICU
2,Caucasian,F,Emergency Department,Accident & Emergency,admit,Med-Surg ICU
3,Caucasian,F,Operating Room,Operating Room / Recovery,admit,CTICU
4,Caucasian,M,,Accident & Emergency,admit,Med-Surg ICU
...,...,...,...,...,...,...
130152,Caucasian,M,Emergency Department,Accident & Emergency,admit,Cardiac ICU
130153,Caucasian,F,Direct Admit,Accident & Emergency,admit,MICU
130154,African American,M,Emergency Department,Accident & Emergency,admit,Cardiac ICU
130155,Caucasian,M,Emergency Department,Accident & Emergency,admit,Med-Surg ICU


In [8]:
df_2 = pd.get_dummies(df)

In [9]:
for col_name, col_type in zip(df_2.columns, df_2.dtypes):
    if col_type == 'object':
        print("SHHHHHHHHHHHHIIIIIIIIIIIIITTTTTTTTTTTTTTTTTTTTTTT", '\n')
    spaces = 50
    print(col_name, (spaces-len(col_name))*' ', col_type)

Unnamed: 0                                          int64
encounter_id                                        int64
hospital_id                                         int64
age                                                 float64
bmi                                                 float64
elective_surgery                                    int64
height                                              float64
icu_id                                              int64
pre_icu_los_days                                    float64
readmission_status                                  int64
weight                                              float64
albumin_apache                                      float64
apache_2_diagnosis                                  float64
apache_3j_diagnosis                                 float64
apache_post_operative                               int64
arf_apache                                          int64
bilirubin_apache                                    floa

In [10]:
df_3 = df_2.fillna(df_2.median())
print_nulls(df_3)

Unnamed: 0                                          0
encounter_id                                        0
hospital_id                                         0
age                                                 0
bmi                                                 0
elective_surgery                                    0
height                                              0
icu_id                                              0
pre_icu_los_days                                    0
readmission_status                                  0
weight                                              0
albumin_apache                                      0
apache_2_diagnosis                                  0
apache_3j_diagnosis                                 0
apache_post_operative                               0
arf_apache                                          0
bilirubin_apache                                    0
bun_apache                                          0
creatinine_apache           

In [11]:
# tsne = TSNE(learning_rate=50)
# tsne_features = tsne.fit_transform(df_3)

In [12]:
# df_4 = df
# df_4['x'] = tsne_features[:, 0]
# df_4['y'] = tsne_features[:, 1]

# sns.scatterplot(x='x', y='y', hue='diabetes_mellitus', data=df_4)  # Can change hue column to see the where other columns lie and it could give a insight
# plt.show()
# index_cmap = factor_cmap('diabetes_mellitus', palette=['red', 'blue'], 
#                          factors=['0', '1'])

# p = figure(title="TSNE Plot in Bokeh", 
#            x_axis_label='TSNE Feature x', 
#            y_axis_label='TSNE Feature y', 
#            plot_width=580, plot_height=380)

In [13]:
# p.scatter('x','y',source=df_4)
# show(p)

In [14]:
df_test = pd.read_csv('./data/UnlabeledWiDS2021.csv')
df_test.head()

Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,...,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
0,1,144740,10141,72,,0,Caucasian,F,152.4,Floor,...,,,,0,0,0,0,0,0,0
1,2,141990,10141,86,,0,Caucasian,F,175.3,Emergency Department,...,,,,0,0,0,0,0,0,0
2,3,142038,10141,72,,0,Caucasian,F,162.6,Floor,...,,,,0,0,0,0,0,0,0
3,4,138628,10141,66,,0,Caucasian,M,177.8,Floor,...,,,,0,0,0,0,0,0,0
4,5,141682,10141,89,,0,Caucasian,M,170.2,Direct Admit,...,,,,0,0,0,0,0,0,0


In [72]:
print_cols_types(df_test)
print(len(df_test.columns))

Unnamed: 0                      int64
encounter_id                    int64
hospital_id                     int64
age                             int64
bmi                             float64
elective_surgery                int64
ethnicity                       object
gender                          object
height                          float64
hospital_admit_source           object
icu_admit_source                object
icu_id                          int64
icu_stay_type                   object
icu_type                        object
pre_icu_los_days                float64
readmission_status              int64
weight                          float64
albumin_apache                  float64
apache_2_diagnosis              float64
apache_3j_diagnosis             float64
apache_post_operative           int64
arf_apache                      int64
bilirubin_apache                float64
bun_apache                      float64
creatinine_apache               float64
fio2_apache             

In [16]:
df_test_2 = pd.get_dummies(df_test)
for col_name, col_type in zip(df_test_2.columns, df_test_2.dtypes):
    if col_type == 'object':
        print("SHHHHHHHHHHHHIIIIIIIIIIIIITTTTTTTTTTTTTTTTTTTTTTT", '\n')
    spaces = 50
    print(col_name, (spaces-len(col_name))*' ', col_type)

Unnamed: 0                                          int64
encounter_id                                        int64
hospital_id                                         int64
age                                                 int64
bmi                                                 float64
elective_surgery                                    int64
height                                              float64
icu_id                                              int64
pre_icu_los_days                                    float64
readmission_status                                  int64
weight                                              float64
albumin_apache                                      float64
apache_2_diagnosis                                  float64
apache_3j_diagnosis                                 float64
apache_post_operative                               int64
arf_apache                                          int64
bilirubin_apache                                    float6

In [71]:
df_test_3 = df_test_2.fillna(df_test_2.median())
print_nulls(df_test_3)
print(len(df_test_3.columns))

Unnamed: 0                                          0
encounter_id                                        0
hospital_id                                         0
age                                                 0
bmi                                                 0
elective_surgery                                    0
height                                              0
icu_id                                              0
pre_icu_los_days                                    0
readmission_status                                  0
weight                                              0
albumin_apache                                      0
apache_2_diagnosis                                  0
apache_3j_diagnosis                                 0
apache_post_operative                               0
arf_apache                                          0
bilirubin_apache                                    0
bun_apache                                          0
creatinine_apache           

In [18]:
# X_train = df_3.drop(['diabetes_mellitus'], axis=1)
# Y_train = df_3['diabetes_mellitus']

# X_test = df_3.drop(['diabetes_mellitus'], axis=1).iloc[0:1000,:]
# Y_test = df_3.iloc[0:1000]['diabetes_mellitus']

# Defining our features
features = df_3.drop('diabetes_mellitus', axis=1)  

# Defining our labels
labels = df_3['diabetes_mellitus']

# Scaling features
scaler = StandardScaler()  
scaled_features = scaler.fit_transform(features)  

# Train Test split
X_train_scaled, X_test_scaled, y_train_scaled, y_test_scaled = train_test_split(scaled_features, labels, test_size=0.1, stratify=labels)
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.1, stratify=labels)

In [20]:
total_majority_class = len(df[df['diabetes_mellitus']==0])
total_minority_class = len(df[df['diabetes_mellitus']==1])
print(total_majority_class, total_minority_class)

scale_pos_weight = total_majority_class / total_minority_class
print("scale_pos_weight should be", scale_pos_weight)

102006 28151
scale_pos_weight should be 3.6235302475933358


In [65]:
xgb = xgboost.XGBClassifier()
xgb.fit(X_train_scaled, y_train_scaled)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [66]:
print( accuracy_score(y_train_scaled, xgb.predict(X_train_scaled)) )
print( roc_auc_score(y_train_scaled, xgb.predict(X_train_scaled)) )

print( accuracy_score(y_test_scaled, xgb.predict(X_test_scaled)) )
print( roc_auc_score(y_test_scaled, xgb.predict(X_test_scaled)) )

0.8894921504853126
0.7974435572185089
0.8350491702519975
0.7116315695723767


In [23]:
df_test_4 = df_test
df_test_4['diabetes_mellitus'] = xgb.predict_proba(df_test_3)[:,1]

In [25]:
df_test_4[["encounter_id","diabetes_mellitus"]].to_csv('submission.csv', index=False)

In [26]:
df_test_4[["encounter_id","diabetes_mellitus"]]

Unnamed: 0,encounter_id,diabetes_mellitus
0,144740,0.087263
1,141990,0.182365
2,142038,0.056126
3,138628,0.030748
4,141682,0.379943
...,...,...
10229,143750,0.021894
10230,143813,0.094927
10231,137126,0.031526
10232,135652,0.007220


In [None]:
# Getting our explained variance ratios from PCA using all features
pca = PCA()
pca.fit(scaled_features)
exp_variance = pca.explained_variance_ratio_

# Plotting the explained variance using a barplot
fig, ax = plt.subplots()
plt.bar(range(pca.n_components_), exp_variance)
ax.set_xlabel('Principar Component #')

In [None]:
# Calculate ther cumulative explained variance
cum_exp_variance = np.cumsum(exp_variance)

# Plotting cumulative explained variance and drawing a dashed line at 0.90
fig, ax = plt.subplots()
ax.plot(range(0, pca.n_components_), cum_exp_variance)
ax.axhline(y=0.9, linestyle='--', c='red')
ax.axvline(x=80, linestyle='--', c='red')  

In [None]:
# pca_projection = pca.fit_transform(scaled_features)
# X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(pca_projection[:,0:80], labels, test_size=0.1, stratify=labels)

# xgb = xgboost.XGBClassifier()
# xgb.fit(X_train_pca, y_train_pca)

In [None]:
# X_test = scaler.fit_transform(X_test) 
# test_pca_projection = pca.fit_transform(X_test)
# print( accuracy_score(y_test, xgb.predict(test_pca_projection[:,0:80])) )

In [None]:
# logreg = LogisticRegression(max_iter=1000)
# logreg.fit(X_train_scaled, y_train_scaled)
# print( accuracy_score(y_test, logreg.predict(X_test)) )

In [40]:
lasso = Lasso()
lasso.fit(X_train, y_train)

print( accuracy_score(y_test, lasso.predict(X_test).round()) )
print( roc_auc_score(y_test, xgb.predict(X_test)) )

0.7963275968039336
0.7137513422481654


In [None]:
# knn = KNeighborsClassifier(n_jobs=-1)
# knn.fit(X_train, y_train)
# print( accuracy_score(y_test, knn.predict(X_test)) )

In [None]:
# rf = RandomForestClassifier(n_estimators=500, criterion='entropy', n_jobs=-1)
# rf.fit(X_train, y_train)
# print( accuracy_score(y_test, rf.predict(X_test)) )

In [None]:
# base_estimators = [
#                     ('lr', LogisticRegression()),
# #                     ('lasso', Lasso()),
#                     ('knn', KNeighborsClassifier()),
#                     ('rf', RandomForestClassifier())
# ]
# final_estimator = xgboost.XGBClassifier()
# model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator)
# model.fit(X_train_scaled, y_train_scaled)

In [None]:
# print( accuracy_score(y_test, model.predict(X_test)) )

In [None]:
rfe = RFE(estimator=xgboost.XGBClassifier(),
n_features_to_select=80, verbose=1)
rfe.fit(X_train,y_train)
print( accuracy_score(y_test, rfe.predict(X_test)) )

In [80]:
l = ['encounter_id', 'patient_id', 'hospital_id', 'hospital_death', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height', 'hospital_admit_source', 'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type', 'pre_icu_los_days', 'readmission_status', 'weight', 'albumin_apache', 'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_post_operative', 'arf_apache', 'bilirubin_apache', 'bun_apache', 'creatinine_apache', 'fio2_apache', 'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache', 'gcs_verbal_apache', 'glucose_apache', 'heart_rate_apache', 'hematocrit_apache', 'intubated_apache', 'map_apache', 'paco2_apache', 'paco2_for_ph_apache', 'pao2_apache', 'ph_apache', 'resprate_apache', 'sodium_apache', 'temp_apache', 'urineoutput_apache', 'ventilated_apache', 'wbc_apache', 'd1_diasbp_invasive_max', 'd1_diasbp_invasive_min', 'd1_diasbp_max', 'd1_diasbp_min', 'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min', 'd1_heartrate_max', 'd1_heartrate_min', 'd1_mbp_invasive_max', 'd1_mbp_invasive_min', 'd1_mbp_max', 'd1_mbp_min', 'd1_mbp_noninvasive_max', 'd1_mbp_noninvasive_min', 'd1_resprate_max', 'd1_resprate_min', 'd1_spo2_max', 'd1_spo2_min', 'd1_sysbp_invasive_max', 'd1_sysbp_invasive_min', 'd1_sysbp_max', 'd1_sysbp_min', 'd1_sysbp_noninvasive_max', 'd1_sysbp_noninvasive_min', 'd1_temp_max', 'd1_temp_min', 'h1_diasbp_invasive_max', 'h1_diasbp_invasive_min', 'h1_diasbp_max', 'h1_diasbp_min', 'h1_diasbp_noninvasive_max', 'h1_diasbp_noninvasive_min', 'h1_heartrate_max', 'h1_heartrate_min', 'h1_mbp_invasive_max', 'h1_mbp_invasive_min', 'h1_mbp_max', 'h1_mbp_min', 'h1_mbp_noninvasive_max', 'h1_mbp_noninvasive_min', 'h1_resprate_max', 'h1_resprate_min', 'h1_spo2_max', 'h1_spo2_min', 'h1_sysbp_invasive_max', 'h1_sysbp_invasive_min', 'h1_sysbp_max', 'h1_sysbp_min', 'h1_sysbp_noninvasive_max', 'h1_sysbp_noninvasive_min', 'h1_temp_max', 'h1_temp_min', 'd1_albumin_max', 'd1_albumin_min', 'd1_bilirubin_max', 'd1_bilirubin_min', 'd1_bun_max', 'd1_bun_min', 'd1_calcium_max', 'd1_calcium_min', 'd1_creatinine_max', 'd1_creatinine_min', 'd1_glucose_max', 'd1_glucose_min', 'd1_hco3_max', 'd1_hco3_min', 'd1_hemaglobin_max', 'd1_hemaglobin_min', 'd1_hematocrit_max', 'd1_hematocrit_min', 'd1_inr_max', 'd1_inr_min', 'd1_lactate_max', 'd1_lactate_min', 'd1_platelets_max', 'd1_platelets_min', 'd1_potassium_max', 'd1_potassium_min', 'd1_sodium_max', 'd1_sodium_min', 'd1_wbc_max', 'd1_wbc_min', 'h1_albumin_max', 'h1_albumin_min', 'h1_bilirubin_max', 'h1_bilirubin_min', 'h1_bun_max', 'h1_bun_min', 'h1_calcium_max', 'h1_calcium_min', 'h1_creatinine_max', 'h1_creatinine_min', 'h1_glucose_max', 'h1_glucose_min', 'h1_hco3_max', 'h1_hco3_min', 'h1_hemaglobin_max', 'h1_hemaglobin_min', 'h1_hematocrit_max', 'h1_hematocrit_min', 'h1_inr_max', 'h1_inr_min', 'h1_lactate_max', 'h1_lactate_min', 'h1_platelets_max', 'h1_platelets_min', 'h1_potassium_max', 'h1_potassium_min', 'h1_sodium_max', 'h1_sodium_min', 'h1_wbc_max', 'h1_wbc_min', 'd1_arterial_pco2_max', 'd1_arterial_pco2_min', 'd1_arterial_ph_max', 'd1_arterial_ph_min', 'd1_arterial_po2_max', 'd1_arterial_po2_min', 'd1_pao2fio2ratio_max', 'd1_pao2fio2ratio_min', 'h1_arterial_pco2_max', 'h1_arterial_pco2_min', 'h1_arterial_ph_max', 'h1_arterial_ph_min', 'h1_arterial_po2_max', 'h1_arterial_po2_min', 'h1_pao2fio2ratio_max', 'h1_pao2fio2ratio_min', 'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob', 'aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis', 'apache_3j_bodysystem', 'apache_2_bodysystem']
for col in l:
    if col in df.columns:
#         print(True)
        pass
    else:
        print("no column", col)

no column patient_id
no column hospital_death
no column apache_4a_hospital_death_prob
no column apache_4a_icu_death_prob
no column apache_3j_bodysystem
no column apache_2_bodysystem


In [None]:
plt.bar(range(X_train.shape[1]), xgb.feature_importances_, color='orange', label="XGBoost's Feature importance")
cols = [col for col in ]

In [28]:
mask = []
for col_fi in xgb.feature_importances_:
    if col_fi == 0:
        mask.append(False)
    else:
        mask.append(True)
print(mask)

[True, True, True, True, True, True, True, True, True, False, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, True, True, True, True, True, True, True, False, True, True, True, True, False, False, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, Tr

In [29]:
cols = [col for i, col in enumerate(df.columns) if mask[i] == True]
df[cols]

Unnamed: 0.1,Unnamed: 0,encounter_id,hospital_id,age,bmi,elective_surgery,ethnicity,gender,height,icu_admit_source,...,h1_arterial_po2_min,h1_pao2fio2ratio_max,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,diabetes_mellitus
0,1,214826,118,68.0,22.732803,0,Caucasian,M,180.3,Floor,...,,,0,0,0,0,0,0,0,1
1,2,246060,81,77.0,27.421875,0,Caucasian,F,160.0,Floor,...,51.0,51.0,0,0,0,0,0,0,0,1
2,3,276985,118,25.0,31.952749,0,Caucasian,F,172.7,Accident & Emergency,...,,,0,0,0,0,0,0,0,0
3,4,262220,118,81.0,22.635548,1,Caucasian,F,165.1,Operating Room / Recovery,...,265.0,337.0,0,0,0,0,0,0,0,0
4,5,201746,33,19.0,,0,Caucasian,M,188.0,Accident & Emergency,...,,,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
130152,130153,164758,7,50.0,29.287256,0,Caucasian,M,175.3,Accident & Emergency,...,,,0,0,0,0,0,0,0,0
130153,130154,197653,7,79.0,29.653433,0,Caucasian,F,162.6,Accident & Emergency,...,,,0,0,0,0,0,0,0,0
130154,130155,219719,7,73.0,32.265371,0,African American,M,177.8,Accident & Emergency,...,163.0,163.0,0,0,0,0,0,0,0,1
130155,130156,222562,170,81.0,24.408579,0,Caucasian,M,185.4,Accident & Emergency,...,,,0,0,0,0,0,0,0,0


In [30]:
df_5 = df[cols]
df_5 = pd.get_dummies(df_5)
features_new = df_5.drop('diabetes_mellitus', axis=1)  

# Defining our labels
labels_new = df_5['diabetes_mellitus']

# Scaling features
scaler_new = StandardScaler()  
scaled_features_new = scaler_new.fit_transform(features)  

# Train Test split
X_train_scaled_new, X_test_scaled_new, y_train_scaled_new, y_test_scaled_new = train_test_split(scaled_features_new, labels_new, test_size=0.1, stratify=labels_new)
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(features_new, labels_new, test_size=0.1, stratify=labels)

In [43]:
xgb_new = xgboost.XGBClassifier()
xgb_new.fit(X_train_new, y_train_new)
print( accuracy_score(y_test_new, xgb_new.predict(X_test_new)) )
print( roc_auc_score(y_test_new, xgb_new.predict(X_test_new)) )

0.8354333128457283
0.7096903570384474


In [None]:
kf = KFold(n_splits=30)  # Split dataset into k consecutive folds (without shuffling by default).Each fold is then used once as a validation whil

xtreme_gb = xgboost.XGBClassifier()

# Cross validating to gain better performance idea 
xgb_score = cross_val_score(xtreme_gb, X_train_new ,y_train_new, cv=kf)

In [32]:
print(classification_report(y_test_new, xgb_new.predict(X_test_new)))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90     10201
           1       0.66      0.49      0.56      2815

    accuracy                           0.84     13016
   macro avg       0.77      0.71      0.73     13016
weighted avg       0.82      0.84      0.83     13016



In [33]:
print(classification_report(y_test, xgb.predict(X_test)))

              precision    recall  f1-score   support

           0       0.87      0.93      0.90     10201
           1       0.67      0.49      0.57      2815

    accuracy                           0.84     13016
   macro avg       0.77      0.71      0.73     13016
weighted avg       0.83      0.84      0.83     13016



In [None]:
print(classification_report(y_test, rfe.predict(X_test_scaled)))

In [34]:
zero_only = df_3[df_3['diabetes_mellitus']==0]
one_only = df_3[df_3['diabetes_mellitus']==1]

zero_only = zero_only.sample(n=len(one_only), random_state=10)

df_balanced = pd.concat([zero_only, one_only])
df_balanced = pd.get_dummies(df_balanced)
features_balanced = df_balanced.drop('diabetes_mellitus', axis=1)  

# Defining our labels
labels_balanced = df_balanced['diabetes_mellitus']

# Scaling features
scaler_balanced = StandardScaler()  
scaled_features_balanced = scaler_new.fit_transform(features_balanced)  

# Train Test split
X_train_scaled_balanced, X_test_scaled_balanced, y_train_scaled_balanced, y_test_scaled_balanced = train_test_split(scaled_features_balanced, labels_balanced, test_size=0.1, stratify=labels_balanced)
X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = train_test_split(features_balanced, labels_balanced, test_size=0.1, stratify=labels_balanced)

xgb_balanced = xgboost.XGBClassifier()
xgb_balanced.fit(X_train_balanced, y_train_balanced)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [57]:
print( accuracy_score(y_test_balanced, xgb_balanced.predict(X_test_balanced)) )
print( roc_auc_score(y_test_balanced, xgb_balanced.predict(X_test_balanced)) )

0.7789025039957379
0.7789027808614565


In [36]:
print( classification_report(y_test_balanced, xgb_balanced.predict(X_test_balanced)) )

              precision    recall  f1-score   support

           0       0.78      0.78      0.78      2816
           1       0.78      0.78      0.78      2815

    accuracy                           0.78      5631
   macro avg       0.78      0.78      0.78      5631
weighted avg       0.78      0.78      0.78      5631



In [48]:
print( accuracy_score(y_train, xgb_balanced.predict(X_train)) )
print( roc_auc_score(y_train, xgb_balanced.predict(X_train)) )

0.8164775783030707
0.8409640190429162


In [69]:
features = df_test_3

# Scaling features
scaler = StandardScaler()  
scaled_features = scaler.fit_transform(features) 

df_test_4 = df_test
df_test_4['diabetes_mellitus'] = xgb_balanced.predict_proba(features)[:,1]
df_test_4[["encounter_id","diabetes_mellitus"]].to_csv('submission-2.csv', index=False)

In [None]:
base_estimators = [
                    ('lr', LogisticRegression()),
                    ('svm', SVC()),
                    ('naive_bayes', GaussianNB()),
                    ('knn', KNeighborsClassifier()),
                    ('rf', RandomForestClassifier()),
                    ('xgb', xgboost.XGBClassifier())
]
final_estimator = xgboost.XGBClassifier()
model = StackingClassifier(estimators=base_estimators, final_estimator=final_estimator)
model.fit(X_train_balanced, y_train_balanced)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [60]:
print( accuracy_score(y_train, model.predict(X_train)) )
print( roc_auc_score(y_train, model.predict(X_train)) )

0.8447255871129664
0.8916207063725478


In [77]:
df_test_4 = df_test
df_test_3['d1']=[0]*10234
df_test_3['d2']=[0]*10234
df_test_3['d3']=[0]*10234
df_test_3['d4']=[0]*10234
df_test_3['d5']=[0]*10234

df_test_4['diabetes_mellitus'] = model.predict_proba(df_test_3)[:,1]
df_test_4[["encounter_id","diabetes_mellitus"]].to_csv('submission-2.csv', index=False)

In [73]:
sub_1[sub_1 == sub_2]

Unnamed: 0,encounter_id,diabetes_mellitus
0,144740,0.087263
1,141990,0.182365
2,142038,0.056126
3,138628,0.030748
4,141682,0.379943
...,...,...
10229,143750,0.021894
10230,143813,0.094927
10231,137126,0.031526
10232,135652,0.007220


In [74]:
[0]*10234

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
