****Reading Inputs****

In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from lightgbm import LGBMClassifier
from sklearn import metrics

pd.set_option("display.max_columns", 181)
pd.set_option("display.min_rows", 200)

In [2]:
data_dictionary = pd.read_csv("DataDictionaryWiDS2021.csv")
unlabeled = pd.read_csv("UnlabeledWiDS2021.csv")
training = pd.read_csv("TrainingWiDS2021.csv")

In [3]:
column_datatype_mapping = dict(zip(data_dictionary['Variable Name'], data_dictionary['Data Type']))

In [4]:
del training['Unnamed: 0']
del unlabeled['Unnamed: 0']

In [5]:
training=training[['encounter_id','age','bmi','height','weight','ethnicity','gender','apache_2_diagnosis','glucose_apache','diabetes_mellitus']]

In [6]:
unlabeled=unlabeled[['encounter_id','age','bmi','height','weight','ethnicity','gender','apache_2_diagnosis','glucose_apache']]

In [7]:
#training.diabetes_mellitus.value_counts()

In [8]:
data_dictionary['Data Type'].value_counts()

numeric    150
binary      15
string      10
integer      6
Name: Data Type, dtype: int64

In [50]:
#print(training.hospital_id.nunique(), training.encounter_id.nunique())

In [51]:
#training.icu_stay_type.value_counts()

In [52]:
#training.icu_type.value_counts()

In [9]:
len(training), len(unlabeled)

(130157, 10234)

In [10]:
all_data = training.drop(['encounter_id'], axis=1).append(unlabeled)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [11]:
cat_cols = []
cont_cols = []
for col in all_data.columns:
    if all_data.dtypes[col] == "object":
        cat_cols.append(col)
        all_data[col] = all_data[col].fillna("NA")
        all_data[col] = LabelEncoder().fit_transform(all_data[col])
        all_data[col]= all_data[col].astype('category')
    elif column_datatype_mapping[col] == "binary":
        all_data[col] = all_data[col].fillna(-1)
    elif column_datatype_mapping[col] == "numeric":
        all_data[col] = all_data[col].fillna(0)
        cont_cols.append(col)
    else:
        all_data[col] = all_data[col].fillna(all_data[col].median())
        cont_cols.append(col)
    

In [12]:
len(all_data), all_data.encounter_id.nunique()

(140391, 10235)

In [13]:
df_train = all_data[:len(training)]
df_pred = all_data[len(training):].reset_index(drop=True)
Y = training['diabetes_mellitus']

In [14]:
X_train, X_val, y_train, y_val = train_test_split(df_train, Y, test_size=0.20, random_state=42,shuffle=True )

In [15]:
lgbmc = LGBMClassifier()

In [16]:
lgbmc.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [17]:
lgbmc.score(X_val, y_val)

1.0

In [18]:
lgbmc.fit(df_train, Y)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [19]:
AUC_FINAL=metrics.roc_auc_score(Y.values, lgbmc.predict(df_train))
AUC_FINAL

1.0

In [20]:
pd.DataFrame({'encounter_id':unlabeled['encounter_id'].values,
    'diabetes_mellitus':lgbmc.predict_proba(df_pred)[:,1]}).to_csv('Predictions_subset.csv',
                                                                   index=False)