In [24]:
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
# import pickle
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score
# from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import confusion_matrix, auc, roc_curve, roc_auc_score

In [26]:
df = pd.read_csv('~/data/wids2020/training_v2.csv', skipinitialspace=True)
features = df.columns
print(df.shape)

(91713, 186)


In [25]:
df_test = pd.read_csv('/home/deepta/data/wids2020/unlabeled.csv', skipinitialspace=True)
print(df_test.shape)

(39308, 186)


In [56]:
target_column = "hospital_death"

In [6]:
features[0:10]

Index(['encounter_id', 'patient_id', 'hospital_id', 'hospital_death', 'age',
       'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height'],
      dtype='object')

### Label Distribution

In [57]:
df[target_column].value_counts()

0    83798
1     7915
Name: hospital_death, dtype: int64

In [62]:
labels = df[target_column]
print(len(labels))

91713


### Find ID-based columns

In [20]:
for i, col in enumerate(features):
    if('_id' in col):
        print(i, col)

0 encounter_id
1 patient_id
2 hospital_id
12 icu_id


In [21]:
id_columns = ['encounter_id', 'patient_id', 'hospital_id', 'icu_id']

In [13]:
df.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


### find categorical columns

In [22]:
df.loc[:, df.dtypes == object].head()

Unnamed: 0,ethnicity,gender,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,apache_3j_bodysystem,apache_2_bodysystem
0,Caucasian,M,Floor,Floor,admit,CTICU,Sepsis,Cardiovascular
1,Caucasian,F,Floor,Floor,admit,Med-Surg ICU,Respiratory,Respiratory
2,Caucasian,F,Emergency Department,Accident & Emergency,admit,Med-Surg ICU,Metabolic,Metabolic
3,Caucasian,F,Operating Room,Operating Room / Recovery,admit,CTICU,Cardiovascular,Cardiovascular
4,Caucasian,M,,Accident & Emergency,admit,Med-Surg ICU,Trauma,Trauma


In [19]:
cat_columns = df.select_dtypes(include=['object']).columns
print(cat_columns)
print("Number of categorical columns: ", len(cat_columns))

Index(['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source',
       'icu_stay_type', 'icu_type', 'apache_3j_bodysystem',
       'apache_2_bodysystem'],
      dtype='object')
Number of categorical columns:  8


### Convert Categorical columns to Numerical

In [28]:
cat_labenc_mapping = {
    col: LabelEncoder()
    for col in cat_columns
}

In [29]:
for col in tqdm_notebook(cat_columns):
    df[col] = df[col].astype('str')
    cat_labenc_mapping[col] = cat_labenc_mapping[col].fit(
        np.unique(df[col].unique().tolist() + df_test[col].unique().tolist())
    )
    df[col] = cat_labenc_mapping[col].transform(df[col])

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




In [32]:
for col in tqdm_notebook(cat_columns):
    df_test[col] = df_test[col].astype('str')
    df_test[col] = cat_labenc_mapping[col].transform(df_test[col])

HBox(children=(IntProgress(value=0, max=8), HTML(value='')))




### Drop ID columns and target column

In [34]:
df = df.drop(id_columns, axis=1)
df.head()

Unnamed: 0,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_stay_type,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,0,68.0,22.73,0,2,1,180.3,4,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,9,0
1,0,77.0,27.42,0,2,0,160.0,4,1,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8,6
2,0,25.0,31.95,0,2,0,172.7,3,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,3
3,0,81.0,22.64,1,2,0,165.1,8,2,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0
4,0,19.0,,0,2,1,188.0,15,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,7


In [63]:
df = df.drop(target_column, axis=1)

In [81]:
df_test = df_test.drop(target_column, axis=1)

In [35]:
df_test = df_test.drop(id_columns, axis=1)

### Drop meaningless columns

- 'readmission_status' has all zeros

In [54]:
useless_columns = ['readmission_status']
df = df.drop(useless_columns, axis=1)
df_test = df_test.drop(useless_columns, axis=1)

In [42]:
df['readmission_status'].value_counts()

0    91713
Name: readmission_status, dtype: int64

### Imputing Missing Values

- frequent for categorical and binary columns
- median for numerical columns


In [36]:
# bin_columns = [col for col in df if np.isin(df[col].unique(), [0, 1]).all()]

In [51]:
binary_columns = [
    "apache_post_operative", "arf_apache", "cirrhosis", "diabetes_mellitus", "immunosuppression",
    "hepatic_failure", "leukemia", "lymphoma", "solid_tumor_with_metastasis", "gcs_unable_apache",
    "intubated_apache", "ventilated_apache", "elective_surgery"
]

In [74]:
len(binary_columns)

13

In [64]:
num_columns = list(cat_columns.union(binary_columns).symmetric_difference(df.columns))
print(len(num_columns))
print(num_columns)

159
['age', 'aids', 'albumin_apache', 'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_4a_hospital_death_prob', 'apache_4a_icu_death_prob', 'bilirubin_apache', 'bmi', 'bun_apache', 'creatinine_apache', 'd1_albumin_max', 'd1_albumin_min', 'd1_arterial_pco2_max', 'd1_arterial_pco2_min', 'd1_arterial_ph_max', 'd1_arterial_ph_min', 'd1_arterial_po2_max', 'd1_arterial_po2_min', 'd1_bilirubin_max', 'd1_bilirubin_min', 'd1_bun_max', 'd1_bun_min', 'd1_calcium_max', 'd1_calcium_min', 'd1_creatinine_max', 'd1_creatinine_min', 'd1_diasbp_invasive_max', 'd1_diasbp_invasive_min', 'd1_diasbp_max', 'd1_diasbp_min', 'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min', 'd1_glucose_max', 'd1_glucose_min', 'd1_hco3_max', 'd1_hco3_min', 'd1_heartrate_max', 'd1_heartrate_min', 'd1_hemaglobin_max', 'd1_hemaglobin_min', 'd1_hematocrit_max', 'd1_hematocrit_min', 'd1_inr_max', 'd1_inr_min', 'd1_lactate_max', 'd1_lactate_min', 'd1_mbp_invasive_max', 'd1_mbp_invasive_min', 'd1_mbp_max', 'd1_mbp_min', 'd

In [65]:
cat_col2imputer_mapping = {
    col: SimpleImputer(strategy='most_frequent')
    for col in cat_columns
}

bin_col2imputer_mapping = {
    col: SimpleImputer(strategy='most_frequent')
    for col in binary_columns
}

cont_col2imputer_mapping = {
    col: SimpleImputer(strategy='median')
    for col in num_columns
}

all_imp_dicts = [cat_col2imputer_mapping, cont_col2imputer_mapping, bin_col2imputer_mapping]

In [69]:
for imp_mapping_obj in tqdm_notebook(all_imp_dicts):
    for col, imp_object in imp_mapping_obj.items():
        data = df[col].values.reshape(-1, 1)
        imp_object.fit(data)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




In [70]:
for imp_mapping_obj in tqdm_notebook(all_imp_dicts):
    for col, imp_object in imp_mapping_obj.items():
#         print(col, imp_object)
        data = df[col].values.reshape(-1, 1)
        data = imp_object.transform(data)
        df[col] = list(data.reshape(-1,))



HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

ethnicity SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', verbose=0)
gender SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', verbose=0)
hospital_admit_source SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', verbose=0)
icu_admit_source SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', verbose=0)
icu_stay_type SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', verbose=0)
icu_type SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', verbose=0)
apache_3j_bodysystem SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', verbose=0)
apache_2_bodysystem SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', verbose=0)
age SimpleImputer(copy=True, fil

d1_wbc_max SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
d1_wbc_min SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
fio2_apache SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
gcs_eyes_apache SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
gcs_motor_apache SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
gcs_verbal_apache SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
glucose_apache SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
h1_albumin_max SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
h1_albumin_min SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy=

sodium_apache SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
temp_apache SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
urineoutput_apache SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
wbc_apache SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
weight SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='median', verbose=0)
apache_post_operative SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', verbose=0)
arf_apache SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', verbose=0)
cirrhosis SimpleImputer(copy=True, fill_value=None, missing_values=nan,
       strategy='most_frequent', verbose=0)
diabetes_mellitus SimpleImputer(copy=True, fill_value=None, missing_values=nan,
 

In [71]:
# inputing on test 
for imp_mapping_obj in tqdm_notebook(all_imp_dicts):
    for col, imp_object in imp_mapping_obj.items():
        data = df_test[col].values.reshape(-1, 1)
        data = imp_object.transform(data)
        df_test[col] = list(data.reshape(-1,))

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))




In [72]:
traindf = df
traindf[target_column] = labels
traindf.to_csv('./train_processed.csv')

In [73]:
traindf.head()

Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_stay_type,icu_type,...,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem,hospital_death
0,68.0,22.73,0,2,1,180.3,4,1,0,2,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,9,0,0
1,77.0,27.42,0,2,0,160.0,4,1,0,5,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,8,6,0
2,25.0,31.95,0,2,0,172.7,3,0,0,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,3,0
3,81.0,22.64,1,2,0,165.1,8,2,0,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0
4,19.0,27.654655,0,2,1,188.0,15,0,0,5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10,7,0


In [82]:
df_test.to_csv('./unlabeled_processed.csv')

In [83]:
test_encids = pd.read_csv('/home/deepta/data/wids2020/unlabeled.csv', skipinitialspace=True)['encounter_id']

In [90]:
test_encids.to_csv('./resids.csv', index=False)

  """Entry point for launching an IPython kernel.


### Submission CSV

In [91]:
res = pd.read_csv('./resids.csv', names=['encounter_id', 'hospital_death'])
res.head()

Unnamed: 0,encounter_id,hospital_death
0,2,
1,5,
2,7,
3,8,
4,10,
