In [1]:
from scipy.stats import kurtosis, iqr, skew, gmean, hmean, mode, normaltest, shapiro, ks_2samp
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from logging import getLogger, Formatter, StreamHandler, FileHandler, INFO, ERROR
from sklearn.compose import make_column_selector as selector
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import os, gc, sys, time, random, math
from contextlib import contextmanager
from matplotlib import pyplot as plt
from IPython.display import display
from scipy import stats, special
from sklearn import set_config
from functools import partial
import lightgbm as lgb
import seaborn as sns
import pandas as pd
import typing as tp
import numpy as np
import warnings


warnings.simplefilter('ignore')
%matplotlib inline

In [2]:
def init_logger():
    handler = StreamHandler()
    handler.setLevel(ERROR)
    handler.setFormatter(Formatter(LOGFORMAT))
    fh_handler = FileHandler('{}.log'.format(MODELNAME))
    fh_handler.setFormatter(Formatter(LOGFORMAT))
    logger.setLevel(ERROR)
    logger.addHandler(handler)
    logger.addHandler(fh_handler)
    
@contextmanager
def timer(name : tp.Text):
    t0 = time.time()
    yield
    logger.info(f'[{name}] done in {time.time() - t0:.0f} s')

COMPETITION = 'WIDS2021'
logger = getLogger(COMPETITION)
LOGFORMAT = '%(asctime)s %(levelname)s %(message)s'
MODELNAME = 'LGBMCV'
init_logger()

In [3]:
train = pd.read_csv('../input/widsdatathon2021/TrainingWiDS2021.csv', index_col=[0])
test = pd.read_csv('../input/widsdatathon2021/UnlabeledWiDS2021.csv', index_col=[0])
test_id = test.encounter_id.values
y = train.diabetes_mellitus.values
del train['diabetes_mellitus']

In [4]:
train = train.rename(columns={'pao2_apache':'pao2fio2ratio_apache','ph_apache':'arterial_ph_apache'})
test = test.rename(columns={'pao2_apache':'pao2fio2ratio_apache','ph_apache':'arterial_ph_apache'})
train.loc[train.age == 0, 'age'] = np.nan
train = train.drop(['readmission_status','encounter_id','hospital_id'], axis=1)
test = test.drop(['readmission_status','encounter_id','hospital_id'], axis=1)
train = train.replace([np.inf, -np.inf], np.nan)
test = test.replace([np.inf, -np.inf], np.nan)

In [5]:
min_max_feats=[f[:-4] for f in train.columns if f[-4:]=='_min']
for col in min_max_feats:
    train.loc[train[f'{col}_min'] > train[f'{col}_max'], [f'{col}_min', f'{col}_max']] = train.loc[train[f'{col}_min'] > train[f'{col}_max'], [f'{col}_max', f'{col}_min']].values
    test.loc[test[f'{col}_min'] > test[f'{col}_max'], [f'{col}_min', f'{col}_max']] = test.loc[test[f'{col}_min'] > test[f'{col}_max'], [f'{col}_max', f'{col}_min']].values


In [6]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(train.head(2))
    display(train.describe())
    display(test.head(2))
    display(test.describe())


Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,icu_stay_type,icu_type,pre_icu_los_days,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2fio2ratio_apache,arterial_ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,h1_albumin_max,h1_albumin_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_min,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
1,68.0,22.732803,0,Caucasian,M,180.3,Floor,Floor,92,admit,CTICU,0.541667,73.9,2.3,113.0,502.01,0,0,0.4,31.0,2.51,,3.0,6.0,0.0,4.0,168.0,118.0,27.4,0,40.0,,,,,36.0,134.0,39.3,,0,14.1,46.0,32.0,68.0,37.0,68.0,37.0,119.0,72.0,66.0,40.0,89.0,46.0,89.0,46.0,34.0,10.0,100.0,74.0,122.0,64.0,131.0,73.0,131.0,73.0,39.9,37.2,,,68.0,63.0,68.0,63.0,119.0,108.0,,,86.0,85.0,86.0,85.0,26.0,18.0,100.0,74.0,,,131.0,115.0,131.0,115.0,39.5,37.5,2.3,2.3,0.4,0.4,31.0,30.0,8.5,7.4,2.51,2.23,168.0,109.0,19.0,15.0,8.9,8.9,27.4,27.4,,,1.3,1.0,233.0,233.0,4.0,3.4,136.0,134.0,14.1,14.1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0
2,77.0,27.421875,0,Caucasian,F,160.0,Floor,Floor,90,admit,Med-Surg ICU,0.927778,70.2,,108.0,203.01,0,0,,9.0,0.56,1.0,1.0,3.0,0.0,1.0,145.0,120.0,36.9,0,46.0,37.0,37.0,51.0,7.45,33.0,145.0,35.1,,1,12.7,,,95.0,31.0,95.0,31.0,118.0,72.0,,,120.0,38.0,120.0,38.0,32.0,12.0,100.0,70.0,,,159.0,67.0,159.0,67.0,36.3,35.1,,,61.0,48.0,61.0,48.0,114.0,100.0,,,85.0,57.0,85.0,57.0,31.0,28.0,95.0,70.0,,,95.0,71.0,95.0,71.0,36.3,36.3,1.6,1.6,0.5,0.5,11.0,9.0,8.6,8.0,0.71,0.56,145.0,128.0,27.0,26.0,11.3,11.1,36.9,36.1,1.3,1.3,3.5,3.5,557.0,487.0,4.2,3.8,145.0,145.0,23.3,12.7,,,,,9.0,9.0,8.6,8.6,0.56,0.56,145.0,143.0,27.0,27.0,11.3,11.3,36.9,36.9,1.3,1.3,3.5,3.5,557.0,557.0,4.2,4.2,145.0,145.0,12.7,12.7,37.0,37.0,7.45,7.45,51.0,51.0,54.8,51.0,37.0,37.0,7.45,7.45,51.0,51.0,51.0,51.0,0,0,0,0,0,0,0


Unnamed: 0,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2fio2ratio_apache,arterial_ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,h1_albumin_max,h1_albumin_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_min,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
count,125139.0,125667.0,130157.0,128080.0,130157.0,130157.0,126694.0,51994.0,128472.0,129292.0,130157.0,130157.0,47597.0,104746.0,105275.0,30437.0,127967.0,127967.0,129448.0,127967.0,115461.0,129848.0,103399.0,130157.0,129737.0,30437.0,30437.0,30437.0,30437.0,129349.0,105638.0,123546.0,66990.0,130157.0,100682.0,35089.0,35089.0,129880.0,129880.0,128521.0,128521.0,129895.0,129895.0,35289.0,35289.0,129830.0,129830.0,127929.0,127929.0,129474.0,129474.0,129625.0,129625.0,35119.0,35119.0,129886.0,129886.0,128534.0,128534.0,125663.0,125663.0,25328.0,25328.0,124630.0,124630.0,118818.0,118818.0,126083.0,126083.0,25391.0,25391.0,123627.0,123627.0,116860.0,116860.0,123703.0,123703.0,123915.0,123915.0,25350.0,25350.0,124638.0,124638.0,118827.0,118827.0,100454.0,100454.0,58751.0,58751.0,53422.0,53422.0,116423.0,116423.0,113465.0,113465.0,116884.0,116884.0,121914.0,121914.0,110110.0,110110.0,113925.0,113925.0,114569.0,114569.0,48944.0,48944.0,34654.0,34654.0,111600.0,111600.0,117611.0,117611.0,116887.0,116887.0,112728.0,112728.0,11152.0,11152.0,10296.0,10296.0,25167.0,25167.0,24236.0,24236.0,25373.0,25373.0,55084.0,55084.0,23762.0,23762.0,27367.0,27367.0,27201.0,27201.0,48944.0,48944.0,11690.0,11690.0,24428.0,24428.0,29336.0,29336.0,28376.0,28376.0,24171.0,24171.0,45696.0,45696.0,45350.0,45350.0,46147.0,46147.0,36818.0,36818.0,22491.0,22491.0,22308.0,22308.0,22712.0,22712.0,16760.0,16760.0,130157.0,130157.0,130157.0,130157.0,130157.0,130157.0,130157.0
mean,62.009965,29.11026,0.18984,169.607219,662.428344,0.839933,83.791104,2.886149,185.492683,565.994296,0.207111,0.027997,1.201222,25.71807,1.481629,0.595735,3.48829,5.484828,0.011441,4.030203,160.141416,99.85453,32.975817,0.156626,87.193046,42.161246,42.161246,132.061737,7.352154,25.150603,137.94526,36.420638,1800.803417,0.330432,12.187662,79.263558,46.967711,88.577241,49.861264,88.731032,49.915804,103.302621,70.402702,114.444926,62.199439,104.230517,64.351467,104.187252,64.380828,28.719774,12.690455,99.276104,90.366087,153.292776,93.6302,148.237724,96.663605,148.147657,96.68439,37.30272,36.26541,68.534112,56.647623,75.441387,62.479555,75.918396,62.878369,92.487996,83.759159,94.673728,75.973573,91.300212,78.833114,91.277241,79.064413,22.557998,17.069772,98.096905,95.254521,138.237988,114.755145,133.28923,115.925336,133.087127,116.054247,36.728636,36.607925,2.958809,2.876937,1.191379,1.117405,25.600952,23.53201,8.381618,8.162838,1.493678,1.363649,174.150229,114.439884,24.493324,23.151892,11.46527,10.896032,34.560088,32.935045,1.577788,1.463473,2.923555,2.076664,205.924326,194.884126,4.254398,3.92791,139.169122,137.672598,12.539672,11.298567,3.008563,3.007254,1.187979,1.187173,25.63122,25.606374,8.278655,8.260569,1.526582,1.524339,167.702625,158.908794,22.545417,22.484841,11.210783,11.081587,33.73705,33.344576,1.577788,1.463473,3.031212,2.973968,193.943057,193.123506,4.192069,4.143943,138.167205,137.879814,13.387873,13.336485,45.353532,38.523506,7.38798,7.322611,165.003814,102.957476,287.600071,224.005403,44.552966,43.341081,7.337793,7.327261,163.035835,145.949537,247.579067,239.56371,0.00103,0.016081,0.013599,0.025669,0.007307,0.004187,0.020852
std,16.797485,8.262776,0.392176,10.833085,304.259843,2.485337,24.963063,0.689812,85.858208,466.51085,0.405238,0.164965,2.351994,20.690041,1.543535,0.262922,0.939831,1.271039,0.106349,1.538528,90.701327,30.759505,6.834576,0.363449,41.908109,12.267414,12.267414,84.958826,0.098423,15.02473,5.30384,0.857584,1456.551481,0.47037,6.931023,21.690174,12.736733,20.15996,13.396441,20.152923,13.427034,21.976387,17.139095,49.170576,17.843389,20.907494,15.431621,20.797568,15.456788,10.56814,5.044717,1.722287,10.150349,31.468331,24.59669,25.888454,20.668778,25.928212,20.688318,0.697096,0.768179,16.265052,14.24555,18.530804,16.303363,18.634214,16.383062,21.778815,20.214075,30.46325,19.112166,20.387711,18.841043,20.403272,18.959959,7.529497,5.807542,3.114242,6.468139,28.911808,27.768759,27.441029,26.309743,27.56205,26.415845,0.76966,0.799932,0.673963,0.680754,2.319165,2.201525,20.526303,18.672573,0.739131,0.793455,1.526212,1.34128,86.545095,38.217854,4.371056,4.984444,2.149005,2.336922,6.194172,6.805741,0.946477,0.737639,3.05941,2.060768,89.575705,88.089036,0.666276,0.579471,4.817068,4.9308,6.782836,5.924633,0.73817,0.738457,2.375772,2.375223,21.377589,21.360676,0.893695,0.904525,1.572771,1.57007,94.033461,88.298827,5.13823,5.144943,2.350446,2.381068,6.804915,6.963673,0.946477,0.737639,2.904945,2.848362,92.486473,92.692759,0.762286,0.747139,5.711628,5.658036,6.933161,6.915142,14.624846,10.987335,0.085186,0.111716,107.084058,61.514357,130.31962,119.119675,14.631359,14.052015,0.105444,0.109082,112.646743,100.211935,131.460113,128.538492,0.03207,0.125786,0.115819,0.158146,0.085166,0.064574,0.142888
min,16.0,14.844926,0.0,137.2,82.0,-0.25,38.6,1.2,101.0,0.01,0.0,0.0,0.1,4.0,0.3,0.21,1.0,1.0,0.0,1.0,39.0,30.0,16.2,0.0,40.0,18.0,18.0,31.0,6.96054,4.0,117.0,32.1,0.0,0.0,0.9,37.0,5.0,46.0,13.0,46.0,13.0,58.0,0.0,38.0,2.0,60.0,22.0,60.0,22.0,14.0,0.0,0.0,0.0,71.0,10.0,90.0,41.0,90.0,41.03,35.1,31.889,33.0,19.0,37.0,22.0,37.0,22.0,46.0,36.0,35.625,8.0,49.0,32.0,49.0,32.0,10.0,0.0,0.0,0.0,65.0,31.44,75.0,53.0,75.0,53.0,33.4,32.9,1.2,1.1,0.2,0.2,4.0,3.0,6.2,5.5,0.34,0.3,73.0,33.0,12.0,7.0,6.8,5.3,20.4,16.1,0.9,0.9,0.4,0.4,27.0,18.55,2.8,2.4,123.0,117.0,1.2,0.9,1.1,1.1,0.2,0.2,4.0,4.0,5.6,5.3,0.33,0.33,59.0,42.0,6.0,6.0,5.1,5.0,16.0,15.5,0.9,0.9,0.4,0.4,20.0,20.0,2.5,2.5,114.0,114.0,1.1,1.0898,18.4,14.9,7.05428,6.89,39.0,28.0,54.8,36.0,15.0,14.997,6.93,6.9,34.0,31.0,42.0,38.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,52.0,23.598006,0.0,162.5,427.0,0.045833,66.5,2.4,113.0,204.01,0.0,0.0,0.4,13.0,0.71,0.4,3.0,6.0,0.0,4.0,97.0,87.0,28.0,0.0,54.0,34.5,34.5,77.0,7.301,11.0,135.0,36.2,799.0488,0.0,7.5,65.0,40.0,75.0,41.0,75.0,41.0,88.0,60.0,89.0,54.0,90.0,54.0,90.0,54.0,22.0,10.0,99.0,89.0,133.0,80.0,130.0,83.0,130.0,83.0,36.9,36.1,58.0,47.0,62.0,51.0,63.0,52.0,77.0,69.0,78.0,64.0,77.0,66.0,77.0,66.0,18.0,13.0,97.0,94.0,119.0,95.0,113.0,97.0,113.0,98.0,36.4,36.3,2.5,2.4,0.4,0.4,13.0,12.0,7.9,7.7,0.75,0.71,117.0,91.0,22.0,21.0,9.9,9.2,30.0,28.0,1.1,1.1,1.2,1.0,147.0,136.0,3.8,3.6,137.0,135.0,8.0,7.4,2.5,2.5,0.4,0.4,13.0,13.0,7.7,7.7,0.78,0.78,111.0,106.0,20.0,20.0,9.6,9.4,29.0,28.4,1.1,1.1,1.3,1.24,131.0,130.0,3.7,3.7,136.0,135.0,8.6,8.5,36.2,32.0,7.34,7.27,88.0,68.0,192.205556,132.0,36.0,35.0,7.29,7.28,80.0,77.0,144.0,138.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,64.0,27.564749,0.0,170.1,653.0,0.155556,80.0,2.9,122.0,409.02,0.0,0.0,0.7,19.0,0.97,0.5,4.0,6.0,0.0,5.0,133.0,104.0,33.1,0.0,66.0,40.0,40.0,104.0,7.36,27.0,138.0,36.5,1454.976,0.0,10.47,76.0,46.0,86.0,50.0,87.0,50.0,101.0,70.0,101.0,62.0,102.0,64.0,102.0,64.0,26.0,13.0,100.0,93.0,150.0,92.0,146.0,95.0,146.0,96.0,37.2,36.4,67.0,55.0,74.0,61.0,74.0,62.0,90.0,82.0,90.0,74.0,89.0,78.0,89.0,78.0,21.0,16.0,99.0,96.0,136.0,112.0,131.0,114.0,130.0,115.0,36.7,36.7,3.0,2.9,0.7,0.6,19.0,17.0,8.4,8.2,1.0,0.94,150.0,108.0,24.0,23.0,11.4,10.9,34.5,33.1,1.3,1.2,1.9,1.5,194.0,185.0,4.2,3.9,139.0,138.0,11.1,10.1,3.1,3.1,0.7,0.7,18.0,18.0,8.3,8.3,1.01,1.01,140.0,134.0,23.0,23.0,11.2,11.1,33.7,33.3,1.3,1.2,2.0,2.0,179.0,179.0,4.1,4.1,139.0,138.0,12.1,12.1,42.9,37.0,7.39,7.34,127.0,85.0,275.0,205.0,42.0,41.0,7.35,7.34,119.9,108.0,228.333333,218.660714,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,75.0,32.803127,0.0,177.8,969.0,0.423611,96.8,3.4,301.0,703.03,0.0,0.0,1.1,31.0,1.53,0.85,4.0,6.0,0.0,5.0,195.0,120.0,37.9,0.0,124.0,47.0,47.0,156.0,7.414,36.0,141.0,36.7,2415.096,1.0,15.3,88.0,54.0,99.0,58.0,99.0,58.0,117.0,81.0,118.0,71.0,116.0,74.0,116.0,74.0,32.0,16.0,100.0,95.0,169.0,107.0,164.0,109.0,164.0,109.0,37.6,36.7,77.0,65.0,86.0,73.0,87.0,73.0,106.0,97.0,104.0,88.0,103.0,91.0,103.0,91.0,26.0,20.0,100.0,99.0,155.0,133.0,150.0,133.0,150.0,133.0,37.1,37.0,3.4,3.4,1.1,1.0,31.0,29.0,8.8,8.7,1.51,1.4,200.0,131.0,27.0,26.0,13.0,12.6,39.0,37.9,1.6,1.5,3.3,2.3,250.0,240.0,4.6,4.3,142.0,141.0,15.3,13.7,3.5,3.5,1.1,1.1,30.0,30.0,8.8,8.8,1.54,1.53,189.0,179.0,25.0,25.0,12.8,12.8,38.5,38.2,1.6,1.5,3.6,3.5,239.0,238.0,4.5,4.5,141.0,141.0,16.7,16.6,50.0,43.0,7.44,7.398,206.0,116.25,370.0,300.0,49.0,48.0,7.408,7.4,214.0,182.0,333.0,324.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,89.0,67.81499,1.0,195.59,1111.0,175.627778,186.0,4.6,308.0,2201.05,1.0,1.0,60.2,127.0,11.18,1.0,4.0,6.0,1.0,5.0,598.7,178.0,51.4,1.0,200.0,95.0,95.0,498.0,7.59,60.0,158.0,39.7,8716.669632,1.0,45.8,181.0,89.0,165.0,90.0,165.0,90.0,177.0,175.0,322.0,119.0,184.0,112.0,181.0,112.0,100.0,92.0,100.0,100.0,295.0,172.0,232.0,160.0,232.0,160.0,39.9,37.8,135.0,104.0,143.0,113.0,144.0,114.0,164.0,144.0,293.375,140.0,165.0,138.0,163.0,138.0,189.0,59.0,100.0,100.0,246.0,198.0,223.0,194.0,223.0,195.0,39.5,39.3,4.6,4.5,60.2,58.0,126.0,113.09,10.8,10.3,11.11,9.9379,611.0,288.0,40.0,39.0,17.2,16.7,51.5,50.0,7.756,6.127,19.8,15.1,585.0,557.45,7.0,5.8,158.0,153.0,46.08,40.898,4.7,4.7,58.0,58.0,135.0,135.0,11.4,11.315,11.6042,11.571,695.045,670.0,39.0,39.0,17.4,17.3,51.7,51.5,7.756,6.127,18.1,18.0195,585.0,585.0,7.2,7.1,157.0,157.0,44.102,44.102,111.0,85.912,7.62,7.55786,540.865,448.892,834.805,604.227778,111.505,107.0,7.57,7.563,534.905,514.905,720.0,654.813793,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Unnamed: 0,age,bmi,elective_surgery,ethnicity,gender,height,hospital_admit_source,icu_admit_source,icu_id,icu_stay_type,icu_type,pre_icu_los_days,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2fio2ratio_apache,arterial_ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,h1_albumin_max,h1_albumin_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_min,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
1,72,,0,Caucasian,F,152.4,Floor,Accident & Emergency,82,admit,Med-Surg ICU,0.015278,,2.8,110.0,104.01,0,0,1.9,44.0,1.49,,4.0,6.0,0.0,5.0,97.0,38.0,39.9,0,54.0,,,,,31.0,130.0,36.4,,0,5.4,,,104.0,40.0,104.0,40.0,66.0,38.0,,,123.0,54.0,123.0,54.0,35.0,15.0,100.0,96.0,,,149.0,76.0,149.0,76.0,38.7,35.6,,,59.0,59.0,59.0,59.0,46.0,44.0,,,80.0,80.0,80.0,80.0,19.0,19.0,97.0,96.0,,,116.0,116.0,116.0,116.0,36.4,36.4,2.8,2.8,1.9,1.9,44.0,40.0,9.8,9.3,1.49,1.44,104.0,97.0,26.0,23.0,14.5,14.2,42.8,39.9,,,,,173.0,173.0,5.7,4.9,132.0,130.0,5.6,5.4,2.8,2.8,1.9,1.9,40.0,40.0,9.8,9.8,1.49,1.49,104.0,104.0,26.0,26.0,14.5,14.5,42.8,42.8,,,,,173.0,173.0,4.9,4.9,132.0,132.0,5.6,5.6,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0
2,86,,0,Caucasian,F,175.3,Emergency Department,Accident & Emergency,82,admit,Med-Surg ICU,0.0,,,117.0,106.01,0,0,,19.0,0.92,,4.0,6.0,0.0,5.0,73.0,116.0,,0,41.0,,,,,53.0,142.0,36.3,,0,,,,101.0,27.0,101.0,27.0,116.0,56.0,,,129.0,41.0,129.0,41.0,51.0,14.0,100.0,95.0,,,181.0,65.0,181.0,65.0,36.8,36.4,,,85.0,65.0,85.0,65.0,116.0,104.0,,,129.0,94.0,129.0,94.0,31.0,24.0,97.0,95.0,,,181.0,119.0,181.0,119.0,36.7,36.7,,,,,19.0,19.0,8.5,8.5,1.0,1.0,102.0,73.0,27.0,27.0,,,,,2.2,2.2,,,,,3.9,3.8,144.0,144.0,,,,,,,,,,,,,,,,,,,,,2.2,2.2,,,,,,,,,,,,,,,,,,,,,,,,,,,0,0,0,0,0,0,0


Unnamed: 0,age,bmi,elective_surgery,height,icu_id,pre_icu_los_days,weight,albumin_apache,apache_2_diagnosis,apache_3j_diagnosis,apache_post_operative,arf_apache,bilirubin_apache,bun_apache,creatinine_apache,fio2_apache,gcs_eyes_apache,gcs_motor_apache,gcs_unable_apache,gcs_verbal_apache,glucose_apache,heart_rate_apache,hematocrit_apache,intubated_apache,map_apache,paco2_apache,paco2_for_ph_apache,pao2fio2ratio_apache,arterial_ph_apache,resprate_apache,sodium_apache,temp_apache,urineoutput_apache,ventilated_apache,wbc_apache,d1_diasbp_invasive_max,d1_diasbp_invasive_min,d1_diasbp_max,d1_diasbp_min,d1_diasbp_noninvasive_max,d1_diasbp_noninvasive_min,d1_heartrate_max,d1_heartrate_min,d1_mbp_invasive_max,d1_mbp_invasive_min,d1_mbp_max,d1_mbp_min,d1_mbp_noninvasive_max,d1_mbp_noninvasive_min,d1_resprate_max,d1_resprate_min,d1_spo2_max,d1_spo2_min,d1_sysbp_invasive_max,d1_sysbp_invasive_min,d1_sysbp_max,d1_sysbp_min,d1_sysbp_noninvasive_max,d1_sysbp_noninvasive_min,d1_temp_max,d1_temp_min,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_diasbp_max,h1_diasbp_min,h1_diasbp_noninvasive_max,h1_diasbp_noninvasive_min,h1_heartrate_max,h1_heartrate_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_mbp_max,h1_mbp_min,h1_mbp_noninvasive_max,h1_mbp_noninvasive_min,h1_resprate_max,h1_resprate_min,h1_spo2_max,h1_spo2_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_sysbp_max,h1_sysbp_min,h1_sysbp_noninvasive_max,h1_sysbp_noninvasive_min,h1_temp_max,h1_temp_min,d1_albumin_max,d1_albumin_min,d1_bilirubin_max,d1_bilirubin_min,d1_bun_max,d1_bun_min,d1_calcium_max,d1_calcium_min,d1_creatinine_max,d1_creatinine_min,d1_glucose_max,d1_glucose_min,d1_hco3_max,d1_hco3_min,d1_hemaglobin_max,d1_hemaglobin_min,d1_hematocrit_max,d1_hematocrit_min,d1_inr_max,d1_inr_min,d1_lactate_max,d1_lactate_min,d1_platelets_max,d1_platelets_min,d1_potassium_max,d1_potassium_min,d1_sodium_max,d1_sodium_min,d1_wbc_max,d1_wbc_min,h1_albumin_max,h1_albumin_min,h1_bilirubin_max,h1_bilirubin_min,h1_bun_max,h1_bun_min,h1_calcium_max,h1_calcium_min,h1_creatinine_max,h1_creatinine_min,h1_glucose_max,h1_glucose_min,h1_hco3_max,h1_hco3_min,h1_hemaglobin_max,h1_hemaglobin_min,h1_hematocrit_max,h1_hematocrit_min,h1_inr_max,h1_inr_min,h1_lactate_max,h1_lactate_min,h1_platelets_max,h1_platelets_min,h1_potassium_max,h1_potassium_min,h1_sodium_max,h1_sodium_min,h1_wbc_max,h1_wbc_min,d1_arterial_pco2_max,d1_arterial_pco2_min,d1_arterial_ph_max,d1_arterial_ph_min,d1_arterial_po2_max,d1_arterial_po2_min,d1_pao2fio2ratio_max,d1_pao2fio2ratio_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,aids,cirrhosis,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis
count,10234.0,9219.0,10234.0,9933.0,10234.0,10234.0,9326.0,4179.0,10064.0,10126.0,10234.0,10234.0,3924.0,8173.0,8191.0,2599.0,10029.0,10029.0,10173.0,10029.0,9024.0,10204.0,8120.0,10234.0,10182.0,2599.0,2599.0,2599.0,2599.0,10170.0,8241.0,9563.0,5044.0,10234.0,7858.0,2909.0,2909.0,10211.0,10211.0,10085.0,10085.0,10210.0,10210.0,2919.0,2919.0,10197.0,10197.0,10022.0,10022.0,10179.0,10179.0,10172.0,10172.0,2912.0,2912.0,10211.0,10211.0,10086.0,10086.0,9804.0,9804.0,2061.0,2061.0,9582.0,9582.0,9078.0,9078.0,9683.0,9683.0,2048.0,2048.0,9437.0,9437.0,8845.0,8845.0,9510.0,9510.0,9508.0,9508.0,2064.0,2064.0,9583.0,9583.0,9079.0,9079.0,7580.0,7580.0,4694.0,4694.0,4374.0,4374.0,9126.0,9126.0,8832.0,8832.0,9143.0,9143.0,9553.0,9553.0,8640.0,8640.0,8985.0,8985.0,9005.0,9005.0,4198.0,4198.0,2445.0,2445.0,8837.0,8837.0,9264.0,9264.0,9168.0,9168.0,8812.0,8812.0,869.0,869.0,827.0,827.0,2004.0,2004.0,1928.0,1928.0,2014.0,2014.0,4244.0,4244.0,1896.0,1896.0,2189.0,2189.0,2195.0,2195.0,4198.0,4198.0,813.0,813.0,2038.0,2038.0,2355.0,2355.0,2269.0,2269.0,1970.0,1970.0,3823.0,3823.0,3749.0,3749.0,3892.0,3892.0,3122.0,3122.0,1887.0,1887.0,1833.0,1833.0,1928.0,1928.0,1422.0,1422.0,10234.0,10234.0,10234.0,10234.0,10234.0,10234.0,10234.0
mean,62.853625,29.116463,0.200117,169.276414,677.823725,0.831339,83.478729,2.865877,187.961447,570.656532,0.21966,0.01925,1.148081,25.110221,1.397522,0.608391,3.497158,5.494965,0.014155,4.062419,161.399236,98.919922,32.888448,0.169533,85.461697,41.165092,41.165092,136.852546,7.358241,24.284435,138.011285,36.433963,1943.528351,0.328611,12.017957,80.118589,47.409708,87.120948,49.823112,87.287853,49.866384,102.501273,70.615279,115.550367,61.855211,102.879392,64.064267,102.801156,64.064003,28.223452,12.676196,99.271726,90.591133,153.032157,93.556071,146.568818,96.915072,146.520669,96.862086,37.312345,36.267993,68.919165,57.13541,74.366512,62.3421,74.836991,62.809881,91.994526,83.798781,94.068164,75.764863,90.273994,78.424159,90.126512,78.559186,22.194744,16.930961,98.0874,95.386832,138.168987,115.157028,132.204369,115.854847,131.884679,115.915079,36.71905,36.595204,2.952968,2.866012,1.163809,1.096015,25.023406,22.981569,8.300623,8.072101,1.416197,1.290832,174.317257,115.394785,24.624236,23.30831,11.522727,10.910161,34.647799,32.837469,1.592499,1.453421,2.951363,2.127227,202.636248,190.805395,4.239478,3.905425,139.246106,137.698986,12.416901,11.101709,3.018067,3.01637,1.162515,1.162152,25.367385,25.338942,8.193382,8.186172,1.491498,1.48982,167.162059,159.06894,22.590348,22.5125,11.220401,11.082832,33.668074,33.14497,1.592499,1.453421,2.969648,2.906861,188.520981,188.036192,4.155992,4.107973,138.433308,138.061338,13.139518,13.102954,44.669704,37.769997,7.39356,7.325659,173.481308,103.107914,297.79934,224.415951,43.364049,42.196057,7.345563,7.331446,176.303994,156.767998,254.687589,246.255826,0.000977,0.012019,0.010064,0.022572,0.006547,0.004104,0.017686
std,17.851661,8.342873,0.400107,10.805701,304.116531,2.411936,24.613964,0.70907,86.066103,477.42326,0.414036,0.137408,2.235654,20.464683,1.373137,0.271831,0.939067,1.267596,0.118136,1.521123,87.738124,30.846816,6.679005,0.37524,41.588964,11.808366,11.808366,91.905812,0.094708,15.157814,5.312423,0.912793,1675.662272,0.469731,6.868647,22.556306,12.813775,19.974876,13.523936,19.914701,13.546566,21.899329,17.37391,49.75807,18.158176,21.13928,15.671791,20.844801,15.684015,10.350495,5.241249,1.570515,10.325299,32.529564,25.610991,25.334159,20.694912,25.344775,20.643219,0.73459,0.813493,16.322829,13.686708,18.195973,16.106091,18.325114,16.226776,21.726694,20.275779,28.832388,18.480285,20.143541,18.711586,20.08581,18.815219,7.678873,5.925387,3.317249,6.676882,28.077008,26.086414,26.798464,26.134758,26.902274,26.260393,0.814963,0.855241,0.692475,0.697987,2.222178,2.145595,20.392872,18.529956,0.753895,0.802378,1.364639,1.195818,83.335191,38.133927,4.365851,4.983016,2.088643,2.296936,5.982917,6.676383,0.994007,0.72774,3.107259,2.210174,88.080877,86.830826,0.648077,0.568192,4.854802,4.894314,6.607625,5.81622,0.752007,0.753892,2.457005,2.457081,22.610926,22.598947,0.903459,0.901304,1.56427,1.564461,86.573148,82.03504,4.906451,4.907947,2.336452,2.39929,6.730747,6.982436,0.994007,0.72774,2.75206,2.692012,89.867998,90.111379,0.725008,0.713423,5.694783,5.589341,6.618158,6.604238,14.140818,10.5819,0.081188,0.107707,113.066388,63.76445,136.576568,119.435927,13.984545,13.710472,0.103545,0.109309,118.798786,105.623078,132.087636,130.745488,0.031245,0.108975,0.099821,0.148541,0.080651,0.063934,0.131814
min,18.0,14.9,0.0,137.0,82.0,-0.208333,38.1,1.1,101.0,0.1,0.0,0.0,0.1,4.0,0.3095,0.21,1.0,1.0,0.0,1.0,40.0,30.015,15.9,0.0,40.0,18.999,18.999,29.099,6.9999,4.0,117.0,31.7,0.0,0.0,0.6,40.73,6.865,45.0,13.0,45.0,13.0,57.0,0.0,25.68,1.0,58.33,22.0,58.435,22.0,13.0,0.0,45.0,0.0,6.88,1.0,91.0,42.0,90.765,42.0,34.835,31.5,35.54,25.54,37.0,21.0,37.0,21.0,46.0,37.725,18.36,3.0,49.0,31.465,49.0,31.0,9.0,0.0,0.0,0.0,74.0,50.11,77.0,52.0,76.0,52.0,33.2,32.595,1.105,1.1,0.2,0.2,4.0,3.0,6.1,5.5,0.35,0.3093,73.0,32.0,12.0,7.0,7.0,5.4,20.9,15.8175,0.9,0.9,0.4871,0.4,27.0,18.34,2.9,2.4,123.0,117.0,0.814,0.6,1.1,1.0465,0.2,0.2,4.0,4.0,5.5,5.5,0.37,0.37,62.0,43.0,6.0,6.0,5.129,5.1,15.865,15.065,0.9,0.9,0.4,0.4,25.06,23.06,2.6,2.5155,111.725,111.725,0.737,0.737,19.864,15.0,7.08608,6.92,36.9475,27.0,56.0,35.0,14.74,14.74,6.95,6.90047,33.0,30.0,42.62,41.417,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,52.0,23.7,0.0,162.5,451.0,0.032639,66.525,2.4,113.0,203.01,0.0,0.0,0.4,12.0,0.7,0.4,3.0,6.0,0.0,4.0,98.0,84.0,28.0,0.0,53.0,34.0,34.0,77.55,7.31,10.0,135.0,36.2,867.0,0.0,7.5,66.0,40.0,73.0,41.0,74.0,41.0,87.0,60.0,89.0,54.0,88.0,54.0,88.0,54.0,22.0,10.0,99.0,90.0,133.0,80.0,129.0,84.0,128.0,84.0,36.9,36.1,58.0,48.0,61.0,51.0,62.0,52.0,76.0,69.0,78.0,63.0,76.0,66.0,76.0,66.0,17.0,13.0,97.0,94.0,120.0,96.0,113.0,98.0,112.0,98.0,36.4,36.3,2.5,2.4,0.4,0.4,12.0,11.0,7.8,7.6,0.74,0.7,118.0,92.0,22.0,21.0,10.0,9.2,30.2,28.0,1.1,1.1,1.2,1.0,144.0,132.0,3.8,3.5,137.0,135.0,8.1,7.4,2.5,2.5,0.4,0.4,12.0,12.0,7.6,7.6,0.74,0.74,114.0,108.0,20.0,20.0,9.6,9.4,29.1,28.2,1.1,1.1,1.2,1.2,123.25,123.0,3.7,3.7,136.0,136.0,8.6,8.6,36.1,31.1,7.35,7.273,90.0,67.0,196.7,132.5,35.7,34.4,7.293,7.287,84.0,79.0,151.4,144.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,65.0,27.6,0.0,170.0,687.0,0.134028,80.0,2.9,123.0,407.01,0.0,0.0,0.7,18.0,0.94,0.5,4.0,6.0,0.0,5.0,139.0,103.0,33.0,0.0,65.0,39.6,39.6,104.0,7.37,26.0,138.0,36.5,1569.5,0.0,10.3,76.0,47.0,85.0,49.0,85.0,50.0,100.0,70.0,101.0,62.0,100.0,63.0,100.0,63.0,26.0,13.0,100.0,93.0,150.0,92.0,145.0,96.0,145.0,96.0,37.2,36.4,67.0,56.0,72.0,61.0,73.0,62.0,90.0,82.0,90.0,75.0,88.0,77.0,88.0,78.0,21.0,16.0,99.0,97.0,135.0,114.0,130.0,114.0,129.0,114.0,36.7,36.6,3.0,2.9,0.7,0.6,18.0,17.0,8.3,8.1,0.99,0.91,151.0,108.0,25.0,24.0,11.5,10.9,34.5,33.0,1.3,1.2,1.9,1.5,193.0,182.0,4.1,3.9,139.0,138.0,11.12,10.0,3.1,3.1,0.6,0.6,18.0,18.0,8.2,8.2,1.0,1.0,143.0,136.0,23.0,23.0,11.2,11.1,33.6,33.1,1.3,1.2,2.0,1.9,176.0,176.0,4.1,4.0,139.0,138.0,12.1,12.0,42.0,36.5,7.4,7.34,133.0,85.0,288.0,205.45,41.0,39.8,7.36,7.35,132.0,118.0,234.0,224.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,76.0,32.6,0.0,177.8,962.0,0.430382,96.0,3.4,302.0,801.01,0.0,0.0,1.1,30.0,1.47,1.0,4.0,6.0,0.0,5.0,198.0,119.0,37.7,0.0,122.0,45.7,45.7,161.0,7.42,35.0,141.0,36.8,2523.75,1.0,15.0,89.0,54.0,98.0,58.0,98.0,58.0,116.0,81.0,120.0,71.0,114.0,74.0,114.0,74.0,32.0,16.0,100.0,95.0,169.0,107.0,161.0,110.0,161.0,110.0,37.7,36.7,78.0,65.0,85.0,73.0,86.0,73.0,105.0,96.0,104.0,88.0,102.0,90.0,102.0,90.0,25.0,20.0,100.0,99.0,154.0,132.0,149.0,133.0,148.0,133.0,37.1,37.0,3.5,3.4,1.1,1.0,30.0,28.0,8.8,8.6,1.46,1.33,202.0,131.0,27.0,26.0,13.0,12.6,38.9,37.7,1.6,1.5,3.4,2.3,247.0,236.0,4.5,4.2,142.0,141.0,15.165,13.5,3.6,3.6,1.1,1.1,29.0,29.0,8.8,8.8,1.5,1.5,190.0,181.0,25.0,25.0,12.8,12.8,38.1,38.0,1.6,1.5,3.7,3.5,235.0,234.75,4.5,4.4,141.0,141.0,16.3,16.3,49.15,42.0,7.443,7.4,223.0,116.0,376.7,302.5,48.0,46.9,7.41,7.4,244.0,206.075,345.525,336.525,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,96.0,69.944,1.0,195.6,1111.0,65.945833,188.009,4.5,308.0,2201.05,1.0,1.0,53.8,128.14,9.962,1.0,4.0,6.0,1.0,5.0,565.885,176.0,50.0,1.0,200.0,99.822,99.822,504.06,7.58,60.0,158.0,39.8,33695.0,1.0,46.16865,181.135,88.0,164.0,92.6,164.0,92.24,181.0,174.0,317.0,186.0,189.0,113.0,182.0,113.0,86.76,63.0,100.0,100.0,300.0,167.24,227.6,162.6,228.235,162.0,39.865,37.9,142.0,100.46,141.79,111.0,142.76,112.38,165.0,145.0,284.0,128.52,166.535,138.0,164.0,137.0,96.0,58.18,100.0,100.0,238.89,194.445,219.785,198.0,219.0,198.0,39.6,39.4,4.6,4.5,53.8,53.8,130.0,115.0,10.9,10.2,10.0,9.06,569.12,292.03,40.0,39.0,17.0,16.6,50.4825,49.6,8.1,6.0,20.829,16.43,564.0,544.32,6.847,5.7,158.0,153.0,45.978,41.62,4.7,4.7,39.7,39.7,148.44,148.44,11.324,11.324,11.507,11.507,630.49,594.39,38.0,38.0,17.571,17.442,52.0675,52.0,8.1,6.0,14.841,14.8145,565.41,565.41,7.0,7.0,154.0,154.0,43.40575,43.40575,113.36,85.36,7.61,7.55,548.015,462.03,844.09,611.225,106.026,106.026,7.55159,7.54,538.18,518.06,672.824,651.656,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
lbls = {}
for col in train.select_dtypes(exclude = np.number).columns.tolist():
    le = LabelEncoder().fit(pd.concat([train[col].astype(str),test[col].astype(str)]))   
    train[col] = le.transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))
    lbls[col] = le
print('Categorical columns:', list(lbls.keys()))

Categorical columns: ['ethnicity', 'gender', 'hospital_admit_source', 'icu_admit_source', 'icu_stay_type', 'icu_type']


In [8]:
print(f'Percent of Nans in Train Data : {round(train.isna().sum().sum()/len(train), 2)}')
print(f'Percent of Nans in Test  Data : {round(test.isna().sum().sum()/len(test), 2)}')

Percent of Nans in Train Data : 61.33
Percent of Nans in Test  Data : 61.41


In [9]:
train['comorbidity_score'] = train['aids'].values * 23 + train['cirrhosis'] * 4  + train['hepatic_failure'] * 16 + train['immunosuppression'] * 10 + train['leukemia'] * 10 + train['lymphoma'] * 13 + train['solid_tumor_with_metastasis'] * 11
test['comorbidity_score'] = test['aids'].values * 23 + test['cirrhosis'] * 4  + test['hepatic_failure'] * 16 + test['immunosuppression'] * 10 + test['leukemia'] * 10 + test['lymphoma'] * 13 + test['solid_tumor_with_metastasis'] * 11
train['comorbidity_score'] = train['comorbidity_score'].fillna(0)
test['comorbidity_score'] = test['comorbidity_score'].fillna(0)
train['gcs_sum'] = train['gcs_eyes_apache']+train['gcs_motor_apache']+train['gcs_verbal_apache']
test['gcs_sum'] = test['gcs_eyes_apache']+test['gcs_motor_apache']+test['gcs_verbal_apache']
train['gcs_sum'] = train['gcs_sum'].fillna(0)
test['gcs_sum'] = test['gcs_sum'].fillna(0)
train['apache_2_diagnosis_type'] = train.apache_2_diagnosis.round(-1).fillna(-100).astype('int32')
test['apache_2_diagnosis_type'] = test.apache_2_diagnosis.round(-1).fillna(-100).astype('int32')
train['apache_3j_diagnosis_type'] = train.apache_3j_diagnosis.round(-2).fillna(-100).astype('int32')
test['apache_3j_diagnosis_type'] = test.apache_3j_diagnosis.round(-2).fillna(-100).astype('int32')
train['bmi_type'] = train.bmi.fillna(0).apply(lambda x: 5 * (round(int(x)/5)))
test['bmi_type'] = test.bmi.fillna(0).apply(lambda x: 5 * (round(int(x)/5)))
train['height_type'] = train.height.fillna(0).apply(lambda x: 5 * (round(int(x)/5)))
test['height_type'] = test.height.fillna(0).apply(lambda x: 5 * (round(int(x)/5)))
train['weight_type'] = train.weight.fillna(0).apply(lambda x: 5 * (round(int(x)/5)))
test['weight_type'] = test.weight.fillna(0).apply(lambda x: 5 * (round(int(x)/5)))
train['age_type'] = train.age.fillna(0).apply(lambda x: 10 * (round(int(x)/10)))
test['age_type'] = test.age.fillna(0).apply(lambda x: 10 * (round(int(x)/10)))
train['gcs_sum_type'] = train.gcs_sum.fillna(0).apply(lambda x: 2.5 * (round(int(x)/2.5))).divide(2.5)
test['gcs_sum_type'] = test.gcs_sum.fillna(0).apply(lambda x: 2.5 * (round(int(x)/2.5))).divide(2.5)
train['apache_3j_diagnosis_x'] = train['apache_3j_diagnosis'].astype('str').str.split('.',n=1,expand=True)[0]
train['apache_2_diagnosis_x'] = train['apache_2_diagnosis'].astype('str').str.split('.',n=1,expand=True)[0]
test['apache_3j_diagnosis_x'] = test['apache_3j_diagnosis'].astype('str').str.split('.',n=1,expand=True)[0]
test['apache_2_diagnosis_x'] = test['apache_2_diagnosis'].astype('str').str.split('.',n=1,expand=True)[0]
train['apache_3j_diagnosis_split1'] = np.where(train['apache_3j_diagnosis'].isna() , np.nan , train['apache_3j_diagnosis'].astype('str').str.split('.',n=1,expand=True)[1]  )
test['apache_3j_diagnosis_split1']  = np.where(test['apache_3j_diagnosis'].isna() , np.nan , test['apache_3j_diagnosis'].astype('str').str.split('.',n=1,expand=True)[1]  )
train['apache_2_diagnosis_split1'] = np.where(train['apache_2_diagnosis'].isna() , np.nan , train['apache_2_diagnosis'].apply(lambda x : x % 10)  )
test['apache_2_diagnosis_split1']  = np.where(test['apache_2_diagnosis'].isna() , np.nan , test['apache_2_diagnosis'].apply(lambda x : x % 10) )

IDENTIFYING_COLS = ['age_type', 'height_type',  'ethnicity', 'gender', 'bmi_type'] 
train['profile'] = train[IDENTIFYING_COLS].apply(lambda x: hash(tuple(x)), axis = 1)
test['profile'] = test[IDENTIFYING_COLS].apply(lambda x: hash(tuple(x)), axis = 1)
print(f'Number of unique Profiles : {train["profile"].nunique()}')


Number of unique Profiles : 6903


In [10]:
df = pd.concat([train['icu_id'], test['icu_id']])
agg = df.value_counts().to_dict()
train['icu_id_counts'] = np.log1p(train['icu_id'].map(agg))
test['icu_id_counts'] = np.log1p(test['icu_id'].map(agg))
df = pd.concat([train['age'], test['age']])
agg = df.value_counts().to_dict()
train['age_counts'] = np.log1p(train['age'].map(agg))
test['age_counts'] = np.log1p(test['age'].map(agg))
train["diff_bmi"] = train['bmi'].copy() 
train['bmi'] = train['weight']/((train['height']/100)**2)
train["diff_bmi"] = train["diff_bmi"]-train['bmi']
test["diff_bmi"] = test['bmi'].copy()
test['bmi'] = test['weight']/((test['height']/100)**2)
test["diff_bmi"] = test["diff_bmi"]-test['bmi']
train['pre_icu_los_days'] = train['pre_icu_los_days'].apply(lambda x:special.expit(x) )
test['pre_icu_los_days']  = test['pre_icu_los_days'].apply(lambda x:special.expit(x) )
train['abmi'] = train['age']/train['bmi']
train['agi'] = train['weight']/train['age']
test['abmi'] = test['age']/train['bmi']
test['agi'] = test['weight']/train['age']

In [11]:
d_cols = [c for c in train.columns if(c.startswith("d1"))]
h_cols = [c for c in train.columns if(c.startswith("h1"))]
train["dailyLabs_row_nan_count"] = train[d_cols].isna().sum(axis=1)
train["hourlyLabs_row_nan_count"] = train[h_cols].isna().sum(axis=1)
train["diff_labTestsRun_daily_hourly"] = train["dailyLabs_row_nan_count"] - train["hourlyLabs_row_nan_count"]
test["dailyLabs_row_nan_count"] = test[d_cols].isna().sum(axis=1)
test["hourlyLabs_row_nan_count"] = test[h_cols].isna().sum(axis=1)
test["diff_labTestsRun_daily_hourly"] = test["dailyLabs_row_nan_count"] - test["hourlyLabs_row_nan_count"]

In [12]:
lab_col = [c for c in train.columns if((c.startswith("h1")) | (c.startswith("d1")))]
lab_col_names = list(set(list(map(lambda i: i[ 3 : -4], lab_col))))

print("len lab_col",len(lab_col))
print("len lab_col_names",len(lab_col_names))
print("lab_col_names\n",lab_col_names)

len lab_col 128
len lab_col_names 32
lab_col_names
 ['sysbp', 'mbp_invasive', 'sysbp_noninvasive', 'pao2fio2ratio', 'bun', 'diasbp_invasive', 'hco3', 'wbc', 'hematocrit', 'potassium', 'resprate', 'hemaglobin', 'sodium', 'spo2', 'albumin', 'arterial_pco2', 'glucose', 'lactate', 'mbp_noninvasive', 'arterial_po2', 'inr', 'creatinine', 'sysbp_invasive', 'temp', 'diasbp_noninvasive', 'bilirubin', 'platelets', 'arterial_ph', 'heartrate', 'calcium', 'mbp', 'diasbp']


In [13]:
first_h = []
for v in lab_col_names:
    first_h.append(v+"_started_after_firstHour")
    colsx = [x for x in test.columns if v in x]
    train[v+"_nans"] = train.loc[:, colsx].isna().sum(axis=1)
    test[v+"_nans"] = test.loc[:, colsx].isna().sum(axis=1)
    train[v+"_d1_value_range"] = train[f"d1_{v}_max"].subtract(train[f"d1_{v}_min"])    
    train[v+"_h1_value_range"] = train[f"h1_{v}_max"].subtract(train[f"h1_{v}_min"])
    train[v+"_d1_h1_max_eq"] = (train[f"d1_{v}_max"]== train[f"h1_{v}_max"]).astype(np.int8)
    train[v+"_d1_h1_min_eq"] = (train[f"d1_{v}_min"]== train[f"h1_{v}_min"]).astype(np.int8)
    train[v+"_d1_zero_range"] = (train[v+"_d1_value_range"] == 0).astype(np.int8)
    train[v+"_h1_zero_range"] =(train[v+"_h1_value_range"] == 0).astype(np.int8)
    train[v+"_tot_change_value_range_normed"] = abs((train[v+"_d1_value_range"].div(train[v+"_h1_value_range"])))#.div(df[f"d1_{v}_max"]))
    train[v+"_started_after_firstHour"] = ((train[f"h1_{v}_max"].isna()) & (train[f"h1_{v}_min"].isna())) & (~train[f"d1_{v}_max"].isna())
    train[v+"_day_more_extreme"] = ((train[f"d1_{v}_max"]>train[f"h1_{v}_max"]) | (train[f"d1_{v}_min"]<train[f"h1_{v}_min"]))
    train[v+"_day_more_extreme"].fillna(False)    
    test[v+"_d1_value_range"] = test[f"d1_{v}_max"].subtract(test[f"d1_{v}_min"])   
    test[v+"_h1_value_range"] = test[f"h1_{v}_max"].subtract(test[f"h1_{v}_min"])
    test[v+"_d1_h1_max_eq"] = (test[f"d1_{v}_max"]== test[f"h1_{v}_max"]).astype(np.int8)
    test[v+"_d1_h1_min_eq"] = (test[f"d1_{v}_min"]== test[f"h1_{v}_min"]).astype(np.int8)
    test[v+"_d1_zero_range"] = (test[v+"_d1_value_range"] == 0).astype(np.int8)
    test[v+"_h1_zero_range"] =(test[v+"_h1_value_range"] == 0).astype(np.int8)
    test[v+"_tot_change_value_range_normed"] = abs((test[v+"_d1_value_range"].div(test[v+"_h1_value_range"])))
    test[v+"_started_after_firstHour"] = ((test[f"h1_{v}_max"].isna()) & (test[f"h1_{v}_min"].isna())) & (~test[f"d1_{v}_max"].isna())
    test[v+"_day_more_extreme"] = ((test[f"d1_{v}_max"]>test[f"h1_{v}_max"]) | (test[f"d1_{v}_min"]<test[f"h1_{v}_min"]))
    test[v+"_day_more_extreme"].fillna(False)

train["total_Tests_started_After_firstHour"] = train[first_h].sum(axis=1)
test["total_Tests_started_After_firstHour"] = test[first_h].sum(axis=1)
gc.collect()
train["total_Tests_started_After_firstHour"].describe()

count    130157.000000
mean          9.773996
std           5.746988
min           0.000000
25%           5.000000
50%          11.000000
75%          13.000000
max          31.000000
Name: total_Tests_started_After_firstHour, dtype: float64

In [14]:
groupers = ['apache_3j_diagnosis', 'profile']

for g in groupers:
    for v in lab_col_names:
        temp = pd.concat([train[[f"d1_{v}_max",g]], test[[f"d1_{v}_max",g]]], axis=0).groupby(g)[f"d1_{v}_max"].mean().to_dict()
        train[f'mean_diff_d1_{v}_{g}_max'] = train[f"d1_{v}_max"]-train[g].map(temp)
        test[f'mean_diff_d1_{v}_{g}_max'] = test[f"d1_{v}_max"]-test[g].map(temp)
        temp = pd.concat([train[[f"d1_{v}_min",g]], test[[f"d1_{v}_min",g]]], axis=0).groupby(g)[f"d1_{v}_min"].mean().to_dict()   
        train[f'mean_diff_d1_{v}_{g}_min'] = train[f"d1_{v}_min"]-train[g].map(temp)
        test[f'mean_diff_d1_{v}_{g}_min'] = test[f"d1_{v}_min"]-test[g].map(temp)
        temp = pd.concat([train[[f"h1_{v}_max",g]], test[[f"h1_{v}_max",g]]], axis=0).groupby(g)[f"h1_{v}_max"].mean().to_dict()   
        train[f'mean_diff_h1_{v}_{g}_max'] = train[f"h1_{v}_max"]-train[g].map(temp)
        test[f'mean_diff_h1_{v}_{g}_max'] = test[f"h1_{v}_max"]-test[g].map(temp)
        temp = pd.concat([train[[f"h1_{v}_min",g]], test[[f"h1_{v}_min",g]]], axis=0).groupby(g)[f"h1_{v}_min"].mean().to_dict()   
        train[f'mean_diff_h1_{v}_{g}_min'] = train[f"h1_{v}_min"]-train[g].map(temp)
        test[f'mean_diff_h1_{v}_{g}_min'] = test[f"h1_{v}_min"]-test[g].map(temp)
gc.collect()

0

In [15]:
train['diasbp_indicator'] = (
(train['d1_diasbp_invasive_max'] == train['d1_diasbp_max']) & (train['d1_diasbp_noninvasive_max']==train['d1_diasbp_invasive_max'])|
(train['d1_diasbp_invasive_min'] == train['d1_diasbp_min']) & (train['d1_diasbp_noninvasive_min']==train['d1_diasbp_invasive_min'])|
(train['h1_diasbp_invasive_max'] == train['h1_diasbp_max']) & (train['h1_diasbp_noninvasive_max']==train['h1_diasbp_invasive_max'])|
(train['h1_diasbp_invasive_min'] == train['h1_diasbp_min']) & (train['h1_diasbp_noninvasive_min']==train['h1_diasbp_invasive_min'])
).astype(np.int8)


train['mbp_indicator'] = (
(train['d1_mbp_invasive_max'] == train['d1_mbp_max']) & (train['d1_mbp_noninvasive_max']==train['d1_mbp_invasive_max'])|
(train['d1_mbp_invasive_min'] == train['d1_mbp_min']) & (train['d1_mbp_noninvasive_min']==train['d1_mbp_invasive_min'])|
(train['h1_mbp_invasive_max'] == train['h1_mbp_max']) & (train['h1_mbp_noninvasive_max']==train['h1_mbp_invasive_max'])|
(train['h1_mbp_invasive_min'] == train['h1_mbp_min']) & (train['h1_mbp_noninvasive_min']==train['h1_mbp_invasive_min'])
).astype(np.int8)

train['sysbp_indicator'] = (
(train['d1_sysbp_invasive_max'] == train['d1_sysbp_max']) & (train['d1_sysbp_noninvasive_max']==train['d1_sysbp_invasive_max'])|
(train['d1_sysbp_invasive_min'] == train['d1_sysbp_min']) & (train['d1_sysbp_noninvasive_min']==train['d1_sysbp_invasive_min'])|
 (train['h1_sysbp_invasive_max'] == train['h1_sysbp_max']) & (train['h1_sysbp_noninvasive_max']==train['h1_sysbp_invasive_max'])|
(train['h1_sysbp_invasive_min'] == train['h1_sysbp_min']) & (train['h1_sysbp_noninvasive_min']==train['h1_sysbp_invasive_min'])   
).astype(np.int8)

train['d1_mbp_invnoninv_max_diff'] = train['d1_mbp_invasive_max'] - train['d1_mbp_noninvasive_max']
train['h1_mbp_invnoninv_max_diff'] = train['h1_mbp_invasive_max'] - train['h1_mbp_noninvasive_max']
train['d1_mbp_invnoninv_min_diff'] = train['d1_mbp_invasive_min'] - train['d1_mbp_noninvasive_min']
train['h1_mbp_invnoninv_min_diff'] = train['h1_mbp_invasive_min'] - train['h1_mbp_noninvasive_min']
train['d1_diasbp_invnoninv_max_diff'] = train['d1_diasbp_invasive_max'] - train['d1_diasbp_noninvasive_max']
train['h1_diasbp_invnoninv_max_diff'] = train['h1_diasbp_invasive_max'] - train['h1_diasbp_noninvasive_max']
train['d1_diasbp_invnoninv_min_diff'] = train['d1_diasbp_invasive_min'] - train['d1_diasbp_noninvasive_min']
train['h1_diasbp_invnoninv_min_diff'] = train['h1_diasbp_invasive_min'] - train['h1_diasbp_noninvasive_min']
train['d1_sysbp_invnoninv_max_diff'] = train['d1_sysbp_invasive_max'] - train['d1_sysbp_noninvasive_max']
train['h1_sysbp_invnoninv_max_diff'] = train['h1_sysbp_invasive_max'] - train['h1_sysbp_noninvasive_max']
train['d1_sysbp_invnoninv_min_diff'] = train['d1_sysbp_invasive_min'] - train['d1_sysbp_noninvasive_min']
train['h1_sysbp_invnoninv_min_diff'] = train['h1_sysbp_invasive_min'] - train['h1_sysbp_noninvasive_min']

test['diasbp_indicator'] = (
(test['d1_diasbp_invasive_max'] == test['d1_diasbp_max']) & (test['d1_diasbp_noninvasive_max']==test['d1_diasbp_invasive_max'])|
(test['d1_diasbp_invasive_min'] == test['d1_diasbp_min']) & (test['d1_diasbp_noninvasive_min']==test['d1_diasbp_invasive_min'])|
(test['h1_diasbp_invasive_max'] == test['h1_diasbp_max']) & (test['h1_diasbp_noninvasive_max']==test['h1_diasbp_invasive_max'])|
(test['h1_diasbp_invasive_min'] == test['h1_diasbp_min']) & (test['h1_diasbp_noninvasive_min']==test['h1_diasbp_invasive_min'])
).astype(np.int8)


test['mbp_indicator'] = (
(test['d1_mbp_invasive_max'] == test['d1_mbp_max']) & (test['d1_mbp_noninvasive_max']==test['d1_mbp_invasive_max'])|
(test['d1_mbp_invasive_min'] == test['d1_mbp_min']) & (test['d1_mbp_noninvasive_min']==test['d1_mbp_invasive_min'])|
(test['h1_mbp_invasive_max'] == test['h1_mbp_max']) & (test['h1_mbp_noninvasive_max']==test['h1_mbp_invasive_max'])|
(test['h1_mbp_invasive_min'] == test['h1_mbp_min']) & (test['h1_mbp_noninvasive_min']==test['h1_mbp_invasive_min'])
).astype(np.int8)

test['sysbp_indicator'] = (
(test['d1_sysbp_invasive_max'] == test['d1_sysbp_max']) & (test['d1_sysbp_noninvasive_max']==test['d1_sysbp_invasive_max'])|
(test['d1_sysbp_invasive_min'] == test['d1_sysbp_min']) & (test['d1_sysbp_noninvasive_min']==test['d1_sysbp_invasive_min'])|
 (test['h1_sysbp_invasive_max'] == test['h1_sysbp_max']) & (test['h1_sysbp_noninvasive_max']==test['h1_sysbp_invasive_max'])|
(test['h1_sysbp_invasive_min'] == test['h1_sysbp_min']) & (test['h1_sysbp_noninvasive_min']==test['h1_sysbp_invasive_min'])   
).astype(np.int8)

test['d1_mbp_invnoninv_max_diff'] = test['d1_mbp_invasive_max'] - test['d1_mbp_noninvasive_max']
test['h1_mbp_invnoninv_max_diff'] = test['h1_mbp_invasive_max'] - test['h1_mbp_noninvasive_max']
test['d1_mbp_invnoninv_min_diff'] = test['d1_mbp_invasive_min'] - test['d1_mbp_noninvasive_min']
test['h1_mbp_invnoninv_min_diff'] = test['h1_mbp_invasive_min'] - test['h1_mbp_noninvasive_min']
test['d1_diasbp_invnoninv_max_diff'] = test['d1_diasbp_invasive_max'] - test['d1_diasbp_noninvasive_max']
test['h1_diasbp_invnoninv_max_diff'] = test['h1_diasbp_invasive_max'] - test['h1_diasbp_noninvasive_max']
test['d1_diasbp_invnoninv_min_diff'] = test['d1_diasbp_invasive_min'] - test['d1_diasbp_noninvasive_min']
test['h1_diasbp_invnoninv_min_diff'] = test['h1_diasbp_invasive_min'] - test['h1_diasbp_noninvasive_min']

test['d1_sysbp_invnoninv_max_diff'] = test['d1_sysbp_invasive_max'] - test['d1_sysbp_noninvasive_max']
test['h1_sysbp_invnoninv_max_diff'] = test['h1_sysbp_invasive_max'] - test['h1_sysbp_noninvasive_max']
test['d1_sysbp_invnoninv_min_diff'] = test['d1_sysbp_invasive_min'] - test['d1_sysbp_noninvasive_min']
test['h1_sysbp_invnoninv_min_diff'] = test['h1_sysbp_invasive_min'] - test['h1_sysbp_noninvasive_min']


for v in ['albumin','bilirubin','bun','glucose','hematocrit','pao2fio2ratio','arterial_ph','resprate','sodium','temp','wbc','creatinine']:
    train[f'{v}_indicator'] = (((train[f'{v}_apache']==train[f'd1_{v}_max']) & (train[f'd1_{v}_max']==train[f'h1_{v}_max'])) |
                 ((train[f'{v}_apache']==train[f'd1_{v}_max']) & (train[f'd1_{v}_max']==train[f'd1_{v}_min'])) |
                 ((train[f'{v}_apache']==train[f'd1_{v}_max']) & (train[f'd1_{v}_max']==train[f'h1_{v}_min'])) |
                 ((train[f'{v}_apache']==train[f'h1_{v}_max']) & (train[f'h1_{v}_max']==train[f'd1_{v}_max'])) |
                 ((train[f'{v}_apache']==train[f'h1_{v}_max']) & (train[f'h1_{v}_max']==train[f'h1_{v}_min'])) |
                 ((train[f'{v}_apache']==train[f'h1_{v}_max']) & (train[f'h1_{v}_max']==train[f'd1_{v}_min'])) |
                 ((train[f'{v}_apache']==train[f'd1_{v}_min']) & (train[f'd1_{v}_min']==train[f'd1_{v}_max'])) |
                 ((train[f'{v}_apache']==train[f'd1_{v}_min']) & (train[f'd1_{v}_min']==train[f'h1_{v}_min'])) |
                 ((train[f'{v}_apache']==train[f'd1_{v}_min']) & (train[f'd1_{v}_min']==train[f'h1_{v}_max'])) |
                 ((train[f'{v}_apache']==train[f'h1_{v}_min']) & (train[f'h1_{v}_min']==train[f'h1_{v}_max'])) |
                 ((train[f'{v}_apache']==train[f'h1_{v}_min']) & (train[f'h1_{v}_min']==train[f'd1_{v}_min'])) |
                 ((train[f'{v}_apache']==train[f'h1_{v}_min']) & (train[f'h1_{v}_min']==train[f'd1_{v}_max'])) 
                ).astype(np.int8)
    test[f'{v}_indicator'] = (((test[f'{v}_apache']==test[f'd1_{v}_max']) & (test[f'd1_{v}_max']==test[f'h1_{v}_max'])) |
                 ((test[f'{v}_apache']==test[f'd1_{v}_max']) & (test[f'd1_{v}_max']==test[f'd1_{v}_min'])) |
                 ((test[f'{v}_apache']==test[f'd1_{v}_max']) & (test[f'd1_{v}_max']==test[f'h1_{v}_min'])) |
                 ((test[f'{v}_apache']==test[f'h1_{v}_max']) & (test[f'h1_{v}_max']==test[f'd1_{v}_max'])) |
                 ((test[f'{v}_apache']==test[f'h1_{v}_max']) & (test[f'h1_{v}_max']==test[f'h1_{v}_min'])) |
                 ((test[f'{v}_apache']==test[f'h1_{v}_max']) & (test[f'h1_{v}_max']==test[f'd1_{v}_min'])) |
                 ((test[f'{v}_apache']==test[f'd1_{v}_min']) & (test[f'd1_{v}_min']==test[f'd1_{v}_max'])) |
                 ((test[f'{v}_apache']==test[f'd1_{v}_min']) & (test[f'd1_{v}_min']==test[f'h1_{v}_min'])) |
                 ((test[f'{v}_apache']==test[f'd1_{v}_min']) & (test[f'd1_{v}_min']==test[f'h1_{v}_max'])) |
                 ((test[f'{v}_apache']==test[f'h1_{v}_min']) & (test[f'h1_{v}_min']==test[f'h1_{v}_max'])) |
                 ((test[f'{v}_apache']==test[f'h1_{v}_min']) & (test[f'h1_{v}_min']==test[f'd1_{v}_min'])) |
                 ((test[f'{v}_apache']==test[f'h1_{v}_min']) & (test[f'h1_{v}_min']==test[f'd1_{v}_max'])) 
                ).astype(np.int8)

In [16]:
more_extreme_cols = [c for c in train.columns if(c.endswith("_day_more_extreme"))]
train["total_day_more_extreme"] = train[more_extreme_cols].sum(axis=1)
test["total_day_more_extreme"] = test[more_extreme_cols].sum(axis=1)
train["d1_resprate_div_mbp_min"] = train["d1_resprate_min"].div(train["d1_mbp_min"])
train["d1_resprate_div_sysbp_min"] = train["d1_resprate_min"].div(train["d1_sysbp_min"])
train["d1_lactate_min_div_diasbp_min"] = train["d1_lactate_min"].div(train["d1_diasbp_min"])
train["d1_heartrate_min_div_d1_sysbp_min"] = train["d1_heartrate_min"].div(train["d1_sysbp_min"])
train["d1_hco3_div"]= train["d1_hco3_max"].div(train["d1_hco3_min"])
train["d1_resprate_times_resprate"] = train["d1_resprate_min"].multiply(train["d1_resprate_max"])
train["left_average_spo2"] = (2*train["d1_spo2_max"] + train["d1_spo2_min"])/3
test["d1_resprate_div_mbp_min"] = test["d1_resprate_min"].div(test["d1_mbp_min"])
test["d1_resprate_div_sysbp_min"] = test["d1_resprate_min"].div(test["d1_sysbp_min"])
test["d1_lactate_min_div_diasbp_min"] = test["d1_lactate_min"].div(test["d1_diasbp_min"])
test["d1_heartrate_min_div_d1_sysbp_min"] = test["d1_heartrate_min"].div(test["d1_sysbp_min"])
test["d1_hco3_div"]= test["d1_hco3_max"].div(test["d1_hco3_min"])
test["d1_resprate_times_resprate"] = test["d1_resprate_min"].multiply(test["d1_resprate_max"])
test["left_average_spo2"] = (2*test["d1_spo2_max"] + test["d1_spo2_min"])/3
train["total_chronic"] = train[["aids","cirrhosis", 'hepatic_failure']].sum(axis=1)
train["total_cancer_immuno"] = train[[ 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']].sum(axis=1)
test["total_chronic"] = test[["aids","cirrhosis", 'hepatic_failure']].sum(axis=1)
test["total_cancer_immuno"] = test[[ 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']].sum(axis=1)
train["has_complicator"] = train[["aids","cirrhosis", 'hepatic_failure',
                            'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']].max(axis=1)
test["has_complicator"] = test[["aids","cirrhosis", 'hepatic_failure',
                            'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']].max(axis=1)
gc.collect()
train[["has_complicator","total_chronic","total_cancer_immuno","has_complicator"]].describe()


Unnamed: 0,has_complicator,total_chronic,total_cancer_immuno,has_complicator.1
count,130157.0,130157.0,130157.0,130157.0
mean,0.06904,0.030709,0.058015,0.06904
std,0.253523,0.214531,0.275306,0.253523
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0
max,1.0,3.0,4.0,1.0


In [17]:
train['apache_3j'] = np.where(train['apache_3j_diagnosis_type']<0 , np.nan ,
                            np.where(train['apache_3j_diagnosis_type'] < 200, 'Cardiovascular' , 
                            np.where(train['apache_3j_diagnosis_type'] < 400, 'Respiratory' , 
                            np.where(train['apache_3j_diagnosis_type'] < 500, 'Neurological' , 
                            np.where(train['apache_3j_diagnosis_type'] < 600, 'Sepsis' , 
                            np.where(train['apache_3j_diagnosis_type'] < 800, 'Trauma' ,  
                            np.where(train['apache_3j_diagnosis_type'] < 900, 'Haematological' ,         
                            np.where(train['apache_3j_diagnosis_type'] < 1000, 'Renal/Genitourinary' ,         
                            np.where(train['apache_3j_diagnosis_type'] < 1200, 'Musculoskeletal/Skin disease' , 'Operative Sub-Diagnosis Codes' ))))))))
                                    )
test['apache_3j'] = np.where(test['apache_3j_diagnosis_type']<0 , np.nan ,
                            np.where(test['apache_3j_diagnosis_type'] < 200, 'Cardiovascular' , 
                            np.where(test['apache_3j_diagnosis_type'] < 400, 'Respiratory' , 
                            np.where(test['apache_3j_diagnosis_type'] < 500, 'Neurological' , 
                            np.where(test['apache_3j_diagnosis_type'] < 600, 'Sepsis' , 
                            np.where(test['apache_3j_diagnosis_type'] < 800, 'Trauma' ,  
                            np.where(test['apache_3j_diagnosis_type'] < 900, 'Haematological' ,         
                            np.where(test['apache_3j_diagnosis_type'] < 1000, 'Renal/Genitourinary' ,         
                            np.where(test['apache_3j_diagnosis_type'] < 1200, 'Musculoskeletal/Skin disease' , 'Operative Sub-Diagnosis Codes' ))))))))
                                    )

In [18]:
trainf = pd.read_pickle('../input/widsfeatures2021/X.pkl')
testf = pd.read_pickle('../input/widsfeatures2021/X_test.pkl')
trainf = trainf.rename(columns={'pao2_apache':'pao2fio2ratio_apache','ph_apache':'arterial_ph_apache'})
testf = testf.rename(columns={'pao2_apache':'pao2fio2ratio_apache','ph_apache':'arterial_ph_apache'})

In [19]:
train.shape, test.shape, trainf.shape, testf.shape

((130157, 814), (10234, 814), (130157, 865), (10234, 865))

In [20]:
col_order = train.columns.tolist()
train = train[col_order]
test = test[col_order]
col_order = trainf.columns.tolist()
trainf = trainf[col_order]
testf = testf[col_order]

In [21]:
train = pd.concat([trainf.reset_index(drop=True),train.reset_index(drop=True)], axis=1)
test =  pd.concat([testf.reset_index(drop=True),test.reset_index(drop=True)], axis=1)
train= train.fillna(0); test= test.fillna(0)
gc.collect()
train.shape, test.shape

((130157, 1679), (10234, 1679))

**Dropping Duplicated Columns**

In [22]:
Cols = list(train.columns)
for i,item in enumerate(train.columns):
    if item in train.columns[:i]: Cols[i] = "toDROP"
train.columns = Cols
test.columns = Cols
train = train.drop("toDROP",1)
test = test.drop("toDROP",1)
train.shape, test.shape

((130157, 1348), (10234, 1348))

Reordering Columns again!

In [23]:
col_order = train.columns.tolist()
train = train[col_order]
test = test[col_order]

In [24]:
drop_cols = ['abmi', 'age_type', 'aids', 'albumin_apache', 'albumin_h1_value_range', 'albumin_h1_zero_range','albumin_tot_change_value_range_normed', 'apache_3j_diagnosis-cat_age', 'apache_post_operative','apache_post_operative_std_d1_temp_max', 'arf_apache_std_d1_hemaglobin_max', 'arterial_pco2_d1_h1_max_eq','arterial_pco2_d1_h1_min_eq', 'arterial_pco2_d1_zero_range', 'arterial_pco2_h1_zero_range','arterial_ph_apache', 'arterial_ph_d1_h1_max_eq', 'arterial_ph_d1_value_range', 'arterial_ph_d1_zero_range','arterial_ph_h1_zero_range', 'arterial_po2_d1_h1_max_eq', 'arterial_po2_d1_h1_min_eq', 'arterial_po2_d1_value_range', 'bilirubin_h1_value_range', 'bilirubin_h1_zero_range','bilirubin_tot_change_value_range_normed', 'bmi_type', 'bun_d1_h1_max_eq', 'bun_d1_zero_range','bun_h1_value_range', 'bun_h1_zero_range', 'calcium_d1_zero_range', 'calcium_h1_value_range','calcium_h1_zero_range', 'creatinine_h1_zero_range', 'd1_albumin_min', 'd1_arterial_pco2_min','d1_arterial_ph_max', 'd1_arterial_ph_min', 'd1_calcium_max', 'd1_diasbp_max', 'd1_diasbp_min','d1_hematocrit_min', 'd1_inr_max', 'd1_inr_min', 'd1_mbp_invasive_max', 'd1_mbp_invasive_min', 'd1_mbp_max','d1_mbp_min', 'd1_mbp_noninvasive_max', 'd1_mbp_noninvasive_min', 'd1_pao2fio2ratio_max', 'd1_pao2fio2ratio_min', 'd1_platelets_max', 'd1_resprate_max', 'd1_sysbp_invasive_min', 'd1_temp_min','d1_wbc_min', 'diasbp_d1_h1_max_eq', 'diasbp_d1_zero_range', 'diasbp_invasive_d1_h1_max_eq','diasbp_invasive_d1_value_range', 'diasbp_invasive_d1_zero_range', 'diasbp_invasive_h1_value_range','diasbp_invasive_h1_zero_range', 'diasbp_noninvasive_d1_h1_max_eq', 'diasbp_noninvasive_d1_zero_range','diasbp_noninvasive_h1_zero_range', 'diff_bmi', 'elective_surgery_mean_d1_sysbp_min', 'gcs_unable_apache','h1_albumin_max', 'h1_albumin_min', 'h1_arterial_pco2_max', 'h1_arterial_pco2_min', 'h1_arterial_ph_min','h1_arterial_po2_max', 'h1_bilirubin_max', 'h1_bun_max', 'h1_creatinine_min', 'h1_diasbp_noninvasive_max','h1_heartrate_max', 'h1_heartrate_min', 'h1_hemaglobin_min', 'h1_hematocrit_max', 'h1_hematocrit_min','h1_lactate_max', 'h1_lactate_min', 'h1_mbp_invasive_max', 'h1_mbp_invasive_min', 'h1_mbp_max', 'h1_mbp_min','h1_mbp_noninvasive_max', 'h1_mbp_noninvasive_min', 'h1_pao2fio2ratio_max', 'h1_pao2fio2ratio_min','h1_platelets_max', 'h1_platelets_min', 'h1_resprate_max', 'h1_resprate_min', 'h1_sodium_max','h1_spo2_max', 'h1_spo2_min', 'h1_sysbp_max', 'h1_sysbp_min', 'h1_sysbp_noninvasive_max','h1_sysbp_noninvasive_min', 'h1_temp_max', 'h1_temp_min', 'h1_wbc_max', 'h1_wbc_min', 'hco3_d1_h1_max_eq','hco3_d1_h1_min_eq', 'hco3_h1_value_range', 'hco3_h1_zero_range', 'heartrate_d1_zero_range','heartrate_h1_zero_range', 'height', 'hemaglobin_d1_value_range', 'hemaglobin_d1_zero_range','hematocrit_apache', 'hematocrit_d1_h1_min_eq', 'hematocrit_d1_value_range', 'hematocrit_d1_zero_range','inr_d1_h1_max_eq', 'inr_d1_h1_min_eq', 'inr_d1_value_range', 'inr_d1_zero_range', 'inr_day_more_extreme','inr_h1_value_range', 'inr_h1_zero_range', 'inr_started_after_firstHour', 'intubated_apache_mean_d1_spo2_max','lactate_h1_value_range', 'lactate_h1_zero_range', 'lymphoma', 'map_apache', 'mbp_d1_zero_range','mbp_h1_zero_range', 'mbp_invasive_d1_h1_min_eq', 'mbp_invasive_d1_value_range', 'mbp_invasive_d1_zero_range','mbp_invasive_h1_zero_range', 'mbp_noninvasive_d1_h1_max_eq', 'mbp_noninvasive_d1_h1_min_eq','mbp_noninvasive_d1_zero_range', 'mbp_noninvasive_h1_zero_range', 'mean_diff_d1_inr_min','mean_diff_h1_bilirubin_min', 'mean_diff_h1_inr_max', 'paco2_apache', 'paco2_for_ph_apache','pao2fio2ratio_apache', 'pao2fio2ratio_h1_value_range', 'pao2fio2ratio_h1_zero_range','rank_frqenc_leukemia', 'wbc_h1_value_range','platelets_d1_value_range', 'platelets_h1_zero_range', 'potassium_d1_h1_max_eq','potassium_h1_value_range','potassium_h1_zero_range', 'rank_frqenc_apache_2_diagnosis', 'resprate_apache', 'resprate_d1_h1_min_eq','resprate_d1_zero_range', 'sodium_d1_h1_min_eq', 'sodium_d1_zero_range','spo2_d1_h1_max_eq','sysbp_d1_zero_range', 'sysbp_h1_zero_range', 'sysbp_invasive_d1_h1_min_eq','sysbp_invasive_d1_zero_range','sysbp_noninvasive_d1_h1_min_eq', 'sysbp_noninvasive_d1_zero_range','sysbp_noninvasive_h1_zero_range','temp_d1_zero_range', 'ventilated_apache_std_d1_glucose_min', 'wbc_apache', 'wbc_d1_h1_min_eq','wbc_d1_value_range', 'wbc_d1_zero_range', 'wbc_h1_zero_range', 'gcs_eyes_apache_mean_d1_bun_min','rank_frqenc_aids']

In [25]:
drop_cols = list(set(drop_cols))
print(len(drop_cols))

185


In [26]:
cats = ['elective_surgery', 'icu_id', 'arf_apache', 'intubated_apache', 'ventilated_apache', 'cirrhosis','hepatic_failure', 'immunosuppression', 'leukemia', 'solid_tumor_with_metastasis', 'apache_3j_diagnosis_x','apache_2_diagnosis_x', 'apache_3j', 'apache_3j_diagnosis_split1', 'apache_2_diagnosis_split1', 'gcs_sum_type','hospital_admit_source', 'glucose_rate', 'glucose_wb', 'gcs_eyes_apache', 'glucose_normal', 'total_cancer_immuno','gender', 'total_chronic', 'icu_stay_type', 'apache_2_diagnosis_type', 'apache_3j_diagnosis_type']
len(cats)

27

In [27]:
for col in cats:
    train_only = list(set(train[col].unique()) - set(test[col].unique()))
    test_only = list(set(test[col].unique()) - set(train[col].unique()))
    both = list(set(test[col].unique()).union(set(train[col].unique())))
    train.loc[train[col].isin(train_only), col] = np.nan
    test.loc[test[col].isin(test_only), col] = np.nan
    try:
        lbl = OrdinalEncoder(dtype='int')
        train[col] = lbl.fit_transform(train[col].astype('str').values.reshape(-1,1))
        test[col] = lbl.transform(test[col].astype('str').values.reshape(-1,1))
    except:
        lbl = OrdinalEncoder(dtype='int')
        train[col] = lbl.fit_transform(train[col].astype('str').fillna('-1').values.reshape(-1,1))
        test[col] = lbl.transform(test[col].astype('str').fillna('-1').values.reshape(-1,1))
    temp = pd.concat([train[[col]], test[[col]]], axis=0)
    temp_mapping = temp.groupby(col).size()/len(temp)
    temp['enc'] = temp[col].map(temp_mapping)
    temp['enc'] = stats.rankdata(temp['enc'])
    temp = temp.reset_index(drop=True)
    train[f'rank_frqenc_{col}'] = temp[['enc']].values[:train.shape[0]]
    test[f'rank_frqenc_{col}'] = temp[['enc']].values[train.shape[0]:]               
    test[col] = test[col].astype('category')
    train[col] = train[col].astype('category')


In [28]:
drop_cols = list(set(drop_cols))
print(len(drop_cols))
train = train.drop(drop_cols, axis=1)
test = test.drop(drop_cols, axis=1)
train.shape, test.shape

185


((130157, 1183), (10234, 1183))

In [29]:
@contextmanager
def timer(logger=None, format_str='{:.3f}[s]', prefix=None, suffix=None):
    if prefix: format_str = str(prefix) + format_str
    if suffix: format_str = format_str + str(suffix)
    start = time.time()
    yield
    d = time.time() - start
    out_str = format_str.format(d)
    if logger:
        logger.info(out_str)
    else:
        print(out_str)

In [30]:
class TreeModel:
    """Wrapper for LightGBM/XGBoost/CATBoost"""
    def __init__(self, model_type: str):
        self.model_type = model_type
        self.trn_data = None
        self.val_data = None
        self.model = None

    def train(self,
              params: dict,
              X_train: pd.DataFrame, y_train: np.ndarray,
              X_val: pd.DataFrame, y_val: np.ndarray,
              train_weight: tp.Optional[np.ndarray] = None,
              val_weight: tp.Optional[np.ndarray] = None,
              train_params: dict = {}):
        if self.model_type == "lgb":
            self.trn_data = lgb.Dataset(X_train, label=y_train, weight=train_weight)
            self.val_data = lgb.Dataset(X_val, label=y_val, weight=val_weight)
            self.model = lgb.train(params=params,
                                   train_set=self.trn_data,
                                   valid_sets=[self.trn_data, self.val_data],
                                   **train_params)
        elif self.model_type == "xgb":
            self.trn_data = xgb.DMatrix(X_train, y_train, weight=train_weight)
            self.val_data = xgb.DMatrix(X_val, y_val, weight=val_weight)
            self.model = xgb.train(params=params,
                                   dtrain=self.trn_data,
                                   evals=[(self.trn_data, "train"), (self.val_data, "val")],
                                   **train_params)
        elif self.model_type == "cat":
            self.trn_data = Pool(X_train, label=y_train, group_id=[0] * len(X_train))
            self.val_data =  Pool(X_val, label=y_val, group_id=[0] * len(X_val))
            self.model = CatBoost(params)
            self.model.fit(
                self.trn_data, eval_set=[self.val_data], use_best_model=True, **train_params)
        else:
            raise NotImplementedError

    def predict(self, X: pd.DataFrame):
        if self.model_type == "lgb":
            return self.model.predict(
                X, num_iteration=self.model.best_iteration)  # type: ignore
        elif self.model_type == "xgb":
            X_DM = xgb.DMatrix(X)
            return self.model.predict(
                X_DM, ntree_limit=self.model.best_ntree_limit)  # type: ignore
        elif self.model_type == "cat":
            return self.model.predict(X)
        else:
            raise NotImplementedError

    @property
    def feature_names_(self):
        if self.model_type == "lgb":
            return self.model.feature_name()
        elif self.model_type == "xgb":
            return list(self.model.get_score(importance_type="gain").keys())
        elif self.model_type == "cat":
             return self.model.feature_names_
        else:
            raise NotImplementedError

    @property
    def feature_importances_(self):
        if self.model_type == "lgb":
            return self.model.feature_importance(importance_type="gain")
        elif self.model_type == "xgb":
            return list(self.model.get_score(importance_type="gain").values())
        elif self.model_type == "cat":
            return self.model.feature_importances_
        else:
            raise NotImplementedError

In [31]:
ID_COL = "encounter_id"
#TGT_COL = "diabetes_mellitus"

N_SPLITS =10# 5
RANDOM_SEED = 190295 #,42,90295#42
USE_MODEL = "lgb"

MODEL_PARAMS = {
   "seed": RANDOM_SEED,
      "n_jobs": 4,
       'num_leaves': 250,
    'max_bin':550,
     'min_child_weight': 0.03454472573214212,
    'feature_fraction': 0.35,
    'bagging_fraction': 0.475,
    'min_data_in_leaf': 150,
     'max_depth': -1,
    'objective': 'binary',
    'learning_rate':  0.009867383057779643,
    "boosting_type": "gbdt",
    "metric": 'auc', 
     "bagging_seed": 7,
#"verbosity": -1,
'reg_alpha': 0.3899927210061127,
'reg_lambda': 0.6485237330340494,
##'subsample': 0.8032697250789377, 
##'colsample_bytree': 0.21067140508531404, 
##'learning_rate': 0.009867383057779643,
## 'reg_lambda': 10.987474846877767, 
##            'reg_alpha': 17.335285595031994,     
      ##
'cat_smooth': 39,
 ## parameters to keep the exactly the same
'subsample_for_bin': 200000,
'min_child_samples':285, #20,    
    
    
    
    }
TRAIN_PARAMS = {
    "num_boost_round": 30000,
    "early_stopping_rounds": 1000,
    "verbose_eval": 50,
}

In [32]:
#train[FEAT_COLS].to_pickle('train.pkl')
#test[FEAT_COLS].to_pickle('test.pkl')
#np.save('test_id.npy',test_id)
#np.save('y.npy',y)

In [33]:
#train[FEAT_COLS]

In [34]:
X = train
X_test = test 


kf = KFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_SEED)
trn_val_indexs = list(kf.split(X, y))

In [35]:
oof_pred_arr = np.zeros(len(X))
test_preds_arr = np.zeros((N_SPLITS, len(X_test)))
feature_importances = pd.DataFrame()
score_list = []

In [36]:
for fold, (trn_idx, val_idx) in enumerate(kf.split(X, y)):
    print("*" * 100)
    print(f"Fold: {fold}")

    X_trn = X.loc[trn_idx].reset_index(drop=True)
    X_val = X.loc[val_idx].reset_index(drop=True)
    y_trn = y[trn_idx]
    y_val = y[val_idx]

    model = TreeModel(model_type=USE_MODEL)
    with timer(prefix="Model training"):
        model.train(
            params=MODEL_PARAMS, X_train=X_trn, y_train=y_trn,
            X_val=X_val, y_val=y_val, train_params=TRAIN_PARAMS)
    fi_tmp = pd.DataFrame()
    fi_tmp["feature"] = model.feature_names_
    fi_tmp["importance"] = model.feature_importances_
    fi_tmp["fold"] = fold
    feature_importances = feature_importances.append(fi_tmp)

    val_pred = model.predict(X_val)
    score = roc_auc_score(y_val, val_pred)#, squared=False)
    #print(score)
    print(f"score: {score:.5f}")
    score_list.append([fold, score])
    oof_pred_arr[val_idx] = val_pred
    test_pred = model.predict(X_test)
    test_preds_arr[fold] = test_pred

****************************************************************************************************
Fold: 0
[LightGBM] [Info] Number of positive: 25377, number of negative: 91764
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 286948
[LightGBM] [Info] Number of data points in the train set: 117141, number of used features: 1183
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.216636 -> initscore=-1.285377
[LightGBM] [Info] Start training from score -1.285377
Training until validation scores don't improve for 1000 rounds
[50]	training's auc: 0.883442	valid_1's auc: 0.853059
[100]	training's auc: 0.893661	valid_1's auc: 0.85773
[150]	training's auc: 0.903255	valid_1's auc: 0.862288
[200]	training's auc: 0.91211	valid_1's auc: 0.866041
[250]	training's auc: 0.920031	valid_1's auc: 0.868643
[300]	training's auc: 0.927566	valid_1's auc: 0.870989
[350]	training's auc: 0.934643	valid_1's auc: 0.873182
[400]	training's auc: 0.94099	valid_1's auc: 0.8745

In [37]:
oof_score = roc_auc_score(y, oof_pred_arr)#, squared=False)
score_list.append(["oof", oof_score])
pd.DataFrame(
    score_list, columns=["fold", "auc score"])

Unnamed: 0,fold,auc score
0,0,0.880285
1,1,0.877702
2,2,0.880243
3,3,0.885175
4,4,0.877724
5,5,0.88214
6,6,0.885518
7,7,0.878442
8,8,0.87756
9,9,0.880748


In [38]:
test_preds_arr = test_preds_arr.mean(axis=0)

In [39]:
#Create a  DataFrame
submission = pd.DataFrame({'encounter_id':test_id,'diabetes_mellitus':test_preds_arr})
                        

#Visualize the first 10 rows
submission.head(10)

Unnamed: 0,encounter_id,diabetes_mellitus
0,144740,0.046958
1,141990,0.252914
2,142038,0.086284
3,138628,0.050842
4,141682,0.275378
5,139096,0.024531
6,142994,0.78503
7,141954,0.036077
8,135344,0.603518
9,142552,0.866312


In [40]:
#Convert DataFrame to a csv file that can be uploaded
#This is saved in the same directory as your notebook
filename = 'submission.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: submission.csv


In [41]:
np.save('oof_preds.py',oof_pred_arr)
np.save('test_preds.py',test_preds_arr)