In [49]:
import pandas as pd
import re

from scipy.stats import chi2_contingency

In [3]:
# Load the training set; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="./../Data/training_v2.csv")

# Columns Available

In [28]:
# Show every column name of the loaded frame as a plain Python list.
print(df.columns.tolist())

['encounter_id', 'patient_id', 'hospital_id', 'hospital_death', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height', 'hospital_admit_source', 'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type', 'pre_icu_los_days', 'readmission_status', 'weight', 'albumin_apache', 'apache_2_diagnosis', 'apache_3j_diagnosis', 'apache_post_operative', 'arf_apache', 'bilirubin_apache', 'bun_apache', 'creatinine_apache', 'fio2_apache', 'gcs_eyes_apache', 'gcs_motor_apache', 'gcs_unable_apache', 'gcs_verbal_apache', 'glucose_apache', 'heart_rate_apache', 'hematocrit_apache', 'intubated_apache', 'map_apache', 'paco2_apache', 'paco2_for_ph_apache', 'pao2_apache', 'ph_apache', 'resprate_apache', 'sodium_apache', 'temp_apache', 'urineoutput_apache', 'ventilated_apache', 'wbc_apache', 'd1_diasbp_invasive_max', 'd1_diasbp_invasive_min', 'd1_diasbp_max', 'd1_diasbp_min', 'd1_diasbp_noninvasive_max', 'd1_diasbp_noninvasive_min', 'd1_heartrate_max', 'd1_heartrate_min', 'd1_mbp_invasive_max', 'd1_

## Columns that are not a measurement reading (no `min`, `max`, or `apache` in the name)

In [38]:
# Keep only the columns whose name contains none of the markers
# "min", "max" or "apache" anywhere in it.
cols_notareading = [
    name
    for name in df.columns
    if re.search("(min|max|apache)", name) is None
]
print(cols_notareading)

['encounter_id', 'patient_id', 'hospital_id', 'hospital_death', 'age', 'bmi', 'elective_surgery', 'ethnicity', 'gender', 'height', 'hospital_admit_source', 'icu_admit_source', 'icu_id', 'icu_stay_type', 'icu_type', 'pre_icu_los_days', 'readmission_status', 'weight', 'aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']


In [42]:
# The disease columns are taken to be the tail of cols_notareading,
# starting at the "aids" entry and running to the end of the list.
first_disease_pos = cols_notareading.index("aids")
cols_diseases = cols_notareading[first_disease_pos:]
print(cols_diseases)

['aids', 'cirrhosis', 'diabetes_mellitus', 'hepatic_failure', 'immunosuppression', 'leukemia', 'lymphoma', 'solid_tumor_with_metastasis']


# Missing Values

The columns listed below have missing values in more than 80% of the rows.

In [44]:
# Number of missing values in each column of df.
nacounts = df.isna().sum()
# Keep the columns where more than 80% of the rows are missing.
# The mask is built from nacounts itself (not from a re-sorted copy), so mask
# and data are in the same order and no implicit label re-alignment is needed;
# the selected columns come out in their original column order.
cols_extmiss = list(nacounts[nacounts > df.shape[0] * 0.8].index)
print(cols_extmiss)

['h1_diasbp_invasive_max', 'h1_diasbp_invasive_min', 'h1_mbp_invasive_max', 'h1_mbp_invasive_min', 'h1_sysbp_invasive_max', 'h1_sysbp_invasive_min', 'h1_albumin_max', 'h1_albumin_min', 'h1_bilirubin_max', 'h1_bilirubin_min', 'h1_bun_max', 'h1_bun_min', 'h1_calcium_max', 'h1_calcium_min', 'h1_creatinine_max', 'h1_creatinine_min', 'h1_hco3_max', 'h1_hco3_min', 'h1_hematocrit_max', 'h1_hematocrit_min', 'h1_lactate_max', 'h1_lactate_min', 'h1_platelets_max', 'h1_platelets_min', 'h1_wbc_max', 'h1_wbc_min', 'h1_arterial_pco2_max', 'h1_arterial_pco2_min', 'h1_arterial_ph_max', 'h1_arterial_ph_min', 'h1_arterial_po2_max', 'h1_arterial_po2_min', 'h1_pao2fio2ratio_max', 'h1_pao2fio2ratio_min']


In [52]:
# For every (disease flag, mostly-missing column) pair, run a chi-square test of
# independence between the disease flag and "is this column missing?", and
# report the pairs whose p-value is below 0.01.
# NOTE(review): the p-values are not corrected for the number of tests run
# (len(cols_diseases) * len(cols_extmiss)); consider a correction before
# drawing conclusions.
kdf = df  # alias kept so any later cell that refers to `kdf` still works

# The missingness masks do not depend on the disease column, so compute them
# once instead of once per disease.
missing_flags = kdf[cols_extmiss].isna()

for cold in cols_diseases:
    for colmiss in cols_extmiss:
        ct = pd.crosstab(kdf[cold], missing_flags[colmiss])
        p_value = chi2_contingency(ct)[1]  # index 1 of the result is the p-value
        # Only significant pairs produce output; non-significant pairs used to
        # emit blank lines too, which buried the results.
        if p_value < 0.01:
            print(f"{cold} : {colmiss} : {p_value}")







































































































cirrhosis : h1_diasbp_invasive_max : 2.2492257743997913e-13



cirrhosis : h1_diasbp_invasive_min : 2.2492257743997913e-13



cirrhosis : h1_mbp_invasive_max : 1.5786887413968286e-12



cirrhosis : h1_mbp_invasive_min : 1.5786887413968286e-12



cirrhosis : h1_sysbp_invasive_max : 2.0587711748322757e-13



cirrhosis : h1_sysbp_invasive_min : 2.0587711748322757e-13



cirrhosis : h1_albumin_max : 2.640787310946045e-10



cirrhosis : h1_albumin_min : 2.640787310946045e-10



cirrhosis : h1_bilirubin_max : 1.7628890249097542e-13



cirrhosis : h1_bilirubin_min : 1.7628890249097542e-13



























cirrhosis : h1_hematocrit_max : 0.0013054538313660008



cirrhosis : h1_hematocrit_min : 0.0013054538313660008



cirrhosis : h1_lactate_max : 4.900302801784983e-07



cirrhosis : h1_lactate_min : 4.900302801784983e-07

































cirrhosis : h1_pao2fio2ratio_m

In [27]:
# Show the mostly-missing columns side by side with the target column.
# Uses `cols_extmiss` (the list built in the missing-values cell); the former
# name `extmiss` is not defined by any cell in this notebook.
df[cols_extmiss + ["hospital_death"]]

Unnamed: 0,h1_diasbp_invasive_max,h1_diasbp_invasive_min,h1_mbp_invasive_max,h1_mbp_invasive_min,h1_sysbp_invasive_max,h1_sysbp_invasive_min,h1_albumin_max,h1_albumin_min,h1_bilirubin_max,h1_bilirubin_min,...,h1_wbc_min,h1_arterial_pco2_max,h1_arterial_pco2_min,h1_arterial_ph_max,h1_arterial_ph_min,h1_arterial_po2_max,h1_arterial_po2_min,h1_pao2fio2ratio_max,h1_pao2fio2ratio_min,hospital_death
0,,,,,,,,,,,...,,,,,,,,,,0
1,,,,,,,,,,,...,12.70,37.0,37.000,7.45,7.45,51.0,51.0,51.0,51.0,0
2,,,,,,,,,,,...,,,,,,,,,,0
3,62.0,44.0,92.0,71.0,136.0,106.0,,,,,...,8.80,36.0,33.000,7.37,7.34,337.0,265.0,337.0,337.0,0
4,,,,,,,,,,,...,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91708,,,,,,,,,,,...,5.68,48.0,48.000,7.34,7.34,144.0,144.0,,,0
91709,,,,,,,,,,,...,,,,,,,,,,0
91710,,,,,,,,,,,...,,15.0,14.997,6.93,6.93,136.0,136.0,,,0
91711,,,,,,,,,,,...,,,,,,,,,,0
