In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Minimum ICU length of stay of the cohort, in hours. It selects the data
# directory and is compared against the "min_los_icu" field of each split file.
MIN_LOS_ICU = 48

# Load data

In [3]:
# Directory (relative to the notebook) from which the split files and the
# cohort pickle for the configured minimum stay are read.
data_path = f'data/min{MIN_LOS_ICU:d}h/'

In [4]:
# cml splits: start from an empty placeholder, then fill it from disk.
splits_cml = None

# Read the split definition from its JSON file.
with open(f'{data_path}splits_cml.json') as split_file:
    splits_cml = json.load(split_file)

# Report the minimum ICU stay recorded for the cohort.
print(f'Minimum stay: {splits_cml["min_los_icu"]:d}h')

# Abort when the recorded minimum ICU stay differs from the configured one.
if not splits_cml["min_los_icu"] == MIN_LOS_ICU:
    raise Exception('Minimum stay not matching!')

Minimum stay: 48h


In [5]:
# lml splits, keyed by the number of clients.
splits_lml = {}

for n in (2, 4):
    # Read the split definition for this client count from its JSON file.
    with open(f'{data_path}splits_lml{n:d}.json') as split_file:
        splits_lml[n] = json.load(split_file)

    # Report the minimum ICU stay recorded for the cohort.
    print(f'Minimum stay {n:d} clients: {splits_lml[n]["min_los_icu"]:d}h')

    # Abort when the recorded minimum ICU stay differs from the configured one.
    if not splits_lml[n]["min_los_icu"] == MIN_LOS_ICU:
        raise Exception('Minimum stay not matching!')

Minimum stay 2 clients: 48h
Minimum stay 4 clients: 48h


In [6]:
# fl splits, keyed by the number of clients.
splits_fl = {}

for n in (2, 4):
    # Read the split definition for this client count from its JSON file.
    with open(f'{data_path}splits_fl{n:d}.json') as split_file:
        splits_fl[n] = json.load(split_file)

    # Report the minimum ICU stay recorded for the cohort.
    print(f'Minimum stay {n:d} clients: {splits_fl[n]["min_los_icu"]:d}h')

    # Abort when the recorded minimum ICU stay differs from the configured one.
    if not splits_fl[n]["min_los_icu"] == MIN_LOS_ICU:
        raise Exception('Minimum stay not matching!')

Minimum stay 2 clients: 48h
Minimum stay 4 clients: 48h


In [7]:
# Read the pickled cohort table whose file name carries the minimum ICU stay
# recorded in the cml splits.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
demographics = pd.read_pickle(f'{data_path}demographics_min{splits_cml["min_los_icu"]:d}h.pickle')

# Preview the first rows of the cohort.
demographics.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,gender,dod,admittime,dischtime,los_hospital,admission_age,ethnicity,...,first_hosp_stay,intime,outtime,los_icu,icustay_seq,first_icu_stay_current_hosp,first_icu_stay_patient,first_careunit,deathtime_icu,label_death_icu
0,3,145834,211552,M,2102-06-14,2101-10-20 19:08:00,2101-10-31 13:58:00,10.784722,76.526792,WHITE,...,True,2101-10-20 19:10:11,2101-10-26 20:43:09,6.06456,1,True,True,MICU,NaT,0
1,6,107064,228232,F,NaT,2175-05-30 07:15:00,2175-06-15 16:00:00,16.364583,65.942297,WHITE,...,True,2175-05-30 21:30:54,2175-06-03 13:39:54,3.672917,1,True,True,SICU,NaT,0
2,9,150750,220597,M,2149-11-14,2149-11-09 13:06:00,2149-11-14 10:15:00,4.88125,41.790228,UNKNOWN/NOT SPECIFIED,...,True,2149-11-09 13:07:02,2149-11-14 20:52:14,5.323056,1,True,True,MICU,2149-11-14 10:15:00,1
3,12,112213,232669,M,2104-08-20,2104-08-07 10:15:00,2104-08-20 02:57:00,12.695833,72.374177,WHITE,...,True,2104-08-08 02:08:17,2104-08-15 17:22:25,7.634815,1,True,True,SICU,NaT,0
4,13,143045,263738,F,NaT,2167-01-08 18:43:00,2167-01-15 15:15:00,6.855556,39.866118,WHITE,...,True,2167-01-08 18:44:25,2167-01-12 10:43:31,3.666042,1,True,True,CCU,NaT,0


## Filter demographics

In [8]:
# Collect the icustay ids that are used by the splits: the concatenation of
# the test ids of every fold in the cml split file. Iterating over the folds
# themselves (instead of a fixed count) keeps this correct for any number of
# folds stored in the file.
icustay_ids = []
for fold in splits_cml["folds"]:
    icustay_ids.extend(fold["ids_test"])

# Print statistics: number of used ids vs. rows in the cohort table, and the
# distinct label values. `.tolist()` converts the labels to native Python
# values so they print as e.g. [0, 1] independent of NumPy's scalar repr.
print(f'Number of patients:{len(icustay_ids):>9d}/{demographics.shape[0]:>5d}')
print(f'Labels:            {str(sorted(demographics.label_death_icu.unique().tolist())):>15}')

Number of patients:    10226/20200
Labels:                     [0, 1]


In [9]:
# Keep only the cohort rows whose icustay id occurs in the collected id list.
demographics = demographics.loc[demographics["icustay_id"].isin(icustay_ids).to_numpy()]

# Preview the filtered cohort.
demographics.head()

Unnamed: 0,subject_id,hadm_id,icustay_id,gender,dod,admittime,dischtime,los_hospital,admission_age,ethnicity,...,first_hosp_stay,intime,outtime,los_icu,icustay_seq,first_icu_stay_current_hosp,first_icu_stay_patient,first_careunit,deathtime_icu,label_death_icu
1,6,107064,228232,F,NaT,2175-05-30 07:15:00,2175-06-15 16:00:00,16.364583,65.942297,WHITE,...,True,2175-05-30 21:30:54,2175-06-03 13:39:54,3.672917,1,True,True,SICU,NaT,0
4,13,143045,263738,F,NaT,2167-01-08 18:43:00,2167-01-15 15:15:00,6.855556,39.866118,WHITE,...,True,2167-01-08 18:44:25,2167-01-12 10:43:31,3.666042,1,True,True,CCU,NaT,0
6,25,129635,203487,M,NaT,2160-11-02 02:06:00,2160-11-05 14:55:00,3.534028,58.95033,WHITE,...,True,2160-11-02 03:16:23,2160-11-05 16:23:27,3.546574,1,True,True,CCU,NaT,0
7,26,197661,244882,M,2128-02-25,2126-05-06 15:16:00,2126-05-13 15:00:00,6.988889,72.008179,UNKNOWN/NOT SPECIFIED,...,True,2126-05-07 09:52:30,2126-05-09 13:15:05,2.140683,1,True,True,CCU,NaT,0
12,41,101757,261027,M,2133-09-30,2132-12-31 10:30:00,2133-01-27 15:45:00,27.21875,56.642648,WHITE,...,True,2133-01-03 06:34:40,2133-01-06 16:01:33,3.393669,1,True,True,SICU,NaT,0


# Check splits

## Check test splits:

In [10]:
# For both client counts and each of the five folds, the test ids of the lml
# and fl splits must equal the test ids of the cml split.
for n_clients in (2, 4):
    for fold_idx in range(5):
        cml_test_ids = splits_cml["folds"][fold_idx]["ids_test"]

        if not cml_test_ids == splits_lml[n_clients]["folds"][fold_idx]["ids_test"]:
            raise Exception(f'Test-sets for lml ({n_clients:d} clients) and cml do not match!')

        if not cml_test_ids == splits_fl[n_clients]["folds"][fold_idx]["ids_test"]:
            raise Exception(f'Test-sets for fl ({n_clients:d} clients) and cml do not match!')

print("All checks passed! Test-sets are identical!")

All checks passed! Test-sets are identical!


## Check validation splits:

In [11]:
# For both client counts, each of the five folds and every client, the
# validation ids of the lml and fl splits must be equal.
for n_clients in (2, 4):
    for fold_idx in range(5):
        lml_clients = splits_lml[n_clients]["folds"][fold_idx]["clients"]
        fl_clients = splits_fl[n_clients]["folds"][fold_idx]["clients"]

        for client_idx in range(n_clients):
            if not lml_clients[client_idx]["ids_valid"] == fl_clients[client_idx]["ids_valid"]:
                raise Exception(f'Validation-sets for lml and fl do not match ({n_clients:d} clients)!')

print("All checks passed! Validation-sets are identical!")

All checks passed! Validation-sets are identical!


## Check training splits:

In [12]:
# For both client counts, each of the five folds and every client, the
# training ids of the lml and fl splits must be equal.
for n_clients in (2, 4):
    for fold_idx in range(5):
        lml_clients = splits_lml[n_clients]["folds"][fold_idx]["clients"]
        fl_clients = splits_fl[n_clients]["folds"][fold_idx]["clients"]

        for client_idx in range(n_clients):
            if not lml_clients[client_idx]["ids_train"] == fl_clients[client_idx]["ids_train"]:
                raise Exception(f'Training-sets for lml and fl do not match ({n_clients:d} clients)!')

print("All checks passed! Training-sets are identical!")

All checks passed! Training-sets are identical!


# Print class distribution

In [13]:
def _print_class_distribution(prefix, data, ids):
    """Print one line with the death/discharge counts of the stays listed in `ids`.

    Args:
        prefix: Left-hand label of the line, already padded for alignment.
        data: Table with an "icustay_id" and a "label_death_icu" column.
        ids: icustay ids that make up the split to summarize.
    """
    subset = data[data["icustay_id"].isin(ids)]

    pos = int((subset.label_death_icu == 1).sum())
    neg = int((subset.label_death_icu == 0).sum())
    tot = subset.shape[0]

    # An empty split has no meaningful percentages; report 0.0 rather than
    # dividing by zero.
    pos_pct = pos / tot * 100 if tot else 0.0
    neg_pct = neg / tot * 100 if tot else 0.0

    # .tolist() yields native Python values, so the labels print as e.g. [0, 1]
    # regardless of how the installed NumPy renders its scalar types.
    labels = sorted(subset.label_death_icu.unique().tolist())

    print(f'{prefix}{pos:>5,d} deaths ({pos_pct:>5.1f}%) / {neg:>5,d} discharges ({neg_pct:>5.1f}%)', end='')
    print(f'{"labels: " + str(labels):>37}')


def describe_fold(fold, data=None):
    """Print the class distribution of one cross-validation fold.

    One line is printed for the fold's test split, followed by a training and
    a validation line for every client of the fold. Each line shows the number
    and share of deaths (label 1) and discharges (label 0) plus the distinct
    label values found in that split.

    Args:
        fold: Mapping with the keys "fold" (integer fold number), "ids_test"
            (icustay ids of the test split) and "clients" (iterable of
            mappings with the keys "client", "ids_train" and "ids_valid").
        data: Table with an "icustay_id" and a "label_death_icu" column.
            Defaults to the notebook-level `demographics` table.

    Returns:
        None. The summary is written to standard output.
    """
    if data is None:
        # Backward-compatible default: use the cohort table of the notebook.
        data = demographics

    rule = '----------------------------------------------------------------------------------------------------'

    # print header:
    print('\n' + rule)
    print(f'Fold {fold["fold"]:d}')
    print(rule + '\n')

    # print test data class distribution:
    _print_class_distribution('Test:         ', data, fold["ids_test"])

    for client in fold["clients"]:
        # print header:
        print(f'\nClient {client["client"]:d}')

        # print training and validation data class distribution:
        _print_class_distribution('  Training:   ', data, client["ids_train"])
        _print_class_distribution('  Validation: ', data, client["ids_valid"])

    print('\n' + rule)
    

## Fold 1

In [14]:
# Class distribution of the first fold (index 0) of the 4-client fl splits.
describe_fold(splits_fl[4]["folds"][0])


----------------------------------------------------------------------------------------------------
Fold 1
----------------------------------------------------------------------------------------------------

Test:           111 deaths (  5.4%) / 1,935 discharges ( 94.6%)                       labels: [0, 1]

Client 1
  Training:      93 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 2
  Training:      93 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 3
  Training:      93 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 4
  Training:      93 deaths (  5.4%) /  164

## Fold 2

In [15]:
# Class distribution of the second fold (index 1) of the 4-client fl splits.
describe_fold(splits_fl[4]["folds"][1])


----------------------------------------------------------------------------------------------------
Fold 2
----------------------------------------------------------------------------------------------------

Test:           110 deaths (  5.4%) / 1,935 discharges ( 94.6%)                       labels: [0, 1]

Client 1
  Training:      94 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 2
  Training:      93 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 3
  Training:      93 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 4
  Training:      93 deaths (  5.4%) /  164

## Fold 3

In [16]:
# Class distribution of the third fold (index 2) of the 4-client fl splits.
describe_fold(splits_fl[4]["folds"][2])


----------------------------------------------------------------------------------------------------
Fold 3
----------------------------------------------------------------------------------------------------

Test:           110 deaths (  5.4%) / 1,935 discharges ( 94.6%)                       labels: [0, 1]

Client 1
  Training:      94 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 2
  Training:      93 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 3
  Training:      93 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 4
  Training:      93 deaths (  5.4%) /  164

## Fold 4

In [17]:
# Class distribution of the fourth fold (index 3) of the 4-client fl splits.
describe_fold(splits_fl[4]["folds"][3])


----------------------------------------------------------------------------------------------------
Fold 4
----------------------------------------------------------------------------------------------------

Test:           110 deaths (  5.4%) / 1,935 discharges ( 94.6%)                       labels: [0, 1]

Client 1
  Training:      94 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 2
  Training:      93 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 3
  Training:      93 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 4
  Training:      93 deaths (  5.4%) /  164

## Fold 5

In [18]:
# Class distribution of the fifth fold (index 4) of the 4-client fl splits.
describe_fold(splits_fl[4]["folds"][4])


----------------------------------------------------------------------------------------------------
Fold 5
----------------------------------------------------------------------------------------------------

Test:           110 deaths (  5.4%) / 1,935 discharges ( 94.6%)                       labels: [0, 1]

Client 1
  Training:      94 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 2
  Training:      93 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 3
  Training:      93 deaths (  5.4%) /  1645 discharges ( 94.6%)                       labels: [0, 1]
  Validation:    17 deaths (  5.5%) /   290 discharges ( 94.5%)                       labels: [0, 1]

Client 4
  Training:      93 deaths (  5.4%) /  164