In [1]:
import pandas as pd
import numpy as np
import json

# Load data

In [2]:
# cml split definition; remains None if the file below cannot be opened.
splits_cml = None

# Read the whole JSON document and decode it in one go.
with open("splits_cml.json") as handle:
    splits_cml = json.loads(handle.read())

# Report the minimum ICU stay stored alongside the splits.
min_stay_cml = splits_cml["min_los_icu"]
print(f'Minimum stay: {min_stay_cml:d}h')

Minimum stay: 48h


In [3]:
# lml split definitions, keyed by the number of clients.
splits_lml = {}

for n in (2, 4):
    # One JSON file per client count.
    with open(f"splits_lml{n:d}.json") as handle:
        splits_lml[n] = json.loads(handle.read())

    # Report the minimum ICU stay stored with this split file.
    min_stay = splits_lml[n]["min_los_icu"]
    print(f'Minimum stay {n:d} clients: {min_stay:d}h')

    # The split file must use the same minimum stay as the cml splits.
    if min_stay != splits_cml["min_los_icu"]:
        raise Exception('Minimum stay not matching cml!')

Minimum stay 2 clients: 48h
Minimum stay 4 clients: 48h


In [4]:
# fl split definitions, keyed by the number of clients.
splits_fl = {}

for n in (2, 4):
    # One JSON file per client count.
    with open(f"splits_fl{n:d}.json") as handle:
        splits_fl[n] = json.loads(handle.read())

    # Report the minimum ICU stay stored with this split file.
    min_stay = splits_fl[n]["min_los_icu"]
    print(f'Minimum stay {n:d} clients: {min_stay:d}h')

    # The split file must use the same minimum stay as the cml splits.
    if min_stay != splits_cml["min_los_icu"]:
        raise Exception('Minimum stay not matching cml!')

Minimum stay 2 clients: 48h
Minimum stay 4 clients: 48h


In [5]:
# load cohort:
# The file name is built from the minimum ICU stay stored in the cml splits
# (e.g. a value of 48 resolves to "demographics_min48h.pickle").
# NOTE(review): read_pickle can execute arbitrary code while unpickling --
# only load cohort files that come from a trusted source.
demographics = pd.read_pickle(f'demographics_min{splits_cml["min_los_icu"]:d}h.pickle')

# Check splits

## Check test splits:

In [7]:
# The test ids of each fold must be identical across cml, lml and fl.
for n in (2, 4):
    for i in range(5):
        # cml acts as the reference for this fold
        ids_cml = splits_cml["folds"][i]["ids_test"]

        ids_lml = splits_lml[n]["folds"][i]["ids_test"]
        if ids_lml != ids_cml:
            raise Exception(f'Test-sets for lml ({n:d} clients) and cml do not match!')

        # fl is only looked up once lml has passed, so lml problems surface first
        ids_fl = splits_fl[n]["folds"][i]["ids_test"]
        if ids_fl != ids_cml:
            raise Exception(f'Test-sets for fl ({n:d} clients) and cml do not match!')

print("All checks passed! Test-sets are identical!")

All checks passed! Test-sets are identical!


## Check validation splits:

In [8]:
# lml and fl must give every client the same validation ids in every fold.
for n in (2, 4):
    for i in range(5):
        clients_lml = splits_lml[n]["folds"][i]["clients"]
        clients_fl = splits_fl[n]["folds"][i]["clients"]

        # compare the first n client entries pairwise (same position in both lists)
        for j in range(n):
            if clients_lml[j]["ids_valid"] != clients_fl[j]["ids_valid"]:
                raise Exception(f'Validation-sets for lml and fl do not match ({n:d} clients)!')

print("All checks passed! Validation-sets are identical!")

All checks passed! Validation-sets are identical!


## Check training splits:

In [9]:
# lml and fl must give every client the same training ids in every fold.
for n in (2, 4):
    for i in range(5):
        clients_lml = splits_lml[n]["folds"][i]["clients"]
        clients_fl = splits_fl[n]["folds"][i]["clients"]

        # compare the first n client entries pairwise (same position in both lists)
        for j in range(n):
            if clients_lml[j]["ids_train"] != clients_fl[j]["ids_train"]:
                raise Exception(f'Training-sets for lml and fl do not match ({n:d} clients)!')

print("All checks passed! Training-sets are identical!")

All checks passed! Training-sets are identical!


# Print class distribution

In [10]:
def _class_distribution(cohort, ids):
    """Return the 'deaths / discharges' summary text for the given subject ids.

    Rows of ``cohort`` whose ``subject_id`` is in ``ids`` are selected and
    their ``label_death_icu`` values are counted (1 = death, 0 = discharge).
    """
    # Select only the label column of the matching rows instead of
    # NaN-filling a full-size copy of the frame.
    labels = cohort.loc[cohort["subject_id"].isin(ids), "label_death_icu"]

    pos = int((labels == 1).sum())
    neg = int((labels == 0).sum())
    tot = pos + neg

    # An empty selection has no defined share; report nan instead of
    # dividing by zero.
    pos_share = pos / tot * 100 if tot else float("nan")
    neg_share = neg / tot * 100 if tot else float("nan")

    return f'{pos:>5,d} deaths ({pos_share:>5,.1f}%) / {neg:>5,d} discharges ({neg_share:>5,.1f}%)'


def describe_fold(fold, cohort=None):
    """Print the class distribution of one fold.

    The test set is summarised first, followed by the training and
    validation set of every client listed in the fold.

    Parameters
    ----------
    fold : dict
        Fold description with the keys ``"fold"`` (int, shown in the
        header), ``"ids_test"`` and ``"clients"``. Every entry of
        ``"clients"`` is a dict with the keys ``"client"`` (int),
        ``"ids_train"`` and ``"ids_valid"``.
    cohort : pandas.DataFrame, optional
        Table with the columns ``subject_id`` and ``label_death_icu``.
        When omitted, the notebook-level ``demographics`` frame is used.

    Returns
    -------
    None
        The summary is written to stdout.
    """
    if cohort is None:
        # Fall back to the cohort loaded earlier in the notebook.
        cohort = demographics

    # print header:
    print('\n----------------------------------------------------------------------------------------------------')
    print(f'Fold {fold["fold"]:d}')
    print('----------------------------------------------------------------------------------------------------\n')

    # print test data class distribution:
    print(f'Test:         {_class_distribution(cohort, fold["ids_test"])}')

    for client in fold["clients"]:
        # print header:
        print(f'\nClient {client["client"]:d}')

        # print training and validation data class distribution:
        print(f'  Training:   {_class_distribution(cohort, client["ids_train"])}')
        print(f'  Validation: {_class_distribution(cohort, client["ids_valid"])}')

    print('\n----------------------------------------------------------------------------------------------------')
    

## Fold 1

In [12]:
# Class distribution of the first fold (index 0) of the 4-client fl split.
describe_fold(splits_fl[4]["folds"][0])


----------------------------------------------------------------------------------------------------
Fold 1
----------------------------------------------------------------------------------------------------

Test:            81 deaths ( 10.3%) /   705 discharges ( 89.7%)

Client 1
  Training:      65 deaths (  9.1%) /   648 discharges ( 90.9%)
  Validation:    13 deaths ( 11.6%) /    99 discharges ( 88.4%)

Client 2
  Training:      61 deaths (  9.1%) /   608 discharges ( 90.9%)
  Validation:    11 deaths (  8.5%) /   118 discharges ( 91.5%)

Client 3
  Training:      60 deaths (  9.1%) /   599 discharges ( 90.9%)
  Validation:     6 deaths (  5.1%) /   112 discharges ( 94.9%)

Client 4
  Training:      63 deaths (  9.5%) /   597 discharges ( 90.5%)
  Validation:    20 deaths ( 15.9%) /   106 discharges ( 84.1%)

----------------------------------------------------------------------------------------------------


## Fold 2

In [13]:
# Class distribution of the second fold (index 1) of the 4-client fl split.
describe_fold(splits_fl[4]["folds"][1])


----------------------------------------------------------------------------------------------------
Fold 2
----------------------------------------------------------------------------------------------------

Test:            93 deaths ( 11.6%) /   709 discharges ( 88.4%)

Client 1
  Training:      64 deaths (  9.7%) /   596 discharges ( 90.3%)
  Validation:     8 deaths (  6.0%) /   126 discharges ( 94.0%)

Client 2
  Training:      62 deaths (  8.9%) /   631 discharges ( 91.1%)
  Validation:    13 deaths ( 11.5%) /   100 discharges ( 88.5%)

Client 3
  Training:      55 deaths (  8.7%) /   577 discharges ( 91.3%)
  Validation:     5 deaths (  4.3%) /   111 discharges ( 95.7%)

Client 4
  Training:      70 deaths ( 10.0%) /   632 discharges ( 90.0%)
  Validation:    10 deaths (  8.3%) /   110 discharges ( 91.7%)

----------------------------------------------------------------------------------------------------


## Fold 3

In [14]:
# Class distribution of the third fold (index 2) of the 4-client fl split.
describe_fold(splits_fl[4]["folds"][2])


----------------------------------------------------------------------------------------------------
Fold 3
----------------------------------------------------------------------------------------------------

Test:            62 deaths (  7.7%) /   739 discharges ( 92.3%)

Client 1
  Training:      70 deaths ( 10.5%) /   594 discharges ( 89.5%)
  Validation:     8 deaths (  6.3%) /   118 discharges ( 93.7%)

Client 2
  Training:      58 deaths (  8.3%) /   639 discharges ( 91.7%)
  Validation:     7 deaths (  6.3%) /   104 discharges ( 93.7%)

Client 3
  Training:      77 deaths ( 11.8%) /   578 discharges ( 88.2%)
  Validation:    19 deaths ( 15.4%) /   104 discharges ( 84.6%)

Client 4
  Training:      67 deaths (  9.9%) /   608 discharges ( 90.1%)
  Validation:    12 deaths ( 10.0%) /   108 discharges ( 90.0%)

----------------------------------------------------------------------------------------------------


## Fold 4

In [15]:
# Class distribution of the fourth fold (index 3) of the 4-client fl split.
describe_fold(splits_fl[4]["folds"][3])


----------------------------------------------------------------------------------------------------
Fold 4
----------------------------------------------------------------------------------------------------

Test:            77 deaths (  9.5%) /   735 discharges ( 90.5%)

Client 1
  Training:      65 deaths (  9.7%) /   606 discharges ( 90.3%)
  Validation:    17 deaths ( 13.9%) /   105 discharges ( 86.1%)

Client 2
  Training:      56 deaths (  8.6%) /   596 discharges ( 91.4%)
  Validation:    18 deaths ( 14.2%) /   109 discharges ( 85.8%)

Client 3
  Training:      67 deaths (  9.7%) /   621 discharges ( 90.3%)
  Validation:    10 deaths (  8.3%) /   110 discharges ( 91.7%)

Client 4
  Training:      60 deaths (  9.1%) /   600 discharges ( 90.9%)
  Validation:    10 deaths (  8.3%) /   110 discharges ( 91.7%)

----------------------------------------------------------------------------------------------------


## Fold 5

In [16]:
# Class distribution of the fifth fold (index 4) of the 4-client fl split.
describe_fold(splits_fl[4]["folds"][4])


----------------------------------------------------------------------------------------------------
Fold 5
----------------------------------------------------------------------------------------------------

Test:            67 deaths (  8.7%) /   704 discharges ( 91.3%)

Client 1
  Training:      74 deaths ( 10.9%) /   605 discharges ( 89.1%)
  Validation:    19 deaths ( 15.6%) /   103 discharges ( 84.4%)

Client 2
  Training:      57 deaths (  8.7%) /   596 discharges ( 91.3%)
  Validation:    13 deaths (  9.4%) /   125 discharges ( 90.6%)

Client 3
  Training:      58 deaths (  8.5%) /   621 discharges ( 91.5%)
  Validation:     9 deaths (  7.1%) /   118 discharges ( 92.9%)

Client 4
  Training:      73 deaths ( 10.6%) /   615 discharges ( 89.4%)
  Validation:    10 deaths (  8.7%) /   105 discharges ( 91.3%)

----------------------------------------------------------------------------------------------------
