In [1]:
# load modules and set configurations
import numpy as np
import pandas as pd

import os, copy, random, pickle, gc
from itertools import product
from tqdm import tqdm

pd.set_option('display.max_columns', None)

import torch

def seed_everything(seed):
    """Seed every random number generator this notebook touches.

    Seeds Python's ``random`` module, NumPy's global generator and the torch
    CPU / CUDA generators, exports ``PYTHONHASHSEED`` and puts cuDNN into
    deterministic mode with benchmarking switched off.

    Parameters
    ----------
    seed : int
        Value handed to every seeding call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python, NumPy, torch CPU, torch current-GPU and torch all-GPU generators.
    for seed_fn in (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_fn(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True


# Seed used for every split drawn in this notebook.
SEED = 42

# 1. Data Preparation

In [2]:
# Start pandarallel; the log printed below reports the worker count and transfer mode.
# NOTE(review): no pandarallel call is visible in the rest of this section -- confirm
# that this initialisation is still needed.
from pandarallel import pandarallel as pdrl
pdrl.initialize()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


## 1.1 splitting

In [3]:
# Load the stay-id groups and the resampled ED table.
resampling_hour = '2h'

# NOTE: pickle.load can execute arbitrary code -- only open pickles you trust.
with open(f'stay_ids-{resampling_hour}.pkl', 'rb') as f:
    stay_ids = pickle.load(f)

# The six look-ahead label columns are dropped right after loading.
drop_cols = list(map(lambda i: f'label_after_{i}hour', range(1, 7)))

# With resampling_hour == '2h' the name below expands to 'ED_new_2hour_forward_full.ftr'.
df = pd.read_feather(f'ED_new_{resampling_hour}our_forward_full.ftr')
df = df.drop(columns=drop_cols)

In [4]:
# Derived columns.
# 'true' is 1 on every row whose stay_id is listed in stay_ids['a'], otherwise 0.
in_group_a = df['stay_id'].isin(stay_ids['a'])
df['true'] = np.where(in_group_a, 1, 0)

# 'n_seq' repeats the positions 0..23 once per entry of stay_ids['all'].
df['n_seq'] = np.tile(np.arange(24), len(stay_ids['all']))

In [5]:
# Keep only real observations: rows whose 'time' is the string '0' are zero padding.
df_nzp = df.loc[df['time'] != '0'].reset_index(drop=True)

# esi setting 1: 1,2/ 3,4,5
# esi setting 2: 1/ 2,3,4,5
# esi setting 3: 1,2,3/ 4,5
# low_esi{k} is 1 for a stay as soon as one of its rows has an acuity outside the
# first group of setting k, otherwise 0.
for setting, first_group in enumerate(([1, 2], [1], [1, 2, 3]), start=1):
    outside_group = ~df_nzp['acuity'].isin(first_group)
    flagged_ids = df_nzp.loc[outside_group, 'stay_id'].unique()
    df_nzp[f'low_esi{setting}'] = np.where(df_nzp['stay_id'].isin(flagged_ids), 1, 0)

In [6]:
# Make stay-level splits for each ESI setting (ver) and each ESI group (low_esi).
# Columns written to df_nzp:
#   split_esi{ver}_{low_esi}     -> 'trn' / 'val_tr' / 'val_th' / 'tst'
#   split_esi{ver}_{low_esi}_ml  -> 'trn' / 'val' / 'tst' (stays labelled 'val_tr' above get no value)
#   split_esi{ver}, split_esi{ver}_ml -> the two groups pooled per split name
for ver in [1, 2, 3]:
    for low_esi in [0, 1]:
        tmp_df = df_nzp.query(f'low_esi{ver}=={low_esi}').reset_index(drop=True)
        
        split_ids = {}
        # a_ids: stays with true==1, n_ids: stays with true==0 (within this group)
        a_ids = tmp_df.query('true==1')['stay_id'].unique()
        n_ids = tmp_df.query('true==0')['stay_id'].unique()
        a_len = a_ids.__len__()
        n_len = n_ids.__len__()

        # Reseed the global generators and create a dedicated RandomState for this group.
        seed_everything(SEED)
        rng = np.random.RandomState(SEED)

        # val_th takes a_len - a_len//2 stays of each class; a_len true==0 stays are held
        # back for val_th/tst, and the remaining true==0 stays are divided 9:1 into trn/val_tr.
        # NOTE(review): tst_len is never read below -- 'tst' simply receives what is left over.
        tst_len = a_len//2
        val_th_len = a_len - a_len//2
        trn_len = round((n_len - a_len)/10*9)
        val_tr_len = n_len - a_len - trn_len

        split_ids['trn'] = rng.choice(n_ids, trn_len, replace=False)
        n_ids = np.setdiff1d(n_ids, split_ids['trn'])

        # NOTE(review): this draw uses the global np.random generator, not `rng` like the
        # other draws. It is still seeded, because seed_everything(SEED) above calls
        # np.random.seed -- confirm whether `rng.choice` was intended.
        split_ids['val_tr'] = np.random.choice(n_ids, val_tr_len, replace=False)
        n_ids = np.setdiff1d(n_ids, split_ids['val_tr'])

        # val_th: val_th_len true==0 stays followed by val_th_len true==1 stays
        split_ids['val_th'] = np.concatenate([rng.choice(n_ids, val_th_len, replace=False), rng.choice(a_ids, val_th_len, replace=False)])
        a_ids = np.setdiff1d(a_ids, split_ids['val_th'])
        n_ids = np.setdiff1d(n_ids, split_ids['val_th'])

        # everything not drawn so far becomes the test set
        split_ids['tst'] = np.concatenate([n_ids, a_ids])
        
        for i, j in split_ids.items():
            cond = df_nzp['stay_id'].isin(j)
            df_nzp.loc[cond, f'split_esi{ver}_{low_esi}'] = i

        # ML variant: re-use the val_th stays, half for training and half for validation.
        cond = df_nzp[f'split_esi{ver}_{low_esi}'] == 'val_th'
        tmp_df = df_nzp[cond].reset_index(drop=True)
        a_ids = tmp_df.query('true==1')['stay_id'].unique().tolist()
        n_ids = tmp_df.query('true==0')['stay_id'].unique().tolist()
        # q + r = half of the true==1 val_th stays, rounded up
        a_len = len(a_ids); q = a_len//2; r = a_len%2

        # The first q+r stays of each class join 'trn'; the rest form 'val'.
        # (a_ids / n_ids are Python lists here, so `+` concatenates them.)
        split_ids = {
            'trn': np.concatenate([df_nzp[df_nzp[f'split_esi{ver}_{low_esi}']=='trn']['stay_id'].unique(), a_ids[:q+r]+n_ids[:q+r]]),
            'val': a_ids[q+r:]+n_ids[q+r:],
            'tst': df_nzp[df_nzp[f'split_esi{ver}_{low_esi}']=='tst']['stay_id'].unique()
        }

        for i, j in split_ids.items():
            cond = df_nzp['stay_id'].isin(j)
            df_nzp.loc[cond, f'split_esi{ver}_{low_esi}_ml'] = i

    # Pool both groups per split name for the ML variant
    # (`low_esi` inside the comprehension is the comprehension's own loop variable).
    split_ids = {s: np.concatenate([df_nzp[df_nzp[f'split_esi{ver}_{low_esi}_ml']==s]['stay_id'].unique() for low_esi in [0, 1]]) for s in ['trn', 'val', 'tst']} 

    for i, j in split_ids.items():
        cond = df_nzp['stay_id'].isin(j)
        df_nzp.loc[cond, f'split_esi{ver}_ml'] = i

    # Same pooling for the four-way split.
    split_ids = {s: np.concatenate([df_nzp[df_nzp[f'split_esi{ver}_{low_esi}']==s]['stay_id'].unique() for low_esi in [0, 1]]) for s in ['trn', 'val_tr', 'val_th', 'tst']} 
    
    for i, j in split_ids.items():
        cond = df_nzp['stay_id'].isin(j)
        df_nzp.loc[cond, f'split_esi{ver}'] = i

In [11]:
# Sanity check for the splits.
# For every (ver, low_esi): print stay counts per split, then whether the stays that
# received a split label are exactly the stays of that ESI group.
# np.array_equal is used for the comparison so that id sets of different sizes print
# False instead of failing inside an element-wise `==`.
for ver, low_esi in product([1, 2, 3], [0, 1]):
    split_col = f'split_esi{ver}_{low_esi}'
    print(df_nzp.groupby(split_col)['stay_id'].nunique())
    print(df_nzp.groupby(f'{split_col}_ml')['stay_id'].nunique())

    group_ids = np.sort(df_nzp.loc[df_nzp[f'low_esi{ver}'] == low_esi, 'stay_id'].unique())
    labelled_ids = np.sort(df_nzp.loc[~df_nzp[split_col].isna(), 'stay_id'].unique())
    print(np.array_equal(group_ids, labelled_ids))
    print('================================================================================================================================================')

# For every ver: the pooled ML split must cover exactly the stays whose pooled
# four-way split is not 'val_tr'.
for ver in [1, 2, 3]:
    print(df_nzp.groupby(f'split_esi{ver}_ml')['stay_id'].nunique())

    non_val_tr_ids = np.sort(df_nzp.loc[df_nzp[f'split_esi{ver}'] != 'val_tr', 'stay_id'].unique())
    ml_ids = np.sort(df_nzp.loc[~df_nzp[f'split_esi{ver}_ml'].isna(), 'stay_id'].unique())
    print(np.array_equal(non_val_tr_ids, ml_ids))
    print('================================================================================================================================================')

split_esi1_0
trn       79482
tst        9136
val_th     9136
val_tr     8831
Name: stay_id, dtype: int64
split_esi1_0_ml
trn    84050
tst     9136
val     4568
Name: stay_id, dtype: int64
True
split_esi1_1
trn       169855
tst         1246
val_th      1248
val_tr     18873
Name: stay_id, dtype: int64
split_esi1_1_ml
trn    170479
tst      1246
val       624
Name: stay_id, dtype: int64
True
split_esi2_0
trn       6853
tst       2998
val_th    2998
val_tr     761
Name: stay_id, dtype: int64
split_esi2_0_ml
trn    8353
tst    2998
val    1498
Name: stay_id, dtype: int64
True
split_esi2_1
trn       242484
tst         7384
val_th      7386
val_tr     26943
Name: stay_id, dtype: int64
split_esi2_1_ml
trn    246178
tst      7384
val      3692
Name: stay_id, dtype: int64
True
split_esi3_0
trn       228495
tst        10354
val_th     10356
val_tr     25388
Name: stay_id, dtype: int64
split_esi3_0_ml
trn    233673
tst     10354
val      5178
Name: stay_id, dtype: int64
True
split_esi3_1
trn     

In [32]:
# Copy the per-stay ESI flags and split labels back onto the zero-padded table.
cols = [name for name in df_nzp.columns if '_esi' in name]
stay_labels = df_nzp[['stay_id'] + cols].drop_duplicates()
df = pd.merge(df, stay_labels, how='outer', on='stay_id')

In [33]:
# Persist the table with split labels; the following sections reload it from this file.
df.to_feather('data-df-split.ftr')

## 1.2 scaling

In [2]:
df = pd.read_feather('data-df-split.ftr')

# Identifier, label and split/ESI-flag columns -- everything that is not a feature.
tmp = ['stay_id', 'time', 'true', 'n_seq'] + [c for c in df.columns if '_esi' in c]

# All remaining columns are features; only those with more than two distinct
# values are standardised.
features = [c for c in df.columns if c not in tmp]
scaling_cols = [c for c in features if df[c].nunique() > 2]

# Data type conversion: float32 features, integer ids and labels.
df[features] = df[features].astype(np.float32)
df[['stay_id', 'true']] = df[['stay_id', 'true']].astype(int)

In [None]:
def _standardize_and_save(frame, split_col, splits, out_path, cols, drop_padding, eps=1e-09):
    """Standardise `cols` on the non-padded rows of `frame` and write it to feather.

    Mean and standard deviation are taken from the non-padded rows whose
    `split_col` equals 'trn'. They are then applied to the non-padded rows of
    every split named in `splits`. Padded rows (time == '0') and rows whose
    split label is not in `splits` are left as they are.

    Parameters
    ----------
    frame : pd.DataFrame
        Input table; it is copied, never modified.
    split_col : str
        Column holding the split label of each row.
    splits : list of str
        Split labels whose rows are scaled.
    out_path : str
        Destination feather file.
    cols : list of str
        Columns to standardise.
    drop_padding : bool
        If True, padded rows are removed before writing.
    eps : float
        Added to the standard deviation to avoid division by zero.
    """
    scaled = frame.copy()
    not_padded = scaled['time'] != '0'

    # Training statistics are computed once and shared by all splits.
    trn_rows = scaled.loc[(scaled[split_col] == 'trn') & not_padded, cols]
    mus = trn_rows.mean(axis=0)
    stds = trn_rows.std(axis=0)

    for split_name in splits:
        cond = (scaled[split_col] == split_name) & not_padded
        scaled.loc[cond, cols] = (scaled.loc[cond, cols] - mus) / (stds + eps)

    if drop_padding:
        scaled = scaled[not_padded]
    scaled.reset_index(drop=True).to_feather(out_path)


# Split names per file-name suffix: '' is the four-way split, '_ml' the ML split.
_SPLITS_BY_SUFFIX = {
    '': ['trn', 'val_tr', 'val_th', 'tst'],
    '_ml': ['trn', 'val', 'tst'],
}

for ver in tqdm([1, 2, 3]):
    # One pair of files per ESI group ...
    for low_esi in [0, 1]:
        tmp_df = df.query(f'low_esi{ver}=={low_esi}').reset_index(drop=True)
        for ml, splits in _SPLITS_BY_SUFFIX.items():
            _standardize_and_save(
                tmp_df,
                split_col=f'split_esi{ver}_{low_esi}{ml}',
                splits=splits,
                out_path=f'data-df-std-esi{ver}-{low_esi}{ml}.ftr',
                cols=scaling_cols,
                drop_padding=(ml == '_ml'),  # the ML files are stored without padding rows
            )

    # ... and one pair for both groups pooled.
    for ml, splits in _SPLITS_BY_SUFFIX.items():
        _standardize_and_save(
            df,
            split_col=f'split_esi{ver}{ml}',
            splits=splits,
            out_path=f'data-df-std-esi{ver}{ml}.ftr',
            cols=scaling_cols,
            drop_padding=(ml == '_ml'),
        )

## 1.3 preparing for lstm ae

In [2]:
df = pd.read_feather('data-df-split.ftr')
max_seq_len = 24  # rows per stay in the padded table

# Identifier / label / split columns are excluded from the feature list.
tmp = ['stay_id', 'time', 'true', 'n_seq'] + [c for c in df.columns if '_esi' in c]
features = [c for c in df.columns if c not in tmp]

# n_zp_cols: stay id plus every '_esi' column; zp_cols: all other columns.
n_zp_cols = ['stay_id'] + [c for c in df.columns if '_esi' in c]
zp_cols = [c for c in df.columns if c not in n_zp_cols]

In [3]:
def timestamps_split_zero_padding(df, max_seq_len=max_seq_len):
    """Expand every stay into all of its zero-padded prefixes.

    `df` is read as consecutive blocks of `max_seq_len` rows. Within a block,
    column 0 is used as the stay id and column 1 as the time marker, where the
    string '0' flags a padded row. For a block with L non-padded rows the
    output contains L sequences: the prefixes of length 1 .. L-1, whose
    remaining rows are overwritten with a padding row, followed by the
    unmodified block. Blocks with L <= 1 contribute only the unmodified block.

    A generated padding row is `[stay id, '0', 0, ..., 0, last-column value of
    the block's first row]`.
    NOTE(review): every column between 'time' and the last one is zeroed in
    generated padding rows -- confirm that the last column is the one meant to
    be carried over for the frames passed in by callers.

    Parameters
    ----------
    df : pd.DataFrame
        Table whose row count is a multiple of `max_seq_len`.
    max_seq_len : int
        Number of rows per block.

    Returns
    -------
    pd.DataFrame
        All generated sequences stacked row-wise, with the columns of `df`.
    """
    n_col = len(df.columns)
    tmp = df.values.reshape(-1, max_seq_len, n_col)
    mask_padded = tmp[:, :, 1] == '0'
    # Use the parameter rather than a literal 24 so other block lengths work too.
    len_seqs = max_seq_len - np.sum(mask_padded, axis=1)
    seqs = []

    for idx, ls in tqdm(enumerate(len_seqs)):
        block = tmp[idx]

        if ls > 1:
            # The padding row is the same for every prefix of this block: build it once.
            pad_row = [block[0, 0], '0'] + [0] * (n_col - 3) + [block[0, -1]]
            for i in range(1, ls):
                _seq = block.copy()
                _seq[i:, :] = [pad_row] * (max_seq_len - i)
                seqs.append(_seq)

        # The complete block is always emitted last.
        seqs.append(block.copy())

    seqs = np.array(seqs)
    seqs = seqs.reshape(-1, n_col)

    return pd.DataFrame(seqs, columns=df.columns)

In [4]:
# data_dict[ver][low_esi][split] -> DataFrame, with low_esi in {0, 1, 'all'}.
# Only 'val_th' and 'tst' are expanded into zero-padded prefixes.
data_dict = {ver: {low_esi: {} for low_esi in [0, 1, 'all']} for ver in [1, 2, 3]}
for ver in [1, 2, 3]:
    # standardised tables of the two ESI groups
    for low_esi in [0, 1]:
        split_col = f'split_esi{ver}_{low_esi}'
        df = pd.read_feather(f'data-df-std-esi{ver}-{low_esi}.ftr')

        data = {}
        for i in ['trn', 'val_tr', 'val_th', 'tst']:
            part = df.query(f'{split_col}==@i').reset_index(drop=True)
            data[i] = timestamps_split_zero_padding(part) if i in ('val_th', 'tst') else part

        data_dict[ver][low_esi] = copy.deepcopy(data)

    # standardised table of both groups pooled
    split_col = f'split_esi{ver}'
    df = pd.read_feather(f'data-df-std-esi{ver}.ftr')

    data = {}
    for i in ['trn', 'val_tr', 'val_th', 'tst']:
        part = df.query(f'{split_col}==@i').reset_index(drop=True)
        data[i] = timestamps_split_zero_padding(part) if i in ('val_th', 'tst') else part

    data_dict[ver]['all'] = copy.deepcopy(data)

    gc.collect()

9136it [00:06, 1371.58it/s]
9136it [00:06, 1396.45it/s]
1248it [00:01, 1169.13it/s]
1246it [00:00, 1282.33it/s]
10384it [00:07, 1388.64it/s]
10382it [00:07, 1370.41it/s]
2998it [00:01, 1581.59it/s]
2998it [00:01, 1619.13it/s]
7386it [00:05, 1368.75it/s]
7384it [00:05, 1412.93it/s]
10384it [00:07, 1408.52it/s]
10382it [00:07, 1454.15it/s]
10356it [00:07, 1359.05it/s]
10354it [00:07, 1408.52it/s]
28it [00:00, 2005.23it/s]
28it [00:00, 2370.57it/s]
10384it [00:07, 1372.86it/s]
10382it [00:07, 1414.35it/s]


In [5]:
# Replace every frame by a dict of arrays shaped (n_sequences, max_seq_len, width).
for ver in [1, 2, 3]:
    for low_esi in [0, 1, 'all']:
        group = data_dict[ver][low_esi]
        for i in tqdm(['trn', 'val_tr', 'val_th', 'tst']):
            frame = group[i]

            tmp = {'X': frame[features].values.reshape((-1, max_seq_len, len(features)))}
            # label, stay id and position keep a trailing axis of width 1
            for key, column in (('y', 'true'), ('ids', 'stay_id'), ('n_seq', 'n_seq')):
                tmp[key] = frame[column].values.reshape((-1, max_seq_len, 1))

            group[i] = copy.deepcopy(tmp)

            gc.collect()

100%|██████████| 4/4 [01:11<00:00, 17.98s/it]
100%|██████████| 4/4 [00:12<00:00,  3.05s/it]
100%|██████████| 4/4 [01:23<00:00, 20.87s/it]
100%|██████████| 4/4 [00:21<00:00,  5.35s/it]
100%|██████████| 4/4 [00:58<00:00, 14.72s/it]
100%|██████████| 4/4 [01:19<00:00, 20.00s/it]
100%|██████████| 4/4 [01:20<00:00, 20.22s/it]
100%|██████████| 4/4 [00:01<00:00,  3.74it/s]
100%|██████████| 4/4 [01:20<00:00, 20.18s/it]


In [8]:
# Persist the LSTM-AE arrays built above.
with open(f'data-dict-for_lstm_ae.pkl', 'wb') as f:
    pickle.dump(data_dict, f)

## 1.4 preparing for ML

In [2]:
df = pd.read_feather('data-df-split.ftr')

# Feature columns = everything except identifiers, labels and '_esi' columns.
tmp = ['stay_id', 'time', 'true', 'n_seq'] + [c for c in df.columns if '_esi' in c]
features = [c for c in df.columns if c not in tmp]

In [3]:
# data_dict[ver][low_esi][split] -> DataFrame of non-padded rows for the ML models.
data_dict = {ver: {low_esi: {} for low_esi in [0, 1, 'all']} for ver in [1, 2, 3]}
for ver in [1, 2, 3]:
    # the two ESI groups
    for low_esi in [0, 1]:
        split_col = f'split_esi{ver}_{low_esi}_ml'
        df = pd.read_feather(f'data-df-std-esi{ver}-{low_esi}_ml.ftr')
        df = df.loc[df['time'] != '0'].reset_index(drop=True)

        data = {}
        for i in ['trn', 'val', 'tst']:
            data[i] = df.query(f'{split_col}==@i').reset_index(drop=True)

        data_dict[ver][low_esi] = copy.deepcopy(data)

    # both groups pooled
    split_col = f'split_esi{ver}_ml'
    df = pd.read_feather(f'data-df-std-esi{ver}_ml.ftr')
    df = df.loc[df['time'] != '0'].reset_index(drop=True)

    data = {}
    for i in ['trn', 'val', 'tst']:
        data[i] = df.query(f'{split_col}==@i').reset_index(drop=True)

    data_dict[ver]['all'] = copy.deepcopy(data)

    gc.collect()

In [4]:
# Replace every frame by a dict of row-wise arrays: 2-D features, 1-D label / id / position.
for ver in [1, 2, 3]:
    for low_esi in [0, 1, 'all']:
        group = data_dict[ver][low_esi]
        for i in tqdm(['trn', 'val', 'tst']):
            frame = group[i]

            tmp = {'X': frame[features].values}
            for key, column in (('y', 'true'), ('ids', 'stay_id'), ('n_seq', 'n_seq')):
                tmp[key] = frame[column].values.ravel()

            group[i] = copy.deepcopy(tmp)

100%|██████████| 3/3 [00:00<00:00, 24.92it/s]
100%|██████████| 3/3 [00:00<00:00, 12.79it/s]
100%|██████████| 3/3 [00:00<00:00, 10.50it/s]
100%|██████████| 3/3 [00:00<00:00, 167.54it/s]
100%|██████████| 3/3 [00:00<00:00, 10.91it/s]
100%|██████████| 3/3 [00:00<00:00,  6.63it/s]
100%|██████████| 3/3 [00:00<00:00, 10.66it/s]
100%|██████████| 3/3 [00:00<00:00, 150.78it/s]
100%|██████████| 3/3 [00:00<00:00, 10.81it/s]


In [5]:
# Persist the ML arrays built above.
with open(f'data-dict-for_ml.pkl', 'wb') as f:
    pickle.dump(data_dict, f)

## 1.5 preparing for AE

In [2]:
df = pd.read_feather('data-df-split.ftr')

# Feature columns = everything except identifiers, labels and '_esi' columns.
tmp = ['stay_id', 'time', 'true', 'n_seq'] + [c for c in df.columns if '_esi' in c]
features = [c for c in df.columns if c not in tmp]

In [3]:
# data_dict[ver][low_esi][split] -> DataFrame of non-padded rows for the plain AE.
data_dict = {ver: {low_esi: {} for low_esi in [0, 1, 'all']} for ver in [1, 2, 3]}
for ver in [1, 2, 3]:
    # the two ESI groups
    for low_esi in [0, 1]:
        split_col = f'split_esi{ver}_{low_esi}'
        df = pd.read_feather(f'data-df-std-esi{ver}-{low_esi}.ftr')
        df = df.loc[df['time'] != '0'].reset_index(drop=True)

        data = {}
        for i in ['trn', 'val_tr', 'val_th', 'tst']:
            data[i] = df.query(f'{split_col}==@i').reset_index(drop=True)

        data_dict[ver][low_esi] = copy.deepcopy(data)

    # both groups pooled
    split_col = f'split_esi{ver}'
    df = pd.read_feather(f'data-df-std-esi{ver}.ftr')
    df = df.loc[df['time'] != '0'].reset_index(drop=True)

    data = {}
    for i in ['trn', 'val_tr', 'val_th', 'tst']:
        data[i] = df.query(f'{split_col}==@i').reset_index(drop=True)

    data_dict[ver]['all'] = copy.deepcopy(data)

    gc.collect()

In [4]:
# Replace every frame by a dict of row-wise arrays.
for ver in [1, 2, 3]:
    for low_esi in [0, 1, 'all']:
        group = data_dict[ver][low_esi]
        for i in tqdm(['trn', 'val_tr', 'val_th', 'tst']):
            frame = group[i]

            tmp = {'X': frame[features].values}
            for key, column in (('y', 'true'), ('ids', 'stay_id'), ('n_seq', 'n_seq')):
                tmp[key] = frame[column].values

            group[i] = copy.deepcopy(tmp)

  0%|          | 0/4 [00:00<?, ?it/s]

100%|██████████| 4/4 [00:00<00:00, 25.27it/s]
100%|██████████| 4/4 [00:00<00:00, 16.64it/s]
100%|██████████| 4/4 [00:00<00:00,  8.92it/s]
100%|██████████| 4/4 [00:00<00:00, 199.71it/s]
100%|██████████| 4/4 [00:00<00:00, 11.56it/s]
100%|██████████| 4/4 [00:00<00:00,  7.32it/s]
100%|██████████| 4/4 [00:00<00:00, 11.73it/s]
100%|██████████| 4/4 [00:00<00:00, 234.03it/s]
100%|██████████| 4/4 [00:00<00:00, 12.44it/s]


In [5]:
# Persist the AE arrays built above.
with open(f'data-dict-for_ae.pkl', 'wb') as f:
    pickle.dump(data_dict, f)

## 1.6 preparing for PCA LSTM AE

In [2]:
# Reload the LSTM-AE arrays written in section 1.3.
# NOTE: pickle.load can execute arbitrary code -- only load files produced by this notebook.
with open('data-dict-for_lstm_ae.pkl', 'rb') as f:
    data_dict = pickle.load(f)

In [3]:
# One-component PCA; the loop in the following cell refits it for every (ver, low_esi) group.
from sklearn.decomposition import PCA 
pca = PCA(n_components=1)

In [4]:
# Project the feature axis of every sequence onto the first principal component.
# The PCA is fitted on the non-padded training rows of each (ver, low_esi) group and
# applied to all four splits. Rows that are entirely zero are treated as padding and
# stay zero in the output.
for ver in [1, 2, 3]:
    for low_esi in [0, 1, 'all']:
        group = data_dict[ver][low_esi]

        # Read sequence length and feature count from the data instead of hard-coding them.
        n_steps, n_feat = group['trn']['X'].shape[1:]

        tmp_X = group['trn']['X'].reshape(-1, n_feat)
        zp_mask = np.all(tmp_X == 0, axis=1)
        pca.fit(tmp_X[~zp_mask])

        for i in tqdm(['trn', 'val_tr', 'val_th', 'tst']):
            tmp_X = group[i]['X'].reshape(-1, n_feat)
            zp_mask = np.all(tmp_X == 0, axis=1)
            pca_X = np.zeros((tmp_X.shape[0], 1))

            # A split without any non-padded row has nothing to transform.
            if (~zp_mask).any():
                pca_X[~zp_mask] = pca.transform(tmp_X[~zp_mask])

            group[i]['X'] = pca_X.reshape(-1, n_steps, 1)

100%|██████████| 4/4 [00:04<00:00,  1.05s/it]
100%|██████████| 4/4 [00:01<00:00,  3.01it/s]
100%|██████████| 4/4 [00:05<00:00,  1.35s/it]
100%|██████████| 4/4 [00:01<00:00,  3.41it/s]
100%|██████████| 4/4 [00:04<00:00,  1.08s/it]
100%|██████████| 4/4 [00:05<00:00,  1.33s/it]
100%|██████████| 4/4 [00:05<00:00,  1.32s/it]
100%|██████████| 4/4 [00:00<00:00, 52.19it/s]
100%|██████████| 4/4 [00:05<00:00,  1.31s/it]


In [5]:
# Persist the PCA-reduced LSTM-AE arrays.
with open(f'data-dict-for_pca_lstm_ae.pkl', 'wb') as f:
    pickle.dump(data_dict, f)