In [1]:
import pandas as pd
import os
import shutil
import tqdm
from sklearn.model_selection import train_test_split

In [2]:
# PATHS
# `save_path` is a placeholder string: replace it with the save path that was
# used in the preprocessing notebook before running this cell.
TASK = 'mortality_task'
save_path = "your_save_path_from_preprocessing.ipynb"

# One CSV per modality, all stored under <save_path>/by_modality/<TASK>/.
MODALITY1 = f'{save_path}/by_modality/{TASK}/modality1_demographics.csv'
MODALITY2 = f'{save_path}/by_modality/{TASK}/modality2_diagnosis.csv'
MODALITY3 = f'{save_path}/by_modality/{TASK}/modality3_treatment.csv'
MODALITY4 = f'{save_path}/by_modality/{TASK}/modality4_medication.csv'
MODALITY5 = f'{save_path}/by_modality/{TASK}/modality5_lab.csv'
MODALITY6 = f'{save_path}/by_modality/{TASK}/modality6_aps.csv'


def _read_modality(csv_path, done_message):
    """Read one modality CSV (first column as the index) and print a progress line."""
    frame = pd.read_csv(csv_path, index_col=0)
    print(done_message)
    return frame


dfm1 = _read_modality(MODALITY1, "Loaded modality 1")
dfm2 = _read_modality(MODALITY2, "Loaded modality 2")
dfm3 = _read_modality(MODALITY3, "Loaded modality 3")
dfm4 = _read_modality(MODALITY4, "Loaded modality 4")
dfm5 = _read_modality(MODALITY5, "Loaded modality 5")
dfm6 = _read_modality(MODALITY6, "Loaded modality 6")

Loaded modality 1
Loaded modality 2
Loaded modality 3
Loaded modality 4
Loaded modality 5
Loaded modality 6


Next, collect the patient stay IDs that are present in every modality.

In [3]:
# Build one set of stay ids per modality, then keep only the ids that
# appear in all of them.
mods = [dfm1, dfm2, dfm3, dfm4, dfm5, dfm6]
patient_stay_ids = [set(frame['patientunitstayid']) for frame in mods]
common_stay_ids = list(set.intersection(*patient_stay_ids))

Get the associated patient IDs by filtering the demographics dataframe down to the common stays.

In [4]:
# Keep only the demographics rows whose stay id is shared by every modality,
# then list the distinct patients those stays belong to.
has_common_stay = dfm1['patientunitstayid'].isin(common_stay_ids)
dfm1_common = dfm1[has_common_stay]
pid = list(dfm1_common['uniquepid'].unique())
print(f"Found {len(pid)} unique patients associated with {len(common_stay_ids)} common stay ids.")

Found 84182 unique patients associated with 103018 common stay ids.


In [5]:
# Split at the patient level. First hold out 20% of patients for test, then
# take 12.5% of the remaining 80% for validation (0.8 * 0.125 = 10% overall),
# which leaves 70% for training. Both calls use a fixed seed.
pid_train_val, pid_test = train_test_split(
    pid,
    test_size=0.2,
    random_state=42,
)
pid_train, pid_val = train_test_split(
    pid_train_val,
    test_size=0.125,
    random_state=42,
)
print(f"{len(pid_train)}, {len(pid_val)}, {len(pid_test)} train, val, test patients")

58926, 8419, 16837 train, val, test patients


Using these patient groups, split the patient stay IDs so that every stay of a given patient lands in the same subset.

In [6]:
# Map each patient group to the stay ids of its patients (taken from the
# demographics rows restricted to the common stays).
stayid_train, stayid_val, stayid_test = (
    list(dfm1_common[dfm1_common['uniquepid'].isin(patients)]['patientunitstayid'])
    for patients in (pid_train, pid_val, pid_test)
)
print(f"{len(stayid_train)}, {len(stayid_val)}, {len(stayid_test)} train, val, test stays")

72134, 10273, 20611 train, val, test stays


In [7]:
# Persist the stay-id split as a small text file. Each split is written as a
# header line ("train", "val", "test") followed by one line of comma-separated
# stay ids.
split_dir = f'{save_path}/by_modality/{TASK}/splits'
# The directory must exist before the file can be opened for writing;
# creating it here keeps the notebook runnable top to bottom on a fresh kernel.
os.makedirs(split_dir, exist_ok=True)
with open(os.path.join(split_dir, 'split.txt'), 'w') as f:
    for split_name, stay_ids in (
        ("train", stayid_train),
        ("val", stayid_val),
        ("test", stayid_test),
    ):
        f.write(f"{split_name}\n")
        f.write(",".join(str(s) for s in stay_ids))
        # Terminate the id line so the next header starts on its own line
        # instead of being appended to the last id.
        f.write("\n")

For each modality, create and save one dataframe per subset: train, validation, and test.

In [8]:
def create_split_dfs(modality_df, train_ids, val_ids, test_ids):
    """Slice one modality dataframe into train, val and test frames by stay id.

    Args:
        modality_df: dataframe with a 'patientunitstayid' column.
        train_ids: stay ids that belong to the training subset.
        val_ids: stay ids that belong to the validation subset.
        test_ids: stay ids that belong to the test subset.

    Returns:
        A ``(train_df, val_df, test_df)`` tuple; each frame holds the rows whose
        'patientunitstayid' is in the matching id collection, sorted by
        'patientunitstayid'.

    Raises:
        AssertionError: if the number of selected rows differs from the number
            of ids requested for a subset.
    """
    def _rows_for(stay_ids):
        # Select the subset, check it is exactly as large as the id list,
        # then order it by stay id.
        selected = modality_df[modality_df['patientunitstayid'].isin(stay_ids)]
        assert len(selected) == len(stay_ids)
        return selected.sort_values(by='patientunitstayid')

    # Tuple elements are evaluated left to right: train, then val, then test.
    return _rows_for(train_ids), _rows_for(val_ids), _rows_for(test_ids)

In [9]:
# Short names for the six modalities, listed in the same order as `mods`.
modnames = ['demographics', 'diagnosis', 'treatment', 'medication', 'lab', 'aps']
# One (train, val, test) tuple of dataframes per modality.
split_dfs = [
    create_split_dfs(frame, stayid_train, stayid_val, stayid_test)
    for frame in mods
]

In [10]:
# Labels are read from the first entry of split_dfs (its train, val and test frames).
# To use a length-of-stay target, read the 'los' column here instead of
# 'hospitaldischargestatus'.
# NOTE(review): presumably 'los' goes with a different TASK setting; confirm.
target_train, target_val, target_test = (
    list(part['hospitaldischargestatus']) for part in split_dfs[0]
)

In [11]:
# add 'target' column
# The labels are plain lists, so they are assigned by position. This relies on
# every modality's frames having their rows in the same order as the frames the
# labels were read from (create_split_dfs sorts each one by patientunitstayid).
for parts in split_dfs:
    for frame, labels in zip(parts, (target_train, target_val, target_test)):
        frame['target'] = labels

In [12]:
# Write every modality's train/val/test frame to
# <output_path>/<n>_<modality>_<split>.csv, with n counted from 1.
output_path = f'{save_path}/by_modality/{TASK}/splits'
if not os.path.exists(output_path):
    os.makedirs(output_path)
for i, (name, (train_df, val_df, test_df)) in enumerate(zip(modnames, split_dfs)):
    outputs = (
        (train_df, f'{i+1}_{name}_train.csv'),
        (val_df, f'{i+1}_{name}_val.csv'),
        (test_df, f'{i+1}_{name}_test.csv'),
    )
    for frame, filename in outputs:
        frame.to_csv(os.path.join(output_path, filename))

---