# Notebook objective: create and store train-validation-test splits

In [1]:
import os
# The notebook lives one level below the project root; the remaining cells
# (e.g. `from config import Settings`) are run from the parent directory.
cwd = os.getcwd()
new_cwd = os.path.dirname(cwd) # parent directory

# protection against running this cell multiple times: once the working
# directory has been moved up, its parent is no longer 'master-thesis' and the
# assert stops the cell before a second chdir can happen.
# os.path.basename is used instead of splitting on '/' so the check also works
# with platform-specific path separators (e.g. '\\' on Windows).
assert os.path.basename(new_cwd) == 'master-thesis','Oops, directory already changed previously as indended. Ignoring...'

# change working directory (if assert passed)
os.chdir(new_cwd)

In [1]:
# show all outputs
# By default IPython only displays the value of the last expression in a cell;
# with ast_node_interactivity set to "all", every top-level expression
# statement in a cell has its value displayed.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [2]:
import os
import copy
import json

from os.path import basename
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from config import Settings; settings = Settings()

from rnn_utils import DiagnosesDataset, MYCOLLATE, split_dataset

from torch.utils.data import DataLoader

## Reproducibility

In [3]:
# Read the project-wide seed from the settings and echo it for the record.
seed = settings.random_seed
print(f'{seed=}')

# Seed the global NumPy generator and the PyTorch generators (CPU and CUDA).
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

seed=546


<torch._C.Generator at 0x120226510>

## Parameters

<div class="alert alert-block alert-info">
Change these parameters as you prefer
</div>

In [4]:
# Name of the dataset sub-folder; dataset.json is read from, and the splits
# are written to, <data_base>/<model_ready_dataset_folder>/<dataset_id>.
dataset_id = 'diag_only'
# Fraction of all patient ids held out as the test set.
test_size = 0.15
# Fraction of all patient ids used for validation (expressed relative to the
# full dataset; it is rescaled before the second split is taken).
val_size=0.15

Sanity-check the parameters and derive the training fraction.

In [5]:
# Validate the requested fractions *before* deriving anything from them, and
# fail with a message that says which value is wrong.
assert 0 < test_size < 1, f'test_size must be a fraction in (0, 1), got {test_size}'
assert 0 < val_size < 1, f'val_size must be a fraction in (0, 1), got {val_size}'
assert test_size + val_size < 1, (
    f'test_size + val_size must be < 1 to leave data for training, '
    f'got {test_size} + {val_size}'
)

# Whatever is not held out for validation/testing is used for training.
train_size = 1 - (test_size + val_size)

## Create splits

In [6]:
# Folder of the selected model-ready dataset; the splits are written here too.
dataset_folder = os.path.join(
    settings.data_base,
    settings.model_ready_dataset_folder,
    dataset_id,
)
dataset_filepath = os.path.join(dataset_folder, 'dataset.json')

# Parse the full dataset into memory.
with open(dataset_filepath, 'r') as fp:
    dataset = json.loads(fp.read())

Split the patient ids into train, validation and test sets.

In [7]:
# The split is performed on the keys of dataset['data'] (one key per patient id).
pat_ids = list(dataset['data'].keys())

# Pass the seed explicitly: without random_state the result depends on the
# global NumPy RNG state, i.e. on which cells ran before and on how many times
# this cell has been executed, so the stored 'rseed' would not reproduce it.
whole_train,test = train_test_split(pat_ids,test_size=test_size,random_state=seed)

# val_size is a fraction of the *whole* dataset; rescale it to a fraction of
# what is left once the test set has been removed.
val_size_corrected = val_size/(1-test_size)
train,val = train_test_split(whole_train,test_size=val_size_corrected,random_state=seed)

# The three final splits must be pairwise disjoint and cover every patient id.
assert set(train).isdisjoint(val), 'train and val overlap'
assert set(whole_train).isdisjoint(test), 'train/val and test overlap'
assert len(train) + len(val) + len(test) == len(pat_ids), 'splits do not cover all patient ids'

print(f"{len(whole_train)=}")
print(f"{len(train)=}")
print(f"{len(val)=}")
print(f"{len(test)=}")

len(whole_train)=6374
len(train)=5249
len(val)=1125
len(test)=1125


create dataset splits

In [8]:
def generate_subset_data(original,inclusion_list):
    """Return an independent copy of `original` restricted to some patients.

    Parameters
    ----------
    original : dict
        Dataset dictionary with a 'data' entry mapping patient id -> record.
        Any other top-level entries are copied over unchanged.
    inclusion_list : iterable
        Patient ids to keep. Ids that are not present in original['data']
        are ignored.

    Returns
    -------
    dict
        A new dictionary with the same top-level keys (in the same order) as
        `original`, whose 'data' entry contains only the requested patients,
        in their original order. Nothing is shared with `original`, which is
        left untouched.

    Raises
    ------
    KeyError
        If `original` has no 'data' entry.
    """
    # Set membership is O(1); testing against the list inside the loop made
    # the whole operation quadratic in the number of patients.
    included = set(inclusion_list)
    records = original['data']

    # Copy only what is kept instead of deep-copying the full dataset and
    # then deleting most of it.
    subset = {}
    for key, value in original.items():
        if key == 'data':
            subset[key] = {pat_id: copy.deepcopy(record)
                           for pat_id, record in records.items()
                           if pat_id in included}
        else:
            subset[key] = copy.deepcopy(value)
    return subset

# One dataset dictionary per split, each restricted to the ids of that split.
whole_train_subset = generate_subset_data(dataset,whole_train)
train_subset = generate_subset_data(dataset,train)
val_subset = generate_subset_data(dataset,val)
test_subset = generate_subset_data(dataset,test)

# sanity checks
# Fail loudly (instead of only printing counts) if a subset does not contain
# exactly the patient ids that were assigned to its split.
assert set(whole_train_subset['data']) == set(whole_train), 'whole_train subset does not match its ids'
assert set(train_subset['data']) == set(train), 'train subset does not match its ids'
assert set(val_subset['data']) == set(val), 'val subset does not match its ids'
assert set(test_subset['data']) == set(test), 'test subset does not match its ids'

print(f"{len(whole_train_subset['data'])=}")
print(f"{len(train_subset['data'])=}")
print(f"{len(val_subset['data'])=}")
print(f"{len(test_subset['data'])=}")

len(whole_train_subset['data'])=6374
len(train_subset['data'])=5249
len(val_subset['data'])=1125
len(test_subset['data'])=1125


# Save

## prepare save path and file suffix

<div class="alert alert-block alert-warning">
Don't forget: the folder in which the files will be saved is defined in the .env file!
</div>

In [9]:
# Split metadata (fraction of the data per split and the random seed);
# note that the validation fraction is stored under the key 'eval'.
params = dict(
    train=train_size,
    eval=val_size,
    test=test_size,
    rseed=seed,
)

In [10]:
# Map each output file stem to the subset that goes into it.
names = {
    'whole_train_subset': whole_train_subset,
    'train_subset': train_subset,
    'val_subset': val_subset,
    'test_subset': test_subset,
}

# Write one <stem>.json per subset into the dataset folder.
for name, subset in names.items():
    filename = os.path.join(dataset_folder, name)
    with open(f'{filename}.json', 'w') as fp:
        json.dump(subset, fp)

# Store the split fractions and seed alongside the subsets.
with open(os.path.join(dataset_folder, 'metadata.json'), 'w') as fp:
    json.dump(params, fp)

# Hurray!