In [26]:
%%file split_patients.py
#! /home/debian/Simao/miniconda3/envs/thesis/bin/python3

import sys
import os
# Directory the script was launched from; echoed so the run log records it.
cwd = os.getcwd()
print(cwd)

# The parent of the launch directory is put on sys.path (presumably so that the
# project-local `rnn_utils` import further down resolves -- confirm).
# Guard: refuse to run unless that parent directory is the 'master-thesis'
# project root, i.e. the script is launched from one of its direct subfolders.
new_cwd = os.path.dirname(cwd) # parent directory
assert os.path.basename(new_cwd) == 'master-thesis', (
    "This script must be launched from a direct subfolder of 'master-thesis' "
    f"(its parent directory is added to sys.path); current directory is {cwd!r}"
)

sys.path.append(new_cwd)

import copy
import json

from os.path import basename
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

from rnn_utils import DiagnosesDataset, MYCOLLATE, split_dataset

from torch.utils.data import DataLoader

import argparse



def main():
    """Split a patient-keyed JSON dataset into train / validation / test files.

    Command-line options (all required):
        -i / --input_file   JSON file whose top-level keys are used as patient ids.
        -o / --output_path  Existing directory that receives the output files.
        -v / --val_size     Fraction of all patients put in the validation set.
        -t / --test_size    Fraction of all patients put in the test set.

    Files written into ``output_path``:
        whole_train_subset.json  train + validation patients
        train_subset.json        train patients
        val_subset.json          validation patients
        test_subset.json         test patients
        splits_metadata.json     split fractions and the random seed

    Invalid or missing options terminate the script through ``parser.error``
    (usage message on stderr, exit status 2).
    """

    parser = argparse.ArgumentParser(description='Split patients into train-test-validation')
    # Every option is mandatory: without `required=True` a forgotten flag would
    # only surface later as a TypeError on a None value.
    parser.add_argument('-i','--input_file', type=str, required=True, help='file with patient ids')
    parser.add_argument('-o','--output_path', type=str, required=True, help='path to save the splits')
    parser.add_argument('-v','--val_size', type=float, required=True, help='size of the validation set in fraction')
    parser.add_argument('-t','--test_size', type=float, required=True, help='size of the test set in fraction')

    args = parser.parse_args()
    print(args.input_file)
    print(args.output_path)
    print(args.val_size)
    print(args.test_size)
    val_size = args.val_size
    test_size = args.test_size

    # Validate the inputs up front, before any value is derived from them.
    # parser.error is used instead of `assert` so the checks survive `python -O`.
    if not os.path.isdir(args.output_path):
        parser.error('Output path doesnt exist')
    if not (0 < val_size < 1 and 0 < test_size < 1 and test_size + val_size < 1):
        parser.error(
            '--val_size and --test_size must each lie strictly between 0 and 1 '
            f'and sum to less than 1 (got val_size={val_size}, test_size={test_size})'
        )

    seed = 426
    print(f'{seed=}')
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    train_size = 1 - (test_size + val_size)
    print('train_size=',train_size)

    with open(args.input_file,'r') as fp:
        dataset = json.load(fp)

    pat_ids = list(dataset.keys())

    # NOTE: train_test_split is called without `random_state`, so the split is
    # reproducible only through the global NumPy seed set above.
    whole_train,test = train_test_split(pat_ids,test_size=test_size)

    # val_size is a fraction of ALL patients; rescale it to a fraction of the
    # patients that remain once the test set has been removed.
    val_size_corrected = val_size/(1-test_size)
    train,val = train_test_split(whole_train,test_size=val_size_corrected)

    print(f"{len(whole_train)=}")
    print(f"{len(train)=}")
    print(f"{len(val)=}")
    print(f"{len(test)=}")

    def generate_subset_data(original,inclusion_list):
        """Return the entries of `original` whose keys are in `inclusion_list`.

        The selection goes through a DataFrame with one column per patient id.
        NOTE(review): presumably every patient record has the same inner keys;
        if not, the DataFrame alignment would introduce NaN entries -- confirm.
        """
        df = pd.DataFrame(original)
        subset_original = df.loc[:,inclusion_list].to_dict()
        return subset_original

    whole_train_subset = generate_subset_data(dataset,whole_train)
    train_subset = generate_subset_data(dataset,train)
    val_subset = generate_subset_data(dataset,val)
    test_subset = generate_subset_data(dataset,test)

    # sanity checks: each subset must hold as many patients as its id list

    print(f"{len(whole_train_subset)=}")
    print(f"{len(train_subset)=}")
    print(f"{len(val_subset)=}")
    print(f"{len(test_subset)=}")

    # split fractions and seed, stored next to the splits for traceability
    params = {'train':train_size,
              'eval':val_size,
              'test':test_size,
              'rseed':seed,
             }

    # output file stem -> subset stored under that name
    names = {'whole_train_subset':whole_train_subset,
             'train_subset':train_subset,
             'val_subset':val_subset,
             'test_subset':test_subset
            }

    # Save (finally!)
    for name, subset in names.items():

        filename = os.path.join(args.output_path,name)
        with open(filename+'.json','w') as fp:
            json.dump(subset,fp)

    with open(os.path.join(args.output_path,'splits_metadata.json'),'w') as fp:
        json.dump(params,fp)

    print('Done')

if __name__ == '__main__':
    main()

Overwriting split_patients.py


In [27]:
# Mark the script written by the %%file cell as executable so it can be run as ./split_patients.py
!chmod +x split_patients.py

In [28]:
# Run the splitter on the icare2021_diag_A301 dataset with 15% validation and 15% test;
# the split files are written into the same folder that holds dataset.json.
# NOTE(review): the paths are absolute and machine-specific -- adjust before running elsewhere.
!./split_patients.py -i '/home/debian/Simao/master-thesis/data/model_ready_dataset/icare2021_diag_A301/dataset.json' -o '/home/debian/Simao/master-thesis/data/model_ready_dataset/icare2021_diag_A301' --val_size 0.15 --test_size 0.15

/home/debian/Simao/master-thesis/Setup
/home/debian/Simao/master-thesis/data/model_ready_dataset/icare2021_diag_A301/dataset.json
/home/debian/Simao/master-thesis/data/model_ready_dataset/icare2021_diag_A301
0.15
0.15
seed=426
train_size= 0.7
len(whole_train)=223389
len(train)=183967
len(val)=39422
len(test)=39422
len(whole_train_subset)=223389
len(train_subset)=183967
len(val_subset)=39422
len(test_subset)=39422
Done


# Notebook objective: create and store train-validation-test splits

In [1]:
import os
# Directory the kernel is currently working in.
cwd = os.getcwd()

# Re-run guard: the cell may only proceed while the parent of the working
# directory is the 'master-thesis' folder. After the chdir below has run once,
# that no longer holds and the assertion stops a second execution.
assert os.path.dirname(cwd).rpartition('/')[2] == 'master-thesis','Oops, directory already changed previously as indended. Ignoring...'

# Guard passed: move the working directory one level up.
new_cwd = os.path.dirname(cwd) # parent directory
os.chdir(new_cwd)

In [2]:
# show all outputs
# With ast_node_interactivity set to "all", IPython displays the value of every
# top-level expression statement in a cell, not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [13]:
import os
import copy
import json

from os.path import basename
import numpy as np
from sklearn.model_selection import train_test_split
import torch
from config import Settings; settings = Settings()
import pandas as pd

from rnn_utils import DiagnosesDataset, MYCOLLATE, split_dataset

from torch.utils.data import DataLoader

## Reproducibility

In [14]:
# Take the seed from the project settings and echo it so the run is traceable.
seed = settings.random_seed
print(f'{seed=}')

# Seed each random number generator with the same value.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)

seed=546


<torch._C.Generator at 0x7f51c0c20590>

## Parameters

<div class="alert alert-block alert-info">
Change these parameters as you prefer
</div>

In [15]:
# Sub-folder of the model-ready dataset directory that holds the dataset to split.
dataset_id = 'diag_only'
# File stem of the dataset JSON ('.json' is appended when the path is built).
dataset_name = 'dataset'
# Fractions of the patients assigned to the test and validation sets.
test_size = 0.15
val_size=0.15

sanity check and some processing

In [16]:
# Whatever is not held out for test or validation is the training fraction.
train_size = 1 - (test_size + val_size)
# The held-out fractions must leave a non-empty share for training.
assert test_size + val_size < 1, 'Oops'

## Create splits

In [17]:
# Folder of the selected dataset: <data_base>/<model_ready_dataset_folder>/<dataset_id>
dataset_folder = os.path.join(
    settings.data_base,
    settings.model_ready_dataset_folder,
    dataset_id,
)
# The dataset itself is the JSON file named after `dataset_name` in that folder.
dataset_filepath = os.path.join(dataset_folder, dataset_name + '.json')

# Read the whole dataset into memory.
with open(dataset_filepath) as fp:
    dataset = json.load(fp)

In [18]:
# Display the resolved dataset path as a quick visual check.
dataset_filepath

'data/model_ready_dataset/diag_only/dataset.json'

split ids

In [19]:
# Patient ids are the keys of the dataset's 'data' mapping.
pat_ids = list(dataset['data'])

# val_size is expressed as a fraction of ALL patients; rescale it to a fraction
# of what is left once the test set has been carved out.
val_size_corrected = val_size/(1-test_size)

# First cut: test vs. everything else; second cut: train vs. validation.
# NOTE(review): no random_state is passed, so the result depends on the global
# NumPy RNG state at the time this cell runs -- re-running the cell without
# re-seeding presumably yields a different split; confirm this is acceptable.
whole_train,test = train_test_split(pat_ids,test_size=test_size)
train,val = train_test_split(whole_train,test_size=val_size_corrected)

# Report the size of every id list.
print(f"{len(whole_train)=}")
print(f"{len(train)=}")
print(f"{len(val)=}")
print(f"{len(test)=}")

len(whole_train)=6374
len(train)=5249
len(val)=1125
len(test)=1125


create dataset splits

In [20]:
def generate_subset_data(original,inclusion_list):
    """Build a copy of `original` restricted to the patients in `inclusion_list`.

    `original` is a dict with a 'data' entry (keyed by patient id) and a
    'metadata' entry. The 'data' entry is turned into a DataFrame with one
    column per patient id, the requested columns are selected in the order
    given, and the selection is converted back to a dict. The 'metadata'
    entry is passed through unchanged.

    Returns a dict with the keys 'metadata' and 'data'.
    """
    patients_frame = pd.DataFrame(original['data'])
    selected_patients = patients_frame.loc[:, inclusion_list]
    return {
        'metadata': original['metadata'],
        'data': selected_patients.to_dict(),
    }

# Materialise one dataset per id list, in the order: whole train, train, val, test.
whole_train_subset, train_subset, val_subset, test_subset = (
    generate_subset_data(dataset, id_list)
    for id_list in (whole_train, train, val, test)
)

# sanity checks: each subset should contain as many patients as its id list

print(f"{len(whole_train_subset['data'])=}")
print(f"{len(train_subset['data'])=}")
print(f"{len(val_subset['data'])=}")
print(f"{len(test_subset['data'])=}")

len(whole_train_subset['data'])=6374
len(train_subset['data'])=5249
len(val_subset['data'])=1125
len(test_subset['data'])=1125


# Save

## prepare save path and file suffix

<div class="alert alert-block alert-warning">
Don't forget: the folder in which the files will be saved is defined in the .env file!
</div>

In [21]:
# Split fractions and random seed; this dict is what gets written out as the
# splits' metadata when the files are saved.
params = dict(
    train=train_size,
    eval=val_size,
    test=test_size,
    rseed=seed,
)

In [22]:
# Map each output file stem to the subset stored under that name.
names = {'whole_train_subset':whole_train_subset,
         'train_subset':train_subset,
         'val_subset':val_subset,
         'test_subset':test_subset
        }

# All split files go into a folder named after the dataset, inside the dataset
# folder. The folder is resolved and created once, before the loop, instead of
# being re-checked on every iteration; exist_ok makes re-running the cell safe.
filepath = os.path.join(dataset_folder,dataset_name)
os.makedirs(filepath, exist_ok=True)

# Save (finally!): one JSON file per subset.
for name, subset in names.items():
    filename = os.path.join(filepath,name)
    with open(filename+'.json','w') as fp:
        json.dump(subset,fp)

# Store the split fractions and the seed next to the splits.
with open(os.path.join(filepath,'metadata.json'),'w') as fp:
    json.dump(params,fp)

# Hurray!