In [1]:
%load_ext autoreload
%autoreload 2

# Introduction and Objective
## Splitting the data into train, val, and test

Note: the Isfahan University of Medical Sciences data (dataset_1) is excluded from this round of analysis. We will include that 
dataset once its metadata is available, so that we can split it with stratification on the subjects' age.

In [2]:
import numpy as np
import yaml
import os
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd

In [3]:
from utils import split_filenames

# load parameters

In [4]:
# Read the project-wide configuration (directory locations, dataset names, ...)
# from the shared YAML file; safe_load only builds plain Python objects.
config_path = "../../code_configs/params.yaml"
with open(config_path) as config_file:
    params = yaml.safe_load(config_file)

In [5]:
# Show the loaded configuration so the keys and paths used below can be checked.
print(params)

{'RAW_DATA_DIRECTORY': '../../data/raw', 'INTERMEDIATE_DATA_DIRECTORY': '../../data/intermediate', 'PRIMARY_DATA_DIRECTORY': '../../data/primary', 'ARTIFACTS_DATA_DIRECTORY': '../../artifacts', 'DATASET_1_RAW_ZIP_FILE_NAME': 'radiography-20221212T190109Z-001.zip', 'DATASET_1_RAW_ZIP_NEW_ANNOT_FILE_NAME': 'label_edited-20230122T143005Z-001.zip', 'DATASET_1_INTERMEDIATE_DIR_NAME': 'dataset_1', 'DATASET_1_UNWANTED_JSON_FIELDS': ['imageData'], 'DATASET_3_RAW_RAR_FILE_NAME': 'RawImage.rar', 'DATASET_3_RAW_DIR_NAME_TEMP': 'dataset_3_bmp', 'DATASET_3_RAW_DIR_NAME': 'dataset_3_raw_data', 'UNWANTED_JSON_FIELDS': ['imageData', 'imagePath'], 'DATASET_1_INTERM_DIR_NAME': 'dataset_1_interm_data-20230513T084705Z-001', 'DATASET_1_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_1_INTERM_V_LANDMARKS_DIR_NAME': ['label'], 'DATASET_1_INTERM_F_LANDMARKS_DIR_NAME': None, 'DATASET_2_INTERM_DIR_NAME': 'dataset_2_interm_data-20230304T160421Z-001', 'DATASET_2_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_2_INTERM_V_LAND

# Load the metadata table and select the samples to split

In [6]:
# Location of the HDF5 file holding the metadata table, taken from the config.
metadata_path = os.path.join(
    params['PRIMARY_DATA_DIRECTORY'],
    params['METADATA_TABLE_NAME'],
)
# Open in append mode: the file is read here and written to again in the last
# cell, so the handle is deliberately left open until then.
store = pd.HDFStore(metadata_path, mode='a')
# Load the table stored under the 'df' key into a pandas DataFrame.
metadata_table = store.select('df')

In [7]:
# Preview the first five rows (head() defaults to n=5) to check the columns.
metadata_table.head()

Unnamed: 0,source_image_filename,harmonized_id,dataset,dev_set,v_annots_present,f_annots_present,edges_present
0,45.jpg,041281ee7fb89f6835a71c309b3b503e3d5a68fc46a608...,dataset_1,,True,False,True
1,92.jpg,2cfa37a69916c8a45a51bb8beeb04425e07d2a22f694e0...,dataset_1,,True,False,True
2,43.jpg,7201dc2be0b97f59a7901004d6496bbe84c440530776db...,dataset_1,,True,False,True
3,7.jpg,2cd4487c03c72d1016ea0a72d1b21eb987878c90ae9eff...,dataset_1,,True,False,True
4,121.jpg,27624a6eb37bbc8aafabe2075f423d573b189eae6f23fb...,dataset_1,,True,False,True


In [8]:
# Keep only the rows whose dataset is listed in DATASETS_TO_INCLUDE, and only
# the two columns needed for splitting: the sample id and its source dataset.
in_included_dataset = metadata_table['dataset'].isin(params['DATASETS_TO_INCLUDE'])
columns_for_split = ['harmonized_id', 'dataset']
selected_samples = metadata_table.loc[in_included_dataset, columns_for_split]

In [9]:
# Sanity check: (number of samples selected for splitting, number of columns kept).
selected_samples.shape

(479, 2)

# Perform the splitting

In [10]:
# Inputs for the splitter: one id and one dataset label per selected sample,
# in the same row order.
sample_ids = selected_samples['harmonized_id'].tolist()
sample_datasets = selected_samples['dataset'].tolist()

# 60/20/20 train/val/test split with a fixed seed for reproducibility.
# NOTE(review): grouping_factor presumably makes the split stratified by
# source dataset -- confirm against utils.split_filenames.
train_files, val_files, test_files = split_filenames(
    filenames=sample_ids,
    train_ratio=0.6,
    val_ratio=0.2,
    test_ratio=0.2,
    seed=100,
    grouping_factor=sample_datasets,
)

# store the splits into a table

In [11]:
# Row positions (0-based, in metadata_table order) of the samples assigned to
# the training split, found by matching on harmonized_id.
is_train_row = np.isin(metadata_table['harmonized_id'].to_numpy(), train_files)
indices_train = np.flatnonzero(is_train_row)

In [12]:
# Row positions (0-based, in metadata_table order) of the samples assigned to
# the validation split, found by matching on harmonized_id.
is_val_row = np.isin(metadata_table['harmonized_id'].to_numpy(), val_files)
indices_val = np.flatnonzero(is_val_row)

In [13]:
# Row positions (0-based, in metadata_table order) of the samples assigned to
# the test split, found by matching on harmonized_id.
is_test_row = np.isin(metadata_table['harmonized_id'].to_numpy(), test_files)
indices_test = np.flatnonzero(is_test_row)

In [14]:
# One label per metadata row, defaulting to 'undefined' for rows that are not
# part of any split. The array is fixed-width unicode sized to 'undefined'
# (9 chars), which is wide enough for the shorter 'train'/'val'/'test' labels.
split_arr = np.full(len(metadata_table), 'undefined')

In [15]:
# Overwrite the default label at the row positions of each split. The scalar
# label is broadcast over the indexed positions; order is train, val, test.
for split_name, row_positions in (
    ('train', indices_train),
    ('val', indices_val),
    ('test', indices_test),
):
    split_arr[row_positions] = split_name

In [16]:
# Attach the labels as a 'split' column. split_arr has one entry per row of
# metadata_table, so the assignment is positional; rows outside the selected
# datasets keep the 'undefined' label.
metadata_table['split'] = split_arr

# write the new column to the metadata table

In [17]:
# Write the modified DataFrame (now including the 'split' column) back to the
# HDF5 file under the same 'df' key; put() replaces the stored table.
# format='table' with data_columns=True stores every column as a queryable
# data column.
# The close() sits in a finally block so the file handle is released even if
# the write raises; otherwise the file would stay open in append mode and
# block later attempts to reopen it.
try:
    store.put('df', metadata_table, format='table', data_columns=True)
finally:
    store.close()