In [1]:
%load_ext autoreload
%autoreload 2

# Introduction and Objective
## Splitting the data into train, val, and test

Note: the Isfahan University of Medical Sciences data (dataset_1) are excluded from this round of analysis. We will include that 
dataset once its metadata are available, so that we can split it with stratification on the subjects' age.

In [2]:
import numpy as np
import yaml
import os
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd

In [3]:
from utils import split_filenames, nested_dict_to_easydict

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Widen pandas' display limits so large frames are shown without truncation:
# up to 2000 rows, and no cap on the number of columns.
pd.options.display.max_rows = 2000
pd.options.display.max_columns = None

# load parameters

In [5]:
# Load the project-wide configuration from YAML and wrap it with the project
# helper `nested_dict_to_easydict` (presumably what enables the attribute-style
# access used later, e.g. `PARAMS.PRIMARY_DATA_DIRECTORY` -- confirm in utils).
with open("../../code_configs/params.yaml") as params_file:
    PARAMS = nested_dict_to_easydict(yaml.safe_load(params_file))

In [6]:
# Dump the loaded configuration so the notebook output records the settings
# this run was executed with.
print(PARAMS)

{'RAW_DATA_DIRECTORY': '../../data/raw', 'INTERMEDIATE_DATA_DIRECTORY': '../../data/intermediate', 'PRIMARY_DATA_DIRECTORY': '../../data/primary', 'ARTIFACTS_DATA_DIRECTORY': '../../artifacts', 'DATASET_1_RAW_ZIP_FILE_NAME': 'radiography-20221212T190109Z-001.zip', 'DATASET_1_RAW_ZIP_NEW_ANNOT_FILE_NAME': 'label_edited-20230122T143005Z-001.zip', 'DATASET_1_INTERMEDIATE_DIR_NAME': 'dataset_1', 'DATASET_1_UNWANTED_JSON_FIELDS': ['imageData'], 'DATASET_3_RAW_RAR_FILE_NAME': 'RawImage.rar', 'DATASET_3_RAW_DIR_NAME_TEMP': 'dataset_3_bmp', 'DATASET_3_RAW_DIR_NAME': 'dataset_3_raw_data', 'UNWANTED_JSON_FIELDS': ['imageData', 'imagePath'], 'DATASET_1_INTERM_DIR_NAME': 'dataset_1_interm_data-20230513T084705Z-001', 'DATASET_1_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_1_INTERM_V_LANDMARKS_DIR_NAME': ['label'], 'DATASET_1_INTERM_F_LANDMARKS_DIR_NAME': None, 'DATASET_2_INTERM_DIR_NAME': 'dataset_2_interm_data-20230304T160421Z-001', 'DATASET_2_INTERM_IMG_DIR_NAME': ['image'], 'DATASET_2_INTERM_V_LAND

# Load the metadata table and select the samples to split

In [7]:
# Open the primary metadata store in append mode. The handle is deliberately
# kept open: a later cell writes the table back through `store` and closes it.
metadata_store_path = os.path.join(
    PARAMS.PRIMARY_DATA_DIRECTORY,
    PARAMS.TRAIN.METADATA_TABLE_NAME,
)
store = pd.HDFStore(metadata_store_path, mode='a')

# Read everything stored under the 'df' key into a pandas DataFrame.
metadata_table = store.select('df')

In [8]:
# (rows, columns) of the metadata table as loaded from the store.
metadata_table.shape

(621, 16)

In [9]:
# Preview the first five rows to check the column layout.
metadata_table.head(5)

Unnamed: 0,v_annots_present,f_annots_present,edges_present,f_annots_rows,f_annots_cols,harmonized_id,v_annots_2_rows,v_annots_2_cols,v_annots_3_rows,v_annots_3_cols,v_annots_4_rows,v_annots_4_cols,source_image_filename,dataset,dev_set,valid
0,True,False,True,,,041281ee7fb89f6835a71c309b3b503e3d5a68fc46a608...,3.0,2.0,5.0,2.0,5.0,2.0,45.jpg,dataset_1,,True
1,True,False,True,,,2cfa37a69916c8a45a51bb8beeb04425e07d2a22f694e0...,3.0,2.0,5.0,2.0,5.0,2.0,92.jpg,dataset_1,,True
2,True,False,True,,,7201dc2be0b97f59a7901004d6496bbe84c440530776db...,3.0,2.0,5.0,2.0,5.0,2.0,43.jpg,dataset_1,,True
3,True,False,True,,,2cd4487c03c72d1016ea0a72d1b21eb987878c90ae9eff...,3.0,2.0,5.0,2.0,5.0,2.0,7.jpg,dataset_1,,True
4,True,False,True,,,27624a6eb37bbc8aafabe2075f423d573b189eae6f23fb...,3.0,2.0,5.0,2.0,5.0,2.0,121.jpg,dataset_1,,True


In [10]:
# Keep only rows that (a) belong to a dataset listed in the training config
# and (b) are flagged as valid; only the id and dataset columns are needed
# for the split.
in_included_dataset = metadata_table['dataset'].isin(
    PARAMS.TRAIN.DATASETS_TO_INCLUDE
)
is_valid = metadata_table['valid'].eq(True)

selected_samples = metadata_table.loc[
    in_included_dataset & is_valid,
    ['harmonized_id', 'dataset'],
]

In [11]:
# Number of samples that passed the dataset and validity filters.
selected_samples.shape

(475, 2)

# Perform the splitting

In [12]:
# Split hyper-parameters, gathered in one place: a 60/20/20 train/val/test
# partition with a fixed seed so the assignment is repeatable.
split_settings = dict(
    train_ratio=0.6,
    val_ratio=0.2,
    test_ratio=0.2,
    seed=100,
)

# The dataset name is passed as the grouping factor (presumably so that
# `split_filenames` applies the ratios within each dataset -- confirm in utils).
train_files, val_files, test_files = split_filenames(
    filenames=selected_samples['harmonized_id'].tolist(),
    grouping_factor=selected_samples['dataset'].tolist(),
    **split_settings,
)

# store the splits into a table

In [13]:
# Positional row indices of the metadata table whose id was assigned to the
# training split.
is_train_row = np.isin(metadata_table['harmonized_id'].to_numpy(), train_files)
indices_train = np.flatnonzero(is_train_row)

In [14]:
# Positional row indices of the metadata table whose id was assigned to the
# validation split.
is_val_row = np.isin(metadata_table['harmonized_id'].to_numpy(), val_files)
indices_val = np.flatnonzero(is_val_row)

In [15]:
# Positional row indices of the metadata table whose id was assigned to the
# test split.
is_test_row = np.isin(metadata_table['harmonized_id'].to_numpy(), test_files)
indices_test = np.flatnonzero(is_test_row)

In [16]:
# One label per metadata row, defaulting to 'undefined' for rows that are not
# part of any split. The array's string width is taken from 'undefined', so
# any label written into it later must not be longer than that.
split_arr = np.full(len(metadata_table), 'undefined')

In [17]:
# Overwrite the default label at the rows belonging to each split; the scalar
# label is broadcast over all selected positions.
for split_label, row_positions in (
    ('train', indices_train),
    ('val', indices_val),
    ('test', indices_test),
):
    split_arr[row_positions] = split_label

In [18]:
# Attach the per-row split labels as a new 'split' column. Note this mutates
# `metadata_table` in place, so earlier displays of the frame are now stale.
metadata_table['split'] = split_arr

In [19]:
# Sanity check: how many rows ended up in each (split, dataset) combination.
metadata_table[['split', 'dataset']].value_counts()

split      dataset  
train      dataset_3    237
undefined  dataset_1    142
test       dataset_3     80
val        dataset_3     79
train      dataset_2     47
test       dataset_2     17
val        dataset_2     15
undefined  dataset_3      4
dtype: int64

In [20]:
# Inspect the rows flagged as invalid; they were filtered out before the
# split, so they should all carry the 'undefined' label.
metadata_table.loc[metadata_table['valid'].eq(False)]

Unnamed: 0,v_annots_present,f_annots_present,edges_present,f_annots_rows,f_annots_cols,harmonized_id,v_annots_2_rows,v_annots_2_cols,v_annots_3_rows,v_annots_3_cols,v_annots_4_rows,v_annots_4_cols,source_image_filename,dataset,dev_set,valid,split
266,True,True,True,19,2,d7fc3b01b39c52636257e70c66aadbfac93e06bee24874...,5.0,2.0,,,5.0,2.0,054.jpeg,dataset_3,training,False,undefined
335,True,True,True,19,2,a07a5a11953b5895737cf2073715be2bb15411bd45ff91...,,,5.0,2.0,5.0,2.0,115.jpeg,dataset_3,training,False,undefined
384,True,True,True,19,2,2231a08c6ea2c6d93e7706f7ed51710a039e53203b0d9d...,,,5.0,2.0,5.0,2.0,271.jpeg,dataset_3,test1,False,undefined
426,True,True,True,19,2,0c13b055b51564143a2c52b8558a37888bc45625966fc0...,5.0,2.0,,,5.0,2.0,180.jpeg,dataset_3,test1,False,undefined


# write the new column to the metadata table

In [21]:
# Replace the existing index with a fresh 0..n-1 range (the old index is
# discarded, not kept as a column) before the table is written back.
metadata_table = metadata_table.reset_index(drop=True)

In [22]:
# write the modified DataFrame back to the HDF5 file
store.put('df', metadata_table, data_columns=True)
store.close()

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block2_values] [items->Index(['f_annots_rows', 'f_annots_cols', 'harmonized_id',
       'source_image_filename', 'dataset', 'dev_set', 'split'],
      dtype='object')]

  store.put('df', metadata_table, data_columns=True)
