### How to split the dataset?

- Total: 6338  samples age = 15.0 - 31.0
- Allowed: 5000  samples

In [None]:
from   ipywidgets import interactive, fixed
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random
import seaborn as sns

In [None]:
# Pin both random number generators so that every random draw below
# (image dropping, validation/test sampling) is reproducible across runs.
np.random.seed(0)
random.seed(0)

### Settings

In [None]:
# WARNING: Do NOT set these variables to True unless you want to create a fresh train/test split
# Each flag gates the `to_csv` call of the corresponding split further down. With a flag
# left at False the split is still computed and displayed, but no CSV file is written.
SAVE_TRAIN_SET = False
SAVE_VALID_SET = False
SAVE_TEST_SET  = False

In [None]:
# All annotation tables share one metadata directory (path relative to the working directory).
# ANNOTATIONS_ALL_CSV is the input; the other three are the outputs of this notebook.
_METADATA_DIR = '../metadata'

ANNOTATIONS_ALL_CSV   = _METADATA_DIR + '/annotations_all.csv'
ANNOTATIONS_TRAIN_CSV = _METADATA_DIR + '/annotations_train.csv'
ANNOTATIONS_VALID_CSV = _METADATA_DIR + '/annotations_valid.csv'
ANNOTATIONS_TEST_CSV  = _METADATA_DIR + '/annotations_test.csv'

In [None]:
# Edges of the one-year age bins: 15, 16, ..., 30 (16 edges -> 15 bins)
bins = np.arange(15, 31, dtype=np.int32)
bins

In [None]:
# Load the full annotation table and derive the helper columns used for splitting
df_annotations = pd.read_csv(ANNOTATIONS_ALL_CSV)

# 'age' is divided by 365.25 to obtain years
df_annotations['age_years'] = df_annotations['age'].to_numpy() / 365.25
# np.digitize numbers bins from 1: bin i holds bins[i-1] <= age < bins[i];
# 0 means below bins[0], len(bins) means at or above bins[-1]
df_annotations['age_bin']   = np.digitize(df_annotations['age_years'].to_numpy(), bins)

# The image file name (without directory and '.npy') is split on '_';
# fields 1, 2 and 3 are taken as patient, study and series.
# Parse every name once instead of once per extracted field.
name_fields = [x.split('/')[-1].replace('.npy','').split('_') for x in df_annotations['image'].to_list()]

patient = [fields[1] for fields in name_fields]
study   = [fields[2] for fields in name_fields]
series  = [fields[3] for fields in name_fields]

df_annotations['patient'] = patient
df_annotations['study']   = study
df_annotations['series']  = series

# Number of images per patient, attached to every row of that patient.
# One counting pass plus a lookup replaces the former per-row scan of the whole table (O(n^2)).
df_annotations['n_patient_images'] = df_annotations['patient'].map(df_annotations['patient'].value_counts())

df_annotations

### Remove people aged 30.0 years or older
In other words: Include only 15 to 29 year olds

In [None]:
# Keep only rows with an age strictly below 30.0 years and renumber them from 0
is_under_30       = df_annotations['age_years'] < 30.0
df_annotations_29 = df_annotations[is_under_30].reset_index(drop=True)
df_annotations_29

In [None]:
# Count distinct patients (not images) remaining after the age filter
n_patients_15_30 = df_annotations_29['patient'].nunique()
print('Number of patients between age 15.0 and 30.0 years: {:d}'.format(n_patients_15_30))

In [None]:
# Split the age-filtered table by sex; each part gets a fresh 0-based index
sex_of_image = df_annotations_29['sex']

df_anntotations_male   = df_annotations_29[sex_of_image == 'M'].reset_index(drop=True)
df_anntotations_female = df_annotations_29[sex_of_image == 'F'].reset_index(drop=True)

In [None]:
# Two panels: pooled age histogram (left) and per-sex histogram (right)
fig, ax = plt.subplots(1, 2, figsize=(14,7))

sns.histplot(data=df_annotations_29, x='age_years', bins=bins, ax=ax[0])
gfg = sns.histplot(data=df_annotations_29, x='age_years', hue='sex', bins=bins, multiple='dodge', ax=ax[1])

# Both panels share limits, tick size and axis labels; only the title differs
panel_titles = ('male and female combined', 'male and female side-by-side')
for axis, panel_title in zip(ax, panel_titles):
    axis.set_ylim(0,620)
    axis.tick_params(labelsize=14)
    axis.set_xlabel('age / (years)', fontsize=16)
    axis.set_ylabel('count', fontsize=16)
    axis.set_title(panel_title, fontsize=16)

# Enlarge the legend that seaborn created for the hue variable
sex_legend = gfg.get_legend()
plt.setp(sex_legend.get_title(), fontsize='14')
plt.setp(sex_legend.get_texts(), fontsize='14')

plt.suptitle('Age Distribution in Age Range [15.0,30.0]', fontsize=18)

# NOTE(review): this figure goes to '../results/' while the other figures in this
# notebook are saved under 'results/' - confirm which directory is intended.
plt.savefig('../results/age_dist_15_30_n5718.png')
plt.show()

### Reduce dataset to 5000 samples by removing images from the most populated age bins

In [None]:
# Work on an independent (deep) copy so that df_annotations_29 stays intact
df_annotations_5000 = df_annotations_29.copy()
df_annotations_5000

In [None]:
# Remove one image per iteration until exactly 5000 remain. Each image is taken from the
# currently most populated age bin, from the sex that is not the least frequent one in that
# bin, and from the patients with the highest image count.
# Note: 'n_patient_images' is not recomputed after a drop, so it keeps the count from the
# full annotation table.
while df_annotations_5000.shape[0] > 5000:

    # Identify the age bin with the most samples.
    # Take the bin id from np.unique itself: np.digitize() starts bins at 1, but deriving
    # the id as "argmax + 1" is only correct while every bin from 1 upwards is populated.
    bin_ids, bin_counts = np.unique(df_annotations_5000['age_bin'].to_numpy(), return_counts=True)
    most_populated_bin  = bin_ids[np.argmax(bin_counts)]

    # Create a temporary dataframe for samples from that bin
    df_most_populated_bin = df_annotations_5000[df_annotations_5000['age_bin'] == most_populated_bin]

    # Drop samples with the least frequent sex.
    # If the bin contains a single sex, keep it - dropping it would leave no candidates.
    sexes, sex_counts = np.unique(df_most_populated_bin['sex'].to_numpy(), return_counts=True)
    if len(sexes) > 1:
        least_frequent_sex    = sexes[np.argmin(sex_counts)]
        df_most_populated_bin = df_most_populated_bin[df_most_populated_bin['sex'] != least_frequent_sex]

    # Keep only patients with the highest number of images
    max_n_patient_images  = np.max(df_most_populated_bin['n_patient_images'].to_numpy())
    df_most_populated_bin = df_most_populated_bin[df_most_populated_bin['n_patient_images'] == max_n_patient_images]

    # Randomly select an image to drop from the overall dataframe
    images_most_populated_bin = df_most_populated_bin['image'].to_numpy()
    image_to_drop = np.random.choice(images_most_populated_bin, size=1, replace=False)[0]

    # Drop the image
    df_annotations_5000 = df_annotations_5000[df_annotations_5000['image'] != image_to_drop]
    df_annotations_5000 = df_annotations_5000.reset_index(drop=True)

df_annotations_5000

In [None]:
# Same two-panel layout as before, now for the dataset reduced to 5000 images
fig, ax = plt.subplots(1, 2, figsize=(14,7))

sns.histplot(data=df_annotations_5000, x='age_years', bins=bins, ax=ax[0])
gfg = sns.histplot(data=df_annotations_5000, x='age_years', hue='sex', bins=bins, multiple='dodge', ax=ax[1])

# Shared formatting for both panels; only the title differs
panel_titles = ('male and female combined', 'male and female side-by-side')
for axis, panel_title in zip(ax, panel_titles):
    axis.set_ylim(0,400)
    axis.tick_params(labelsize=14)
    axis.set_xlabel('age / (years)', fontsize=16)
    axis.set_ylabel('count', fontsize=16)
    axis.set_title(panel_title, fontsize=16)

# Enlarge the legend that seaborn created for the hue variable
sex_legend = gfg.get_legend()
plt.setp(sex_legend.get_title(), fontsize='14')
plt.setp(sex_legend.get_texts(), fontsize='14')

plt.suptitle('Age Distribution in Age Range [15.0,30.0]', fontsize=18)

# NOTE(review): saved under 'results/', whereas the first figure of this notebook is
# written to '../results/' - confirm which directory is intended.
plt.savefig('results/age_dist_5000.png')
plt.show()

### Create validation set and test set

We are only allowed to analyze 5000 images, so we need to get rid of some images.

However, we also want a very high quality test set, so we first sample that and then remove some images for the training set.

The criteria for the test set are:
- Only one image per person is allowed in the test set
- If a person is already part of the test set, additional images of that patient (which may exist) cannot be part of the training or validation set
- Each age bin has the same number of images
- The male female distribution in each age bin is 50:50

#### Find persons with only 1 image

This subset will be used to create the test set. Otherwise we would waste too much precious data. For example, if we include an image of a person with 12 images in total in the test set, 11 images will be thrown away. A person can only appear in either the test set or the training set. And in the test set we only allow one image per person.

In [None]:
# Patients that contribute exactly one image, overall and separately per sex.
# The per-sex tables are cut from the not-yet-reindexed table and then renumbered
# from 0 themselves, so row positions and index labels coincide in each table.
df_single_image        = df_annotations_5000[df_annotations_5000['n_patient_images'] == 1]
df_single_image_male   = df_single_image[df_single_image['sex'] == 'M'].reset_index(drop=True)
df_single_image_female = df_single_image[df_single_image['sex'] == 'F'].reset_index(drop=True)
df_single_image        = df_single_image.reset_index(drop=True)

In [None]:
# Age histograms restricted to patients with a single image: pooled (left), per sex (right)
fig, ax = plt.subplots(1, 2, figsize=(14,7))

sns.histplot(df_single_image['age_years'], bins=bins, ax=ax[0])
gfg = sns.histplot(data=df_single_image, x='age_years', hue='sex', bins=bins, multiple='dodge', ax=ax[1])

# Shared formatting for both panels; only the title differs
panel_titles = ('male and female combined', 'male and female side-by-side')
for axis, panel_title in zip(ax, panel_titles):
    axis.set_ylim(0,180)
    axis.tick_params(labelsize=14)
    axis.set_xlabel('age / (years)', fontsize=16)
    axis.set_ylabel('count', fontsize=16)
    axis.set_title(panel_title, fontsize=16)

# Enlarge the legend that seaborn created for the hue variable
sex_legend = gfg.get_legend()
plt.setp(sex_legend.get_title(), fontsize='14')
plt.setp(sex_legend.get_texts(), fontsize='14')

plt.suptitle('Age Distribution for Persons with Only One Image', fontsize=18)

# NOTE(review): saved under 'results/', whereas the first figure of this notebook is
# written to '../results/' - confirm which directory is intended.
plt.savefig('results/age_dist_single_image.png')
plt.show()

#### Find the least populated bin for each sex

The least populated bin determines the size of the validation set and test set:

- n_samples_bin >= validation_set_size + test_set_size

In [None]:
# Count the single-image patients in every age bin, separately per sex.
# np.bincount with minlength covers all bin ids 0..len(bins)-1, so a bin without any sample
# is reported with count 0 instead of being silently skipped (np.unique would omit it).
# Slot 0 (ages below bins[0]) is cut off; entry i then belongs to bin id i+1,
# i.e. to the age interval starting at bins[i].
counts_m = np.bincount(df_single_image_male['age_bin'].to_numpy(), minlength=len(bins))[1:len(bins)]
lowest_count_m         = np.amin(counts_m)
age_bin_lowest_count_m = bins[np.argmin(counts_m)]

counts_f = np.bincount(df_single_image_female['age_bin'].to_numpy(), minlength=len(bins))[1:len(bins)]
lowest_count_f         = np.amin(counts_f)
age_bin_lowest_count_f = bins[np.argmin(counts_f)]

print('For male persons with 1 image the bin [{:.1f},{:.1f}] is the least populated bin with {:d} samples'.format(age_bin_lowest_count_m, age_bin_lowest_count_m+1, lowest_count_m))
print('For female persons with 1 image the bin [{:.1f},{:.1f}] is the least populated bin with {:d} samples'.format(age_bin_lowest_count_f, age_bin_lowest_count_f+1, lowest_count_f))

# The scarcest age/sex bin has to supply both the validation and the test samples,
# hence the division by two
lowest_count = np.min([lowest_count_m, lowest_count_f])
print('\nLargest possible number of samples per age and sex bin: {:d}'.format(lowest_count // 2))

#### Randomly sample validation set and test set

In [None]:
# Draw the validation and test images: for every age bin and each sex, sample
# valid_set_bin_size + test_set_bin_size single-image patients without replacement
# and hand the first part to the validation set and the rest to the test set.
images_valid = []
images_test  = []

# Define bin size
# Data is binned with respect to age and sex, e.g. all 21 year old males are in one bin, all 17 year old females are in another
valid_set_bin_size = 10
test_set_bin_size  = 10
n_sampled_per_bin  = valid_set_bin_size + test_set_bin_size

# np.digitize bin ids of the age bins run from 1 to len(bins)-1
for bin_id in range(1, len(bins)):

    # Identify patients in the current age bin (row positions)
    pats_m_bin = np.where(df_single_image_male['age_bin']==bin_id)[0]
    pats_f_bin = np.where(df_single_image_female['age_bin']==bin_id)[0]

    # Randomly sample the patients for both validation set and test set.
    # np.random.choice raises ValueError if a bin holds fewer than n_sampled_per_bin patients.
    pats_m_bin = np.random.choice(pats_m_bin, size=n_sampled_per_bin, replace=False)
    pats_f_bin = np.random.choice(pats_f_bin, size=n_sampled_per_bin, replace=False)

    # Get image IDs of the sampled patients (positional lookup, matching np.where above)
    images_m = df_single_image_male.iloc[pats_m_bin]['image'].to_list()
    images_f = df_single_image_female.iloc[pats_f_bin]['image'].to_list()

    # Split images into validation set and test set.
    # The test part must start where the validation part ends; slicing at test_set_bin_size
    # would overlap with (or skip past) the validation images as soon as the sizes differ.
    images_m_valid = images_m[:valid_set_bin_size]
    images_m_test  = images_m[valid_set_bin_size:]

    images_f_valid = images_f[:valid_set_bin_size]
    images_f_test  = images_f[valid_set_bin_size:]

    # Append images for the current age to the list of validation set or test set images
    images_valid.extend(images_m_valid)
    images_valid.extend(images_f_valid)

    images_test.extend(images_m_test)
    images_test.extend(images_f_test)

#### Validation set

In [None]:
# Assemble the validation table: select the sampled images, renumber the rows and
# strip the helper columns that were only needed for splitting
in_valid_set = df_annotations_5000['image'].isin(images_valid)
df_valid = (
    df_annotations_5000[in_valid_set]
    .reset_index(drop=True)
    .drop(columns=['age_years','age_bin','patient','study','series','n_patient_images'])
)

# Only written to disk when explicitly enabled in the settings
if SAVE_VALID_SET:
    df_valid.to_csv(ANNOTATIONS_VALID_CSV, index=False)

df_valid

In [None]:
# Assemble the test table: select the sampled images, renumber the rows and
# strip the helper columns that were only needed for splitting
in_test_set = df_annotations_5000['image'].isin(images_test)
df_test = (
    df_annotations_5000[in_test_set]
    .reset_index(drop=True)
    .drop(columns=['age_years','age_bin','patient','study','series','n_patient_images'])
)

# Only written to disk when explicitly enabled in the settings
if SAVE_TEST_SET:
    df_test.to_csv(ANNOTATIONS_TEST_CSV, index=False)

df_test

In [None]:
# One figure per held-out split, both built from the same recipe:
# pooled age histogram (left) and per-sex histogram (right)
held_out_figures = (
    (df_valid, 'Age Distribution in Validation Set', 'results/age_dist_valid_set.png'),
    (df_test,  'Age Distribution in Test Set',       'results/age_dist_test_set.png'),
)

# The helper columns were dropped from these tables, so 'age' is binned directly
# with a bin width of 365.25 between 15*365.25 and 30*365.25
age_binning = dict(binwidth=365.25, binrange=[15*365.25,30*365.25])

for df_split, figure_title, figure_path in held_out_figures:
    fig, ax = plt.subplots(1, 2, figsize=(14,7))

    sns.histplot(df_split['age'], ax=ax[0], **age_binning)
    gfg = sns.histplot(data=df_split, x='age', hue='sex', multiple='dodge', ax=ax[1], **age_binning)

    for axis in ax:
        axis.set_ylim(0,21)
        axis.tick_params(labelsize=14)
        axis.set_xlabel('age / (days)', fontsize=16)
        axis.set_ylabel('count', fontsize=16)

    # Only the right panel carries a title
    ax[1].set_title('male and female side-by-side', fontsize=16)

    # Enlarge the legend that seaborn created for the hue variable
    sex_legend = gfg.get_legend()
    plt.setp(sex_legend.get_title(), fontsize='14')
    plt.setp(sex_legend.get_texts(), fontsize='14')

    plt.suptitle(figure_title, fontsize=18)

    # NOTE(review): saved under 'results/', whereas the first figure of this notebook is
    # written to '../results/' - confirm which directory is intended.
    plt.savefig(figure_path)
    plt.show()

### Create training set

The training set contains the remaining samples which are not part of the test set or the validation set

In [None]:
# Everything that was not sampled for validation or test becomes training data
images_valid_or_test = images_valid + images_test

is_held_out = df_annotations_5000['image'].isin(images_valid_or_test)
df_train = (
    df_annotations_5000[~is_held_out]
    .reset_index(drop=True)
    .drop(columns=['age_years','age_bin','patient','study','series','n_patient_images'])
)

# Only written to disk when explicitly enabled in the settings
if SAVE_TRAIN_SET:
    df_train.to_csv(ANNOTATIONS_TRAIN_CSV, index=False)

df_train

In [None]:
# Age histograms of the training set: pooled (left) and per sex (right)
fig, ax = plt.subplots(1, 2, figsize=(14,7))

# The helper columns were dropped from df_train, so 'age' is binned directly
# with a bin width of 365.25 between 15*365.25 and 30*365.25
age_binning = dict(binwidth=365.25, binrange=[15*365.25,30*365.25])

sns.histplot(df_train['age'], ax=ax[0], **age_binning)
gfg = sns.histplot(data=df_train, x='age', hue='sex', multiple='dodge', ax=ax[1], **age_binning)

for axis in ax:
    axis.set_ylim(0,370)
    axis.tick_params(labelsize=14)
    axis.set_xlabel('age / (days)', fontsize=16)
    axis.set_ylabel('count', fontsize=16)

# Only the right panel carries a title
ax[1].set_title('male and female side-by-side', fontsize=16)

# Enlarge the legend that seaborn created for the hue variable
sex_legend = gfg.get_legend()
plt.setp(sex_legend.get_title(), fontsize='14')
plt.setp(sex_legend.get_texts(), fontsize='14')

plt.suptitle('Age Distribution in Training Set', fontsize=18)

# NOTE(review): saved under 'results/', whereas the first figure of this notebook is
# written to '../results/' - confirm which directory is intended.
plt.savefig('results/age_dist_train_set.png')
plt.show()