# Train Test Split
This notebook splits the images into a training set and a test set and writes one annotation CSV file per set, which is used by the PyTorch data loader.

In [41]:
import random
import re
import sys

import pandas as pd

# Put '../../scripts/' (resolved against the current working directory) first on the
# import path; presumably this is where the `helpers` package lives -- TODO confirm.
sys.path.insert(0, '../../scripts/')

from helpers import miscellaneous as misc

# Project configuration. This notebook reads the keys RAW_DATA_DIR, FLATTENED_DATA_DIR,
# RANDOM_STATE and ANNOTATIONS_DIR from it.
CONFIG = misc.get_config()

## Split with label CN, MCI and AD

In [42]:
# Read the image description table (one row per image) and preview the first rows.
image_table_path = '../../' + CONFIG['RAW_DATA_DIR'] + 'data/images/ADNI1_Complete_1Yr_1.5T_1_20_2022.csv'
df = pd.read_csv(image_table_path)
df.head()

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,I97327,941_S_1311,MCI,M,69,1,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,3/02/2007,NiFTI,
1,I112538,941_S_1311,MCI,M,70,4,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,6/01/2008,NiFTI,
2,I97341,941_S_1311,MCI,M,70,3,MRI,MPR-R; GradWarp; B1 Correction; N3; Scaled,Processed,9/27/2007,NiFTI,
3,I63874,941_S_1202,CN,M,78,1,MRI,MPR-R; GradWarp; B1 Correction; N3; Scaled,Processed,1/30/2007,NiFTI,
4,I75150,941_S_1202,CN,M,78,3,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,8/24/2007,NiFTI,


In [43]:
# In this example the image data ID is the last token of the file name,
# directly in front of the `.nii` extension.
example_file_name = 'ADNI_002_S_1155_MR_MPR__GradWarp__B1_Correction__N3__Scaled_Br_20070217034919863_S24144_I40845.nii'

# Raw string: `\.` must reach the regex engine as an escaped dot (in a normal string
# literal it is an invalid escape sequence). `+` instead of `*` requires at least one
# digit, so a bare 'I' in front of '.nii' is not reported as an ID.
regex_pattern = r'(I[0-9]+)\.nii'  # extracts image data id from file name

re.findall(regex_pattern, example_file_name)

['I40845']

In [44]:
# Sanity check: the ID found above can be located inside the file name directly.
image_id = 'I40845'
re.search(image_id, example_file_name)

<re.Match object; span=(88, 94), match='I40845'>

In [45]:
# Collect the image files of the flattened data directory and preview the first two.
flattened_dir = CONFIG['FLATTENED_DATA_DIR']
nii_files = misc.get_nii_filenames(flattened_dir)
nii_files[:2]

['../../data/raw/flattened\\ADNI_002_S_0295_MR_MPR__GradWarp__B1_Correction__N3__Scaled_2_Br_20081001114556321_S13408_I118671.nii',
 '../../data/raw/flattened\\ADNI_002_S_0295_MR_MPR__GradWarp__B1_Correction__N3__Scaled_2_Br_20081001120532722_S21856_I118692.nii']

In [59]:
def _index_files_by_image_id(filenames):
    """Group file names by the image data ID (``I<digits>`` directly before ``.nii``) they contain."""
    id_pattern = re.compile(r'(I[0-9]+)\.nii')
    files_by_id = {}
    for filename in filenames:
        ids = id_pattern.findall(filename)
        if ids:
            # take the last hit so that the file name wins over anything in the directory part
            files_by_id.setdefault(ids[-1], []).append(filename)
    return files_by_id


def split_into_train_test(df, group_identifier, proportion=0.8, save_files=False):
    """Split the images subject-wise into a train and a test set, separately per group.

    Within every group (value of ``df[group_identifier]``) the unique subjects are shuffled
    with the seed ``CONFIG['RANDOM_STATE']``; the first ``int(n_subjects * proportion)``
    subjects go to the train set and the remaining ones to the test set. All images of a
    subject therefore end up in the same set of that group.

    NOTE: the split is done independently per group, so a subject that occurs in more than
    one group can end up in both sets.

    Parameters
    ----------
    df : pandas.DataFrame
        Image descriptions. Needs the columns 'Image Data ID', 'Subject' and 'Group' plus
        the column named by ``group_identifier``. The frame is not modified.
    group_identifier : str
        Name of the column whose values define the groups.
    proportion : float, default 0.8
        Share of the subjects of every group assigned to the train set; must lie strictly
        between 0 and 1.
    save_files : bool, default False
        If True, write 'train_labels.csv' and 'test_labels.csv' below
        ``'../../' + CONFIG['ANNOTATIONS_DIR']`` and return None.

    Returns
    -------
    tuple of pandas.DataFrame or None
        ``(trainset, testset)`` with the columns 'filename', 'Group' and 'Subject', or None
        when ``save_files`` is True.

    Warns
    -----
    UserWarning
        If images without exactly one matching file were left out of the result.
    """
    import warnings

    assert 1 > proportion > 0, f'parameter proportion has to be in the open interval (0,1). {proportion} was given'

    columns = ['filename', 'Group', 'Subject']

    # Look the files up by their exact image data ID. A substring test would also find
    # 'I4084' inside '..._I40845.nii' and needs a scan over all files for every row.
    files_by_id = _index_files_by_image_id(misc.get_nii_filenames(CONFIG['FLATTENED_DATA_DIR']))

    def unique_file_for(image_id):
        candidates = files_by_id.get(image_id, [])
        # no file or several files for one ID -> treated as missing
        return candidates[0] if len(candidates) == 1 else None

    # assign() returns a new frame, the caller's frame keeps its columns
    df = df.assign(filename=df['Image Data ID'].map(unique_file_for))

    train_parts = []
    test_parts = []
    # sort=False keeps the groups in their order of first appearance
    for _, df_group in df.groupby(group_identifier, sort=False):
        subjects = list(df_group['Subject'].unique())

        # A private generator seeded per group gives the same permutation as
        # random.seed(...) followed by random.shuffle(...) without touching the global state.
        random.Random(CONFIG['RANDOM_STATE']).shuffle(subjects)

        n_train = int(len(subjects) * proportion)  # accordingly n_test = len(subjects) - n_train

        in_train = df_group['Subject'].isin(subjects[:n_train])
        in_test = df_group['Subject'].isin(subjects[n_train:])

        train_parts.append(df_group.loc[in_train, columns])
        test_parts.append(df_group.loc[in_test, columns])

    def collect(parts):
        non_empty = [part for part in parts if not part.empty]
        if not non_empty:
            return pd.DataFrame(columns=columns)
        return pd.concat(non_empty, ignore_index=True)

    trainset = collect(train_parts)
    testset = collect(test_parts)

    # Images without a file cannot be loaded later on. They are removed only after the
    # subjects were assigned, so the assignment of the remaining images is unaffected.
    n_missing = int(trainset['filename'].isna().sum() + testset['filename'].isna().sum())
    if n_missing:
        warnings.warn(
            f'{n_missing} image(s) without a unique matching .nii file were left out of the split',
            stacklevel=2,
        )
        trainset = trainset.dropna(subset=['filename']).reset_index(drop=True)
        testset = testset.dropna(subset=['filename']).reset_index(drop=True)

    if save_files:
        trainset.to_csv('../../' + CONFIG['ANNOTATIONS_DIR'] + 'train_labels.csv', sep=',', header=True, index=False)
        testset.to_csv('../../' + CONFIG['ANNOTATIONS_DIR'] + 'test_labels.csv', sep=',', header=True, index=False)

        return None

    return trainset, testset


# Split per value of the 'Group' column and write train_labels.csv / test_labels.csv.
split_into_train_test(df, group_identifier='Group', save_files=True)

## Split according to granular diagnosis

In [60]:
# TODO: split according to the granular diagnosis (not implemented yet)