# Train Test Split
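This notebook splits the images into a train and a test set and creates the annotation CSV files (`train_labels.csv` and `test_labels.csv`) used by the PyTorch data loader. The split is done separately for each diagnosis group and by subject, so all images of one subject end up in the same set.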

In [2]:
import random
import re
import sys

import numpy as np
import pandas as pd

sys.path.insert(0, '../../scripts/')

from helpers import miscellaneous as misc

# Configuration returned by the project helper module.
# Keys read in this notebook: RAW_DATA_DIR, FLATTENED_DATA_DIR, ANNOTATIONS_DIR, RANDOM_STATE.
CONFIG = misc.get_config()

## Split by diagnosis label (CN, MCI and AD)

In [3]:
# Read the ADNI image description table (one row per image) and preview the first rows.
image_description_csv = '../../' + CONFIG['RAW_DATA_DIR'] + '/images/ADNI1_Complete_1Yr_1.5T_1_20_2022.csv'
df = pd.read_csv(image_description_csv)
df.head()

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,I97327,941_S_1311,MCI,M,69,1,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,3/02/2007,NiFTI,
1,I112538,941_S_1311,MCI,M,70,4,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,6/01/2008,NiFTI,
2,I97341,941_S_1311,MCI,M,70,3,MRI,MPR-R; GradWarp; B1 Correction; N3; Scaled,Processed,9/27/2007,NiFTI,
3,I63874,941_S_1202,CN,M,78,1,MRI,MPR-R; GradWarp; B1 Correction; N3; Scaled,Processed,1/30/2007,NiFTI,
4,I75150,941_S_1202,CN,M,78,3,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,8/24/2007,NiFTI,


In [4]:
example_file_name = 'ADNI_002_S_1155_MR_MPR__GradWarp__B1_Correction__N3__Scaled_Br_20070217034919863_S24144_I40845.nii'

# Extracts the image data id (an "I" followed by at least one digit) that directly
# precedes the ".nii" extension of a file name.
# Raw string: "\." must reach the regex engine; in a normal string it is an invalid escape.
regex_pattern = r'(I[0-9]+)\.nii'

re.findall(regex_pattern, example_file_name)

['I40845']

In [5]:
# Sanity check: locate the image data id found above inside the example file name.
re.search('I40845', example_file_name)

<re.Match object; span=(88, 94), match='I40845'>

In [6]:
# List the .nii files of the flattened data directory and keep, for every path,
# only the part that follows the last "../".
listed_nii_paths = misc.get_nii_filenames(CONFIG['FLATTENED_DATA_DIR'])
nii_files = [path.split("../")[-1] for path in listed_nii_paths]

In [7]:
# Show the first path to check what the stored file names look like.
nii_files[0]

'data/raw/flattened\\ADNI_002_S_0295_MR_MPR__GradWarp__B1_Correction__N3__Scaled_2_Br_20081001114556321_S13408_I118671.nii'

In [8]:
def split_into_train_test(df, group_identifier, data_dir, cognitive_test_path = None, proportion=0.8, save_files=False, file_type = "nii"):
    """
    Split the images into a train and a test set, subject-wise within every group.

    For each value of ``group_identifier`` the subjects of that group are shuffled with the
    configured random state and the first ``proportion`` share of them goes to the train set,
    the rest to the test set. All images of one subject therefore land in the same set.

    df: image description table; needs the columns "Image Data ID", "Subject", "Group" and
        the column named by group_identifier (plus "Acq Date" if cognitive_test_path is given).
        The passed frame is not modified.
    group_identifier: name of the column holding the group label (e.g. CN, MCI or AD)
    data_dir: path to all image files of the subjects
    cognitive_test_path: if not None: add to each image a cognitive test result based on the given test, default: None
    proportion: train/test split proportion, must be strictly between 0 and 1, default: 0.8
    save_files: save as file (True/False), default: False
    file_type: "png" lists png files in data_dir, any other value lists nii files, default: "nii"

    returns: None if save_files is True (train_labels.csv and test_labels.csv are written to the
             annotations directory), otherwise the tuple (trainset, testset) of DataFrames with
             the columns filename, Group, Subject and, if requested, the cognitive test columns.
    """
    import os  # local import: os is not part of the notebook's import cell

    assert 1 > proportion > 0, f'parameter proportion has to be in range (0,1). {proportion} was given'

    base_columns = ['filename', 'Group', 'Subject']

    # collect the available image files
    if file_type == "png":
        filenames = misc.get_png_filenames(data_dir)
    else:
        filenames = misc.get_nii_filenames(data_dir)

    filenames = [i.split("../")[-1] for i in filenames]

    # Index the files by the alphanumeric tokens of their path. Looking an id up as a whole
    # token avoids the substring trap where e.g. "I4084" would also match "..._I40845.nii".
    files_by_token = {}
    for path in filenames:
        for token in set(re.split(r'[^A-Za-z0-9]+', path)):
            files_by_token.setdefault(token, []).append(path)

    def get_filename_by_id(image_id):
        # only an unambiguous match is accepted
        matches = files_by_token.get(str(image_id), [])
        if len(matches) == 1:
            return matches[0]
        return None

    # work on a copy so the caller's frame does not silently gain a column
    df = df.copy()
    df['filename'] = df['Image Data ID'].map(get_filename_by_id)

    # make missing / ambiguous images visible instead of dropping the information silently
    n_unmatched = int(df['filename'].isna().sum())
    if n_unmatched:
        print(f"{n_unmatched} of {len(df)} images have no unique file in {data_dir}; their filename is left empty")

    train_parts = []
    test_parts = []

    # sort=False keeps the groups in order of first appearance; rows without a group are skipped
    for _, df_group in df.groupby(group_identifier, sort=False):
        subjects = list(df_group['Subject'].unique())
        # A dedicated generator seeded with the configured state yields the same permutation
        # as random.seed(...) + random.shuffle(...) without touching the global random state.
        random.Random(CONFIG['RANDOM_STATE']).shuffle(subjects)

        n_train = int(len(subjects) * proportion)  # accordingly n_test = len(subjects) - n_train

        train_subjects = subjects[:n_train]
        test_subjects = subjects[n_train:]

        sub_trainset = df_group[df_group['Subject'].isin(train_subjects)]
        sub_testset = df_group[df_group['Subject'].isin(test_subjects)]

        trainset_cognitive_test_column_list = []
        testset_cognitive_test_column_list = []

        if cognitive_test_path is not None:
            sub_trainset, trainset_cognitive_test_column_list = get_cognitive_test_set(sub_trainset, cognitive_test_path)
            sub_testset, testset_cognitive_test_column_list = get_cognitive_test_set(sub_testset, cognitive_test_path)

        train_parts.append(sub_trainset[base_columns + trainset_cognitive_test_column_list])
        test_parts.append(sub_testset[base_columns + testset_cognitive_test_column_list])

    # concatenate once instead of growing the frames inside the loop
    trainset = pd.concat(train_parts, ignore_index=True) if train_parts else pd.DataFrame(columns=base_columns)
    testset = pd.concat(test_parts, ignore_index=True) if test_parts else pd.DataFrame(columns=base_columns)

    if save_files:
        # os.path.join inserts the separator if the configured directory has no trailing slash
        annotations_dir = '../../' + CONFIG['ANNOTATIONS_DIR']
        trainset.to_csv(os.path.join(annotations_dir, 'train_labels.csv'), sep=',', header=True, index=False)
        testset.to_csv(os.path.join(annotations_dir, 'test_labels.csv'), sep=',', header=True, index=False)

        return None

    return trainset, testset

def get_cognitive_test_set(subjects, path_to_cognitive_test_set):
    """
    Merges the train or test set entries with their cognitive test result data by looking at the date of the USERDATE attribute.
    Matches the image date of acquisition to the closest cognitive test result in time.
    TODO: see if USERDATE is adequate to match with "acq date" attribute from the images.

    subjects: DataFrame of image rows with the columns "Subject" and "Acq Date". The numeric
              part after the last "_" of "Subject" is compared with the RID column of the
              cognitive test file. The passed frame is not modified.
    path_to_cognitive_test_set: path to a csv file containing at least the columns RID and USERDATE

    returns: tuple (merged, test_columns) where merged holds the image rows (index reset, plus the
             helper column "Subject_ID") side by side with exactly one test result row per image,
             and test_columns is the list of column names of the cognitive test file.
    """
    # copy first: the caller usually passes a slice of a bigger frame and must not see new columns
    subjects = subjects.copy()
    subjects["Subject_ID"] = subjects["Subject"].apply(lambda x: x.split("_")[-1])
    subjects["Acq Date"] = pd.to_datetime(subjects["Acq Date"])

    cognitive_test_results = pd.read_csv(path_to_cognitive_test_set)
    cognitive_test_results["USERDATE"] = pd.to_datetime(cognitive_test_results["USERDATE"])
    test_columns = cognitive_test_results.columns.values.tolist()

    # all test results per subject, found by matching subject_id with RID
    results_by_subject = {
        subject_id: cognitive_test_results[cognitive_test_results.RID == int(subject_id)]
        for subject_id in subjects["Subject_ID"].unique()
    }

    # Keep exactly one result row per image and only the test columns: the column-wise concat
    # below pairs rows by position, so an extra row would shift every following image.
    closest_per_image = [
        get_closest_test_result_by_date(image_row, results_by_subject).head(1).reindex(columns=test_columns)
        for _, image_row in subjects.iterrows()
    ]

    if closest_per_image:
        closest_test_results = pd.concat(closest_per_image, ignore_index=True)
    else:
        # no images in this set (e.g. a group with very few subjects)
        closest_test_results = pd.DataFrame(columns=test_columns)

    subjects = subjects.reset_index(drop=True)
    assert len(closest_test_results) == len(subjects), 'expected exactly one cognitive test row per image'

    subjects_ = pd.concat([subjects, closest_test_results], axis=1)
    return subjects_, test_columns

def get_closest_test_result_by_date(item, dicts):
    """
    Return the single cognitive test result that is closest in time to an image's acquisition date.

    item: one image row (pandas Series) providing Subject_ID and "Acq Date"
    dicts: dictionary mapping a Subject_ID to the DataFrame of that subject's test results;
           each frame needs a datetime column USERDATE

    returns: DataFrame with exactly one row. It is the row of the subject's results whose USERDATE
             has the smallest absolute distance to "Acq Date" (the first one on ties). If the
             subject is unknown, has no dated results, or the acquisition date is missing, a
             message is printed and a row filled with NaN (same columns) is returned instead.
    """
    subject_id = item.Subject_ID
    acq_date = item["Acq Date"]
    test_results = dicts.get(subject_id)

    if test_results is not None and not pd.isna(acq_date):
        # results without a date cannot be ranked by distance
        dated_results = test_results[test_results.USERDATE.notna()]
        if len(dated_results) > 0:
            distance = (dated_results.USERDATE - acq_date).abs()
            closest_position = int(np.argmin(distance.to_numpy()))
            # positional selection with a list keeps the result a one-row DataFrame
            return dated_results.iloc[[closest_position]]

    print(f"test_result not found for image subject ID {subject_id}, returning a NaN-filled row")
    # take the column layout from this subject's (empty) results, else from any other subject
    template = test_results if test_results is not None else next(iter(dicts.values()), None)
    columns = template.columns if template is not None else []
    return pd.DataFrame(np.nan, index=[0], columns=columns)

# Split by the "Group" column and write train_labels.csv / test_labels.csv to the annotations directory.
split_into_train_test(
    df=df,
    group_identifier='Group',
    data_dir=CONFIG['FLATTENED_DATA_DIR'],
    save_files=True,
)

## Split according to granular diagnosis

In [8]:
# TODO: not implemented yet - split according to the granular diagnosis