# Define data splits

In [1]:
import os, sys
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
import json
# https://stackoverflow.com/questions/34478398/import-local-function-from-a-module-housed-in-another-directory-with-relative-im
# Make the parent directory importable (needed for `data.utils` below).
module_path = os.path.abspath('..')
try:
    # Drop one existing entry so the path is re-appended at the end.
    sys.path.remove(module_path)
except ValueError:
    # The path was not registered yet; nothing to remove.
    pass
if module_path not in sys.path:
    sys.path.append(module_path)
from data.utils import generate_split_dict

In [2]:
# Load the per-slice table (`df`) and the per-patient table (`df_patients`).
# NOTE(review): unpickling can execute arbitrary code; only load these files
# from a trusted source.
df = pd.read_pickle("../../LVNC_dataset/raw_dataset/df_info.pick")
df_patients = pd.read_pickle("../../LVNC_dataset/raw_dataset/df_info_patients.pick")

In [3]:
df

Unnamed: 0,patient,slice,set,reversed,score,pta_qlvthc,pta_class_map
0,ANH,4,Hebron,False,5.0,41.98,41.801974
1,ANH,5,Hebron,False,5.0,42.02,41.825516
2,ANH,6,Hebron,False,4.5,42.78,42.629004
3,ANH,7,Hebron,False,4.5,38.60,38.458082
4,ANH,8,Hebron,False,5.0,37.26,37.127519
...,...,...,...,...,...,...,...
3077,X9,4,X,False,5.0,35.32,35.079389
3078,X9,5,X,False,5.0,30.86,30.661447
3079,X9,6,X,False,5.0,26.88,26.719278
3080,X9,7,X,False,5.0,25.68,25.507652


In [4]:
df_patients

Unnamed: 0,patient,VT%,max_slice,reversed,set,num_slices,slices,slices_left
0,P1,29.7443,7,False,HCM,7,"[1, 2, 3, 4, 5, 6, 7]",[]
1,P2,33.0554,7,False,HCM,7,"[1, 2, 3, 4, 5, 6, 7]",[]
2,P3,29.0985,8,False,HCM,7,"[2, 3, 4, 5, 6, 7, 8]",[1]
3,P4,22.7398,7,False,HCM,7,"[1, 2, 3, 4, 5, 6, 7]",[]
4,P5,28.5148,10,False,HCM,9,"[2, 3, 4, 5, 6, 7, 8, 9, 10]",[1]
...,...,...,...,...,...,...,...,...
374,MEAP,29.0362,9,True,Hebron,7,"[3, 4, 5, 6, 7, 8, 9]","[1, 2]"
375,MJGV,32.2028,11,True,Hebron,5,"[7, 8, 9, 10, 11]","[1, 2, 3, 4, 5, 6]"
376,RAM,29.1448,5,True,Hebron,4,"[2, 3, 4, 5]",[1]
377,RGP,36.3299,10,True,Hebron,8,"[3, 4, 5, 6, 7, 8, 9, 10]","[1, 2]"


Number of patients:

In [5]:
# Count distinct patient identifiers (a missing value would count as one, as before).
df["patient"].nunique(dropna=False)

379

Number of patients per set:

In [6]:
# Distinct patients in each set, using the built-in group aggregation instead
# of a Python-level lambda per group.
df.groupby('set')['patient'].nunique(dropna=False)

set
HCM       293
Hebron     28
X          58
Name: patient, dtype: int64

Number of slices per set

In [7]:
# Rows (slices) per set. `size()` gives the same counts as applying `len` to
# every group, without the whole-frame `apply` that recent pandas versions
# warn about.
df.groupby('set').size()

set
HCM       2381
Hebron     196
X          467
dtype: int64

Split as follows:
- Test set:
    + HCM: 53 patients
    + Hebron: 8 patients
    + X: 13 patients
- Train set:
    + HCM: 240 patients
    + Hebron: 20 patients
    + X: 45 patients
    
We should take into account that it is better to build the test set from patients with no slices left out: that will let us compare against a future 3D-based version.

In [8]:
def intermediate_slice_left(group):
    """Summarise the slices of one group of rows.

    Parameters
    ----------
    group : DataFrame with a "slice" column.

    Returns
    -------
    pd.Series with two entries:
        'Continuous'  True when the slice numbers span exactly as many
                      values as there are rows (max - min + 1 == row count).
        'Num slices'  the number of rows in the group.
    """
    slice_numbers = group["slice"]
    n_rows = len(group)
    span = max(slice_numbers) - min(slice_numbers) + 1

    return pd.Series([span == n_rows, n_rows], index=['Continuous', 'Num slices'])

# One row per (set, patient): whether the slice numbers are contiguous and
# how many slices the patient has.
df_slices = df.groupby(["set","patient"]).apply(intermediate_slice_left)


In [9]:
df_slices

Unnamed: 0_level_0,Unnamed: 1_level_0,Continuous,Num slices
set,patient,Unnamed: 2_level_1,Unnamed: 3_level_1
HCM,P1,True,7
HCM,P10,True,9
HCM,P100,False,6
HCM,P101,True,8
HCM,P102,True,7
...,...,...,...
X,X66,True,8
X,X67,True,10
X,X7,True,8
X,X8,True,9


In [10]:
min(df_slices['Num slices'])

3

In [11]:
max(df_slices['Num slices'])

14

In [12]:
np.median(df_slices['Num slices'])

8.0

In [13]:
sum(df_slices['Continuous'] & (df_slices['Num slices']>4))

344

In [14]:
# Test-set candidates: contiguous slice numbers and more than 4 slices.
is_test_candidate = df_slices['Continuous'] & (df_slices['Num slices'] > 4)
df_aux = df_slices[is_test_candidate]

In [18]:
# Number of test patients to draw from each set.
test_size_per_set = {'HCM': 53, 'Hebron': 8, 'X': 13}

# `g.sample(n)` is called without `random_state`, so it draws from NumPy's
# global RNG; seeding it here keeps the selection reproducible.
np.random.seed(1234)
test_selection = []
for s, g in df_aux.groupby('set'):
    if s not in test_size_per_set:
        # Fail loudly instead of silently reusing the size of the previous set.
        raise ValueError(f"No test-set size defined for set {s!r}")
    n = test_size_per_set[s]
    test_selection.append(g.sample(n))

In [19]:
# Flatten the per-set samples and pull the patient ids out of the index.
selected_test_rows = pd.concat(test_selection).reset_index()
test_patients = list(selected_test_rows['patient'])

In [20]:
# Split the slice table by patient: every slice of a test patient goes to the
# test frame, every other slice to the train frame.
is_test_row = df['patient'].isin(test_patients)
df_train = df[~is_test_row]
df_test = df[is_test_row]

In [21]:
df_train

Unnamed: 0,patient,slice,set,reversed,score,pta_qlvthc,pta_class_map
0,ANH,4,Hebron,False,5.0,41.98,41.801974
1,ANH,5,Hebron,False,5.0,42.02,41.825516
2,ANH,6,Hebron,False,4.5,42.78,42.629004
3,ANH,7,Hebron,False,4.5,38.60,38.458082
4,ANH,8,Hebron,False,5.0,37.26,37.127519
...,...,...,...,...,...,...,...
3070,X8,10,X,True,5.0,46.59,46.366316
3071,X8,11,X,True,5.0,52.00,51.761987
3072,X8,12,X,True,4.5,44.62,44.358261
3073,X8,13,X,True,5.0,39.66,39.314408


In [22]:
df_test

Unnamed: 0,patient,slice,set,reversed,score,pta_qlvthc,pta_class_map
39,DGA,4,Hebron,False,5.0,26.01,25.817338
40,DGA,5,Hebron,False,5.0,30.38,30.161580
41,DGA,6,Hebron,False,5.0,43.59,43.368652
42,DGA,7,Hebron,False,5.0,51.89,51.717524
43,DGA,8,Hebron,False,5.0,49.17,48.987792
...,...,...,...,...,...,...,...
3077,X9,4,X,False,5.0,35.32,35.079389
3078,X9,5,X,False,5.0,30.86,30.661447
3079,X9,6,X,False,5.0,26.88,26.719278
3080,X9,7,X,False,5.0,25.68,25.507652


## Cross-validation folds

Different combinations of patient sets to use in the train and validation sets

In [23]:
# Every combination contains "HCM"; "X" and/or "Hebron" are optionally added.
set_combinations = [
    ["HCM"] + extra_sets
    for extra_sets in ([], ["X"], ["Hebron"], ["X", "Hebron"])
]

In [24]:
# Restrict the per-patient table to patients that have slices in the train frame.
train_patients = df_train["patient"].unique()
is_train_patient = df_patients["patient"].isin(train_patients)
df_train_patients = df_patients[is_train_patient]

In [25]:
df_train_patients

Unnamed: 0,patient,VT%,max_slice,reversed,set,num_slices,slices,slices_left
0,P1,29.7443,7,False,HCM,7,"[1, 2, 3, 4, 5, 6, 7]",[]
2,P3,29.0985,8,False,HCM,7,"[2, 3, 4, 5, 6, 7, 8]",[1]
3,P4,22.7398,7,False,HCM,7,"[1, 2, 3, 4, 5, 6, 7]",[]
4,P5,28.5148,10,False,HCM,9,"[2, 3, 4, 5, 6, 7, 8, 9, 10]",[1]
5,P6,26.6175,7,False,HCM,7,"[1, 2, 3, 4, 5, 6, 7]",[]
...,...,...,...,...,...,...,...,...
371,JADP,28.7786,10,True,Hebron,8,"[3, 4, 5, 6, 7, 8, 9, 10]","[1, 2]"
372,JDB,34.0609,11,True,Hebron,8,"[4, 5, 6, 7, 8, 9, 10, 11]","[1, 2, 3]"
374,MEAP,29.0362,9,True,Hebron,7,"[3, 4, 5, 6, 7, 8, 9]","[1, 2]"
376,RAM,29.1448,5,True,Hebron,4,"[2, 3, 4, 5]",[1]


In [26]:
# Build 5 patient-level folds for each set, stratified on whether the
# patient's VT% reaches the 27.4 threshold.
n_folds = 5
threshold = 27.4

skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=1234)

folds = {}
for p_set, group in df_train_patients.groupby("set"):
    above_threshold = group["VT%"] >= threshold
    folds[p_set] = [
        {"train": group.iloc[train_idx], "val": group.iloc[val_idx]}
        for train_idx, val_idx in skf.split(group, above_threshold)
    ]

In [27]:
# Report how many slices land in the validation part of every fold.
for set_name, set_folds in folds.items():
    print("Set",set_name)
    for split in set_folds:
        n_val_slices = split["val"]["num_slices"].sum()
        print(n_val_slices)

Set HCM
399
379
395
402
373
Set Hebron
31
22
28
25
31
Set X
71
72
71
72
71


In [28]:
# Expand the patient-level folds into slice-level folds: for every set and
# fold, "train" and "val" become lists of (patient, slice) tuples.
folds_slices = {}
for set_name in folds:
    folds_slices[set_name] = []
    for split in folds[set_name]:
        folds_slices[set_name].append({
            # Joining the fold's patients with the slice table on "patient"
            # yields one row per slice of each patient in that part.
            s: pd.merge(split[s], df, on="patient").apply(lambda row: (row["patient"], row["slice"]), axis=1).to_list()
            for s in ["train", "val"]
        })
        

## Create split files

In [29]:
# (patient, slice) tuple for every row of the test frame.
test_list = df_test.apply(lambda row: (row["patient"], row["slice"]), axis=1).to_list()

Split file with only HCM patients in the cross-validation and HCM, X and Hebron patients in test:

In [30]:
# HCM-only cross-validation folds plus the shared test list.
split_dict = {"test": test_list, "cross_validation": folds_slices["HCM"]}

hcm_split_path = os.path.join('../../LVNC_dataset/', "split_5_cv_HCM_groupedby_patient.json")
with open(hcm_split_path, 'w') as f:
    json.dump(split_dict, f)

## Create split files combining HCM and X

In [31]:
folds_slices.keys()

dict_keys(['HCM', 'Hebron', 'X'])

In [32]:
# Pair the i-th HCM fold with the i-th X fold and concatenate their slice lists.
folds_x_and_hcm = [
    {
        "train": hcm_part["train"] + x_part["train"],
        "val": hcm_part["val"] + x_part["val"],
    }
    for hcm_part, x_part in zip(folds_slices["HCM"], folds_slices["X"])
]

In [33]:
# Combined HCM + X cross-validation folds plus the shared test list.
split_dict = {"test": test_list, "cross_validation": folds_x_and_hcm}

hcm_x_split_path = os.path.join('../../LVNC_dataset/', "split_5_cv_HCM_X_groupedby_patient.json")
with open(hcm_x_split_path, 'w') as f:
    json.dump(split_dict, f)


Now we create a split file containing all the training data (that is, not test) without validation dataset. This is useful to train with the full training set after the cross-validation.

In [34]:
# Take the first fold of each set; its "train" and "val" parts are later
# concatenated to rebuild the full list of training slices.
fold_hcm = folds_slices["HCM"][0]
fold_x = folds_slices["X"][0]

In [36]:
# Train on every non-test HCM and X slice: the "train" and "val" parts of one
# fold are concatenated for each set. The X validation part must come from
# `fold_x`; using `fold_hcm["val"]` a second time would duplicate the HCM
# validation slices and drop the X ones.
split_dict = {
    "test": test_list,
    "train": fold_hcm["train"] + fold_hcm["val"] + fold_x["train"] + fold_x["val"],
    "val": []
}

In [38]:
# NOTE(review): "al" in the file name looks like a typo for "all"; it is kept
# unchanged because other code may load this exact name.
all_train_split_path = os.path.join('../../LVNC_dataset/', "split_al_train_HCM_X_groupedby_patient.json")
with open(all_train_split_path, 'w') as f:
    json.dump(split_dict, f)