In [9]:
from pathlib import Path

import pandas as pd
from sklearn.model_selection import GroupShuffleSplit, GroupKFold

Load the targets and keep only the rows whose image file is present in `../data/stacks`, then count them.

In [2]:
# Read the targets table and keep only the rows that have an image on disk.
targets = Path("../data/targets.csv")
df = pd.read_csv(targets)

# Each row's image file is named after its index label, e.g. index 0 -> "0.tif".
df["filename"] = [f"{index}.tif" for index in df.index]

# Names of the .tif stacks that are actually present in the stacks folder.
filenames = [fn.name for fn in Path("../data/stacks").glob("*.tif")]

# Drop every target whose image is missing, then show how many rows remain.
has_image = df["filename"].isin(filenames)
df = df[has_image]
len(df)

53

First, shuffle the dataset and split it into a train set and a test set, making sure that no person appears in both.

In [7]:
# Split the full dataset into train and test, grouped by person so that no
# person_id ends up in both sets.
SPLIT_SEED = 42  # fixed seed so the shuffle is reproducible across kernel restarts
gss = GroupShuffleSplit(n_splits=1, random_state=SPLIT_SEED)

# split() yields *positional* row indices into `df`, not values of the
# "sample" column, so the rows have to be selected with .iloc.
train, test = next(gss.split(df["sample"], groups=df["person_id"]))
train_df = df.iloc[train]
test_df = df.iloc[test]

# Sanity checks: every row is used exactly once and no person is shared.
assert len(train_df) + len(test_df) == len(df)
assert set(train_df["person_id"]).isdisjoint(test_df["person_id"]), "person leaked across train/test"

# Make sure the output folder exists before writing the split files.
splits_dir = Path("../data/splits")
splits_dir.mkdir(parents=True, exist_ok=True)
train_df.to_csv(splits_dir / "train.csv", index=False)
test_df.to_csv(splits_dir / "test.csv", index=False)

Create cross-validation splits of the training set, again making sure that no person appears in both the training and validation sets of a fold.

In [17]:
train_path = "../data/splits/train.csv"
# Split the train dataset into train and validation for cross-validation.
# NOTE: this rebinds `df` to the training subset read back from disk; re-run
# the loading cell before re-running the train/test split cell.
df = pd.read_csv(train_path)
gkf = GroupKFold(n_splits=5)
for fold, (train, val) in enumerate(gkf.split(df["sample"], groups=df["person_id"])):
    # split() yields *positional* row indices into `df`, not values of the
    # "sample" column, so the rows have to be selected with .iloc.
    train_df = df.iloc[train]
    val_df = df.iloc[val]

    # Sanity checks: every row is used exactly once and no person is shared.
    assert len(train_df) + len(val_df) == len(df)
    assert set(train_df["person_id"]).isdisjoint(val_df["person_id"]), f"person leaked in fold {fold}"

    train_df.to_csv(f"../data/splits/fold-{fold}-split-train.csv", index=False)
    val_df.to_csv(f"../data/splits/fold-{fold}-split-val.csv", index=False)