# Split your data into Train-Test-Validation splits
## Store these splits in a CSV file in the splits folder

For our sample dataset, ChestMNIST, the authors have already split the data into train, test, and validation sets. We'll go ahead and use these splits, but note that it will not always be this easy.
Often, we have to create the splits ourselves.
To do that, I often get all the file names, shuffle them, and then split them into the desired proportions.
Then, I write these splits into a CSV file in the splits folder.
So, splits/ will contain CSVs that list which files from our data pool belong to each split.
Doing it this way ensures that we can always recreate the splits if we need to. It also allows us to remove bad/contaminated samples from our data.

In [1]:
# Name of this particular split; used for the sub-folder and the CSV file names.
SPLIT_NAME = "my_split_64"
# Source archive that holds the images and labels.
DATA_SRC = "/home/sasank.desaraju/med-start/data/chestmnist_64.npz"
# Parent folder under which every split gets its own sub-folder of CSVs.
SPLIT_DIR = "/home/sasank.desaraju/med-start/splits"

In [2]:
import sys
import os
import time
from sklearn.model_selection import train_test_split as tts
import pandas as pd
import numpy as np
import glob
import csv

In [3]:
"""
Create train/test/val CSVs where each one has the following columns:
- image: identifier of the image (a path, or an index into the data arrays)
- label: identifier of the label for that image
- stage: train/test/val
- patient_id: patient id of the image
"""
# Open the dataset archive at DATA_SRC; the per-split arrays are read from it
# by key (e.g. data['train_images']) in the next cell.
data = np.load(DATA_SRC)

In [5]:
# Folder that will hold the three CSVs of this split. exist_ok=True replaces the
# separate "does it exist?" check, so re-running the cell is safe.
split_folder = os.path.join(SPLIT_DIR, SPLIT_NAME)
os.makedirs(split_folder, exist_ok=True)

# Keep every array available under its original name for use outside this cell.
train_images = data['train_images']
train_labels = data['train_labels']
test_images = data['test_images']
test_labels = data['test_labels']
val_images = data['val_images']
val_labels = data['val_labels']

# Write one CSV per stage. The data lives in arrays rather than in individual
# files, so the row index stands in for the image, label and patient_id
# columns. With file-based data these columns would hold the image path, the
# label path and e.g. the parent folder name instead.
for stage, images in (('train', train_images), ('test', test_images), ('val', val_images)):
    csv_path = os.path.join(split_folder, f'{stage}_{SPLIT_NAME}.csv')
    # newline='' is required by the csv module; without it, platforms that
    # translate line endings end up with an extra blank line after every row.
    with open(csv_path, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['image', 'label', 'stage', 'patient_id'])
        writer.writerows([i, i, stage, i] for i in range(len(images)))


Below is how to do this if you just have a bunch of image and label files in a folder that you have to split into train, test, and validation sets yourself.

In [None]:
# Create a CSV of all of the images and labels we have
"""
Create a master CSV with one row per sample folder found directly under IMAGE_ROOT.

Columns: image, label (both paths relative to IMAGE_ROOT) and patient_id (the folder name).
IMAGE_ROOT, DATA_ROOT and master_csv_name must already be defined before running this cell.
"""
# Keep only directories, and sort them: glob returns entries in arbitrary order,
# and a stable row order is what makes the seeded split reproducible.
folders = sorted(
    path for path in glob.glob(os.path.join(IMAGE_ROOT, '*')) if os.path.isdir(path)
)
# newline='' is required by the csv module so no blank lines appear between rows.
with open(os.path.join(DATA_ROOT, master_csv_name), 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(['image', 'label', 'patient_id'])
    for folder in folders:
        # The folder name doubles as the patient/sample identifier.
        patient_id = os.path.basename(folder)
        image = os.path.join(patient_id, patient_id + '_image.nii.gz')
        label = os.path.join(patient_id, patient_id + '_label.nii.gz')
        writer.writerow([image, label, patient_id])

In [None]:
# Create the train/test/val CSVs
"""
Split the master CSV into train/test/val CSVs.

20% of the rows go to test; 20% of the remainder goes to val, which gives a
64% / 16% / 20% train/val/test split. DATA_ROOT, master_csv_name and data_name
must already be defined before running this cell.
"""
# Read in the master CSV
df = pd.read_csv(os.path.join(DATA_ROOT, master_csv_name))
# A fixed random_state keeps the split reproducible for the same master CSV.
train, test = tts(df, test_size=0.2, random_state=42)
train, val = tts(train, test_size=0.2, random_state=42)
# Build the output folder once with os.path.join so the created directory and
# the paths written below always agree, even if DATA_ROOT has no trailing slash.
split_dir = os.path.join(DATA_ROOT, data_name)
os.makedirs(split_dir, exist_ok=True)
# Write the CSVs
train.to_csv(os.path.join(split_dir, 'train_' + data_name + '.csv'), index=False)
test.to_csv(os.path.join(split_dir, 'test_' + data_name + '.csv'), index=False)
val.to_csv(os.path.join(split_dir, 'val_' + data_name + '.csv'), index=False)

In [5]:
# This is a file I used to do this for another project where I had my images and labels in separate folders and had to create the split myself.

"""
Sasank Desaraju
4/4/23

This is to create a train/test/val split of CSVs for our Fistula Segmentation dataset.
"""

import sys
import os
import time
from sklearn.model_selection import train_test_split as tts
import pandas as pd
import numpy as np
import glob
import csv


"""
Each pair of image and label are inside of their own folder with a numerical identifier.
Within each folder, the image and label are named [number]_image.nii.gz and [number]_label.nii.gz.
For example, the image and label for the pair with the identifier 1 are located at:
    IMAGE_ROOT/1/1_image.nii.gz
    IMAGE_ROOT/1/1_label.nii.gz

We want to create, first, a master CSV that lists every image and its label (plus the patient ID), one row per pair.

Then, we want to split this into train/test/val sets and create CSVs of each of the splits.
The CSVs should be in the format:
    image, label, patient_id
where both the image and label paths are relative to the IMAGE_ROOT.
"""

# Folder containing one sub-folder per image/label pair.
IMAGE_ROOT = "/media/sasank/LinuxStorage/Dropbox (UFL)/FistulaData/Segmentations/"
# Folder where the master CSV and the split sub-folders are written.
DATA_ROOT = "/home/sasank/Documents/GitRepos/Fistula-Segmentation/data/"

def create_master_csv(master_csv_name):
    """
    Create a master CSV with the image and label names.

    One row is written per folder found directly under IMAGE_ROOT, with the
    columns image, label (both relative to IMAGE_ROOT) and patient_id (the
    folder name). The file is written to DATA_ROOT.

    Args:
        master_csv_name: File name of the CSV to create inside DATA_ROOT.
    """
    # Keep only directories, and sort them: glob returns entries in arbitrary
    # order, and a stable row order is what makes the seeded split reproducible.
    folders = sorted(
        path for path in glob.glob(os.path.join(IMAGE_ROOT, '*')) if os.path.isdir(path)
    )
    # newline='' is required by the csv module so no blank lines appear between rows.
    with open(os.path.join(DATA_ROOT, master_csv_name), 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['image', 'label', 'patient_id'])
        for folder in folders:
            # The folder name doubles as the patient/sample identifier.
            patient_id = os.path.basename(folder)
            image = os.path.join(patient_id, patient_id + '_image.nii.gz')
            label = os.path.join(patient_id, patient_id + '_label.nii.gz')
            writer.writerow([image, label, patient_id])

def create_train_test_val_csvs(data_name, master_csv_name):
    """
    Create the train/test/val CSVs.

    20% of the master CSV's rows go to test; 20% of the remainder goes to val,
    which gives a 64% / 16% / 20% train/val/test split. The three files are
    written to DATA_ROOT/<data_name>/ as train_<data_name>.csv,
    test_<data_name>.csv and val_<data_name>.csv.

    Args:
        data_name: Name of the split; used for the sub-folder and file names.
        master_csv_name: File name of the master CSV inside DATA_ROOT.
    """
    # Read in the master CSV
    df = pd.read_csv(os.path.join(DATA_ROOT, master_csv_name))
    # A fixed random_state keeps the split reproducible for the same master CSV.
    train, test = tts(df, test_size=0.2, random_state=42)
    train, val = tts(train, test_size=0.2, random_state=42)
    # Build the output folder once with os.path.join so the created directory
    # and the paths written below always agree, even if DATA_ROOT has no
    # trailing slash. exist_ok=True makes re-runs safe.
    split_dir = os.path.join(DATA_ROOT, data_name)
    os.makedirs(split_dir, exist_ok=True)
    # Write the CSVs
    train.to_csv(os.path.join(split_dir, 'train_' + data_name + '.csv'), index=False)
    test.to_csv(os.path.join(split_dir, 'test_' + data_name + '.csv'), index=False)
    val.to_csv(os.path.join(split_dir, 'val_' + data_name + '.csv'), index=False)


if __name__ == '__main__':
    # Step 1: list every image/label pair in a single master CSV.
    create_master_csv('full_data.csv')
    # Step 2: derive the train/test/val CSVs of the 'BaseSplit' split from it.
    create_train_test_val_csvs('BaseSplit', 'full_data.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/home/sasank/Documents/GitRepos/Fistula-Segmentation/data/full_data.csv'