# Train Test Split

This notebook creates a train-test split by patient. Once the patients are split, then we randomly sample a subset of each patients lung images to be used for training. 

In [1]:
import os, shutil
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
IMAGE_DIR = "/Users/raamis/lung_cancer_detector/data/images/"

images = os.listdir(IMAGE_DIR)
images = np.core.defchararray.add(IMAGE_DIR,images)

labels = np.core.defchararray.replace(images,"images","labels")
labels = np.core.defchararray.replace(labels,".png",".txt")

# Create dataframe of images and labels
patient_df = pd.DataFrame()
patient_df["image_path"] = images
patient_df["label_path"] = labels

patient_names = []
for name in patient_df["image_path"]:
    id = name.split("/")[-1].split("_")[0]
    patient_names.append(id)

patient_df["patient_id"] = patient_names
patient_df.head()

Unnamed: 0,image_path,label_path,patient_id
0,/Users/raamis/lung_cancer_detector/data/images...,/Users/raamis/lung_cancer_detector/data/labels...,G0059
1,/Users/raamis/lung_cancer_detector/data/images...,/Users/raamis/lung_cancer_detector/data/labels...,G0044
2,/Users/raamis/lung_cancer_detector/data/images...,/Users/raamis/lung_cancer_detector/data/labels...,A0112
3,/Users/raamis/lung_cancer_detector/data/images...,/Users/raamis/lung_cancer_detector/data/labels...,A0084
4,/Users/raamis/lung_cancer_detector/data/images...,/Users/raamis/lung_cancer_detector/data/labels...,A0249


In [3]:
# Create 80/20 train test split
patient_id = patient_df["patient_id"].unique()
train_patients,test_patients = train_test_split(patient_id,test_size=0.2,train_size=0.8)

## Sample Train and Test Images

Now that we have split the patients into a train and test set, let's randomly sample images from each patient to create a set of images and labels for training and testing.

In [4]:
# create dataframes for train and test set
train_df = patient_df[patient_df.patient_id.isin(train_patients)]
test_df = patient_df[patient_df.patient_id.isin(test_patients)]

In [5]:
# sample 3 images from each patient in the train and test set
train_sample = train_df.groupby("patient_id").apply(
    pd.DataFrame.sample,n=3,replace=True
).reset_index(drop=True)

test_sample = test_df.groupby("patient_id").apply(
    pd.DataFrame.sample,n=3,replace=True
).reset_index(drop=True)

In [6]:
# Drop any duplicates introduced by the sampling
train_sample = train_sample.drop_duplicates("image_path")
test_sample = test_sample.drop_duplicates("image_path")

In [7]:
print(f"There are {len(train_sample)} images in the training set")
print(f"There are {len(test_sample)} images in the training set")

There are 804 images in the training set
There are 203 images in the training set


In [8]:
# make sure there are no duplicate patients between the training and validation sets
num_dupes = np.intersect1d(
    test_sample["patient_id"].unique(),
    train_sample["patient_id"].unique()
)

print(f"There are {len(num_dupes)} duplicate patients")

There are 0 duplicate patients


## Move images into YOLO Training Directoies

In [9]:
TRAIN_IMAGE_DIR = "/Users/raamis/repos/lung_cancer_detector/data/images/train"
TRAIN_LABEL_DIR = "/Users/raamis/repos/lung_cancer_detector/data/labels/train"

VAL_IMAGE_DIR = "/Users/raamis/repos/lung_cancer_detector/data/images/val"
VAL_LABEL_DIR = "/Users/raamis/repos/lung_cancer_detector/data/labels/val"

The below part is commented out since we only want to copy over images and labels once

In [10]:
# Copy over training images and labels
# for image,label in zip(train_sample["image_path"],train_sample["label_path"]):
#     new_image_path = os.path.join(TRAIN_IMAGE_DIR,image.split('/')[-1])
#     new_label_path = os.path.join(TRAIN_LABEL_DIR,label.split('/')[-1])

#     shutil.copy(image,new_image_path)
#     shutil.copy(label,new_label_path)

# # copy over validation images and labels
# for image,label in zip(test_sample["image_path"],test_sample["label_path"]):
#     new_image_path = os.path.join(VAL_IMAGE_DIR,image.split('/')[-1])
#     new_label_path = os.path.join(VAL_LABEL_DIR,label.split('/')[-1])

#     shutil.copy(image,new_image_path)
#     shutil.copy(label,new_label_path)

## Sanity Checks

Let's make sure there is a proper 1 to 1 image-label mapping in the training and validation directories. We also check to make sure there are no duplicate files between the training and validation directories.

In [11]:
from yolo_directory_checks import check_for_duplicates, valid_image_label_mapping

In [12]:
print(check_for_duplicates("/Users/raamis/repos/lung_cancer_detector/data/images/"))

[]


In [13]:
IMAGE_DIR = "/Users/raamis/repos/lung_cancer_detector/data/images/"
LABEL_DIR = "/Users/raamis/repos/lung_cancer_detector/data/labels/"
print(valid_image_label_mapping(IMAGE_DIR,LABEL_DIR))


True
