**Subsetting the large dataset**

In [1]:
import os

# Root folder of the RSNA 2022 cervical spine competition data.
base_dir = "/kaggle/input/rsna-2022-cervical-spine-fracture-detection/"

# List the top-level entries to see which CSV files and image folders exist.
top_level_entries = os.listdir(base_dir)
print(top_level_entries)


['sample_submission.csv', 'train_images', 'train_bounding_boxes.csv', 'segmentations', 'train.csv', 'test.csv', 'test_images']


Checking which columns are available in the CSV file and can be used to reference the image dataset

In [2]:
import pandas as pd

# Study-level label table shipped with the competition data.
train_csv_path = "/kaggle/input/rsna-2022-cervical-spine-fracture-detection/train.csv"
labels = pd.read_csv(train_csv_path)

# Print the column names to see which fields can identify a study and its labels.
print(labels.columns)

Index(['StudyInstanceUID', 'patient_overall', 'C1', 'C2', 'C3', 'C4', 'C5',
       'C6', 'C7'],
      dtype='object')


Considered factors when subsetting the dataset

* Equal number of *fractured* and *normal* studies.
* 400 studies for each class.
* Split into training, validation and testing sets.
* Training + Validation 90% (further split 80/20 into training and validation), Testing 10%
* Create separate CSV files for each.
* Copy the subset into a different folder
* Use only the 3 middle slices of each study to reduce the storage requirement and the computational power needed

In [3]:
import os
import shutil
import pandas as pd
from sklearn.model_selection import train_test_split

# Base path to full dataset
base_dir = "/kaggle/input/rsna-2022-cervical-spine-fracture-detection/"
labels = pd.read_csv(os.path.join(base_dir, "train.csv"))

# Balanced subset: 400 fractured and 400 normal studies, sampled with a fixed
# seed so the selection is reproducible.
fractured = labels[labels["patient_overall"]==1]["StudyInstanceUID"].sample(400, random_state=42)
normal = labels[labels["patient_overall"]==0]["StudyInstanceUID"].sample(400, random_state=42)
subset_uids = list(fractured) + list(normal)

# Filter labels
subset_df = labels[labels["StudyInstanceUID"].isin(subset_uids)].copy()

# Lookup from study UID to its overall label.
# `subset_uids` is in sampling order (all fractured, then all normal) while
# `subset_df` keeps the row order of train.csv, so the labels handed to
# `stratify` must be fetched per UID, in the same order as the list being split.
# Otherwise the split is stratified on labels that belong to other studies.
label_by_uid = subset_df.set_index("StudyInstanceUID")["patient_overall"]

# Dataset splitting: 10% test, then 20% of the remainder as validation.
train_val_uids, test_uids = train_test_split(
    subset_uids, test_size=0.1, random_state=42,
    stratify=label_by_uid.loc[subset_uids],
)

train_uids, val_uids = train_test_split(
    train_val_uids, test_size=0.2, random_state=42,
    stratify=label_by_uid.loc[train_val_uids],
)

train_df = subset_df[subset_df["StudyInstanceUID"].isin(train_uids)]
val_df   = subset_df[subset_df["StudyInstanceUID"].isin(val_uids)]
test_df  = subset_df[subset_df["StudyInstanceUID"].isin(test_uids)]

# Save CSVs accordingly
subset_csv_dir = "subset_csv"
os.makedirs(subset_csv_dir, exist_ok=True)
train_df.to_csv(os.path.join(subset_csv_dir,"train.csv"), index=False)
val_df.to_csv(os.path.join(subset_csv_dir,"val.csv"), index=False)
test_df.to_csv(os.path.join(subset_csv_dir,"test.csv"), index=False)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


def _slice_sort_key(filename):
    """Sort key that orders slice files numerically instead of as text.

    Plain string sorting puts "10.dcm" before "2.dcm", so the middle of a
    text-sorted listing is not the middle of the slice stack.
    Assumes slice files are named by their slice number (e.g. "12.dcm") --
    TODO confirm against the dataset. Names without a purely numeric stem are
    placed after the numeric ones, in alphabetical order.
    """
    stem = os.path.splitext(filename)[0]
    if stem.isdigit():
        return (0, int(stem), "")
    return (1, 0, filename)


# Copy the subset into a new folder
subset_img_dir = "rsna_subset_images"
os.makedirs(subset_img_dir, exist_ok=True)

for pid in subset_uids:
    src_folder = os.path.join(base_dir, "train_images", pid)
    dst_folder = os.path.join(subset_img_dir, pid)
    os.makedirs(dst_folder, exist_ok=True)

    # Select 3 middle slices per study (in numeric slice order)
    all_files = sorted(os.listdir(src_folder), key=_slice_sort_key)
    mid_index = len(all_files) // 2
    # Clamp the start so very short studies never produce a negative slice index.
    first_index = max(mid_index - 1, 0)
    selected_files = all_files[first_index:mid_index + 2]  # up to 3 slices

    for f in selected_files:
        shutil.copy(os.path.join(src_folder, f), os.path.join(dst_folder, f))

print(f"Subsetted images copied to {subset_img_dir}")


Train: 576, Val: 144, Test: 80
Subsetted images copied to rsna_subset_images


In [4]:
import os
import zipfile
import shutil

# Paths
subset_img_dir = "rsna_subset_images"
subset_csv_dir = "subset_csv"
zip_name = "rsna_subset.zip"

# Track what actually goes into the archive so an incomplete zip is reported
# instead of being announced as a success.
image_count = 0
missing_csvs = []

# Create the zip
with zipfile.ZipFile(zip_name, 'w', zipfile.ZIP_DEFLATED) as zipf:
    for root, _dirs, files in os.walk(subset_img_dir):
        for f in files:
            file_path = os.path.join(root, f)
            # keep the folder structure inside the zip
            arcname = os.path.relpath(file_path, os.path.dirname(subset_img_dir))
            zipf.write(file_path, arcname)
            image_count += 1

    # Add CSV files
    for csv_file in ["train.csv", "val.csv", "test.csv"]:
        csv_path = os.path.join(subset_csv_dir, csv_file)
        if os.path.exists(csv_path):
            zipf.write(csv_path, os.path.join("subset_csv", csv_file))
        else:
            missing_csvs.append(csv_path)

# os.walk yields nothing for a missing or empty folder, so check the count explicitly.
problems = []
if image_count == 0:
    problems.append(f"no image files found under {subset_img_dir}")
if missing_csvs:
    problems.append(f"missing CSV files: {missing_csvs}")

if problems:
    print(f"WARNING: {zip_name} is incomplete - " + "; ".join(problems))
else:
    print(f"Subset folder and CSVs zipped successfully into {zip_name}")


Subset folder and CSVs zipped successfully into rsna_subset.zip
