# In [None]:
"""
Script to create a balanced subset of CT trauma patients (with and without active bleeding)
from the RSNA dataset. Outputs:
- bleed_subset_ids.csv: list of selected patient IDs
- bleed_subset_images/: directory containing the extracted CT image folders for those patients

How to use:
1. Place 'train_2024.csv' (labels file) and 'train_images.zip' (Kaggle download) in the working directory.
2. Run this script.
"""

import pandas as pd
import zipfile
import os

# Study-level labels file (one row per study, with a 'patient_id' column).
LABEL_CSV = 'train_2024.csv'


def select_balanced_studies(labels, random_state=42):
    """Split *labels* into bleed-positive rows and an equally sized negative sample.

    Args:
        labels: DataFrame holding the columns 'extravasation_injury' and
            'extravasation_healthy' (rows are selected where the value is 1).
        random_state: Seed passed to ``DataFrame.sample`` so the negative
            selection is reproducible between runs.

    Returns:
        A ``(positives, negatives)`` tuple of DataFrames. All positive rows
        are kept; negatives are randomly down-sampled to the number of
        positives. If there are fewer negatives than positives, every
        negative row is returned.
    """
    positives = labels[labels['extravasation_injury'] == 1]
    negatives = labels[labels['extravasation_healthy'] == 1]

    # Cap the sample size at the number of available negatives:
    # sample(n=...) without replacement raises when n exceeds the row count.
    n_negatives = min(len(positives), len(negatives))
    negatives = negatives.sample(n=n_negatives, random_state=random_state)
    return positives, negatives


# Names assigned under the guard are still module-level globals when the file
# is run as a script or notebook cell; the guard only keeps a plain import of
# this module from reading and writing files.
if __name__ == "__main__":
    # Load study-level labels
    df = pd.read_csv(LABEL_CSV)

    # All bleed-positive studies plus an equal number of bleed-negative ones
    pos_df, neg_df = select_balanced_studies(df)

    # Combine and save patient IDs (as strings, to compare against zip folder names)
    subset_ids = pd.concat([pos_df, neg_df])['patient_id'].astype(str)
    subset_ids.to_csv('bleed_subset_ids.csv', index=False)
    print(f"Balanced subset: {len(pos_df)} positives, {len(neg_df)} negatives.")
    print("Example patient IDs:", subset_ids.head().to_list())

# Extract relevant image folders from zip
IMAGES_ZIP = 'train_images.zip'
OUT_DIR = 'bleed_subset_images'


def extract_patient_folders(zip_path, out_dir, patient_ids, verbose=True):
    """Extract the zip members that belong to the selected patients.

    A member is selected when its path contains at least one '/' and the
    first path component equals one of *patient_ids* (the archive is expected
    to be laid out as "patient_id/series_id/xxx.dcm").

    Args:
        zip_path: Path to the image archive.
        out_dir: Destination directory; created if it does not exist.
        patient_ids: Iterable of patient IDs; each one is compared as a string.
        verbose: When true, print one "Extracted:" line per extracted member.

    Returns:
        List of the member names that were extracted, in archive order.
    """
    # Build the lookup set once: membership is then O(1) per archive member
    # instead of a linear scan over all IDs for every file in the zip.
    wanted = {str(pid) for pid in patient_ids}
    os.makedirs(out_dir, exist_ok=True)

    extracted = []
    with zipfile.ZipFile(zip_path, "r") as zipf:
        for member in zipf.namelist():
            top_level, slash, _ = member.partition('/')
            if not slash or top_level not in wanted:
                continue
            # ZipFile.extract recreates the member's relative path under
            # out_dir, streams the payload instead of loading it fully into
            # memory, copes with directory entries, and strips absolute or
            # ".." path components so nothing is written outside out_dir.
            zipf.extract(member, out_dir)
            extracted.append(member)
            if verbose:
                print("Extracted:", member)
    return extracted


if __name__ == "__main__":
    print("Extracting relevant patient folders...")
    # subset_ids is the Series of selected patient IDs built earlier in this script.
    extract_patient_folders(IMAGES_ZIP, OUT_DIR, subset_ids)
    print("All selected patient folders are in", OUT_DIR)