In [1]:
import os
import pydicom
import cv2
import pandas as pd
from tqdm import tqdm

In [11]:
# Paths
TRAIN_DIR = "../data/rsna/stage_2_train_images"
LABELS_CSV = "../data/rsna/stage_2_train_labels.csv"
OUTPUT_IMG_DIR = "../data/rsna/images"
OUTPUT_LABELS_CSV = "../data/rsna/labels.csv"
CONVERT_TEST = True

In [12]:
os.makedirs(OUTPUT_IMG_DIR, exist_ok=True)

In [14]:
# Read the labels
df = pd.read_csv(LABELS_CSV)

# Keep only useful columns: patientId + Target (0 or 1)
df = df[["patientId", "Target"]].rename(columns={"patientId": "image", "Target": "label"})

print(f"Total images to convert: {len(df)}")

Total images to convert: 30227


In [15]:
for _, row in tqdm(df.iterrows(), total=len(df), desc="Converting train set"):
    dcm_path = os.path.join(TRAIN_DIR, f"{row['image']}.dcm")
    jpg_path = os.path.join(OUTPUT_IMG_DIR, f"{row['image']}.jpg")

    if not os.path.exists(dcm_path):
        print(f"Skipping missing file {dcm_path}")
        continue

    try:
        dcm = pydicom.dcmread(dcm_path)
        img = dcm.pixel_array

        # Normalize to 0–255 and save as JPEG
        img = cv2.convertScaleAbs(img, alpha=(255.0 / img.max()))
        cv2.imwrite(jpg_path, img)
    except Exception as e:
        print(f"Error converting {row['image']}: {e}")

Converting train set: 100%|██████████| 30227/30227 [04:09<00:00, 121.28it/s]


In [16]:
df["image"] = df["image"].apply(lambda x: f"{x}.jpg")
df.to_csv(OUTPUT_LABELS_CSV, index=False)
print(f"Conversion complete → {OUTPUT_LABELS_CSV}")

Conversion complete → ../data/rsna/labels.csv


In [17]:
TEST_DIR = "../data/rsna/stage_2_test_images"
if CONVERT_TEST:
    TEST_OUTPUT_DIR = os.path.join("../data/rsna/images_test")
    os.makedirs(TEST_OUTPUT_DIR, exist_ok=True)
    test_files = [f for f in os.listdir(TEST_DIR) if f.endswith(".dcm")]
    print(f"Converting {len(test_files)} test DICOMs...")

    for f in tqdm(test_files, desc="Converting test set"):
        dcm_path = os.path.join(TEST_DIR, f)
        jpg_path = os.path.join(TEST_OUTPUT_DIR, f.replace(".dcm", ".jpg"))
        try:
            dcm = pydicom.dcmread(dcm_path)
            img = dcm.pixel_array
            img = cv2.convertScaleAbs(img, alpha=(255.0 / img.max()))
            cv2.imwrite(jpg_path, img)
        except Exception as e:
            print(f"Error converting test file {f}: {e}")

    print(f"Test images saved in {TEST_OUTPUT_DIR}")

Converting 3000 test DICOMs...


Converting test set: 100%|██████████| 3000/3000 [00:31<00:00, 95.80it/s] 

Test images saved in ../data/rsna/images_test



