<a href="https://colab.research.google.com/github/sajjkavinda/rsna-classification/blob/main/bone_fractures_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive

# Location inside the Colab VM where Google Drive is attached; the dataset
# archive is later read from below this directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

MessageError: Error: credential propagation was unsuccessful

In [None]:
!unzip "/content/drive/MyDrive/rsna_subset_png.zip" -d "/content/rsna_subset"


In [None]:
import os
# Peek at the first few entries of the extracted dataset. The path must match
# the unzip destination ("/content/rsna_subset"); no cell in this notebook
# creates "/content/rsna_subset_png", so listing that path fails on a fresh run.
os.listdir("/content/rsna_subset/kaggle/working/rsna_subset")[:5]

In [None]:
!pip uninstall -y gdcm python-gdcm pydicom pylibjpeg pylibjpeg-libjpeg pylibjpeg-openjpeg
!pip install pydicom pylibjpeg pylibjpeg-libjpeg pylibjpeg-openjpeg
!pip install opencv-python matplotlib tqdm

In [None]:
import os
import pydicom
import cv2
from tqdm import tqdm

# Convert every DICOM file found in the per-patient folders of `input_dir`
# into an 8-bit PNG, mirroring the folder layout under `output_dir`.
input_dir = "/content/rsna_subset/kaggle/working/rsna_subset"
output_dir = "/content/rsna_png_images"
os.makedirs(output_dir, exist_ok=True)

for patient in tqdm(os.listdir(input_dir)):
    patient_path = os.path.join(input_dir, patient)
    if not os.path.isdir(patient_path):
        continue  # stray top-level files are not patient folders
    patient_output = os.path.join(output_dir, patient)
    os.makedirs(patient_output, exist_ok=True)

    for file in os.listdir(patient_path):
        # splitext swaps only the real extension (str.replace would rewrite
        # every ".dcm" occurrence); the check is case-insensitive so ".DCM"
        # files are converted as well.
        stem, ext = os.path.splitext(file)
        if ext.lower() != ".dcm":
            continue
        dcm_path = os.path.join(patient_path, file)
        png_path = os.path.join(patient_output, stem + ".png")

        try:
            dcm = pydicom.dcmread(dcm_path)
            img = dcm.pixel_array
            # dtype=CV_8U is essential: without it cv2.normalize keeps the
            # source depth, so 16-bit pixel data would be saved as a 16-bit
            # PNG whose values only span 0..255 (an almost black image).
            img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)
            # cv2.imwrite reports failure through its return value instead of
            # raising, so check it explicitly.
            if not cv2.imwrite(png_path, img):
                print(f"Error converting {dcm_path}: could not write {png_path}")
        except Exception as e:
            # Best effort: report the unreadable file and keep converting.
            print(f"Error converting {dcm_path}: {e}")


In [None]:
import os
# Sanity check: number of entries written to the PNG output directory.
png_entries = os.listdir("/content/rsna_png_images")
print(len(png_entries))

In [None]:
import pandas as pd

# Directory that holds the label CSV.
base_dir = "/content/"

# Read the full label table (re-read here so the cell works on its own).
labels = pd.read_csv(os.path.join(base_dir, "train.csv"))

# Restrict the labels to the studies present in the PNG output directory and
# keep only the study id and the overall label column.
subset_ids = os.listdir("/content/rsna_png_images")
in_subset = labels["StudyInstanceUID"].isin(subset_ids)
subset_labels = labels.loc[in_subset, ["StudyInstanceUID", "patient_overall"]]

print(subset_labels.head())

In [None]:
import os

# Study folders available on disk. The path must match the unzip destination
# ("/content/rsna_subset"); no cell in this notebook creates
# "/content/rsna_subset_png", so listing that path fails on a fresh run.
extracted_root = "/content/rsna_subset/kaggle/working/rsna_subset/"
converted_ids = [
    d for d in os.listdir(extracted_root)
    if os.path.isdir(os.path.join(extracted_root, d))  # ignore stray files
]
print("Total converted folders:", len(converted_ids))

In [None]:
import pandas as pd

train_csv_path = "/content/train.csv"  # adjust to where your csv actually is
test_csv_path = "/content/test.csv"

# Load each CSV and immediately keep only the rows whose study folder exists
# on disk (i.e. whose StudyInstanceUID is listed in `converted_ids`).
train_df = pd.read_csv(train_csv_path)
train_df = train_df.loc[train_df["StudyInstanceUID"].isin(converted_ids)]

test_df = pd.read_csv(test_csv_path)
test_df = test_df.loc[test_df["StudyInstanceUID"].isin(converted_ids)]

print("Train available:", len(train_df))
print("Test available:", len(test_df))


In [None]:
from sklearn.model_selection import train_test_split

# Hold out a fixed fraction of the available studies for validation. The split
# is stratified on the overall label and seeded so it is reproducible.
VAL_FRACTION = 0.2
SPLIT_SEED = 42

train_split, val_split = train_test_split(
    train_df,
    test_size=VAL_FRACTION,
    random_state=SPLIT_SEED,
    stratify=train_df["patient_overall"],
)

# use val as test if you just need it for experiments
test_split = val_split.copy()

In [None]:
# Root folder that copy_images reads the per-study image folders from. It must
# match the unzip destination ("/content/rsna_subset"); no cell in this
# notebook creates "/content/rsna_subset_png".
converted_dir = "/content/rsna_subset/kaggle/working/rsna_subset/"


In [None]:
import shutil
from tqdm import tqdm

def copy_images(df, split, labeled=True, src_root=None, dst_root="/content/data"):
    """Copy the files of every study in `df` into a label-named split folder.

    Files are written to ``{dst_root}/{split}/{label}/`` and renamed to
    ``{StudyInstanceUID}_{original file name}`` so files from different
    studies cannot overwrite each other.

    Args:
        df: DataFrame with a "StudyInstanceUID" column and, when `labeled` is
            True, a "patient_overall" column.
        split: Name of the split sub-folder (e.g. "train", "val", "test").
        labeled: When True, the label is "fractured" if patient_overall == 1
            and "normal" otherwise. When False, every study is placed under
            "unknown" and "patient_overall" is never read.
        src_root: Directory containing one sub-folder per StudyInstanceUID.
            Defaults to the notebook-level `converted_dir`.
        dst_root: Root of the output directory tree.

    Returns:
        None.

    Raises:
        FileNotFoundError: If a study folder does not exist under `src_root`.
    """
    if src_root is None:
        # Keep the original behaviour of reading the notebook-level variable.
        src_root = converted_dir

    for _, row in tqdm(df.iterrows(), total=len(df)):
        study_id = row["StudyInstanceUID"]
        src_dir = os.path.join(src_root, study_id)

        # Decide the label first so an unlabeled frame never needs the
        # "patient_overall" column.
        if not labeled:
            label = "unknown"
        elif row["patient_overall"] == 1:
            label = "fractured"
        else:
            label = "normal"

        dst_dir = os.path.join(dst_root, split, label)
        os.makedirs(dst_dir, exist_ok=True)

        for file in os.listdir(src_dir):
            src_path = os.path.join(src_dir, file)
            # shutil.copy cannot copy directories; skip anything that is not
            # a regular file instead of aborting the whole copy.
            if not os.path.isfile(src_path):
                continue
            shutil.copy(src_path, os.path.join(dst_dir, f"{study_id}_{file}"))

# Populate the train, val and test folders in that order; only the test frame
# is copied without labels.
for frame, split_name, has_labels in (
    (train_split, "train", True),
    (val_split, "val", True),
    (test_df, "test", False),
):
    copy_images(frame, split_name, labeled=has_labels)