Build a Dataset of 150 Tumor and Non-Tumor MRIs from Kaggle Dataset

In [1]:
import os
import pandas as pd 
import h5py
import cv2
from PIL import Image

In [2]:
#File Locations
CSV_PATH = r"C:\Users\rithv\Downloads\archive_extracted\BraTS20 Training Metadata.csv"
INPUT_ROOT = r"C:\Users\rithv\Downloads\archive_extracted"
OUTPUT_DIR = r"C:\Users\rithv\GitHub\brain-mri-scans\code"
TUMOR_COUNT = 150
NO_TUMOR_COUNT = 150

os.makedirs(f"{OUTPUT_DIR}/images", exist_ok=True)
os.makedirs(f"{OUTPUT_DIR}/masks", exist_ok=True)

Select 150 Tumor and Non-Tumor Images from Middle of Brain Volume

In [3]:
df = pd.read_csv(CSV_PATH)
grouped = df.groupby('volume')

selected = []
tumor_selected = 0
no_tumor_selected = 0

#Select Middle Slices
for volume, group in grouped:
    group = group.sort_values("slice")
    total_slices = len(group)
    if total_slices < 20:
        continue

    start = int(total_slices * 0.3)
    end = int(total_slices *0.7)
    middle_slices = group.iloc[start:end]

    tumor_slices = middle_slices[middle_slices["target"] == 1]
    no_tumor_slices = middle_slices[middle_slices["target"] == 0]

    for _, row in tumor_slices.iterrows():
        if tumor_selected < TUMOR_COUNT:
            selected.append(row)
            tumor_selected += 1
    
    for _, row in no_tumor_slices.iterrows():
        if no_tumor_selected < NO_TUMOR_COUNT:
            selected.append(row)
            no_tumor_selected += 1

    if tumor_selected >= TUMOR_COUNT and no_tumor_selected >= NO_TUMOR_COUNT:
        break

print("Tumor:", tumor_selected)
print("No Tumor:", no_tumor_selected)




Tumor: 150
No Tumor: 150


Define function for reading H5 file

In [4]:
def load_h5(path):
    with h5py.File(path, "r") as f:
        print("Keys in this h5 file as:", list(f.keys()))
        img = f["image"][:]
        mask_key = "mask" if "mask" in f else "label"
        mask = f[mask_key][:]
    return img, mask    

Reading H5 MRI Files and creating a dataset with the Images, Mask, and a CSV file

In [6]:
rows_for_csv = []

for i, row in enumerate(selected, start=1):

    h5_path = row["slice_path"].replace(
    "../input/brats2020-training-data",
    INPUT_ROOT)
    img, mask = load_h5(h5_path)

    img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX).astype("uint8")
    mask = (mask > 0).astype("uint8") * 255

    img_filename = f"{i}.png"
    mask_filename = f"{i}_mask.png"

    Image.fromarray(img).save(f"{OUTPUT_DIR}/images/{img_filename}")
    Image.fromarray(mask).save(f"{OUTPUT_DIR}/masks/{mask_filename}")

    rows_for_csv.append({
        "filename": img_filename,
        "target": int(row["target"])
    })

df_out = pd.DataFrame(rows_for_csv)
df_out.to_csv(f"{OUTPUT_DIR}/labels.csv", index=False)

print("Dataset and CSV file created successfully")

Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in this h5 file as: ['image', 'mask']
Keys in thi