# EDA — LGG Brain MRI Segmentation (Kaggle)

Goal:
- Understand file structure and pairing (image ↔ mask)
- Quantify tumor presence and mask area statistics
- Visualize samples + overlays
- Build a patient-level table (so later train/validation splits can be made per patient, avoiding leakage)


In [None]:
# Standard-library helpers used throughout the notebook.
import os, re, glob, random, math
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
# Use the "seaborn-v0_8" matplotlib style sheet for all figures.
plt.style.use("seaborn-v0_8")
# Let pandas display up to 200 columns / 200 rows before truncating output.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)

from PIL import Image

# Seed both Python's `random` module and NumPy's global RNG with the same
# value so that any random sampling done with them is repeatable across runs.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


In [None]:
# Kaggle usually mounts attached datasets under this directory.
KAGGLE_INPUT = Path("/kaggle/input")

# Recursively scan for directories named "kaggle_3m".
# The matches are sorted because glob order depends on the filesystem; without
# sorting, DATA_ROOT could differ between runs whenever more than one
# "kaggle_3m" folder is present under /kaggle/input.
candidates = []
if KAGGLE_INPUT.exists():
    candidates = sorted(
        p for p in KAGGLE_INPUT.glob("**/kaggle_3m") if p.is_dir()
    )

print("Found candidates:", [str(p) for p in candidates[:10]])

if not candidates:
    raise FileNotFoundError(
        "Could not find 'kaggle_3m' under /kaggle/input. "
        "Open the dataset page and attach the dataset to this notebook."
    )

# First match in sorted order, so the choice is reproducible.
DATA_ROOT = candidates[0]
DATA_ROOT


In [None]:
# Every immediate subdirectory of DATA_ROOT is listed as a patient folder;
# plain files sitting next to them are skipped.
patient_dirs = [entry for entry in DATA_ROOT.iterdir() if entry.is_dir()]
patient_dirs.sort()

print("DATA_ROOT:", DATA_ROOT)
print("Number of patient folders:", len(patient_dirs))
print("First 5 patient folders:", [folder.name for folder in patient_dirs[:5]])


In [None]:
# Gather every .tif located exactly one directory level below DATA_ROOT.
all_tifs = sorted(glob.glob(str(DATA_ROOT / "*" / "*.tif")))

# Split by filename suffix. Filtering an already-sorted list keeps it sorted,
# so both partitions stay in the same order as `all_tifs`.
mask_tifs = [path for path in all_tifs if path.endswith("_mask.tif")]
img_tifs = [path for path in all_tifs if not path.endswith("_mask.tif")]

print("Total .tif files:", len(all_tifs))
print("Image .tif files:", len(img_tifs))
print("Mask  .tif files:", len(mask_tifs))

# Sanity checks: stop early if either partition came back empty.
assert mask_tifs, "No mask files found."
assert img_tifs, "No image files found."


In [None]:
def to_key(path) -> str:
    """Return the pairing key shared by an image file and its mask file.

    The key is the file name with a trailing ``_mask.tif`` or ``.tif``
    removed, so an image and its mask map to the same string:

        .../TCGA_XXXX/TCGA_XXXX_YY.tif       -> "TCGA_XXXX_YY"
        .../TCGA_XXXX/TCGA_XXXX_YY_mask.tif  -> "TCGA_XXXX_YY"

    Args:
        path: File path (``str`` or path-like); only its final component
            is used.

    Returns:
        The file name without its trailing ``_mask.tif`` / ``.tif``. A name
        that ends with neither suffix is returned unchanged.
    """
    base = Path(path).name
    # Strip only the *trailing* suffix. ``str.replace`` would also delete the
    # same text where it occurs in the middle of the name and corrupt the key.
    # "_mask.tif" is checked first because such names also end with ".tif".
    for suffix in ("_mask.tif", ".tif"):
        if base.endswith(suffix):
            return base[: -len(suffix)]
    return base

# Index image and mask paths by their shared key (see `to_key`).
img_map = {to_key(path): path for path in img_tifs}
msk_map = {to_key(path): path for path in mask_tifs}

# Dict key views support set algebra directly:
#   intersection -> keys that have both an image and a mask
#   differences  -> keys that are missing one side of the pair
keys = sorted(img_map.keys() & msk_map.keys())
missing_img = sorted(msk_map.keys() - img_map.keys())
missing_msk = sorted(img_map.keys() - msk_map.keys())

print("Paired image-mask keys:", len(keys))
print("Masks without images:", len(missing_img))
print("Images without masks:", len(missing_msk))

# Show a few offending keys to make unpaired files easy to track down.
if missing_img:
    print("Example missing image keys:", missing_img[:5])
if missing_msk:
    print("Example missing mask keys:", missing_msk[:5])

assert keys, "No paired image-mask items."
