# EDA — LGG Brain MRI Segmentation (Kaggle)

Goal:
- Understand file structure and pairing (image ↔ mask)
- Quantify tumor presence and mask area statistics
- Visualize samples + overlays
- Build a patient-level table (so later train/validation splits can be made per patient, avoiding leakage of one patient's slices across splits)


In [None]:
import os, re, glob, random, math
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
# Apply the "seaborn-v0_8" matplotlib style sheet to every plot drawn below.
plt.style.use("seaborn-v0_8")
# Raise the pandas display limits so wide/long tables are shown in full
# (up to 200 columns and 200 rows) instead of being truncated.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)

from PIL import Image

# Fixed seed for Python's `random` module and NumPy's global RNG so that
# random draws are repeatable between runs. SEED is also passed as
# `random_state` wherever DataFrame rows are sampled.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)


In [None]:
# Kaggle usually mounts datasets here:
KAGGLE_INPUT = Path("/kaggle/input")

# Recursively scan the input mount for directories named "kaggle_3m".
# Matches are restricted to directories and sorted, so that the choice of
# DATA_ROOT below is reproducible when more than one copy is found
# (glob itself returns matches in arbitrary, filesystem-dependent order).
candidates = []
if KAGGLE_INPUT.exists():
    candidates = sorted(
        p for p in KAGGLE_INPUT.glob("**/kaggle_3m") if p.is_dir()
    )

print("Found candidates:", [str(p) for p in candidates[:10]])

if not candidates:
    raise FileNotFoundError(
        "Could not find 'kaggle_3m' under /kaggle/input. "
        "Open the dataset page and attach the dataset to this notebook."
    )

# First match in sorted order; with nested duplicates this is the one whose
# path sorts lowest.
DATA_ROOT = candidates[0]
DATA_ROOT


In [None]:
# Every immediate sub-directory of DATA_ROOT is kept as a patient folder;
# plain files at the top level are skipped.
patient_dirs = [entry for entry in DATA_ROOT.iterdir() if entry.is_dir()]
patient_dirs.sort()

print("DATA_ROOT:", DATA_ROOT)
print("Number of patient folders:", len(patient_dirs))
print("First 5 patient folders:", [folder.name for folder in patient_dirs[:5]])


In [None]:
# Collect every .tif that sits exactly one directory level below DATA_ROOT.
tif_pattern = str(DATA_ROOT / "*" / "*.tif")
all_tifs = sorted(glob.glob(tif_pattern))

# Split by filename suffix. Walking the already-sorted list keeps both
# resulting lists sorted as well.
mask_tifs = []
img_tifs = []
for tif_path in all_tifs:
    if tif_path.endswith("_mask.tif"):
        mask_tifs.append(tif_path)
    else:
        img_tifs.append(tif_path)

print("Total .tif files:", len(all_tifs))
print("Image .tif files:", len(img_tifs))
print("Mask  .tif files:", len(mask_tifs))

# Sanity checks: stop early if either kind of file is missing entirely.
assert len(mask_tifs) > 0, "No mask files found."
assert len(img_tifs) > 0, "No image files found."


In [None]:
def to_key(path):
    """Return the pairing key shared by an image file and its mask file.

    Examples:
        .../TCGA_XXXX/TCGA_XXXX_YY.tif       -> "TCGA_XXXX_YY"
        .../TCGA_XXXX/TCGA_XXXX_YY_mask.tif  -> "TCGA_XXXX_YY"

    Only the trailing "_mask.tif" or ".tif" is removed. Occurrences of those
    strings elsewhere in the file name are left untouched, and a name with
    neither suffix is returned unchanged.

    Args:
        path: File path (string or path-like); only its final component
            is used.

    Returns:
        The file name without its "_mask.tif" / ".tif" suffix.
    """
    base = Path(path).name
    # Test the longer suffix first: "_mask.tif" also ends with ".tif".
    for suffix in ("_mask.tif", ".tif"):
        if base.endswith(suffix):
            return base[: -len(suffix)]
    return base

# Index image and mask paths by their shared key so they can be matched up.
img_map = {to_key(path): path for path in img_tifs}
msk_map = {to_key(path): path for path in mask_tifs}

img_keys = set(img_map)
msk_keys = set(msk_map)

# Keys present on both sides form the usable pairs; the two differences
# list the orphans on either side.
keys = sorted(img_keys & msk_keys)
missing_img = sorted(msk_keys - img_keys)
missing_msk = sorted(img_keys - msk_keys)

print("Paired image-mask keys:", len(keys))
print("Masks without images:", len(missing_img))
print("Images without masks:", len(missing_msk))

if missing_img:
    print("Example missing image keys:", missing_img[:5])
if missing_msk:
    print("Example missing mask keys:", missing_msk[:5])

assert len(keys) > 0, "No paired image-mask items."


In [None]:
def patient_id_from_path(path):
    """Return the name of the directory that directly contains ``path``."""
    containing_dir = Path(path).parent
    return containing_dir.name

# One row per paired slice; both path lists follow the order of `keys`.
image_paths = [img_map[k] for k in keys]
mask_paths = [msk_map[k] for k in keys]

df = pd.DataFrame(
    {
        "key": keys,
        "image_path": image_paths,
        "mask_path": mask_paths,
    }
)
# The patient identifier is the name of the folder holding the image file.
df["patient_id"] = df["image_path"].apply(patient_id_from_path)

# Attempt to extract the slice number when the key ends in "_<digits>"
# (common pattern: ..._12.tif -> key ending in "_12").
def slice_index_from_key(k):
    """Return the trailing ``_<digits>`` of ``k`` as an int, else NaN."""
    trailing = re.search(r"_(\d+)$", k)
    if trailing is None:
        return np.nan
    return int(trailing.group(1))

# Add the per-slice index, then order rows by patient, slice and key
# (key breaks ties and orders rows whose slice index could not be parsed).
slice_numbers = df["key"].apply(slice_index_from_key)
df["slice_idx"] = slice_numbers

sort_columns = ["patient_id", "slice_idx", "key"]
df = df.sort_values(sort_columns)
df = df.reset_index(drop=True)

df.head()


In [None]:
def load_tif(path):
    """Read an image file from disk into a NumPy array.

    Args:
        path: Location of the image file, passed straight to ``Image.open``.

    Returns:
        A NumPy array holding the pixel data. Shape and dtype are whatever
        the file yields (uint8 typically, per the original author's note).
    """
    # Image.open only reads the header and keeps the file handle open.
    # The context manager closes it once np.array has copied the pixels
    # out, instead of leaving that to garbage collection.
    with Image.open(path) as im:
        return np.array(im)

# Inspect a few samples: print shape, dtype and mask maximum for up to five
# randomly chosen image/mask pairs. The sample size is capped at the number
# of rows because DataFrame.sample raises when asked for more rows than
# exist (without replacement).
sample_rows = df.sample(min(5, len(df)), random_state=SEED)
for r in sample_rows.itertuples(index=False):
    img = load_tif(r.image_path)
    msk = load_tif(r.mask_path)
    print("Patient:", r.patient_id, "key:", r.key,
          "| img shape:", img.shape, img.dtype,
          "| msk shape:", msk.shape, msk.dtype,
          "| mask max:", msk.max())


In [None]:
def to_mask01(msk):
    """Binarize a mask array to 0/1 values of dtype uint8.

    Masks may be stored as 0/255; any value greater than zero becomes 1.
    A 3-D input is treated as multi-channel and only its first channel
    (index 0 on the last axis) is used.
    """
    single_channel = msk[..., 0] if msk.ndim == 3 else msk
    foreground = single_channel > 0
    return foreground.astype(np.uint8)

# Estimate unique values across a small subset of masks. Values are
# collected after binarization, so this checks that to_mask01 only ever
# produces 0 and/or 1. The sample size is capped at the number of rows
# because DataFrame.sample raises when asked for more rows than exist.
subset = df.sample(min(50, len(df)), random_state=SEED)
vals = set()
for p in subset["mask_path"]:
    m = load_tif(p)
    m01 = to_mask01(m)
    vals.update(np.unique(m01).tolist())

vals


In [None]:
def mask_stats(mask_path):
    """Compute per-mask statistics for the mask stored at ``mask_path``.

    Returns:
        A 5-tuple ``(present, area, frac, h, w)``: ``present`` is 1 if any
        pixel is set and 0 otherwise, ``area`` is the number of set pixels,
        ``frac`` is ``area`` divided by ``h * w``, and ``h`` / ``w`` are the
        first two dimensions of the binarized mask.
    """
    binary = to_mask01(load_tif(mask_path))
    height, width = binary.shape[:2]
    tumor_px = int(binary.sum())
    has_tumor = int(tumor_px > 0)
    coverage = tumor_px / float(height * width)
    return has_tumor, tumor_px, coverage, height, width

# Run mask_stats on every mask, then spread the resulting 5-tuples over
# five new columns aligned with df's index.
stat_columns = ["tumor_present", "mask_area_px", "mask_area_frac", "H", "W"]
stats = df["mask_path"].apply(mask_stats)
stats_table = pd.DataFrame(stats.tolist(), index=df.index, columns=stat_columns)
df[stat_columns] = stats_table

df.describe(include="all")


In [None]:
# Dataset-level overview: patient and slice counts, tumor prevalence, and
# how many distinct (H, W) sizes occur.
tumor_flag = df["tumor_present"]
distinct_sizes = df[["H", "W"]].drop_duplicates()

summary = {
    "n_patients": df["patient_id"].nunique(),
    "n_slices_total": len(df),
    "tumor_slices": int(tumor_flag.sum()),
    "non_tumor_slices": int((1 - tumor_flag).sum()),
    "tumor_slice_rate": float(tumor_flag.mean()),
    "image_size_unique": len(distinct_sizes),
}
summary


In [None]:
# Aggregate the slice table to one row per patient.
by_patient = df.groupby("patient_id")
per_patient = by_patient.agg(
    n_slices=("key", "count"),
    tumor_slices=("tumor_present", "sum"),
    tumor_slice_rate=("tumor_present", "mean"),
    mean_mask_frac=("mask_area_frac", "mean"),
    max_mask_frac=("mask_area_frac", "max"),
)

# Patients with the most tumor-positive slices come first; ties are broken
# by total slice count, also descending.
patient_df = per_patient.reset_index().sort_values(
    ["tumor_slices", "n_slices"], ascending=False
)

patient_df.head(10)


In [None]:
# Histogram of how many slices each patient has.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(patient_df["n_slices"], bins=20)
ax.set_title("Slices per patient")
ax.set_xlabel("Number of slices")
ax.set_ylabel("Count of patients")
plt.show()

patient_df["n_slices"].describe()


In [None]:
# Histogram of how many tumor-positive slices each patient has.
fig, ax = plt.subplots(figsize=(8, 4))
ax.hist(patient_df["tumor_slices"], bins=20)
ax.set_title("Tumor-positive slices per patient")
ax.set_xlabel("Tumor-positive slices")
ax.set_ylabel("Count of patients")
plt.show()

patient_df["tumor_slices"].describe()
