In [None]:
# ROI statistics: patch counts per subtype (Normal / Atypical / Malignant) and per dataset split

In [1]:
# Mount Google Drive at /content/drive (Colab only); the data path configured
# below lives under this mount point, so this must run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:

import os
from pathlib import Path
from collections import defaultdict
import pandas as pd

# --- Config: update only if your path is different ---
# Root folder of the ROI patches. Its immediate sub-folders are treated as
# dataset splits, and each split's sub-folders as lesion types:
#   BASE_DIR/{split}/{lesion}/{case}/...
BASE_DIR = Path('/content/drive/MyDrive/BRACS/ROIPatches')

# File suffixes (lower-case, leading dot included) that are counted as patch images
PATCH_EXTS = {'.png', '.jpg', '.jpeg', '.tif', '.tiff', '.bmp', '.webp'}

def count_files_recursive(root: Path) -> int:
    """Return the number of image files found anywhere below ``root``.

    A file is counted when its suffix, compared case-insensitively, is one of
    ``PATCH_EXTS``. Every level of the directory tree under ``root`` is visited.

    Args:
        root: Directory to scan.

    Returns:
        Total number of matching files; 0 when nothing matches.
    """
    return sum(
        1
        for _dirpath, _dirnames, filenames in os.walk(root)
        for filename in filenames
        if Path(filename).suffix.lower() in PATCH_EXTS
    )

# Normalise a lesion folder name (e.g. '0_N') to its bare upper-case code ('N')
def to_base_lesion_name(name: str) -> str:
    """Return the upper-cased lesion code contained in a folder name.

    Surrounding whitespace is removed first. If the name contains an
    underscore, everything up to and including the first underscore is
    discarded; otherwise the whole name is kept.

    Args:
        name: Lesion folder name such as ``'0_N'`` or ``'dcis'``.

    Returns:
        The remaining text in upper case, e.g. ``'N'`` or ``'DCIS'``.
    """
    trimmed = name.strip()
    _prefix, separator, remainder = trimmed.partition('_')
    code = remainder if separator else trimmed
    return code.upper()

# Subtypes and the upper-case lesion codes that belong to each, in display order.
_SUBTYPE_LESIONS = (
    ('Normal', ('N', 'PB', 'UDH')),
    ('Atypical', ('FEA', 'ADH')),
    ('Malignant', ('DCIS', 'IC')),
)

# Lesion code -> subtype lookup, flattened from the grouping above.
LESION_TO_SUBTYPE = {
    lesion_code: subtype_name
    for subtype_name, lesion_codes in _SUBTYPE_LESIONS
    for lesion_code in lesion_codes
}

# Row order used when the subtypes are presented.
SUBTYPE_ORDER = [subtype_name for subtype_name, _codes in _SUBTYPE_LESIONS]

def _subdir_names(parent):
    """Return the sorted names of the directories directly inside ``parent``."""
    return sorted(entry.name for entry in parent.iterdir() if entry.is_dir())

# Splits are whatever folders exist under BASE_DIR, so any combination of
# train/val/test (or other names) is picked up without configuration.
splits = _subdir_names(BASE_DIR)

# Union of the lesion folder names seen in any split, sorted alphabetically.
all_lesions = sorted(
    set().union(*(_subdir_names(BASE_DIR / split_name) for split_name in splits))
)

# Patch counts per lesion folder and split: table[lesion][split] = count.
# Expected layout: BASE_DIR/{split}/{lesion}/{case}/... - only files inside
# case sub-folders are counted; files lying directly in a lesion folder are not.
table = defaultdict(dict)

for split_name in splits:
    for lesion_name in all_lesions:
        lesion_root = BASE_DIR / split_name / lesion_name
        if not lesion_root.is_dir():
            # This lesion type does not occur in this split.
            table[lesion_name][split_name] = 0
            continue
        table[lesion_name][split_name] = sum(
            count_files_recursive(case_root)
            for case_root in lesion_root.iterdir()
            if case_root.is_dir()
        )

# Aggregate per-lesion counts into subtypes: subtype_table[subtype][split] = count.
# A subtype entry is created (all splits at 0) the first time one of its lesions is seen.
subtype_table = defaultdict(lambda: {s: 0 for s in splits})
unmapped_counts = {}  # lesion folder name -> number of patches left out of the summary
for lesion, split_dict in table.items():
    base = to_base_lesion_name(lesion)
    subtype = LESION_TO_SUBTYPE.get(base)
    if subtype is None:
        # No subtype is defined for this folder: keep it out of the totals,
        # but remember it so the omission is reported instead of being silent.
        unmapped_counts[lesion] = sum(split_dict.values())
        continue
    for s in splits:
        subtype_table[subtype][s] += split_dict.get(s, 0)

if unmapped_counts:
    print(
        "WARNING: lesion folders without a subtype mapping were excluded "
        f"from the summary (folder -> patches): {unmapped_counts}"
    )

# Build the subtype x split count table.
# Reindexing rows AND columns with fill_value=0 *before* computing totals means
# that a subtype with no patches shows up as a row of zeros (not NaN, which
# would turn every count into a float), and that an empty subtype_table yields
# an all-zero table instead of a KeyError on the split columns.
subtype_counts = (
    pd.DataFrame.from_dict(subtype_table, orient='index')
    .reindex(index=SUBTYPE_ORDER, columns=splits, fill_value=0)
    .astype(int)
)

# Per-subtype total across splits; assign() returns a new frame, so re-running
# the cell always starts from the raw counts.
df = subtype_counts.assign(Total=subtype_counts.sum(axis=1))

# Bottom row with column totals across subtypes
totals_row = df.sum(axis=0).rename('__COLUMN_TOTALS__').to_frame().T
df_final = pd.concat([df, totals_row], axis=0)

# Display
df_final


Mounted at /content/drive


Unnamed: 0,test,train,val,Total
Normal,2833,41188,1897,45918
Atypical,1106,5191,922,7219
Malignant,6805,49274,6528,62607
__COLUMN_TOTALS__,10744,95653,9347,115744
