# Inspect WP5 sup_masks: load .npy and summarize

This notebook scans a `sup_masks` directory and, for each sample, loads the actual `*_supmask.npy` (and optionally `*_seedmask.npy` and `*_pseudolabel.npy`) to compute and report:
- Shape and dtype of each array
- Count and fraction of supervised voxels from `supmask.npy` (computed from the array, not from JSON)
- Optional: unique label values and per-class counts from `pseudolabel.npy`

It also saves a CSV summary (`supmask_ratio_summary.csv`) inside the `sup_masks` folder.

In [None]:
from pathlib import Path
import json, math, os, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Directory that holds the *_supmask.npy files to inspect (edit for your run)
SUP_MASKS_DIR = Path('runs/fixed_points_bundle50/ratio_0.00001/sup_masks')
assert SUP_MASKS_DIR.exists(), f'Not found: {SUP_MASKS_DIR}'

# Controls
LOAD_SEEDMASK = True        # also load <id>_seedmask.npy when it exists
LOAD_PSEUDOLABEL = True     # also load <id>_pseudolabel.npy when it exists
COMPUTE_PL_UNIQUES = True   # tabulate unique label values of the pseudolabel
MAX_FILES = 0               # 0 => every supmask file; N > 0 => only the first N

# Echo the resolved directory as the cell output
SUP_MASKS_DIR

In [None]:
# Collect the supmask files; this sorted list drives every later cell
_limit = MAX_FILES if (MAX_FILES and MAX_FILES > 0) else None  # None => no truncation
sup_files = sorted(SUP_MASKS_DIR.glob('*_supmask.npy'))[:_limit]
len(sup_files), sup_files[:3]

In [None]:
# Iterate and compute stats by loading .npy arrays
def canonical_shape(arr):
    """Return the shape of *arr*, dropping a leading singleton channel axis.

    A 4-D array whose first dimension is 1, i.e. ``(1, X, Y, Z)``, is
    reported as ``(X, Y, Z)``; any other array is reported with its shape
    unchanged.
    """
    shape = arr.shape
    has_singleton_channel = arr.ndim == 4 and shape[0] == 1
    return shape[1:] if has_singleton_channel else shape

def nonzero_fraction(arr):
    """Count the non-zero elements of *arr* and their share of the total.

    Returns a ``(nonzero_count, total, fraction)`` tuple. For a 4-D array
    with a leading singleton axis the total is the product of the remaining
    three dimensions; otherwise it is ``arr.size``. The fraction is
    ``np.nan`` when the array has no elements.
    """
    if arr.ndim == 4 and arr.shape[0] == 1:
        n_total = int(np.prod(arr.shape[1:]))
    else:
        n_total = arr.size
    n_nonzero = int(np.count_nonzero(arr))
    if n_total > 0:
        return n_nonzero, n_total, n_nonzero / n_total
    # Empty array: avoid dividing by zero
    return n_nonzero, n_total, np.nan

def unique_counts(arr):
    """Return ``{value: occurrences}`` for the distinct values found in *arr*.

    Both the values and the counts are converted to plain Python ``int`` so
    the mapping prints cleanly.
    """
    labels, occurrences = np.unique(arr, return_counts=True)
    return dict(zip(map(int, labels), map(int, occurrences)))

# Build one summary record per supmask file, then assemble them into `df`.
records = []
for sup_path in sup_files:
    # Sibling files of a sample share the stem that precedes '_supmask.npy'
    stem = sup_path.name.replace('_supmask.npy', '')
    seed_path = SUP_MASKS_DIR / f'{stem}_seedmask.npy'
    pl_path = SUP_MASKS_DIR / f'{stem}_pseudolabel.npy'
    json_path = SUP_MASKS_DIR / f'{stem}_supmask_stats.json'

    # Arrays are opened memory-mapped (read-only) rather than read fully into RAM
    sup_arr = np.load(sup_path, mmap_mode='r')
    seed_arr = None
    if LOAD_SEEDMASK and seed_path.exists():
        seed_arr = np.load(seed_path, mmap_mode='r')
    pl_arr = None
    if LOAD_PSEUDOLABEL and pl_path.exists():
        pl_arr = np.load(pl_path, mmap_mode='r')

    # Supervised / seed voxel counts, computed from the arrays themselves
    sup_count, sup_total, sup_ratio = nonzero_fraction(sup_arr)
    if seed_arr is None:
        seed_count = seed_total = seed_ratio = None
    else:
        seed_count, seed_total, seed_ratio = nonzero_fraction(seed_arr)

    # Per-label voxel counts of the pseudolabel (optional)
    if pl_arr is not None and COMPUTE_PL_UNIQUES:
        label_hist = unique_counts(pl_arr)
    else:
        label_hist = None

    # The JSON stats only supply the sample id here; the stem is the fallback
    meta = json.loads(json_path.read_text()) if json_path.exists() else {}

    sup_shape = tuple(canonical_shape(sup_arr))
    sup_dtype = str(sup_arr.dtype)

    if seed_arr is None:
        seed_shape = seed_dtype = None
    else:
        seed_shape = tuple(canonical_shape(seed_arr))
        seed_dtype = str(seed_arr.dtype)

    if pl_arr is None:
        pl_shape = pl_dtype = pl_lo = pl_hi = None
    else:
        pl_shape = tuple(canonical_shape(pl_arr))
        pl_dtype = str(pl_arr.dtype)
        pl_lo = int(pl_arr.min())
        pl_hi = int(pl_arr.max())

    # Key order below fixes the column order of the resulting DataFrame
    records.append({
        'id': meta.get('id', stem),
        # Supmask
        'sup_shape': sup_shape,
        'sup_dtype': sup_dtype,
        'sup_count': sup_count,
        'sup_total': sup_total,
        'sup_ratio': sup_ratio,
        # Seedmask
        'seed_shape': seed_shape,
        'seed_dtype': seed_dtype,
        'seed_count': seed_count,
        'seed_total': seed_total,
        'seed_ratio': seed_ratio,
        # Pseudolabel
        'pl_shape': pl_shape,
        'pl_dtype': pl_dtype,
        'pl_min': pl_lo,
        'pl_max': pl_hi,
        'pl_uniques': label_hist,
        # Paths (None when the file is absent on disk)
        'supmask_path': str(sup_path),
        'seedmask_path': str(seed_path) if seed_path.exists() else None,
        'pseudolabel_path': str(pl_path) if pl_path.exists() else None,
        'stats_path': str(json_path) if json_path.exists() else None,
    })

df = pd.DataFrame.from_records(records)
df.head()

In [None]:
# Headline numbers, all derived from the loaded .npy arrays
_ratios = df['sup_ratio']
print('Samples:', len(df))
print('sup_ratio mean/min/max:', _ratios.mean(), _ratios.min(), _ratios.max())
# Preview the supmask columns of the first ten samples
df[['id','sup_ratio','sup_count','sup_total','sup_dtype','sup_shape']].head(10)

In [None]:
# Distribution of the per-sample supervised voxel ratio
ax = df['sup_ratio'].plot.hist(
    bins=50,
    figsize=(6,4),
    title='Supervised voxel ratio per sample (from supmask.npy)',
)
ax.set_xlabel('sup_ratio (supervised / total voxels)')
plt.show()

In [None]:
# Optional: cross-check JSON stats vs .npy computed sup_ratio
# For every sample with a stats JSON, compare its 'sup_fraction' entry with the
# ratio computed from the array and report samples that disagree beyond tolerance.
CROSS_CHECK_JSON = True

if CROSS_CHECK_JSON:
    diffs = []  # (id, ratio from JSON, ratio from .npy) for each mismatch
    for _, row in df.iterrows():
        stats_path = row.get('stats_path')
        # Samples without a stats file carry a missing value (None/NaN); skip
        # anything that is not a non-empty path string before touching the disk.
        if not (isinstance(stats_path, str) and stats_path):
            continue
        stats_file = Path(stats_path)
        if not stats_file.exists():
            continue
        json_ratio = json.loads(stats_file.read_text()).get('sup_fraction', None)
        if json_ratio is None:
            continue
        if not np.isclose(float(json_ratio), row['sup_ratio'], rtol=1e-3, atol=1e-6):
            diffs.append((row['id'], json_ratio, row['sup_ratio']))
    print('JSON cross-check mismatches:', len(diffs))
    # A bare expression nested inside an `if` body is not echoed as cell output,
    # so the first few mismatches are printed explicitly.
    for sample_id, json_ratio, npy_ratio in diffs[:5]:
        print(f'  {sample_id}: json={json_ratio} npy={npy_ratio}')
else:
    print('Skipping JSON cross-check. Set CROSS_CHECK_JSON=True to enable.')

In [None]:
# Write the per-sample summary as a CSV inside the sup_masks directory
out_csv = SUP_MASKS_DIR / 'supmask_ratio_summary.csv'
# Exported columns, grouped by the array they describe
cols = (
    ['id']
    + ['sup_ratio', 'sup_count', 'sup_total', 'sup_dtype', 'sup_shape', 'supmask_path']
    + ['seed_ratio', 'seed_count', 'seed_total', 'seed_dtype', 'seed_shape', 'seedmask_path']
    + ['pl_min', 'pl_max', 'pl_dtype', 'pl_shape', 'pseudolabel_path']
    + ['stats_path']
)
_summary = df[cols]
_summary.to_csv(out_csv, index=False)
# Echo the output path and confirm the file now exists
out_csv, out_csv.exists()