# 01 — Data Audit (Goldenhar-CFID)

**Objective**: Verify dataset integrity before training.

This notebook checks:
- directory structure (`images/`, `labels/`)
- image–label pairing
- class distribution (counts, imbalance)
- label sanity: normalized boxes, range checks, empty labels
- split leakage checks (train/val overlap)

**Outputs**:
- `outputs/tables/data_audit_summary.csv`
- `outputs/figures/data_class_distribution.png`


In [None]:
from pathlib import Path
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import yaml

# Filesystem roots; each can be redirected through the environment variable of the same name.
DATA_ROOT = Path(os.getenv("DATA_ROOT", "../data"))
OUTPUT_ROOT = Path(os.getenv("OUTPUT_ROOT", "../outputs"))

# Make sure the output tree (root, tables/, figures/) exists before anything is written to it.
for _out_dir in (OUTPUT_ROOT, OUTPUT_ROOT / "tables", OUTPUT_ROOT / "figures"):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Dataset under audit; change to D2_balanced_yolo640 if needed.
DATASET_ROOT = DATA_ROOT.joinpath("processed", "D1_yolo640")
IMAGES_DIR = DATASET_ROOT.joinpath("images")
LABELS_DIR = DATASET_ROOT.joinpath("labels")

# Class names; the position in this list is the class id used when counting boxes.
CLASSES = [
    "Cleft Lip",
    "Epibulbar Dermoid",
    "Eyelid Coloboma",
    "Facial Asymmetry",
    "Malocclusion",
    "Microtia",
    "Vertebral Abnormalities",
]
NUM_CLASSES = len(CLASSES)

def read_yaml(p: Path):
    """Parse the YAML file at ``p`` and return the resulting Python object.

    The file is opened as UTF-8 text and handed to ``yaml.safe_load``.
    """
    with Path(p).open(mode="r", encoding="utf-8") as stream:
        parsed = yaml.safe_load(stream)
    return parsed

def list_images(images_dir: Path):
    """Return every image file found below ``images_dir``, sorted by path.

    The search is recursive. An entry counts as an image when its suffix,
    compared case-insensitively, is one of ``.jpg``, ``.jpeg``, ``.png``,
    ``.bmp`` or ``.webp``.

    Args:
        images_dir: Directory to scan.

    Returns:
        A sorted list of ``Path`` objects, one per image file.
    """
    exts = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
    return sorted(
        p
        for p in images_dir.rglob("*")
        # rglob("*") also yields directories; a folder named like "foo.jpg"
        # must not be counted as an image.
        if p.suffix.lower() in exts and p.is_file()
    )

def label_path_for_image(img_path: Path, labels_dir: Path):
    """Return the label file path that pairs with ``img_path``.

    The label is placed directly inside ``labels_dir`` and is named after the
    image's stem with a ``.txt`` extension; any sub-directory the image sits in
    is not carried over.
    """
    label_name = f"{img_path.stem}.txt"
    return labels_dir.joinpath(label_name)

def parse_yolo_label_file(p: Path):
    """Parse a YOLO-format label file into a list of box tuples.

    Each usable line has exactly five whitespace-separated fields:
    ``class x y w h``. Parsing is best-effort: a line with a different number
    of fields, or with a field that cannot be converted to a number, is
    skipped without being reported.

    Args:
        p: Path of the label file.

    Returns:
        A list of ``(class_id, x, y, w, h)`` tuples where ``class_id`` is an
        ``int`` and the remaining values are ``float``. The list is empty when
        the file does not exist, is blank, or holds no usable line.
    """
    if not p.exists():
        return []
    txt = p.read_text(encoding="utf-8").strip()
    if not txt:
        return []
    rows = []
    for line in txt.splitlines():
        parts = line.split()
        if len(parts) != 5:
            continue
        c, x, y, w, h = parts
        try:
            # The class id may be written as "3" or "3.0"; go through float first.
            row = (int(float(c)), float(x), float(y), float(w), float(h))
        except (ValueError, OverflowError):
            # Non-numeric token (ValueError) or an infinite class id
            # (OverflowError): drop the row like any other malformed line
            # instead of aborting the whole audit.
            continue
        rows.append(row)
    return rows

# Quick sanity print: which dataset is being audited and whether both folders are present.
for banner, value in (
    ("DATASET_ROOT:", DATASET_ROOT),
    ("IMAGES_DIR exists:", IMAGES_DIR.exists()),
    ("LABELS_DIR exists:", LABELS_DIR.exists()),
):
    print(banner, value)


In [None]:
# Every image below IMAGES_DIR, in sorted order.
images = list_images(IMAGES_DIR)
print("#images:", len(images))

# File-level tallies.
missing_labels = empty_labels = bad_rows = 0
# Box-level tallies.
bbox_negative = bbox_out_of_range = 0
class_counts = np.zeros(NUM_CLASSES, dtype=int)

records = []
for img in images:
    lp = label_path_for_image(img, LABELS_DIR)
    if not lp.exists():
        # No label file: count it and leave this image out of `records`.
        missing_labels += 1
        continue

    ann = parse_yolo_label_file(lp)
    if not ann:
        empty_labels += 1

    for c, x, y, w, h in ann:
        if not (0 <= c < NUM_CLASSES):
            # Class id outside the known range: the row is counted as bad
            # and takes no part in the class or box statistics.
            bad_rows += 1
            continue
        class_counts[c] += 1
        coords = (x, y, w, h)
        if min(coords) < 0:
            bbox_negative += 1
        if max(coords) > 1.0:
            bbox_out_of_range += 1

    records.append(
        {
            "image": str(img.relative_to(DATASET_ROOT)),
            "label": str(lp.relative_to(DATASET_ROOT)),
            "n_boxes": len(ann),
        }
    )

# One-row summary of everything counted above.
audit = {
    "n_images": len(images),
    "missing_label_files": int(missing_labels),
    "empty_label_files": int(empty_labels),
    "bad_label_rows": int(bad_rows),
    "bbox_negative_values": int(bbox_negative),
    "bbox_out_of_range": int(bbox_out_of_range),
}

audit_df = pd.DataFrame([audit])
audit_df


In [None]:
# Persist the one-row audit summary table.
out_csv = OUTPUT_ROOT / "tables" / "data_audit_summary.csv"
audit_df.to_csv(out_csv, index=False)
print("Saved:", out_csv)

# Bar chart of box counts per class, drawn through the Axes/Figure methods.
out_fig = OUTPUT_ROOT / "figures" / "data_class_distribution.png"
tick_positions = np.arange(NUM_CLASSES)
fig = plt.figure(figsize=(10, 4), dpi=200)
ax = fig.add_subplot(1, 1, 1)
ax.bar(tick_positions, class_counts)
ax.set_xticks(tick_positions)
ax.set_xticklabels(CLASSES, rotation=30, ha="right")
ax.set_ylabel("#Boxes")
ax.set_title("Class Distribution (YOLO labels)")
fig.tight_layout()
fig.savefig(out_fig, bbox_inches="tight")
plt.show()
print("Saved:", out_fig)


## Split leakage check

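This cell checks that **train_images** and **val_images** are disjoint in every split manifest (`run*_seed*.yaml`) found under `DATA_ROOT/splits`. A non-zero `overlap` means at least one image appears in both lists.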


In [None]:
# Folder that is searched recursively for split manifests.
SPLITS_ROOT = DATA_ROOT.joinpath("splits")

def leakage_check(manifest_path: Path):
    """Count the entries shared by the train and val lists of one manifest.

    The manifest is read with ``read_yaml`` and its ``train_images`` and
    ``val_images`` entries are compared as sets, so duplicates inside one list
    are counted once. A missing key, a key with no value, and a completely
    empty manifest are all treated as an empty list.

    Args:
        manifest_path: Path of the YAML split manifest.

    Returns:
        A dict with the keys ``manifest`` (the path as a string), ``n_train``
        and ``n_val`` (number of distinct entries in each list) and
        ``overlap`` (number of entries present in both).
    """
    # An empty YAML document parses to None rather than a mapping.
    m = read_yaml(manifest_path) or {}
    # `key:` with nothing after it parses to None, so `.get(key, [])` is not enough.
    tr = set(m.get("train_images") or [])
    va = set(m.get("val_images") or [])
    overlap = tr & va
    return {
        "manifest": str(manifest_path),
        "n_train": len(tr),
        "n_val": len(va),
        "overlap": len(overlap),
    }

# Run the leakage check on every split manifest found below SPLITS_ROOT.
rows = [leakage_check(p) for p in sorted(SPLITS_ROOT.rglob("run*_seed*.yaml"))]

# Name the columns explicitly: with zero manifests the frame would otherwise
# have no columns and the sort below would raise KeyError.
leak_df = pd.DataFrame(rows, columns=["manifest", "n_train", "n_val", "overlap"])
# Worst offenders first; ties are ordered by manifest path.
leak_df.sort_values(["overlap", "manifest"], ascending=[False, True]).head(20)
