# Hazelnut defect dataset EDA (MVTec AD)

This notebook explores the first dataset located at `data/hazelnut_1` to understand
its structure, class balance, image properties, and defect masks.

License note: CC BY-NC-SA 4.0 (non-commercial). See `data/hazelnut_1/readme.txt`.

In [None]:
from pathlib import Path
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from tqdm import tqdm

# Seed both RNGs used in this notebook so sampled files are reproducible.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

# Root folder of the dataset explored by every cell below.
DATA_DIR = Path("data/hazelnut_1")
# Raise explicitly rather than assert: assert statements are removed when
# Python runs with -O, which would let a missing folder slip through to a
# less helpful error further down.
if not DATA_DIR.exists():
    raise FileNotFoundError(f"Missing dataset folder: {DATA_DIR.resolve()}")

# Display defaults for tables and figures.
pd.set_option("display.max_rows", 200)
plt.rcParams["figure.figsize"] = (10, 5)

In [None]:
# The three sub-folders of the dataset root: training images, test images,
# and ground-truth defect masks.
train_dir, test_dir, mask_dir = (
    DATA_DIR / folder for folder in ("train", "test", "ground_truth")
)

# Lower-case file suffixes (with leading dot) that are treated as images.
exts = {".png", ".jpg", ".jpeg", ".bmp", ".tif", ".tiff"}

def list_images(d: Path):
    """Return the image files located directly inside ``d``, sorted by path.

    An entry is kept when it is a regular file and its suffix, compared
    case-insensitively, is one of ``exts``. Sub-directories are not descended
    into.
    """
    found = []
    for entry in d.iterdir():
        if entry.is_file() and entry.suffix.lower() in exts:
            found.append(entry)
    found.sort()
    return found

# Defect-free training images.
train_good = list_images(train_dir / "good")

# Class names are the names of the sub-folders of test/ and ground_truth/.
test_classes = sorted(entry.name for entry in test_dir.iterdir() if entry.is_dir())
mask_classes = sorted(entry.name for entry in mask_dir.iterdir() if entry.is_dir())

for label, value in (
    ("Train good images:", len(train_good)),
    ("Test classes:", test_classes),
    ("Mask classes:", mask_classes),
):
    print(label, value)

In [None]:
# One row per (split, class) pair holding the number of images found.
rows = [{"split": "train", "class": "good", "count": len(train_good)}]
rows.extend(
    {"split": "test", "class": cls, "count": len(list_images(test_dir / cls))}
    for cls in test_classes
)

counts_df = pd.DataFrame(rows)
counts_df = counts_df.sort_values(["split", "class"]).reset_index(drop=True)
counts_df

In [None]:
# Gather every image as a (split, class, path) triple for inspection:
# train/good first, then each test class in sorted class order.
all_images = [("train", "good", path) for path in train_good]
all_images += [
    ("test", cls, path)
    for cls in test_classes
    for path in list_images(test_dir / cls)
]

len(all_images)

In [None]:
# Image size, mode, and basic integrity checks
#
# Image.open() is lazy and only parses the file header, so each file is also
# passed through Image.verify(), which Pillow documents as an attempt to
# detect broken files without decoding the pixel data.
record_columns = ["split", "class", "path", "width", "height", "mode"]
records = []
errors = []  # (path, repr(exception)) for every file that failed the scan

for split, cls, p in tqdm(all_images):
    try:
        with Image.open(p) as img:
            w, h = img.size
            mode = img.mode
            # Read size/mode first: Pillow requires reopening the file to use
            # the image again after verify().
            img.verify()
    except Exception as e:
        # Deliberately broad: this is a best-effort scan that must keep going
        # and record the failure instead of aborting on the first bad file.
        errors.append((str(p), repr(e)))
    else:
        records.append({
            "split": split,
            "class": cls,
            "path": str(p),
            "width": w,
            "height": h,
            "mode": mode,
        })

# Explicit columns keep the frame usable by the following cells even when no
# image could be read (an empty list would otherwise yield a column-less frame).
img_df = pd.DataFrame(records, columns=record_columns)
print("Unreadable images:", len(errors))
img_df.head()

In [None]:
# Image size distribution: one stacked histogram per dimension, coloured by split
for dim in ("width", "height"):
    sns.histplot(img_df, x=dim, hue="split", multiple="stack")
    plt.title(f"Image {dim} distribution")
    plt.show()

# Number of images per (split, class, colour mode) combination
mode_counts = img_df.groupby(["split", "class", "mode"]).size()
mode_counts.reset_index(name="count")

In [None]:
# Sample mean/std stats on train-good (RGB)

def sample_files(files, n=200):
    """Return at most ``n`` entries of ``files``.

    When ``files`` holds ``n`` items or fewer it is returned unchanged (the
    same object, not a copy). Otherwise ``n`` items are drawn without
    replacement using the module-level ``random`` generator.
    """
    return random.sample(files, n) if len(files) > n else files

sample = sample_files(train_good, n=200)

means = []         # per-image channel means, one (3,) vector per image
stds = []          # per-image channel standard deviations
pixel_counts = []  # pixels per image, used to weight the aggregation
for p in tqdm(sample):
    with Image.open(p) as img:
        rgb = img.convert("RGB")
        arr = np.asarray(rgb, dtype=np.float32) / 255.0  # scale to [0, 1]
    means.append(arr.mean(axis=(0, 1)))
    stds.append(arr.std(axis=(0, 1)))
    pixel_counts.append(arr.shape[0] * arr.shape[1])

means = np.vstack(means)
stds = np.vstack(stds)

# Aggregate to statistics over all sampled pixels. Averaging the per-image
# stds would ignore the variation *between* images and underestimate the
# spread, so the pooled std is rebuilt from per-image moments:
#   E[x^2] = sum_i w_i * (std_i^2 + mean_i^2),  var = E[x^2] - mean^2
# where w_i is the fraction of all sampled pixels that belongs to image i.
weights = np.asarray(pixel_counts, dtype=np.float64)
weights = (weights / weights.sum())[:, None]
per_image_mean = means.astype(np.float64)
per_image_var = stds.astype(np.float64) ** 2

channel_mean = (weights * per_image_mean).sum(axis=0)
second_moment = (weights * (per_image_var + per_image_mean ** 2)).sum(axis=0)
# Clip guards against tiny negative values caused by floating-point rounding.
channel_std = np.sqrt(np.clip(second_moment - channel_mean ** 2, 0.0, None))

stats = pd.DataFrame({
    "channel": ["R", "G", "B"],
    "mean": channel_mean,
    "std": channel_std,
})
stats

In [None]:
# Ground-truth mask coverage and alignment with test images

def mask_coverage_for_class(cls):
    """Summarise ground-truth mask availability and coverage for one class.

    Test images and masks are paired by file stem. A trailing ``_mask`` on a
    mask's stem is ignored when pairing, so both ``000.png`` and
    ``000_mask.png`` pair with the test image ``000.png``. (MVTec AD archives
    are expected to use the ``<stem>_mask.png`` form; comparing raw stems
    would report every image as unpaired.)

    Args:
        cls: Name of the class sub-folder under the test and ground-truth
            directories.

    Returns:
        A dict with the class name, the number of test images and masks, the
        number of test images without a mask (``missing_masks``), the number
        of masks without a test image (``orphan_masks``), and the mean,
        median, min and max fraction of non-zero mask pixels. The coverage
        values are 0.0 when the class has no mask files.
    """
    mask_suffix = "_mask"

    def pairing_key(path):
        # Drop one trailing "_mask" so mask stems line up with image stems.
        stem = path.stem
        if stem.endswith(mask_suffix):
            return stem[: -len(mask_suffix)]
        return stem

    # A class may exist only under the ground-truth folder; treat a missing
    # test folder as "no test images" so its masks are reported as orphans
    # instead of raising from the directory listing.
    cls_test_dir = test_dir / cls
    test_files = list_images(cls_test_dir) if cls_test_dir.is_dir() else []
    mask_files = list_images(mask_dir / cls)

    test_stems = {p.stem for p in test_files}
    mask_stems = {pairing_key(p) for p in mask_files}

    missing_masks = sorted(test_stems - mask_stems)
    orphan_masks = sorted(mask_stems - test_stems)

    # Coverage = fraction of pixels that are non-zero in the grayscale mask.
    coverages = []
    for p in mask_files:
        with Image.open(p) as img:
            mask = np.asarray(img.convert("L"))
        coverages.append((mask > 0).mean())

    return {
        "class": cls,
        "test_count": len(test_files),
        "mask_count": len(mask_files),
        "missing_masks": len(missing_masks),
        "orphan_masks": len(orphan_masks),
        "coverage_mean": float(np.mean(coverages)) if coverages else 0.0,
        "coverage_median": float(np.median(coverages)) if coverages else 0.0,
        "coverage_min": float(np.min(coverages)) if coverages else 0.0,
        "coverage_max": float(np.max(coverages)) if coverages else 0.0,
    }

# One summary row per class that has a ground-truth mask folder.
mask_stats = list(map(mask_coverage_for_class, mask_classes))
mask_df = pd.DataFrame(mask_stats)
mask_df

In [None]:
# Coverage distributions by defect class: one histogram of the per-mask
# fraction of non-zero pixels for every class that has at least one mask.
for cls in mask_classes:
    mask_files = list_images(mask_dir / cls)

    coverages = []
    for mask_file in mask_files:
        with Image.open(mask_file) as handle:
            pixels = np.asarray(handle.convert("L"))
        coverages.append((pixels > 0).mean())

    if not coverages:
        continue  # nothing to plot for this class

    sns.histplot(coverages, bins=20)
    plt.title(f"Mask coverage distribution - {cls}")
    plt.xlabel("Coverage ratio")
    plt.show()

In [None]:
# Visual samples

def overlay_mask(img, mask, color=(255, 0, 0), alpha=0.5):
    """Blend ``color`` into ``img`` wherever ``mask`` is non-zero.

    Args:
        img: Image object; converted to RGB before blending.
        mask: Image object; converted to single-channel "L". Pixels with a
            value above 0 are highlighted.
        color: RGB triple blended into the highlighted pixels.
        alpha: Weight of ``color`` in the blend; the image keeps ``1 - alpha``.

    Returns:
        A uint8 array holding a copy of the RGB pixels with the masked region
        tinted. The inputs are not modified.
    """
    canvas = np.asarray(img.convert("RGB")).copy()
    hit = np.asarray(mask.convert("L")) > 0
    tint = alpha * np.array(color)
    # Writing the float blend back into the copied pixel array casts it to
    # that array's dtype.
    canvas[hit] = (1 - alpha) * canvas[hit] + tint
    return canvas.astype(np.uint8)

# Good samples: up to four randomly chosen defect-free training images.
good_picks = random.sample(train_good, k=min(4, len(train_good)))

fig, axes = plt.subplots(1, 4, figsize=(14, 4))
for ax, p in zip(axes, good_picks):
    # Open inside a context manager so the file handle is released as soon as
    # the image has been handed to matplotlib.
    with Image.open(p) as img:
        ax.imshow(img)
    ax.set_title("train/good")
# Hide the axes of every panel, including ones left unused when fewer than
# four images are available.
for ax in axes:
    ax.axis("off")
plt.show()

# Defect samples with masks: for each defect class show one random test
# image, its ground-truth mask, and the mask overlaid on the image.
for cls in mask_classes:
    cls_test_dir = test_dir / cls
    if not cls_test_dir.is_dir():
        continue  # class has masks but no test images

    # Index masks by the stem of the test image they belong to. A trailing
    # "_mask" is stripped because MVTec AD archives are expected to name masks
    # "<stem>_mask.png"; looking up "<stem>.png" directly would find nothing
    # and silently skip every class.
    masks_by_stem = {}
    for mask_file in list_images(mask_dir / cls):
        stem = mask_file.stem
        if stem.endswith("_mask"):
            stem = stem[: -len("_mask")]
        masks_by_stem[stem] = mask_file

    # Only sample among test images that actually have a mask.
    paired = [t for t in list_images(cls_test_dir) if t.stem in masks_by_stem]
    if not paired:
        continue

    p = random.choice(paired)
    mask_path = masks_by_stem[p.stem]

    # convert() returns loaded copies, so both files can be closed before
    # plotting.
    with Image.open(p) as img_file, Image.open(mask_path) as mask_file:
        img = img_file.convert("RGB")
        mask = mask_file.convert("L")
    overlay = overlay_mask(img, mask)

    fig, axes = plt.subplots(1, 3, figsize=(12, 4))
    axes[0].imshow(img)
    axes[0].set_title(f"test/{cls}")
    axes[0].axis("off")
    axes[1].imshow(mask, cmap="gray")
    axes[1].set_title("mask")
    axes[1].axis("off")
    axes[2].imshow(overlay)
    axes[2].set_title("overlay")
    axes[2].axis("off")
    plt.show()