In [1]:
# Data handling
import pandas as pd
import numpy as np
import os
import cv2
from pathlib import Path

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Image handling
from PIL import Image

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Set plot style
sns.set(style="whitegrid")
%matplotlib inline

In [3]:
BASE_DIR = Path("../data/interim")
test_dir = BASE_DIR / "test"
template_dir = BASE_DIR / "template"
annot_dir = BASE_DIR / "annotations"

In [4]:
# Scan dataset and index files
from tqdm import tqdm

img_exts = {".jpg", ".jpeg", ".png", ".bmp"}

def list_files(folder: Path, exts):
    return sorted([p for p in folder.iterdir() if p.suffix.lower() in exts])

assert test_dir.exists() and template_dir.exists() and annot_dir.exists(), "Expected interim dataset at ../data/interim/{test,template,annotations}"

test_files = list_files(test_dir, img_exts)
template_files = list_files(template_dir, img_exts)
annot_files = sorted([p for p in annot_dir.iterdir() if p.suffix.lower() == ".txt"]) 

print(f"Counts -> test: {len(test_files)}, template: {len(template_files)}, annots: {len(annot_files)}")

# Build index by numeric stem
index = []
name_to_template = {p.stem: p for p in template_files}
name_to_annot = {p.stem: p for p in annot_files}

for test_path in tqdm(test_files):
    stem = test_path.stem
    tpl_path = name_to_template.get(stem)
    ann_path = name_to_annot.get(stem)
    index.append({
        "id": stem,
        "test_path": str(test_path),
        "template_path": str(tpl_path) if tpl_path is not None else None,
        "annot_path": str(ann_path) if ann_path is not None else None,
        "has_template": tpl_path is not None,
        "has_annotation": ann_path is not None,
    })

df = pd.DataFrame(index)
df.head()


Counts -> test: 1426, template: 1426, annots: 1426


100%|██████████| 1426/1426 [00:00<00:00, 285057.55it/s]


Unnamed: 0,id,test_path,template_path,annot_path,has_template,has_annotation
0,0001_test,..\data\interim\test\0001_test.jpg,,,False,False
1,0002_test,..\data\interim\test\0002_test.jpg,,,False,False
2,0003_test,..\data\interim\test\0003_test.jpg,,,False,False
3,0004_test,..\data\interim\test\0004_test.jpg,,,False,False
4,0005_test,..\data\interim\test\0005_test.jpg,,,False,False


In [None]:
# Image stats: dimensions and file sizes
from multiprocessing import Pool, cpu_count

def get_image_stats(row):
    path = row["test_path"]
    try:
        with Image.open(path) as im:
            width, height = im.size
        file_size_kb = os.path.getsize(path) / 1024.0
        return width, height, file_size_kb
    except Exception as e:
        return None, None, None

with Pool(min(8, cpu_count())) as pool:
    results = pool.map(get_image_stats, df.to_dict("records"))

w, h, sz = zip(*results)
df["width"], df["height"], df["size_kb"] = w, h, sz

display(df.describe(include="all"))

fig, axes = plt.subplots(1, 3, figsize=(14, 4))
sns.histplot(df["width"].dropna(), kde=False, ax=axes[0])
axes[0].set_title("Width distribution")
sns.histplot(df["height"].dropna(), kde=False, ax=axes[1])
axes[1].set_title("Height distribution")
sns.histplot(df["size_kb"].dropna(), kde=False, ax=axes[2])
axes[2].set_title("File size (KB)")
plt.tight_layout()
plt.show()


In [None]:
# Parse annotation files
# Assumed format per line: class_id x y w h (space-separated) or x y w h class
# Adjust parser if your annotation schema differs.

def parse_annotation_file(path: str):
    boxes = []
    if path is None or not os.path.exists(path):
        return boxes
    with open(path, "r") as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) < 4:
                continue
            nums = [float(p) for p in parts if p.replace('.', '', 1).isdigit()]
            if len(nums) >= 4:
                # assume x y w h [class]
                x, y, w, h = nums[:4]
                cls = parts[-1] if not parts[-1].replace('.', '', 1).isdigit() else "unknown"
                boxes.append({"x": x, "y": y, "w": w, "h": h, "cls": cls})
    return boxes

ann_counts = []
all_classes = []
for _, row in df.iterrows():
    boxes = parse_annotation_file(row["annot_path"]) if row["has_annotation"] else []
    ann_counts.append(len(boxes))
    all_classes.extend([b["cls"] for b in boxes])

df["num_boxes"] = ann_counts

print(df["num_boxes"].describe())

# Class distribution
cls_series = pd.Series(all_classes) if len(all_classes) else pd.Series([], dtype=str)
plt.figure(figsize=(8,4))
cls_series.value_counts().sort_values(ascending=False).plot(kind='bar')
plt.title("Annotation class distribution")
plt.xlabel("class")
plt.ylabel("count")
plt.tight_layout()
plt.show()


In [None]:
# Visualization utilities: show test, template, diff, threshold, and contours
import numpy as np

def subtract_and_threshold(test_img, template_img):
    gray_test = cv2.cvtColor(test_img, cv2.COLOR_BGR2GRAY)
    gray_tpl = cv2.cvtColor(template_img, cv2.COLOR_BGR2GRAY)
    diff = cv2.absdiff(gray_test, gray_tpl)
    blur = cv2.GaussianBlur(diff, (5,5), 0)
    _, th = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    kernel = np.ones((3,3), np.uint8)
    mor = cv2.morphologyEx(th, cv2.MORPH_OPEN, kernel, iterations=1)
    mor = cv2.dilate(mor, kernel, iterations=1)
    return diff, th, mor

def draw_contours(base_img, mask):
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    vis = base_img.copy()
    cv2.drawContours(vis, contours, -1, (0,255,0), 2)
    return vis, contours


def show_sample(row):
    test_path = row["test_path"]
    tpl_path = row["template_path"]
    test_img = cv2.imread(test_path)
    tpl_img = cv2.imread(tpl_path) if tpl_path is not None else None
    if test_img is None or tpl_img is None:
        print("Missing image(s)")
        return
    diff, th, mor = subtract_and_threshold(test_img, tpl_img)
    vis, contours = draw_contours(test_img, mor)

    fig, axes = plt.subplots(1,5, figsize=(16,4))
    axes[0].imshow(cv2.cvtColor(test_img, cv2.COLOR_BGR2RGB)); axes[0].set_title("Test")
    axes[1].imshow(cv2.cvtColor(tpl_img, cv2.COLOR_BGR2RGB)); axes[1].set_title("Template")
    axes[2].imshow(diff, cmap='gray'); axes[2].set_title("Abs diff")
    axes[3].imshow(mor, cmap='gray'); axes[3].set_title("Otsu + morph")
    axes[4].imshow(cv2.cvtColor(vis, cv2.COLOR_BGR2RGB)); axes[4].set_title(f"Contours: {len(contours)}")
    for ax in axes: ax.axis('off')
    plt.tight_layout(); plt.show()

# Show a few random samples with annotations count
samples = df.sample(3, random_state=42)
for _, r in samples.iterrows():
    print(f"ID: {r['id']} | boxes: {r.get('num_boxes', 0)}")
    show_sample(r)


In [None]:
# Save summary figures
from pathlib import Path
out_dir = Path("../reports/figures")
out_dir.mkdir(parents=True, exist_ok=True)

# Size distributions
fig, axes = plt.subplots(1, 3, figsize=(14, 4))
sns.histplot(df["width"].dropna(), kde=False, ax=axes[0]); axes[0].set_title("Width")
sns.histplot(df["height"].dropna(), kde=False, ax=axes[1]); axes[1].set_title("Height")
sns.histplot(df["size_kb"].dropna(), kde=False, ax=axes[2]); axes[2].set_title("File size (KB)")
plt.tight_layout()
fig_path = out_dir / "image_size_distributions.png"
plt.savefig(fig_path, dpi=150)
plt.close()

# Boxes per image
plt.figure(figsize=(6,4))
sns.histplot(df["num_boxes"].fillna(0), bins=20)
plt.title("Boxes per image")
plt.tight_layout()
fig_path2 = out_dir / "boxes_per_image.png"
plt.savefig(fig_path2, dpi=150)
plt.close()

print("Saved:", fig_path, fig_path2)
