In [1]:
pip install lxml pillow tqdm numpy h5py pyyaml


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os, shutil, random, json, pickle, io
from pathlib import Path
from xml.etree import ElementTree as ET

from PIL import Image
import numpy as np
import h5py
import yaml
from tqdm import tqdm

# --------------------------
# CONFIG — edit if needed
# --------------------------
IMAGES_DIR = r"C:\Users\sagni\Downloads\Helmet Checker\archive\images"
ANN_DIR    = r"C:\Users\sagni\Downloads\Helmet Checker\archive\annotations"
OUT_ROOT   = r"C:\Users\sagni\Downloads\Helmet Checker"

# Splits and seed
SPLITS = {"train": 0.70, "val": 0.15, "test": 0.15}
SEED = 42

# Class handling for AndrewMVD (small) dataset:
# It typically has 'helmet' and 'head' (we'll map head -> no_helmet).
VOC_TO_CANON = {
    "helmet": "helmet",
    "head": "no_helmet",
    # Add any unexpected variants here if needed, e.g.:
    # "no-helmet": "no_helmet",
}

# Canonical ordered class list for YOLO/COCO
CLASSES = ["helmet", "no_helmet"]  # index 0,1

# --------------------------
# Helpers
# --------------------------
def ensure_dir(p: Path):
    p.mkdir(parents=True, exist_ok=True)

def list_xmls(ann_dir: Path):
    return sorted([p for p in ann_dir.glob("*.xml")])

def parse_voc(xml_path: Path):
    """
    Returns:
      img_filename (str),
      width (int),
      height (int),
      annotations: list of dicts {cls_name, xmin, ymin, xmax, ymax}
    """
    tree = ET.parse(xml_path)
    root = tree.getroot()

    filename = root.findtext("filename")
    size = root.find("size")
    w = int(size.findtext("width"))
    h = int(size.findtext("height"))

    anns = []
    for obj in root.findall("object"):
        name = obj.findtext("name").strip()
        bnd = obj.find("bndbox")
        xmin = int(float(bnd.findtext("xmin")))
        ymin = int(float(bnd.findtext("ymin")))
        xmax = int(float(bnd.findtext("xmax")))
        ymax = int(float(bnd.findtext("ymax")))
        anns.append({
            "cls_name": name,
            "xmin": xmin, "ymin": ymin, "xmax": xmax, "ymax": ymax
        })
    return filename, w, h, anns

def canon_class(name: str):
    name = name.lower().strip()
    return VOC_TO_CANON.get(name, None)

def yolo_bbox(xmin, ymin, xmax, ymax, img_w, img_h):
    # Convert VOC to YOLO normalized cx, cy, w, h (in [0,1])
    cx = (xmin + xmax) / 2.0 / img_w
    cy = (ymin + ymax) / 2.0 / img_h
    bw = (xmax - xmin) / float(img_w)
    bh = (ymax - ymin) / float(img_h)
    return cx, cy, bw, bh

def coco_bbox(xmin, ymin, xmax, ymax):
    # COCO uses [x, y, width, height]
    return [xmin, ymin, xmax - xmin, ymax - ymin]

# --------------------------
# Discover and split
# --------------------------
random.seed(SEED)
images_dir = Path(IMAGES_DIR)
ann_dir = Path(ANN_DIR)
out_root = Path(OUT_ROOT)

xml_files = list_xmls(ann_dir)
if not xml_files:
    raise RuntimeError(f"No XML files found in {ann_dir}")

# Align xmls with images; filter any missing
records = []
missing_images = []
for xml in xml_files:
    try:
        filename, w, h, anns = parse_voc(xml)
    except Exception as e:
        print(f"[WARN] Failed to parse {xml}: {e}")
        continue

    # image path (handle cases where filename has extension)
    img_path = (images_dir / filename)
    if not img_path.exists():
        # Sometimes filename in xml differs; try same stem with common image extensions
        stem = Path(filename).stem
        candidates = list(images_dir.glob(stem + ".*"))
        if candidates:
            img_path = candidates[0]
        else:
            missing_images.append(filename)
            continue

    # Map classes & drop unknown
    mapped = []
    for a in anns:
        c = canon_class(a["cls_name"])
        if c in CLASSES:
            mapped.append({**a, "cls_name": c})
        else:
            # Unknown class -> skip object
            pass

    # It's OK if an image ends with 0 valid objects, but we keep it for completeness
    records.append({
        "xml": xml,
        "img": img_path,
        "width": w,
        "height": h,
        "anns": mapped
    })

if missing_images:
    print(f"[WARN] {len(missing_images)} annotation(s) skipped due to missing image files.")

# Shuffle and split
random.shuffle(records)
n = len(records)
n_train = int(SPLITS["train"] * n)
n_val = int(SPLITS["val"] * n)
n_test = n - n_train - n_val
splits = {
    "train": records[:n_train],
    "val": records[n_train:n_train+n_val],
    "test": records[n_train+n_val:]
}
print(f"[INFO] Split sizes: train={len(splits['train'])}, val={len(splits['val'])}, test={len(splits['test'])}, total={n}")

# --------------------------
# Prepare output dirs
# --------------------------
images_out = {
    s: out_root / "data" / "images" / s for s in splits.keys()
}
labels_out = {
    s: out_root / "data" / "labels" / s for s in splits.keys()
}
for d in list(images_out.values()) + list(labels_out.values()):
    ensure_dir(d)

# --------------------------
# Write YOLO labels + copy images
# --------------------------
cls_to_id = {c: i for i, c in enumerate(CLASSES)}
split_counts = {s: 0 for s in splits}
class_counts = {c: 0 for c in CLASSES}

for s, recs in splits.items():
    for rec in tqdm(recs, desc=f"Writing {s}"):
        img_src = rec["img"]
        img_name = img_src.name
        label_stem = Path(img_name).stem

        # Copy image
        shutil.copy2(img_src, images_out[s] / img_name)

        # Write YOLO label
        yolo_lines = []
        for a in rec["anns"]:
            cls_id = cls_to_id[a["cls_name"]]
            cx, cy, bw, bh = yolo_bbox(a["xmin"], a["ymin"], a["xmax"], a["ymax"], rec["width"], rec["height"])
            yolo_lines.append(f"{cls_id} {cx:.6f} {cy:.6f} {bw:.6f} {bh:.6f}")
            class_counts[a["cls_name"]] += 1

        with open(labels_out[s] / f"{label_stem}.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(yolo_lines))

        split_counts[s] += 1

print("[INFO] YOLO labels written.")
print("[INFO] Class counts:", class_counts)

# --------------------------
# COCO JSON per split
# --------------------------
def coco_for_split(name, recs, out_path: Path):
    coco = {
        "info": {"description": f"Helmet dataset ({name})", "version": "1.0"},
        "licenses": [],
        "images": [],
        "annotations": [],
        "categories": [{"id": i, "name": c, "supercategory": "object"} for i, c in enumerate(CLASSES)]
    }
    ann_id = 1
    for img_id, rec in enumerate(recs, start=1):
        img_name = rec["img"].name
        coco["images"].append({
            "id": img_id,
            "file_name": img_name,
            "width": rec["width"],
            "height": rec["height"]
        })
        for a in rec["anns"]:
            cls_id = cls_to_id[a["cls_name"]]
            bbox = coco_bbox(a["xmin"], a["ymin"], a["xmax"], a["ymax"])
            coco["annotations"].append({
                "id": ann_id,
                "image_id": img_id,
                "category_id": cls_id,
                "bbox": bbox,
                "area": bbox[2]*bbox[3],
                "iscrowd": 0
            })
            ann_id += 1
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(coco, f, indent=2)

coco_dir = out_root / "coco"
ensure_dir(coco_dir)
for s, recs in splits.items():
    coco_for_split(s, recs, coco_dir / f"coco_annotations_{s}.json")
print("[INFO] COCO JSONs written to", coco_dir)

# --------------------------
# YAML (Ultralytics) dataset file
# --------------------------
yaml_path = out_root / "helmet_dataset.yaml"
yaml_data = {
    "path": str((out_root / "data").resolve()),
    "train": "images/train",
    "val": "images/val",
    "test": "images/test",
    "names": CLASSES
}
with open(yaml_path, "w", encoding="utf-8") as f:
    yaml.safe_dump(yaml_data, f, sort_keys=False)
print("[INFO] YAML written to", yaml_path)

# --------------------------
# PKL metadata
# --------------------------
meta = {
    "classes": CLASSES,
    "class_to_id": cls_to_id,
    "splits_counts": split_counts,
    "class_counts": class_counts,
    "seed": SEED,
    "source_images_dir": str(images_dir),
    "source_annotations_dir": str(ann_dir),
    "notes": "head mapped to no_helmet via VOC_TO_CANON."
}
pkl_path = out_root / "helmet_metadata.pkl"
with open(pkl_path, "wb") as f:
    pickle.dump(meta, f)
print("[INFO] PKL written to", pkl_path)

# --------------------------
# HDF5 pack (images + anns)
# --------------------------
# Store per-split datasets:
#   /train/images (N, ) bytes of encoded images
#   /train/labels  JSON strings per image with list of {cls_id, bbox[xmin,ymin,xmax,ymax]}
#   similarly for val/test
h5_path = out_root / "helmet_dataset.h5"
with h5py.File(h5_path, "w") as h5:
    for s, recs in splits.items():
        grp = h5.create_group(s)
        img_bytes_ds = grp.create_dataset("images", (len(recs),), dtype=h5py.special_dtype(vlen=np.dtype('uint8')))
        labels_ds = grp.create_dataset("labels_json", (len(recs),), dtype=h5py.string_dtype(encoding='utf-8'))
        names_ds = grp.create_dataset("image_names", (len(recs),), dtype=h5py.string_dtype(encoding='utf-8'))

        for i, rec in enumerate(tqdm(recs, desc=f"Packing H5 {s}")):
            # Read and store encoded bytes (keep original format)
            with open(rec["img"], "rb") as f:
                data = f.read()
            img_bytes = np.frombuffer(data, dtype=np.uint8)
            img_bytes_ds[i] = img_bytes

            names_ds[i] = rec["img"].name

            # Labels as JSON (cls_id + VOC bbox for fidelity)
            lab = []
            for a in rec["anns"]:
                lab.append({
                    "cls_id": cls_to_id[a["cls_name"]],
                    "bbox_voc": [a["xmin"], a["ymin"], a["xmax"], a["ymax"]],
                })
            labels_ds[i] = json.dumps(lab)

print("[INFO] HDF5 written to", h5_path)

print("\n[DONE] Outputs created:")
print(f" - YOLO data tree: {out_root}\\data\\(images,labels)\\(train,val,test)")
print(f" - COCO JSONs:     {coco_dir}\\coco_annotations_{{train|val|test}}.json")
print(f" - YAML:           {yaml_path}")
print(f" - PKL:            {pkl_path}")
print(f" - HDF5:           {h5_path}")


[INFO] Split sizes: train=534, val=114, test=116, total=764


Writing train: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 534/534 [00:02<00:00, 250.07it/s]
Writing val: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 114/114 [00:01<00:00, 100.90it/s]
Writing test: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 323.54it/s]


[INFO] YOLO labels written.
[INFO] Class counts: {'helmet': 0, 'no_helmet': 0}
[INFO] COCO JSONs written to C:\Users\sagni\Downloads\Helmet Checker\coco
[INFO] YAML written to C:\Users\sagni\Downloads\Helmet Checker\helmet_dataset.yaml
[INFO] PKL written to C:\Users\sagni\Downloads\Helmet Checker\helmet_metadata.pkl


Packing H5 train: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 534/534 [00:05<00:00, 96.34it/s]
Packing H5 val: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 114/114 [00:00<00:00, 708.15it/s]
Packing H5 test: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 116/116 [00:00<00:00, 741.33it/s]


[INFO] HDF5 written to C:\Users\sagni\Downloads\Helmet Checker\helmet_dataset.h5

[DONE] Outputs created:
 - YOLO data tree: C:\Users\sagni\Downloads\Helmet Checker\data\(images,labels)\(train,val,test)
 - COCO JSONs:     C:\Users\sagni\Downloads\Helmet Checker\coco\coco_annotations_{train|val|test}.json
 - YAML:           C:\Users\sagni\Downloads\Helmet Checker\helmet_dataset.yaml
 - PKL:            C:\Users\sagni\Downloads\Helmet Checker\helmet_metadata.pkl
 - HDF5:           C:\Users\sagni\Downloads\Helmet Checker\helmet_dataset.h5
