# 01 · Prepare YOLO Dataset

Convert the PCB defect annotations from Pascal VOC XML into YOLOv8-compatible TXT labels, organize the images/labels into train/val/test splits, and generate a `data.yaml` file for downstream training.


In [1]:
from __future__ import annotations

import random
import shutil
import xml.etree.ElementTree as ET
from dataclasses import dataclass
from pathlib import Path

import numpy as np
import pandas as pd
tqdm = __import__("tqdm").tqdm


In [2]:
# Input locations: the dataset root is resolved against the current working directory.
DATA_ROOT = Path("../PCB_DATASET").resolve()
ANNOTATIONS_DIR = DATA_ROOT / "Annotations"
IMAGES_DIR = DATA_ROOT / "images"

# Output tree that receives the converted labels, the copied images and data.yaml.
YOLO_ROOT = DATA_ROOT / "yolo_dataset"
YOLO_IMAGES, YOLO_LABELS = YOLO_ROOT / "images", YOLO_ROOT / "labels"
DATA_YAML = YOLO_ROOT / "data.yaml"

# Defect classes; the position of a name in this list is its numeric class id.
CLASS_NAMES = [
    "Missing_hole",
    "Mouse_bite",
    "Open_circuit",
    "Short",
    "Spur",
    "Spurious_copper",
]
# Lower-cased class name -> class id, so lookups ignore capitalisation.
CLASS_TO_ID = {label.lower(): class_id for class_id, label in enumerate(CLASS_NAMES)}

# The root is listed first: mkdir is called without parents=True, so each
# directory's parent has to exist already.
for _out_dir in (YOLO_ROOT, YOLO_IMAGES, YOLO_LABELS):
    _out_dir.mkdir(exist_ok=True)
print(f"Working inside {YOLO_ROOT}")


Working inside F:\infosys_springboard\PCB_DATASET\yolo_dataset


In [3]:
def voc_box_to_yolo(box, img_w, img_h):
    """Convert a Pascal VOC pixel box into a normalized YOLO box.

    Args:
        box: Four numbers ``(xmin, ymin, xmax, ymax)`` in pixels.
        img_w: Image width in pixels; must be positive.
        img_h: Image height in pixels; must be positive.

    Returns:
        A tuple ``(cx, cy, w, h)``: the box centre and the box size, each
        divided by the image dimension along its axis. Because the box is
        clipped to the image first, all four values lie in ``[0, 1]``.

    Raises:
        ValueError: If ``img_w`` or ``img_h`` is not positive.
    """
    if img_w <= 0 or img_h <= 0:
        raise ValueError(f"Image size must be positive, got {img_w}x{img_h}")

    def clamp(value, upper):
        # Keep a pixel coordinate inside [0, upper].
        return min(max(value, 0), upper)

    xmin, ymin, xmax, ymax = box
    # Clip boxes that spill past the image border and tolerate corners given in
    # the wrong order; otherwise the result could leave [0, 1] or get a negative
    # size. Boxes that are already valid pass through unchanged.
    x_lo, x_hi = sorted((clamp(xmin, img_w), clamp(xmax, img_w)))
    y_lo, y_hi = sorted((clamp(ymin, img_h), clamp(ymax, img_h)))

    cx = (x_lo + x_hi) / 2.0 / img_w
    cy = (y_lo + y_hi) / 2.0 / img_h
    bw = (x_hi - x_lo) / img_w
    bh = (y_hi - y_lo) / img_h
    return cx, cy, bw, bh


def parse_annotation(xml_path: Path):
    """Read one Pascal VOC annotation and extract the boxes of known classes.

    Args:
        xml_path: Location of the XML annotation; handed straight to
            ``ET.parse``.

    Returns:
        A tuple ``(width, height, objects)``. ``width`` and ``height`` are the
        integers stored under ``size``. ``objects`` is a list of
        ``(class_id, (xmin, ymin, xmax, ymax))`` pairs with float pixel
        coordinates. Objects whose lower-cased, stripped name is not a key of
        ``CLASS_TO_ID`` are left out.
    """
    annotation = ET.parse(xml_path).getroot()
    img_w = int(annotation.findtext("size/width"))
    img_h = int(annotation.findtext("size/height"))

    found = []
    for node in annotation.findall("object"):
        # Class names are matched case-insensitively and without padding.
        label = node.findtext("name").strip().lower()
        if label in CLASS_TO_ID:
            bndbox = node.find("bndbox")
            corners = tuple(
                float(bndbox.findtext(tag))
                for tag in ("xmin", "ymin", "xmax", "ymax")
            )
            found.append((CLASS_TO_ID[label], corners))
    return img_w, img_h, found


def build_yolo_label(xml_path: Path, save_dir: Path):
    """Write the YOLO label file that corresponds to one VOC annotation.

    Args:
        xml_path: VOC XML annotation to convert.
        save_dir: Directory that receives ``<xml stem>.txt``.

    Returns:
        The path of the written label file, or ``None`` when the annotation
        yields no objects (in which case nothing is written).
    """
    img_w, img_h, annotated = parse_annotation(xml_path)
    if not annotated:
        return None

    # One line per object: the class id followed by four 6-decimal numbers.
    rows = []
    for cls_id, voc_box in annotated:
        coords = voc_box_to_yolo(voc_box, img_w, img_h)
        fields = [str(cls_id)] + [f"{value:.6f}" for value in coords]
        rows.append(" ".join(fields))

    out_path = save_dir / f"{xml_path.stem}.txt"
    out_path.write_text("\n".join(rows))
    return out_path


In [4]:
# Convert every per-class annotation folder and record where each sample lives.
records = []
for cls_dir in (entry for entry in sorted(ANNOTATIONS_DIR.iterdir()) if entry.is_dir()):
    xml_files = sorted(cls_dir.glob("*.xml"))
    for xml_path in tqdm(xml_files, desc=f"{cls_dir.name:>15}"):
        label_path = build_yolo_label(xml_path, YOLO_LABELS)
        # No label file means the annotation had no usable objects: skip it.
        if label_path is not None:
            # The image is looked up under the same class folder and stem, as a .jpg.
            image_name = f"{xml_path.stem}.jpg"
            rel_img = Path("../images") / cls_dir.name / image_name
            img_path = IMAGES_DIR / cls_dir.name / image_name
            if not img_path.exists():
                raise FileNotFoundError(img_path)
            records.append({
                "class_name": cls_dir.name,
                "xml": xml_path,
                "image_rel": rel_img,
                "image_abs": img_path,
                "label_rel": Path("labels") / f"{xml_path.stem}.txt",
            })

# One row per converted annotation.
df = pd.DataFrame(records)
print(df.head())
print(f"Converted {len(df)} annotations")


   Missing_hole: 100%|██████████| 115/115 [00:00<00:00, 459.46it/s]
     Mouse_bite: 100%|██████████| 115/115 [00:00<00:00, 710.42it/s]
   Open_circuit: 100%|██████████| 116/116 [00:00<00:00, 703.44it/s]
          Short: 100%|██████████| 116/116 [00:00<00:00, 705.03it/s]
           Spur: 100%|██████████| 115/115 [00:00<00:00, 644.92it/s]
Spurious_copper: 100%|██████████| 116/116 [00:00<00:00, 596.68it/s]

     class_name                                                xml  \
0  Missing_hole  F:\infosys_springboard\PCB_DATASET\Annotations...   
1  Missing_hole  F:\infosys_springboard\PCB_DATASET\Annotations...   
2  Missing_hole  F:\infosys_springboard\PCB_DATASET\Annotations...   
3  Missing_hole  F:\infosys_springboard\PCB_DATASET\Annotations...   
4  Missing_hole  F:\infosys_springboard\PCB_DATASET\Annotations...   

                                       image_rel  \
0  ..\images\Missing_hole\01_missing_hole_01.jpg   
1  ..\images\Missing_hole\01_missing_hole_02.jpg   
2  ..\images\Missing_hole\01_missing_hole_03.jpg   
3  ..\images\Missing_hole\01_missing_hole_04.jpg   
4  ..\images\Missing_hole\01_missing_hole_05.jpg   

                                           image_abs  \
0  F:\infosys_springboard\PCB_DATASET\images\Miss...   
1  F:\infosys_springboard\PCB_DATASET\images\Miss...   
2  F:\infosys_springboard\PCB_DATASET\images\Miss...   
3  F:\infosys_springboard\PCB_DATASET\imag




In [5]:
# Display the first rows of the conversion table built in the previous cell.
df.head()

Unnamed: 0,class_name,xml,image_rel,image_abs,label_rel
0,Missing_hole,F:\infosys_springboard\PCB_DATASET\Annotations...,..\images\Missing_hole\01_missing_hole_01.jpg,F:\infosys_springboard\PCB_DATASET\images\Miss...,labels\01_missing_hole_01.txt
1,Missing_hole,F:\infosys_springboard\PCB_DATASET\Annotations...,..\images\Missing_hole\01_missing_hole_02.jpg,F:\infosys_springboard\PCB_DATASET\images\Miss...,labels\01_missing_hole_02.txt
2,Missing_hole,F:\infosys_springboard\PCB_DATASET\Annotations...,..\images\Missing_hole\01_missing_hole_03.jpg,F:\infosys_springboard\PCB_DATASET\images\Miss...,labels\01_missing_hole_03.txt
3,Missing_hole,F:\infosys_springboard\PCB_DATASET\Annotations...,..\images\Missing_hole\01_missing_hole_04.jpg,F:\infosys_springboard\PCB_DATASET\images\Miss...,labels\01_missing_hole_04.txt
4,Missing_hole,F:\infosys_springboard\PCB_DATASET\Annotations...,..\images\Missing_hole\01_missing_hole_05.jpg,F:\infosys_springboard\PCB_DATASET\images\Miss...,labels\01_missing_hole_05.txt


In [6]:
# Share of every class that goes to each split; the shares must add up to 1.
SPLIT_MAP = {"train": 0.7, "val": 0.2, "test": 0.1}
assert abs(sum(SPLIT_MAP.values()) - 1.0) < 1e-6

# NOTE: this seeds only Python's `random` module. The shuffle below is driven by
# pandas' own `random_state`, so the split itself does not depend on this seed.
random.seed(13)
YOLO_SPLITS = {split: {"images": YOLO_IMAGES / split, "labels": YOLO_LABELS / split} for split in SPLIT_MAP}
for split_dirs in YOLO_SPLITS.values():
    split_dirs["images"].mkdir(parents=True, exist_ok=True)
    split_dirs["labels"].mkdir(parents=True, exist_ok=True)

# Added before truncating so that float error cannot drop a sample from a split:
# 0.7 * 90 evaluates to just under 63 and would otherwise truncate to 62.
_SPLIT_EPS = 1e-9

split_records = []
for cls_name, group in df.groupby("class_name"):
    # Shuffle and cut per class so every split keeps a similar class balance.
    paths = group.sample(frac=1.0, random_state=42)
    n = len(paths)
    train_end = int(n * SPLIT_MAP["train"] + _SPLIT_EPS)
    val_end = train_end + int(n * SPLIT_MAP["val"] + _SPLIT_EPS)
    splits = {
        "train": paths.iloc[:train_end],
        "val": paths.iloc[train_end:val_end],
        # The test split takes whatever remains after train and val.
        "test": paths.iloc[val_end:],
    }

    # `subset` (not `split_df`) so the loop does not reuse the name of the
    # summary frame built after the loop.
    for split, subset in splits.items():
        for _, row in subset.iterrows():
            dst_img = YOLO_SPLITS[split]["images"] / row["image_rel"].name
            dst_lbl = YOLO_SPLITS[split]["labels"] / row["label_rel"].name
            shutil.copy2(row["image_abs"], dst_img)
            # Labels are read from the flat labels folder and copied into the split.
            shutil.copy2(YOLO_LABELS / row["label_rel"].name, dst_lbl)
            split_records.append({"split": split, **row})

# Summary table: number of samples per split and class.
split_df = pd.DataFrame(split_records)
split_df.groupby(["split", "class_name"]).size().unstack(fill_value=0)


class_name,Missing_hole,Mouse_bite,Open_circuit,Short,Spur,Spurious_copper
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
test,12,12,12,12,12,12
train,80,80,81,81,80,81
val,23,23,23,23,23,23


In [7]:
# Assemble data.yaml: the dataset root, the three image folders (relative to
# that root), and one "id: name" entry per class.
_yaml_header = "\n".join([
    f"path: {YOLO_ROOT}",
    "train: images/train",
    "val: images/val",
    "test: images/test",
    "names:",
])
_yaml_names = "\n".join(f"  {class_id}: {label}" for class_id, label in enumerate(CLASS_NAMES))
yaml_text = f"{_yaml_header}\n{_yaml_names}"

DATA_YAML.write_text(yaml_text)
# Read the file back so the cell output shows exactly what was written.
print(DATA_YAML.read_text())


path: F:\infosys_springboard\PCB_DATASET\yolo_dataset
train: images/train
val: images/val
test: images/test
names:
  0: Missing_hole
  1: Mouse_bite
  2: Open_circuit
  3: Short
  4: Spur
  5: Spurious_copper
