In [None]:
import os
import os
import cv2
import xml.etree.ElementTree as ET
from collections import defaultdict

In [None]:



# Tally how many annotated objects of each class appear in the XML
# annotation files under the original training annotations directory.
xml_dir = "./dataset_organized/annotations/train"
class_counts = defaultdict(int)

# Only *.xml entries are annotation files; everything else is ignored.
annotation_names = [name for name in os.listdir(xml_dir) if name.endswith(".xml")]

for annotation_name in annotation_names:
    annotation_root = ET.parse(os.path.join(xml_dir, annotation_name)).getroot()
    # One <object> element per annotated instance; its <name> child is the class label.
    for annotated_object in annotation_root.findall("object"):
        class_counts[annotated_object.find("name").text] += 1

print(" Class Frequencies from XML:")
# Most frequent class first. Negating the count keeps the sort stable, so
# classes with equal counts stay in first-seen order.
ranked_classes = sorted(class_counts.items(), key=lambda item: -item[1])
for cls, count in ranked_classes:
    print(f"{cls}: {count} instances")


📊 Class Frequencies from XML:
D00: 16020 instances
D20: 9856 instances
D10: 8837 instances
D40: 5997 instances
D44: 5057 instances
D50: 3581 instances
D43: 793 instances
Repair: 277 instances
D01: 179 instances
D11: 45 instances


In [None]:


# Paths
image_dir = "./dataset_organized/images/train"
xml_dir = "./dataset_organized/annotations/train"
out_img_dir = "./dataset_augmented/images/train"
out_xml_dir = "./dataset_augmented/annotations/train"
os.makedirs(out_img_dir, exist_ok=True)
os.makedirs(out_xml_dir, exist_ok=True)

# Classes to oversample: an image is augmented if it has at least one of these.
rare_classes = {"D11", "D01", "Repair", "D43"}
# Number of flipped copies written per qualifying image.
# NOTE(review): every copy is the same deterministic horizontal flip, so the
# copies are pixel-identical duplicates (plain oversampling, no added variety).
augment_count = 3


def _flip_boxes_horizontally(annotation_root, image_width):
    """Mirror the x-extent of every <object>/<bndbox> under ``annotation_root``.

    Args:
        annotation_root: root element of a parsed annotation XML; modified in place.
        image_width: width in pixels of the image the boxes refer to.
    """
    for obj in annotation_root.findall("object"):
        bbox = obj.find("bndbox")
        xmin_node = bbox.find("xmin")
        xmax_node = bbox.find("xmax")
        # int(float(...)) accepts both "12" and float-formatted "12.0",
        # where a bare int() would raise ValueError on the latter.
        xmin = int(float(xmin_node.text))
        xmax = int(float(xmax_node.text))
        # After mirroring, the old right edge becomes the new left edge and
        # vice versa, so the two swap roles to keep xmin <= xmax.
        xmin_node.text = str(image_width - xmax)
        xmax_node.text = str(image_width - xmin)


for xml_file in os.listdir(xml_dir):
    if not xml_file.endswith(".xml"):
        continue

    xml_path = os.path.join(xml_dir, xml_file)
    tree = ET.parse(xml_path)
    root = tree.getroot()
    filename = root.find("filename").text
    img_path = os.path.join(image_dir, filename)

    # Check if rare class exists
    class_names = [obj.find("name").text for obj in root.findall("object")]
    if rare_classes.isdisjoint(class_names):
        continue
    if not os.path.exists(img_path):
        continue

    # Read image. cv2.imread signals failure by returning None rather than
    # raising, so an unreadable file is skipped instead of crashing the run.
    image = cv2.imread(img_path)
    if image is None:
        print(f"Skipping unreadable image: {img_path}")
        continue
    w = image.shape[1]

    # The flip and the mirrored boxes are the same for every copy, so compute
    # them once per image; only the output names differ per copy.
    flipped = cv2.flip(image, 1)
    _flip_boxes_horizontally(root, w)

    for i in range(augment_count):
        new_img_name = f"aug_{i}_{filename}"
        new_xml_name = f"aug_{i}_{xml_file}"

        # cv2.imwrite returns False on failure; do not write an annotation
        # that would point at an image that was never saved.
        if not cv2.imwrite(os.path.join(out_img_dir, new_img_name), flipped):
            print(f"Failed to write image, skipping annotation: {new_img_name}")
            continue

        root.find("filename").text = new_img_name
        tree.write(os.path.join(out_xml_dir, new_xml_name))

print(" Rare class images and XMLs augmented with horizontal flips.")

✅ Rare class images and XMLs augmented with horizontal flips.


In [None]:


# Count annotated objects per class across the XML files in the augmented
# annotations directory.
xml_dir = "./dataset_augmented/annotations/train"
class_counts = defaultdict(int)

for entry in os.listdir(xml_dir):
    # Only *.xml entries are parsed.
    if entry.endswith(".xml"):
        parsed_root = ET.parse(os.path.join(xml_dir, entry)).getroot()
        for annotated_object in parsed_root.findall("object"):
            label = annotated_object.find("name").text
            class_counts[label] += 1

print(" Class Frequencies from XML:")
# Report classes from most to least frequent.
by_frequency = sorted(class_counts.items(), key=lambda pair: pair[1], reverse=True)
for cls, count in by_frequency:
    print(f"{cls}: {count} instances")


📊 Class Frequencies from XML:
D43: 2379 instances
D00: 1389 instances
D20: 1080 instances
D50: 1014 instances
D10: 849 instances
Repair: 831 instances
D40: 543 instances
D01: 537 instances
D44: 498 instances
D11: 135 instances
