In [1]:
# Clone the XmlToTxt repository into the kernel's current working directory.
# NOTE(review): a later cell runs xmltotxt.py from
# /content/drive/MyDrive/PCB/imagedataset/XmlToTxt; unless this cell is executed
# from that folder's parent, the clone made here is a different copy - confirm
# which checkout is meant to be used.
!git clone https://github.com/isabek/XmlToTxt.git

Cloning into 'XmlToTxt'...
remote: Enumerating objects: 108, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (14/14), done.[K
Receiving objects: 100% (108/108), 17.58 KiB | 1.95 MiB/s, done.
remote: Total 108 (delta 23), reused 19 (delta 17), pack-reused 76 (from 1)[K
Resolving deltas: 100% (52/52), done.


In [None]:
import os
# Change the kernel's working directory to the XmlToTxt folder on Drive.
# This is a process-wide side effect: every later cell that uses a relative
# path will resolve it against this folder.
os.chdir(r"/content/drive/MyDrive/PCB/imagedataset/XmlToTxt")
# Run the converter script with paths relative to the folder above.
# Presumably -c is the class list, -xml the input annotation folder and -out
# the output folder for the generated .txt labels - TODO confirm against the
# tool's README.
!python xmltotxt.py -c classes.txt -xml xml -out out



In [2]:
import os
import cv2
import numpy as np
import shutil
import glob
import random

def collect_files(source_folder, img_ext=".jpg", lbl_ext=".txt"):
    """Collect image and label file names found under ``source_folder``.

    The folder is walked recursively and each file is classified by its
    extension, compared case-insensitively: names ending in ``lbl_ext`` are
    labels, names ending in ``img_ext`` are images, and everything else is
    ignored.

    Args:
        source_folder: Directory to scan.
        img_ext: Image extension (or an iterable of extensions) to accept.
        lbl_ext: Label extension (or an iterable of extensions) to accept.

    Returns:
        A ``(img_files, label_files)`` tuple of sorted lists. Entries are bare
        file names without any directory component, so callers must join them
        with the directory that contains the file.
    """
    # str.endswith accepts a tuple, so normalise both arguments to lower-cased
    # tuples; this lets callers pass either ".jpg" or (".jpg", ".png").
    img_exts = (img_ext,) if isinstance(img_ext, str) else tuple(img_ext)
    lbl_exts = (lbl_ext,) if isinstance(lbl_ext, str) else tuple(lbl_ext)
    img_exts = tuple(ext.lower() for ext in img_exts)
    lbl_exts = tuple(ext.lower() for ext in lbl_exts)

    img_files = []
    label_files = []
    for _root, _dirs, files in os.walk(source_folder):
        for name in files:
            lowered = name.lower()
            if lowered.endswith(lbl_exts):
                label_files.append(name)
            elif lowered.endswith(img_exts):
                img_files.append(name)
            # Anything else (hidden files, caches, notes) is neither an image
            # nor a label and is deliberately skipped.

    # os.walk order depends on the filesystem; sort for reproducible results.
    img_files.sort()
    label_files.sort()
    return img_files, label_files

def split_dataset(image_files, train_ratio=0.8, seed=None):
    """Shuffle file names and split them into training and validation lists.

    The input sequence is left untouched; shuffling happens on a copy.

    Args:
        image_files: Sequence of image file names (or paths) to split.
        train_ratio: Fraction of the items, between 0 and 1 inclusive, that
            goes into the training list. The count is truncated towards zero.
        seed: Optional seed. When given, the shuffle uses its own
            ``random.Random(seed)`` so the split is reproducible and the
            global random state is not consumed. When ``None``, the global
            ``random`` module state is used.

    Returns:
        A ``(train_files, val_files)`` tuple of lists.

    Raises:
        ValueError: If ``train_ratio`` is outside the range [0, 1].
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    # Work on a copy so the caller's list keeps its original order.
    shuffled = list(image_files)
    if seed is None:
        random.shuffle(shuffled)
    else:
        random.Random(seed).shuffle(shuffled)

    train_count = int(len(shuffled) * train_ratio)
    return shuffled[:train_count], shuffled[train_count:]



In [3]:
def preprocess_image(image_path, target_size=(224, 224)):
    """Read an image, resize it, stretch its value range and sharpen it.

    Args:
        image_path: Path passed to ``cv2.imread``.
        target_size: Size tuple passed to ``cv2.resize``.

    Returns:
        The sharpened image as a numpy array, or ``None`` when ``cv2.imread``
        returns ``None`` for ``image_path``. The pixel data is cast to float32
        before normalisation and this function applies no clipping afterwards.
    """
    loaded = cv2.imread(image_path)
    if loaded is None:
        return None

    resized = cv2.resize(loaded, target_size)
    as_rgb = cv2.cvtColor(resized, cv2.COLOR_BGR2RGB)

    # Min-max normalisation of a float32 copy onto the 0-255 range.
    as_float = as_rgb.astype(np.float32)
    stretched = cv2.normalize(as_float, None, 0, 255, cv2.NORM_MINMAX)

    # Undo the earlier BGR -> RGB conversion.
    as_bgr = cv2.cvtColor(stretched, cv2.COLOR_RGB2BGR)

    # Unsharp mask: 2 * image - blurred(image), using a 5x5 Gaussian blur.
    blurred = cv2.GaussianBlur(as_bgr, (5, 5), 0)
    return cv2.addWeighted(as_bgr, 2.0, blurred, -1.0, 0)

In [6]:
def save_files(file_list, dest_img_dir, dest_lbl_dir, source_dir, label_ext=".txt"):
    """Write preprocessed images and copy their label files.

    For every image name in ``file_list`` the label name is derived by
    swapping the image's extension for ``label_ext``. A pair is skipped, with
    a warning printed, when the label file is missing, when
    ``preprocess_image`` returns ``None``, or when the processed image cannot
    be written. The label is copied only after the image was written, so the
    two destination folders stay in sync.

    Args:
        file_list: Image file names, relative to ``source_dir``.
        dest_img_dir: Existing directory that receives the processed images.
        dest_lbl_dir: Existing directory that receives the label files.
        source_dir: Directory holding both the images and their labels.
        label_ext: Extension of the label files, including the leading dot.

    Returns:
        The number of image/label pairs that were saved.
    """
    saved = 0
    for img_name in file_list:
        # splitext swaps only the real extension; a plain str.replace('.jpg')
        # leaves e.g. 'a.png' unchanged, so the image would be mistaken for
        # its own label and copied into the labels folder.
        stem, _img_ext = os.path.splitext(img_name)
        lbl_name = stem + label_ext
        lbl_path = os.path.join(source_dir, lbl_name)
        image_path = os.path.join(source_dir, img_name)

        # Check if label file exists before proceeding
        if not os.path.isfile(lbl_path):
            print(f"Warning: Label file {lbl_name} not found for image {img_name}. Skipping this pair.")
            continue

        processed_img = preprocess_image(image_path)
        if processed_img is None:
            print(f"Warning: Image processing failed for {img_name}. Skipping.")
            continue

        # Save the processed image; cv2.imwrite reports failure through its
        # return value, so check it before copying the label.
        img_save_path = os.path.join(dest_img_dir, img_name)
        if not cv2.imwrite(img_save_path, processed_img):
            print(f"Warning: Could not write image {img_save_path}. Skipping this pair.")
            continue

        # Copy the label file
        label_save_path = os.path.join(dest_lbl_dir, lbl_name)
        shutil.copy(lbl_path, label_save_path)
        saved += 1

    return saved


def to_v5_directories(train_img_dir, val_img_dir, train_lbl_dir, val_lbl_dir, source_folder):
    """Split the images in ``source_folder`` into train and validation sets.

    Only images that have a label file with the same base name take part in
    the split, so the train/validation ratio and the printed counts describe
    usable pairs. The four destination directories are created when missing,
    then ``save_files`` writes the processed images and copies the labels.
    Presumably the resulting images/labels layout targets YOLOv5-style
    training, as the function name suggests - confirm with the training setup.

    Args:
        train_img_dir: Destination directory for training images.
        val_img_dir: Destination directory for validation images.
        train_lbl_dir: Destination directory for training labels.
        val_lbl_dir: Destination directory for validation labels.
        source_folder: Directory that holds the source images and labels.

    Returns:
        None. Progress and counts are printed.
    """
    images, labels = collect_files(source_folder)

    # Ensure we have matching images and labels: keep an image only when a
    # label with the same base name was collected.
    label_stems = {os.path.splitext(name)[0] for name in labels}
    paired_images = [name for name in images if os.path.splitext(name)[0] in label_stems]

    unmatched = len(images) - len(paired_images)
    if unmatched:
        print(f"Warning: {unmatched} file(s) without a matching label were left out of the split.")

    if len(paired_images) == 0:
        print("No matching image-label pairs found.")
        return

    train_imgs, val_imgs = split_dataset(paired_images)

    # Ensure destination directories exist
    for directory in (train_img_dir, val_img_dir, train_lbl_dir, val_lbl_dir):
        os.makedirs(directory, exist_ok=True)

    # Save train and validation files
    save_files(train_imgs, train_img_dir, train_lbl_dir, source_folder)
    save_files(val_imgs, val_img_dir, val_lbl_dir, source_folder)

    print("Training images:", len(train_imgs))
    print("Validation images:", len(val_imgs))

In [7]:
# Build the train2/val2 image and label folders from the source images folder.
to_v5_directories(
    train_img_dir="/content/drive/MyDrive/PCB/imagedataset/dataset/images/train2",
    val_img_dir="/content/drive/MyDrive/PCB/imagedataset/dataset/images/val2",
    train_lbl_dir="/content/drive/MyDrive/PCB/imagedataset/dataset/labels/train2",
    val_lbl_dir="/content/drive/MyDrive/PCB/imagedataset/dataset/labels/val2",
    source_folder="/content/drive/MyDrive/PCB/imagedataset/images",
)


Training images: 346
Validation images: 87
