In [None]:
# Fetch the XmlToTxt converter; `git clone` creates ./XmlToTxt in the current working directory.
# NOTE(review): a later cell chdirs into /content/drive/MyDrive/PCB/imagedataset/XmlToTxt —
# confirm this cell is run from /content/drive/MyDrive/PCB/imagedataset so the checkout lands there.
!git clone https://github.com/isabek/XmlToTxt.git

Cloning into 'XmlToTxt'...
remote: Enumerating objects: 105, done.[K
remote: Counting objects: 100% (32/32), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 105 (delta 24), reused 22 (delta 20), pack-reused 73[K
Receiving objects: 100% (105/105), 16.44 KiB | 3.29 MiB/s, done.
Resolving deltas: 100% (51/51), done.


In [None]:
import os
# Run the converter from inside its checkout: the arguments below (classes.txt, xml, out)
# are relative paths and are resolved against the working directory.
# NOTE(review): os.chdir changes the working directory for every later cell as well.
os.chdir(r"/content/drive/MyDrive/PCB/imagedataset/XmlToTxt")
# NOTE(review): presumably converts the XML annotations under ./xml into text labels under ./out
# using the class list in classes.txt — confirm against the XmlToTxt README.
!python xmltotxt.py -c classes.txt -xml xml -out out



In [None]:
import os
import cv2
import numpy as np
import shutil
import glob
import random

def collect_files(source_folder, img_ext=".jpg", lbl_ext=".txt"):
    """Return the sorted image paths and sorted label paths found in a folder.

    Only files directly inside ``source_folder`` are considered; a file is
    selected when its name ends with the requested extension.

    Args:
        source_folder: Directory to scan.
        img_ext: Suffix identifying image files.
        lbl_ext: Suffix identifying label files.

    Returns:
        A ``(image_files, label_files)`` tuple of sorted path lists.
    """
    def _matching(extension):
        # One glob per extension, sorted so the listing order is stable.
        found = glob.glob(os.path.join(source_folder, f"*{extension}"))
        found.sort()
        return found

    return _matching(img_ext), _matching(lbl_ext)

def split_dataset(image_files, train_ratio=0.8, seed=None):
    """Shuffle image paths and split them into training and validation lists.

    The input sequence is left untouched; shuffling happens on a copy.

    Args:
        image_files: Sequence of image file paths.
        train_ratio: Fraction of the files (between 0 and 1 inclusive) that
            goes to the training list; the count is truncated to an integer.
        seed: Optional seed for a reproducible split. When ``None`` the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call still controls the result.

    Returns:
        A ``(train_files, val_files)`` tuple of lists.

    Raises:
        ValueError: If ``train_ratio`` is outside ``[0, 1]``.
    """
    # A negative ratio would otherwise turn into a negative slice index and
    # silently produce a split with the wrong proportions.
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio!r}")

    # Shuffle a copy so the caller's list keeps its original order.
    shuffled = list(image_files)
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    train_count = int(len(shuffled) * train_ratio)
    return shuffled[:train_count], shuffled[train_count:]



In [None]:
def preprocess_image(image_path, target_size=(224, 224)):
    """Load an image and return a resized, contrast-stretched, denoised copy.

    Args:
        image_path: Path of the image file, read with ``cv2.imread``.
        target_size: ``(width, height)`` handed to ``cv2.resize``.

    Returns:
        The processed image as an 8-bit array in the channel order produced
        by ``cv2.imread``, or ``None`` when the file cannot be read.
    """
    img = cv2.imread(image_path)
    if img is None:
        return None
    img = cv2.resize(img, target_size)
    # Min-max stretch to the 0-255 range, explicitly kept 8-bit:
    # cv2.fastNlMeansDenoisingColored only accepts CV_8UC3/CV_8UC4 input, so
    # the former float32 result made the denoising step raise cv2.error.
    # The stretch treats all channels symmetrically, so the former
    # BGR->RGB->BGR round trip around it was a no-op and is dropped.
    img = cv2.normalize(img, None, 0, 255, cv2.NORM_MINMAX, dtype=cv2.CV_8U)
    img = cv2.GaussianBlur(img, (5, 5), 0)
    # Positional arguments: h=10, hColor=10, templateWindowSize=7, searchWindowSize=21.
    img = cv2.fastNlMeansDenoisingColored(img, None, 10, 10, 7, 21)
    return img

In [None]:
def save_files(file_list, dest_img_dir, dest_lbl_dir, label_ext=".txt"):
    """Write preprocessed images and copy their label files into a split.

    Each path in ``file_list`` is run through ``preprocess_image`` and written
    to ``dest_img_dir`` under its original file name. The label file that sits
    next to the source image and shares its stem (with ``label_ext``) is
    copied to ``dest_lbl_dir``. Images that cannot be read are reported and
    skipped.

    Args:
        file_list: Iterable of source image paths.
        dest_img_dir: Existing directory that receives the processed images.
        dest_lbl_dir: Existing directory that receives the label files.
        label_ext: Extension of the label files, including the dot.

    Raises:
        FileNotFoundError: If a readable image has no label file next to it.
            Raised before the image is written, so no image is left in
            ``dest_img_dir`` without its label.
        OSError: If ``cv2.imwrite`` reports that the image was not written.
    """
    for img_path in file_list:
        src_dir, img_name = os.path.split(img_path)
        # splitext swaps only the final extension; str.replace('.jpg', ...)
        # rewrote every '.jpg' inside the name and left other extensions as-is.
        lbl_name = os.path.splitext(img_name)[0] + label_ext
        lbl_path = os.path.join(src_dir, lbl_name)

        processed_img = preprocess_image(img_path)
        if processed_img is None:
            print(f"Skipping unreadable image: {img_path}")
            continue

        # Check the label first so a missing one cannot leave an orphan image.
        if not os.path.isfile(lbl_path):
            raise FileNotFoundError(
                f"Label file not found for image {img_path}: {lbl_path}"
            )

        dest_img_path = os.path.join(dest_img_dir, img_name)
        # cv2.imwrite signals failure through its return value, not an exception.
        if not cv2.imwrite(dest_img_path, processed_img):
            raise OSError(f"Could not write image: {dest_img_path}")
        shutil.copy(lbl_path, os.path.join(dest_lbl_dir, lbl_name))

def to_v5_directories(train_img_dir, val_img_dir, train_lbl_dir, val_lbl_dir, source_folder):
    """Split the images of ``source_folder`` and fill the four split directories.

    The image list from ``collect_files`` is divided by ``split_dataset``;
    the destination directories are created if needed, each split is handed
    to ``save_files``, and the size of both splits is printed.

    Args:
        train_img_dir: Destination for training images.
        val_img_dir: Destination for validation images.
        train_lbl_dir: Destination for training labels.
        val_lbl_dir: Destination for validation labels.
        source_folder: Folder holding the source images and their labels.
    """
    # Only the image list drives the split; save_files derives each label
    # path from its image path.
    all_images, _ = collect_files(source_folder)
    train_split, val_split = split_dataset(all_images)

    # Make sure every destination exists before anything is written.
    for directory in (train_img_dir, val_img_dir, train_lbl_dir, val_lbl_dir):
        os.makedirs(directory, exist_ok=True)

    # Training split first, then validation.
    for split, img_dir, lbl_dir in (
        (train_split, train_img_dir, train_lbl_dir),
        (val_split, val_img_dir, val_lbl_dir),
    ):
        save_files(split, img_dir, lbl_dir)

    print("Training images:", len(train_split))
    print("Validation images:", len(val_split))


In [None]:
# Build the train/val image and label folders from the annotated images.
# NOTE(review): labels are looked up next to each image in source_folder, while the
# converter cell writes its output to XmlToTxt/out — confirm the .txt files were
# copied into .../imagedataset/images before running this cell.
to_v5_directories(
    train_img_dir="/content/drive/MyDrive/PCB/imagedataset/dataset/images/train",
    val_img_dir="/content/drive/MyDrive/PCB/imagedataset/dataset/images/val",
    train_lbl_dir="/content/drive/MyDrive/PCB/imagedataset/dataset/labels/train",
    val_lbl_dir="/content/drive/MyDrive/PCB/imagedataset/dataset/labels/val",
    source_folder="/content/drive/MyDrive/PCB/imagedataset/images",
)




Training images are :  346
Validation images are :  86
