In [None]:
from sklearn.model_selection import train_test_split
import cv2
import os
import shutil
import yaml

# Root directory of the dataset; images are read from its "images" sub-folder
# and the matching label files from its "labels" sub-folder.
root_dir = "/content/drive/MyDrive/MinorProject/Dataset/car-number-plate"
image_dir = os.path.join(root_dir, "images")
label_dir = os.path.join(root_dir, "labels")

# Folder (inside root_dir) where the train/valid/test split is written
output_dir = os.path.join(root_dir, "datasets")

# Accepted image extensions, in lower case with the leading dot; file
# extensions are lower-cased before being compared against this list.
image_formats = [".jpg", ".jpeg", ".png",".webp"]

def get_file_list(folder, extensions):
    """Return the paths of all files under *folder* with an accepted extension.

    The folder is searched recursively. A file is kept when its extension
    (including the leading dot, lower-cased) is contained in *extensions*,
    so the entries of *extensions* are expected to be lower case.

    Args:
        folder: Directory to scan. A missing directory yields an empty list,
            because ``os.walk`` ignores scan errors by default.
        extensions: Container of accepted extensions, e.g. ``[".jpg", ".png"]``.

    Returns:
        A list of file paths (each joined onto *folder*), sorted
        lexicographically.
    """
    files = [
        os.path.join(root, name)
        for root, _, filenames in os.walk(folder)
        for name in filenames
        if os.path.splitext(name)[1].lower() in extensions
    ]
    # os.walk yields entries in an arbitrary, filesystem-dependent order.
    # Sorting makes the result stable so that a seeded shuffle/split applied
    # to it is reproducible across runs and machines.
    files.sort()
    return files

# Collect every image found (recursively) under the image folder
image_paths = get_file_list(image_dir, image_formats)
print("Total images found:", len(image_paths))

# Pair each image with the label file of the same base name in label_dir;
# images without a label are reported and left out of the split.
image_label_pairs = []
for img_path in image_paths:
    img_name = os.path.splitext(os.path.basename(img_path))[0]
    label_path = os.path.join(label_dir, f"{img_name}.txt")
    if not os.path.exists(label_path):
        print(f" Label not found for image: {img_path}")
        continue
    image_label_pairs.append((img_path, label_path))

# Two-stage split with a fixed seed: 70% of the pairs go to training, and the
# held-out 30% is divided again 30/70 into validation and test
# (roughly 9% and 21% of all pairs respectively).
train_set, val_test = train_test_split(
    image_label_pairs,
    test_size=0.3,
    random_state=42,
)
val_set, test_set = train_test_split(
    val_test,
    test_size=0.7,
    random_state=42,
)

# Helper function to save files
def save_files(file_pairs, image_out, label_out):
    """Copy each (image, label) pair into the given output folders.

    Both output folders are created if they do not exist yet. Every file
    keeps its base name, so a file already present under the same name in
    the destination is overwritten.

    Args:
        file_pairs: Iterable of ``(image_path, label_path)`` tuples.
        image_out: Destination folder for the image files.
        label_out: Destination folder for the label files.
    """
    for out_dir in (image_out, label_out):
        os.makedirs(out_dir, exist_ok=True)

    for image_src, label_src in file_pairs:
        # Image first, then its label, each under its original file name.
        for src, dest_dir in ((image_src, image_out), (label_src, label_out)):
            shutil.copy(src, os.path.join(dest_dir, os.path.basename(src)))

# Copy each split into its own images/<split> and labels/<split> folder
split_targets = (
    (train_set, "images/train", "labels/train"),
    (val_set, "images/valid", "labels/valid"),
    (test_set, "images/test", "labels/test"),
)
for split_pairs, image_subdir, label_subdir in split_targets:
    save_files(
        split_pairs,
        os.path.join(output_dir, image_subdir),
        os.path.join(output_dir, label_subdir),
    )

# Dataset description: "path" is the split root, the train/val/test entries
# are image folders relative to it, and "names" lists the single class.
yaml_dict = {
    "path": output_dir,
    "train": "images/train",
    "val": "images/valid",
    "test": "images/test",
    "names": ["number plate"],
}

# The config file is written next to the dataset folders, directly in root_dir
yaml_path = os.path.join(root_dir, "number-plate.yaml")
with open(yaml_path, "w") as f:
    yaml.dump(yaml_dict, f)

print("Dataset split and YAML file created at:", yaml_path)


Total images found: 304
Dataset split and YAML file created at: /content/drive/MyDrive/MinorProject/Dataset/car-number-plate/number-plate.yaml
