In [7]:
import csv
import os
import random
from shutil import copyfile

import matplotlib.pyplot as plt
from tqdm import tqdm
from ultralytics import YOLO

# Paths
dataset_folder = "train/images"  # Original folder containing all images
labels_folder = "train/labels"   # Folder containing corresponding labels
output_folder = "output"         # Folder for the split dataset (train/val)

# Create the output folder structure:
#   output/images/{train,val} and output/labels/{train,val}
for subdir in ("images/train", "images/val", "labels/train", "labels/val"):
    os.makedirs(os.path.join(output_folder, subdir), exist_ok=True)

# Split ratio
split_ratio = 0.8  # 80% train, 20% val

# Fixed seed so the split is reproducible. The output folders are reused
# across runs (exist_ok=True), so a different shuffle on a re-run would copy
# images into the other split on top of the old copies and leak train images
# into val.
split_seed = 42

# Get all image files. os.listdir returns names in arbitrary order, so sort
# first; otherwise the seeded shuffle below would still not be deterministic.
image_files = sorted(
    f for f in os.listdir(dataset_folder) if f.endswith(('.jpg', '.png', '.jpeg'))
)

# Shuffle and split dataset. A private Random instance is used so the global
# random state is left untouched.
random.Random(split_seed).shuffle(image_files)
train_size = int(len(image_files) * split_ratio)
train_files = image_files[:train_size]
val_files = image_files[train_size:]

# Function to copy files to the corresponding folders
def copy_files(file_list, src_folder, dest_folder, label_src, label_dest):
    """Copy images and their matching label files into a split folder.

    Args:
        file_list: Image file names (not paths) to copy.
        src_folder: Folder the images are read from.
        dest_folder: Folder the images are copied into.
        label_src: Folder searched for each image's ``.txt`` label file.
        label_dest: Folder the label files are copied into.

    The label file name is the image name with its final extension replaced
    by ``.txt``. When no label file exists a message is printed; the image
    itself has already been copied at that point and stays in ``dest_folder``.
    """
    for file_name in tqdm(file_list, desc=f"Copying to {dest_folder}"):
        # Swap only the trailing extension. Chained str.replace calls missed
        # '.jpeg' images entirely and would also rewrite a '.jpg'/'.png'
        # substring in the middle of a name.
        label_name = os.path.splitext(file_name)[0] + '.txt'
        image_path = os.path.join(src_folder, file_name)
        label_path = os.path.join(label_src, label_name)

        # Copy image
        copyfile(image_path, os.path.join(dest_folder, file_name))

        # Copy label if it exists
        if os.path.exists(label_path):
            copyfile(label_path, os.path.join(label_dest, label_name))
        else:
            print(f"Label not found for {file_name}, skipping...")

# Copy train and validation files
train_images_dir = os.path.join(output_folder, "images/train")
val_images_dir = os.path.join(output_folder, "images/val")
copy_files(train_files, dataset_folder, train_images_dir, labels_folder, os.path.join(output_folder, "labels/train"))
copy_files(val_files, dataset_folder, val_images_dir, labels_folder, os.path.join(output_folder, "labels/val"))

print(f"Dataset split complete: {len(train_files)} train, {len(val_files)} val.")

# Bind yaml_path before it is printed. Without this the print below raises
# NameError on a fresh interpreter, because the only other assignment comes
# later in the script where the YAML file is actually written.
yaml_path = os.path.join(output_folder, "dataset.yaml")

print("Train images path:", os.path.abspath(train_images_dir))
print("Val images path:", os.path.abspath(val_images_dir))
print("Dataset YAML path:", yaml_path)

# List a few files for debugging
print("Sample train images:", os.listdir(train_images_dir)[:5])
print("Sample val images:", os.listdir(val_images_dir)[:5])


# Create dataset.yaml for YOLOv8
# The image directories are written as absolute paths, resolved up front so
# the template below only interpolates plain names.
train_images_abs = os.path.abspath(os.path.join(output_folder, 'images/train'))
val_images_abs = os.path.abspath(os.path.join(output_folder, 'images/val'))

yaml_content = f"""
train: {train_images_abs}
val: {val_images_abs}

nc: 8  # Number of classes

names: [
  'Motorcycle',
  'Auto',
  'Car',
  'Bus',
  'LCV',
  'Truck',
  'Tractor',
  'Multi-Axle'
]
"""

# Write the configuration next to the split folders and report where it went.
yaml_path = os.path.join(output_folder, "dataset.yaml")
with open(yaml_path, 'w') as yaml_file:
    yaml_file.write(yaml_content)

print(f"Dataset configuration file saved to {yaml_path}.")

# Train YOLOv8 model
model = YOLO("yolov8s.pt")  # Load YOLOv8 small model

# Training settings gathered in one place and passed as keyword arguments.
train_settings = {
    "data": yaml_path,   # Path to dataset.yaml
    "epochs": 50,        # Number of epochs
    "batch": 16,         # Batch size
    "imgsz": 640,        # Image size
    "device": "cpu",     # Use "cuda:0" if GPU is available
}
results = model.train(**train_settings)


# Extract training metrics
# The object returned by model.train() holds the final validation metrics,
# not a per-epoch history, so indexing it with 'box_map' / 'val_box_map'
# fails. Per-epoch values are read from the results.csv that the trainer
# logs into its run directory instead.
# NOTE(review): relies on model.trainer.save_dir and on the Ultralytics
# results.csv column names used below; confirm against the installed version.
results_csv = os.path.join(str(model.trainer.save_dir), "results.csv")
with open(results_csv, newline="") as csv_file:
    # Header names can be space-padded depending on the Ultralytics version,
    # so keys and values are stripped before use.
    training_results = [
        {key.strip(): value.strip() for key, value in row.items() if key}
        for row in csv.DictReader(csv_file)
    ]

# mAP is only evaluated on the validation split, so there is no training mAP
# curve to draw; the two standard validation mAP variants are plotted instead.
epochs = [int(float(row["epoch"])) for row in training_results]
val_map50 = [float(row["metrics/mAP50(B)"]) for row in training_results]
val_map = [float(row["metrics/mAP50-95(B)"]) for row in training_results]

plt.figure(figsize=(10, 6))
plt.plot(epochs, val_map50, label="Validation mAP@0.5", marker='o')
plt.plot(epochs, val_map, label="Validation mAP@0.5:0.95", marker='s')
plt.title("Validation mAP per Epoch")
plt.xlabel("Epochs")
plt.ylabel("Mean Average Precision (mAP)")
plt.legend()
plt.grid(True)
plt.show()


Copying to output/images/train: 100%|██████████| 6574/6574 [00:09<00:00, 680.15it/s]
Copying to output/images/val: 100%|██████████| 1644/1644 [00:01<00:00, 989.03it/s] 


Dataset split complete: 6574 train, 1644 val.
Train images path: /Users/advaithsajeev/Desktop/Software Engineering/output/images/train
Val images path: /Users/advaithsajeev/Desktop/Software Engineering/output/images/val
Dataset YAML path: output/dataset.yaml
Sample train images: ['Highway_1052_2020-07-30_jpg.rf.2ce6b8c4af445f6ef08c4492930da255.jpg', 'Highway_1415_2020-07-30_jpg.rf.821418c425e8559f8f20984547c82f35.jpg', 'Highway_1261_2020-07-30_jpg.rf.39ff0e3ad5bad698e0f7e8aea0a44855.jpg', 'Highway_288_2020-07-30_jpg.rf.ba20c424a1c57408ec8b6a9d0d37b28c.jpg', 'Highway_357_2020-07-30_jpg.rf.15e91ef197000093cbef88b920bed0ba.jpg']
Sample val images: ['Highway_127_2020-07-30_jpg.rf.0f9a95779179376acc5ca88f85ea120c.jpg', 'highway_3541_2020-08-26.jpg', 'highway_3554_2020-08-26.jpg', 'Highway_521_2020-07-30_jpg.rf.787aeec0fee759ad7a8358bf2bccd6ce.jpg', 'Highway_255_2020-07-30_jpg.rf.a212d8c97da485521361a4346844f941.jpg']
Dataset configuration file saved to output/dataset.yaml.
Downloading https

100%|██████████| 21.5M/21.5M [00:18<00:00, 1.23MB/s]


New https://pypi.org/project/ultralytics/8.3.67 available 😃 Update with 'pip install -U ultralytics'
Ultralytics 8.3.15 🚀 Python-3.9.19 torch-2.2.1 CPU (Apple M1)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov8s.pt, data=output/dataset.yaml, epochs=50, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt

[34m[1mtrain: [0mScanning /Users/advaithsajeev/Desktop/Software Engineering/output/labels/train... 6574 images, 15 backgrounds, 0 corrupt: 100%|██████████| 6574/6574 [00:03<00:00, 1703.89it/s]






[34m[1mtrain: [0mNew cache created: /Users/advaithsajeev/Desktop/Software Engineering/output/labels/train.cache
[34m[1malbumentations: [0mBlur(p=0.01, blur_limit=(3, 7)), MedianBlur(p=0.01, blur_limit=(3, 7)), ToGray(p=0.01, num_output_channels=3, method='weighted_average'), CLAHE(p=0.01, clip_limit=(1.0, 4.0), tile_grid_size=(8, 8))


  check_for_updates()
[34m[1mval: [0mScanning /Users/advaithsajeev/Desktop/Software Engineering/output/labels/val... 1644 images, 3 backgrounds, 0 corrupt: 100%|██████████| 1644/1644 [00:01<00:00, 1533.20it/s]

[34m[1mval: [0mNew cache created: /Users/advaithsajeev/Desktop/Software Engineering/output/labels/val.cache





Plotting labels to runs/detect/train/labels.jpg... 
[34m[1moptimizer:[0m 'optimizer=auto' found, ignoring 'lr0=0.01' and 'momentum=0.937' and determining best 'optimizer', 'lr0' and 'momentum' automatically... 
[34m[1moptimizer:[0m AdamW(lr=0.000833, momentum=0.9) with parameter groups 63 weight(decay=0.0), 70 weight(decay=0.0005), 69 bias(decay=0.0)
[34m[1mTensorBoard: [0mmodel graph visualization added ✅
Image sizes 640 train, 640 val
Using 0 dataloader workers
Logging results to [1mruns/detect/train[0m
Starting training for 50 epochs...

      Epoch    GPU_mem   box_loss   cls_loss   dfl_loss  Instances       Size


       1/50         0G      1.919      4.518      1.439         59        640:   1%|          | 3/411 [02:02<4:37:41, 40.84s/it]


KeyboardInterrupt: 