In [5]:
import pandas as pd
import os 
from pathlib import Path
import shutil
from sklearn.model_selection import train_test_split
import yaml
import numpy as np
import cv2
import torch
from ultralytics import YOLO
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
from PIL import Image

In [6]:
INPUT_DATA_DIR = Path('C:/Users/Reinhard/Documents/computer_vision_projects/amini_cocoa_contamination_challenge')

In [8]:
DATASETS_DIR = Path('datasets')
TRAIN_IMAGES_DIR = DATASETS_DIR /'images'/'train'
TRAIN_LABELS_DIR = DATASETS_DIR /'labels'/'train'
VAL_IMAGES_DIR = DATASETS_DIR /'images'/'val'
VAL_LABELS_DIR = DATASETS_DIR /'labels'/'val'
TEST_IMAGES_DIR = DATASETS_DIR /'images'/'test'

In [9]:
for DIR in [TRAIN_IMAGES_DIR, VAL_IMAGES_DIR, TEST_IMAGES_DIR, VAL_LABELS_DIR, DATASETS_DIR]:
    if DIR.exists():
        shutil.rmtree(DIR)
    DIR.mkdir(parents=True, exist_ok =True)


In [12]:
shutil.unpack_archive(INPUT_DATA_DIR / 'dataset.zip', DATASETS_DIR)

In [16]:
train = pd.read_csv(INPUT_DATA_DIR / 'Train.csv')
test = pd.read_csv(INPUT_DATA_DIR / 'test.csv')

In [17]:
train.head()

Unnamed: 0,Image_ID,class,confidence,ymin,xmin,ymax,xmax,class_id,ImagePath
0,ID_nBgcAR.jpg,healthy,1.0,75.0,15.0,162.0,195.0,2,dataset/images/train/ID_nBgcAR.jpg
1,ID_nBgcAR.jpg,healthy,1.0,58.0,1.0,133.0,171.0,2,dataset/images/train/ID_nBgcAR.jpg
2,ID_nBgcAR.jpg,healthy,1.0,42.0,29.0,377.0,349.0,2,dataset/images/train/ID_nBgcAR.jpg
3,ID_Kw2v8A.jpg,healthy,1.0,112.0,124.0,404.0,341.0,2,dataset/images/train/ID_Kw2v8A.jpg
4,ID_Kw2v8A.jpg,healthy,1.0,148.0,259.0,413.0,412.0,2,dataset/images/train/ID_Kw2v8A.jpg


In [18]:
class_map = {cls: i for i , cls in enumerate(sorted(train['class'].unique().tolist()))}
train['class'] = train['class'].str.strip()
train['class_id'] = train['class'].map(class_map)

In [20]:
# split the training data int training and validation sets
train_names, val_names = train_test_split(train['Image_ID'].unique(), test_size=0.15, random_state=42)
train_df = train[train['Image_ID'].isin(train_names)]
val_df = train[train['Image_ID'].isin(val_names)]

In [31]:
data_yaml = {
    'path': str(DATASETS_DIR.absolute()),
    'train': str(TRAIN_IMAGES_DIR.absolute()),
    'val': str(VAL_IMAGES_DIR.absolute()),
    'nc': len(class_map),
    'names': list(class_map.keys())
}

In [22]:
list(class_map.keys())

['anthracnose', 'cssvd', 'healthy']

In [24]:
yaml_path = 'data.yaml'
with open(yaml_path, 'w') as file:
    yaml.dump(data_yaml, file, default_flow_style=False)

In [27]:
model = YOLO('yolov8m.pt')

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8m.pt to 'yolov8m.pt'...


100%|██████████| 49.7M/49.7M [43:30<00:00, 20.0kB/s]  


In [29]:
model.train(data= 'data.yaml', epochs=1, imgsz=1024, batch=16, device ='cpu', patience = 5,
            optimizer='AdamW', lr0 = 1e-3, lrf=0.01, cos_lr = True, amp=True,
            freeze=10, hsv_h=0.015, hsv_s=0.7, hsv_v=0.4,
            mixup=0.2, mosaic=1.0, flipud=0.5, fliplr=0.5)

Ultralytics 8.3.16  Python-3.12.4 torch-2.5.0+cpu CPU (Intel Core(TM) i5-6300U 2.40GHz)
[34m[1mengine\trainer: [0mtask=detect, mode=train, model=yolov8m.pt, data=data.yaml, epochs=1, time=None, patience=5, batch=16, imgsz=1024, save=True, save_period=-1, cache=False, device=cpu, workers=8, project=None, name=train, exist_ok=False, pretrained=True, optimizer=AdamW, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=True, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=10, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_masks=False, embed=None, show=False, save_frames=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, show_boxes=Tr

RuntimeError: Dataset 'data.yaml' error  
Dataset 'data.yaml' images not found , missing path 'C:\Users\Reinhard\Documents\computer_vision_projects\amini_cocoa_contamination_challenge\datasets\images\val'
Note dataset download directory is 'C:\Users\Reinhard\Documents\Lacuna malaria detection\datasets'. You can update this in 'C:\Users\Reinhard\AppData\Roaming\Ultralytics\settings.json'

In [None]:
model.val()


In [None]:
model = YOLO('runs/detect/train/weights/best.pt')


In [None]:
test_results = []
for image_file in tqdm(os.listdir(TEST_IMAGES_DIR));
    img_path = os.path.join(TEST_IMAGES_DIR, image_file)
    results = model(img_path)