### 1. Download and import the ultralytics library

In [1]:
# !pip install ultralytics
import os

from ultralytics import YOLO
import cv2

### 2. (Optional) Load and train the model 

In [2]:
%%time

model = YOLO("yolov8x.pt")
model.train(data="./data.yaml", epochs=100)   

New https://pypi.org/project/ultralytics/8.0.94 available  Update with 'pip install -U ultralytics'
Ultralytics YOLOv8.0.91  Python-3.11.3 torch-2.0.0+cu118 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 6144MiB)
[34m[1myolo\engine\trainer: [0mtask=detect, mode=train, model=yolov8x.pt, data=./data.yaml, epochs=100, patience=50, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=None, exist_ok=False, pretrained=False, optimizer=SGD, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=0, resume=False, amp=True, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, show=False, save_txt=False, save_conf=False, save_crop=False, show_labels=True, show_conf=True, vid_stride=1, line_thickness=3, visualize=False, augment=False, agnostic_nms=False, classes=None, retina_

CPU times: total: 2min 11s
Wall time: 5min 43s


### 3. Load the model if already trained before

In [2]:
# Re-create the model from the saved best.pt weights so that the training cell
# does not have to be re-run.
# NOTE(review): assumes the training run wrote to runs/detect/train — confirm the
# run directory name if training has been executed more than once.
pre_trained = YOLO("./runs/detect/train/weights/best.pt")

### 4. Evaluate the trained model on every dataset split (train, val, test)

In [4]:
# Score the trained weights on the training split.
# save_json=True additionally writes the predictions to a predictions.json file.
# The returned metrics are bound to `_` so they are not echoed as cell output.
_ = pre_trained.val(
    split='train',
    save_json=True,
)

Ultralytics YOLOv8.0.91  Python-3.11.3 torch-2.0.0+cu118 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 6144MiB)
Model summary (fused): 268 layers, 68126457 parameters, 0 gradients, 257.4 GFLOPs
[34m[1mval: [0mScanning C:\git-repos\spring-2023\CSE-573-CVIP\final_project\yolo\train\labels.cache... 5 images, 0 backgrounds, 0[0m
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:05<0
                   all          5        261      0.879       0.93      0.974        0.5
                  half          5        106      0.714      0.991      0.975      0.487
               quarter          5        124      0.961          1      0.995      0.525
                 whole          5         31      0.961      0.799      0.951      0.487
Speed: 8.8ms preprocess, 69.8ms inference, 0.0ms loss, 3.4ms postprocess per image
Saving runs\detect\val\predictions.json...
Results saved to [1mruns\detect\val[0m


In [5]:
# Score the trained weights on the validation split.
# save_json=True additionally writes the predictions to a predictions.json file.
# The returned metrics are bound to `_` so they are not echoed as cell output.
_ = pre_trained.val(
    split='val',
    save_json=True,
)

Ultralytics YOLOv8.0.91  Python-3.11.3 torch-2.0.0+cu118 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 6144MiB)
[34m[1mval: [0mScanning C:\git-repos\spring-2023\CSE-573-CVIP\final_project\yolo\valid\labels.cache... 2 images, 0 backgrounds, 0[0m
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:03<0
                   all          2        144      0.881       0.83       0.97      0.495
                  half          2         51      0.706       0.98      0.959      0.476
               quarter          2         73      0.937      0.986      0.987      0.479
                 whole          2         20          1      0.524      0.965       0.53
Speed: 1.5ms preprocess, 168.0ms inference, 0.0ms loss, 5.6ms postprocess per image
Saving runs\detect\val2\predictions.json...
Results saved to [1mruns\detect\val2[0m


In [6]:
# Score the trained weights on the held-out test split.
# save_json=True additionally writes the predictions to a predictions.json file.
# The returned metrics are bound to `_` so they are not echoed as cell output.
_ = pre_trained.val(
    split='test',
    save_json=True,
)

Ultralytics YOLOv8.0.91  Python-3.11.3 torch-2.0.0+cu118 CUDA:0 (NVIDIA GeForce RTX 3060 Laptop GPU, 6144MiB)
[34m[1mval: [0mScanning C:\git-repos\spring-2023\CSE-573-CVIP\final_project\yolo\test\labels.cache... 1 images, 0 backgrounds, 0 [0m
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100%|██████████| 1/1 [00:00<0
                   all          1         58      0.876      0.906      0.986      0.529
                  half          1         25      0.694          1      0.968       0.52
               quarter          1         18      0.933          1      0.995      0.499
                 whole          1         15          1      0.719      0.995      0.566
Speed: 2.0ms preprocess, 299.2ms inference, 0.0ms loss, 6.4ms postprocess per image
Saving runs\detect\val3\predictions.json...
Results saved to [1mruns\detect\val3[0m


In [9]:
# Run inference on every image in ./test/images and keep the results for the
# cropping step below. The annotated image, the label text files (including
# confidences) and the per-detection crops are all saved.
# conf=0.307 is the confidence threshold at which we get our highest F1 score.
results = pre_trained.predict(
    './test/images',
    save=True,
    save_txt=True,
    save_conf=True,
    save_crop=True,
    conf=0.307,
)


image 1/1 C:\git-repos\spring-2023\CSE-573-CVIP\final_project\yolo\test\images\finding-you-1.png: 640x480 30 halfs, 21 quarters, 13 wholes, 318.5ms
Speed: 6.0ms preprocess, 318.5ms inference, 4.0ms postprocess per image at shape (1, 3, 640, 640)
Results saved to [1mruns\detect\predict2[0m
1 label saved to runs\detect\predict2\labels


### 5. Save the detected objects as a sequence in a specified directory

In [83]:
# Crop every detection out of the test image and save the crops in reading order:
# rows from top to bottom (group index), boxes from left to right inside a row
# (bbox index). Each file name also carries the predicted class label.

# Load the test image
image_path = './test/images/finding-you-1.png'
img = cv2.imread(image_path)
if img is None:
    # cv2.imread reports a missing/unreadable file by returning None, not by raising
    raise FileNotFoundError('Could not read image: {}'.format(image_path))

# Get the classes
class_dict = {
    0: 'half',
    1: 'quarter',
    2: 'whole'
}
classes = results[0].boxes.cls.cpu().numpy()

# Get bounding box tensor sorted by Y coordinate. The class ids are re-ordered
# with the same permutation so every box keeps its own label; indexing the
# unsorted `classes` by a position inside a sorted group would pick the label
# of an unrelated detection.
y_order = results[0].boxes.xyxy[:, 1].argsort()
bbox_tensor = results[0].boxes.xyxy[y_order]
classes_by_y = classes[y_order.cpu().numpy()]

# Group bounding boxes by Y coordinate: a new row starts whenever the top edge
# of a box is at least one (smallest) box height below the previous box.
# Every group entry is a (bbox, class_id) pair.
groups = []
if len(bbox_tensor) > 0:
    min_box_height = results[0].boxes.xywh[:, 3].min().item()
    current_group = []
    for i in range(len(bbox_tensor)):
        entry = (bbox_tensor[i], int(classes_by_y[i]))
        if i == 0:
            current_group.append(entry)
        elif bbox_tensor[i][1] - bbox_tensor[i-1][1] < min_box_height:
            current_group.append(entry)
        else:
            groups.append(current_group)
            current_group = [entry]

    # Add last group to groups list
    groups.append(current_group)

# cv2.imwrite returns False (without raising) when the target directory is
# missing, so make sure it exists before writing any crop.
os.makedirs('./results', exist_ok=True)

# Sort each group by X min coordinate
for group_idx, group in enumerate(groups):
    group.sort(key=lambda entry: entry[0][0].item())

    # Iterate over the bounding boxes in the group and save them
    for bbox_idx, (bbox, class_id) in enumerate(group):

        # Get the coordinates of the bounding box
        x1, y1, x2, y2 = bbox.tolist()

        # Get ROI and save it
        roi = img[int(y1):int(y2), int(x1):int(x2)]

        label = class_dict[class_id]
        out_path = './results/group_{}_bbox_{}_{}.png'.format(group_idx+1, bbox_idx+1, label)
        if not cv2.imwrite(out_path, roi):
            raise IOError('Could not write crop: {}'.format(out_path))