In [1]:

import os
from ultralytics import YOLO
import json
import cv2
from tqdm.auto import tqdm
import torch
from torchmetrics.detection import MeanAveragePrecision

In [5]:
DATASET_DIR = "../hand-cursive-detr"
ANNOTATION_FILE_NAME = "_val_annotations_coco.json"
#HF_CACHE = "/home/ralvarez22/Documentos/llm_data/llm_cache"
DEVICE = "cuda"
CHECKPOINT = "../finetuned/yolo/Zani_V1/weights/last.pt"
CONFIDENCE_TRESHOLD = 0.5

In [6]:
DATASET_FILE = os.path.join(DATASET_DIR, ANNOTATION_FILE_NAME)

In [7]:
dataset_json = json.load(open(DATASET_FILE, "r"))

In [8]:
del dataset_json["info"]
del dataset_json["licenses"]
del dataset_json["categories"]

In [9]:
dataset_json.keys()

dict_keys(['images', 'annotations'])

In [10]:
images_with_boxes = []
for e in dataset_json["images"]:
    image_annotations = [
        x for x in dataset_json["annotations"] if x["image_id"] == e["id"]
    ]
    images_with_boxes.append(
        {"id": e["id"], "image": e["file_name"], "boxes": image_annotations}
    )

In [11]:
metric = MeanAveragePrecision(iou_type="bbox", class_metrics=True)

In [12]:
yolo_model = YOLO(CHECKPOINT)

In [13]:
def generate_bboxes(image):
    results = yolo_model.predict(image, conf=CONFIDENCE_TRESHOLD)[0].boxes
    return results.conf, results.xyxy, results.cls.int()

In [14]:
pred_data = []
tgt_data = []
for test_item in tqdm(images_with_boxes):
    boxes = torch.tensor([ [ x["bbox"][0], x["bbox"][1], x["bbox"][0] + x["bbox"][2], x["bbox"][1] + x["bbox"][3] ]  for x in test_item["boxes"]], device=DEVICE)
    cats = torch.tensor([x["category_id"] for x in test_item["boxes"]], device=DEVICE)
    input_image = os.path.join(DATASET_DIR, test_item["image"])
    image_pixels = cv2.imread(input_image)
    scores, pred_boxes, labels = generate_bboxes(image_pixels)
    
    tgt_data.append({
        "boxes": boxes,
        "labels": cats
    })
    pred_data.append({
        "boxes": pred_boxes,
        "scores": scores,
        "labels": labels
    })
    
    

  0%|          | 0/479 [00:00<?, ?it/s]


0: 640x512 53 words, 47.6ms
Speed: 5.4ms preprocess, 47.6ms inference, 105.6ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 53 words, 8.7ms
Speed: 2.2ms preprocess, 8.7ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 59 words, 10.0ms
Speed: 2.0ms preprocess, 10.0ms inference, 1.2ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 65 words, 9.1ms
Speed: 2.6ms preprocess, 9.1ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 56 words, 9.5ms
Speed: 3.0ms preprocess, 9.5ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 55 words, 9.5ms
Speed: 2.1ms preprocess, 9.5ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 44 words, 9.3ms
Speed: 2.8ms preprocess, 9.3ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 512)

0: 640x512 56 words, 11.8ms
Speed: 3.0ms preprocess, 11.8ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 512)


In [15]:
metric.update(pred_data, tgt_data)

In [16]:
metric.compute()

{'map': tensor(0.6210),
 'map_50': tensor(0.9735),
 'map_75': tensor(0.6745),
 'map_small': tensor(0.),
 'map_medium': tensor(0.5585),
 'map_large': tensor(0.6547),
 'mar_1': tensor(0.0138),
 'mar_10': tensor(0.1352),
 'mar_100': tensor(0.7224),
 'mar_small': tensor(0.),
 'mar_medium': tensor(0.6773),
 'mar_large': tensor(0.7515),
 'map_per_class': tensor(0.6210),
 'mar_100_per_class': tensor(0.7224),
 'classes': tensor(1, dtype=torch.int32)}