In [1]:

import os
from transformers import DetrForObjectDetection, DetrImageProcessor, RTDetrImageProcessor, RTDetrForObjectDetection, YolosImageProcessor, YolosForObjectDetection
import json
import random
import cv2
from tqdm.auto import tqdm
import torch
from torchmetrics.detection import MeanAveragePrecision

In [2]:
DATASET_DIR = "../hand-cursive-detr"
ANNOTATION_FILE_NAME = "_val_annotations_coco.json"
#HF_CACHE = "/home/ralvarez22/Documentos/llm_data/llm_cache"
DEVICE = "cuda"
CHECKPOINT = "../finetuned/yolos/Roccia/V_1"
CONFIDENCE_TRESHOLD = 0.5

In [3]:
DATASET_FILE = os.path.join(DATASET_DIR, ANNOTATION_FILE_NAME)

In [4]:
dataset_json = json.load(open(DATASET_FILE, "r"))

In [5]:
del dataset_json["info"]
del dataset_json["licenses"]
del dataset_json["categories"]

In [6]:
dataset_json.keys()

dict_keys(['images', 'annotations'])

In [7]:
images_with_boxes = []
for e in dataset_json["images"]:
    image_annotations = [
        x for x in dataset_json["annotations"] if x["image_id"] == e["id"]
    ]
    images_with_boxes.append(
        {"id": e["id"], "image": e["file_name"], "boxes": image_annotations}
    )

In [8]:
metric = MeanAveragePrecision(iou_type="bbox")

In [9]:
detr_proc = YolosImageProcessor.from_pretrained(CHECKPOINT)
detr_model = YolosForObjectDetection.from_pretrained(
    pretrained_model_name_or_path=CHECKPOINT, ignore_mismatched_sizes=True
).to(DEVICE)

In [10]:
def generate_bboxes(image):
    with torch.no_grad():

        # load image and predict
        inputs = detr_proc(images=image, return_tensors='pt').to(DEVICE)
        outputs = detr_model(**inputs)

        # post-process
        target_sizes = torch.tensor([image.shape[:2]]).to(DEVICE)
        results = detr_proc.post_process_object_detection(
            outputs=outputs, 
            threshold=CONFIDENCE_TRESHOLD, 
            target_sizes=target_sizes
        )[0]
    return results["scores"], results["boxes"], results["labels"]

In [11]:
pred_data = []
tgt_data = []
for test_item in tqdm(images_with_boxes):
    boxes = torch.tensor([ [ x["bbox"][0], x["bbox"][1], x["bbox"][0] + x["bbox"][2], x["bbox"][1] + x["bbox"][3] ]  for x in test_item["boxes"]], device=DEVICE)
    cats = torch.tensor([x["category_id"] for x in test_item["boxes"]], device=DEVICE)
    input_image = os.path.join(DATASET_DIR, test_item["image"])
    image_pixels = cv2.imread(input_image)
    scores, pred_boxes, labels = generate_bboxes(image_pixels)
    
    tgt_data.append({
        "boxes": boxes,
        "labels": cats
    })
    pred_data.append({
        "boxes": pred_boxes,
        "scores": scores,
        "labels": labels
    })
    
    

  0%|          | 0/479 [00:00<?, ?it/s]

In [12]:
pred_data[0]

{'boxes': tensor([[ 968.2692, 1314.0963, 1018.1793, 1388.9069],
         [ 654.6461, 1844.5946,  740.4993, 1928.9562],
         [ 124.0910, 1636.5319,  230.3690, 1712.5210],
         [ 953.8359, 1992.5837, 1050.0455, 2074.8975],
         [ 655.1810, 1996.2584,  743.6088, 2078.9512],
         [ 124.1609, 1448.0712,  232.4949, 1526.6598],
         [ 496.3203, 1547.2450,  719.8286, 1629.9586],
         [ 672.5476, 1632.8647,  929.8230, 1717.1392],
         [ 777.8441, 1301.4846,  965.3986, 1378.2616],
         [1022.0452,  965.6229, 1078.1942, 1041.1416],
         [ 686.5954, 1636.6053,  940.1331, 1709.1573],
         [ 358.7355, 1839.5397,  444.8556, 1921.1079],
         [1104.0913, 1990.5964, 1204.4180, 2078.1897],
         [ 814.6650, 1843.4429,  912.8052, 1928.7087],
         [ 496.4165, 1645.8458,  665.7089, 1722.1987],
         [1139.5825, 1213.0808, 1195.4320, 1290.7639],
         [1033.1924, 1558.0426, 1086.1774, 1633.2048],
         [1085.0557,  959.4350, 1250.2191, 1033.1709],
 

In [13]:
tgt_data[0]

{'boxes': tensor([[ 495., 1560.,  713., 1640.],
         [ 497.,  861.,  845.,  929.],
         [1250., 1852., 1339., 1931.],
         [1392., 1997., 1507., 2081.],
         [ 657., 2001.,  745., 2078.],
         [ 522., 1294.,  768., 1371.],
         [ 665., 1649.,  917., 1725.],
         [ 134., 1449.,  236., 1524.],
         [ 125.,  941.,  238., 1014.],
         [1001., 1659., 1106., 1726.],
         [ 216., 1992.,  311., 2074.],
         [1198., 1215., 1390., 1290.],
         [ 511., 1996.,  621., 2076.],
         [ 223., 1844.,  315., 1922.],
         [1084., 1567., 1249., 1643.],
         [ 128., 1109.,  243., 1181.],
         [ 367., 1843.,  447., 1921.],
         [ 368., 1991.,  459., 2070.],
         [1244., 1995., 1341., 2077.],
         [ 106.,  765.,  228.,  838.],
         [1084.,  956., 1242., 1029.],
         [1153.,  852., 1208.,  934.],
         [1015.,  953., 1069., 1030.],
         [ 663., 1847.,  739., 1931.],
         [ 956., 1996., 1040., 2077.],
         [ 527.,

In [14]:
metric.update(pred_data, tgt_data)

In [15]:
metric.compute()

{'map': tensor(0.5878),
 'map_50': tensor(0.9623),
 'map_75': tensor(0.6584),
 'map_small': tensor(0.),
 'map_medium': tensor(0.5173),
 'map_large': tensor(0.6345),
 'mar_1': tensor(0.0137),
 'mar_10': tensor(0.1291),
 'mar_100': tensor(0.6702),
 'mar_small': tensor(0.),
 'mar_medium': tensor(0.6057),
 'mar_large': tensor(0.7119),
 'map_per_class': tensor(-1.),
 'mar_100_per_class': tensor(-1.),
 'classes': tensor(1, dtype=torch.int32)}