## DETR Evaluation Notebook

This notebook allows to evaluate the finetuned model with a portion of the training dataset

The only metric for this evaluation is IOU (Intersection over union), because the DETR is only generating bounding boxes, not the label.


In [1]:

import os
from transformers import DetrForObjectDetection, DetrImageProcessor
import torch
import json
import random
import cv2

2024-07-03 11:52:46.909937: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [49]:
DATASET_DIR = "/home/ralvarez22/Documentos/trocr_hand/trocr_llm/datasets/cursive_hand_coco/train"
ANNOTATION_FILE_NAME = "_annotations.coco.json"
HF_CACHE = "/home/ralvarez22/Documentos/llm_data/llm_cache"
DEVICE = "cuda"
CHECKPOINT = '/home/ralvarez22/Documentos/trocr_hand/trocr_llm/finetuned/detr/Akivili/V_2'
CONFIDENCE_TRESHOLD = 0.8

In [3]:
DATASET_FILE = os.path.join(DATASET_DIR, ANNOTATION_FILE_NAME)

In [4]:
dataset_json = json.load(open(DATASET_FILE, "r"))

In [5]:
del dataset_json["info"]
del dataset_json["licenses"]
del dataset_json["categories"]

In [6]:
dataset_json.keys()

dict_keys(['images', 'annotations'])

In [7]:
len(dataset_json["annotations"])

3858

In [8]:
images_with_boxes = []
for e in dataset_json["images"]:
    image_annotations = [ x for x in dataset_json["annotations"] if x["image_id"] == e["id"] ]
    images_with_boxes.append({
        "id": e["id"],
        "image": e["file_name"],
        "boxes": image_annotations
    })

In [9]:
len(images_with_boxes)

59

In [50]:
items_to_eval = int(len(images_with_boxes) * 0.9)

In [51]:
dataset_test = random.sample(images_with_boxes, items_to_eval)

In [12]:
detr_proc = DetrImageProcessor.from_pretrained(CHECKPOINT, local_files_only=True)
detr_model = DetrForObjectDetection.from_pretrained(
    pretrained_model_name_or_path=CHECKPOINT, 
    ignore_mismatched_sizes=True, local_files_only=True
).to(DEVICE)

In [13]:
def intersection_over_union(gt, pred):
    # determine the (x, y)-coordinates of the intersection rectangle
    x_a = max(gt[0], pred[0])
    y_a = max(gt[1], pred[1])
    x_b = min(gt[2], pred[2])
    y_b = min(gt[3], pred[3])
    # if there is no overlap between predicted and ground-truth box
    if x_b < x_a or y_b < y_a:
        return 0.0
    # compute the area of intersection rectangle
    inter_area = max(0, x_b - x_a + 1) * max(0, y_b - y_a + 1)
    # compute the area of both the prediction and ground-truth
    # rectangles
    box_a_area = (gt[2] - gt[0] + 1) * (gt[3] - gt[1] + 1)
    box_b_area = (pred[2] - pred[0] + 1) * (pred[3] - pred[1] + 1)
    # compute the intersection over union by taking the intersection
    # area and dividing it by the sum of prediction + ground-truth
    # areas - the intersection area
    iou = inter_area / float(box_a_area + box_b_area - inter_area)
    # return the intersection over union value
    return iou

In [14]:
def generate_bboxes(image):
    with torch.no_grad():

        # load image and predict
        inputs = detr_proc(images=image, return_tensors='pt').to(DEVICE)
        outputs = detr_model(**inputs)

        # post-process
        target_sizes = torch.tensor([image.shape[:2]]).to(DEVICE)
        results = detr_proc.post_process_object_detection(
            outputs=outputs, 
            threshold=CONFIDENCE_TRESHOLD, 
            target_sizes=target_sizes
        )[0]
    return results["scores"], results["boxes"]

In [39]:
def get_image_iou(target_boxes, generated_boxes):
    image_iou = 0
    for gt_box in target_boxes:
        for gen_box in generated_boxes:
            tgt_box = gt_box.tolist()
            pred_box = gen_box.tolist()
            # The model and the dataset contains the following format for the bounding boxes:
            # X Top Left, Y Top Left, Width, Height
            # To match the correct format, I apply a transform to get the following:
            # X Top Left, Y Top Left, X Bottom Right, Y Bottom Right
            # --> XTL, YTL, XTL + Width, YTL + Height
            xy_tg_box = [ tgt_box[0], tgt_box[1], tgt_box[0] + tgt_box[2], tgt_box[1] + tgt_box[3] ]
            xy_pred_box = [ pred_box[0], pred_box[1], pred_box[0] + pred_box[2], pred_box[1] + pred_box[3] ]
            iou = intersection_over_union(xy_tg_box, xy_pred_box)
            if iou > 0:
                image_iou += iou
    return image_iou / len(target_boxes)

In [52]:
iou_prom = 0
for test_item in dataset_test:
    # For every item, I test the expected or ground truth box with all the generated boxes from the model
    # The main reason is because the model ignores the 'num_queries' configuration and generates all the posible bounding boxes
    gt_boxes = torch.tensor([ x["bbox"] for x in test_item["boxes"] ], device=DEVICE)
    input_image = os.path.join(DATASET_DIR, test_item["image"])
    image_pixels = cv2.imread(input_image)
    _, pred_boxes = generate_bboxes(image_pixels)
    
    item_iou = get_image_iou(gt_boxes, pred_boxes)
    print("Item IOU: {:.4f}".format(item_iou))
    
    iou_prom += item_iou / len(dataset_test)


Item IOU: 0.4643
Item IOU: 0.3287
Item IOU: 0.4593
Item IOU: 0.2956
Item IOU: 0.3990
Item IOU: 0.3272
Item IOU: 0.4109
Item IOU: 0.3832
Item IOU: 0.4066
Item IOU: 0.4713
Item IOU: 0.2625
Item IOU: 0.4241
Item IOU: 0.4225
Item IOU: 0.3209
Item IOU: 0.4025
Item IOU: 0.4129
Item IOU: 0.4049
Item IOU: 0.3520
Item IOU: 0.3868
Item IOU: 0.2751
Item IOU: 0.3477
Item IOU: 0.3817
Item IOU: 0.4049
Item IOU: 0.4524
Item IOU: 0.4289
Item IOU: 0.4018
Item IOU: 0.5095
Item IOU: 0.4304
Item IOU: 0.2751
Item IOU: 0.3535
Item IOU: 0.3501
Item IOU: 0.4622
Item IOU: 0.3215
Item IOU: 0.4121
Item IOU: 0.2989
Item IOU: 0.2586
Item IOU: 0.3059
Item IOU: 0.3760
Item IOU: 0.3524
Item IOU: 0.3494
Item IOU: 0.3382
Item IOU: 0.2941
Item IOU: 0.2637
Item IOU: 0.3223
Item IOU: 0.3005
Item IOU: 0.4123
Item IOU: 0.4226
Item IOU: 0.3543
Item IOU: 0.3164
Item IOU: 0.3735
Item IOU: 0.3452
Item IOU: 0.3911
Item IOU: 0.3468


In [54]:
print("DETR Avegarage IOU: {:.4f}".format(iou_prom))

DETR Avegarage IOU: 0.3691
