In [1]:
import os
from pathlib import Path
from functools import partial

# The `inference` import below and the relative "assets/..." paths used later
# are resolved from the working directory, which has to be the parent of the
# directory the notebook starts in (presumably the repository root).
# Step up only while the package directory is not visible, so that re-running
# this cell does not climb one level higher each time.
if not Path("inference").is_dir():
    os.chdir('..')

from inference.paligemma_coco_benchmark import PaliGemmaCocoPredictor

In [2]:
# Ground-truth annotation file (val2017 instances); the path is relative to the
# current working directory.
json_annotations = "assets/instances_val2017.json"
# Directory that holds the per-model prediction JSON files evaluated below.
predictions_root = Path("assets/predictions")

def calculate_coco_metric(json_predictions, *, conf_threshold=0.001,
                          iou_threshold=0.7, class_agnostic=False):
    """
    Runs the COCO evaluation for a single predictions file.

    The work is delegated to ``PaliGemmaCocoPredictor.calculate_coco_metrics``,
    which receives the ground-truth annotations (``json_annotations``), the
    predictions file and a pre-configured NMS callable. The metric summary shown
    under each cell is produced during that call; nothing is returned here.

    Args:
        json_predictions: File name of the predictions JSON, resolved relative
            to ``predictions_root``.
        conf_threshold: Forwarded to ``apply_nms_to_predictions`` (presumably the
            minimum confidence a prediction needs to be kept - TODO confirm).
        iou_threshold: Forwarded to ``apply_nms_to_predictions`` (presumably the
            overlap above which boxes suppress each other - TODO confirm).
        class_agnostic: Forwarded to ``apply_nms_to_predictions``.

    Returns:
        None.
    """
    # Bind the NMS settings once so the evaluator only has to supply the predictions.
    nms_callable = partial(
        PaliGemmaCocoPredictor.apply_nms_to_predictions,
        conf_threshold=conf_threshold,
        iou_threshold=iou_threshold,
        class_agnostic=class_agnostic,
    )
    PaliGemmaCocoPredictor.calculate_coco_metrics(
        json_annotations,
        predictions_root / json_predictions,
        nms_callable=nms_callable,
    )

## YOLOv11x (TensorRT+Triton)

In [3]:
# Evaluate assets/predictions/yolo_v11x.json against the val2017 annotations;
# the bbox metrics printed below come from this call.
calculate_coco_metric("yolo_v11x.json")

loading annotations into memory...
Done (t=0.40s)
creating index...
index created!
Loading and preparing results...
DONE (t=1.62s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=21.37s).
Accumulating evaluation results...
DONE (t=3.03s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.536
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.704
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.585
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.370
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.587
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.691
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.388
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.641
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDet

## PaliGemma-2-3b-pt-448

In [4]:
# Evaluate the paligemma2-3b-pt-448 predictions ("classes_per_call_1" run).
calculate_coco_metric("paligemma2-3b-pt-448_classes_per_call_1.json")

loading annotations into memory...
Done (t=0.33s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.40s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=8.46s).
Accumulating evaluation results...
DONE (t=1.32s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.239
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.365
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.253
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.087
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.262
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.409
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.295
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.349
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets

## PaliGemma-2-10b-pt-448

In [5]:
# Evaluate the paligemma2-10b-pt-448 predictions ("classes_per_call_1" run).
calculate_coco_metric("paligemma2-10b-pt-448_classes_per_call_1.json")

loading annotations into memory...
Done (t=0.28s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.24s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=5.93s).
Accumulating evaluation results...
DONE (t=0.91s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.223
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.337
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.237
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.071
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.236
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.404
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.260
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.309
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets

In [6]:
# Same paligemma2-10b-pt-448 model as the previous cell, but scored on the
# "classes_per_call_40" predictions file (presumably 40 classes were requested
# per model call - TODO confirm).
calculate_coco_metric("paligemma2-10b-pt-448_classes_per_call_40.json")

loading annotations into memory...
Done (t=0.27s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.25s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=6.80s).
Accumulating evaluation results...
DONE (t=0.97s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.186
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.286
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.193
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.049
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.207
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.353
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.236
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.289
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets

## PaliGemma-2-10b-mix-448

In [7]:
# Evaluate the paligemma2-10b-mix-448 predictions ("classes_per_call_10" run).
calculate_coco_metric("paligemma2-10b-mix-448_classes_per_call_10.json")

loading annotations into memory...
Done (t=0.33s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.28s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=6.31s).
Accumulating evaluation results...
DONE (t=0.84s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.220
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.339
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.230
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.073
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.249
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.394
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.271
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.330
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets

## PaliGemma-2-28b-pt-448

In [8]:
# Evaluate the paligemma2-28b-pt-448 predictions ("classes_per_call_4" run).
# NOTE(review): the AP of 0.009 reported below is far lower than for every other
# checkpoint in this notebook - check that this predictions file is complete and
# valid before drawing conclusions from it.
calculate_coco_metric("paligemma2-28b-pt-448_classes_per_call_4.json")

loading annotations into memory...
Done (t=0.28s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.02s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=5.09s).
Accumulating evaluation results...
DONE (t=0.68s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.009
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.019
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.008
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.002
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.004
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.019
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.018
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.019
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets

## PaliGemma-2-10b-pt-224

In [9]:
# Evaluate the paligemma2-10b-pt-224 predictions ("classes_per_call_1" run).
calculate_coco_metric("paligemma2-10b-pt-224_classes_per_call_1.json")

loading annotations into memory...
Done (t=0.17s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.03s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=6.21s).
Accumulating evaluation results...
DONE (t=0.84s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.192
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.302
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.200
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.033
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.200
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.382
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.223
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.257
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets

## PaliGemma-2-10b-pt-896

In [10]:
# Evaluate the paligemma2-10b-pt-896 predictions ("classes_per_call_1" run).
calculate_coco_metric("paligemma2-10b-pt-896_classes_per_call_1.json")

loading annotations into memory...
Done (t=0.32s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.07s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=7.56s).
Accumulating evaluation results...
DONE (t=1.05s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.218
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.329
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.230
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.069
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.231
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.403
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.262
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.309
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets

## PaliGemma-2-10b-pt-448 fine-tuned for closed-set object detection

In [11]:
# Evaluate the predictions of the fine-tuned paligemma2-10b-pt-448 checkpoint
# (closed-set object detection).
calculate_coco_metric("paligemma2-10b-pt-448-finetuned_closed_set.json")

loading annotations into memory...
Done (t=0.29s)
creating index...
index created!
Loading and preparing results...
DONE (t=0.33s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *bbox*
DONE (t=8.30s).
Accumulating evaluation results...
DONE (t=1.28s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.296
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.442
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.314
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.097
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.332
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.537
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.285
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.396
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets

## Getting per-class metrics

In [12]:
import fiftyone as fo
import fiftyone.utils.coco as fouc
from fiftyone import ViewField as F
import json
from functools import partial

# def get_accurate_report(dataset, results, eval_key="eval"):
    
#     gt_support = dataset.count_values("ground_truth_detections.detections.label")
    
#     classes = sorted(gt_support.keys())
    
#     print(f"\n{'='*75}")
#     print(f"{'Class':<20} {'Precision':>10} {'Recall':>10} {'F1':>10} {'Support':>10}")
#     print(f"{'='*75}")
    
#     total_support = 0
#     total_weighted_precision = 0
#     total_weighted_recall = 0
    
#     for cls in classes:
#         support = gt_support[cls]
        
#         cls_metrics = results.metrics(classes=[cls])
#         precision = cls_metrics.get('precision', 0.0)
#         recall = cls_metrics.get('recall', 0.0)
#         f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
        
#         print(f"{cls:<20} {precision:>10.2f} {recall:>10.2f} {f1:>10.2f} {support:>10d}")
        
#         total_support += support
#         total_weighted_precision += precision * support
#         total_weighted_recall += recall * support
    
#     weighted_precision = total_weighted_precision / total_support if total_support > 0 else 0
#     weighted_recall = total_weighted_recall / total_support if total_support > 0 else 0
#     weighted_f1 = 2 * weighted_precision * weighted_recall / (weighted_precision + weighted_recall) if (weighted_precision + weighted_recall) > 0 else 0
    
#     print(f"{'='*75}")
#     print(f"{'weighted avg':<20} {weighted_precision:>10.2f} {weighted_recall:>10.2f} {weighted_f1:>10.2f} {total_support:>10d}")
#     print(f"\n{'mAP (COCO 0.5:0.95)':<30} {results.mAP():>10.4f}")
#     print(f"{'mAR (COCO 0.5:0.95)':<30} {results.mAR():>10.4f}")


def get_class_metrics(json_predictions, *,
                      images_dir="/mnt/d/Sergey/ML/datasets/coco/images/val2017/",
                      conf_threshold=0.001, iou_threshold=0.7, class_agnostic=False):
    """
    Loads COCO dataset, predictions and computes metrics

    Builds a temporary FiftyOne dataset named "coco-eval" from ``images_dir`` and
    ``json_annotations``, attaches the NMS-filtered predictions of one model,
    runs the COCO-style detection evaluation and prints mAP, mAR and the
    per-class report. The temporary dataset is deleted afterwards, also when one
    of the steps raises.

    Args:
        json_predictions: File name of the predictions JSON, resolved relative
            to ``predictions_root``.
        images_dir: Directory with the val2017 images. Defaults to the path this
            notebook was originally run with; pass your own location.
        conf_threshold: Forwarded to ``apply_nms_to_predictions`` (presumably the
            minimum confidence a prediction needs to be kept - TODO confirm).
        iou_threshold: Forwarded to ``apply_nms_to_predictions`` (presumably the
            overlap above which boxes suppress each other - TODO confirm).
        class_agnostic: Forwarded to ``apply_nms_to_predictions``.

    Returns:
        None. All results are printed.
    """
    banner = "=" * 80
    print(f"\n{banner}")
    print(f"Processing: {json_predictions}")
    print(banner)

    # A dataset with this name may be left over from an interrupted run.
    if fo.dataset_exists("coco-eval"):
        fo.delete_dataset("coco-eval")

    print("\n📥 Loading COCO dataset...")
    dataset = fo.Dataset.from_dir(
        dataset_type=fo.types.COCODetectionDataset,
        data_path=images_dir,
        labels_path=json_annotations,
        include_id=True,
        label_field="ground_truth",
        name="coco-eval",
    )

    try:
        # Only the detections field is evaluated below; the segmentation field is dropped.
        if "ground_truth_segmentations" in dataset.get_field_schema():
            dataset.delete_sample_field("ground_truth_segmentations")
            print("✅ Deleted ground_truth_segmentations")

        num_images = len(dataset)
        num_detections = dataset.count("ground_truth_detections.detections")
        print("\n📊 Ground Truth:")
        print(f"  Images: {num_images}")
        print(f"  Detections: {num_detections}")

        print("\n🔮 Loading predictions...")
        # NOTE: `_appply_nms_benchmark_format` (three p's) is spelled exactly as
        # the project method is called here; do not "correct" it in this cell only.
        predictions = PaliGemmaCocoPredictor._appply_nms_benchmark_format(
            str(predictions_root / json_predictions),
            nms_callable=partial(
                PaliGemmaCocoPredictor.apply_nms_to_predictions,
                conf_threshold=conf_threshold,
                iou_threshold=iou_threshold,
                class_agnostic=class_agnostic,
            ),
        )

        unique_image_ids = {p["image_id"] for p in predictions}
        print(f"  Total predictions: {len(predictions)}")
        print(f"  Unique image_ids: {len(unique_image_ids)}")

        print("\n📥 Adding predictions to dataset...")
        fouc.add_coco_labels(
            dataset,
            label_field="predictions",
            labels_or_path=predictions,
            categories=dataset.info["categories"],
            # Predictions are matched to samples through the COCO image id that
            # `include_id=True` stored when the dataset was loaded.
            coco_id_field="ground_truth_coco_id",
        )

        print("\n⚙️ Running COCO evaluation...")
        results = dataset.evaluate_detections(
            "predictions",
            gt_field="ground_truth_detections",
            method="coco",
            eval_key="eval",
            compute_mAP=True,
        )

        print("\n📈 Evaluation Results:")
        print("mAP:", results.mAP())
        print("mAR:", results.mAR())
        results.print_report()
    finally:
        # Always remove the temporary dataset, even if loading or evaluation failed.
        dataset.delete()

    print("\n✅ Done!")



# Prediction files to produce the per-class report for. Entries are toggled by
# (un)commenting them; only the active ones are processed.
prediction_files = [
   "paligemma2-10b-pt-448-finetuned_closed_set.json",
   # "paligemma2-3b-pt-448_classes_per_call_1.json",
   # "paligemma2-10b-pt-448_classes_per_call_1.json",
   # "paligemma2-10b-pt-448_classes_per_call_40.json",
   # "paligemma2-10b-mix-448_classes_per_call_10.json",
   # "paligemma2-28b-pt-448_classes_per_call_4.json",
   # "paligemma2-10b-pt-224_classes_per_call_1.json",
   # "paligemma2-10b-pt-896_classes_per_call_1.json",
   # "yolo_v11x.json",
]

# Each call rebuilds the "coco-eval" dataset from scratch, so the files are
# processed strictly one after another.
for pred_file in prediction_files:
    get_class_metrics(pred_file)


Processing: paligemma2-10b-pt-448-finetuned_closed_set.json

📥 Loading COCO dataset...
 100% |███████████████| 5000/5000 [57.7s elapsed, 0s remaining, 93.1 samples/s]      
✅ Deleted ground_truth_segmentations

📊 Ground Truth:
  Images: 5000
  Detections: 36781

🔮 Loading predictions...
  Total predictions: 62783
  Unique image_ids: 4952

📥 Adding predictions to dataset...

⚙️ Running COCO evaluation...
Evaluating detections...
 100% |███████████████| 5000/5000 [1.7m elapsed, 0s remaining, 58.4 samples/s]      
Performing IoU sweep...
 100% |███████████████| 5000/5000 [1.2m elapsed, 0s remaining, 82.5 samples/s]      

📈 Evaluation Results:
mAP: 0.2961506103061076
mAR: 0.3992885693284017
                precision    recall  f1-score   support

      airplane       0.47      0.79      0.59       143
         apple       0.19      0.28      0.23       244
      backpack       0