# Evaluate inference against ground truth

After inference with Mask-RCNN or YOLOv7, convert the annotations into COCO JSON format with annotations.ipynb.

Import the inferred annotations into an annotation tool (CVAT by default) to review the results. Add, amend or delete segmentation masks as necessary.

Output the reviewed results in COCO json format. The reviewed results can now be used as the "ground truth" to compare with the inferred annotation.

The evaluation utility code is adapted from https://github.com/cocodataset/cocoapi/issues/426 and uses the pycocotools library.

In [1]:
# Utilities

from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval

__all__ = ['COCOEvaluator']

class COCOEvaluator(object):
    """Evaluate a detection annotation file against a ground-truth file.

    Both files are loaded with ``pycocotools.coco.COCO``. Annotations that
    lack a ``score`` are given one before ``COCOeval`` is run.
    """

    def __init__(self, anno_gt_file, anno_dt_file):
        """Load both annotation files and back-fill missing scores.

        Args:
            anno_gt_file: Path to the ground-truth COCO JSON file.
            anno_dt_file: Path to the detection COCO JSON file.
        """
        self.coco_gt = COCO(anno_gt_file)
        # The detection file is loaded as a complete COCO dataset instead of
        # going through self.coco_gt.loadRes(anno_dt_file).
        self.coco_dt = COCO(anno_dt_file)
        self._hack_coco_dt()

    def _hack_coco_dt(self):
        """Make sure every annotation in both datasets carries a ``score``.

        Detections keep the score they already have; only annotations
        without one are assigned 1.0.
        """
        # Files inferred by Mask-R-CNN have a score per annotation.
        # YOLOv7 doesn't support exporting the score in annotation files
        # (although the score is included in the prediction tensor det[:,4]
        # in predict.py), so the check has to be made per annotation dict:
        # testing `'score' in <list of dicts>` is always False and would
        # overwrite genuine scores.
        for ann in self.coco_dt.dataset.get('annotations', []):
            ann.setdefault('score', 1.0)

        # The ground truths (after editing in CVAT) don't have scores.
        for anno in self.coco_gt.dataset.get('annotations', []):
            anno['score'] = 1.0

    def evaluate(self, iou_type='segm'):
        """Run the COCO evaluation and print its summaries.

        Args:
            iou_type: IoU type handed to ``COCOeval`` (default ``'segm'``).

        Returns:
            The ``COCOeval`` instance after evaluation, accumulation and
            summarising.
        """
        coco_eval = COCOeval(self.coco_gt, self.coco_dt, iou_type)
        coco_eval.evaluate()
        coco_eval.accumulate()
        coco_eval.summarize()
        # NOTE(review): summarize_per_category() is presumably the patched
        # method from cocoapi issue #426 - confirm that the installed
        # pycocotools provides it.
        coco_eval.summarize_per_category()
        return coco_eval

In [2]:
# utilities for annotation file corrections

import json

import numpy as np
import pandas as pd

def findCategory(data):
    """Return the categories of a parsed COCO file as a DataFrame.

    Args:
        data: Parsed COCO JSON, a dict holding a "categories" list.

    Returns:
        DataFrame built from ``data["categories"]`` with the ``id`` column
        renamed to ``category_id`` and any ``supercategory`` column removed.
    """
    category = pd.DataFrame(data["categories"])
    # Tolerate files whose categories carry no "supercategory" field instead
    # of failing with a KeyError.
    category = category.drop(columns=['supercategory'], errors='ignore')
    return category.rename(columns={'id': 'category_id'})

def findImages(data):
    """Return the image records of a parsed COCO file as a DataFrame.

    Args:
        data: Parsed COCO JSON, a dict holding an "images" list.

    Returns:
        DataFrame built from ``data["images"]``. When all four CVAT-only
        columns are present they are removed; otherwise the frame is
        returned untouched.
    """
    # Unwanted columns exist if exported from CVAT, not if generated by my code
    cvat_only = ['license', 'flickr_url', 'coco_url', 'date_captured']
    images = pd.DataFrame(data["images"])
    if all(column in images.columns for column in cvat_only):
        return images.drop(columns=cvat_only)
    return images

def findAnnotations(data):
    """Return the annotation records of a parsed COCO file as a DataFrame.

    Args:
        data: Parsed COCO JSON, a dict holding an "annotations" list.

    Returns:
        DataFrame built directly from ``data["annotations"]``.
    """
    return pd.DataFrame(data["annotations"])

# convert all np.integer, np.floating and np.ndarray into json recognisable int, float and lists
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts numpy scalars and arrays to plain Python.

    Requires ``numpy`` to be imported as ``np`` at module level.
    """

    def default(self, obj):
        """Return a JSON-serialisable equivalent of ``obj``.

        Args:
            obj: Object the standard encoder could not serialise.

        Returns:
            ``bool``, ``int``, ``float`` or ``list`` for numpy booleans,
            integers, floats and arrays respectively.

        Raises:
            TypeError: From the base encoder, for any other type.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)

In [3]:
# Files to be evaluated
gt_file = './input/checked.json'
dt_file = './input/combined.json'

# "image_id" won't match because CVAT (gt_file) outputs all images, while my
# inference (dt_file) only outputs images with positive annotations.
# Evaluations are based on the number of images reviewed - hence the
# nos_image of gt_file.
# Task here: correct the image_id of the DETECTION FILE (dt_file).

# Store categories, images and annotations in separate dataframes.
# The files are only kept open while they are being parsed.
with open(gt_file, 'r') as file:
    data = json.load(file)
category = findCategory(data)
images = findImages(data)
# NOTE(review): the largest image id is used as the image count - this
# presumably relies on ids being consecutive from 1; confirm.
nos_image = images['id'].max()

with open(dt_file, 'r') as file:
    data2 = json.load(file)
category2 = findCategory(data2)
images2 = findImages(data2)
nos_image2 = images2['id'].max()
df2 = findAnnotations(data2)
# Attach each detection's file name; it is the key shared by both files.
df2 = df2.merge(images2[['id', 'file_name']], left_on='image_id', right_on='id')
df2 = df2.rename(columns={'id_x': 'id'})
# 'iscrowd' and 'attributes' are re-created below, so a detection file that
# never had them is acceptable.
df2 = df2.drop(columns=['iscrowd', 'attributes', 'id_y'], errors='ignore')

# Check categories
if len(category) != len(category2):
    print('file 1 has {} categories but file 2 has {}. Please check'.format(len(category), len(category2)))
gt_categories = zip(category['category_id'], category['name'])
dt_categories = zip(category2['category_id'], category2['name'])
# Compare position by position; zip stops at the shorter list instead of
# raising when the detection file has fewer categories.
for (gt_id, gt_name), (dt_id, dt_name) in zip(gt_categories, dt_categories):
    if gt_name != dt_name:
        print('category id: {} , {} in file 1 different from category id: {} , {} in file 2. Please check'.format(gt_id, gt_name, dt_id, dt_name))
# clean category for json dump
category = category.rename(columns={'category_id': 'id'})
category['supercategory'] = ""

# Check numbers of images - does the detection file contain fewer images than the ground truth file?
if nos_image2 > nos_image:
    print("detection file contains {} images and ground truth only contains {}. Check file".format(nos_image2, nos_image))
else:
    print("Number of images OK!")

# Change image id in dt_file to the one of gt_file, matching on file name.
if images['file_name'].duplicated().any():
    raise ValueError("ground truth file contains duplicate file names; cannot map image ids")
id_by_file_name = images.set_index('file_name')['id']
corrected_image_id = df2['file_name'].map(id_by_file_name)
unmatched = corrected_image_id.isna()
if unmatched.any():
    missing_names = sorted(df2.loc[unmatched, 'file_name'].unique())
    raise ValueError("detection file references images missing from the ground truth file: {}".format(missing_names))
# A lookup with no misses keeps the integer dtype; astype makes that explicit.
df2['image_id'] = corrected_image_id.astype(images['id'].dtype)
df2['iscrowd'] = 0
# One dict per row so the records do not alias a single shared object.
df2['attributes'] = [{'occluded': False} for _ in range(len(df2))]
df2 = df2.drop(columns=['file_name'])

# JSON with revised image_id exported for evaluation
dict_to_json = {
    "categories": category.to_dict('records'),
    "images": images.to_dict('records'),
    "annotations": df2.to_dict('records'),
}
with open("./input/dt_corrected.json", "w") as outfile:
    json.dump(dict_to_json, outfile, cls=NpEncoder)



Number of images OK!


In [4]:
# The ground-truth path carries over from the previous cell; the detections
# are now read from the file with corrected image ids.
gt_file = gt_file # follow from last comment tab
dt_file = './input/dt_corrected.json'

# NOTE: the name `eval` shadows the Python builtin of the same name.
eval = COCOEvaluator(anno_dt_file=dt_file, anno_gt_file=gt_file)
result = eval.evaluate()

# Paste results into a DataFrame.
# The metric titles are the comma-separated fields of the first line of the CSV.
metrics = list()
with open('./original/metrics.csv', 'r', encoding='utf-8-sig') as file:
    header_line = file.readline()
    metrics = header_line.strip().split(',')

# Transpose the per-category statistics so that each row has one value per metric title.
stats = list(map(list, zip(*result.category_stats)))
assessed = pd.DataFrame(stats, columns=metrics)

# Add the category names and move that column to the front.
assessed['category'] = category['name']
metrics.insert(0, 'category')
assessed = assessed.reindex(columns=metrics)

# Export the DataFrame
assessed.to_csv('evaluated.csv', index=False)

loading annotations into memory...
Done (t=0.02s)
creating index...
index created!
loading annotations into memory...
Done (t=0.09s)
creating index...
index created!
Running per image evaluation...
Evaluate annotation type *segm*
DONE (t=0.71s).
Accumulating evaluation results...
DONE (t=0.09s).
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=   all | maxDets=100 ] = 0.033
 Average Precision  (AP) @[ IoU=0.50      | area=   all | maxDets=100 ] = 0.094
 Average Precision  (AP) @[ IoU=0.75      | area=   all | maxDets=100 ] = 0.016
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = -1.000
 Average Precision  (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.001
 Average Precision  (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.041
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets=  1 ] = 0.098
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxDets= 10 ] = 0.125
 Average Recall     (AR) @[ IoU=0.50:0.95 | area=   all | maxD