In [47]:
import pandas as pd

In [56]:
# Load both ';'-separated annotation samples and re-index them by (filename, camera_id),
# so that rows of the two frames can be matched on the same key.
df_ground_truth = (
    pd.read_csv('../annotations/ground_truth_sample.csv', index_col=0, sep=';')
    .set_index(['filename', 'camera_id'])
)
df_yolo_predictions = (
    pd.read_csv('../annotations/yolo_labels_sample.csv', index_col=0, sep=';')
    .set_index(['filename', 'camera_id'])
)
# TODO: add +1 for frame_id in yolo real predictions

df_yolo_predictions

Unnamed: 0_level_0,Unnamed: 1_level_0,datetime,frame_id,label_class,x,y,w,h,confidance
filename,camera_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,1,2023-06-09 05:00:02,1087,8,1332.005,544.440,187.09,81.58,1
cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,1,2023-06-09 05:00:02,1088,8,1331.500,542.500,187.00,81.00,1
cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,1,2023-06-09 05:00:02,1089,8,1332.500,543.500,187.00,81.00,1
cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,1,2023-06-09 05:00:02,1090,8,1332.500,543.500,187.00,81.00,1
cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,1,2023-06-09 05:00:02,1091,8,1332.500,543.500,187.00,81.00,1
...,...,...,...,...,...,...,...,...,...
cfg_raw_cam_01_fhd_h265_20230609T064001.mkv,1,2023-06-09 06:40:01,1106,8,1778.550,695.210,282.90,214.38,1
cfg_raw_cam_01_fhd_h265_20230609T064001.mkv,1,2023-06-09 06:40:01,1107,8,1789.550,698.350,258.90,208.70,1
cfg_raw_cam_01_fhd_h265_20230609T064001.mkv,1,2023-06-09 06:40:01,1108,8,1802.020,715.750,235.96,208.00,1
cfg_raw_cam_01_fhd_h265_20230609T064001.mkv,1,2023-06-09 06:40:01,1109,8,1815.850,729.330,208.30,170.74,1


In [64]:
def calculate_iou(ground_truth:tuple, prediction:tuple) -> float:
    """
        Calculate intersection over union (IoU) for two bounding boxes.

        Each box is given as (x, y, w, h). The arithmetic treats (x, y) as the
        centre of the box and (w, h) as its full width and height, i.e. the
        box spans x - w/2 .. x + w/2 horizontally and y - h/2 .. y + h/2 vertically.

        Args:
            ground_truth: tuple of (x, y, w, h)
            prediction: tuple of (x, y, w, h)
        Returns:
            Intersection area divided by union area. Returns 0.0 when the union
            area is not positive (e.g. both boxes have zero area) instead of
            dividing by zero.
    """
    # Index access (rather than unpacking) keeps accepting any indexable
    # container with at least four elements, as before.
    gt_x, gt_y, gt_w, gt_h = (ground_truth[i] for i in range(4))
    pr_x, pr_y, pr_w, pr_h = (prediction[i] for i in range(4))

    # Corners of the overlap rectangle: the innermost of the two boxes' edges.
    intersection_xtl = max(gt_x - gt_w / 2, pr_x - pr_w / 2)
    intersection_ytl = max(gt_y - gt_h / 2, pr_y - pr_h / 2)
    intersection_xbr = min(gt_x + gt_w / 2, pr_x + pr_w / 2)
    intersection_ybr = min(gt_y + gt_h / 2, pr_y + pr_h / 2)

    # A negative extent means the boxes do not overlap along that axis.
    overlap_w = max(0, intersection_xbr - intersection_xtl)
    overlap_h = max(0, intersection_ybr - intersection_ytl)
    intersection_area = overlap_w * overlap_h

    union_area = gt_w * gt_h + pr_w * pr_h - intersection_area
    if union_area <= 0:
        # Degenerate boxes: there is no area to compare, so report no overlap.
        return 0.0
    return intersection_area / union_area

# Group the results per (filename, camera_id) index key and compare, for each key, the
# frame ids present in the ground truth with those present in the YOLO predictions:
#   true positive  - frame id present in both datasets
#   false positive - frame id present only in the predictions
#   false negative - frame id present only in the ground truth
# For frames present in both datasets the IoU between the first ground-truth box and the
# first predicted box of that frame is computed as well.
def _rows_for_key(df, key):
    """Return every row of `df` indexed by `key` as a DataFrame (empty when the key is absent)."""
    # A list selector always yields a DataFrame, even when only one row matches, and the
    # membership test avoids a KeyError for keys that exist in only one of the datasets.
    return df.loc[[key]] if key in df.index else df.iloc[0:0]

evaluation_dict = dict()
for video_key in set(df_ground_truth.index) | set(df_yolo_predictions.index):
    # Select the rows of this key once instead of re-slicing the frames for every frame id.
    ground_truth_rows = _rows_for_key(df_ground_truth, video_key)
    prediction_rows = _rows_for_key(df_yolo_predictions, video_key)
    ground_truth_frame_ids = set(ground_truth_rows.frame_id)
    prediction_frame_ids = set(prediction_rows.frame_id)
    corresponding_frames = ground_truth_frame_ids & prediction_frame_ids

    evaluation_dict[video_key] = dict()
    evaluation_dict[video_key]['true_positive'] = len(corresponding_frames)
    evaluation_dict[video_key]['false_positive'] = len(prediction_frame_ids - ground_truth_frame_ids)
    evaluation_dict[video_key]['false_negative'] = len(ground_truth_frame_ids - prediction_frame_ids)

    # Keep only the first box per frame id (same row the previous `.iloc[0]` lookup picked)
    # and index by frame id so each frame is a direct lookup.
    ground_truth_boxes = ground_truth_rows.drop_duplicates('frame_id').set_index('frame_id')[['x', 'y', 'w', 'h']]
    prediction_boxes = prediction_rows.drop_duplicates('frame_id').set_index('frame_id')[['x', 'y', 'w', 'h']]

    # Calculate the IoU for each shared frame, in ascending frame order so that
    # `frames_iou` is deterministic.
    frames_iou = [
        calculate_iou(ground_truth_boxes.loc[frame_id].to_numpy(), prediction_boxes.loc[frame_id].to_numpy())
        for frame_id in sorted(corresponding_frames)
    ]
    # With no shared frames there is nothing to average: store NaN instead of dividing by zero.
    evaluation_dict[video_key]['iou'] = sum(frames_iou) / len(frames_iou) if frames_iou else float('nan')
    evaluation_dict[video_key]['frames_iou'] = frames_iou

# One row per (filename, camera_id) key; the metrics are derived from the frame counts above.
df_evaluation = pd.DataFrame.from_dict(evaluation_dict, orient='index')
df_evaluation['f1'] = 2 * df_evaluation['true_positive'] / (2 * df_evaluation['true_positive'] + df_evaluation['false_positive'] + df_evaluation['false_negative'])
df_evaluation['recall'] = df_evaluation['true_positive'] / (df_evaluation['true_positive'] + df_evaluation['false_negative'])
df_evaluation['precision'] = df_evaluation['true_positive'] / (df_evaluation['true_positive'] + df_evaluation['false_positive'])
df_evaluation

Unnamed: 0,Unnamed: 1,true_positive,false_positive,false_negative,iou,frames_iou,f1,recall,precision
cfg_raw_cam_01_fhd_h265_20230609T064001.mkv,1,95,0,0,1.0,"[0.9999999999999992, 1.0, 1.0, 0.9999999999999...",1.0,1.0,1.0
cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,1,22,0,0,1.0,"[1.000000000000001, 1.0, 1.0, 1.0, 1.0, 1.0, 1...",1.0,1.0,1.0
