In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

Let's parse the annotations from the XML files, keeping all attributes saved by the CVAT annotation tool, and check which attributes can be omitted.

In [8]:
def parse_annotations_tracks(soup: object) -> pd.DataFrame:
    """Flatten a parsed CVAT video XML export into one row per annotated box.

    Parameters
    ----------
    soup
        Parsed XML document (e.g. ``BeautifulSoup(xml_text, 'xml')``). It must
        expose ``find_all``/``find`` and attribute access via ``tag[...]``.

    Returns
    -------
    pandas.DataFrame
        One row per ``<box>``. Columns are the track attributes (``id``,
        ``label``, ``source``, ``task_id``, ``subset``), the box attributes
        (``frame`` ... ``z_order``), the task metadata (``name``,
        ``camera_id``, ``datetime``) and, for text-bearing labels,
        ``text_spz`` / ``unspecified_text``. XML attribute values are kept as
        strings; ``datetime`` is parsed from the task name. The frame is empty
        when the document contains no boxes.

    Raises
    ------
    KeyError
        If a box belongs to a track whose ``task_id`` has no ``<task>`` entry,
        or a required XML attribute is missing.
    IndexError
        If a task name does not have at least seven ``_``-separated parts
        (expected shape: ``cfg_raw_cam_01_fhd_h265_20230609T050002.mkv``).
    """
    track_attribute_names = ['id', 'label', 'source', 'task_id', 'subset']
    box_attribute_names = ['frame', 'keyframe', 'outside', 'occluded', 'xtl', 'ytl', 'xbr', 'ybr', 'z_order']
    # label -> (output column, name of the <attribute> element holding the text)
    text_attribute_by_label = {
        'spz': ('text_spz', 'text_spz'),
        'spz_alt': ('text_spz', 'spz_alt_text'),
        'text': ('unspecified_text', 'unspecified_text'),
    }

    # Task metadata keyed by task id. A plain dict keeps the per-box lookup
    # cheap and also works when the export contains no <task> elements.
    task_meta_by_id = {}
    for task in soup.find_all('task'):
        name = task.find('name').text
        name_parts = name.split('_')
        task_meta_by_id[task.find('id').text] = {
            'name': name,
            'camera_id': name_parts[3],
            'datetime': pd.to_datetime(name_parts[6].replace('.mkv', ''), format='%Y%m%dT%H%M%S'),
        }

    frame_annotation_data = []
    for track in soup.find_all('track'):
        track_annotation_dict = {name: track[name] for name in track_attribute_names}
        text_attribute = text_attribute_by_label.get(track['label'])
        for box in track.find_all('box'):
            frame_annotation_dict = track_annotation_dict.copy()
            for attribute_name in box_attribute_names:
                frame_annotation_dict[attribute_name] = box[attribute_name]

            if text_attribute is not None:
                column_name, xml_attribute_name = text_attribute
                frame_annotation_dict[column_name] = box.find('attribute', {'name': xml_attribute_name}).text

            # Task metadata goes last so the resulting column order is stable.
            frame_annotation_dict.update(task_meta_by_id[frame_annotation_dict['task_id']])
            frame_annotation_data.append(frame_annotation_dict)

    return pd.DataFrame(frame_annotation_data)


# CVAT exports to load, one per camera project (camera 01 first, then camera 02).
annotation_xml_paths = (
    'project_camera_01_09_to_11_06_23-2023_07_20_12_42_13-cvat for video 1.1.xml',
    'project_camera_02_09_to_11_06_23-2023_07_17_13_14_54-cvat for video 1.1.xml',
)

# Read and flatten each export with the same steps.
per_camera_frames = []
for annotation_xml_path in annotation_xml_paths:
    with open(annotation_xml_path, 'r', encoding='utf-8') as f:
        xml_annotations = f.read()
    soup = BeautifulSoup(xml_annotations, 'xml')
    per_camera_frames.append(parse_annotations_tracks(soup))
frame_annotation_df_1, frame_annotation_df_2 = per_camera_frames

# Stack both cameras, rename the track identifier column and renumber the rows.
frame_annotation_df = (
    pd.concat([frame_annotation_df_1, frame_annotation_df_2])
    .rename({'id': 'track_id'}, axis=1)
    .reset_index(drop=True)
)
frame_annotation_df.shape

(43437, 19)

In [9]:
# Show the combined annotation table from both exports (one row per annotated box).
frame_annotation_df

Unnamed: 0,track_id,label,source,task_id,subset,frame,keyframe,outside,occluded,xtl,ytl,xbr,ybr,z_order,name,camera_id,datetime,text_spz,unspecified_text
0,0,boat,semi-auto,1,default,1087,1,0,0,1238.46,503.65,1425.55,585.23,0,cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,01,2023-06-09 05:00:02,,
1,0,boat,semi-auto,1,default,1088,1,0,0,1238.00,502.00,1425.00,583.00,0,cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,01,2023-06-09 05:00:02,,
2,0,boat,semi-auto,1,default,1089,1,0,0,1239.00,503.00,1426.00,584.00,0,cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,01,2023-06-09 05:00:02,,
3,0,boat,semi-auto,1,default,1090,1,0,0,1239.00,503.00,1426.00,584.00,0,cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,01,2023-06-09 05:00:02,,
4,0,boat,semi-auto,1,default,1091,1,0,0,1239.00,503.00,1426.00,584.00,0,cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,01,2023-06-09 05:00:02,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43432,627,spz_alt,semi-auto,699,default,933691,1,0,0,115.36,527.69,226.60,579.69,0,cfg_raw_cam_02_fhd_h265_20230707T190001.mkv,02,2023-07-07 19:00:01,Martina,
43433,627,spz_alt,semi-auto,699,default,933692,1,0,0,60.00,530.67,171.00,582.67,0,cfg_raw_cam_02_fhd_h265_20230707T190001.mkv,02,2023-07-07 19:00:01,Martina,
43434,627,spz_alt,semi-auto,699,default,933693,1,0,0,11.33,542.00,122.33,594.00,0,cfg_raw_cam_02_fhd_h265_20230707T190001.mkv,02,2023-07-07 19:00:01,Martina,
43435,627,spz_alt,semi-auto,699,default,933694,1,1,0,0.00,543.00,72.33,595.00,0,cfg_raw_cam_02_fhd_h265_20230707T190001.mkv,02,2023-07-07 19:00:01,Martina,


# Prepare COCO-format CSV export

In [10]:
# Ground-truth boxes for the boat labels, exported as a ';'-separated CSV.
boat_export_columns = ['name', 'camera_id', 'datetime', 'frame', 'label', 'xtl', 'ytl', 'xbr', 'ybr']
is_boat_row = frame_annotation_df['label'].isin(['boat', 'boat_alt'])

# Select rows and columns in a single .loc call and add the constant columns
# with .assign, so the new values are written to a fresh frame rather than to
# a possible copy of a slice (avoids SettingWithCopyWarning).
df_boat_coco = frame_annotation_df.loc[is_boat_row, boat_export_columns].assign(
    # NOTE(review): 8 is presumably the numeric class id used for "boat" by the
    # downstream consumer - confirm.
    label=8,
    # Every ground-truth box gets the same constant value. The column name
    # spelling is kept unchanged because it is part of the exported CSV header.
    confidance=1,
)
df_boat_coco.to_csv('ground_truth_boat_frames_coco.csv', sep=';')
df_boat_coco


Unnamed: 0,name,camera_id,datetime,frame,label,xtl,ytl,xbr,ybr,confidance
0,cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,01,2023-06-09 05:00:02,1087,8,1238.46,503.65,1425.55,585.23,1
1,cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,01,2023-06-09 05:00:02,1088,8,1238.00,502.00,1425.00,583.00,1
2,cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,01,2023-06-09 05:00:02,1089,8,1239.00,503.00,1426.00,584.00,1
3,cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,01,2023-06-09 05:00:02,1090,8,1239.00,503.00,1426.00,584.00,1
4,cfg_raw_cam_01_fhd_h265_20230609T050002.mkv,01,2023-06-09 05:00:02,1091,8,1239.00,503.00,1426.00,584.00,1
...,...,...,...,...,...,...,...,...,...,...
43358,cfg_raw_cam_02_fhd_h265_20230707T190001.mkv,02,2023-07-07 19:00:01,933700,8,0.00,351.33,258.67,660.33,1
43359,cfg_raw_cam_02_fhd_h265_20230707T190001.mkv,02,2023-07-07 19:00:01,933701,8,0.00,367.00,258.00,676.00,1
43360,cfg_raw_cam_02_fhd_h265_20230707T190001.mkv,02,2023-07-07 19:00:01,933702,8,0.00,349.67,175.00,658.67,1
43361,cfg_raw_cam_02_fhd_h265_20230707T190001.mkv,02,2023-07-07 19:00:01,933703,8,0.00,351.00,175.00,660.00,1
