In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import itertools

# Load and clean prediction data

In [2]:
def _load_annotations(csv_path):
    """Read one ';'-separated annotation CSV and parse its time columns.

    The first CSV column becomes the row index. The ``datetime`` column is
    parsed to pandas datetimes, and a ``date`` column is added that holds the
    calendar day of each row as a datetime value (not a string).
    """
    annotations = pd.read_csv(csv_path, index_col=0, sep=';')
    annotations['datetime'] = pd.to_datetime(annotations['datetime'])
    annotations['date'] = pd.to_datetime(annotations['datetime'].dt.date, format='%Y-%m-%d')
    return annotations


df_ground_truth = _load_annotations('../annotations/ground_truth_boat_frames_coco.csv')
df_yolo_predictions = _load_annotations('../annotations/yolov8x_no_train_labels.csv')

# Per-day / per-camera row counts of everything that was loaded, before filtering.
print('shape of loaded data', df_ground_truth.shape, df_yolo_predictions.shape)
print('Ground truth data:')
print(df_ground_truth.groupby(['date', 'camera_id']).datetime.count())
print('YOLO predictions data:')
print(df_yolo_predictions.groupby(['date', 'camera_id']).datetime.count())

# Keep only the evaluation days: ground truth for both days, predictions for
# cameras 1 and 2 on the first day and camera 2 on the second day.
# Real Timestamps are used instead of date strings: casting strings inside
# `isin` on a datetime column is deprecated in pandas and is slated to stop
# matching, which would silently leave an empty frame.
first_day = pd.Timestamp('2023-06-09')
second_day = pd.Timestamp('2023-06-10')

df_ground_truth = df_ground_truth[df_ground_truth.date.isin([first_day, second_day])].copy()
df_yolo_predictions = df_yolo_predictions[
    ((df_yolo_predictions.date == first_day) & df_yolo_predictions.camera_id.isin([1, 2]))
    | ((df_yolo_predictions.date == second_day) & (df_yolo_predictions.camera_id == 2))
].copy()

# Disabled: crop bounding boxes from right side of camera 2 field of view
# df_yolo_predictions.drop(index=df_yolo_predictions[(df_yolo_predictions.camera_id == 2) & (df_yolo_predictions.x > 1800)].index, inplace=True)
# df_ground_truth.drop(index=df_ground_truth[(df_ground_truth.camera_id == 2) & (df_ground_truth.x > 1800)].index, inplace=True)

# Index both frames by video file name.
df_ground_truth = df_ground_truth.set_index('filename')
df_yolo_predictions = df_yolo_predictions.set_index('filename')

df_ground_truth.shape, df_yolo_predictions.shape

shape of loaded data (25443, 11) (826239, 11)
Ground truth data:
date        camera_id
2023-06-09  1            4149
            2            3481
2023-06-10  2            6321
2023-07-07  2            6500
2023-07-08  1            4992
Name: datetime, dtype: int64
YOLO predictions data:
date        camera_id
2023-06-09  1            214940
            2             12452
2023-06-10  1            132223
            2             28347
2023-06-11  1             65558
            2              5577
2023-07-07  2            189338
2023-07-08  1            177804
Name: datetime, dtype: int64


((13951, 10), (255739, 10))

In [3]:
# Re-key both tables on (filename, frame_id) so one index entry identifies one video frame.
# reset_index() first moves the current index back into columns, so the cell can be re-run.
df_ground_truth, df_yolo_predictions = (
    table.reset_index().set_index(['filename', 'frame_id'])
    for table in (df_ground_truth, df_yolo_predictions)
)

In [4]:
# Number of distinct index keys in the ground truth and in the predictions, in that order.
# A key can occur on several rows, so these counts can be smaller than the row counts.
tuple(table.index.unique().size for table in (df_ground_truth, df_yolo_predictions))

(13794, 195112)

In [5]:
# Compare the two tables by index key (not by individual bounding box).
annotated_keys = df_ground_truth.index
predicted_keys = df_yolo_predictions.index

# tp: keys present in both tables
tp = annotated_keys.intersection(predicted_keys)
# fp: keys with a prediction but no ground-truth row
# NOTE(review): fp is not limited to files that occur in the ground truth, so
# keys from a clip that was never annotated would also be counted here -
# confirm that every predicted clip was annotated.
fp = predicted_keys.difference(annotated_keys)
# fn: keys with a ground-truth row but no prediction
fn = annotated_keys.difference(predicted_keys)

tp.size, fp.size, fn.size

(12995, 182117, 799)

In [6]:
# Number of frame ids enumerated per video (ids 0 .. FRAMES_PER_VIDEO - 1).
# NOTE(review): assumes no clip contains a frame id of 2300 or above - confirm
# against the real clip length.
FRAMES_PER_VIDEO = 2300

# Every file name that occurs in the ground truth.
filenames_unique = df_ground_truth.reset_index()['filename'].unique()

# Full grid of (filename, frame_id) keys for those files. from_product builds
# the named MultiIndex directly, without materialising a list of Python tuples,
# and still yields a correctly named (empty) MultiIndex when there are no files.
no_boat_indexes = pd.MultiIndex.from_product(
    [filenames_unique, range(FRAMES_PER_VIDEO)],
    names=['filename', 'frame_id'],
)
no_boat_indexes.size

253000

In [7]:
# Strip every key that appears in the predictions or in the ground truth,
# leaving only keys that neither table mentions.
for seen_keys in (df_yolo_predictions.index, df_ground_truth.index):
    no_boat_indexes = no_boat_indexes.difference(seen_keys)
no_boat_indexes.size

180483

In [8]:
# Draw the same number of keys, without repetition, from the untouched keys
# and from the prediction-only keys.
sample_size = 1500

# Fixed seed for repeatable draws. Both draws consume the same global random
# stream, so the order of the two calls below determines the result.
np.random.seed(1)
no_boat_indexes_sample = np.random.choice(no_boat_indexes, replace=False, size=sample_size)
fp_indexes_sample = np.random.choice(fp, replace=False, size=sample_size)

In [9]:
# Build the table of negative examples: one row per sampled (filename, frame_id)
# key, with the untouched-key sample first and the prediction-only sample after.
n_no_boat = len(no_boat_indexes_sample)
df = pd.DataFrame(
    [*no_boat_indexes_sample, *fp_indexes_sample],
    columns=['filename', 'frame_id'],
)

# label_class: -1 for keys seen in neither table, -2 for keys that only the
# predictions contain. Assigned by column name so later column additions
# cannot redirect the write.
df['label_class'] = -2
df.loc[df.index < n_no_boat, 'label_class'] = -1

# Camera id as a two-character string: '01' when the file name contains
# 'cam_01', otherwise '02'.
df['camera_id'] = np.where(df['filename'].str.contains('cam_01', regex=False), '01', '02')

# The 7th '_'-separated token of the file name is the recording start stamp
# (e.g. '20230609T091000.mkv'); strip the extension and parse it.
start_stamp = df['filename'].str.split('_').str[6].str.replace('.mkv', '', regex=False)
df['datetime'] = pd.to_datetime(start_stamp, format='%Y%m%dT%H%M%S')

# These rows carry no bounding box, so box geometry and confidence are zero.
for col in ['x', 'y', 'w', 'h', 'confidance']:
    df[col] = 0
df

Unnamed: 0,filename,frame_id,label_class,camera_id,datetime,x,y,w,h,confidance
0,cfg_raw_cam_01_fhd_h265_20230609T091000.mkv,301,-1,01,2023-06-09 09:10:00,0,0,0,0,0
1,cfg_raw_cam_02_fhd_h265_20230609T091002.mkv,1176,-1,02,2023-06-09 09:10:02,0,0,0,0,0
2,cfg_raw_cam_02_fhd_h265_20230609T154001.mkv,1806,-1,02,2023-06-09 15:40:01,0,0,0,0,0
3,cfg_raw_cam_02_fhd_h265_20230610T151001.mkv,1122,-1,02,2023-06-10 15:10:01,0,0,0,0,0
4,cfg_raw_cam_01_fhd_h265_20230609T105000.mkv,576,-1,01,2023-06-09 10:50:00,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
2995,cfg_raw_cam_01_fhd_h265_20230609T143001.mkv,1309,-2,01,2023-06-09 14:30:01,0,0,0,0,0
2996,cfg_raw_cam_01_fhd_h265_20230609T182002.mkv,1258,-2,01,2023-06-09 18:20:02,0,0,0,0,0
2997,cfg_raw_cam_01_fhd_h265_20230609T162000.mkv,1890,-2,01,2023-06-09 16:20:00,0,0,0,0,0
2998,cfg_raw_cam_01_fhd_h265_20230609T024002.mkv,257,-2,01,2023-06-09 02:40:02,0,0,0,0,0


In [11]:
# Write the negative examples as a ';'-separated CSV in this column order.
# The row index is written too, as the first (unnamed) column.
export_columns = [
    'filename', 'camera_id', 'datetime', 'frame_id', 'label_class',
    'x', 'y', 'w', 'h', 'confidance',
]
df.to_csv('frames_with_no_boat_for_training.csv', sep=';', columns=export_columns)