References
* F2 score : [competition metric implementation](https://www.kaggle.com/bamps53/competition-metric-implementation)
* CV strategy: [subsequences splitting](https://www.kaggle.com/julian3833/reef-a-cv-strategy-subsequences)
* Training (with modification on local run other than config augs): [higher res training](https://www.kaggle.com/steamedsheep/yolov5-high-resolution-training)

* [Discussion link](https://www.kaggle.com/c/tensorflow-great-barrier-reef/discussion/300638#1651347) which made me aware of this pit-fall

Correction(s):
* Only predictions above 0.15 confidence are now kept (not in the original implementation) -> this gave a significant CV boost for higher image size inference, but the trend is still decreasing

Other Observations:
* You should tune the inference confidence threshold carefully, since higher image sizes seem to require a higher threshold (so they probably produce more false positives)

This notebook is an implementation of calculating CV scores using subsequences split and higher resolution setting during validation. 

Note: you need to train the model on the same fold and at the same resolution to get accurate results. 

## Init

In [None]:
from itertools import groupby
import numpy as np
from tqdm.notebook import tqdm
tqdm.pandas()
import pandas as pd
import os
import pickle
import cv2
from multiprocessing import Pool
import matplotlib.pyplot as plt
import ast
import glob
import time
import torch

import shutil
from shutil import copyfile
import sys
sys.path.append('../input/tensorflow-great-barrier-reef')

from joblib import Parallel, delayed

from IPython.display import display, HTML

from matplotlib import animation, rc
rc('animation', html='jshtml')

%matplotlib inline

## Define Validation Images and Labels Dir

In [None]:
# Delete the ./images and ./labels trees (relative to the current directory) so the
# dataset is rebuilt from scratch; presumably guards against files left by a previous run.
!rm -r images labels

In [None]:
# Create the images/{train,val} and labels/{train,val} folders that the
# dataset-building cell copies frames and writes label files into.
!mkdir -p ./images/train
!mkdir -p ./images/val

!mkdir -p ./labels/train
!mkdir -p ./labels/val

In [None]:
# Validation split selector: 'subseq' holds out rows whose `fold` column equals `fold`;
# any other value (such as 'yolorules') holds out rows whose `video_id` equals `fold`.
splitting = 'yolorules' # 'subseq' else video_id

In [None]:
# Build the YOLO-format train/val folders for one held-out fold.
# Relies on `splitting` (defined in an earlier cell) and on the ./images and
# ./labels directories already existing.
train = pd.read_csv('../input/reef-a-cv-strategy-subsequences/cross-validation/train-5folds.csv')

# Value compared against the `fold` column ('subseq' splitting) or the
# `video_id` column (any other splitting) to select validation rows.
fold = 1

annos = []
for i, x in train.iterrows():
    # Pick the column that defines the hold-out group for the chosen strategy.
    group = x.fold if splitting == 'subseq' else x.video_id
    mode = 'val' if group == fold else 'train'

    # Training only keeps frames that contain at least one box;
    # validation keeps every frame (annotated or not).
    if mode == 'train' and not x.has_annotations: continue

    copyfile(x.image_path, f'./images/{mode}/{x.image_id}.jpg')

    # Frames without boxes get an image but no label file.
    if not x.has_annotations: continue

    # The annotations column holds a Python-literal list of dicts; literal_eval
    # parses it without executing arbitrary code (unlike eval).
    boxes = ast.literal_eval(x.annotations)

    # YOLO label row: "class cx cy w h", all normalized to [0, 1].
    # Assumes every frame is 1280x720 pixels.
    label_text = ''.join(
        '0 {} {} {} {}\n'.format((an['x'] + an['width'] / 2) / 1280,
                                 (an['y'] + an['height'] / 2) / 720,
                                 an['width'] / 1280, an['height'] / 720)
        for an in boxes
    )
    with open(f'./labels/{mode}/{x.image_id}.txt', 'w') as fp:
        fp.write(label_text)

In [None]:
# Sanity check: number of label files written to the training split.
!ls /kaggle/working/labels/train | wc -l #884 for fold1

## YOLOV5 Install

In [None]:
# Fresh checkout of YOLOv5: drop any earlier clone, clone the repository,
# switch the notebook's working directory into it and install its requirements.
# Every later relative path (val.py, fold0.yaml, runs/...) resolves inside ./yolov5.
!rm -r /kaggle/working/yolov5
!git clone https://github.com/ultralytics/yolov5 # clone
%cd yolov5
%pip install -qr requirements.txt  # install

from yolov5 import utils
# NOTE: this rebinds `display`, which was imported from IPython.display in the Init cell.
display = utils.notebook_init()  # check

## Data Config

In [None]:
# YOLOv5 dataset description pointing at the images/train and images/val folders
# under /kaggle/working, with a single class named 'reef'.
# NOTE(review): the "128 images" remarks inside the YAML are only YAML comments and
# presumably template leftovers; they do not describe this dataset's size.
data = '''
path: /kaggle/working  # dataset root dir
train: images/train  # train images (relative to 'path') 128 images
val: images/val  # val images (relative to 'path') 128 images
test:  # test images (optional)

nc: 1  # number of classes
names: ['reef']  # class names
'''

# Written to the current directory (the yolov5 checkout after the %cd above).
# The file is called fold0.yaml regardless of the value of `fold`.
with open('fold0.yaml', 'w') as fp:
    fp.write(data)

## F2 Score Helpers
reference : [competition metric implementation](https://www.kaggle.com/bamps53/competition-metric-implementation)

In [None]:
def calc_iou(bboxes1, bboxes2, bbox_mode='xywh'):
    """Pairwise IoU between two sets of boxes.

    bboxes1: (N, 4) np.array, bboxes2: (M, 4) np.array.
    bbox_mode: 'xywh' means each row is (x, y, width, height) and is converted
               to corner form first; any other value treats rows as (x1, y1, x2, y2).
    Returns an (N, M) array. Widths/heights use the inclusive "+1" convention.
    The inputs are not modified.
    """
    for arr in (bboxes1, bboxes2):
        assert len(arr.shape) == 2 and arr.shape[1] == 4

    # Work on copies so the in-place corner conversion never touches the caller's data.
    boxes_a, boxes_b = bboxes1.copy(), bboxes2.copy()

    if bbox_mode == 'xywh':
        for boxes in (boxes_a, boxes_b):
            boxes[:, 2:] += boxes[:, :2]

    # Column vectors (N, 1) for the first set, row vectors (1, M) for the second,
    # so every arithmetic op below broadcasts to (N, M).
    ax1, ay1, ax2, ay2 = (boxes_a[:, k:k + 1] for k in range(4))
    bx1, by1, bx2, by2 = (boxes_b[:, k:k + 1].T for k in range(4))

    overlap_w = np.maximum(np.minimum(ax2, bx2) - np.maximum(ax1, bx1) + 1, 0)
    overlap_h = np.maximum(np.minimum(ay2, by2) - np.maximum(ay1, by1) + 1, 0)
    intersection = overlap_w * overlap_h

    area_a = (ax2 - ax1 + 1) * (ay2 - ay1 + 1)
    area_b = (bx2 - bx1 + 1) * (by2 - by1 + 1)
    return intersection / (area_a + area_b - intersection)

def f_beta(tp, fp, fn, beta=2):
    """F-beta score from true-positive, false-positive and false-negative counts."""
    beta_sq = beta**2
    weighted_tp = (1+beta_sq)*tp
    return weighted_tp / (weighted_tp + beta_sq*fn+fp)

def calc_is_correct_at_iou_th(gt_bboxes, pred_bboxes, iou_th, verbose=False):
    """Greedily match predictions to ground truth at a single IoU threshold.

    gt_bboxes: (N, 4) np.array in xywh format
    pred_bboxes: (M, 5) np.array in conf+xywh format, visited in the given order
    Returns (tp, fp, fn). `verbose` is accepted but unused.
    """
    unmatched_gt = gt_bboxes.copy()
    preds = pred_bboxes.copy()
    total_preds = len(preds)

    tp = fp = 0
    for seen, pred in enumerate(preds, start=1):
        # Column 0 is the confidence; the remaining four are the box.
        overlaps = calc_iou(unmatched_gt, pred[None, 1:])
        if overlaps.max() > iou_th:
            tp += 1
            # A ground-truth box can be claimed only once.
            unmatched_gt = np.delete(unmatched_gt, overlaps.argmax(), axis=0)
        else:
            fp += 1
        if len(unmatched_gt) == 0:
            # Nothing left to match: every prediction not yet visited is a false positive.
            fp += total_preds - seen
            break

    fn = len(unmatched_gt)
    return tp, fp, fn

def calc_is_correct(gt_bboxes, pred_bboxes):
    """
    Accumulate tp/fp/fn for one image over the IoU thresholds 0.3 ... 0.8 (step 0.05).

    gt_bboxes: (N, 4) np.array in xywh format
    pred_bboxes: (N, 5) np.array in conf+xywh format
    """
    num_gt, num_pred = len(gt_bboxes), len(pred_bboxes)

    # With one side empty no matching is possible: every box on the other side
    # is an error at each of the 11 thresholds.
    if num_gt == 0 or num_pred == 0:
        return 0, num_pred*11, num_gt*11

    # Highest-confidence predictions get first pick of the ground-truth boxes.
    by_conf_desc = np.argsort(pred_bboxes[:, 0])[::-1]
    ranked_preds = pred_bboxes[by_conf_desc]

    totals = [0, 0, 0]
    for iou_th in np.arange(0.3, 0.85, 0.05):
        counts = calc_is_correct_at_iou_th(gt_bboxes, ranked_preds, iou_th)
        for slot, value in enumerate(counts):
            totals[slot] += value
    return tuple(totals)

def calc_f2_score(gt_bboxes_list, pred_bboxes_list, verbose=False):
    """
    F2 score over a set of images, pooling tp/fp/fn counts across all of them.

    gt_bboxes_list: list of (N, 4) np.array in xywh format
    pred_bboxes_list: list of (N, 5) np.array in conf+xywh format
    verbose: print the per-image counts when True
    """
    total_tp = total_fp = total_fn = 0
    for gts, preds in zip(gt_bboxes_list, pred_bboxes_list):
        tp, fp, fn = calc_is_correct(gts, preds)
        total_tp += tp
        total_fp += fp
        total_fn += fn
        if not verbose:
            continue
        print(f'num_gt:{len(gts):<3} num_pred:{len(preds):<3} tp:{tp:<3} fp:{fp:<3} fn:{fn:<3}')
    return f_beta(total_tp, total_fp, total_fn, beta=2)

In [None]:
# NOTE: this rebinds the name `glob` from the module (imported in the Init cell) to the function.
from glob import glob
# Ground-truth label files of the validation split. Label files are only written for
# frames that have annotations, so un-annotated validation frames are not listed here.
paths = glob('/kaggle/working/labels/val/*')
val_len = len(paths) 

## Iterate over different up-sizing values to note CV at each step

In [None]:
CV=True

if CV:
    for i in range(3600,10000,1200):

        print("#######################################\n"*3, f'Starting Inference for image size {i}')
        start_time = time.time()

        !python val.py --data ./fold0.yaml\
            --weights /kaggle/input/reef-baseline-fold12/l6_3600_uflip_vm5_f12_up/f1/best.pt\
            --imgsz $i\
            --batch 4\
            --conf-thres 0.01\
            --iou-thres 0.3\
            --save-txt\
            --save-conf\
            --exist-ok
        t=(time.time() - start_time)/60
        print(f'Inference Complete in {t:.3f} minutes')
        print('Starting Cross Validation')
        start_time = time.time()
        scores = []
        for j in range(15,40):
            confidence=j/100
            gt_bboxs_list, prd_bboxs_list = [], []

            count=0
            for image_file in paths:
                gt_bboxs = []; prd_bboxs = []
                with open(image_file, 'r') as f:
                    while True:
                        r = f.readline().rstrip()
                        if not r: break
                        r = r.split()[1:]
                        bbox = np.array(list(map(float, r))); gt_bboxs.append(bbox)

                pred_path = '/kaggle/working/yolov5/runs/val/exp/labels/'
                pred_file = pred_path+image_file[27:]

                no_anns = True
                if os.path.exists(pred_file):
                    with open(pred_file, 'r') as f:
                        while True:
                            r = f.readline().rstrip()
                            if not r: break
                            r = r.split()[1:]; r = [r[4], *r[:4]]
                            conf=float(r[0])
                            if conf>confidence: 
                                bbox = np.array(list(map(float, r)))
                                prd_bboxs.append(bbox)
                                no_anns = False

                if no_anns: count+=1

                gt_bboxs, prd_bboxs= np.array(gt_bboxs), np.array(prd_bboxs)
                prd_bboxs_list.append(prd_bboxs); gt_bboxs_list.append(gt_bboxs)

            score = calc_f2_score(gt_bboxs_list, prd_bboxs_list, verbose=False)
            scores.append([score, confidence, count])
            if confidence%5: print(f'confidence: {confidence}, images w/o anns: {count}, total: {val_len}')

        best = max(scores)
        print(f'best confidence: {best[1]}, images w/o anns: {best[2]}, total: {val_len}')
        print(f'img size: {i}, f2 score: {best[0]}') 
        t=(time.time() - start_time)/60
        print(f'cross validation complete in {t.3f} minutes')
        torch.cuda.empty_cache()

Seems like upsizing during inference might be causing some kind of overfit to Public LB but more experimentation (with your own augmentations) is required. 

You can also use this notebook to estimate your inference time on hidden test set by multiplying time by 12500/[size of your validation folder]. For image size 3600 and val folder size 884, this came out to 1.5 hours which matched inference time on LB submission
