# ScaledYOLOv4 Inference Notebook for COTS

Paper: https://arxiv.org/pdf/2011.08036.pdf

Wang et al.'s ScaledYOLOv4 is a YOLOv4-based architecture that can be scaled up for better accuracy. There are 3 'large' models:
* P5: contains P3, P4 and P5 detection heads
* P6: P3, P4, P5, and P6 detection heads
* P7: P3, P4, P5, P6 and P7 detection heads

<img src="https://blog.roboflow.com/content/images/2020/12/image-2.png">

Image courtesy of https://blog.roboflow.com/scaled-yolov4-tops-efficientdet/

## Notebook Control Panel

In [None]:
# CONTROL PANEL
# All user-tunable settings of the notebook are collected in this cell.

# Weights file that is loaded for inference
CHECKPOINT_FILE = '/kaggle/input/scaledyolov4-cots-p6/e12_FT_e8_freeze/last_020.pt'

# Image size used for single-resolution inference (and for the warm-up pass after loading)
SIZE = 1920

# ScaledYOLOv4's built-in augment does not work well, so it stays disabled.
# A manual ensemble is used instead: the model is run at several sizes and the results are fused.
AUGMENT = False
# NOTE(review): a P6 model may use a maximum stride larger than 32 — confirm that
# "multiples of 32" is the right constraint (the current values are also multiples of 64).
AUGMENTED_SIZES = [1920, 2560, 2880] # must be multiples of 32. Scaling by [1.0, 1.333, 1.5]

# Whether to use single resolution or multi-resolution WBF for inference
# (only read by the submission loop at the end of the notebook)
USE_AUGMENTED_INFERENCE = True

# Confidence threshold passed to the inference helpers and re-applied when writing predictions
CONF = 0.12
# IoU threshold handed to non_max_suppression
NMS_THRESHOLD = 0.4
# Weighted box fusion uses the same IoU threshold as NMS
WBF_IOU = NMS_THRESHOLD
# Run the model in FP16; ignored when device_type is 'cpu'
USE_HALF_PRECISION = True

device_type = '0' # (either 'cpu' for CPU, or '0' for GPU)

## Model Results

### Rationale for training
* This model was trained using a concept similar to that used by YOLOX
  * First, train a model with heavy mixup. The model will achieve modest mAP, but its backbone learns a lot because of the mixup.
  * On fine-tuning, shut down the mixup and reduce the augmentations, in order to maximise mAP. Also, train at a reduced learning rate, so that knowledge gained from the previous mixup training is not lost.
  * To ensure mixup knowledge is not lost, CSPDark layers are frozen during fine-tuning. This is implemented via my custom fork of ScaledYOLOv4: https://github.com/alexchwong/ScaledYOLOv4

### Model Hyperparameters
* Important training parameters:
  * Model type: yolov4-p6
  * Scratch (epochs 0-12):
    * Res: 1920p
    * mixup: 1.0
    * degrees: 30.0
    * translate: 0.5
    * scale: 0.2
    * shear: 0.0
  * then Fine-tune (epochs 12-20)
    * Res: 1920p
    * mixup: 0.0
    * degrees: 0.0
    * translate: 0.2
    * scale: 0.2
    * lr0: 0.002 (default 0.01)
    * frozen layers: 1 to 12 (i.e. CSPDark backbone)

In [None]:
import matplotlib.pyplot as plt

# Display the results.png image stored next to the checkpoint
# (presumably the training curves — verify against the dataset contents).
plt.figure(figsize=(30,15))
# Hide the axes; only the image itself is of interest
plt.axis('off')
# The trailing ';' keeps the notebook from echoing the return value of imshow
plt.imshow(plt.imread('/kaggle/input/scaledyolov4-cots-p6/e12_FT_e8_freeze/results.png'));

## ScaledYOLOv4 installation

In [None]:
# Copy the ScaledYOLOv4 code from the input dataset into /kaggle/working/,
# where later cells change into it and import its modules.
!cp -r /kaggle/input/scaledyolov4-installation/ScaledYOLOv4 /kaggle/working/

## FFMPEG for videos

In [None]:
# Install ffmpeg for video compression
%cd /kaggle/working

# Unpack the static ffmpeg build from the input dataset into the working directory
! tar xvf ../input/ffmpeg-static-build/ffmpeg-git-amd64-static.tar.xz

import subprocess

# Location of the ffmpeg binary, used later to convert the .avi files to .mp4.
# NOTE(review): the directory name is expected to match what the archive extracts to — confirm.
FFMPEG_BIN = "/kaggle/working/ffmpeg-git-20191209-amd64-static/ffmpeg"

## Required Modules

In [None]:
import warnings
warnings.filterwarnings("ignore")

import time

import os
import torch
import importlib
import cv2 
import pandas as pd
import numpy as np

import ast
import shutil
import sys

from tqdm.notebook import tqdm
tqdm.pandas()

from PIL import Image
from IPython.display import display

## Load the ScaledYOLOv4 Model

In [None]:
%cd /kaggle/working/ScaledYOLOv4

imgsz = SIZE
weights = CHECKPOINT_FILE

from models.experimental import attempt_load
from utils.datasets import LoadImages
from utils.general import check_img_size, non_max_suppression, scale_coords, xyxy2xywh
from utils.torch_utils import select_device

device = select_device(device_type)
# FP16 is only used when it was requested AND the model does not run on the CPU
half = (USE_HALF_PRECISION and (device_type != 'cpu'))

model = attempt_load(weights, map_location=device)  # load FP32 model
# Validate the warm-up size against the model's largest stride
imgsz = check_img_size(imgsz, s=model.stride.max())  # check img_size
if half:
    model.half()  # to FP16

# Warm-up: one dummy forward pass (skipped on CPU).
# no_grad() keeps autograd from recording the pass; without it the result bound to `_`
# would keep the graph and activations of a full-size forward pass referenced.
img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
with torch.no_grad():
    _ = model(img.half() if half else img) if device.type != 'cpu' else None  # run once

# Class names, indexed by class id (single-class problem)
COCO_CLASSES = ['cots']

## ScaledYOLOv4 utility functions

In [None]:
def yolo2coco(bboxes, image_height=720, image_width=1280):
    """
    Convert normalized YOLO boxes into COCO boxes measured in pixels.

    yolo => [xmid, ymid, w, h] (normalized)
    coco => [xmin, ymin, w, h]

    The input array is not modified; a float copy is returned.
    """
    # Work on a float copy so that integer input is not truncated
    coco = bboxes.copy().astype(float)

    # Denormalize: x-like columns scale with the width, y-like columns with the height
    for columns, extent in (([0, 2], image_width), ([1, 3], image_height)):
        coco[..., columns] = coco[..., columns] * extent

    # Conversion (xmid, ymid) => (xmin, ymin): shift the centre by half the box size
    half_size = coco[..., [2, 3]] / 2
    coco[..., [0, 1]] = coco[..., [0, 1]] - half_size

    return coco

In [None]:
from utils.datasets import letterbox

def preprocess_image(im0, new_shape = None):
    """
    Letterbox an image and return it as a contiguous, channel-first RGB array.

    im0: image array with the channel axis last (BGR channel order)
    new_shape: target shape handed to letterbox; defaults to the shape of im0
    """
    target_shape = im0.shape if new_shape is None else new_shape
    boxed = letterbox(im0, new_shape=target_shape)[0]
    # Reverse the channel axis (BGR to RGB), then move the channels to the front
    channel_first = boxed[:, :, ::-1].transpose(2, 0, 1)
    return np.ascontiguousarray(channel_first)

In [None]:
def sy4_inference(img0, model, test_size, conf_threshold = 0.4, verbose = True): 
    """
    Run the model on one image at a single resolution.

    img0: original image array (height x width x channels, BGR)
    model: the loaded detection model
    test_size: target size handed to the letterbox preprocessing
    conf_threshold: confidence threshold passed to non_max_suppression
    verbose: print the elapsed time when True

    Returns (bboxes, bbclasses, scores). bboxes is an array of coco boxes
    [xmin, ymin, w, h] in pixels of img0, bbclasses a list of int class ids and
    scores a list of floats. When nothing is detected, ([], [], []) is returned.
    """
    ts = time.perf_counter()

    img = preprocess_image(img0, new_shape = test_size)
    img = torch.from_numpy(img).to(device)
    img = img.half() if half else img.float()  # uint8 to fp16/32
    img /= 255.0  # 0 - 255 to 0.0 - 1.0
    if img.ndimension() == 3:
        img = img.unsqueeze(0)  # add the batch dimension

    # Pure inference: no_grad() stops autograd from building a graph for the forward pass
    with torch.no_grad():
        # Inference
        pred = model(img, augment=AUGMENT)[0]
        # Apply NMS
        pred = non_max_suppression(pred, conf_threshold, NMS_THRESHOLD, classes=[0], agnostic=False)

    det = pred[0]  # detections of the only image in the batch
    bboxes = []
    bbclasses = []
    scores = []
    # (width, height, width, height) of the original image, used to normalize xywh
    gn = torch.tensor(img0.shape)[[1, 0, 1, 0]]

    if det is not None and len(det):
        # Rescale the boxes from the network input size to the size of img0
        det[:, :4] = scale_coords(img.shape[2:], det[:, :4], img0.shape).round()
        for *xyxy, conf, cls in det:
            bbclasses.append(int(cls.item()))
            scores.append(conf.item())

            xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist() # normalized xywh
            bboxes.append(xywh)

    if bboxes:
        result = (yolo2coco(np.array(bboxes), img0.shape[0], img0.shape[1]), bbclasses, scores)
    else:
        result = ([], [], [])

    if verbose:
        te = time.perf_counter()
        print(f"ScaledYOLOv4 inference time {round(te - ts, 4)} s")

    return result

In [None]:
from copy import deepcopy

def sy4_inference_augmented(img0, model, test_sizes, conf_threshold = 0.4, verbose = True): 
    """
    Run the model at several resolutions and merge the results with weighted box fusion.

    img0: original image array (height x width x channels, BGR)
    model: the loaded detection model
    test_sizes: iterable of sizes; sy4_inference is called once per size
    conf_threshold: confidence threshold forwarded to sy4_inference
    verbose: print the total elapsed time when True

    Returns (bboxes, bbclasses, scores) in the same order as sy4_inference,
    or ([], [], []) when no resolution produced a detection.
    """
    ts = time.perf_counter()
    bboxes_list = []
    bbclasses_list = []
    scores_list = []

    for test_size in test_sizes:
        bboxes, bbclasses, scores = sy4_inference(img0, model, test_size, conf_threshold, verbose = False)
        # Every call returns freshly created objects, so they can be stored without copying
        if len(bboxes):
            bboxes_list.append(bboxes)
            bbclasses_list.append(bbclasses)
            scores_list.append(scores)

    result = ([], [], [])
    # Fusion is only worth running when at least one resolution found something
    if bboxes_list:
        bboxes_final, scores_final, bbclasses_final = wbf_coco(bboxes_list, scores_list, bbclasses_list, img0, conf_type = 'max', verbose = False)
        if len(bboxes_final):
            result = (bboxes_final, bbclasses_final, scores_final)

    if verbose:
        te = time.perf_counter()
        print(f"Augmented ScaledYOLOv4 inference time {round(te - ts, 4)} s")
    return result

In [None]:
# Modified from https://www.kaggle.com/remekkinas/yolox-inference-on-kaggle-for-cots-lb-0-507

def draw_yolox_predictions(img, bboxes, scores, bbclasses, classes_dict, boxcolor = (0,0,255)):
    """
    Return a copy of img with one rectangle and one "class:score%" caption per box.

    bboxes: boxes as [xmin, ymin, w, h]
    scores, bbclasses: per-box score and class id, indexed like bboxes
    classes_dict: lookup from class id to class name
    boxcolor: colour used for both the rectangle and the caption

    img itself is left unchanged.
    """
    canvas = img.copy()
    for idx, box in enumerate(bboxes):
        label = classes_dict[int(bbclasses[idx])]
        confidence = scores[idx]

        # Corner points derived from [xmin, ymin, w, h]
        left, top = int(box[0]), int(box[1])
        right, bottom = left + int(box[2]), top + int(box[3])

        caption = '{}:{:.1f}%'.format(label, confidence * 100)
        cv2.rectangle(canvas, (left, top), (right, bottom), boxcolor, 2)
        # The caption sits slightly above the top-left corner of the box
        cv2.putText(canvas, caption, (left, top - 3), cv2.FONT_HERSHEY_PLAIN, 0.8, boxcolor, thickness = 1)
    return canvas

## Weighted Box fusion for Model Ensemble

In [None]:
sys.path.append('/kaggle/input/packages-wbf/packages')
from ensemble_boxes.ensemble_boxes_wbf import *

def wbf(boxes_list, scores_list, labels_list):
    """
    Fuse several sets of predictions with weighted box fusion.

    vocnorm => [x1, y1, x2, y2] (normalized)
    """
    # IoU threshold comes from the control panel; boxes scoring below 0.0001 are skipped
    fusion_options = dict(iou_thr=WBF_IOU, skip_box_thr=0.0001)
    return weighted_boxes_fusion(boxes_list, scores_list, labels_list, **fusion_options)

def wbf_coco(boxes_list, scores_list, labels_list, img, conf_type='max', verbose = True):
    """
    Weighted box fusion for boxes given in coco format.

    coco => [xmin, ymin, w, h]

    img is only used for its height and width. Returns (bboxes, confs, labels)
    with bboxes as integer coco boxes.
    """
    started = time.perf_counter()
    height, width = img.shape[0], img.shape[1]

    # Entries without any box are dropped before the fusion
    kept = [idx for idx in range(len(boxes_list)) if len(boxes_list[idx])]
    norm_boxes = [coco2vocnorm(boxes_list[idx], height, width) for idx in kept]
    kept_scores = [scores_list[idx] for idx in kept]
    kept_labels = [labels_list[idx] for idx in kept]

    fused, confs, labels = weighted_boxes_fusion(
        norm_boxes,
        kept_scores,
        kept_labels,
        iou_thr=WBF_IOU,
        skip_box_thr=0.0001,
        conf_type=conf_type,
    )
    # Back to pixel corners, then to integer [xmin, ymin, w, h]
    fused = voc2coco(vocdenorm(fused, height, width)).astype(int)

    finished = time.perf_counter()
    if verbose:
        print(f"WBF inference time {finished - started}")
    return fused, confs, labels

def coco2vocnorm(bboxes, image_height=720, image_width=1280):
    """
    Convert pixel coco boxes into normalized corner boxes.

    coco => [xmin, ymin, w, h]
    vocnorm => [x1, y1, x2, y2] (normalized)

    Returns a float copy; the input array is not modified.
    """
    # Float copy, so that integer input is not truncated by the division
    norm = bboxes.copy().astype(float)
    norm[..., [0, 2]] /= image_width
    norm[..., [1, 3]] /= image_height

    # (w, h) + (xmin, ymin) yields the bottom-right corner
    norm[..., [2, 3]] += norm[..., [0, 1]]
    return norm

def voc2coco(bboxes):
    """
    Convert corner boxes into coco boxes.

    voc => [x1, y1, x2, y2]
    coco => [xmin, ymin, w, h]

    Returns a copy with the dtype of the input; the input array is not modified.
    """
    coco = bboxes.copy()
    top_left = coco[..., [0, 1]]
    # Width and height are the distance between the two corners
    coco[..., [2, 3]] = coco[..., [2, 3]] - top_left
    return coco

def vocnorm(bboxes, image_height=720, image_width=1280):
    """
    Normalize pixel corner boxes by the image size.

    voc  => [x1, y1, x2, y2]
    vocnorm => [x1, y1, x2, y2] (normalized)

    Returns a float copy; the input array is not modified.
    """
    # Float copy, so that integer input is not truncated by the division
    normalized = bboxes.copy().astype(float)
    # x columns are divided by the width, y columns by the height
    for columns, extent in (([0, 2], image_width), ([1, 3], image_height)):
        normalized[..., columns] = normalized[..., columns] / extent
    return normalized

def vocdenorm(bboxes, image_height=720, image_width=1280):
    """
    Scale normalized corner boxes back to pixels.

    vocnorm  => [x1, y1, x2, y2] (normalized)
    voc => [x1, y1, x2, y2]

    Returns a float copy; the input array is not modified.
    """
    pixels = bboxes.copy().astype(float)
    pixels[..., [0, 2]] *= image_width   # x1, x2
    pixels[..., [1, 3]] *= image_height  # y1, y2
    return pixels

## Loading the validation dataset

In [None]:
%cd /kaggle/working

from sklearn.model_selection import GroupKFold

def get_bbox(annots):
    """Return one list of values per annotation dict, keeping the dict's value order."""
    return [[*annot.values()] for annot in annots]

def get_path(row):
    """
    Add an 'image_path' entry to row, built from ROOT_DIR, row.video_id and
    row.video_frame, and return the same row.
    """
    video_dir = f'{ROOT_DIR}/train_images/video_{row.video_id}'
    row['image_path'] = f'{video_dir}/{row.video_frame}.jpg'
    return row

# Root of the competition dataset; read by get_path when the image paths are built
ROOT_DIR  = '/kaggle/input/tensorflow-great-barrier-reef/'

df = pd.read_csv("/kaggle/input/tensorflow-great-barrier-reef/train.csv")


# Count the boxes per row by counting the letter 'x' in the raw annotation string
# (presumably every annotation contains exactly one 'x', its key — verify against the data)
df["num_bbox"] = df['annotations'].apply(lambda x: str.count(x, 'x'))
# NOTE: df_train is the same object as df here, so the column assignments below also modify df
df_train = df

# Parse the annotation strings into Python objects, then pull the box values out of them
df_train['annotations'] = df_train['annotations'].progress_apply(lambda x: ast.literal_eval(x))
df_train['bboxes'] = df_train.annotations.progress_apply(get_bbox)

# Add the image_path column row by row; from here on df_train is a new frame, distinct from df
df_train = df_train.progress_apply(get_path, axis=1)

# Assign each row to one of 5 folds, grouped by sequence so that a sequence is never split across folds
kf = GroupKFold(n_splits = 5) 
df_train = df_train.reset_index(drop=True)
# -1 marks rows that have not been assigned to a fold yet
df_train['fold'] = -1
for fold, (train_idx, val_idx) in enumerate(kf.split(df_train, y = df_train.video_id.tolist(), groups=df_train.sequence)):
    # A row belongs to the fold in which it is part of the validation split
    df_train.loc[val_idx, 'fold'] = fold

df_train.head(5)

In [None]:
# Rows assigned to fold 4 are used as the validation set for the visual checks below
df_test = df_train[df_train.fold == 4]

In [None]:
# Parallel lists over the validation rows: image file paths and their ground-truth boxes
image_paths = df_test.image_path.tolist()
gt = df_test.bboxes.tolist()

### Test Inference

In [None]:
# Visual sanity check on a single validation frame (index into image_paths / gt)
i = 1350
TEST_IMAGE_PATH = image_paths[i]
img = cv2.imread(TEST_IMAGE_PATH)

# Draw Green ground truth box
# (ground-truth boxes get a dummy score of 1.0 and class id 0)
out_image0 = draw_yolox_predictions(img, gt[i], [1.0] * len(gt[i]), [0] * len(gt[i]), COCO_CLASSES, (0,255,0))

# Base model inference
bboxes, bbclasses, scores = sy4_inference(img, model, SIZE, CONF)
out_image = draw_yolox_predictions(out_image0, bboxes, scores, bbclasses, COCO_CLASSES, (0,0,255))
# Convert BGR to RGB
out_image = cv2.cvtColor(out_image, cv2.COLOR_BGR2RGB)
display(Image.fromarray(out_image))

# Augmented model inference
# Drawn on out_image0 again, so this picture shows the ground truth plus only the augmented predictions
bboxes, bbclasses, scores = sy4_inference_augmented(img, model, AUGMENTED_SIZES, CONF)
out_image = draw_yolox_predictions(out_image0, bboxes, scores, bbclasses, COCO_CLASSES, (0,0,255))
# Convert BGR to RGB
out_image = cv2.cvtColor(out_image, cv2.COLOR_BGR2RGB)
display(Image.fromarray(out_image))

# Make Videos

In [None]:
%cd /kaggle/working

# Frame size (width, height) of the generated videos
video_size = (1280, 720)

# One writer per inference mode, both at 15 frames per second
out1 = cv2.VideoWriter('ScaledYOLOv4.avi',cv2.VideoWriter_fourcc(*'DIVX'), 15, video_size)
out2 = cv2.VideoWriter('ScaledYOLOv4_augmented.avi',cv2.VideoWriter_fourcc(*'DIVX'), 15, video_size)

try:
    # Only a slice of the validation frames is rendered;
    # iterate over range(len(image_paths)) instead to render all of them.
    for i in tqdm(range(1250, 1600)):
        TEST_IMAGE_PATH = image_paths[i]
        img = cv2.imread(TEST_IMAGE_PATH)
        # Ground truth in green, with a dummy score of 1.0 and class id 0
        out_image0 = draw_yolox_predictions(img, gt[i], [1.0] * len(gt[i]), [0] * len(gt[i]), COCO_CLASSES, (0,255,0))

        # Draw Base model predictions
        bboxes, bbclasses, scores = sy4_inference(img, model, SIZE, CONF, verbose = False)
        out_image = draw_yolox_predictions(out_image0, bboxes, scores, bbclasses, COCO_CLASSES, (0,0,255))
        out1.write(out_image)

        # Draw augmented model predictions
        bboxes, bbclasses, scores = sy4_inference_augmented(img, model, AUGMENTED_SIZES, CONF, verbose = False)
        out_image = draw_yolox_predictions(out_image0, bboxes, scores, bbclasses, COCO_CLASSES, (0,0,255))
        out2.write(out_image)
finally:
    # BOTH writers have to be released before ffmpeg reads the files; the augmented
    # writer was previously never released. The finally block also closes them
    # when a frame fails half-way through.
    out1.release()
    out2.release()

# Compress video files

AVI2MP4 = "-ac 2 -b:v 2000k -c:a aac -c:v libx264 -b:a 160k -vprofile high -bf 0 -strict experimental -f mp4"

# Convert each .avi into an .mp4 of the same base name
for video_name in ('ScaledYOLOv4', 'ScaledYOLOv4_augmented'):
    command = f"{FFMPEG_BIN} -i {video_name}.avi {AVI2MP4} {video_name}.mp4"
    subprocess.call(command, shell=True)

## Display Video

In [None]:
from IPython.display import HTML
from base64 import b64encode

def play(filename):
    """
    Build an inline video player for a video file.

    The whole file is read and embedded into the markup as a base64 data URI
    declared as video/mp4.

    filename: path of the video file to embed
    Returns an IPython HTML object containing a <video> element.
    """
    # The context manager closes the file handle, also when reading fails
    with open(filename, 'rb') as video_file:
        video = video_file.read()
    src = 'data:video/mp4;base64,' + b64encode(video).decode()
    html = '<video width=800 controls autoplay loop><source src="%s" type="video/mp4"></video>' % src 
    return HTML(html)

### Scaled YOLOv4 without size TTA

In [None]:
# Embed the video rendered with single-resolution inference
play('ScaledYOLOv4.mp4')

### Scaled YOLOv4 with size TTA

In [None]:
# Embed the video rendered with multi-resolution (WBF) inference
play('ScaledYOLOv4_augmented.mp4')

# SUBMIT PREDICTION TO COMPETITION

In [None]:
%cd /kaggle/working/

In [None]:
import greatbarrierreef

env = greatbarrierreef.make_env()   # initialize the environment
# Iterator consumed by the submission loop; it yields (image_np, sample_prediction_df) pairs
iter_test = env.iter_test()  

In [None]:
# Predict every test frame and hand the formatted prediction string back to the environment
for (image_np, sample_prediction_df) in iter_test:
    # Reverse the channel axis before inference.
    # NOTE(review): presumably the API delivers RGB frames while the helpers were
    # exercised with BGR images from cv2.imread — confirm.
    img0 = image_np[:,:,::-1]
    
    if USE_AUGMENTED_INFERENCE:
        bboxes, bbclasses, scores = sy4_inference_augmented(img0, model, AUGMENTED_SIZES, CONF, verbose = False)
    else:
        bboxes, bbclasses, scores = sy4_inference(img0, model, SIZE, CONF, verbose = False)

    predictions = []
    for box, score in zip(bboxes, scores):
        # Scores are checked once more here, independently of the inference helpers
        if score < CONF:
            continue
        # Boxes are coco-format [xmin, ymin, w, h]; they are written as integers
        x_min, y_min, bbox_width, bbox_height = (int(value) for value in box[:4])
        
        predictions.append('{:.2f} {} {} {} {}'.format(score, x_min, y_min, bbox_width, bbox_height))
    
    # One space-separated string per frame: "score xmin ymin width height ..."
    prediction_str = ' '.join(predictions)
    sample_prediction_df['annotations'] = prediction_str
    env.predict(sample_prediction_df)

    print('Prediction:', prediction_str)

In [None]:
# Read submission.csv back from the working directory and show its first rows as a sanity check
sub_df = pd.read_csv('submission.csv')
sub_df.head()

### Cleanup

In [None]:
# Remove intermediate files from the working directory: the uncompressed .avi videos,
# the extracted ffmpeg build and the copied ScaledYOLOv4 code
%cd /kaggle/working
!rm *.avi
!rm -r ffmpeg*
!rm -r ScaledYOLOv4