# Yolov5 is all you need - Make compact videos

* This notebook is based on https://www.kaggle.com/steamedsheep/yolov5-is-all-you-need from steamedsheep (version 2)

# Why this notebook?

* Besides measuring the usual metrics (mAP, F1, F2, P, R, etc), creating compact videos is essential for understanding how your model is performing.
* Each model has its strengths and weaknesses. It is difficult to appreciate these by just looking at the metrics; you need to see it in action, comparing against ground truth
* As this competition is based on object detection of video frames, watching the video of the inference helps us understand model weaknesses, including:
  * Whether the model can detect large or small objects, or both
  * Which COTS are consistently not detected, and why
  * Why some COTS are detected on some video frames and not on others (i.e. the importance of object tracking algorithms like DeepSort and Norfair)
  
Only from understanding your model can you improve it by adding augmentation, modifying train / inference resolution, etc
  
# What's new in this notebook?

* Plotting videos with both ground-truth and inference bounding boxes in the same video (ground truth in green, inference in red)
* AVI to mp4 compression for easy download 

In [None]:
# CONTROL PANEL (change your settings here instead)

# The path to your yolov5 model
MODEL_PATH = '../input/reef-baseline-fold12/l6_3600_uflip_vm5_f12_up/f1/best.pt'

# Confidence cutoff: only detections scoring strictly above this value are kept
INFER_CONF = 0.15

# Inference size (passed to the model call as `size=`)
INFER_SIZE = 7200

# Whether to use yolov5 TTA (passed to the model call as `augment=`)
INFER_TTA = True

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
from tqdm import tqdm
import sys

sys.path.append('../input/tensorflow-great-barrier-reef')

In [None]:
import warnings
warnings.filterwarnings("ignore")

import os
# import torch
import importlib
import cv2 
# import pandas as pd
# import numpy as np

import ast
import shutil
import sys

from tqdm.notebook import tqdm
tqdm.pandas()

from PIL import Image
from IPython.display import display

In [None]:
# Place Arial.ttf in the Ultralytics config directory.
# NOTE(review): presumably this keeps YOLOv5 from trying to download the font,
# which would fail in a notebook without internet access — confirm.
!mkdir -p /root/.config/Ultralytics
!cp /kaggle/input/yolov5-font/Arial.ttf /root/.config/Ultralytics/

# Define the Model Here

In [None]:
# Load the custom weights at MODEL_PATH through torch.hub, using the YOLOv5
# repository stored in the attached dataset (source='local') rather than GitHub.
model = torch.hub.load('../input/yolov5-lib-ds', 
                       'custom', 
                       path=MODEL_PATH,
                       source='local',
                       force_reload=True)  # local repo
# Keep the model's built-in confidence threshold very low; detections are
# filtered against INFER_CONF after each inference call instead.
model.conf = 0.01

# Creating Videos

## Install ffmpeg for Kaggle

* I'm not sure if this is necessary?

In [None]:
# Install ffmpeg for video compression
%cd /kaggle/working

# Unpack the static ffmpeg build from the attached dataset into the working directory.
! tar xvf ../input/ffmpeg-static-build/ffmpeg-git-amd64-static.tar.xz

import subprocess

# Path to the ffmpeg executable inside the unpacked directory.
# NOTE(review): the directory name embeds a build date; it has to match what
# the tarball actually extracts to — confirm if the dataset is ever updated.
FFMPEG_BIN = "/kaggle/working/ffmpeg-git-20191209-amd64-static/ffmpeg"

### Utility Functions

In [None]:
# Modified from https://www.kaggle.com/remekkinas/yolox-inference-on-kaggle-for-cots-lb-0-507

def draw_yolox_predictions(img, bboxes, scores, bbclasses, classes_dict, boxcolor = (0,0,255)):
    """Return a copy of ``img`` with one labelled rectangle drawn per box.

    Args:
        img: Image to annotate. It is copied first; the input is not drawn on.
        bboxes: Sequence of boxes, each read as ``[x, y, width, height]``.
        scores: Per-box score, indexed in parallel with ``bboxes``; shown as a
            percentage in the label.
        bbclasses: Per-box class id, indexed in parallel with ``bboxes``.
        classes_dict: Lookup from integer class id to the label text (any
            object indexable by that id).
        boxcolor: Colour handed to OpenCV for both the rectangle and the text.

    Returns:
        The annotated copy of ``img``.
    """
    annotated = img.copy()
    for idx, box in enumerate(bboxes):
        label_id = int(bbclasses[idx])
        confidence = scores[idx]

        # Boxes arrive as top-left corner plus size; OpenCV wants two corners.
        left, top = int(box[0]), int(box[1])
        right, bottom = left + int(box[2]), top + int(box[3])

        caption = '{}:{:.1f}%'.format(classes_dict[label_id], confidence * 100)
        cv2.rectangle(annotated, (left, top), (right, bottom), boxcolor, 2)
        # The caption sits just above the box's top-left corner.
        cv2.putText(annotated, caption, (left, top - 3), cv2.FONT_HERSHEY_PLAIN, 0.8, boxcolor, thickness = 1)
    return annotated

## Importing the Training Dataset and selecting videos

* Note that it is important to also include videos with NO COTS, so you can understand where false positive detections may arise

In [None]:
# Modified from https://www.kaggle.com/remekkinas/yolox-inference-on-kaggle-for-cots-lb-0-507

%cd /kaggle/working

from sklearn.model_selection import GroupKFold

def get_bbox(annots):
    """Turn a list of annotation dicts into a list of value lists.

    Each annotation contributes one list holding that dict's values, in the
    dict's own key order.
    """
    bboxes = []
    for annot in annots:
        bboxes.append(list(annot.values()))
    return bboxes

def get_path(row):
    """Add an ``image_path`` entry to ``row`` and return that same row.

    The path is built from the module-level ``ROOT_DIR`` together with the
    row's ``video_id`` and ``video_frame`` values.
    """
    frame_file = f'{ROOT_DIR}/train_images/video_{row.video_id}/{row.video_frame}.jpg'
    row['image_path'] = frame_file
    return row

# Root of the competition dataset; get_path() builds image paths on top of it.
ROOT_DIR  = '/kaggle/input/tensorflow-great-barrier-reef/'

df = pd.read_csv("/kaggle/input/tensorflow-great-barrier-reef/train.csv")


# Count boxes per frame from the raw annotation string: one 'x' character per box.
# NOTE(review): assumes 'x' occurs exactly once in each serialized box (as its
# 'x' key) — confirm against train.csv.
df["num_bbox"] = df['annotations'].apply(lambda x: str.count(x, 'x'))
# df_train is a second name for the same DataFrame object, not a copy.
df_train = df

#Annotations 
# Parse each annotation string into Python objects, then reduce every
# annotation dict to a plain list of its values.
df_train['annotations'] = df_train['annotations'].progress_apply(lambda x: ast.literal_eval(x))
df_train['bboxes'] = df_train.annotations.progress_apply(get_bbox)

# Add the image_path column, one row at a time.
df_train = df_train.progress_apply(get_path, axis=1)

# Assign every row a fold id in 0..4. Grouping on `sequence` keeps all frames
# of one sequence in the same fold.
kf = GroupKFold(n_splits = 5) 
df_train = df_train.reset_index(drop=True)
df_train['fold'] = -1  # placeholder, overwritten for every row by the loop below
for fold, (train_idx, val_idx) in enumerate(kf.split(df_train, y = df_train.video_id.tolist(), groups=df_train.sequence)):
    df_train.loc[val_idx, 'fold'] = fold

df_train.head(5)

### Select the validation dataset (fold-4 in my private models, probably not the same as steamedsheep's YoloV5 model)

* Take care not to shuffle the videos! Keep them in the original order by not sorting the data frame

In [None]:
# Keep only the rows assigned to fold 4; boolean filtering leaves the row order untouched.
df_test = df_train[df_train.fold == 4]

### Define image paths and ground truth bounding boxes

In [None]:
# Parallel lists: image_paths[i] is a frame's file path, gt[i] that frame's ground-truth boxes.
image_paths = df_test.image_path.tolist()
gt = df_test.bboxes.tolist()

## Inference on Validation Videos and Recording the Video

### Test inference

In [None]:
# Smoke test: run the model once on a single validation frame, using the
# CONTROL PANEL settings rather than hard-coded values so this cell shows
# what the video loop and the competition inference will actually do.
i = 1350
TEST_IMAGE_PATH = image_paths[i]
img = cv2.imread(TEST_IMAGE_PATH)
# cv2.imread reports failure by returning None instead of raising.
assert img is not None, f"Could not read image: {TEST_IMAGE_PATH}"

r = model(img, size=INFER_SIZE, augment=INFER_TTA)

In [None]:
# Convert the detections for the first (only) image into an array and turn the
# boxes from corner format (xmin, ymin, xmax, ymax) into (x, y, width, height).
res = np.array(r.pandas().xyxy[0])
bboxes = res[..., 0:4]
bboxes[:,2:4] -= bboxes[:,0:2]
bboxes

In [None]:
%cd /kaggle/working

video_size = (1280, 720)
COCO_CLASSES = ("starfish")

out1 = cv2.VideoWriter('Video.avi',cv2.VideoWriter_fourcc(*'DIVX'), 15, video_size)

for i in tqdm(range(1250, 2200)):
    # Test a small video first. For the full video, substitute "tqdm(range(start, finish))" with "tqdm(range(len(image_paths)))"
    TEST_IMAGE_PATH = image_paths[i]
    img = cv2.imread(TEST_IMAGE_PATH)

    # Draw GT
    out_image0 = draw_yolox_predictions(img, gt[i], [1.0] * len(gt[i]), [0] * len(gt[i]), COCO_CLASSES, (0,255,0)) # Green ground truth box

    # Insert your inference code here:
    r = model(img, size=INFER_SIZE, augment=INFER_TTA)
    
    bboxes = []
    bbclasses = []
    scores = []
    if r.pandas().xyxy[0].shape[0] > 0:
        res = np.array(r.pandas().xyxy[0])
        for r in res:
            # Filter by INFER_CONF
            if r[4] > INFER_CONF:
                bb = r[0:4]
                bb[2:4] -= bb[0:2] # Convert to xywh format
                bboxes.append(bb)
                scores.append(r[4])
                bbclasses.append(r[5])
        
    out_image = draw_yolox_predictions(out_image0, bboxes, scores, bbclasses, COCO_CLASSES, (0,0,255)) # Red ground truth box (as image is BGR, not RGB)
    out1.write(out_image)
    
# Finalize AVI
out1.release()

## Convert AVI to compressed mp4 for more convenient downloading

* The created AVI is a large file. Compress the video file so you can download it and watch it locally

In [None]:
# I'm open to suggestions as to what compression settings might be better?
AVI2MP4 = "-ac 2 -b:v 2000k -c:a aac -c:v libx264 -b:a 160k -vprofile high -bf 0 -strict experimental -f mp4"

# Pass the arguments as a list so no shell is involved.
# "-y" overwrites an existing Video.mp4, so re-running this cell works.
command = [FFMPEG_BIN, "-y", "-i", "Video.avi", *AVI2MP4.split(), "Video.mp4"]

# check=True raises CalledProcessError when ffmpeg exits non-zero, instead of
# silently carrying on with a missing or stale Video.mp4.
subprocess.run(command, check=True)

# Show off your video!

* Green boxes are ground truth
* Red boxes are model inference

In [None]:
from IPython.display import HTML
from base64 import b64encode

def play(filename):
    """Return an inline HTML5 player for an mp4 file.

    The file's bytes are base64-encoded into a ``data:`` URI, so the whole
    video is embedded in the returned markup rather than linked.

    Args:
        filename: Path of the mp4 file to embed.

    Returns:
        An ``HTML`` display object wrapping a ``<video>`` element with
        controls, autoplay and loop enabled.
    """
    # The context manager closes the file handle as soon as the bytes are read.
    with open(filename, 'rb') as video_file:
        video = video_file.read()
    src = 'data:video/mp4;base64,' + b64encode(video).decode()
    html = '<video width=800 controls autoplay loop><source src="%s" type="video/mp4"></video>' % src
    return HTML(html)

play('Video.mp4')

In [None]:
# Cleanup

!rm *.avi
!rm -r ffmpeg*

# Competition Inference (left unchanged)

In [None]:
# NOTE(review): presumably importable because the competition dataset directory
# was appended to sys.path near the top of the notebook — confirm.
import greatbarrierreef
env = greatbarrierreef.make_env()# initialize the environment
iter_test = env.iter_test()      # an iterator which loops over the test set and sample submission

In [None]:
# Submission loop: run the model on every test frame and hand the prediction
# string for that frame back to the competition environment.
for img, pred_df in tqdm(iter_test):
    detections = model(img, size=INFER_SIZE, augment=INFER_TTA).pandas().xyxy[0]

    # One "confidence x y width height" group per kept detection.
    kept = []
    for _, det in detections.iterrows():
        if det.confidence > INFER_CONF:
            kept.append('{} {} {} {} {}'.format(det.confidence, int(det.xmin), int(det.ymin), int(det.xmax-det.xmin), int(det.ymax-det.ymin)))

    # Groups are space-separated; no detections gives an empty string.
    pred_df['annotations'] = ' '.join(kept)
    env.predict(pred_df)

# Credits:

https://www.kaggle.com/steamedsheep/yolov5-is-all-you-need for example model inference

https://www.kaggle.com/remekkinas/yolox-inference-on-kaggle-for-cots-lb-0-507 for utility functions

https://www.kaggle.com/bamps53/create-annotated-video for inspiration on video creation and compression