# TensorFlow Object Detection - Make compact videos

**<span style="color:red">If you liked this notebook, please don't forget to upvote it!</span>**

* This notebook is based on version 3 of https://www.kaggle.com/alexchwong/yolov5-is-all-you-need-make-compact-videos by alexchwong. Please upvote it as well!

I used these notebooks to create this one:
* https://www.kaggle.com/alexchwong/yolov5-is-all-you-need-make-compact-videos
* https://www.kaggle.com/khanhlvg/cots-detection-w-tensorflow-object-detection-api
* https://www.kaggle.com/bamps53/create-annotated-video

I also used code from:
* https://www.tensorflow.org/hub/tutorials/object_detection

Please upvote them as well!


In [None]:
# Path to TF model
# We will use model from https://www.kaggle.com/khanhlvg/cots-detection-w-tensorflow-object-detection-api for demo purposes
# The SavedModel itself is loaded from the 'output/saved_model' sub-directory of this path.
MODEL_DIR = '../input/cots-detection-w-tensorflow-object-detection-api/cots_efficientdet_d0'

# Detection parameters
# Default min_score of draw_boxes: detections scoring below this are not drawn.
DETECTION_THRESHOLD  = 0.3
# Default max_boxes of draw_boxes: only the first MAX_BOXES detections of a frame are considered.
MAX_BOXES = 10

In [None]:
import numpy as np
import pandas as pd 
import torch
from tqdm import tqdm
import sys
import tensorflow as tf

from PIL import Image
from PIL import ImageColor
from PIL import ImageDraw
from PIL import ImageFont
from PIL import ImageOps

sys.path.append('../input/tensorflow-great-barrier-reef')

In [None]:
import warnings
# Suppress every warning for the rest of the notebook run.
warnings.filterwarnings("ignore")

import os
import importlib
import cv2 

import ast
import shutil
import sys
import time

# NOTE: this rebinds `tqdm` to the notebook (widget) flavour, replacing the
# plain `from tqdm import tqdm` imported in the previous cell.
from tqdm.notebook import tqdm
# Registers the pandas integration so `.progress_apply(...)` is available below.
tqdm.pandas()

from PIL import Image
from IPython.display import display

# Define the Model Here

In [None]:
# Time how long it takes to reset the Keras session and load the SavedModel.
start_time = time.time()
tf.keras.backend.clear_session()
detect_fn_tf_odt = tf.saved_model.load(os.path.join(MODEL_DIR, 'output', 'saved_model'))
end_time = time.time()
elapsed_time = end_time - start_time
print(f'Elapsed time: {elapsed_time}s')

# Creating Videos

## Install ffmpeg for Kaggle

In [None]:
# Install ffmpeg for video compression
# Work from the writable output directory so the archive is unpacked there.
%cd /kaggle/working

# Unpack the static ffmpeg build shipped as a Kaggle input dataset.
! tar -xf ../input/ffmpeg-static-build/ffmpeg-git-amd64-static.tar.xz

import subprocess

# Absolute path of the unpacked binary.
# NOTE(review): the dated directory name presumably matches the top-level folder
# inside the tarball above — confirm if the dataset version changes.
FFMPEG_BIN = "/kaggle/working/ffmpeg-git-20191209-amd64-static/ffmpeg"

### Utility Functions

In [None]:
# https://www.tensorflow.org/hub/tutorials/object_detection
def draw_bounding_box_on_image(image,
                               ymin,
                               xmin,
                               ymax,
                               xmax,
                               color,
                               font,
                               thickness=4,
                               display_str_list=()):
    """Draw one box outline and its stacked text labels onto a PIL image, in place.

    Args:
        image: PIL image to draw on; it is modified in place.
        ymin, xmin, ymax, xmax: box corners as fractions of the image height /
            width (they are multiplied by ``image.size`` below).
        color: colour used for the outline and for the label background.
        font: PIL font used to measure and render the labels.
        thickness: outline width in pixels.
        display_str_list: label strings; the last one is drawn closest to the
            box and earlier ones are stacked above it.

    Returns:
        None.
    """
    def _text_size(text):
        # ``ImageFont.getsize`` was removed in Pillow 10. The right/bottom
        # edges of ``getbbox`` reproduce the (width, height) it used to return,
        # so prefer ``getbbox`` and fall back for older Pillow releases.
        if hasattr(font, "getbbox"):
            _, _, right_edge, bottom_edge = font.getbbox(text)
            return right_edge, bottom_edge
        return font.getsize(text)

    draw = ImageDraw.Draw(image)
    im_width, im_height = image.size
    (left, right, top, bottom) = (xmin * im_width, xmax * im_width,
                                  ymin * im_height, ymax * im_height)
    # Closed polyline: the first corner is repeated to finish the rectangle.
    draw.line([(left, top), (left, bottom), (right, bottom), (right, top),
               (left, top)],
              width=thickness,
              fill=color)

    # If the total height of the display strings added to the top of the bounding
    # box exceeds the top of the image, stack the strings below the bounding box
    # instead of above.
    display_str_heights = [_text_size(ds)[1] for ds in display_str_list]
    # Each display_str has a top and bottom margin of 0.05x.
    total_display_str_height = (1 + 2 * 0.05) * sum(display_str_heights)

    if top > total_display_str_height:
        text_bottom = top
    else:
        text_bottom = top + total_display_str_height

    # Reverse list and print from bottom to top.
    for display_str in display_str_list[::-1]:
        text_width, text_height = _text_size(display_str)
        margin = np.ceil(0.05 * text_height)
        draw.rectangle([(left, text_bottom - text_height - 2 * margin),
                        (left + text_width, text_bottom)],
                       fill=color)
        draw.text((left + margin, text_bottom - text_height - margin),
                  display_str,
                  fill="black",
                  font=font)
        # Step up by the full height of the label just drawn (text plus both
        # margins) so the next label sits directly on top of it. The original
        # subtracted the margins instead, which made stacked labels overlap.
        text_bottom -= text_height + 2 * margin


def draw_boxes(image, boxes, class_names, scores, color, max_boxes=MAX_BOXES, min_score=DETECTION_THRESHOLD):
    """Overlay labeled boxes on an image with formatted scores and label names.

    Args:
        image: numpy image array; it is overwritten in place with the annotated
            pixels (via ``np.copyto``) and also returned.
        boxes: numpy array with one row per box, each row unpacked as
            ``ymin, xmin, ymax, xmax``. Rows with fewer than two values (for
            example an empty placeholder row) are skipped.
        class_names: label text per box.
        scores: score per box; boxes scoring below ``min_score`` are skipped.
        color: colour passed through to ``draw_bounding_box_on_image``.
        max_boxes: only the first ``max_boxes`` rows are considered.
        min_score: minimum score for a box to be drawn.

    Returns:
        The same ``image`` array that was passed in.
    """
    font = ImageFont.load_default()

    # Never index past the shortest of the parallel inputs.
    num_candidates = min(boxes.shape[0], max_boxes, len(scores), len(class_names))

    # The PIL copy is created lazily and shared by every box, so a frame with
    # nothing to draw skips the numpy -> PIL -> numpy round trip entirely and a
    # frame with several boxes pays for it only once.
    image_pil = None
    for i in range(num_candidates):
        if scores[i] >= min_score and len(boxes[i]) > 1:
            if image_pil is None:
                image_pil = Image.fromarray(np.uint8(image)).convert("RGB")
            ymin, xmin, ymax, xmax = tuple(boxes[i])
            display_str = "{}: {}%".format(class_names[i],
                                           int(100 * scores[i]))
            draw_bounding_box_on_image(
                image_pil,
                ymin,
                xmin,
                ymax,
                xmax,
                color,
                font,
                display_str_list=[display_str])

    if image_pil is not None:
        # Write the annotated pixels back into the caller's array.
        np.copyto(image, np.array(image_pil))
    return image

## Importing the Training Dataset and selecting videos

* Note that it is important to also include videos with NO COTS, so you can understand where false positive detections may arise

In [None]:
# Modified from https://www.kaggle.com/remekkinas/yolox-inference-on-kaggle-for-cots-lb-0-507

%cd /kaggle/working

from sklearn.model_selection import GroupKFold

def get_bbox(annots):
    """Turn a sequence of annotation dicts into a list of value lists.

    Each annotation contributes one inner list holding its dict values in the
    dict's own iteration order.
    """
    bboxes = []
    for annot in annots:
        bboxes.append(list(annot.values()))
    return bboxes

def get_path(row):
    """Store the frame's JPEG path on the row under 'image_path' and return the row.

    The path is built from the module-level ROOT_DIR plus the row's
    ``video_id`` and ``video_frame`` attributes.
    """
    image_path = f'{ROOT_DIR}/train_images/video_{row.video_id}/{row.video_frame}.jpg'
    row['image_path'] = image_path
    return row

# Root of the competition data. get_path() inserts its own '/' after this value,
# so the generated image paths contain a doubled slash after the dataset folder.
ROOT_DIR  = '/kaggle/input/tensorflow-great-barrier-reef/'

df = pd.read_csv("/kaggle/input/tensorflow-great-barrier-reef/train.csv")


# Count the letter 'x' in the raw annotation string.
# NOTE(review): presumably one 'x' per annotated box (the 'x' key) — confirm against the CSV.
df["num_bbox"] = df['annotations'].apply(lambda x: str.count(x, 'x'))
# Alias, not a copy: the column assignments below also modify `df`.
df_train = df

# Parse the annotation strings into Python objects, then reduce each annotation
# to a plain list of its values (see get_bbox).
df_train['annotations'] = df_train['annotations'].progress_apply(lambda x: ast.literal_eval(x))
df_train['bboxes'] = df_train.annotations.progress_apply(get_bbox)

# Row-wise pass that adds the 'image_path' column (see get_path).
df_train = df_train.progress_apply(get_path, axis=1)

# Assign each row a fold number 0-4, grouping by `sequence` so that all rows of
# a sequence land in the same fold.
kf = GroupKFold(n_splits = 5) 
df_train = df_train.reset_index(drop=True)
df_train['fold'] = -1
for fold, (train_idx, val_idx) in enumerate(kf.split(df_train, y = df_train.video_id.tolist(), groups=df_train.sequence)):
    # Label every row by the fold in which it is part of the validation split.
    df_train.loc[val_idx, 'fold'] = fold

# Last expression of the cell: shown as the cell output.
df_train.head(5)

### Select the validation dataset

* Take care not to shuffle the videos! Keep them in the original order by not sorting the data frame

In [None]:
# Rows assigned to fold 4 form the validation set; row order is left untouched.
df_test = df_train.loc[df_train.fold == 4]

### Define image paths and ground truth bounding boxes

In [None]:
image_paths = df_test.image_path.tolist()

# Frame size used to normalise the pixel annotations to the 0-1 range.
IMG_WIDTH, IMG_HEIGHT = 1280, 720

# One entry per validation frame, in the same order as `image_paths`:
# [ymin, xmin, ymax, xmax] (normalised) for the frame's first annotated box, or
# [] when the frame has no annotation. Only the first box is kept because
# add_correct_box() wraps the entry as a single box.
gt = []
for i, row in df_test.iterrows():
    # Any frame with at least one box has ground truth. The previous `> 1` test
    # silently dropped the box of every frame that had exactly one annotation.
    if len(row['bboxes']) > 0:
        first_box = row['bboxes'][0]
        x0 = first_box[0] / IMG_WIDTH
        y0 = first_box[1] / IMG_HEIGHT
        x1 = x0 + first_box[2] / IMG_WIDTH
        y1 = y0 + first_box[3] / IMG_HEIGHT
        gt.append([y0, x0, y1, x1])
    else:
        gt.append([])

## Inference on Validation Videos and Recording the Video

In [None]:
def load_img(path):
    """Read the file at `path` and decode it as a 3-channel JPEG tensor."""
    return tf.image.decode_jpeg(tf.io.read_file(path), channels=3)

def detect(image_np):
    """Run the module-level detector on a single image.

    Args:
        image_np: one image without a batch axis; a leading axis of size 1 is
            added before the detector is called.

    Returns:
        Whatever ``detect_fn_tf_odt`` returns for the batched input.
    """
    input_tensor = np.expand_dims(image_np, 0)
    return detect_fn_tf_odt(input_tensor)

def run_detector(detector, path, color):
    """Detect objects in one image file and return the frame with boxes drawn.

    Args:
        detector: callable that takes a batch of images (leading axis of size
            1 here) and returns a mapping whose values expose ``.numpy()``.
            The keys "detection_boxes" and "detection_scores" are read from it.
        path: image file handed to ``load_img``.
        color: colour passed to ``draw_boxes`` for every detection.

    Returns:
        The numpy image returned by ``draw_boxes``.
    """
    img = load_img(path)

    # Call the detector that was passed in. Previously this parameter was
    # ignored and the module-level model was always used instead.
    result = detector(np.expand_dims(img, 0))

    # Strip the batch axis from every output.
    result = {key: value.numpy()[0] for key, value in result.items()}

    # Single-class model: every detection gets the same label.
    detection_classes = ['cots'] * len(result["detection_boxes"])
    image_with_boxes = draw_boxes(
        img.numpy(), result["detection_boxes"],
        detection_classes, result["detection_scores"],
        color)

    return image_with_boxes


def add_correct_box(img, ind, color):
    """Draw the ground-truth entry ``gt[ind]`` onto ``img`` using ``draw_boxes``.

    The entry is wrapped as a single box labelled 'cots' with score 1, and the
    result of ``draw_boxes`` is returned unchanged.
    """
    truth_boxes = np.array([gt[ind]])
    return draw_boxes(img, truth_boxes, ['cots'], [1], color)

In [None]:
%cd /kaggle/working

video_size = (1280, 720)
# NOTE: without a trailing comma this is the plain string "starfish", not a
# tuple. The name is not used by any cell shown in this notebook.
COCO_CLASSES = ("starfish")

VIDEO_FPS = 15

# Test a small video first. For the full video, set FRAME_START = 0 and
# FRAME_STOP = len(image_paths).
FRAME_START, FRAME_STOP = 1250, 2200
# Clamp the window to the frames that actually exist so a shorter validation
# fold yields a shorter (or empty) video instead of an IndexError.
frame_indices = range(FRAME_START, min(FRAME_STOP, len(image_paths)))

out1 = cv2.VideoWriter('Video.avi', cv2.VideoWriter_fourcc(*'DIVX'), VIDEO_FPS, video_size)
try:
    for i in tqdm(frame_indices):
        TEST_IMAGE_PATH = image_paths[i]
        # run_detector loads the frame itself, so no separate cv2.imread is needed.
        out_image = run_detector(detect_fn_tf_odt, TEST_IMAGE_PATH, color='#ff0000')
        out_image = add_correct_box(out_image, i, color='#00ff00')
        # Frames arrive in RGB order while VideoWriter expects BGR. The channel
        # swap is the same operation as before, now named for its direction.
        out_image = cv2.cvtColor(out_image, cv2.COLOR_RGB2BGR)
        out1.write(out_image)
finally:
    # Finalize AVI, even if a frame fails part-way through.
    out1.release()

## Convert AVI to compressed mp4 for more convenient downloading

* The created AVI is a large file. Compress the video file so you can download it and watch it locally

In [None]:
# Encoder options for the AVI -> MP4 transcode (H.264 video, AAC audio).
AVI2MP4 = "-ac 2 -b:v 2000k -c:a aac -c:v libx264 -b:a 160k -vprofile high -bf 0 -strict experimental -f mp4"

# Argument list instead of a shell string: no shell quoting is involved.
# "-y" overwrites an existing Video.mp4, so re-running the cell does not stop
# at ffmpeg's interactive overwrite prompt.
command = [FFMPEG_BIN, "-y", "-i", "Video.avi", *AVI2MP4.split(), "Video.mp4"]

# check=True raises CalledProcessError if ffmpeg fails, rather than leaving a
# missing or partial Video.mp4 for the next cell. On success the exit status is
# shown as the cell output, as before.
subprocess.run(command, check=True).returncode

# Show off your video!

* Green boxes are ground truth (at most one box per frame — only the first annotated box is drawn)
* Red boxes are model inference

In [None]:
from IPython.display import HTML
from base64 import b64encode

def play(filename):
    """Return an IPython HTML object that embeds the video file inline.

    The whole file is read and base64-encoded into a ``data:`` URI, so the
    player needs no separate file download.

    Args:
        filename: path of the MP4 file to embed.

    Returns:
        ``HTML`` wrapping a ``<video>`` element with the encoded file as source.
    """
    # Context manager so the file handle is closed as soon as the bytes are read.
    with open(filename, 'rb') as video_file:
        video = video_file.read()
    src = 'data:video/mp4;base64,' + b64encode(video).decode()
    html = '<video width=800 controls autoplay loop><source src="%s" type="video/mp4"></video>' % src 
    return HTML(html)

# Last expression of the cell, so the returned HTML player is rendered as the output.
play('Video.mp4')

In [None]:
# Cleanup
# Remove the AVI file(s) and the unpacked ffmpeg directory from the current
# working directory; Video.mp4 is not matched by either pattern.

!rm *.avi
!rm -r ffmpeg*