# Example Code of Seq-NMS

This notebook shares part of my evaluation code.
The prediction code itself is private, but this example should give you an idea of how to use Seq-NMS [1] in a real-time setting.

I used the implementation in [2]. You can also integrate it into your own project.

* [1] https://arxiv.org/abs/1602.08465
* [2] https://github.com/tmoopenn/seq-nms

# Environment Setup

In [None]:
# Read the GitHub API token stored as a Kaggle secret named "github-api-token".
from kaggle_secrets import UserSecretsClient

user_secrets = UserSecretsClient()

personal_key_for_api = user_secrets.get_secret("github-api-token")

# Clone the project repository into /tmp/great-barrier-reef, authenticating
# with the token embedded in the URL, then print the head of the git log.
! git clone https://{personal_key_for_api}:x-oauth-basic@github.com/bilzard/great-barrier-reef.git /tmp/great-barrier-reef
! cd /tmp/great-barrier-reef && git log | head

In [None]:
%%bash
# Write /tmp/run.bash, a helper script that installs the wheels found in
# PIP_DEP_PATH without contacting a package index (--no-index), and make it executable.
cat << 'EOF' > /tmp/run.bash
PIP_DEP_PATH='/kaggle/input/starfish-build-yolov5-runtime-environment/pip_deps'
echo ${PIP_DEP_PATH}
pip install ${PIP_DEP_PATH}/* -f ./ --no-index --no-deps --find-links="${PIP_DEP_PATH}"

EOF
chmod +x /tmp/run.bash

In [None]:
# Run the install script written above and report its wall-clock time.
!time /tmp/run.bash

In [None]:
# Copy the bundled Arial font into the Ultralytics config directory.
# NOTE(review): presumably this stops the YOLOv5 code from trying to download the font — confirm.
!mkdir -p /root/.config/Ultralytics
!cp ../input/yolov5-runtime-environment/arial.ttf /root/.config/Ultralytics/Arial.ttf

# Set Up seq_nms

In [None]:
%%bash
# Overwrite /tmp/run.bash with a script that builds the seq_nms extension
# in place inside the cloned repository, and make it executable.

cat << 'EOF' > /tmp/run.bash

cd /tmp/great-barrier-reef/seq_nms
python setup.py build_ext --inplace

EOF
chmod +x /tmp/run.bash

In [None]:
# Run the seq_nms build script written above.
!/tmp/run.bash

# Prepare Inference Utilities

In [None]:
import os
import sys
from collections import deque
from importlib import reload

sys.path.append('/tmp/great-barrier-reef')
sys.path.append('/tmp/great-barrier-reef/seq_nms')
sys.path.append('/tmp/great-barrier-reef/yolov5')

import cv2
import numpy as np
import pandas as pd
import torch
from omegaconf import OmegaConf
from PIL import Image
from tqdm.notebook import tqdm

from seq_nms import seq_nms

%load_ext autoreload
%autoreload 2

In [None]:
# Dataset root and the YOLOv5 checkpoint that is evaluated.
ROOT_DIR  = '/kaggle/input/tensorflow-great-barrier-reef/'
CKPT_PATH = '../input/starfishyolov5models/01_split_by_partition_400_10_fold_img_640_l_heavyx2_g0_pw15_iou20_5x3_neg03fold_8_best.pt'
AUGMENT   = False  # forwarded as `augment` to the model call in predict()
CONF = 0.001  # model-level NMS confidence threshold (see load_model)
MAX_DET = 200  # maximum number of detections per image (see load_model)
IOU = 0.3  # model-level NMS IoU threshold (see load_model)
IMG_SIZE  = 2560  # inference size passed to the model in detect_pipeline()
CONF_THRE = 0.458  # final score threshold applied after post-processing
P_KEEP = 1.0  # not referenced in the visible cells; presumably the keep probability for drop_pred() — TODO confirm
RANDOM_SEED = 123  # not referenced in the visible cells

# ===========
# around-edge, low-confidence boxes removal
# ===========
EDGE_PIXELS = 60  # width of the border band, in pixels
EDGE_SCORE_THR = 0.2  # boxes inside the band with a score below this are removed

# ===========
# Seq-NMS
# ===========
SEQ_NMS_NMS_THR = 0.3  # passed to seq_nms as nms_threshold
SEQ_NMS_NUM_FRAMES = 20  # number of most recent frames kept by SeqNmsPostProcessor
SEQ_NMS_LK_THR = 0.3  # passed to seq_nms as linkage_threshold
SEQ_NMS_SCORE_METRIC = 'max'  # passed to seq_nms as score_metric
SEQ_NMS_DECAY = 0.9  # factor applied to stored frame scores after every apply() call

# Generate Fold Split

In [None]:
import os
os.environ['METADATA_PATH'] = '/kaggle/input/tensorflow-great-barrier-reef'
os.environ['WORK_DIR'] = '/tmp'
os.environ['IMAGE_DIR'] = '/kaggle/input/tensorflow-great-barrier-reef/train_images'
!export | grep MEDATADA_PATH
!export | grep WORK_DIR
!export | grep IMAGE_DIR

In [None]:
# Echo the three variables through the shell to verify their values.
!echo $METADATA_PATH
!echo $WORK_DIR
!echo $IMAGE_DIR

In [None]:
%%bash
# Run the fold-split script from the cloned repository.
# NOTE(review): the argument 10 is presumably the number of folds — confirm against the script.
cd /tmp/great-barrier-reef/etl
python 01_fold_split_by_partition.py 10

In [None]:
# Train Data
import ast

# Load the fold-split metadata and derive the image path of every frame.
df = pd.read_csv('/tmp/great-barrier-reef/data/01_split_by_partition_400_10_fold.csv')
df['image_path'] = (
    f'{ROOT_DIR}/train_images/video_' + 
    df['video_id'].astype(str) + 
    '/' + 
    df['video_frame'].astype(str) +
    '.jpg'
)
# The annotations column holds the textual form of a list of dicts.
# SECURITY: this used to be parsed with eval(), which executes arbitrary code
# found in the CSV; ast.literal_eval only accepts Python literals.
df['annotations'] = df['annotations'].apply(ast.literal_eval)
display(df.head(2))

In [None]:
# Count the annotated boxes per frame and report the share of frames with /
# without any box.
df['num_bbox'] = df['annotations'].apply(len)
# The percentages are computed directly instead of indexing a
# value_counts() result with 0/1: that result is labelled False/True and
# sorted by frequency, so integer indexing depends on which group is larger
# and fails when one of the groups is missing.
with_bbox_pct = (df['num_bbox'] > 0).mean() * 100
no_bbox_pct = 100 - with_bbox_pct
print(f"No BBox: {no_bbox_pct:0.2f}% | With BBox: {with_bbox_pct:0.2f}%")

In [None]:
def normalize_voc(bboxes, image_height=720, image_width=1280):
    '''
    Divide pixel VOC boxes by the image size.

    voc => [x1, y1, x2, y2]
    normalized voc => [sx1, sy1, sx2, sy2]

    x coordinates (even positions of the last axis) are divided by
    ``image_width`` and y coordinates (odd positions) by ``image_height``.

    The result is always a new floating-point array: the input is left
    untouched, and integer inputs (including an empty integer array) are
    accepted instead of failing on in-place true division.
    '''
    bboxes = np.asarray(bboxes)
    # Keep an existing float precision; promote everything else to float64.
    out_dtype = bboxes.dtype if np.issubdtype(bboxes.dtype, np.floating) else np.float64
    normalized = bboxes.astype(out_dtype)  # astype() returns a copy
    normalized[..., 0::2] /= image_width
    normalized[..., 1::2] /= image_height
    return normalized

    
def renormalize_voc(bboxes, image_height=720, image_width=1280):
    '''
    Scale normalized VOC boxes back to integer pixel coordinates.

    normalized voc => [sx1, sy1, sx2, sy2]
    voc => [x1, y1, x2, y2]

    x coordinates are multiplied by ``image_width`` and y coordinates by
    ``image_height``; the result is cast to ``int`` (fractions are
    truncated, not rounded).

    A new array is returned and the input is left untouched (the scaling
    used to be applied to the caller's array in place).
    '''
    bboxes = np.asarray(bboxes)
    # Keep an existing float precision; promote everything else to float64.
    work_dtype = bboxes.dtype if np.issubdtype(bboxes.dtype, np.floating) else np.float64
    scaled = bboxes.astype(work_dtype)  # astype() returns a copy
    scaled[..., 0::2] *= image_width
    scaled[..., 1::2] *= image_height
    return scaled.astype(int)


def voc2coco(bboxes, image_height=720, image_width=1280):
    '''
    Convert VOC corner boxes to COCO boxes.

    voc  => [x1, y1, x2, y2]
    coco => [xmin, ymin, w, h]

    ``image_height`` and ``image_width`` are not used; they are kept so the
    signature matches the other box converters.

    A new array is returned and the input is left untouched (the
    conversion used to overwrite the caller's array in place).
    '''
    coco = np.array(bboxes)  # np.array() copies its input
    # width/height = bottom-right corner minus top-left corner
    coco[..., 2:] = coco[..., 2:] - coco[..., :2]
    return coco


def load_image(image_path):
    """
    Read an image file with OpenCV and return it converted from BGR to RGB.

    Raises:
        FileNotFoundError: if OpenCV returns no image for ``image_path``
            (previously the missing image reached ``cv2.cvtColor`` and
            produced an error that did not mention the path).
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise FileNotFoundError(f'cannot read image: {image_path}')
    return cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)


def plot_one_box(x, img, color=None, label=None, line_thickness=None):
    """
    Draw one bounding box, and optionally a label, on ``img`` in place.

    Args:
        x: box corners; x[0], x[1] are the first corner and x[2], x[3] the
            opposite one, each converted with int().
        img: image array that is drawn on.
        color: box colour; a random colour is chosen when omitted.
        label: optional text drawn on a filled background above the first corner.
        line_thickness: line width; derived from the image size when omitted.
    """
    # Plots one bounding box on image img
    tl = line_thickness or round(0.002 * (img.shape[0] + img.shape[1]) / 2) + 1  # line/font thickness
    if not color:
        # `random` is not imported at file level, so import it where it is needed.
        import random
        color = [random.randint(0, 255) for _ in range(3)]
    c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
    cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
    if label:
        tf = max(tl - 1, 1)  # font thickness
        t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
        # Background rectangle sized to the rendered text, placed above c1.
        c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
        cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA)  # filled
        cv2.putText(img, label, (c1[0], c1[1] - 2), 0, tl / 3, 
                    [63, 63, 63], thickness=tf, lineType=cv2.LINE_AA)

        
def draw_bboxes(img, bboxes, classes, class_ids, colors=None, show_classes=None, bbox_format='yolo', class_name=False, line_thickness=2):  
    """
    Return a copy of ``img`` with the given boxes drawn on it.

    Args:
        img: image array; it is copied and the original is not modified.
        bboxes: sequence of boxes in ``bbox_format``.
        classes: per-box class names, used as the label when ``class_name`` is true.
        class_ids: per-box class ids, used as the label otherwise and as the
            index into ``colors`` when ``colors`` is a list.
        colors: a single colour, or a list of colours indexed by class id;
            defaults to (0, 255, 0).
        show_classes: only boxes whose class name is in this collection are
            drawn; defaults to ``classes`` (draw everything).
        bbox_format: 'yolo' (center x, center y, width, height as fractions
            of the image size), 'coco' (xmin, ymin, w, h) or 'voc_pascal'
            (x1, y1, x2, y2).
        class_name: label boxes with the class name instead of the class id.
        line_thickness: forwarded to plot_one_box.

    Raises:
        ValueError: if ``bbox_format`` is not one of the three formats.
    """
    if bbox_format not in ('yolo', 'coco', 'voc_pascal'):
        raise ValueError('wrong bbox format')

    image = img.copy()
    show_classes = classes if show_classes is None else show_classes
    colors = (0, 255, 0) if colors is None else colors
    img_h, img_w = image.shape[:2]

    for idx, bbox in enumerate(bboxes):
        cls = classes[idx]
        cls_id = class_ids[idx]
        color = colors[cls_id] if isinstance(colors, list) else colors

        if cls not in show_classes:
            continue

        # Convert the box to VOC corners, which is what plot_one_box expects.
        if bbox_format == 'yolo':
            cx = round(float(bbox[0]) * img_w)
            cy = round(float(bbox[1]) * img_h)
            half_w = round(float(bbox[2]) * img_w / 2)
            half_h = round(float(bbox[3]) * img_h / 2)
            voc_bbox = (cx - half_w, cy - half_h, cx + half_w, cy + half_h)
        elif bbox_format == 'coco':
            x1, y1, w, h = (int(round(v)) for v in bbox[:4])
            voc_bbox = (x1, y1, x1 + w, y1 + h)
        else:  # 'voc_pascal'
            voc_bbox = tuple(int(round(v)) for v in bbox[:4])

        # The yolo branch used to call an undefined get_label(); every format
        # now falls back to the class id, as the coco/voc branches already did.
        plot_one_box(voc_bbox,
                     image,
                     color=color,
                     label=cls if class_name else str(cls_id),
                     line_thickness=line_thickness)

    return image

def get_bbox(annots):
    """Return, for every annotation mapping, the list of its values in order."""
    boxes = []
    for annotation in annots:
        boxes.append(list(annotation.values()))
    return boxes

def get_imgsize(row):
    """
    Store the image dimensions of ``row['image_path']`` in ``row['width']``
    and ``row['height']`` and return the row.

    The size is read with PIL (imported at the top of the notebook); the
    ``imagesize`` package that was referenced here is never imported.
    """
    with Image.open(row['image_path']) as image_file:
        # PIL reports the size as (width, height).
        row['width'], row['height'] = image_file.size
    return row

# Seed NumPy's global random generator.
np.random.seed(32)
# Palette indexed by class id; show_img() uses id 0 for predictions, 1 for
# ground truth, 2 for newly detected and 3 for newly suppressed boxes.
colors = [
    (255, 0, 0),  # red
    (0, 255, 0),  # green
    (255, 255, 0), # yellow
    (0, 127, 255), # cyan
]

In [None]:
def load_model(ckpt_path, conf, iou):
    """
    Load a custom YOLOv5 checkpoint from the local repository clone and
    configure its inference settings.

    Args:
        ckpt_path: path of the checkpoint file.
        conf: NMS confidence threshold.
        iou: NMS IoU threshold.

    Returns:
        The loaded model; its detection limit is taken from the global MAX_DET.
    """
    hub_kwargs = dict(
        path=ckpt_path,
        source='local',  # local repo
        force_reload=True,
        tile_helper=None,
    )
    model = torch.hub.load('/tmp/great-barrier-reef/yolov5', 'custom', **hub_kwargs)

    settings = {
        'conf': conf,          # NMS confidence threshold
        'iou': iou,            # NMS IoU threshold
        'classes': None,       # no class filtering
        'multi_label': False,  # one label per box
        'max_det': MAX_DET,    # maximum number of detections per image
    }
    for attr_name, attr_value in settings.items():
        setattr(model, attr_name, attr_value)
    return model

In [None]:
def drop_pred(bboxes, scores, p_keep):
    '''
    Randomly discard predictions.

    Each prediction is kept when a uniform random draw is <= p_keep, i.e. it
    is dropped with probability (1 - p_keep).  With p_keep == 1 the inputs
    are returned unchanged.
    '''
    if p_keep == 1:
        return bboxes, scores

    kept_boxes, kept_scores = bboxes.copy(), scores.copy()
    assert len(kept_boxes) == len(kept_scores)

    # One draw per prediction, shared by boxes and scores.
    keep_mask = np.random.uniform(size=len(kept_boxes)) <= p_keep
    return kept_boxes[keep_mask], kept_scores[keep_mask]


def filter_by_threshold(bboxes, scores, conf_thre, verbose=0):
    """
    Keep only the predictions whose score is at least ``conf_thre``.

    Returns the filtered (bboxes, scores) pair; the inputs are not modified.
    With ``verbose >= 2`` the surviving predictions are printed.
    """
    boxes_copy, scores_copy = bboxes.copy(), scores.copy()

    keep_mask = scores_copy >= conf_thre
    kept_boxes = boxes_copy[keep_mask]
    kept_scores = scores_copy[keep_mask]
    assert len(kept_boxes) == len(kept_scores)

    if verbose >= 2:
        print('final predictions:')
        print(kept_boxes, kept_scores)
    return kept_boxes, kept_scores
    

def predict(model, img, size=768, augment=False, verbose=0):
    """
    Run the detector on one image and return its boxes and scores.

    Args:
        model: callable invoked as ``model(img, size=..., augment=...)``; its
            result must provide ``.pandas().xyxy[0]`` with the columns
            xmin, ymin, xmax, ymax and confidence.
        img: image passed to the model.
        size: inference size passed to the model.
        augment: passed to the model as ``augment``.
        verbose: unused; kept for interface compatibility.

    Returns:
        (bboxes, scores): float arrays of shape (n, 4) in
        [xmin, ymin, xmax, ymax] order and of shape (n,).  With no
        detections the shapes are (0, 4) and (0,).  Both are always float so
        that later in-place division of the boxes works (the empty case used
        to return integer boxes, and its scores assignment was overwritten).
    """
    results = model(img, size=size, augment=augment)  # custom inference size
    preds = results.pandas().xyxy[0]
    if len(preds) == 0:
        return np.zeros((0, 4), dtype=float), np.zeros((0,), dtype=float)

    bboxes = preds[['xmin', 'ymin', 'xmax', 'ymax']].to_numpy(dtype=float)
    scores = preds['confidence'].to_numpy(dtype=float)
    return bboxes, scores
    
def format_prediction(bboxes, scores):
    """
    Serialise predictions as one space-separated string.

    Every box contributes "<score with 8 decimals> <xmin> <ymin> <w> <h>";
    an empty string is returned when there are no boxes.
    """
    if len(bboxes) == 0:
        return ''

    pieces = []
    for idx, box in enumerate(bboxes):
        xmin, ymin, w, h = box
        conf = scores[idx]
        pieces.append(f'{conf:.8f} {xmin} {ymin} {w} {h}')
    return ' '.join(pieces).strip(' ')

def show_img(img, bboxes, scores, gt_boxes, image_id, newly_detected, newly_suppressed, bbox_format='yolo', img_size=(800, 400)):
    """
    Render ground truth, predictions and the Seq-NMS changes on one image.

    Four layers are drawn in order, each with its own class id (and thus its
    own entry of the global ``colors`` palette): ground truth (1),
    predictions (0), newly detected boxes (2) and newly suppressed boxes
    (3).  ``newly_detected`` and ``newly_suppressed`` are (boxes, scores)
    pairs.  The image id is written in the top-left corner and the result is
    returned as a PIL image resized to ``img_size``.
    """
    layers = [
        (gt_boxes,
         ["GT" for _ in range(len(gt_boxes))],
         [1] * len(gt_boxes)),
        (bboxes,
         [f"Pred:{score:.2f}" for score in scores],
         [0] * len(bboxes)),
        (newly_detected[0],
         [f"Det:{score:.2f}" for score in newly_detected[1]],
         [2] * len(newly_detected[0])),
        (newly_suppressed[0],
         [f"Sup:{score:.2f}" for score in newly_suppressed[1]],
         [3] * len(newly_suppressed[0])),
    ]

    canvas = img
    for layer_boxes, layer_names, layer_ids in layers:
        canvas = draw_bboxes(img=canvas,
                             bboxes=layer_boxes,
                             classes=layer_names,
                             class_ids=layer_ids,
                             class_name=True,
                             colors=colors,
                             bbox_format=bbox_format,
                             line_thickness=2)

    cv2.putText(canvas, image_id, (0, 25), 0, 1, [255, 255, 255], thickness=1, lineType=cv2.LINE_AA)
    return Image.fromarray(canvas).resize(img_size)

In [None]:
class SeqNmsPostProcessor:
    """
    Rescore the detections of the current frame with Seq-NMS, using a
    sliding window of the most recent frames.

    Every call to apply() appends the current frame to a bounded queue, runs
    seq_nms over all queued frames and returns the rescored predictions of
    the newest frame.  Afterwards the scores stored in the queue are
    multiplied by ``decay``.
    """

    def __init__(self, num_frames=5, decay=0.9, verbose=0):
        """
        Args:
            num_frames: maximum number of frames kept in the window.
            decay: factor applied to all stored scores after each apply().
            verbose: any truthy value makes apply() also report the boxes
                that crossed CONF_THRE; >= 2 additionally prints them.
        """
        self.queue = deque(maxlen=num_frames)
        self.verbose = verbose
        self.num_frames = num_frames
        self.decay = decay
        if self.verbose:
            print(f"SeqNmsPostProcessor(num_frames={num_frames} decay={decay})")
        
    def _pad_sequence(self, frame_bboxes, frame_scores):
        """
        pad bboxes and frame_scores in order to shape (f, b, 4) and (f, b) respectively

        Every frame is extended with zero boxes / zero scores up to the
        largest number of predictions found in any frame.  The two lists are
        updated in place and returned.
        """
        max_len = max([len(item) for item in frame_scores])
        for idx, (bboxes, scores) in enumerate(zip(frame_bboxes, frame_scores)):
            pad_bboxes = np.zeros((max_len - len(scores), 4))
            pad_scores = np.zeros((max_len - len(scores)))
            frame_bboxes[idx] = np.concatenate([bboxes, pad_bboxes])
            frame_scores[idx] = np.concatenate([scores, pad_scores])
            
        return frame_bboxes, frame_scores
    
    def _drop_zero(self, bboxes, scores):
        """
        drop padded zero prediction

        Keeps only the entries whose score is > 0.
        """
        keep = scores > 0
        return bboxes[keep], scores[keep]
 
    def apply(self, bboxes, scores, linkage_threshold, nms_threshold, score_metric):
        """
        Add the current frame to the window and return its rescored predictions.

        Args:
            bboxes: boxes of the current frame, shape (n, 4).
            scores: scores of the current frame, shape (n,).
            linkage_threshold, nms_threshold, score_metric: forwarded to seq_nms.

        Returns:
            (bboxes, scores, newly_detected, newly_suppressed).  The first
            two are the predictions of the current frame with a score > 0
            after rescoring.  The last two are (boxes, scores) pairs with the
            boxes converted by voc2coco; they describe the boxes whose score
            crossed CONF_THRE upwards / downwards and are empty unless
            ``verbose`` is set.
        """
        bboxes, scores = bboxes.copy(), scores.copy()
        if self.verbose:
            original_scores = scores.copy()
        self.queue.append((bboxes, scores))
        frame_bboxes, frame_scores = zip(*self.queue)
        frame_bboxes, frame_scores = list(frame_bboxes), list(frame_scores)
        frame_bboxes, frame_scores = self._pad_sequence(frame_bboxes, frame_scores)
        frame_bboxes = np.stack(frame_bboxes)
        frame_scores = np.stack(frame_scores)
        # The return value of seq_nms is ignored, so it is assumed to rescore
        # the stacked arrays in place — TODO confirm against the seq_nms
        # implementation.  It is skipped when no frame in the window has any
        # box, because there is nothing to link or suppress.
        if frame_scores.shape[1] > 0:
            seq_nms(
                frame_bboxes,
                frame_scores,
                labels=[],
                linkage_threshold=linkage_threshold,
                nms_threshold=nms_threshold,
                score_metric=score_metric,
            )
        # decay
        for i in range(len(self.queue)):
            bb, ss = self.queue[i]
            self.queue[i] = (bb, ss * self.decay)

        last_bboxes, last_scores = frame_bboxes[-1, :], frame_scores[-1, :]
        bboxes, scores = self._drop_zero(last_bboxes, last_scores)

        if not self.verbose:
            return bboxes, scores, (np.zeros((0, 4)), np.zeros((0))), (np.zeros((0, 4)), np.zeros((0)))

        # Compare the scores before zero-score entries are dropped: the first
        # len(original_scores) slots of the last frame correspond one-to-one
        # to the input boxes (assuming seq_nms keeps the order within a frame
        # — TODO confirm).  Comparing against the filtered scores, as was
        # done before, mixes arrays of different length and alignment.
        num_input = len(original_scores)
        input_bboxes = last_bboxes[:num_input]
        rescored = last_scores[:num_input]
        newly_detected = (original_scores < CONF_THRE) & (rescored >= CONF_THRE)
        newly_suppressed = (original_scores >= CONF_THRE) & (rescored < CONF_THRE)
        if self.verbose >= 2:
            print("Newly Detected Labels:")
            print(f"  - Original Score: {original_scores[newly_detected]}")
            print(f"  - Updated Score: {rescored[newly_detected]}")
            print(f"  - Coordinate: {input_bboxes[newly_detected]}")
            print("Newly Suppressed Labels:")
            print(f"  - Original Score: {original_scores[newly_suppressed]}")
            print(f"  - Updated Score: {rescored[newly_suppressed]}")
            print(f"  - Coordinate: {input_bboxes[newly_suppressed]}")

        return (
            bboxes,
            scores,
            (voc2coco(input_bboxes[newly_detected]), rescored[newly_detected]),
            (voc2coco(input_bboxes[newly_suppressed]), rescored[newly_suppressed]),
        )

In [None]:
def remove_boxes_around_edges(bboxes, scores, width, height, pixels=40, score_thr=0.2):
    """
    Drop low-confidence boxes that lie in the border band of the image.

    Boxes must be in voc format (x1, y1, x2, y2).  A box is in the band when
    it ends before ``pixels`` on the left/top side or starts at or beyond
    ``width - pixels`` / ``height - pixels`` on the right/bottom side.  Such
    a box is removed only if its score is below ``score_thr``.

    Returns the remaining (bboxes, scores).
    """
    x1, y1, x2, y2 = bboxes[:, 0], bboxes[:, 1], bboxes[:, 2], bboxes[:, 3]

    in_left_band = x2 < pixels
    in_right_band = x1 >= width - pixels
    in_top_band = y2 < pixels
    in_bottom_band = y1 >= height - pixels
    near_edge = in_left_band | in_right_band | in_top_band | in_bottom_band

    keep = ~(near_edge & (scores < score_thr))
    return bboxes[keep], scores[keep]

In [None]:
def detect_pipeline(img, model, post_processor, verbose=0):
    """
    Detect boxes on one frame and post-process them.

    Steps: run the model (voc boxes x1, y1, x2, y2), normalize and
    re-normalize the coordinates (which leaves integer pixel coordinates),
    remove low-confidence boxes near the image border and, when the
    post-processor keeps at least two frames, rescore with Seq-NMS
    (https://arxiv.org/abs/1602.08465).

    Returns:
        (bboxes, scores, newly_detected, newly_suppressed); the last two are
        empty (boxes, scores) pairs when Seq-NMS is not applied.
    """
    img_h, img_w = img.shape[:2]

    # predict -> voc(x1, y1, x2, y2)
    raw_boxes, raw_scores = predict(model, img, size=IMG_SIZE, augment=AUGMENT, verbose=verbose)

    # normalize, then scale back to pixel coordinates
    pixel_boxes = renormalize_voc(normalize_voc(raw_boxes, img_h, img_w), img_h, img_w)

    # remove low confidence boxes around the edges
    kept_boxes, kept_scores = remove_boxes_around_edges(
        pixel_boxes, raw_scores, img_w, img_h, pixels=EDGE_PIXELS, score_thr=EDGE_SCORE_THR
    )

    if post_processor.num_frames < 2:
        # A single-frame window cannot link detections across frames.
        nothing_detected = (np.zeros((0, 4)), np.zeros((0)))
        nothing_suppressed = (np.zeros((0, 4)), np.zeros((0)))
        return kept_boxes, kept_scores, nothing_detected, nothing_suppressed

    final_boxes, final_scores, newly_detected, newly_suppressed = post_processor.apply(
        kept_boxes,
        kept_scores,
        linkage_threshold=SEQ_NMS_LK_THR,
        nms_threshold=SEQ_NMS_NMS_THR,
        score_metric=SEQ_NMS_SCORE_METRIC,
    )
    return final_boxes, final_scores, newly_detected, newly_suppressed

# Run Inference on Train

In [None]:
# Evaluate only frames of fold 8 that have at least one annotated box.
eval_df = df[(df.num_bbox >= 1) & (df.fold_id == 8)]
len(eval_df)

In [None]:
# Visualise `duration` consecutive rows of eval_df, ending at row
# start_frame + duration (selected with head(...).tail(...)).
start_frame = 370
duration = 20

model = load_model(CKPT_PATH, conf=CONF, iou=IOU)
post_processor = SeqNmsPostProcessor(num_frames=SEQ_NMS_NUM_FRAMES, decay=SEQ_NMS_DECAY, verbose=2)

for idx, item in enumerate(eval_df.head(start_frame + duration).tail(duration).itertuples()):
    path = item.image_path
    image_id = item.image_id
    # Ground-truth boxes as [x, y, width, height] rows.
    gt_boxes = item.annotations
    gt_boxes = [[b['x'], b['y'], b['width'], b['height']] for b in gt_boxes]
    gt_boxes = np.array(gt_boxes)

    # Reversing the channel axis turns OpenCV's BGR image into RGB.
    img = cv2.imread(path)[...,::-1]
    bboxes, scores, newly_detected, newly_suppressed = detect_pipeline(img, model, post_processor, verbose=2)

    # convert coordinate
    bboxes = voc2coco(bboxes)
    
    # filter by conf threshold
    bboxes, scores = filter_by_threshold(bboxes, scores, CONF_THRE, verbose=2)
    
    # display image with boxes
    display(show_img(img, bboxes, scores, gt_boxes, image_id, newly_detected, newly_suppressed, bbox_format='coco', img_size=(960, 540)))
