# MMDetection CascadeRCNN 

#### **The purpose of this notebook is to explore a method for object detection, concretely helmet detection, using the `MMDetection` package and the CascadeRCNN model.**

We'll use the labeled data set provided for training.

# MMDetection

MMDetection is an open source object detection toolbox based on PyTorch.

## Major features

* **Modular Design**

The detection framework consists of different components, and one can easily construct a customized object detection framework by combining different modules.

* **Support of multiple frameworks out of the box**

The toolbox directly supports popular and contemporary detection frameworks, e.g. Faster RCNN, Mask RCNN, RetinaNet, etc.

* **High efficiency**

All basic bbox and mask operations run on GPUs. The training speed is faster than or comparable to other codebases, including Detectron2, maskrcnn-benchmark and SimpleDet.

* **State of the art**

The toolbox stems from the codebase developed by the MMDet team, who won COCO Detection Challenge in 2018, and we keep pushing it forward.


### References

* MMDetection repository https://github.com/open-mmlab/mmdetection
* SIIM MMDetection+CascadeRCNN+Weight&Bias https://www.kaggle.com/sreevishnudamodaran/siim-mmdetection-cascadercnn-weight-bias
* NFL Helmet Assignment - Getting Started Guide https://www.kaggle.com/robikscube/nfl-helmet-assignment-getting-started-guide

**Please check my related notebooks**
* MMDetection player tracking for beginners https://www.kaggle.com/eneszvo/mmdetection-player-tracking-for-beginners
* https://zhuanlan.zhihu.com/p/385702286

It is recommended to install MMDetection with MIM, which automatically handles the dependencies of OpenMMLab projects, including mmcv and other Python packages.

In [None]:
# Install OpenMIM, then let MIM install MMDetection together with its dependencies.
!pip install openmim
!mim install mmdet
# Additionally clone the MMDetection repository and install it in editable mode from the checkout.
# NOTE(review): presumably the checkout is needed for its `configs/` directory, which a later cell loads from — confirm.
!git clone https://github.com/open-mmlab/mmdetection.git
%cd mmdetection
!pip install -q -e .
# Go back to the directory the notebook started in.
%cd ..

In [None]:
import sys
sys.path.insert(0, "./mmdetection")
import os
# Check Pytorch installation
import torch, torchvision
print(torch.__version__, torch.cuda.is_available())

# Check mmcv installation
from mmcv.ops import get_compiling_cuda_version, get_compiler_version
from mmcv import Config
print(get_compiling_cuda_version())
print(get_compiler_version())

# Check MMDetection installation
from mmdet.apis import set_random_seed

# Imports
import mmdet
from mmdet.apis import set_random_seed
from mmdet.datasets import build_dataset, build_dataloader
from mmdet.models import build_detector
from mmdet.apis import train_detector, single_gpu_test
from mmdet.apis import init_detector, inference_detector, show_result_pyplot

import mmcv
from mmcv.runner import load_checkpoint
from mmcv.parallel import MMDataParallel

import random
import numpy as np
from pathlib import Path
import datetime
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split
import cv2
import json
import matplotlib.pyplot as plt
from IPython.core.display import Video, display
import subprocess
import gc
import shutil


In [None]:
# Seed every random number generator used by the notebook.

global_seed = 0


def set_seed(seed=global_seed):
    """Seed MMDetection, Python, NumPy and PyTorch with ``seed``.

    Besides seeding, cuDNN is switched to deterministic mode with its
    benchmark auto-tuner disabled, and ``PYTHONHASHSEED`` is exported to
    the environment.
    """
    set_random_seed(seed, deterministic=False)

    # All remaining seeders take the seed as their only argument.
    for seed_fn in (random.seed, np.random.seed,
                    torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    os.environ['PYTHONHASHSEED'] = str(seed)


set_seed()

In [None]:
# Baseline model config shipped with the MMDetection checkout.
# Alternative baseline configs (paths relative to the configs directory):
#   vfnet/vfnet_r50_fpn_mdconv_c3-c5_mstrain_2x_coco.py
#   vfnet/vfnet_r50_fpn_mstrain_2x_coco.py
#   gfl/gfl_r50_fpn_mstrain_2x_coco.py
#   cascade_rcnn/cascade_rcnn_r50_fpn_20e_coco.py
mmdet_config_dir = "/kaggle/working/mmdetection/configs"
baseline_cfg_path = f"{mmdet_config_dir}/cascade_rcnn/cascade_rcnn_x101_32x4d_fpn_1x_coco.py"

# Parse the config file into the `cfg` object that the following cells modify.
cfg = Config.fromfile(baseline_cfg_path)

In [None]:
# Identify this run: model name, fold and job number are used to build
# the names of the output folder and of the saved config file.
# Alternative model names: 'vfnet_r50_fpn', 'cascade_rcnn_r50_fpn'
model_name = 'cascade_rcnn_x101_32x4d_fpn_1x'
fold = 0
job = 4

# Folder to store model logs and weight files
job_folder = f'/kaggle/working/job{job}_{model_name}_fold{fold}'
cfg.work_dir = job_folder

# Set the seed so that the results are more reproducible
cfg.seed = global_seed

# exist_ok=True keeps the cell re-runnable and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(job_folder, exist_ok=True)

print("Job folder:", job_folder)

In [None]:
import pandas as pd

# Read the image-level labels: one row per ground truth bounding box
extra_df = pd.read_csv('../input/nfl-health-and-safety-helmet-assignment/image_labels.csv')
print('Number of ground truth bounding boxes: ', len(extra_df))

# Give every distinct label an integer id, in order of first appearance
unique_labels = extra_df.label.unique()
label_to_id = dict(zip(unique_labels, range(len(unique_labels))))
print('Unique labels: ', label_to_id)

In [None]:
# Set the number of classes on every bbox head in cfg.model.roi_head.bbox_head
# (the config holds a list of heads, so each one is updated).
for head in cfg.model.roi_head.bbox_head:
    head.num_classes = 5

cfg.gpu_ids = [0]

# Setting pretrained model in the init_cfg which is required
# for transfer learning as per the latest MMdetection update.
# The checkpoint has to match the backbone of the selected baseline config
# (ResNeXt-101 32x4d); a ResNet-50 checkpoint assigned here previously was
# overwritten immediately and has been removed.
cfg.model.backbone.init_cfg = dict(type='Pretrained', checkpoint='open-mmlab://resnext101_32x4d')
cfg.model.pop('pretrained', None)

# Epochs for the runner that runs the workflow
# Consider increasing the number of epochs for better performance
cfg.runner.max_epochs = 1
cfg.total_epochs = 1

# Learning rate of optimizers.
# The LR is divided by 8 since the config file is originally for 8 GPUs
cfg.optimizer.lr = 0.02/8

# Learning rate scheduler config used to register LrUpdater hook
cfg.lr_config = dict(
    policy='CosineAnnealing', # The scheduler policy; other policies such as Step, Cyclic, etc. are also supported. Refer to details of supported LrUpdater from https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py#L9.
    by_epoch=False,
    warmup='linear', # The warmup policy, also support `exp` and `constant`.
    warmup_iters=500, # The number of iterations for warmup
    warmup_ratio=0.001, # The ratio of the starting learning rate used for warmup
    min_lr=1e-07)

# config to register logger hook
cfg.log_config.interval = 20 # Interval to print the log

# Config to set the checkpoint hook, Refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/checkpoint.py for implementation.
cfg.checkpoint_config.interval = 1 # The save interval is 1

Create a COCO-format data set, which is the standard annotation format in object detection.

In [None]:
def create_ann_file(df, category_id, image_dir=None):
    """Build a COCO-style annotation dictionary for the boxes in ``df``.

    Only images that have at least one box in ``df`` are listed, so the
    result describes exactly the subset of data that ``df`` represents.

    Args:
        df: DataFrame with one row per bounding box and the columns
            ``image``, ``label``, ``left``, ``top``, ``width`` and ``height``.
        category_id: Mapping from label name to integer category id. It
            defines both the ``categories`` section and the ``category_id``
            of every annotation.
        image_dir: Directory holding the image files. Defaults to the
            module-level ``TRAIN_PATH``.

    Returns:
        dict with the keys ``info``, ``licenses``, ``images``, ``type``,
        ``annotations`` and ``categories``.

    Raises:
        ValueError: If an image file cannot be decoded by OpenCV.
        KeyError: If a row's label is missing from ``category_id``.
    """
    if image_dir is None:
        image_dir = TRAIN_PATH

    now = datetime.datetime.now()

    data = dict(
        info=dict(
            description='NFL-Helmet-Assignment',
            url=None,
            version=None,
            year=now.year,
            contributor=None,
            date_created=now.strftime('%Y-%m-%d %H:%M:%S.%f'),
        ),
        licenses=[dict(
            url=None,
            id=0,
            name=None,
        )],
        # license, url, file_name, height, width, date_captured, id
        images=[],
        type='instances',
        # segmentation, area, iscrowd, image_id, bbox, category_id, id
        annotations=[],
        # supercategory, id, name -- derived from the mapping that is also
        # used for the annotations, so the two can never disagree.
        categories=[
            dict(supercategory=None, id=class_id, name=class_name)
            for class_name, class_id in sorted(category_id.items(),
                                               key=lambda item: item[1])
        ],
    )

    # Group the boxes once instead of filtering the whole frame per image.
    boxes_by_image = {name: group
                      for name, group in df.groupby('image', sort=False)}

    # Sorted so that image ids do not depend on the directory listing order.
    image_names = sorted(name for name in os.listdir(image_dir)
                         if name in boxes_by_image)

    box_id = 0
    # tqdm wraps the list (not the enumerate) so the bar knows its total.
    for image_id, image in enumerate(tqdm(image_names)):
        image_path = os.path.join(image_dir, image)
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise ValueError(f'Could not read image: {image_path}')
        height, width = img.shape[:2]

        data['images'].append({
            'license': 0,
            'url': None,
            'file_name': image,
            'height': height,
            'width': width,
            'date_captured': None,
            'id': image_id,
        })

        for row in boxes_by_image[image].itertuples(index=False):
            data['annotations'].append({
                'id': box_id,
                'image_id': image_id,
                'category_id': category_id[row.label],
                'area': round(row.width * row.height, 1),
                # COCO boxes are [x, y, width, height]
                'bbox': [row.left, row.top, row.width, row.height],
                'iscrowd': 0,
            })
            box_id += 1

    return data

In [None]:
TRAIN_PATH = '../input/nfl-health-and-safety-helmet-assignment/images'
extra_df = pd.read_csv('../input/nfl-health-and-safety-helmet-assignment/image_labels.csv')

category_id = {'Helmet':0, 'Helmet-Blurred':1,
               'Helmet-Difficult':2, 'Helmet-Sideline':3,
               'Helmet-Partial':4}

# Split by image, not by box row: every row of extra_df is a single box, so a
# row-level split would put boxes of the same image into both sets and the
# validation score would be measured on images seen during training.
train_images, val_images = train_test_split(
    extra_df.image.unique(), test_size=0.2, random_state=42)
df_train = extra_df[extra_df.image.isin(train_images)]
df_val = extra_df[extra_df.image.isin(val_images)]

ann_file_train = create_ann_file(df_train, category_id)
ann_file_val = create_ann_file(df_val, category_id)

In [None]:
# Write both annotation dictionaries to ../tmp as JSON files
os.makedirs('../tmp', exist_ok=True)

for json_path, ann_data in (('../tmp/ann_file_train.json', ann_file_train),
                            ('../tmp/ann_file_val.json', ann_file_val)):
    with open(json_path, 'w') as f:
        json.dump(ann_data, f, indent=4)

In [None]:
# Dataset type, this will be used to define the dataset
cfg.dataset_type = 'CocoDataset'
cfg.classes = ('Helmet', 'Helmet-Blurred', 'Helmet-Difficult', 'Helmet-Sideline',
               'Helmet-Partial')

# Every split reads its images from TRAIN_PATH; the test split reuses the
# validation annotations.
for split_name, ann_path in (('train', '../tmp/ann_file_train.json'),
                             ('val', '../tmp/ann_file_val.json'),
                             ('test', '../tmp/ann_file_val.json')):
    split_cfg = cfg.data[split_name]
    split_cfg.img_prefix = TRAIN_PATH  # Prefix of image path
    split_cfg.classes = cfg.classes
    split_cfg.ann_file = ann_path
    split_cfg.type = 'CocoDataset'

cfg.data.samples_per_gpu = 4  # Batch size of a single GPU
cfg.data.workers_per_gpu = 2  # Worker to pre-fetch data for each single GPU

In [None]:
# The config to build the evaluation hook, refer to https://github.com/open-mmlab/mmdetection/blob/master/mmdet/core/evaluation/eval_hooks.py#L7 for more details.
cfg.evaluation.metric = 'bbox' # Metrics used during evaluation

# Set the epoch interval to perform evaluation
cfg.evaluation.interval = 1

# Set the iou threshold of the mAP calculation during evaluation
cfg.evaluation.iou_thrs = [0.5]

# cfg.evaluation.save_best='bbox_mAP_50'

In [None]:
# Albumentations transforms applied during training.
# Consider including other transformations, e.g. Blur, CLAHE, Equalize,
# RandomRotate90, or a OneOf of GaussianBlur / MedianBlur.
albu_train_transforms = [
    dict(type='ShiftScaleRotate', shift_limit=0.0625,
         scale_limit=0.15, rotate_limit=15, p=0.4),
    dict(type='RandomBrightnessContrast', brightness_limit=0.2,
         contrast_limit=0.2, p=0.5),
]

cfg.train_pipeline = [
    dict(type='LoadImageFromFile'), # First pipeline to load images from file path
    dict(type='LoadAnnotations',
         with_bbox=True, # Whether to use bounding box, True for detection
         # The annotation files contain boxes only (no segmentation), so
         # instance masks must not be loaded.
         with_mask=False),
    dict(type='Resize', # Augmentation pipeline that resize the images and their annotations
         img_scale=(1333, 800), # The largest scale of image
         keep_ratio=True), # whether to keep the ratio between height and width.
    dict(type='RandomFlip', flip_ratio=0.5), # Augmentation pipeline that flip the images and their annotations
    dict(
        type='Albu',
        transforms=albu_train_transforms, # transformations defined above
        bbox_params=dict(
            type='BboxParams',
            format='pascal_voc',
            label_fields=['gt_labels'],
            min_visibility=0.0,
            filter_lost_elements=True),
        keymap=dict(img='image', gt_bboxes='bboxes'),
        update_pad_shape=False,
        skip_img_without_anno=True),
    dict(
        type='Normalize', # Augmentation pipeline that normalize the input images
        mean=[123.675, 116.28, 103.53], # These keys are the same of img_norm_cfg since the
        std=[58.395, 57.12, 57.375], # keys of img_norm_cfg are used here as arguments
        to_rgb=True),
    dict(type='Pad', size_divisor=32), # Padding config, The number the padded images should be divisible
    dict(type='DefaultFormatBundle'), # Default format bundle to gather data in the pipeline
    dict(type='Collect', # Pipeline that decides which keys in the data should be passed to the detector
         keys=['img', 'gt_bboxes', 'gt_labels']),
]
cfg.test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug', # An encapsulation that encapsulates the testing augmentations
        img_scale=(1333, 800), # Decides the largest scale for testing, used for the Resize pipeline
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'), # Though RandomFlip is added in pipeline, it is not used because flip=False
            dict(
                type='Normalize',
                mean=[123.675, 116.28, 103.53],
                std=[58.395, 57.12, 57.375],
                to_rgb=True),
            dict(type='Pad', size_divisor=32), # Padding config to pad images divisible by 32.
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']), # Collect pipeline that collect necessary keys for testing.
        ]),
]

# The datasets are built from cfg.data.<split>.pipeline, which was filled in
# when the base config was loaded. Assigning cfg.train_pipeline /
# cfg.test_pipeline alone therefore has no effect on training or evaluation;
# the pipelines have to be attached to the dataset configs explicitly.
# NOTE(review): the Albu transform needs the `albumentations` package — confirm it is installed.
cfg.data.train.pipeline = cfg.train_pipeline
cfg.data.val.pipeline = cfg.test_pipeline
cfg.data.test.pipeline = cfg.test_pipeline

In [None]:
# The config is stored in the job folder under the baseline file name,
# prefixed with the job number.
cfg_filename = f'job{job}_{os.path.basename(baseline_cfg_path)}'
cfg_path = os.path.join(job_folder, cfg_filename)
print(cfg_path)

# Save config file for inference later
cfg.dump(cfg_path)
print(f'Config:\n{cfg.pretty_text}')

In [None]:
# Build the detector and the training dataset

model = build_detector(cfg.model,
                       train_cfg=cfg.get('train_cfg'),
                       test_cfg=cfg.get('test_cfg'))
model.init_weights()
datasets = [build_dataset(cfg.data.train)]

# Keep the class names with the model and in the checkpoint metadata, as the
# official MMDetection training script does. Without them a checkpoint carries
# no class names and visualisations of its predictions fall back to COCO labels.
model.CLASSES = datasets[0].CLASSES
cfg.checkpoint_config.meta = dict(CLASSES=datasets[0].CLASSES)

In [None]:
# Train the detector on the training dataset in a single, non-distributed process.
# NOTE(review): validate=True presumably enables evaluation on cfg.data.val as configured in cfg.evaluation — confirm.
train_detector(model, datasets[0], cfg, distributed=False, validate=True)

In [None]:
# Get the best epoch number
import json
from collections import defaultdict

# JSON-lines training log inside the job folder.
# NOTE(review): the file name presumably starts with "None" because train_detector is called without a timestamp — confirm.
log_file = f'{job_folder}/None.log.json'

# Source: mmdetection/tools/analysis_tools/analyze_logs.py
def load_json_logs(json_logs):
    """Load JSON-lines training logs and group their values by epoch.

    Args:
        json_logs: Iterable of paths to log files containing one JSON
            object per line.

    Returns:
        A list with one dict per log file. Each dict maps an epoch number
        to a ``defaultdict(list)`` whose keys are the logged fields (e.g.
        ``memory``, ``bbox_mAP``) and whose values are the lists of all
        values logged for that field during the epoch, in file order.
        Lines without an ``epoch`` field and blank lines are skipped.
    """
    log_dicts = [dict() for _ in json_logs]
    for json_log, log_dict in zip(json_logs, log_dicts):
        with open(json_log, 'r') as log_handle:
            for line in log_handle:
                line = line.strip()
                # Blank lines would make json.loads raise.
                if not line:
                    continue
                log = json.loads(line)
                # skip lines without `epoch` field
                if 'epoch' not in log:
                    continue
                epoch = log.pop('epoch')
                if epoch not in log_dict:
                    log_dict[epoch] = defaultdict(list)
                for k, v in log.items():
                    log_dict[epoch][k].append(v)
    return log_dicts

log_dict = load_json_logs([log_file])

# Pick the epoch by its logged number instead of by list position, so the
# result stays correct when some epochs were not evaluated. `.get` is used
# because indexing the defaultdict would insert an empty list for epochs
# that have no bbox_mAP entry.
epoch_scores = {epoch: metrics['bbox_mAP'][0]
                for epoch, metrics in log_dict[0].items()
                if metrics.get('bbox_mAP')}
if not epoch_scores:
    raise RuntimeError(f'No bbox_mAP entries found in {log_file}')

# max() returns the first epoch with the highest score
best_epoch = max(epoch_scores, key=epoch_scores.get)
best_epoch

In [None]:
# Example test video to run the trained detector on
data_dir = '/kaggle/input/nfl-health-and-safety-helmet-assignment/'
example_video = f'{data_dir}/test/57906_000718_Endzone.mp4'

# Embed the video in the notebook, scaled down from 1280x720
frac = 0.65
scaled_height, scaled_width = (int(side*frac) for side in (720, 1280))
display(Video(example_video, embed=True, height=scaled_height, width=scaled_width))

The process of converting a video into frames, predicting bounding boxes and converting the frames back into a video is described here: https://www.kaggle.com/eneszvo/mmdetection-player-tracking-for-beginners

In [None]:
# Split the example video into frames with ffmpeg
img_ext = 'png'
image_name = '57906_000718_Endzone'
frame_dir = '/kaggle/tmp/mp4_img/'
os.makedirs(frame_dir, exist_ok=True)

# Argument-list form: no shell is involved, so paths need no quoting.
# check=True stops here if ffmpeg fails instead of failing later on
# missing frames.
cmd = ['ffmpeg', '-i', example_video, '-qscale:v', '2',
       f'{frame_dir}/{image_name}_%d.{img_ext}']
print(' '.join(cmd))
subprocess.run(cmd, check=True)

frame_bbox_dir = '/kaggle/tmp/mp4_img_bbox/'
os.makedirs(frame_bbox_dir, exist_ok=True)
checkpoint = f'{job_folder}/epoch_{best_epoch}.pth'
print("Loading weights from:", checkpoint)
cfg = Config.fromfile(cfg_path)

# Build the detector once and reuse it for every frame; rebuilding it from
# the checkpoint for each frame only adds loading time.
model = init_detector(cfg, checkpoint, device='cuda:0')

# Draw the helmet class names stored in the saved config above the boxes.
helmet_classes = cfg.get('classes')
if helmet_classes:
    model.CLASSES = tuple(helmet_classes)

for f in tqdm(os.listdir(frame_dir)):
    img = f'{frame_dir}/{f}'
    # get results
    result = inference_detector(model, img)
    # save image with bboxes into out_file
    model.show_result(img, result, out_file=os.path.join(frame_bbox_dir, f))

# Release the model before the video is assembled
del model
gc.collect()

# make video from frames
video_name = '57906_000718_Endzone_fps60.mp4'
tmp_video_path = os.path.join('/kaggle/working/', f'tmp_{video_name}')
video_path = os.path.join('/kaggle/working/', video_name)

frame_rate = 60

# Order the frames by the number ffmpeg put at the end of the file name
images = sorted(os.listdir(frame_bbox_dir),
                key=lambda name: int(Path(name).stem.split('_')[-1]))
if not images:
    raise RuntimeError(f'No annotated frames found in {frame_bbox_dir}')

frame = cv2.imread(os.path.join(frame_bbox_dir, images[0]))
height, width = frame.shape[:2]

video = cv2.VideoWriter(tmp_video_path, cv2.VideoWriter_fourcc(*'MP4V'),
                        frame_rate, (width, height))
try:
    for f in images:
        video.write(cv2.imread(os.path.join(frame_bbox_dir, f)))
finally:
    # Always finalise the file, even if a frame could not be written
    video.release()

# Not all browsers support the codec, we will re-load the file at tmp_video_path
# and convert to a codec that is more broadly readable using ffmpeg

if os.path.exists(video_path):
    os.remove(video_path)

# check=True: keep the temporary video if the conversion fails
subprocess.run(["ffmpeg", "-i", tmp_video_path, "-crf", "18", "-preset", "veryfast",
                "-vcodec", "libx264", video_path], check=True)

os.remove(tmp_video_path)

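Note: the labels drawn above the boxes come from `model.CLASSES` (MMDetection falls back to the COCO class names when the checkpoint carries no class metadata). In any case, we can see that the results are good even with one epoch of training.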

In [None]:
# Embed the annotated video in the notebook, scaled down from 1280x720
frac = 0.65
scaled_height, scaled_width = (int(side*frac) for side in (720, 1280))
display(Video(video_path, embed=True, height=scaled_height, width=scaled_width))

In [None]:
# Optional clean-up: delete the extracted and the annotated frames.
# A failure is reported and the loop continues with the next directory.

for path in (frame_dir, frame_bbox_dir):
    try:
        shutil.rmtree(path)
    except OSError as e:
        print(f"Error: {e.filename} - {e.strerror}.")