# Helmet detection using MaskRCNN without downscaling

The goal of this notebook is to share a MaskRCNN model that I trained for the previous [NFL Impact Detection competition](https://www.kaggle.com/c/nfl-impact-detection). While this model takes images of 512x512 pixels, this notebook also contains a set of functions that scan over the image at its original resolution. All the detected boxes corresponding to helmets are then merged, and filtered when the overlap between 2 boxes is too high. I hope the code is readable enough, as I did not initially mean to share it, and that it will help people appreciate how good the baseline helmet boxes are before building their own!

In [None]:
import sys
from pathlib import Path
import os
from datetime import datetime
import time
import random
import cv2
import pandas as pd
import numpy as np
import albumentations as A
import matplotlib.pyplot as plt
from albumentations.pytorch.transforms import ToTensorV2
from sklearn.model_selection import StratifiedKFold
from glob import glob
import pandas as pd
import gc
import warnings
from scipy import ndimage
from skimage.measure import find_contours
from matplotlib.patches import Polygon
import copy

warnings.filterwarnings("ignore")

# Get the MaskRCNN ready

In the following cell, the first line changes the current directory to the MaskRCNN folder. This is very important because all the relative paths used later are resolved from there!

In [None]:
# Alternative dataset folder, left disabled (presumably an earlier location).
# os.chdir('../input/mask-rcnn')
# Make the Mask R-CNN dataset folder the working directory: every relative
# path used later in this notebook ("../../input", "../../working") is
# resolved from here.
os.chdir('../input/maskrcnn-tf2-keras')
# NOTE(review): DATA_DIR is not referenced by any code visible in this file.
DATA_DIR = Path('../kaggle/input/')
# Input root, expressed relative to the working directory set above.
ROOT_DIR = "../../input"
# Add the Mask R-CNN sources to the import path so `mrcnn` can be imported.
sys.path.append(ROOT_DIR+'/maskrcnn-tf2-keras')
from mrcnn.config import Config

from mrcnn import utils
import mrcnn.model as modellib
from mrcnn import visualize
from mrcnn.model import log

The MaskRCNN has many parameters. We have a fairly standard set of parameters with an input size of 512 pixels and a ResNet50 as a backbone.

In [None]:
# Number of foreground categories (helmets only).
NUM_CATS=1
# Side length, in pixels, of the square tiles fed to the network.
IMAGE_SIZE=512
class HelmetConfig(Config):
    """Mask R-CNN settings for the helmet detector.

    Overrides attributes of the `mrcnn` `Config` base class: 512x512 inputs
    that are not resized by the library, and a ResNet50 backbone.
    """
    NAME = "helmet_detection"
    NUM_CLASSES = NUM_CATS + 1 # +1 for the background class
    
    GPU_COUNT = 1
    IMAGES_PER_GPU = 2 # a memory error occurs when IMAGES_PER_GPU is too high
    
    BACKBONE = 'resnet50'
    
    # Min and max dimension are equal and resizing is disabled, so inputs are
    # expected to already be IMAGE_SIZE x IMAGE_SIZE.
    IMAGE_MIN_DIM = IMAGE_SIZE
    IMAGE_MAX_DIM = IMAGE_SIZE    
    IMAGE_RESIZE_MODE = 'none'
    
    BACKBONE_STRIDES = [4, 8, 16, 32, 64]
    # NOTE(review): BACKBONESHAPE is not read anywhere in this file and has
    # the same values as RPN_ANCHOR_SCALES; confirm the base Config uses it.
    BACKBONESHAPE = (8, 16, 24, 32, 48)
    RPN_ANCHOR_SCALES = (8,16,24,32,48)
    RPN_TRAIN_ANCHORS_PER_IMAGE = 300
    POST_NMS_ROIS_TRAINING = 800
    POST_NMS_ROIS_INFERENCE = 700
    MAX_GROUNDTRUTH_INSTANCES = 50
    TRAIN_ROI_PER_IMAGE = 300
    ROI_POSITIVE_RATIO = 0.33
    DETECTION_MAX_INSTANCES = 300
    DETECTION_MIN_CONFIDENCE = 0.7
    DETECTION_NMS_THRESHOLD = 0.5
    
# Instantiate the configuration and print its values.
config = HelmetConfig()
config.display()

In [None]:
# Fix overlapping masks
def refine_masks(masks, rois):
    """Make instance masks mutually exclusive and refit their boxes.

    Masks are processed from the smallest to the largest area; each mask
    loses the pixels already claimed by a smaller one. The box of every mask
    that still has pixels is then replaced by the tight bounds of that mask.

    Args:
        masks: Array of shape (height, width, n_instances). Modified in place.
        rois: Array of shape (n_instances, 4) holding [y1, x1, y2, x2] rows.
            Modified in place.

    Returns:
        The same ``masks`` and ``rois`` objects, after modification.
    """
    areas = np.sum(masks.reshape(-1, masks.shape[-1]), axis=0)
    claimed = np.zeros(masks.shape[:-1], dtype=bool)
    for m in np.argsort(areas):
        masks[:, :, m] = np.logical_and(masks[:, :, m], np.logical_not(claimed))
        claimed = np.logical_or(masks[:, :, m], claimed)
    for m in range(masks.shape[-1]):
        ys, xs = np.nonzero(masks[:, :, m])
        # Test the number of pixels, not the index values: a mask whose only
        # pixel sits at (0, 0) has all-zero indices but is not empty.
        if ys.size:
            rois[m, :] = [ys.min(), xs.min(), ys.max(), xs.max()]
    return masks, rois

def resize_image(image_path):
    """Load an image from disk as RGB and resize it to the model input size.

    Args:
        image_path: Path of the image file to read.

    Returns:
        An RGB array of IMAGE_SIZE x IMAGE_SIZE pixels.
    """
    rgb = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
    target_size = (IMAGE_SIZE, IMAGE_SIZE)
    return cv2.resize(rgb, target_size, interpolation=cv2.INTER_AREA)

def decode_rle(rle, height, width):
    """Decode a run-length-encoded string into a binary mask.

    The string is a whitespace-separated sequence of ``start length`` pairs,
    where ``start`` is a 1-based index into the flattened mask. Runs are laid
    out column by column: flat index ``col * height + row``.

    Args:
        rle: The run-length-encoded string.
        height: Number of rows of the mask.
        width: Number of columns of the mask.

    Returns:
        A ``uint8`` array of shape (height, width) with 1 inside the runs.
    """
    tokens = rle.split()
    starts = np.asarray(tokens[0::2], dtype=int) - 1  # to 0-based indices
    lengths = np.asarray(tokens[1::2], dtype=int)
    flat = np.zeros(height * width, dtype=np.uint8)
    for lo, hi in zip(starts, starts + lengths):
        flat[lo:hi] = 1
    # Reshape to (width, height) before transposing so that the result is
    # (height, width) for non-square masks too; square masks are unaffected.
    return flat.reshape((width, height)).T

def annotations_to_mask(annotations, height, width):
    """Stack a list of RLE codes into a (height, width, n) mask array.

    Args:
        annotations: List of run-length-encoded strings, one per instance.
        height: Number of rows of each mask.
        width: Number of columns of each mask.

    Returns:
        An array of shape (height, width, len(annotations)); channel ``i``
        holds the mask decoded from ``annotations[i]``.

    Raises:
        TypeError: If ``annotations`` is not a list.
    """
    if not isinstance(annotations, list):
        # The message previously referenced an undefined name, which raised
        # NameError instead of the intended TypeError.
        error_message = "{} is expected to be a list but received {}".format(annotations, type(annotations))
        raise TypeError(error_message)
    mask = np.zeros((height, width, len(annotations)))
    for i, rle_code in enumerate(annotations):
        mask[:, :, i] = decode_rle(rle_code, height, width)
    return mask

In [None]:
class InferenceConfig(HelmetConfig):
    """HelmetConfig variant used for inference, with one image per GPU."""
    GPU_COUNT = 1
    # model.detect is called with a single tile at a time in this notebook.
    IMAGES_PER_GPU = 1
    # The values below repeat the ones already set on HelmetConfig.
    IMAGE_MIN_DIM = IMAGE_SIZE
    IMAGE_MAX_DIM = IMAGE_SIZE    
    IMAGE_RESIZE_MODE = 'none'
    DETECTION_NMS_THRESHOLD = 0.5

inference_config = InferenceConfig()

# Build the network in inference mode; this `model` global is the one used
# by predict_on_image.
model = modellib.MaskRCNN(mode='inference', 
                          config=inference_config,
                          model_dir="")

# Load the trained helmet-detection weights, matching layers by name.
model.load_weights("../../input/maskrcnn-helmet-detection/mask_rcnn_helmet_detection.h5", by_name=True)

# Functions to scan over the image and clean outputs

In [None]:
def _tile_keep_window(y_origin, x_origin, y_margin, x_margin, tile_size=512, last_x_origin=768):
    """Return (y_start, y_end, x_start, x_end) of the zone of a tile whose detections are kept."""
    # A margin is trimmed on every side that borders another tile, so that
    # neighbouring tiles keep adjacent, non-overlapping zones. Tiles starting
    # at x >= last_x_origin are the right-most column; any y_origin > 0 is
    # treated as the bottom row.
    x_start = 0 if x_origin == 0 else x_margin
    x_end = tile_size if x_origin >= last_x_origin else tile_size - x_margin
    y_start = 0 if y_origin == 0 else y_margin
    y_end = tile_size - y_margin if y_origin == 0 else tile_size
    return y_start, y_end, x_start, x_end


def predict_on_image(img, y_origin, x_origin, y_margin, x_margin):
    """Run the detector on one 512x512 tile and return boxes in image coordinates.

    A detection is kept only when the centroid of its mask falls inside the
    tile's keep window (the tile minus the margins shared with neighbouring
    tiles), so each object is reported by a single tile.

    Args:
        img: Full-resolution BGR image, as returned by ``cv2.imread``.
        y_origin: Row of the tile's top-left corner in ``img``.
        x_origin: Column of the tile's top-left corner in ``img``.
        y_margin: Rows trimmed on a side shared with a vertical neighbour.
        x_margin: Columns trimmed on a side shared with a horizontal neighbour.

    Returns:
        Array of ``[x_start, y_start, x_end, y_end]`` rows expressed in the
        coordinates of ``img``; an empty array when nothing is kept.
    """
    tile = np.asarray(img)[y_origin:y_origin+512, x_origin:x_origin+512]
    tile = cv2.cvtColor(tile, cv2.COLOR_BGR2RGB)
    r = model.detect([tile])[0]

    box_list = []
    if r['masks'].size == 0:
        return np.array(box_list)

    y_start_box, y_end_box, x_start_box, x_end_box = _tile_keep_window(
        y_origin, x_origin, y_margin, x_margin)

    # Visit every detection, including the last one.
    for idx in range(r['masks'].shape[-1]):
        mask = cv2.resize(r['masks'][:, :, idx].astype('uint8'),
                          (tile.shape[1], tile.shape[0]), interpolation=cv2.INTER_NEAREST)
        # center_of_mass returns coordinates in axis order: (row, column).
        y_centroid, x_centroid = ndimage.center_of_mass(mask)
        if x_start_box < x_centroid < x_end_box and y_start_box < y_centroid < y_end_box:
            # rois rows are [y1, x1, y2, x2] relative to the tile.
            y1, x1, y2, x2 = r['rois'][idx]
            box_list.append(np.array([x_origin+x1, y_origin+y1, x_origin+x2, y_origin+y2]))

    return np.array(box_list)

The hard-coded values in the function below define how the original image is split into 512x512-pixel tiles; the MaskRCNN is then run in inference mode on each tile.

In [None]:
def get_helmet_boxes(img_path):
    """Detect helmets on a full-resolution frame by scanning it tile by tile.

    The frame is covered by a 2x3 grid of overlapping tiles (row origins 0 and
    208, column origins 0, 384 and 768) and predict_on_image is run on each.

    Args:
        img_path: Path of the frame to read.

    Returns:
        Array with one box per row, gathered from all the tiles.
    """
    frame = cv2.imread(img_path)
    tile_tops = [0, 208]
    tile_lefts = [0, 384, 768]
    margin_x, margin_y = 64, 152

    collected = [
        box
        for top in tile_tops
        for left in tile_lefts
        for box in predict_on_image(frame, top, left, margin_y, margin_x)
    ]
    return np.array(collected)

When all the helmets have been detected, the set of functions below cleans the outputs. If 2 boxes have less than 7 pixels of difference between their borders, one of the boxes will be removed.

In [None]:
def boxes_overlap(box1, box2):
    """Tell whether two boxes nearly coincide.

    Args:
        box1: First box; its first four coordinates are compared.
        box2: Second box, in the same layout as ``box1``.

    Returns:
        True when each of the four coordinates differs by strictly less than
        7 pixels between the two boxes, False otherwise.
    """
    tolerance = 7
    return all(abs(box1[i] - box2[i]) < tolerance for i in (0, 2, 1, 3))

def remove_box(box_to_remove, box_list):
    """Return the boxes of ``box_list`` that differ from ``box_to_remove``.

    Args:
        box_to_remove: Array holding the coordinates of the box to drop.
        box_list: Iterable of box arrays.

    Returns:
        A new array with every box having at least one coordinate different
        from ``box_to_remove``; all exact copies are dropped.
    """
    survivors = [box for box in box_list if (box_to_remove != box).any()]
    return np.array(survivors)

def check_overlap(helmet_boxes):
    """Drop near-duplicate boxes, keeping the last box of each group.

    A box is discarded when a box located later in ``helmet_boxes`` nearly
    coincides with it according to boxes_overlap. Boxes are compared by
    position in the sequence rather than by value, so exact copies of a box
    are reduced to a single one as well.

    Args:
        helmet_boxes: Sequence (list or array) of boxes.

    Returns:
        List of the boxes that were kept, in their original order.
    """
    clean_helmet_box_list = []
    for idx, helmet_box in enumerate(helmet_boxes):
        later_boxes = helmet_boxes[idx + 1:]
        if not any(boxes_overlap(helmet_box, other_box) for other_box in later_boxes):
            clean_helmet_box_list.append(helmet_box)
    return clean_helmet_box_list

In [None]:
def bbox_from_mask(img):
    """Return the tight bounding box of the non-zero pixels of a 2-D mask.

    Args:
        img: 2-D array; any non-zero value counts as part of the object.

    Returns:
        ``[cmin, rmin, cmax, rmax]``: first/last column and row containing a
        non-zero pixel (both bounds inclusive).

    Raises:
        IndexError: If the mask has no non-zero pixel.
    """
    occupied_rows = np.flatnonzero(np.any(img, axis=1))
    occupied_cols = np.flatnonzero(np.any(img, axis=0))
    first_row, last_row = occupied_rows[[0, -1]]
    first_col, last_col = occupied_cols[[0, -1]]
    return [first_col, first_row, last_col, last_row]

The 2 main functions: one creates the tiles and runs the prediction on each tile, the other simply displays the boxes on the original image.

In [None]:
def detect_helmets(img_path):
    """Detect the helmets of a frame and remove near-duplicate boxes.

    Args:
        img_path: Path of the frame to analyse.

    Returns:
        The boxes found by get_helmet_boxes, filtered by check_overlap.
    """
    return check_overlap(get_helmet_boxes(img_path))

def display_helmet_detected(helmet_boxes, img_path):
    """Show a frame with every helmet box painted as a filled yellow rectangle.

    Args:
        helmet_boxes: Iterable of ``[x_start, y_start, x_end, y_end]`` boxes
            in pixel coordinates of the image.
        img_path: Path of the image to display.
    """
    # cv2.imread returns BGR while matplotlib expects RGB; convert so that
    # the frame is displayed with its true colours.
    masked_image = cv2.cvtColor(cv2.imread(img_path), cv2.COLOR_BGR2RGB)
    for box in helmet_boxes:
        x_start, y_start, x_end, y_end = (int(v) for v in box[:4])
        # Broadcasting the colour also works for boxes clipped by the border.
        masked_image[y_start:y_end, x_start:x_end, :] = (255, 255, 0)
    plt.figure(figsize=(15,10))
    plt.imshow(masked_image)
    plt.show()

# Extracting the frames from MP4 files

In [None]:
def mk_images(video_name, video_labels, video_dir, out_dir, only_with_impact=True):
    """Extract the frames of a video as PNG files.

    Frames are numbered from 1 and written to
    ``{out_dir}/{video name without .mp4}_{frame}.png``.

    Args:
        video_name: File name of the video inside ``video_dir``.
        video_labels: DataFrame with ``video``, ``frame`` and ``impact``
            columns; only read when ``only_with_impact`` is True.
        video_dir: Folder containing the video.
        out_dir: Existing folder where the PNG files are written.
        only_with_impact: When True, keep only the frames that have at least
            one label row with ``impact == 1.0``.
    """
    video_path=f"{video_dir}/{video_name}"
    video_name = os.path.basename(video_path)

    impact_frames = None
    if only_with_impact:
        boxes_all = video_labels.query("video == @video_name")
        boxes_with_impact = boxes_all[boxes_all.impact == 1.0]
        print(video_path, boxes_with_impact.shape[0])
        # Collect the frames to keep once instead of querying on every frame.
        impact_frames = set(boxes_with_impact["frame"].tolist())
    else:
        print(video_path)

    vidcap = cv2.VideoCapture(video_path)
    try:
        frame = 0
        while True:
            it_worked, img = vidcap.read()
            if not it_worked:
                break
            frame += 1
            if impact_frames is not None and frame not in impact_frames:
                continue
            image_path = f'{out_dir}/{video_name}'.replace('.mp4',f'_{frame}.png')
            cv2.imwrite(image_path, img)
    finally:
        # Always free the decoder, even if reading or writing fails.
        vidcap.release()

In [None]:
out_dir = "../../working/frames"
# exist_ok=True lets the cell be re-run without failing on an existing folder
# and makes a separate `mkdir -p` shell call unnecessary.
os.makedirs(out_dir, exist_ok=True)

video_dir = '../../input/nfl-health-and-safety-helmet-assignment/train/'
video_folder = [os.path.basename(path) for path in glob(f'{video_dir}/*.mp4')]
# Extract every frame of the first 6 videos returned by glob.
for video_name in video_folder[:6]:
    mk_images(video_name, pd.DataFrame(), video_dir, out_dir, only_with_impact=False)

# Run the helmet detection model

Let's now test the model on a few images!

In [None]:
# Baseline helmet boxes for the training videos; run_comparison reads the
# video_frame, left, top, width and height columns.
train_baseline_df = pd.read_csv("../../input/nfl-health-and-safety-helmet-assignment/train_baseline_helmets.csv")

In [None]:
def run_comparison(img_path):
    """Display the baseline boxes of a frame, then the boxes found by the model.

    Args:
        img_path: Path of a PNG frame whose file name, without the extension,
            matches a ``video_frame`` value of ``train_baseline_df``.
    """
    frame_name = img_path.rsplit("/", 1)[-1].replace(".png","")
    is_this_frame = train_baseline_df["video_frame"]==frame_name
    frame_df = train_baseline_df[is_this_frame]

    # Baseline: convert (left, top, width, height) rows to corner boxes.
    corner_boxes = []
    for _, row in frame_df.iterrows():
        corner_boxes.append(
            np.array([row.left, row.top, row.left+row.width, row.top+row.height])
        )
    display_helmet_detected(np.array(corner_boxes), img_path)

    # Model: run the tiled detection and show its boxes on the same frame.
    display_helmet_detected(detect_helmets(img_path), img_path)

The first image represents the baseline helmet boxes given by the competition host, while the 2nd image is the MaskRCNN model.

In [None]:
# Baseline vs. model on frame 200 of video 57784_001741_Endzone.
run_comparison("../../working/frames/57784_001741_Endzone_200.png")

In [None]:
# Baseline vs. model on frame 30 of video 58104_000352_Sideline.
run_comparison("../../working/frames/58104_000352_Sideline_30.png")

In [None]:
# Baseline vs. model on frame 120 of video 57686_002546_Endzone.
run_comparison("../../working/frames/57686_002546_Endzone_120.png")

In [None]:
# Baseline vs. model on frame 70 of video 58106_002918_Sideline.
run_comparison("../../working/frames/58106_002918_Sideline_70.png")

As you can see, the model is not perfect, but I tend to believe it is a good start! Compared to the baseline, this MaskRCNN model seems to be more conservative. This translates into missed helmets on the edges of the images (possibly due to the tiling process), but it may help to produce fewer false positives when it gets messy and players pile on top of each other. That being said, this is only a quick visual assessment. It seems like the baseline helmet boxes are already reliable enough and it may be difficult to do better! Definitely not low-hanging fruit!

## Thanks for reading this notebook! If you found this notebook helpful, please give it an upvote. It is always greatly appreciated!

In [None]:
# Delete the folder of extracted frames created earlier in this notebook.
!rm -rf ../../working/frames