# Helmet Mapping + Deepsort



In [None]:
# Install helmet-assignment helper code
# The package is installed from a local path under ../input (no index lookup is
# requested here); stdout and stderr are discarded to keep the cell output quiet,
# which also hides installation failures.
!pip install ../input/helmet-assignment-helpers/helmet-assignment-main/ > /dev/null 2>&1

In [None]:
from helmet_assignment.score import NFLAssignmentScorer, check_submission
from helmet_assignment.features import add_track_features
from helmet_assignment.video import video_with_predictions
from IPython.display import Video, display

# Baseline helmet mapping
This section uses the simple helmet mapping approach from the awesome notebook:

https://www.kaggle.com/its7171/nfl-baseline-simple-helmet-mapping

In [None]:
import numpy as np
import pandas as pd
import itertools
import glob
import os
import sys
import torch
import cv2
import traceback
import time
from sklearn.metrics import accuracy_score
from tqdm.auto import tqdm
from multiprocessing import Pool
from matplotlib import pyplot as plt
import random
import torchvision
import shutil
from joblib import Parallel, delayed
from scipy.spatial.transform import Rotation
from math import pi, ceil, sqrt
from scipy.spatial import distance_matrix
from scipy.optimize import linear_sum_assignment
from statistics import mode
from sklearn.cluster import k_means
import importlib.util

## Settings and loading data

Note I've extracted `max_iter`, `DIG_STEP` and `DIG_MAX` to the top for easy experimentation. I've also modified the code to run in debug mode if running on the public test set.

In [None]:
# RANDOM_STATE = 43

In [None]:
# RANDOM_STATE = 373

In [None]:
# Configurables
# Number of videos sampled from the training labels when running in debug mode.
n_debug_samples = 1
# Set to a video file name to debug that exact video instead of a random sample.
debug_video = None
# debug_video = '57911_002492_Sideline.mp4'
# RANDOM_STATE = np.random.randint(1000)
# Seed passed to the debug-video sampling and to the pykalman KalmanFilter.
RANDOM_STATE = 42

In [None]:
# Helmet detections with `conf` <= CONF_THRE are dropped before mapping.
CONF_THRE = 0.4
# Upper bound on the number of deletion combinations tried in dist_for_different_len.
max_iter = 1000
# dist_rot sweeps rotation angles (degrees) from -DIG_MAX to +DIG_MAX in steps of DIG_STEP.
DIG_STEP = 3
DIG_MAX = DIG_STEP*10

In [None]:
n_test_videos = len(os.listdir('../input/nfl-health-and-safety-helmet-assignment/test/'))
# Run in debug mode unless during submission: a test folder holding exactly six
# videos is treated as the public test set, any other count as a submission run.
debug = (n_test_videos == 6)
# Debug runs read the training videos; submission runs read the test videos.
video_dir = (
    '../input/nfl-health-and-safety-helmet-assignment/train/'
    if debug
    else '../input/nfl-health-and-safety-helmet-assignment/test/'
)

In [None]:


# Read in the data.

BASE_DIR = '../input/nfl-health-and-safety-helmet-assignment'

# Labels are always read from the training file; tracking and baseline helmet
# boxes come from the train files in debug mode and from the test files otherwise.
labels = pd.read_csv(f'{BASE_DIR}/train_labels.csv')
if debug:
    tracking = pd.read_csv(f'{BASE_DIR}/train_player_tracking.csv')
    helmets = pd.read_csv(f'{BASE_DIR}/train_baseline_helmets.csv')
else:
    tracking = pd.read_csv(f'{BASE_DIR}/test_player_tracking.csv')
    helmets = pd.read_csv(f'{BASE_DIR}/test_baseline_helmets.csv')
# The frame number is the last '_'-separated token of `video_frame`.
helmets['frame'] = helmets.video_frame.apply(lambda x: int(x.split('_')[-1]))
# Helper from helmet_assignment.features; the code below relies on it providing
# an `est_frame` column (and `game_play`) — presumably it adds them; confirm in the helper package.
tracking = add_track_features(tracking)

In [None]:
# Display the tracking table returned by add_track_features for a visual check.
tracking

In [None]:
def add_cols(df):
    """Add `game_play` (and `video`, when absent) columns derived from `video_frame`.

    `video_frame` is split on '_': the first two tokens joined by '_' form
    `game_play`; the first three tokens joined by '_' plus '.mp4' form `video`.
    The frame is modified in place and also returned.
    """
    frame_tokens = df['video_frame'].str.split('_')
    df['game_play'] = frame_tokens.str[:2].str.join('_')
    if 'video' in df.columns:
        # An existing `video` column is left untouched.
        return df
    df['video'] = frame_tokens.str[:3].str.join('_') + '.mp4'
    return df
# Derive `game_play` / `video` for both the helmet boxes and the labels.
helmets = add_cols(helmets)
labels = add_cols(labels)
if debug:
    # Select `n_debug_samples` worth of videos to debug with
    if debug_video is None:
        sample_videos = labels['video'].drop_duplicates() \
            .sample(n_debug_samples, random_state=RANDOM_STATE).tolist()
    else:
        sample_videos = [debug_video]
    # A game_play id is the first two '_'-separated tokens of the video name.
    sample_gameplays = ['_'.join(x.split('_')[:2]) for x in sample_videos]
    # Restrict every table to the sampled videos / plays to keep debug runs small.
    tracking = tracking[tracking['game_play'].isin(sample_gameplays)]
    helmets = helmets[helmets['video'].isin(sample_videos)]
    labels = labels[labels['video'].isin(sample_videos)]
# Show the resulting table sizes.
tracking.shape, helmets.shape, labels.shape

In [None]:
# Sanity check: True if any tracking row has a missing `player` value.
tracking.player.isnull().any()

In [None]:
def find_nearest(tracking, value):
    """Return the tracking rows for video frame `value`, interpolating if needed.

    Parameters
    ----------
    tracking : pandas.DataFrame
        Tracking rows of a single play. Must contain `est_frame`, `player` and
        the interpolated columns 'x', 'y', 'a', 'dir', 's', 'o'.
    value : int-like
        Video frame number to look up.

    Returns
    -------
    pandas.DataFrame
        * If `value` equals a tracked `est_frame`, the rows of that frame.
        * If `value` lies between two tracked frames, one row per player with
          the numeric columns linearly interpolated between the two frames and
          `est_frame` set to `value`.
        * If `value` lies before the first or after the last tracked frame, the
          rows of the nearest tracked frame (no extrapolation).
        The returned frame always has a fresh RangeIndex.

    Notes
    -----
    The original implementation relied on `IndexError` to detect the "before
    the first frame" case, but `unique_frames[idx - 1]` with `idx == 0` silently
    wraps around to the last frame, so early frames were extrapolated from the
    (last, first) frame pair. The bounds are now checked explicitly.
    """
    value = int(value)
    frames = np.asarray(tracking['est_frame']).astype(int)
    unique_frames = np.unique(frames)
    idx = int(np.argmin(np.abs(unique_frames - value)))
    nearest = unique_frames[idx]

    if value == nearest:
        return tracking[tracking['est_frame'] == nearest].reset_index(drop=True)

    # Indices of the tracked frames that bracket `value`.
    if value > nearest:
        lo, hi = idx, idx + 1
    else:
        lo, hi = idx - 1, idx

    if lo < 0 or hi >= len(unique_frames):
        # `value` is outside the tracked range: clamp to the nearest frame.
        return tracking[tracking['est_frame'] == nearest].reset_index(drop=True)

    curr_frame = tracking[tracking['est_frame'] == unique_frames[lo]]
    next_frame = tracking[tracking['est_frame'] == unique_frames[hi]]
    try:
        # Align the two frames on player so arithmetic is per player.
        next_frame = next_frame.set_index('player')
        curr_frame = curr_frame.set_index('player')

        diff = next_frame.est_frame.iloc[0] - curr_frame.est_frame.iloc[0]
        # NOTE(review): 'dir' and 'o' look like angles; plain linear
        # interpolation does not handle wrap-around — confirm this is acceptable.
        cols = ['x', 'y', 'a', 'dir', 's', 'o', 'est_frame']
        if diff != 0:
            speed = (next_frame[cols] - curr_frame[cols]) / diff
        else:
            speed = 0
        ret = next_frame.copy()
        ret[cols] = curr_frame[cols] + (value - curr_frame.est_frame.iloc[0]) * speed
        # Players present in only one of the two frames end up as NaN: drop them.
        ret = ret.dropna(axis=0, subset=['est_frame'])
        ret['est_frame'] = ret['est_frame'].astype(int)
    except Exception:
        # Diagnostic output only; the original exception is re-raised unchanged.
        print(next_frame)
        print(curr_frame)
        raise
    return ret.reset_index()


def norm_arr(a):
    """Min-max scale `a`: subtract its minimum, then divide by the new maximum."""
    shifted = a - a.min()
    return shifted / shifted.max()
    
def dist(a1, a2):
    """Return the norm (numpy.linalg.norm default) of the difference `a1 - a2`."""
    delta = a1 - a2
    return np.linalg.norm(delta)

def dist_for_different_len(a1, a2):
    """Best distance between `a2` and `a1` after deleting surplus entries of `a1`.

    Parameters
    ----------
    a1 : 1-D array, at least as long as `a2`.
    a2 : 1-D array.

    Returns
    -------
    (min_dist, delete_idx)
        `min_dist` is the smallest `dist(a1_without_deleted, a2)` found and
        `delete_idx` the tuple of positions removed from `a1` to achieve it.
        When the lengths match, nothing is removed and `delete_idx` is `()`.
        If no finite distance is found (e.g. NaN inputs), `(inf, None)` is
        returned.

    Notes
    -----
    All `len(a1) - len(a2)` deletion combinations are enumerated; if there are
    more than the module-level `max_iter`, a random subset of `max_iter`
    combinations is tried instead (uses the global `random` state, so results
    are only reproducible if that state is seeded by the caller).

    The original used the magic number 10000 as the initial minimum, so any
    input whose best distance exceeded it returned `(10000, None)`; the search
    now starts from infinity.
    """
    assert len(a1) >= len(a2), f'{len(a1)}, {len(a2)}'
    len_diff = len(a1) - len(a2)
    if len_diff == 0:
        return dist(a1, a2), ()

    del_list = list(itertools.combinations(range(len(a1)), len_diff))
    if len(del_list) > max_iter:
        del_list = random.sample(del_list, max_iter)

    min_dist = np.inf
    min_delete_idx = None
    for delete_idx in del_list:
        this_dist = dist(np.delete(a1, delete_idx), a2)
        if min_dist > this_dist:
            min_dist = this_dist
            min_delete_idx = delete_idx

    return min_dist, min_delete_idx
        
def rotate_arr(u, t, deg=True):
    """Rotate 2-D vector(s) `u` counter-clockwise by angle `t`.

    Parameters
    ----------
    u : array of shape (2,) or (2, n); rows are the x and y components.
    t : rotation angle.
    deg : if truthy (default) `t` is in degrees, otherwise in radians.

    Returns
    -------
    numpy.ndarray with the same shape as `u`, equal to `R(t) @ u`.
    """
    if deg:
        t = np.deg2rad(t)
    cos_t, sin_t = np.cos(t), np.sin(t)
    R = np.array([[cos_t, -sin_t],
                  [sin_t,  cos_t]])
    return np.dot(R, u)

def dist_rot(tracking_df, a2):
    """Find the field rotation whose sorted x-coordinates best match `a2`.

    The tracking positions are rotated by every angle in
    `range(-DIG_MAX, DIG_MAX + 1, DIG_STEP)` degrees; for each angle the sorted
    rotated x values are compared with `a2` via `dist_for_different_len`.

    Returns
    -------
    (min_dist, players)
        The best distance found and the `player` values, ordered by rotated x
        at the best angle, with the positions reported by
        `dist_for_different_len` removed.
    """
    ordered = tracking_df.sort_values('x')
    # Rows are the x and y components, in the 'x'-sorted player order.
    coords = np.array((ordered['x'], ordered['y']))

    best_dist = 10000
    best_drop = None
    best_x = None
    for angle in range(-DIG_MAX, DIG_MAX + 1, DIG_STEP):
        rotated_x = rotate_arr(coords, angle)[0]
        cand_dist, cand_drop = dist_for_different_len(np.sort(rotated_x), a2)
        if cand_dist < best_dist:
            best_dist, best_drop, best_x = cand_dist, cand_drop, rotated_x

    ordered['x_rot'] = best_x
    ranked_players = ordered.sort_values('x_rot')['player'].values
    return best_dist, np.delete(ranked_players, best_drop)

def dist_matrix(points, dense_view=True):
    """Pairwise Euclidean distances between 2-D points.

    Each point is encoded as a complex number so the distance is `abs(z_i - z_j)`.
    With `dense_view=True` only the strict upper triangle is returned as a flat
    array; otherwise the full square matrix is returned.
    """
    z = np.array([complex(pt[0], pt[1]) for pt in points])
    pairwise = np.abs(z[..., np.newaxis] - z)
    if not dense_view:
        return pairwise
    return pairwise[np.triu_indices(len(z), 1)]


Based on the conditional above, x always shows the longitudinal
distance in the image, while y the latitudinal

In [None]:
def mapping_df_fallback(tracking, df, previous_mapped=None):
    """Baseline 1-D helmet-to-player mapping for the helmets of a single frame.

    Helmet boxes are ordered by their horizontal centre and matched against the
    tracked players ordered along a (rotated) field axis; both the left-to-right
    and the mirrored ordering are tried and the one with the smaller distance wins.

    Parameters
    ----------
    tracking : DataFrame with `gameKey`, `playID` and the columns needed by
        `find_nearest` / `dist_rot`.
    df : helmet boxes of one `video_frame` (`left`, `width`, optional `conf`).
        NOTE: `center_h_p` / `center_h_m` are added to the caller's frame in
        place before any filtering copy is taken.
    previous_mapped : unused by this function.

    Returns
    -------
    (mapped_df, this_tracking, info)
        `mapped_df` has the (filtered) helmet columns plus `label`;
        `this_tracking` is the tracking snapshot used (x/y swapped for Endzone);
        `info` is `{'fallback_mapping_used': True}`.
    """
    # video_frame has the form '<gameKey>_<playID>_<view>_<frame>'.
    gameKey,playID,view,frame = df.video_frame.iloc[0].split('_')
    gameKey = int(gameKey)
    playID = int(playID)
    frame = int(frame)
    this_tracking = tracking[(tracking['gameKey']==gameKey) & (tracking['playID']==playID)]
    this_tracking = find_nearest(this_tracking, frame)
    len_this_tracking = len(this_tracking)
    # Horizontal box centre, and its negation for the mirrored ordering.
    df['center_h_p'] = (df['left']+df['width']/2).astype(int)
    df['center_h_m'] = (df['left']+df['width']/2).astype(int)*-1
    if 'conf' in df.columns:
        df = df[df['conf']>CONF_THRE].copy()
    # Never keep more helmets than tracked players; the LAST rows are kept.
    # NOTE(review): this presumably relies on the input row order (e.g. sorted
    # by confidence) — confirm against the helmet file.
    if len(df) > len_this_tracking:
        df = df.tail(len_this_tracking)
    df_p = df.sort_values('center_h_p').copy()
    df_m = df.sort_values('center_h_m').copy()
    
    # For the Endzone view the field axes are swapped before matching.
    if view == 'Endzone':
        this_tracking['x'], this_tracking['y'] = this_tracking['y'].copy(), this_tracking['x'].copy()
    a2_p = df_p['center_h_p'].values
    a2_m = df_m['center_h_m'].values

    # dist_rot returns (distance, players); despite their names the
    # `min_detete_idx_*` variables therefore hold the matched player labels.
    min_dist_p, min_detete_idx_p = dist_rot(this_tracking ,a2_p)
    min_dist_m, min_detete_idx_m = dist_rot(this_tracking ,a2_m)
    if min_dist_p < min_dist_m:
        min_dist = min_dist_p
        min_detete_idx = min_detete_idx_p
        tgt_df = df_p
    else:
        min_dist = min_dist_m
        min_detete_idx = min_detete_idx_m
        tgt_df = df_m
    #print(video_frame, len(this_tracking), len(df), len(df[df['conf']>CONF_THRE]), this_tracking['x'].mean(), min_dist_p, min_dist_m, min_dist)
    # Players are assigned positionally to the helmets in sorted-centre order.
    tgt_df['label'] = min_detete_idx
    return tgt_df[df.columns.tolist() + ['label']], this_tracking, {'fallback_mapping_used':True}

In [None]:
from pykalman import KalmanFilter
from typing import Dict, Iterable, List
from numpy import ma
def cartesian_product(arrays):
    """Return the Cartesian product of the given 1-D sequences.

    Parameters
    ----------
    arrays : non-empty sequence of 1-D array-likes.

    Returns
    -------
    numpy.ndarray of shape (prod(len(a) for a in arrays), len(arrays)); row
    order iterates the last input fastest. The dtype is the NumPy promotion of
    the input dtypes.

    Notes
    -----
    `np.find_common_type` (used originally) is deprecated since NumPy 1.25 and
    removed in NumPy 2.0; `np.result_type` is the documented replacement.
    """
    la = len(arrays)
    dtype = np.result_type(*[np.asarray(a).dtype for a in arrays])
    arr = np.empty([len(a) for a in arrays] + [la], dtype=dtype)
    # np.ix_ yields broadcastable index grids, one per input axis.
    for i, a in enumerate(np.ix_(*arrays)):
        arr[..., i] = a
    return arr.reshape(-1, la)


def get_observation_matrix(params_len):
    """Observation matrix [I | 0 | 0] of shape (params_len, 3 * params_len).

    Only the first `params_len` state entries are observed.
    """
    observed = np.eye(params_len)
    unobserved = np.zeros((params_len, 2 * params_len))
    return np.hstack([observed, unobserved])
def get_transition_matrix(params_len):
    """State transition matrix of shape (3 * params_len, 3 * params_len).

    Identity, plus ones on the diagonal offset by `params_len`, plus 0.5 on the
    diagonal offset by `2 * params_len`.
    """
    size = 3 * params_len
    transition = np.eye(size)
    transition = transition + np.eye(size, k=params_len)
    transition = transition + 0.5 * np.eye(size, k=2 * params_len)
    return transition

class KalmanFilterRoutine:
    """Online Kalman filter over a fixed list of named scalar parameters.

    The state vector has `3 * len(params)` entries laid out as three
    consecutive groups of `len(params)` values; only the first group is
    observed (see `get_observation_matrix`), the other two are coupled to it
    through `get_transition_matrix` and exposed as `updated_params_der` /
    `updated_params_sder`.

    The first `init_frames` calls to `update` are buffered; the filter is
    created on call number `init_frames` and updated incrementally afterwards.
    `updated_params*` attributes only exist once `is_ready` is True.
    """
    def __init__(self, params: List[str], init_frames=5):
        """Store the parameter names and the number of warm-up updates."""
        self.init_frames = init_frames
        self.kf = None
        self.params_buffer = []
        # Number of `update` calls received so far.
        self.frame = 0
        # Histories of filtered state means / covariances (lists once initialised).
        self.means = None
        self.covariances = None
        self.observation_matrix = None
        self.transition_matrix = None
        self.params = params
        self.params_len = len(params)

    @property
    def is_ready(self):
        """True once at least `init_frames` updates have been received."""
        return self.frame >= self.init_frames
        
    def update(self, **params):
        """Feed one observation given as `name=value` keyword arguments.

        Names listed in `self.params` but absent from `params` are treated as
        0 during warm-up and as missing (masked) observations afterwards.
        """
        self.frame += 1
        if self.frame < self.init_frames:
            # Warm-up: only collect observations.
            self.params_buffer.append(params)
        else:
            if self.frame == self.init_frames:
                # Last warm-up observation: build and initialise the filter.
                self.params_buffer.append(params)
                self.transition_matrix = get_transition_matrix(self.params_len)
                self.observation_matrix = get_observation_matrix(self.params_len)
                params_df = pd.DataFrame(self.params_buffer)
                missing = [p for p in self.params if p not in params_df.columns]
                params_df[missing] = np.nan
                params_df = params_df.fillna(0)
                params_df = params_df[self.params]
                # Initial state: mean of the buffered values, zeros for the
                # two unobserved groups.
                initial_state_mean = np.pad(params_df.mean(axis=0), (
                    (0, 2*len(self.params))))
                self.kf = KalmanFilter(transition_matrices=self.transition_matrix, 
                                       observation_matrices=self.observation_matrix,
                                      initial_state_mean=initial_state_mean, random_state=RANDOM_STATE)
                # Repeat the last row so the initial batch filter sees at least 3 rows.
                if len(params_df) < 3:
                    params_df = pd.concat(
                        [params_df] + [params_df.iloc[[-1]] for _ in range(3 - len(params_df))])
                self.means, self.covariances = self.kf.filter(params_df.values)
                # Stored as Python lists so later states can be appended.
                self.means = self.means.tolist()
                self.covariances = self.covariances.tolist()
            if self.frame > self.init_frames:
                # Incremental update; NaN entries are masked, i.e. treated as
                # missing observations.
                observation = ma.asarray(np.array([params[k] if k in params else np.nan for k  in self.params]))
                observation[np.isnan(observation)] = ma.masked
                state_means, state_covs = self.kf.filter_update(
                    self.means[-1],
                    self.covariances[-1],
                    observation =observation)
                self.means.append(state_means)
                self.covariances.append(state_covs)
            # Split the latest state mean into its three groups.
            self.updated_params =  np.array(self.means[-1][:len(self.params)])
            self.updated_params_der = np.array(self.means[-1][len(self.params): 2 * len(self.params)])
            self.updated_params_sder = np.array(self.means[-1][2 * len(self.params): 3 * len(self.params)])
        

In [None]:
from collections import deque

class ParamsCombinationsGenerator:
    # Uses Kalman Filter with Taylor expansion up to the 2nd derivative
    """Produce per-parameter search bounds for the camera-model optimiser.

    Until the Kalman filter is ready (or when `use_kalman` is False) the bounds
    are the configured `params_ranges`, or a widened average of previously
    issued bounds if any were recorded. Once the filter is ready, the bounds
    are centred on the last accepted parameters with a half-width derived from
    the filter state, floored by `min_perturbations`.

    NOTE: `strictly_positive_params`, `max_perturbations`, `ranges_limits`,
    `allowed_change_ratio` and `n_steps` are stored but not used by the active
    code in `get_bounds` (the code that used some of them is commented out).
    """
    def __init__(self, params_ranges : Dict[str, Iterable],
                 strictly_positive_params:List[str]=None,
                 min_perturbations: Dict[str, float]=None,
                 max_perturbations: Dict[str, float]=None,
                 kalman_init=5,
                 allowed_change_ratio=1, n_steps=5, 
                 obey_original_ranges=True, use_kalman=True, previous_frames_to_keep=10):
        """Configure ranges and perturbation limits.

        `params_ranges` maps each parameter name to its initial search range;
        the key order defines the parameter order used everywhere else.
        `kalman_init` is the number of warm-up updates of the Kalman filter and
        `previous_frames_to_keep` the length of the bounds history.
        """
        self.use_kalman = use_kalman
        self.kalman_init = kalman_init
        self.allowed_change_ratio = allowed_change_ratio
        self.params_ranges = params_ranges
        self.strictly_positive_params = strictly_positive_params
        self.n_steps = n_steps
        self.previous_frames_to_keep = previous_frames_to_keep
        # Rolling history of the lower / upper bounds issued in Kalman mode.
        self.buffer_starts = deque(maxlen=previous_frames_to_keep)
        self.buffer_ends = deque(maxlen=previous_frames_to_keep)
        self.params_buffer = []
        self.ori_options = [0, 1]
        self.frame = 0
        self.params = list(params_ranges.keys())
        
        self.means = None
        self.covariances = None
        self.observation_matrix = None
        self.transition_matrix = None
        self.obey_original_ranges = obey_original_ranges
        if obey_original_ranges:
            # (min, max) of each configured range, in `self.params` order.
            self.ranges_limits = np.array([[np.min(params_ranges[p]),
                                            np.max(params_ranges[p])] for p in self.params])
        # Parameters without an explicit minimum perturbation default to 0.
        if min_perturbations is None:
            self.min_perturbations = np.zeros(len(self.params))
        else:
            self.min_perturbations = np.array([min_perturbations[x]  if x in min_perturbations else 0 for x in self.params])
        # Parameters without an explicit maximum perturbation default to +inf.
        if max_perturbations is not None:
            self.max_perturbations = np.array([max_perturbations[x]  if x in max_perturbations else
                                               np.inf for x in self.params])
        else:
            self.max_perturbations = np.zeros(len(self.params)) + np.inf
        self.kf = KalmanFilterRoutine(self.params, self.kalman_init)
        
    
    def reset(self):
        """Discard the Kalman filter state (the bounds history is kept)."""
        self.kf = KalmanFilterRoutine(self.params, self.kalman_init)
    
    def zero_buffer(self):
        """Clear the history of previously issued bounds."""
        self.buffer_starts = deque(maxlen=self.previous_frames_to_keep)
        self.buffer_ends = deque(maxlen=self.previous_frames_to_keep)
        
    @property
    def is_ready(self):
        """True once the underlying Kalman filter has finished its warm-up."""
        return self.kf.is_ready
    
    def update(self, **params):
        """Record the accepted parameters (one keyword per name in `self.params`).

        The values are stored as `prev_params` and, when Kalman tracking is
        enabled, forwarded to the filter.
        """
        self.prev_params = np.array([params[k] for k in self.params])
        if not self.use_kalman:
            return
        self.kf.update(**params)

    
    def get_bounds(self):    
        """Return one (start, end) search range per parameter, in `self.params` order.

        In Kalman mode each call also appends the issued bounds to the history
        buffers, so the call is not side-effect free.
        """
        if not self.use_kalman or not self.kf.is_ready:
            if not self.buffer_starts:
                # No history yet: fall back to the configured ranges.
                ranges = [self.params_ranges[x] for x in self.params]
            else:
                # Filter was reset / not ready: widen the mean of past bounds by 20%.
                # NOTE(review): scaling by 0.8 / 1.2 narrows rather than widens
                # a bound whose sign is negative — confirm this is intended.
                ranges = [(start,end) for start, end in zip(0.8 * np.mean(self.buffer_starts, axis=0), 
                                                            1.2 * np.mean(self.buffer_ends, axis=0))]
        else:
            # Filtered value minus last accepted value ...
            diff1 = self.kf.updated_params - self.prev_params
            # ... plus the step implied by the 2nd and 3rd state groups.
            diff2 = self.kf.updated_params_der + 0.5 * self.kf.updated_params_sder
            
            
            # Half-width of the search window, never below the configured minimum.
            changes =  np.maximum(np.abs(diff1 + diff2), self.min_perturbations)
            
            
#             perturbations = np.minimum(np.maximum(
#                 changes,self.min_perturbations), self.max_perturbations)
            perturbations = changes
#             starts = self.kf.updated_params - perturbations
#             ends = self.kf.updated_params + perturbations
            # The window is centred on the last accepted parameters.
            starts = self.prev_params - perturbations
            ends = self.prev_params + perturbations
#             print(stats, ends)
#             if self.strictly_positive_params is not None:
#                 flag = np.array([x in self.strictly_positive_params for x in self.params])
#                 add = np.maximum(0, - starts[flag])
#                 starts[flag] += add
#                 ends[flag] += add
#             if self.obey_original_ranges:

#                 flag = np.array([x is not None for x in self.ranges_limits[:, 0]])
#                 starts[flag] = np.maximum(self.ranges_limits[flag, 0], starts[flag])
#                 ends[flag] = np.minimum(self.ranges_limits[flag, 1], ends[flag])
            ranges = [(start, end) for start, end in zip(starts, ends)]
            self.buffer_starts.append(starts)
            self.buffer_ends.append(ends)
                          
        return ranges    
    

In [None]:
# Sideline boxes whose `top` is within this many pixels of the top or bottom
# edge of the 720-pixel-high frame are dropped while the mapper warms up.
SIDELINE_START_THRES = 50
# Extent of the tracking coordinates used to flip / mirror them
# (presumably field length and width in yards — TODO confirm).
MAX_COORDS = (120, 53.33)
# Per-view maximum helmet-to-player matching cost (image pixels) for a match to be kept.
MAX_COST_SIDELINE = 50
MAX_COST_ENDZONE = 100
from scipy.optimize import basinhopping, minimize
def cost_function(this_tracking, expanded, im_centers, 
                  camera_height, camera_length, max_p, 
                  xdig, zdig, scaling, max_cost_thres=None,
                  ret_cost_only=False):
    """Project tracked players into the image and score them against helmets.

    The 3-D player positions are translated so the camera is at the origin,
    rotated about z by `zdig` and about x by `xdig` (radians, rotation
    vectors), projected onto the (x, z) plane, scaled and shifted into
    1280x720 image coordinates with the y axis flipped. Projected players are
    then matched one-to-one to helmet centres by minimum total distance.

    Parameters
    ----------
    this_tracking : DataFrame, one row per player, same order as `expanded`.
    expanded : (n_players, 3) array of player positions.
    im_centers : (n_helmets, 2) array of helmet centres in image pixels.
    camera_height, camera_length : camera position `(camera_length, 0, camera_height)`.
    max_p : unused; kept for interface compatibility.
    xdig, zdig, scaling : rotation angles and projection scale.
    max_cost_thres : if given, matches costing this much or more are discarded
        from the returned matching (the mean cost is computed before filtering).
    ret_cost_only : if True, return only the mean matching cost.

    Returns
    -------
    float
        When `ret_cost_only` is True: mean cost of the optimal assignment.
    (params, reduced_tracking)
        Otherwise: `params` is a dict with the camera parameters, `cost`,
        `match_from` (row positions in `this_tracking`), `match_to` (row
        positions in `im_centers`), `match_costs` and `max_cost` (largest kept
        cost, or `max_cost_thres` when no match was kept);
        `reduced_tracking` holds the matched tracking rows plus their projected
        image coordinates `x2im` / `y2im`.
    """
    assert not np.any(np.isnan([xdig]))

    # Camera sits somewhere near the middle of the appropriate x side of the field.
    camera_pos = np.array([camera_length, 0, camera_height])
    centered = expanded - camera_pos

    # Rotate about z first, then about x, and keep the (x, z) plane.
    z_rot = Rotation.from_rotvec([0, 0, zdig]).as_matrix()
    x_rot = Rotation.from_rotvec([xdig, 0, 0]).as_matrix()
    rotated = (x_rot @ (z_rot @ centered.T)).T
    scaled = scaling * rotated[:, [0, 2]]
    # The origin is assumed to be at the centre of the image, so move it to the
    # bottom left first ...
    scaled = scaled + np.array([1280, 720]) / 2
    # ... and then flip the y axis (image rows grow downwards).
    scaled[:, 1] = 720 - scaled[:, 1]

    # Optimal one-to-one assignment between helmets (rows) and players (columns).
    d = distance_matrix(im_centers, scaled)
    match_to, match_from = linear_sum_assignment(d)
    order = np.argsort(match_from)
    match_to = match_to[order]
    match_from = match_from[order]
    as_costs = d[match_to, match_from]
    cost = np.mean(as_costs)

    if ret_cost_only:
        return cost

    if max_cost_thres is not None:
        mask = as_costs < max_cost_thres
        match_from = match_from[mask]
        match_to = match_to[mask]
        as_costs = as_costs[mask]

    # Explicit copy so adding columns never writes into (or warns about) a
    # slice of the caller's frame.
    reduced_tracking = this_tracking.iloc[match_from].copy()
    reduced_tracking[['x2im', 'y2im']] = scaled[match_from, :]
    params = dict(xdig=xdig, zdig=zdig, scaling=scaling,
                  camera_height=camera_height,
                  camera_length=camera_length,
                  cost=cost,
                  match_from=match_from,
                  match_to=match_to,
                  match_costs=as_costs,
                  # Emptiness check: the original `np.any(as_costs)` also
                  # treated all-zero costs as "no matches".
                  max_cost=(as_costs.max() if as_costs.size
                            else max_cost_thres))
    return params, reduced_tracking

    
        

In [None]:
def mapping_df(combs_generator, tracking, df, previous_mapped=None, 
               available_oris=(0,1), ignore_starting_preproc=False,
               force_local_minimize=True, check_mapping=False, ratio = 0.8):
    """Map the helmets of one video frame to tracked players via a camera model.

    For each candidate field orientation the camera parameters
    (`combs_generator.params`) are optimised so that the projected player
    positions best match the helmet centres (`cost_function`); the orientation
    with the lowest cost wins and its matching is returned.

    Parameters
    ----------
    combs_generator : ParamsCombinationsGenerator
        Supplies parameter names, initial ranges and per-frame bounds; it is
        updated with the winning parameters (or reset on failure).
    tracking : DataFrame with `gameKey`, `playID` and the tracking columns.
    df : helmet boxes of a single `video_frame`.
    previous_mapped, check_mapping : unused by this function.
    available_oris : orientations (0 / 1) to try.
    ignore_starting_preproc : skip the Sideline edge-box filtering done while
        the generator is not ready.
    force_local_minimize : use a bounded local `minimize`; otherwise
        `basinhopping` is used until the generator is ready.
    ratio : fraction of the view's cost threshold above which a basin-hopping
        result is considered a failure and the generator is reset.

    Returns
    -------
    (mapped_df, opt_tracking, opt_params)
        `mapped_df`: helmet rows (sorted by `left`) with `label` (player, NaN
        when unmatched) and `cost` (inf when unmatched); `opt_tracking`: the
        matched tracking rows with projected coordinates; `opt_params`: the
        dict returned by `cost_function` plus `change_ori`.

    Raises
    ------
    RuntimeError
        If no orientation produced a usable cost. (The original used a bare
        `raise`, which surfaces as the same exception type but with the
        unhelpful message "No active exception to reraise".)
    """
    # video_frame has the form '<gameKey>_<playID>_<view>_<frame>'.
    gameKey,playID,view,frame = df.video_frame.iloc[0].split('_')
    gameKey = int(gameKey)
    playID = int(playID)
    frame = int(frame)
    this_tracking = tracking[(tracking['gameKey']==gameKey) & (tracking['playID']==playID)]
    this_tracking = find_nearest(this_tracking, frame)
    
    max_p = MAX_COORDS
    if view == 'Endzone':
        max_cost_thres = MAX_COST_ENDZONE
    else:
        max_cost_thres = MAX_COST_SIDELINE
    same_sgns = 0 # the projected axes on the image need to be reflected as of x or as of y
    if view == 'Endzone':
        this_tracking['x'], this_tracking['y'] = this_tracking['y'].copy(), this_tracking['x'].copy()
        max_p = max_p[::-1]
        # the projected axes need to be reflected both as of x and as of y or stay as is
        same_sgns = 1 

    if 'conf' in df.columns:
        df = df[df['conf']>CONF_THRE].copy()
    if not ignore_starting_preproc and (view == 'Sideline') and not combs_generator.is_ready:
        inc_mask = (df['top'] >= SIDELINE_START_THRES) & (df['top'] < 720 - SIDELINE_START_THRES)
        if not np.all(inc_mask):
            print(f"Removing {(~inc_mask).sum()} bounding boxes that reside in the top or bottom edge of the screen")
            df = df[inc_mask].copy()
        
    # Helmet box centres (image pixels) and player positions (field units).
    im_centers = df[['left', 'top']].values+ (df[['width', 'height']]/2).values
    rl_centers = this_tracking[['x','y']].values
    
    min_cost = 1e7
    opt_params = None
    ret = None
    for change_ori in available_oris:
        # assume 1 yard average height 
        expanded = np.hstack([rl_centers,
                              1 + np.zeros((len(rl_centers),1))])
        # Mirror / translate the field coordinates for this orientation.
        if same_sgns:
            c_translation = np.zeros(2)
            c_scaling = np.ones(2)
            if change_ori:
                c_translation = max_p
                c_scaling = - np.ones(2)
        else:
            if change_ori:
                c_translation = np.array([0, max_p[1]])
                c_scaling = np.array([1, -1])
            else:
                c_translation = np.array([max_p[0], 0])
                c_scaling = np.array([-1, 1])
        expanded[:, :2] = c_scaling * expanded[:, :2] + c_translation
        expanded[:, 1] = max_p[1] - expanded[:, 1]

        to_opt = combs_generator.params
        # Start from the middle of the configured ranges.
        x0 = [np.mean(combs_generator.params_ranges[x]) for x in to_opt]
        # NOTE: get_bounds() records the issued bounds when in Kalman mode, so
        # it is deliberately called once per orientation, as before.
        bounds = combs_generator.get_bounds()
        min_func = lambda p: cost_function(
            this_tracking=this_tracking,
            expanded=expanded,
            im_centers=im_centers,
            camera_length=p[to_opt.index('camera_length')],
            camera_height=p[to_opt.index('camera_height')],
            max_p=max_p,
            zdig=p[to_opt.index('zdig')],                                 
            xdig=p[to_opt.index('xdig')],
            scaling=p[to_opt.index('scaling')],
            ret_cost_only=True,
            max_cost_thres=None)
        
        if force_local_minimize or combs_generator.is_ready:
            ret = minimize(min_func, x0=x0,
                bounds=bounds)
#             if ret.fun > max_cost_thres * ratio:
#                 print(f'Resetting due to high cost({ret.fun} > {max_cost_thres * ratio})')
#                 combs_generator.reset()
#                 bounds = combs_generator.get_bounds()
        if not combs_generator.is_ready and not force_local_minimize:
            # Global search while the generator has no reliable bounds yet;
            # fewer iterations when a bounds history already exists.
            ret = basinhopping(min_func,
                               x0=x0,niter=100 if combs_generator.buffer_starts else 5000,
                               niter_success=5 if combs_generator.buffer_starts else 100,
                minimizer_kwargs=dict(bounds=bounds))
            if ret.fun > max_cost_thres * ratio:
                print('Basin Hopping Unsuccessful! Resetting due to high cost('
                      f'{ret.fun} > {max_cost_thres * ratio})')
                combs_generator.reset()
                    
        cost = ret.fun
        if cost < min_cost:
            xdig = ret.x[to_opt.index('xdig')]
            zdig = ret.x[to_opt.index('zdig')]
            scaling = ret.x[to_opt.index('scaling')]
            camera_height = ret.x[to_opt.index('camera_height')]
            camera_length = ret.x[to_opt.index('camera_length')]
            # Re-evaluate at the optimum to obtain the actual matching, this
            # time discarding matches above the per-view cost threshold.
            found_params, found_tracking = cost_function(
                this_tracking=this_tracking,
                expanded=expanded, 
                im_centers=im_centers,
                camera_height=camera_height,
                camera_length=camera_length,
                max_p=max_p,
                zdig=zdig,
                xdig=xdig,
                scaling=scaling,
                ret_cost_only=False, 
                max_cost_thres=max_cost_thres)
            min_cost = cost
            opt_params = found_params
            opt_params['change_ori'] = change_ori
            opt_tracking = found_tracking

    if opt_params is None:
        combs_generator.reset()
        if debug:
            print('Failure')
            print(ret)
        raise RuntimeError(
            f'mapping_df: no orientation produced a cost below {min_cost} '
            f'for frame {df.video_frame.iloc[0]}')
    combs_generator.update(zdig=opt_params['zdig'],
                           xdig=opt_params['xdig'],
                           scaling=opt_params['scaling'],
                           camera_height=opt_params['camera_height'],
                           camera_length=opt_params['camera_length'])
    match_to = opt_params['match_to']
    match_from = opt_params['match_from']
    match_costs = opt_params['match_costs']
    
    df['view'] = view
    df[['im_x_remapped', 'im_y_remapped']] = im_centers

    df_cols = [col for col in df.columns if col not in ['x','y','x2im', 'y2im', 'player']]
            
    # Matched helmets (in match order) side by side with their tracking rows,
    # re-indexed with the original helmet index.
    ret = pd.concat(
        [
            df[df_cols].iloc[match_to].reset_index(drop=True),
            opt_tracking[['x','y','x2im', 'y2im', 'player']].reset_index(drop=True)
        ],
        axis=1).set_index(df.iloc[match_to].index)
    ret['cost'] = match_costs
    ret = pd.concat([ret, df.loc[[x for x in df.index if x not in ret.index], df_cols]]) # null labels for unmatched
    ret['cost'] = ret['cost'].fillna(np.inf)
    ret.rename(columns={'player':'label'},inplace=True)
    ret = ret.sort_values('left')
    
    return ret[df.columns.tolist() + ['label', 'cost']], opt_tracking, opt_params
    
def compare_and_assign(ret, previous_mapped):
    """Reuse labels from the previous frame where they had a lower cost.

    Helmets of the current mapping `ret` are paired one-to-one with helmets of
    `previous_mapped` by minimum total distance between their (`left`, `top`)
    corners. Wherever the previous helmet's `cost` is strictly lower than the
    current one, the current helmet takes the previous `label` and the mean of
    the two costs. Remaining helmets whose label was handed to another helmet
    this way are dropped from the result and flagged for reassignment.

    Parameters
    ----------
    ret, previous_mapped : DataFrames with `left`, `top`, `cost` and `label`.

    Returns
    -------
    (merged, to_reassign_flag)
        `merged`: relabelled rows followed by the untouched rows that do not
        clash with a reused label. `to_reassign_flag`: boolean array over the
        rows of the input `ret`, True for rows dropped because of a clash.

    Raises
    ------
    RuntimeError
        If the merged result contains duplicated non-null labels. (The
        original used a bare `raise`, i.e. the same exception type with the
        message "No active exception to reraise".)
    """
    hist_dist_mat = distance_matrix(
        ret[['left', 'top']].values,
        previous_mapped[['left', 'top']].values)

    matched_to, matched_from = linear_sum_assignment(hist_dist_mat)
    # Row 0: current costs, row 1: previous costs; argmin == 1 means the
    # previous assignment was strictly better.
    costs = np.array([ret.iloc[matched_to]['cost'].values,
                      previous_mapped.iloc[matched_from]['cost'].values])
    to_keep_previous = np.argmin(costs, axis=0) == 1

    matched_from = matched_from[to_keep_previous]
    matched_to = matched_to[to_keep_previous]
    matched_to_flag = np.zeros(len(ret)).astype(bool)
    matched_to_flag[matched_to] = True

    previous_rows = previous_mapped.iloc[matched_from]
    to_change_df = ret[matched_to_flag].copy()
    to_change_df['cost'] = (
        to_change_df['cost'].values + previous_rows['cost'].values) / 2
    to_change_df['label'] = previous_rows['label'].values

    to_keep_df = ret[~matched_to_flag].copy()
    reused_labels = to_change_df['label'].values
    to_reassign_flag_small = to_keep_df['label'].isin(reused_labels).values
    to_reassign_flag = (~matched_to_flag & ret['label'].isin(reused_labels)).values
    ret = pd.concat([to_change_df, to_keep_df[~to_reassign_flag_small]], axis=0)

    labelled = ret[~ret['label'].isnull()]
    duplicated = labelled['label'][labelled['label'].duplicated()].unique().tolist()
    if duplicated:
        raise RuntimeError(
            f'compare_and_assign produced duplicated labels {duplicated} '
            f'(matched_to={matched_to.tolist()}, matched_from={matched_from.tolist()})')
    return ret, to_reassign_flag

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.widgets import Slider
class MappingCheck:
    """Interactive matplotlib tool for inspecting the camera-model mapping.

    Shows a video frame with five sliders (camera height, camera length,
    scaling, x rotation, z rotation). Every slider change re-runs
    `cost_function` with the slider values and paints a coloured square at the
    projected image position of each matched player, together with a legend
    and the current cost.

    Parameters
    ----------
    img : image array shown in the figure (presumably 720x1280x3 — the
        projection in `cost_function` assumes a 1280x720 frame).
    change_ori : orientation flag (0 / 1), same meaning as in `mapping_df`.
    df : helmet boxes of a single `video_frame`.
    tracking : tracking DataFrame for the play.
    max_cost_thres : optional cost threshold; defaults to the per-view value.
        NOTE(review): it is stored but not forwarded to `cost_function` in
        `update`, matching the original behaviour — confirm whether the
        display should filter matches by it.
    init : optional dict of initial slider values keyed by parameter name.
    """
    def __init__(self, img, change_ori, df, tracking, max_cost_thres=None, init=None):
        # Local import kept from the original cell layout.
        from matplotlib.patches import Patch

        gameKey,playID,view,frame = df.video_frame.iloc[0].split('_')
        gameKey = int(gameKey)
        playID = int(playID)
        frame = int(frame)
        # A fresh Mapping is built only to obtain its parameter generator
        # (parameter names and initial ranges).
        m = Mapping(tracking, view)
        combs_generator = m.combinations_generator
        this_tracking = tracking[(tracking['gameKey']==gameKey) & (tracking['playID']==playID)]
        this_tracking = find_nearest(this_tracking, frame)

        max_p = MAX_COORDS
        if max_cost_thres is None:
            if view == 'Endzone':
                max_cost_thres = MAX_COST_ENDZONE
            else:
                max_cost_thres = MAX_COST_SIDELINE
        self.max_cost_thres = max_cost_thres
        same_sgns = 0 # the projected axes on the image need to be reflected as of x or as of y
        if view == 'Endzone':
            this_tracking['x'], this_tracking['y'] = this_tracking['y'].copy(), this_tracking['x'].copy()
            max_p = max_p[::-1]
            # the projected axes need to be reflected both as of x and as of y or stay as is
            same_sgns = 1 

        if 'conf' in df.columns:
            df = df[df['conf']>CONF_THRE].copy()

        im_centers = df[['left', 'top']].values+ (df[['width', 'height']]/2).values
        rl_centers = this_tracking[['x','y']].values

        # assume 1 yard average height 
        expanded = np.hstack([rl_centers,
                              1 + np.zeros((len(rl_centers),1))])
        self.players = this_tracking['player'].values
        # Mirror / translate the field coordinates for the requested orientation
        # (same rules as in mapping_df).
        if same_sgns:
            c_translation = np.zeros(2)
            c_scaling = np.ones(2)
            if change_ori:
                c_translation = max_p
                c_scaling = - np.ones(2)
        else:
            if change_ori:
                c_translation = np.array([0, max_p[1]])
                c_scaling = np.array([1, -1])
            else:
                c_translation = np.array([max_p[0], 0])
                c_scaling = np.array([-1, 1])
        expanded[:, :2] = c_scaling * expanded[:, :2] + c_translation
        expanded[:, 1] = max_p[1] - expanded[:, 1]

        to_opt = combs_generator.params
        bounds = combs_generator.get_bounds()
        ranges = {k: bounds[c] for c,k in enumerate(to_opt)}
        # `matplotlib.cm.get_cmap` was removed in Matplotlib 3.9;
        # `pyplot.get_cmap` is available in old and new releases alike.
        cmap = plt.get_cmap('jet')
        # One RGB colour (0-255 ints) per player.
        self.players_colors = [(np.array(cmap(x/len(self.players)))[:3] * 255).astype(int) for x in range(len(self.players))]

        self.legend_elements = [Patch(facecolor=color.astype(float)/255, edgecolor='r',
                                 label=player) for player,color in zip(self.players, self.players_colors)]
        
        self.fig, self.ax = plt.subplots()
        plt.subplots_adjust(left=0.35, bottom=0.35)
        self.img = img
        self.img_handle = self.ax.imshow(img)
        self.expanded = expanded
        self.im_centers = im_centers
        self.max_p = max_p
        self.this_tracking = this_tracking
        self.ax.margins(x=0)

        # One axes per slider, plus one for the legend / cost text.
        axcolor = 'lightgoldenrodyellow'
        self.axch = plt.axes([0.25, 0.1, 0.65, 0.03], facecolor=axcolor)
        self.axcl = plt.axes([0.25, 0.15, 0.65, 0.03], facecolor=axcolor)
        self.axsc = plt.axes([0.25, 0.2, 0.65, 0.03], facecolor=axcolor)
        self.axxd = plt.axes([0.25, 0.25, 0.65, 0.03], facecolor=axcolor)
        self.axzd = plt.axes([0.25, 0.3, 0.65, 0.03], facecolor=axcolor)
        self.axct = plt.axes([0.025, 0.5, 0.15, 0.15], facecolor=axcolor)
        self.text = self.axct.text(0, 0,  '')
        self.init_sliders(ranges, init=init)
        plt.show()
        self.update(None)

    def init_sliders(self, ranges, init=None):
        """Create the five sliders from `ranges` and hook them to `update`.

        `ranges` maps parameter name to a (min, ..., max) sequence; `init` maps
        parameter name to the initial value (defaults to the range mean).
        """
        if init is None:
            init = {k: np.mean(ranges[k]) for k in ranges}
        self.ch = Slider(self.axch, 'CamH', ranges['camera_height'][0],
                              ranges['camera_height'][-1], valinit=init['camera_height'])
        self.cl = Slider(self.axcl, 'CamL', ranges['camera_length'][0],
                              ranges['camera_length'][-1], valinit=init['camera_length'])
        self.sc = Slider(self.axsc, 'Scal', ranges['scaling'][0],
                              ranges['scaling'][-1], valinit=init['scaling'])
        self.xd = Slider(self.axxd, 'Xdig', ranges['xdig'][0],
                              ranges['xdig'][-1], valinit=init['xdig'])
        self.zd = Slider(self.axzd, 'Zdig', ranges['zdig'][0],
                              ranges['zdig'][-1], valinit=init['zdig'])
        self.ch.on_changed(self.update)
        self.cl.on_changed(self.update)
        self.sc.on_changed(self.update)
        self.xd.on_changed(self.update)
        self.zd.on_changed(self.update)
        self.leg = None

    def update(self, val):
        """Slider callback: recompute the mapping and redraw the overlay.

        `val` (the changed slider value) is ignored; all slider values are read
        directly.
        """
        camera_height = self.ch.val
        camera_length = self.cl.val
        scaling=self.sc.val
        xdig=self.xd.val
        zdig=self.zd.val
        opt_params, opt_tracking = cost_function(self.this_tracking, self.expanded, 
                                                self.im_centers, camera_height,
                                                camera_length,
                                                self.max_p, xdig, zdig, scaling)
        cp = self.img.copy()
        if opt_params is not None:
            inds = []
            for index, row in opt_tracking.iterrows():
                x = row[['x2im', 'y2im']].values.astype(float)
                if np.all(np.isfinite(x)):
                    # Skip players projected outside the 1280x720 frame.
                    if (x[0] > 1280) or (x[0]<0):
                        continue
                    if (x[1] > 720) or (x[1]<0):
                        continue
                    inds.append(np.where(self.this_tracking.index==index)[0][0])
                    # Paint a 20x20 square in the player's colour.
                    cp[int(x[1]) - 10: int(x[1]) + 10, int(x[0])-10:int(x[0]) + 10, :] = self.players_colors[
                        inds[-1]]
            l_subset = [self.legend_elements[x] for x in inds]
        if self.leg is not None:
            self.leg.remove()
        if opt_params is not None:
            self.leg = self.axct.legend(handles=l_subset, loc='center')     
            self.text.set_text(str(opt_params['cost']) + '\n' + str(len(l_subset)) )
        self.img_handle.set_data(cp)
        self.fig.canvas.draw_idle()

In [None]:
class Mapping:
    """Stateful per-video wrapper around ``mapping_df``.

    Each call maps the helmet boxes of one frame onto the tracking data. During
    the first ``init_frames`` successful calls the detected orientation, cost and
    max cost are buffered; once the buffer is full the orientation is fixed (when
    two candidates were supplied) and ``max_cost_thres`` is derived from the
    buffered max costs.

    Args:
        tracking: tracking data passed through to ``mapping_df``.
        view: ``'Sideline'`` selects ``MAX_COORDS[0]`` as the reference length,
            anything else selects ``MAX_COORDS[1]``.
        use_kalman: forwarded to ``ParamsCombinationsGenerator``.
        use_previous: when true, the previous frame's result is passed to
            ``mapping_df``.
        available_oris: candidate orientations to try.
        init_frames: number of frames buffered before the orientation and the
            cost threshold are fixed.
        ignore_starting_preproc: forwarded to ``mapping_df``.
    """

    def __init__(self, tracking, view, use_kalman=True, use_previous=True, available_oris=(0,1),
                 init_frames=3, ignore_starting_preproc=False):
        self.tracking = tracking
        self.use_kalman = use_kalman
        self.use_previous = use_previous
        self.available_oris = available_oris
        self.init_frames = init_frames
        self.ignore_starting_preproc = ignore_starting_preproc

        self.buffer_max_cost = deque(maxlen=30)
        self.buffer_ori = []
        self.buffer_costs = []
        length = MAX_COORDS[0] if view == 'Sideline' else MAX_COORDS[1]
        self.params_ranges = dict(zdig=[- pi / 3, pi / 3],
                                  xdig=[0, pi/3],
                                  camera_height=[15, 50],
                                  camera_length=[0.3 * length, 0.7 * length],
                                  scaling=[20, 80])
        self.min_perturbations = dict(zdig=np.deg2rad(5), xdig=np.deg2rad(5), scaling=0.1, camera_length=0.1,
                                      camera_height=0.1)
        self.max_perturbations = dict(zdig=np.deg2rad(10), xdig=np.deg2rad(10), scaling=1, camera_length=0.2,
                                      camera_height=0.2)

        self.combinations_generator = ParamsCombinationsGenerator(
            self.params_ranges, strictly_positive_params='scaling', min_perturbations=self.min_perturbations,
            max_perturbations=self.max_perturbations, use_kalman=use_kalman, kalman_init=1,
            obey_original_ranges=False)
        self.previous_df = None
        self.max_cost_thres = None
        self.frame = 0

    def __call__(self, this_df):
        """Map one frame of helmet boxes.

        Args:
            this_df: helmet boxes of a single frame.

        Returns:
            Tuple ``(mapped_df, this_tracking, opt_params)``. When ``mapping_df``
            raises, the result of ``mapping_df_fallback`` is returned instead and
            ``opt_params['error']`` holds the formatted traceback.
        """
        try:
            self.previous_df, this_tracking, opt_params = mapping_df(
                self.combinations_generator,
                self.tracking, this_df,
                (self.previous_df if self.use_previous else None),
                available_oris=self.available_oris,
                force_local_minimize=len(self.available_oris)==2, # we dont need much of accuracy when detecting orientation
                ignore_starting_preproc=self.ignore_starting_preproc,
            )

            if len(self.buffer_ori) < self.init_frames:
                self.buffer_ori.append(opt_params['change_ori'])
                self.buffer_max_cost.append(opt_params['max_cost'])
                self.buffer_costs.append(opt_params['cost'])
            if len(self.buffer_ori) == self.init_frames:
                ori_df = pd.DataFrame({'ori': self.buffer_ori, 'cost': self.buffer_costs})
                median_costs = ori_df.groupby('ori')['cost'].median()
                if len(self.available_oris) == 2:
                    # Store the orientation *label* (the group key). The previous
                    # code stored the selected DataFrame row, i.e. the cost.
                    # NOTE(review): the orientation with the highest median cost
                    # is kept, as before - confirm that max (not min) is intended.
                    self.available_oris = [median_costs.idxmax()]
                self.max_cost_thres = 1.1 * np.max(self.buffer_max_cost)
        except Exception:
            # Best effort: log the failure and fall back to the simple mapping.
            # KeyboardInterrupt / SystemExit are not Exceptions and propagate.
            traceback.print_exc()
            self.previous_df, this_tracking, opt_params = mapping_df_fallback(self.tracking, this_df)
            opt_params['error'] = traceback.format_exc()
        return self.previous_df, this_tracking, opt_params


def apply_on_video(tracking, video_df):
    """Run the helmet-to-player mapping on every frame of one video.

    First samples ``init_frames`` frames to vote on the view orientation, then
    maps all frames with that orientation fixed.

    Args:
        tracking: tracking data handed to ``Mapping``.
        video_df: helmet boxes of a single video with ``frame`` and
            ``video_frame`` columns; the view is the third ``_``-separated field
            of the first ``video_frame`` value.

    Returns:
        Concatenation of the per-frame mapped DataFrames.

    Raises:
        KeyboardInterrupt: re-raised after, in debug mode, displaying the
            optimisation parameters collected so far.
    """
    submission_df_list = []
    df_list = list(video_df.groupby('frame'))
    view = video_df.iloc[0]['video_frame'].split('_')[2]
    ori_mapping = Mapping(tracking, view=view, use_kalman=False, use_previous=False, ignore_starting_preproc=True)
    print('Detecting video view orientation...')
    sample_positions = np.linspace(1, len(df_list) - 1, ori_mapping.init_frames).astype(int)
    # A single-frame video would otherwise ask for position 1 -> IndexError.
    sample_positions = np.clip(sample_positions, 0, len(df_list) - 1)
    for position in tqdm(sample_positions):
        _, this_df = df_list[position]
        ori_mapping(this_df)
    # Fall back to both orientations when every sampled frame failed.
    detected_ori = [mode(ori_mapping.buffer_ori)] if ori_mapping.buffer_ori else [0,1]

    print('Detected orientation:',detected_ori, '. Mapping...')

    mapping = Mapping(tracking,view=view,
                      available_oris=detected_ori)
    opt_params_dict = {}
    try:
        for frame, this_df in tqdm(df_list):
            df, _, opt_params = mapping(this_df)
            if debug:
                opt_params_dict[frame] = opt_params
            submission_df_list.append(df)
        submission_df = pd.concat(submission_df_list)
    except KeyboardInterrupt:
        if debug:
            with pd.option_context('display.max_rows', None, 'display.max_columns', None):
                display(pd.DataFrame(opt_params_dict).T)
        raise
    return submission_df

In [None]:
# videos_dfs = list(helmets.groupby('video'))
# if len(videos_dfs) == 1:
#     submission_df_list = [apply_on_video(tracking, videos_dfs[0][1])]
# else:
#     submission_df_list = Parallel(n_jobs=-1)(delayed(apply_on_video)(tracking, video_df) for _, video_df in tqdm(videos_dfs))
# submission_df = pd.concat(submission_df_list)
# submission_df.to_csv('submission-baseline.csv', index=False)

In [None]:
# if debug:
#     scorer = NFLAssignmentScorer(labels)
#     baseline_score = scorer.score(submission_df[~submission_df['label'].isnull()])
#     print(f"validation score {baseline_score:0.4f}")

## Score the predictions before applying deepsort postprocessing

The scores are roughly 0.5.

# Deepsort Postprocessing

Deepsort is a popular framework for object tracking within video. 
- [This blog post](https://nanonets.com/blog/object-tracking-deepsort/) shows some examples of it being put to use.
- This notebook shows how to apply deepsort to this helmet dataset: https://www.kaggle.com/s903124/nfl-helmet-with-yolov5-deepsort-starter
- You can also read the paper for deepsort here: https://arxiv.org/pdf/1703.07402.pdf

The approach is fairly simple:
1. Step through each frame in a video and apply the deepsort algorithm. This clusters helmets across frames when it is the same player/helmet.
2. Group by each of these deepsort clusters - and pick the most common label for that cluster. Then override all of the predictions for that helmet to the same player.

## Importing Yolov5 pretrained from dataset

In [None]:

# Build the YOLOv5m architecture from the local repo copy. `pretrained` must be the
# boolean False: the previous string 'false' is truthy and therefore requested
# pretrained weights. This first model is replaced on the next line.
# NOTE(review): presumably this call is kept so the YOLOv5 package is importable
# when the checkpoint below is unpickled - confirm.
model = torch.hub.load('../input/yolov5-git/yolov5-master/yolov5-master', 'yolov5m', pretrained=False, classes=1,  source='local')
# Load the fine-tuned helmet detector; the checkpoint stores the module under 'model'.
model = torch.load('../input/helmet-yolov5m/helmet_yolov5m.pt', map_location='cuda')['model']

## Importing Deepsort from dataset
Because your submission is not allowed to use internet access, you can reference the deepsort codebase from the attached dataset. Deepsort also has a dependency of `easydict` which I've also added as a dataset.

In [None]:

# Make the offline copy of easydict importable.
sys.path.append('../input/easydict-master/easydict-master/')
# https://github.com/mikel-brostrom/Yolov5_DeepSort_Pytorch
model_import_path = '../input/yolov5-deepsort-pytorch/Yolov5_DeepSort_Pytorch-master/Yolov5_DeepSort_Pytorch-master/deep_sort_pytorch/'
# Make the deep_sort package inside the offline repo copy importable.
sys.path.append(model_import_path)
from deep_sort.deep_sort import DeepSort
# Load utils/parser.py directly from its file path under the module name "parser"
# and take `get_config` from it.
spec = importlib.util.spec_from_file_location("parser", os.path.join(model_import_path, 'utils', 'parser.py'))
foo = importlib.util.module_from_spec(spec)
spec.loader.exec_module(foo)
get_config = foo.get_config

## Deepsort config

Deepsort uses a config yaml file for some settings. These are just the default configs and could be improved.

In [None]:
%%writefile deepsort.yaml

DEEPSORT:
  REID_CKPT: "../input/yolov5-deepsort-pytorch/ckpt.t7"
  MAX_DIST: 0.1
  MIN_CONFIDENCE: 0.4
  NMS_MAX_OVERLAP: 0.5
  MAX_IOU_DISTANCE: 0.9
  MAX_AGE: 15
  N_INIT: 1
  NN_BUDGET: 100


### Load Deepsort Config

In [None]:
# Create a config object and merge in the settings written to deepsort.yaml by the
# previous cell; `cfg.DEEPSORT.*` is read when DeepSort trackers are built.
cfg = get_config()
cfg.merge_from_file('deepsort.yaml')

In [None]:
"""
Helper functions from yolov5 to plot deepsort labels.
"""

def compute_color_for_id(label):
    """Return a fixed 3-channel colour for a numeric id.

    Each channel is ``(p * (label**2 - label + 1)) % 255`` for one of three
    constant multipliers, so the same id always gets the same colour.

    Args:
        label: numeric id.

    Returns:
        Tuple of three ints in ``[0, 254]``.
    """
    multipliers = (2 ** 11 - 1, 2 ** 15 - 1, 2 ** 20 - 1)
    factor = label ** 2 - label + 1

    channels = []
    for multiplier in multipliers:
        channels.append(int((multiplier * factor) % 255))
    return tuple(channels)

def plot_one_box(x, im, color=None, label=None, line_thickness=3):
    """Draw one bounding box, optionally with a filled label tag, on ``im`` using OpenCV.

    Args:
        x: box as ``[x1, y1, x2, y2]``; values are cast to int.
        im: image array, drawn on in place.
        color: box colour; three random channel values are used when falsy.
        label: text drawn in a filled tag above the top-left corner; skipped
            when falsy.
        line_thickness: line thickness; derived from the image size when falsy.

    Returns:
        The same ``im`` that was passed in.
    """
    assert im.data.contiguous, 'Image not contiguous. Apply np.ascontiguousarray(im) to plot_on_box() input image.'
    thickness = line_thickness
    if not thickness:
        thickness = round(0.002 * (im.shape[0] + im.shape[1]) / 2) + 1  # line/font thickness
    if not color:
        color = [random.randint(0, 255) for _ in range(3)]

    top_left = (int(x[0]), int(x[1]))
    bottom_right = (int(x[2]), int(x[3]))
    cv2.rectangle(im, top_left, bottom_right, color, thickness=thickness, lineType=cv2.LINE_AA)
    if not label:
        return im

    font_thickness = max(thickness - 1, 1)
    font_scale = thickness / 3
    text_w, text_h = cv2.getTextSize(label, 0, fontScale=font_scale, thickness=font_thickness)[0]
    tag_corner = top_left[0] + text_w, top_left[1] - text_h - 3
    cv2.rectangle(im, top_left, tag_corner, color, -1, cv2.LINE_AA)  # filled tag behind the text
    cv2.putText(im, label, (top_left[0], top_left[1] - 2), 0, font_scale, [225, 255, 255],
                thickness=font_thickness, lineType=cv2.LINE_AA)
    return im

## Class to detect the playing region
Applies morphological operations to the image and tries to remove edge cases based on the white lines at the field borders.

In [None]:
def markers2img(markers):
    """Render a marker array as a JET colour-mapped 8-bit image.

    Values are min-max scaled to ``[0, 255]``; the divisor is floored at 0.1 so
    a constant array does not divide by zero.
    """
    lowest = np.min(markers)
    value_range = np.maximum(0.1, (np.max(markers) - lowest))
    scaled = 255 * (markers - lowest) / value_range
    as_bytes = scaled.astype(np.uint8)
    return cv2.applyColorMap(as_bytes, cv2.COLORMAP_JET)

class ValidRegionTracker:
    """Estimates which pixels of a frame belong to the playing region.

    Works on a 512x512 resized copy of the frame. Bright regions touching the
    image border are grown with watershed, the longest straight edges of those
    regions are drawn into a boundary map, and a second watershed plus a large
    morphological closing yields the excluded area. The boundary map is kept in
    ``self.boundary_flag`` and (eroded) seeds the next call, so the instance is
    stateful across frames.
    """

    def __init__(self):
        self.state_surface = None  # not used by the methods of this class
        self.boundary_flag = None  # excluded-area mask of the previous frame (uint8) or None
        self.input_shape = (512,512)
        self.mask5 = np.ones((5,5), np.uint8)
        self.mask3 = np.ones((3,3), np.uint8)
        self.mask15 = np.ones((15,15), np.uint8)
        self.large_mask = np.ones((self.input_shape[1]//5,self.input_shape[0]//5), np.uint8)

    def detect(self, image_data):
        """Return a boolean mask at the input resolution; True marks pixels kept by ``get_mask``.

        Args:
            image_data: image array; converted with ``COLOR_RGB2HLS`` in
                ``get_mask``, so RGB channel order is expected.
        """
        og_shape = image_data.shape[:2][::-1]  # (width, height) as cv2.resize expects
        mask = self.get_mask(cv2.resize(image_data, self.input_shape))
        # The interpolation flag must be passed by keyword: positionally the 3rd-5th
        # arguments of cv2.resize are dst, fx and fy, so the previous call silently
        # used the default linear interpolation on the binary mask.
        return cv2.resize(
                    mask.astype(np.uint8), og_shape, interpolation=cv2.INTER_NEAREST) > 0

    def get_mask(self, image_data):
        """Compute the valid-region mask for an already resized image and update ``self.boundary_flag``.

        Returns:
            Boolean array that is True where the updated ``self.boundary_flag`` is 0.
        """
        hls_img = cv2.cvtColor(image_data,  cv2.COLOR_RGB2HLS)
        # Bright pixels: lightness channel above 150.
        white_obj_mask = cv2.threshold(hls_img[:,:,1],150,1, cv2.THRESH_BINARY)[1]
        seeds = cv2.erode(
            cv2.morphologyEx(white_obj_mask.astype(np.uint8), cv2.MORPH_OPEN, self.mask5),
            self.mask3)
        # Keep only seeds within 3 px of the image border.
        seeds[3:-3,3:-3] = 0
        sure_fg = seeds
        # Moderately bright pixels (lightness > 100) that are not seeds are left
        # for watershed to assign.
        unknown = cv2.subtract(cv2.threshold(hls_img[:,:,1],100,1,cv2.THRESH_BINARY)[1],sure_fg)
        _, markers = cv2.connectedComponents(sure_fg)
        markers = markers+1
        markers[unknown==1] = 0
        img = cv2.cvtColor(white_obj_mask * 255, cv2.COLOR_GRAY2RGB)
        cv2.watershed(img, markers)
        # Labels > 1 are the regions grown from the border seeds.
        white_obj_on_im_edges = (markers>1).astype(np.uint8)
        white_obj_on_im_edges = cv2.morphologyEx(white_obj_on_im_edges, cv2.MORPH_CLOSE,self.mask15)
        white_obj_on_im_edges = cv2.morphologyEx(white_obj_on_im_edges, cv2.MORPH_OPEN, self.mask15)
        from math import pi, sqrt
        boundary_flag = np.zeros(white_obj_on_im_edges.shape[:2])
        if self.boundary_flag is not None:
            # Start from a shrunken version of the previous frame's result.
            boundary_flag = cv2.erode(self.boundary_flag, self.mask15)
        to_detect_edges = white_obj_on_im_edges
        edges = cv2.morphologyEx(
                cv2.Canny(to_detect_edges * 255,0,1,apertureSize = 3),cv2.MORPH_CLOSE,
                self.mask15)
        lines = cv2.HoughLinesP(
            edges,
            1, pi/180,100,maxLineGap=3
            )
        if lines is not None:
            # keep 4 largest
            lines_lengths = [ sqrt((x2-x1)**2 + (y2-y1)**2) for (x1,y1,x2,y2) in [l[0] for l in lines]]
            lines = lines[np.argsort(lines_lengths)[-4:],:,:]
            for line in lines:
                x1,y1,x2,y2 = line[0]
                cv2.line(boundary_flag,(x1,y1),(x2,y2),1,2)
        # Second watershed: grow the drawn boundary lines over the border regions.
        sure_fg = boundary_flag.astype(np.uint8)
        unknown = cv2.subtract(white_obj_on_im_edges,sure_fg)
        _, markers = cv2.connectedComponents(sure_fg)
        markers = markers+1
        markers[unknown==1] = 0
        img = cv2.cvtColor(
            white_obj_on_im_edges*255, cv2.COLOR_GRAY2RGB)
        cv2.watershed(
            img, markers)
        markers = (markers > 1).astype(np.uint8)
        self.boundary_flag = cv2.morphologyEx(
            markers, cv2.MORPH_CLOSE, self.large_mask)

        return self.boundary_flag == 0


## Functions to apply deepsort to helmet boxes.

Below are two functions. `deepsort_helmets` runs deepsort across a video. There is a lot of room for improving this function: the merging of deepsort labels onto the original helmet boxes is currently done in a very crude manner.

`add_deepsort_label_col` maps the most common label to each deepsort cluster.

## YOLOv5 code copied for NMS and image resizing

In [None]:

def xywh2xyxy(x):
    """Convert nx4 boxes from centre format to corner format.

    Args:
        x: ``torch.Tensor`` or array-like of rows ``[cx, cy, w, h]``.

    Returns:
        A copy of ``x`` (same container type) whose first four columns are
        ``[x1, y1, x2, y2]`` with (x1, y1) top-left and (x2, y2) bottom-right.
        The input is not modified.
    """
    corners = x.clone() if isinstance(x, torch.Tensor) else np.copy(x)
    half_w = x[:, 2] / 2
    half_h = x[:, 3] / 2
    corners[:, 0] = x[:, 0] - half_w  # top left x
    corners[:, 1] = x[:, 1] - half_h  # top left y
    corners[:, 2] = x[:, 0] + half_w  # bottom right x
    corners[:, 3] = x[:, 1] + half_h  # bottom right y
    return corners
def non_max_suppression(prediction, conf_thres=0.25, iou_thres=0.45, classes=None, agnostic=False, multi_label=False,
                        labels=(), max_det=300):
    """Runs Non-Maximum Suppression (NMS) on inference results

    Args:
        prediction: tensor indexed as [image, candidate, 5 + nc]; per candidate the
            first four values are a centre-format box (converted with xywh2xyxy),
            index 4 is the objectness score and the rest are class scores.
        conf_thres: threshold applied first to objectness and then to
            objectness * class score.
        iou_thres: IoU threshold handed to torchvision.ops.nms.
        classes: when not None, only detections whose class equals one of these
            values are kept.
        agnostic: when true, boxes of different classes suppress each other
            (no per-class coordinate offset).
        multi_label: allow several classes per box; forced off when nc == 1.
        labels: optional per-image a-priori labels appended to the candidates
            (rows of [cls, x, y, w, h]).
        max_det: maximum number of detections kept per image.

    Returns:
         list of detections, on (n,6) tensor per image [xyxy, conf, cls]
    """

    nc = prediction.shape[2] - 5  # number of classes
    xc = prediction[..., 4] > conf_thres  # candidates

    # Checks
    assert 0 <= conf_thres <= 1, f'Invalid Confidence threshold {conf_thres}, valid values are between 0.0 and 1.0'
    assert 0 <= iou_thres <= 1, f'Invalid IoU {iou_thres}, valid values are between 0.0 and 1.0'

    # Settings
    min_wh, max_wh = 2, 4096  # (pixels) minimum and maximum box width and height
    max_nms = 30000  # maximum number of boxes into torchvision.ops.nms()
    time_limit = 10.0  # seconds to quit after
    redundant = True  # require redundant detections
    multi_label &= nc > 1  # multiple labels per box (adds 0.5ms/img)
    merge = False  # use merge-NMS

    t = time.time()
    # Every image starts with the SAME empty (0, 6) tensor object; entries are
    # replaced below only for images that produce detections.
    output = [torch.zeros((0, 6), device=prediction.device)] * prediction.shape[0]
    for xi, x in enumerate(prediction):  # image index, image inference
        # Apply constraints
        # x[((x[..., 2:4] < min_wh) | (x[..., 2:4] > max_wh)).any(1), 4] = 0  # width-height
        x = x[xc[xi]]  # confidence

        # Cat apriori labels if autolabelling
        if labels and len(labels[xi]):
            l = labels[xi]
            v = torch.zeros((len(l), nc + 5), device=x.device)
            v[:, :4] = l[:, 1:5]  # box
            v[:, 4] = 1.0  # conf
            v[range(len(l)), l[:, 0].long() + 5] = 1.0  # cls
            x = torch.cat((x, v), 0)

        # If none remain process next image
        if not x.shape[0]:
            continue

        # Compute conf
        x[:, 5:] *= x[:, 4:5]  # conf = obj_conf * cls_conf

        # Box (center x, center y, width, height) to (x1, y1, x2, y2)
        box = xywh2xyxy(x[:, :4])

        # Detections matrix nx6 (xyxy, conf, cls)
        if multi_label:
            i, j = (x[:, 5:] > conf_thres).nonzero(as_tuple=False).T
            x = torch.cat((box[i], x[i, j + 5, None], j[:, None].float()), 1)
        else:  # best class only
            conf, j = x[:, 5:].max(1, keepdim=True)
            x = torch.cat((box, conf, j.float()), 1)[conf.view(-1) > conf_thres]

        # Filter by class
        if classes is not None:
            x = x[(x[:, 5:6] == torch.tensor(classes, device=x.device)).any(1)]

        # Apply finite constraint
        # if not torch.isfinite(x).all():
        #     x = x[torch.isfinite(x).all(1)]

        # Check shape
        n = x.shape[0]  # number of boxes
        if not n:  # no boxes
            continue
        elif n > max_nms:  # excess boxes
            x = x[x[:, 4].argsort(descending=True)[:max_nms]]  # sort by confidence

        # Batched NMS
        # Shifting each box by class_index * max_wh keeps boxes of different classes
        # from overlapping, so a single nms() call handles all classes at once.
        c = x[:, 5:6] * (0 if agnostic else max_wh)  # classes
        boxes, scores = x[:, :4] + c, x[:, 4]  # boxes (offset by class), scores
        i = torchvision.ops.nms(boxes, scores, iou_thres)  # NMS
        if i.shape[0] > max_det:  # limit detections
            i = i[:max_det]
        # `merge` is hard-coded to False above, so this branch (and box_iou) is never reached.
        if merge and (1 < n < 3E3):  # Merge NMS (boxes merged using weighted mean)
            # update boxes as boxes(i,4) = weights(i,n) * boxes(n,4)
            iou = box_iou(boxes[i], boxes) > iou_thres  # iou matrix
            weights = iou * scores[None]  # box weights
            x[i, :4] = torch.mm(weights, x[:, :4]).float() / weights.sum(1, keepdim=True)  # merged boxes
            if redundant:
                i = i[iou.sum(1) > 1]  # require redundancy

        output[xi] = x[i]
        if (time.time() - t) > time_limit:
            print(f'WARNING: NMS time limit {time_limit}s exceeded')
            break  # time limit exceeded

    return output

def letterbox(im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleFill=False, scaleup=True, stride=32):
    """Resize an image with unchanged aspect ratio and pad it with a constant border.

    Args:
        im: image array indexed [height, width, ...].
        new_shape: target (height, width), or a single int for a square target.
        color: border fill value.
        auto: pad only up to the next multiple of ``stride`` instead of the full
            target shape (minimum rectangle).
        scaleFill: when ``auto`` is false, stretch to the target without padding.
        scaleup: allow enlarging; when false the image is only ever shrunk.
        stride: stride multiple used by ``auto``.

    Returns:
        Tuple ``(image, (width_ratio, height_ratio), (pad_w, pad_h))`` where the
        pads are the per-side padding (half of the total).
    """
    src_h, src_w = im.shape[:2]
    if isinstance(new_shape, int):
        new_shape = (new_shape, new_shape)
    target_h, target_w = new_shape[0], new_shape[1]

    # Scale factor (new / old) that fits the image inside the target.
    scale = min(target_h / src_h, target_w / src_w)
    if not scaleup:
        scale = min(scale, 1.0)

    ratio = (scale, scale)
    resized_wh = (int(round(src_w * scale)), int(round(src_h * scale)))
    pad_w = target_w - resized_wh[0]
    pad_h = target_h - resized_wh[1]
    if auto:
        pad_w, pad_h = np.mod(pad_w, stride), np.mod(pad_h, stride)
    elif scaleFill:
        pad_w, pad_h = 0.0, 0.0
        resized_wh = (target_w, target_h)
        ratio = (target_w / src_w, target_h / src_h)

    # Split the padding evenly between the two sides.
    pad_w /= 2
    pad_h /= 2

    if (src_w, src_h) != resized_wh:
        im = cv2.resize(im, resized_wh, interpolation=cv2.INTER_LINEAR)
    # The -0.1 / +0.1 offsets send an odd total pad to floor on one side and ceil on the other.
    top = int(round(pad_h - 0.1))
    bottom = int(round(pad_h + 0.1))
    left = int(round(pad_w - 0.1))
    right = int(round(pad_w + 0.1))
    im = cv2.copyMakeBorder(im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
    return im, ratio, (pad_w, pad_h)
def clip_coords(boxes, shape):
    """Clip xyxy boxes in place to an image of the given (height, width).

    Args:
        boxes: ``torch.Tensor`` or numpy array with columns ``x1, y1, x2, y2``;
            modified in place.
        shape: sequence whose first two items are image height and width.

    Returns:
        None.
    """
    max_y, max_x = shape[0], shape[1]
    if isinstance(boxes, torch.Tensor):
        # Tensors: clamp each column view in place.
        for column, upper in ((0, max_x), (1, max_y), (2, max_x), (3, max_y)):
            boxes[:, column].clamp_(0, upper)
        return
    # numpy arrays: clip the x columns and the y columns as groups.
    x_columns, y_columns = [0, 2], [1, 3]
    boxes[:, x_columns] = boxes[:, x_columns].clip(0, max_x)
    boxes[:, y_columns] = boxes[:, y_columns].clip(0, max_y)

def scale_coords(img1_shape, coords, img0_shape, ratio_pad=None):
    """Map xyxy boxes from a letterboxed image back to the original image, in place.

    Args:
        img1_shape: (height, width, ...) of the letterboxed image.
        coords: boxes with columns ``x1, y1, x2, y2``; modified in place.
        img0_shape: (height, width, ...) of the original image.
        ratio_pad: optional ``((gain, ...), (pad_x, pad_y))``; when None, gain
            and padding are derived from the two shapes.

    Returns:
        The same ``coords`` object, un-padded, rescaled and clipped to ``img0_shape``.
    """
    if ratio_pad is not None:
        gain = ratio_pad[0][0]
        pad = ratio_pad[1]
    else:
        # gain = letterboxed size / original size; padding is the centred remainder.
        gain = min(img1_shape[0] / img0_shape[0], img1_shape[1] / img0_shape[1])
        pad = (img1_shape[1] - img0_shape[1] * gain) / 2, (img1_shape[0] - img0_shape[0] * gain) / 2

    pad_x, pad_y = pad[0], pad[1]
    coords[:, [0, 2]] -= pad_x
    coords[:, [1, 3]] -= pad_y
    coords[:, :4] /= gain
    clip_coords(coords, img0_shape)
    return coords

## Define Yolov5 Detector helper class

In [None]:
class Detector:
    """Thin wrapper that runs a YOLOv5 model on RGB frames and returns helmet boxes.

    Args:
        model: torch module; its first parameter decides whether inputs are moved to CUDA.
        shape: side length of the square letterbox target.
        nms_conf_thres: confidence threshold for NMS.
        nms_iou_thres: IoU threshold for NMS.
        max_det: maximum detections kept per image.

    The NMS settings used to be hard-coded in ``convert_pred`` (0.25 / 0.45 / 22)
    while the constructor arguments were ignored. They are now honoured, and the
    defaults equal the previously hard-coded values so ``Detector(model)`` behaves
    as before.
    """

    def __init__(self, model, shape=832, nms_conf_thres=0.25, nms_iou_thres=0.45, max_det=22):
        self.model = model
        self.shape = shape
        self.cuda = next(self.model.parameters()).is_cuda
        self.nms_conf_thres = nms_conf_thres
        self.nms_iou_thres = nms_iou_thres
        self.max_det = max_det
        # Shapes of the most recently preprocessed image; used to map boxes back.
        self.ori_shape = None
        self.new_shape = None

    def preprocess(self, img):
        """Letterbox one HxWxC image and return it as a 1xCxHxW half tensor scaled to [0, 1].

        Also records the original and letterboxed shapes on the instance.
        """
        self.ori_shape = img.shape
        img = letterbox(img, (self.shape, self.shape), stride=32, auto=True)[0]
        self.new_shape = img.shape
        img = np.moveaxis(img, 2, 0)

        img = torch.from_numpy(img)
        if self.cuda:
            img = img.to('cuda')
        img = img.half()  # uint8 to fp16
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        if img.ndimension() == 3:
            img = img.unsqueeze(0)
        return img

    def convert_pred(self, preds):
        """Apply NMS to raw predictions and rescale boxes to the original image.

        The rescaling uses the shapes recorded by the last ``preprocess`` call, so
        all images of a batch must share one shape.

        Returns:
            One ``(n, 5)`` array ``[x1, y1, x2, y2, conf]`` when ``preds`` holds a
            single image, otherwise a list of such arrays.
        """
        bboxes_list = []
        for pred in preds:
            bboxes = non_max_suppression(pred.unsqueeze(0), conf_thres=self.nms_conf_thres,
                                         max_det=self.max_det, iou_thres=self.nms_iou_thres,
                                         classes=0, agnostic=False)[0]
            bboxes[:, :4] = scale_coords(self.new_shape, bboxes[:, :4], self.ori_shape[:2]).round()
            bboxes_np = bboxes[:, :5].cpu().numpy()
            bboxes_np[:, :4] = bboxes_np[:, :4].astype(int)
            bboxes_list.append(bboxes_np)
        if len(bboxes_list) == 1:
            return bboxes_list[0]
        return bboxes_list

    def detect(self, img):
        """Detect on one image or a list of images without tracking gradients."""
        if not isinstance(img, list):
            img = [img]
        with torch.no_grad():
            data = torch.cat([self.preprocess(im) for im in img], 0)
            return self.convert_pred(self.model(data)[0])

    def __call__(self, img):
        return self.detect(img)
detector = Detector(model)

In [None]:
video_dir

In [None]:
%matplotlib inline
def detect_helmets(detector, tracking_group, game_play,
                   video_dir, batch_size, view=None, frames_step=None):
    """Generator that runs the detector over a play's videos in batches.

    Args:
        detector: callable taking a list of RGB images and returning one box
            array (single image) or a list of box arrays, each with columns
            ``x1, y1, x2, y2, conf``.
        tracking_group: unused here; kept for interface compatibility.
        game_play: video name prefix (``<game>_<play>``).
        video_dir: directory prefix; the file opened is
            ``f'{video_dir}{game_play}_{view}.mp4'``.
        batch_size: number of frames per detector call.
        view: ``None`` for both ``'Endzone'`` and ``'Sideline'``, a single view
            name, or an iterable of view names.
        frames_step: when given, seek to every ``frames_step``-th frame instead
            of reading sequentially.

    Yields:
        ``(d_df, imgs)`` per batch. ``d_df`` holds the detections with
        ``conf > CONF_THRE``. ``imgs`` holds the RGB images of the frames that
        still have at least one detection, in frame order, so
        ``zip(d_df.groupby(['frame', 'view']), imgs)`` pairs each frame with its
        own image.
    """
    if view is None:
        views = ['Endzone', 'Sideline']
    elif isinstance(view, str):
        views = [view]
    else:
        views = list(view)  # was `views = views`, which ignored the argument

    for view in views:
        video = game_play + '_' + view
        cap = cv2.VideoCapture(f'{video_dir}{video}.mp4')
        try:
            frame = 0
            imgs = []
            frame_ids = []  # frame number of each buffered image
            while True:
                if frames_step is not None:
                    cap.set(cv2.CAP_PROP_POS_FRAMES, frame)
                    frame += frames_step
                success, image = cap.read()
                if success:
                    if frames_step is None:
                        frame += 1
                    imgs.append(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
                    # Record the id at read time. Deriving it afterwards from
                    # `batch_size` mislabels the frames of a final partial batch.
                    frame_ids.append(frame)
                if (len(imgs) == batch_size) or (not success and len(imgs) > 0):
                    with torch.no_grad():
                        bboxes_list = detector(imgs)
                    # The detector returns a bare array for a single image, which
                    # also happens for a final batch of one when batch_size > 1.
                    if isinstance(bboxes_list, np.ndarray):
                        bboxes_list = [bboxes_list]
                    d_df = pd.concat(
                        [pd.DataFrame({'left': bboxes[:, 0], 'right': bboxes[:, 2],
                                       'width': bboxes[:, 2] - bboxes[:, 0],
                                       'top': bboxes[:, 1], 'bottom': bboxes[:, 3],
                                       'height': bboxes[:, 3] - bboxes[:, 1],
                                       'conf': bboxes[:, 4],
                                       'video_frame': [f'{video}_{frame_id}'] * len(bboxes),
                                       'view': [view] * len(bboxes),
                                       'frame': np.zeros(len(bboxes)) + frame_id})
                         for frame_id, bboxes in zip(frame_ids, bboxes_list)], axis=0)
                    # .copy() so the column assignments below do not write into a slice.
                    d_df = d_df[d_df['conf'] > CONF_THRE].copy()
                    d_df['x'] = (d_df['left'] + round(d_df['width'] / 2))
                    d_df['y'] = (d_df['top'] + round(d_df['height'] / 2))
                    # Drop images of frames without any remaining detection: such
                    # frames produce no group and would shift every later image.
                    kept_frames = set(d_df['frame'].astype(int))
                    yield d_df, [img for frame_id, img in zip(frame_ids, imgs)
                                 if frame_id in kept_frames]
                    imgs = []
                    frame_ids = []
                if not success:
                    break
        finally:
            cap.release()


def detect_orientation(detector, tracking_group, game_play, video_dir, view=None, frames_step=30):
    """Vote on the orientation of each view from sparsely sampled frames.

    Detects helmets on every ``frames_step``-th frame, runs an orientation-only
    ``Mapping`` on the sampled frames of each view and takes the most common
    ``change_ori`` value.

    Args:
        detector, tracking_group, game_play, video_dir, view: forwarded to
            ``detect_helmets`` / ``Mapping``.
        frames_step: sampling step in frames.

    Returns:
        Dict ``view -> [orientation]``. When no sampled frame of a view could be
        mapped the entry is ``[0, 1]`` (both candidates), the same fallback
        ``apply_on_video`` uses. Views without any sampled detection are absent.
    """
    print('Detecting video view orientation...')
    frames_by_view = {}

    for frames_d_df, _ in tqdm(
        detect_helmets(detector, tracking_group, game_play, video_dir, 1, view=view, frames_step=frames_step)):
        # With batch size 1 each yielded frame holds at most one (frame, view) group.
        for (_, frame_view), d_df in frames_d_df.groupby(['frame', 'view']):
            frames_by_view.setdefault(frame_view, []).append(d_df.reset_index(drop=True))

    ret = {}
    for frame_view, sampled in frames_by_view.items():
        ori_mapping = Mapping(tracking_group, frame_view, use_kalman=False, use_previous=False,
                              init_frames=len(sampled), ignore_starting_preproc=True)
        df_list = list(pd.concat(sampled).groupby('frame'))
        positions = np.linspace(1, len(df_list) - 1, ori_mapping.init_frames).astype(int)
        # With a single sampled frame linspace yields position 1 -> IndexError.
        positions = np.clip(positions, 0, len(df_list) - 1)
        for position in tqdm(positions):
            _, this_df = df_list[position]
            ori_mapping(this_df)
        # mode() raises on an empty buffer (every call fell back); keep both candidates then.
        ret[frame_view] = [mode(ori_mapping.buffer_ori)] if ori_mapping.buffer_ori else [0, 1]

    return ret

def deepsort_helmets(detector,
                     tracking_group,
                     game_play,
                     video_dir,
                     deepsort_config='deepsort.yaml',
                     plot=False,
                     batch_size=64,
                     plot_frames=[],
                     save_all_to_dir=False,use_cuda=True, view=None):
    """Detect helmets, map them to players and attach deepsort cluster ids for one play.

    For every frame the detections are mapped with ``Mapping``, the mapped boxes
    are fed to a ``DeepSort`` tracker (one tracker and one ``Mapping`` per view),
    and the tracker output is merged back onto the mapped boxes by nearest
    ``left`` coordinate.

    Args:
        detector: helmet detector passed to ``detect_helmets``.
        tracking_group: tracking rows of this play (``gameKey`` / ``playID`` are
            read to build the debug output folder name).
        game_play: video name prefix.
        video_dir: directory prefix of the videos.
        deepsort_config: unused; the module-level ``cfg`` is read instead.
        plot: display every frame after ``cfg.DEEPSORT.N_INIT``.
        batch_size: detector batch size.
        plot_frames: frame numbers to display (only read, never modified).
        save_all_to_dir: write a per-frame ``.xlsx`` and ``.png`` to
            ``tmp/out/<play>_<view>``.
        use_cuda: forwarded to ``DeepSort``.
        view: restrict processing to one view.

    Returns:
        Concatenation of the per-frame mapped DataFrames.
    """
    video_prefix = tracking_group['gameKey'].iloc[0].astype(str) + '_' + \
        tracking_group["playID"].iloc[0].astype(str).zfill(6)

    ds = []

    oris = detect_orientation(detector, tracking_group, game_play, video_dir, view=view)
    print('Detected orientation:',oris, '. Mapping...')
    old_view = None
    for frames_d_df, imgs in tqdm(detect_helmets(detector, tracking_group, game_play,
                                                 video_dir, batch_size, view=view)):
        # zip() pairs frame groups with images by position, so it relies on
        # detect_helmets yielding exactly one image per frame group, in frame order.
        for ([frame, frame_view], d_df), image in zip(frames_d_df.groupby(['frame', 'view']), imgs):
            if frame_view != old_view:
                # New view: start a fresh tracker and a fresh mapping state.
                old_view = frame_view
                deepsort = DeepSort(cfg.DEEPSORT.REID_CKPT,
                        max_dist=cfg.DEEPSORT.MAX_DIST,
                        min_confidence=cfg.DEEPSORT.MIN_CONFIDENCE,
                        nms_max_overlap=cfg.DEEPSORT.NMS_MAX_OVERLAP,
                        max_iou_distance=cfg.DEEPSORT.MAX_IOU_DISTANCE,
                        max_age=cfg.DEEPSORT.MAX_AGE,
                        n_init=cfg.DEEPSORT.N_INIT,
                        nn_budget=cfg.DEEPSORT.NN_BUDGET,
                        use_cuda=use_cuda)
                # A view whose sampled frames had no detections is missing from
                # `oris`; try both orientations instead of raising KeyError.
                mapping = Mapping(tracking_group, frame_view, available_oris=oris.get(frame_view, [0, 1]))
                if save_all_to_dir:
                    out_dir = f'tmp/out/{video_prefix}_{frame_view}'
                    try:
                        shutil.rmtree(out_dir)
                    except IOError:
                        pass
                    os.makedirs(out_dir)

            mapped_df, this_tracking, opt_params = mapping(d_df)
            mapped_df['x'] = (mapped_df['left'] + round(mapped_df['width'] / 2))
            mapped_df['y'] = (mapped_df['top'] + round(mapped_df['height'] / 2))
            mapped_df['left'] = mapped_df['left'].astype(int)
            mapped_df['top'] = mapped_df['top'].astype(int)

            xywhs = mapped_df[['x','y','width','height']].values
            confs = np.ones([len(mapped_df),])
            clss =  np.zeros([len(mapped_df),])
            outputs = deepsort.update(xywhs, confs, clss, image)
            preds_df = pd.DataFrame(outputs, columns=['left','top','right','bottom','deepsort_cluster','class'])
            if len(preds_df) > 0:
                preds_df['left'] = preds_df['left'].astype(int)
                preds_df['top'] = preds_df['top'].astype(int)
                # TODO Fix this messy merge
                mapped_df = pd.merge_asof(mapped_df.sort_values(['left','top']),
                                  preds_df[['left','top','deepsort_cluster']] \
                                  .sort_values(['left','top']), on='left', suffixes=('','_deepsort'),
                                  direction='nearest')
            ds.append(mapped_df)
            if (plot and frame > cfg.DEEPSORT.N_INIT) or (frame in plot_frames) or save_all_to_dir:
                # plot_one_box draws in place and returns its input, so binding `im`
                # up front keeps it defined when mapped_df has no rows.
                im = image
                for _, row in mapped_df.iterrows():
                    bboxes = [row['left'], row['top'],
                              row['left'] + row['width'], row['top'] + row['height']]
                    label = f"{row['label']}"
                    im = plot_one_box(bboxes, image, label=label, color=[255,0,0], line_thickness=2)
                if 'x2im' in this_tracking.columns:
                    for _, row in this_tracking.iterrows():
                        x = row[['x2im', 'y2im']].values.astype(float)
                        if np.all(np.isfinite(x)):
                            col, line = int(x[0]), int(x[1])
                            # Clamp both slice ends at 0 so off-image positions do
                            # not wrap around to the opposite image edge.
                            im[max(line - 10, 0): max(line + 10, 0),
                               max(col - 10, 0): max(col + 10, 0), :] = 0
                if (frame in plot_frames or plot):
                    display(mapped_df)
                    fig, ax = plt.subplots(figsize=(15, 10))
                    ax.set_title(f'Deepsort labels: {frame}')
                    plt.imshow(im)
                    plt.show()
                if save_all_to_dir:
                    fname = os.path.join(out_dir , str(int(frame)).zfill(6))
                    tracking_cols = ['player','x','y','dir','o','s','a'] + (
                        ['x2im', 'y2im'] if 'x2im' in this_tracking.columns else [])
                    # Context manager instead of writer.save(), which was removed in pandas 2.0.
                    with pd.ExcelWriter(fname + '.xlsx', engine='xlsxwriter') as writer:
                        this_tracking.sort_values('player')[tracking_cols].to_excel(writer,sheet_name='Tracking')
                        d_df.sort_values('left').to_excel(writer,sheet_name='YoloV5')
                        mapped_df.sort_values('left').to_excel(writer,sheet_name='Final')
                        pd.Series(opt_params).to_excel(writer, sheet_name='MappingOptParams')
                    cv2.imwrite(fname +'.png', cv2.cvtColor(im, cv2.COLOR_RGB2BGR))

    dout = pd.concat(ds)
    return dout

In [None]:
# %matplotlib notebook
# cap = cv2.VideoCapture(f'{video_dir}57783_003374_Endzone.mp4')
# frame = 0
# imgs = []
# success, image = cap.read()
# a = MappingCheck(image, 0, helmets, tracking)

In [None]:
def add_deepsort_label_col(out):
    """Attach the most frequent label of each deepsort cluster to every row.

    Args:
        out: DataFrame with ``deepsort_cluster`` and ``label`` columns; modified
            in place.

    Returns:
        ``out`` with two added columns: ``label_deepsort`` (the most common
        non-null label of the row's cluster) and ``label_count_deepsort`` (how
        often that label occurs in the cluster). Rows whose cluster has no
        non-null label get NaN in both.
    """
    # Find the top occurring label for each deepsort_cluster.
    # The counts Series is renamed explicitly: pandas < 2.0 names it 'label',
    # pandas >= 2.0 names it 'count', and reset_index() needs a name that does
    # not clash with the 'label' index level.
    cum = out[~out['label'].isnull()].groupby('deepsort_cluster')['label'].value_counts() \
        .rename('label_count') \
        .sort_values(ascending=False) \
        .reset_index() \
        .groupby(['deepsort_cluster']) \
        .first()

    sortlabel_map = cum['label'].to_dict()
    # Find the # of times that label appears for the deepsort_cluster.
    sortlabelcount_map = cum['label_count'].to_dict()

    out['label_deepsort'] = out['deepsort_cluster'].map(sortlabel_map)
    out['label_count_deepsort'] = out['deepsort_cluster'].map(sortlabelcount_map)
    return out
def randomize_duplicates(group, frame_width=1280, frame_height=720):
    """Jitter the right/bottom edges of boxes that share identical coordinates.

    Intended to be used with `groupby(...).apply(...)`, where each `group`
    holds the rows that have the same box in the same frame. Groups with a
    single row are returned untouched. For larger groups a random integer
    offset in [-3, 2] is added to `right` and `bottom`, the edges are clamped
    to the frame, kept no smaller than `left`/`top`, and `width`/`height` are
    recomputed from the new edges.

    Parameters
    ----------
    group : pd.DataFrame
        Rows with the columns `video_frame`, `left`, `top`, `right`, `bottom`.
    frame_width : int, default 1280
        Upper bound applied to `right`.
    frame_height : int, default 720
        Upper bound applied to `bottom`.

    Returns
    -------
    pd.DataFrame
        `group` itself when it has at most one row, otherwise a modified copy.
    """
    if len(group) <= 1:
        return group

    # Work on a copy: mutating the object handed over by groupby.apply is not
    # supported by pandas and can yield surprising results.
    group = group.copy()
    print(f"Duplicates were found for frame {group['video_frame'].iloc[0]}, randomizing..")

    # One offset per row for `right` (column 0) and `bottom` (column 1).
    # Note that the offset may be 0, so a duplicate can survive the jitter.
    jitter = np.random.randint(-3, 3, (len(group), 2))
    group['right'] = (group['right'] + jitter[:, 0]).clip(0, frame_width)
    group['bottom'] = (group['bottom'] + jitter[:, 1]).clip(0, frame_height)

    # Never let the jitter flip a box inside out.
    group['right'] = group['right'].mask(group['right'] < group['left'], group['left'])
    group['bottom'] = group['bottom'].mask(group['bottom'] < group['top'], group['top'])

    # Height is bottom - top (the caller builds bottom as top + height);
    # the reverse subtraction would give non-positive heights.
    group['height'] = group['bottom'] - group['top']
    group['width'] = group['right'] - group['left']
    return group
def score_vs_deepsort(myvideo, out, labels):
    """Print one video's score before and after the deepsort relabelling.

    The ground truth rows of `labels` belonging to `myvideo + '.mp4'` are fed
    to an `NFLAssignmentScorer`. The predictions in `out` are scored twice:
    once with their `label` column, and once with `label_deepsort` taking the
    place of `label`. In both cases the predictions are first reduced to one
    row per (`video_frame`, `label`) pair. Nothing is returned; the two scores
    are printed side by side.
    """
    frame_label_keys = ['video_frame', 'label']

    def _one_row_per_frame_label(preds):
        # Collapse every (video_frame, label) group to a single row.
        return preds.groupby(frame_label_keys).first().reset_index()

    video_file = myvideo + '.mp4'
    scorer = NFLAssignmentScorer(labels[labels['video'] == video_file])

    # Score of the base predictions.
    score_before = scorer.score(_one_row_per_frame_label(out))

    # Score with the deepsort label substituted for the base label.
    relabelled = out.drop(columns='label').rename(columns={'label_deepsort': 'label'})
    score_after = scorer.score(_one_row_per_frame_label(relabelled))

    print(f'{score_before:0.5f} before --> {score_after:0.5f} deepsort')

In [None]:
%matplotlib inline
views = ['Sideline','Endzone']
if debug:
#     from matplotlib import rcParams
#     rcParams['figure.figsize'] = [20,20]
    save_all_to_dir = True
    try:
        import xlsxwriter
    except:
        save_all_to_dir = False
outs = []
for video in tqdm(helmets['video'].unique()):
    game_key, play_id, view = video.split('_')
    game_play = game_key + '_' + play_id
    view = view[:-len('.mp4')]
    game_key = int(game_key)
    play_id = int(play_id)
    tracking_group = tracking[(tracking['gameKey']==game_key)&
                              (tracking['playID']==play_id)]
    if debug:
        # Plot deepsort labels when in debug mode.
        out = deepsort_helmets(detector, tracking_group, game_play, video_dir,
                               save_all_to_dir=save_all_to_dir, plot_frames=[0, 5,10, 150, 250],
                              view=view)
    else:
        out = deepsort_helmets(detector, tracking_group, game_play, 
                               video_dir, save_all_to_dir=False, view=view)
    out = add_deepsort_label_col(out)
#     cols_to_drop = []
#     for col in out.columns:
#         if col != 'label':
#             if out[col].isnull().any():
#                 cols_to_drop.append(col)
#     out = out[[col for col in out_ds.columns if col not in cols_to_drop]].copy()
    out['right'] = out['left'] + out['width']
    out['bottom'] = out['top'] + out['height']
    out = out.groupby(['left','top', 'right', 'bottom', 'video_frame']).apply(
        randomize_duplicates)
    outs.append(out)
    if debug:
        t = tracking_group.iloc[0]
        if video in labels['video'].values:
            # Score
            score_vs_deepsort(video[:-len('.mp4')], out, labels)
submission_deepsort = pd.concat(outs).copy()

# Check Submission & Save
Finally we will create a submission file and check that it passes the submission requirements.
The steps are:
1. Drop the `label` column and replace it with the `label_deepsort` predictions (falling back to the original `label` where no deepsort label is available).
2. Remove any duplicate labels within a single video/frame. This is required to meet the submission requirements.
3. Save the results.

In [None]:
ss = pd.read_csv('../input/nfl-health-and-safety-helmet-assignment/sample_submission.csv')
box_cols = ['left', 'width', 'top', 'height']

# Final Checks
submission_deepsort = submission_deepsort.reset_index(drop=True)
# Prefer the deepsort label; fall back to the base label where it is missing.
submission_deepsort['label_deepsort'] = submission_deepsort['label_deepsort'].fillna(submission_deepsort['label'])
submission_deepsort = submission_deepsort[~submission_deepsort['label_deepsort'].isnull()]
# Keep only the columns of the sample submission, in the same order.
submission_deepsort = submission_deepsort.drop('label', axis=1) \
    .rename(columns={'label_deepsort':'label'})[ss.columns]
# Drop duplicate labels
submission_deepsort = submission_deepsort.loc[
    ~submission_deepsort[['video_frame','label']].duplicated()]
# Remove incomplete rows *before* the integer cast (NaN cannot be cast to int)
# and validate only afterwards, so that the frame that gets checked is exactly
# the frame that gets written.
submission_deepsort = submission_deepsort.dropna(axis=0) \
    .astype({col: int for col in box_cols})
check_submission(submission_deepsort)
submission_deepsort.to_csv('submission.csv', index=False)

# Display video showing predictions

Lastly, if we want to review our predictions, we can create a video of them using the `video_with_predictions` function from the `helmet_assignment` helper package.

In [None]:
if debug:
    # Recover the video file name from the '<gameKey>_<playID>_<view>_<frame>'
    # style `video_frame` id by keeping its first three '_'-separated parts.
    submission_deepsort['video'] = submission_deepsort['video_frame'].str.split('_').str[:3].str.join('_') + '.mp4'
    debug_videos = submission_deepsort['video'].unique()
    debug_labels = labels.query('video in @debug_videos')
    scorer = NFLAssignmentScorer(debug_labels)
    # Scoring must happen before `scorer.sub_labels` is read below; also report
    # the score instead of silently discarding it.
    debug_score = scorer.score(submission_deepsort)
    print(f'Debug score over {len(debug_videos)} video(s): {debug_score:0.5f}')

    frac = 0.60 # scaling factor for display
    for video in debug_videos:
        # Create video showing predictions for one of the videos.
        # Read from `video_dir`, the same directory the predictions were
        # generated from, rather than repeating the hard-coded train path.
        video_out = video_with_predictions(
            os.path.join(video_dir, video),
            scorer.sub_labels.fillna(0))

        display(Video(data=video_out,
                      embed=True,
                      height=int(720*frac),
                      width=int(1280*frac))
               )