# Camera-Tracking Matching with Gradient Descent

To understand how this notebook was developed, see the detailed explanation at https://www.kaggle.com/coldfir3/camera-tracking-matching-with-gradient-descent

In [None]:
import pandas as pd
from PIL import Image, ImageDraw
from pathlib import Path
import os
import numpy as np
import matplotlib.pyplot as plt
import cv2 as cv
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

In [None]:
# Fast-run switch: True when the baseline helmets CSV has exactly 72386 rows.
# NOTE(review): 72386 is presumably the row count of the public test file, which
# would make this True for interactive runs and False on the hidden test set — confirm.
fast_sub = (len(pd.read_csv('../input/nfl-health-and-safety-helmet-assignment/test_baseline_helmets.csv')) == 72386)
# Bare expression so the notebook cell displays the flag.
fast_sub

In [None]:
# code from: https://www.kaggle.com/robikscube/nfl-helmet-assignment-getting-started-guide
def add_track_features(tracks, fps=59.94, snap_frame=10):
    """
    Add column features helpful for syncing with video data.

    The input frame is copied, so the caller's DataFrame is left untouched.

    Args:
        tracks: Player-tracking DataFrame with at least the columns
            ``gameKey``, ``playID``, ``time``, ``event`` and ``player``.
        fps: Video frame rate used to convert seconds into frame numbers.
        snap_frame: Video frame number assigned to the ball-snap instant.

    Returns:
        A copy of ``tracks`` with these columns added (``time`` is converted
        to datetime):
        ``game_play``   - "<gameKey>_<playID zero-padded to 6 digits>".
        ``snap``        - time of the first ``ball_snap`` event of the play.
        ``isSnap``      - True on rows whose time equals the snap time.
        ``team``        - "Home"/"Away" derived from the first letter of ``player``.
        ``snap_offset`` - seconds elapsed since the snap (negative before it).
        ``est_frame``   - estimated video frame for the row.

    Raises:
        ValueError: from the final integer cast when a play has no
            ``ball_snap`` event, because its offsets are NaN.
    """
    tracks = tracks.copy()
    tracks["game_play"] = (
        tracks["gameKey"].astype("str")
        + "_"
        + tracks["playID"].astype("str").str.zfill(6)
    )
    tracks["time"] = pd.to_datetime(tracks["time"])
    snap_dict = (
        tracks.query('event == "ball_snap"')
        .groupby("game_play")["time"]
        .first()
        .to_dict()
    )
    tracks["snap"] = tracks["game_play"].map(snap_dict)
    tracks["isSnap"] = tracks["snap"] == tracks["time"]
    tracks["team"] = tracks["player"].str[0].replace("H", "Home").replace("V", "Away")
    # Use total_seconds() rather than astype("timedelta64[ms]") / 1_000: since
    # pandas 2.0 that cast keeps a timedelta dtype instead of returning float
    # milliseconds, which breaks the frame arithmetic below.
    tracks["snap_offset"] = (tracks["time"] - tracks["snap"]).dt.total_seconds()
    # Estimated video frame
    tracks["est_frame"] = (
        ((tracks["snap_offset"] * fps) + snap_frame).round().astype("int")
    )
    return tracks

def add_video_features(videos):
    """
    Add play/camera/frame identifiers and box centres to a helmet-box frame.

    Works on a copy so the caller's DataFrame is not modified (matching
    ``add_track_features``).

    Args:
        videos: DataFrame with a ``video_frame`` column formatted as
            "<game>_<play>_<camera>_<frame>" plus the box columns ``left``,
            ``width``, ``top`` and ``height``.

    Returns:
        A copy of ``videos`` with these columns added:
        ``game_play`` - the first two ``_``-separated fields joined by "_".
        ``camera``    - the third field.
        ``frame``     - the last field, kept as a string.
        ``xc``, ``yc`` - box centre, truncated to int.
    """
    videos = videos.copy()
    # Split each identifier once instead of once per derived column.
    parts = [video_frame.split('_') for video_frame in videos['video_frame']]
    videos['game_play'] = ['_'.join(p[:2]) for p in parts]
    videos['camera'] = [p[2] for p in parts]
    videos['frame'] = [p[-1] for p in parts]
    videos['xc'] = (videos['left'] + videos['width']/2).astype(int).values
    videos['yc'] = (videos['top'] + videos['height']/2).astype(int).values
    return videos

# TODO, add interpolation of tracking_df and replace nearest
class get_keypoints():
    """Callable that collects helmet and tracking keypoints for one video frame.

    Instances hold two DataFrames:
    ``video_df``    - helmet boxes with ``game_play``, ``camera``, ``frame``
                      (string), ``conf``, ``xc`` and ``yc`` columns.
    ``tracking_df`` - player tracking with ``game_play``, ``est_frame``,
                      ``x``, ``y`` and ``player`` columns, restricted to
                      ``est_frame > 0``.
    """

    def __init__(self, video_df = None, track_df = None):
        """Store the helmet and tracking tables, loading the test CSVs by default.

        Args:
            video_df: Helmet-box DataFrame. When None the baseline helmets CSV
                is read. If the derived columns are missing they are added
                with ``add_video_features``.
            track_df: Player-tracking DataFrame. When None the test tracking
                CSV is read. If the derived columns are missing they are added
                with ``add_track_features``.
        """
        if video_df is None:
            video_df = pd.read_csv('../input/nfl-health-and-safety-helmet-assignment/test_baseline_helmets.csv')
        # Caller-supplied frames used to be ignored, leaving the attribute unset.
        if not {'game_play', 'camera', 'frame', 'xc', 'yc'}.issubset(video_df.columns):
            video_df = add_video_features(video_df)
        self.video_df = video_df

        if track_df is None:
            track_df = pd.read_csv('../input/nfl-health-and-safety-helmet-assignment/test_player_tracking.csv')
        if not {'game_play', 'est_frame'}.issubset(track_df.columns):
            track_df = add_track_features(track_df)
        self.tracking_df = track_df.query("est_frame > 0")

    def __call__(self, game_play, frame, min_conf = 0.6, topk = 22, normalized = True, debug = False):
        """Return the keypoints of both camera views and the tracking data.

        Args:
            game_play: Play identifier matched against the ``game_play`` columns.
            frame: Integer video frame number.
            min_conf: Detections with ``conf`` at or below this are discarded.
            topk: Maximum number of detections kept per camera (highest ``conf``).
            normalized: When True, min-max scale each point set to [0, 1] per axis.
            debug: When True, print the raw keypoints before normalisation.

        Returns:
            dict with keys ``Sideline``, ``Endzone`` and ``Tracking`` (arrays of
            x/y points), ``Players`` (player ids in the same order as
            ``Tracking``) and ``BBoxes`` (the selected detection rows per
            camera). The result is also stored on ``self.keypoints``.

        Raises:
            ValueError: if the tracking table has no rows for ``game_play``.
        """
        detections = {}
        for camera in ('Sideline', 'Endzone'):
            detections[camera] = self.video_df.query(
                f"game_play == '{game_play}' and frame == '{frame}' "
                f"and camera == '{camera}' and conf > {min_conf}"
            ).nlargest(topk, 'conf')

        # Explicit copies: the Sideline array is modified in place further down
        # and `.values` may hand back a read-only view.
        keypoints = {
            camera: rows[['xc', 'yc']].to_numpy(copy=True)
            for camera, rows in detections.items()
        }

        play_tracks = self.tracking_df.query(f"game_play == '{game_play}'")
        frames = play_tracks['est_frame'].unique()
        if len(frames) == 0:
            raise ValueError(f"no tracking data for game_play '{game_play}'")
        if frame not in frames:
            # Tracking is sampled more coarsely than video: use the nearest frame.
            frame = frames[np.absolute(frames - frame).argmin()]
        frame_tracks = play_tracks[play_tracks['est_frame'] == frame]
        keypoints['Tracking'] = frame_tracks[['x', 'y']].values

        if debug: print(keypoints)
        if normalized:
            for k, v in keypoints.items():
                if len(v) > 0:
                    low = v.min(axis = 0)
                    span = v.max(axis = 0) - low
                    # A single point (or a degenerate axis) has zero range;
                    # map it to 0 instead of dividing by zero.
                    span = np.where(span == 0, 1, span)
                    keypoints[k] = (v - low) / span

        # Flip the Sideline y axis (applied whether or not points were normalised).
        keypoints['Sideline'][:,1] = 1-keypoints['Sideline'][:,1]

        keypoints['Players'] = frame_tracks['player'].values

        keypoints['BBoxes'] = detections

        self.keypoints = keypoints

        return keypoints

    def plot(self, add_no = False):
        """Scatter-plot the keypoints from the most recent call.

        Args:
            add_no: When True, annotate the Tracking and Sideline points with
                their row index.
        """
        if not hasattr(self, 'keypoints'):
            print('you must run the function first...')
            return
        kp = self.keypoints
        plt.figure(figsize=(12, 6))
        plt.scatter(kp['Endzone'][:,0], kp['Endzone'][:,1], marker = 'x', color = 'red')
        plt.scatter(kp['Sideline'][:,0], kp['Sideline'][:,1], marker = '^', color = 'red')
        plt.scatter(kp['Tracking'][:,0], kp['Tracking'][:,1], marker = 'o', color = 'green')
        if add_no:
            for name in ('Tracking', 'Sideline'):
                for i, (x, y) in enumerate(kp[name]):
                    plt.annotate(i, (x, y))
    
# Module-level keypoint extractor used by end2end_prediction; called with no
# arguments, so the constructor reads the test CSVs from ../input.
get_kp = get_keypoints()

# Keypoint matching using PyTorch

In [None]:
import torch

In [None]:
def min_mse(preds, targets):
    """Root-mean-square distance from each prediction to its nearest target.

    Both inputs are squeezed on dim 2 before the pairwise distances are
    computed; the minimum is taken over the targets for every prediction.
    Returns a scalar tensor.
    """
    pairwise = torch.cdist(preds.squeeze(2), targets.squeeze(2))
    nearest, _ = pairwise.min(dim = 1)
    return nearest.pow(2).mean().sqrt()

In [None]:
def step(src, trg, m, lr = 3e-3, prt = True):
    """Apply one gradient-descent update to the transform matrix ``m`` in place.

    Args:
        src: Source points, left-multiplied by ``m``.
        trg: Target points the transformed sources are compared against.
        m: Transform matrix with ``requires_grad`` enabled; updated in place.
        lr: Learning rate.
        prt: When True, print the current loss.
    """
    projected = m @ src                 # homography transform
    loss = min_mse(projected, trg)      # distance to the closest target point
    if prt:
        print(f'loss: {(loss.item()):.5f}')
    loss.backward()
    # Plain SGD update performed outside the autograd graph.
    with torch.no_grad():
        m -= lr * m.grad
    # Drop the gradient so the next backward pass starts fresh.
    m.grad = None

In [None]:
def fit_predict(src, trg, init_rot = 0, init_scale = (1, 1, 1), lr = 3e-3, n_steps = 1000, verbose = True):
    """Fit a 3x3 transform mapping ``src`` onto ``trg`` and return the mapped points.

    The matrix starts as ``scale @ rotation`` and is refined with ``n_steps``
    calls to ``step``.

    Args:
        src: Source points in homogeneous form; must be double precision to
            match the transform matrix.
        trg: Target points in the same layout as ``src``.
        init_rot: Initial rotation in degrees.
        init_scale: Three diagonal entries of the initial scale matrix.
        lr: Learning rate passed to ``step``.
        n_steps: Number of gradient-descent steps.
        verbose: When True, print the loss about ten times during the fit and
            plot source, target and result points.

    Returns:
        ``m @ src`` computed with the fitted matrix, detached from autograd.
    """
    t = np.pi * init_rot / 180
    m_rot = torch.tensor([[np.cos(t),-np.sin(t), 0],
                          [np.sin(t), np.cos(t), 0],
                          [        0,         0, 1]], dtype = torch.double)
    # Each axis gets its own scale entry (index 0 used to be repeated three times).
    m_scale = torch.tensor([[init_scale[0], 0, 0],
                            [0, init_scale[1], 0],
                            [0, 0, init_scale[2]]], dtype = torch.double)
    m = m_scale @ m_rot
    m.requires_grad_()

    # Guard against n_steps < 10, where n_steps // 10 would be a zero modulus.
    log_every = max(1, n_steps // 10)
    for i in range(n_steps):
        step(src, trg, m, lr=lr, prt=bool(verbose) and i % log_every == 0)

    with torch.no_grad():
        tfm = torch.matmul(m, src)

    if verbose:
        plt.scatter(src[:,0], src[:,1], marker = 'o', color = 'red', label = 'source')
        plt.scatter(trg[:,0], trg[:,1], marker = '^', color = 'green', label = 'target')
        plt.scatter(tfm[:,0], tfm[:,1], marker = 'o', color = 'blue', label = 'result')
        plt.legend();

    return tfm

In [None]:
def matching(tfm, trg, players):
    """Greedily assign one player label to each target (helmet) point.

    Players are processed in order of their distance to the closest target,
    nearest first; each takes the closest target that is still free.

    Args:
        tfm: Transformed player points, indexed as ``tfm[:, :2, 0]`` for x/y.
        trg: Target points in the same layout.
        players: Array of player ids, one per row of ``tfm``.

    Returns:
        Object array with one entry per row of ``trg``: the id of the player
        assigned to that target, or None when there are fewer players than
        targets and the target stays unmatched.
    """
    players = np.asarray(players)
    n_players, n_targets = tfm.shape[0], trg.shape[0]
    labels = np.full(n_targets, None, dtype = object)
    if n_players == 0 or n_targets == 0:
        return labels

    d = torch.cdist(tfm[:,:2,0], trg[:,:2,0]).detach().clone()

    greedy_order = d.min(dim = 1).values.argsort()
    for ix in greedy_order.tolist():
        iy = d[ix].argmin().item()
        labels[iy] = players[ix]
        # Mark the target as taken; float('inf') replaces np.Inf, which was
        # removed in NumPy 2.0.
        d[:,iy] = float('inf')
        if torch.isinf(d).all():
            break

    return labels

In [None]:
def end2end_prediction(video_frame):
    """Label the helmet boxes of one video frame with player ids.

    Args:
        video_frame: Identifier formatted as "<game>_<play>_<camera>_<frame>".

    Returns:
        DataFrame with the columns ``video_frame``, ``left``, ``width``,
        ``top``, ``height`` of the selected boxes plus a ``label`` column.
    """
    game, play, camera, frame_str = video_frame.split('_')
    keypoints = get_kp(f'{game}_{play}', int(frame_str))

    # Append a column of ones (homogeneous coordinates) and a trailing axis.
    track_pts = keypoints['Tracking']
    track_ones = torch.ones(len(track_pts)).unsqueeze(1)
    src = torch.cat([torch.tensor(track_pts), track_ones], axis = -1).unsqueeze(2)

    helmet_pts = keypoints[camera]
    helmet_ones = torch.ones(len(helmet_pts)).unsqueeze(1)
    trg = torch.cat([torch.tensor(helmet_pts), helmet_ones], axis = -1).unsqueeze(2)

    tfm = fit_predict(src, trg, verbose = False)
    labels = matching(tfm, trg, keypoints['Players'])

    box_columns = ['video_frame', 'left', 'width', 'top', 'height']
    pred = keypoints["BBoxes"][camera][box_columns].copy()
    pred['label'] = labels
    return pred

In [None]:
def check_submission(sub):
    """Validate a submission frame, printing the first rule that is violated.

    Rules, checked in order:
      1. at most 22 labelled boxes per ``video_frame``;
      2. no repeated (``video_frame``, ``label``) pair;
      3. no repeated box (``left``, ``width``, ``top``, ``height``) within a
         ``video_frame``.

    Returns:
        True when every rule holds, otherwise False.
    """
    box_key = ["video_frame", "left", "width", "top", "height"]
    # Each check is a thunk so later rules are only evaluated if earlier ones pass.
    rules = (
        (
            "Has more than 22 boxes in a single frame",
            lambda: sub.groupby(["video_frame"])["label"].count().max() > 22,
        ),
        (
            "Has duplicate labels",
            lambda: sub.duplicated(subset=["video_frame", "label"]).any(),
        ),
        (
            "Has duplicate boxes",
            lambda: sub.duplicated(subset=box_key).any(),
        ),
    )
    for message, violated in rules:
        if violated():
            print(message)
            return False
    return True

In [None]:
# Smoke test: in fast mode, run the full pipeline on one frame and print the labelled boxes.
if fast_sub:
    print(end2end_prediction('57906_000718_Sideline_1'))

## Get predictions

In [None]:
# sample_sub = pd.read_csv('../input/nfl-health-and-safety-helmet-assignment/sample_submission.csv')
# Every distinct video_frame id in the baseline helmets file, in order of first appearance.
video_frames = pd.read_csv('../input/nfl-health-and-safety-helmet-assignment/test_baseline_helmets.csv')['video_frame'].unique()
# In fast mode only the first 16 frames are processed.
if fast_sub:
    video_frames = video_frames[:16]

In [None]:
# preds = []
# for video_frame in tqdm(video_frames):
#     preds.append(end2end_prediction(video_frame))

In [None]:
# Run end2end_prediction for every frame across 4 joblib workers; tqdm wraps the
# input iterable, so the bar advances as frames are handed to the workers.
preds = Parallel(n_jobs=4)(delayed(end2end_prediction)(x) for x in tqdm(video_frames))

In [None]:
# Stack the per-frame predictions, keep the first row of any repeated
# (video_frame, label) pair, and renumber the index.
submission = pd.concat(preds)
submission = submission.drop_duplicates(subset=['video_frame', 'label'])
submission = submission.reset_index(drop = True)

# Only write the file when the submission passes validation.
sub_ok = check_submission(submission)
if not sub_ok:
    print('Submission FAILED')
else:
    print('Submission passed, saving it now to submission.csv file...')
    submission.to_csv('submission.csv', index = False)