In this competition, the player tracking data is likely to be important, so I am sharing a notebook that merges the label data with the tracking data.

ref: (Thanks [Rob](https://www.kaggle.com/robikscube) for getting-started-guide and kind discussion comment)
- https://www.kaggle.com/robikscube/nfl-helmet-assignment-getting-started-guide
- https://www.kaggle.com/c/nfl-health-and-safety-helmet-assignment/discussion/264361#1467283

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
from tqdm.auto import tqdm

# Prepare

In [None]:
# Directory that holds every competition CSV used below
BASE_DIR = '../input/nfl-health-and-safety-helmet-assignment'

# Ground-truth helmet labels and the sample submission file
labels = pd.read_csv(BASE_DIR + '/train_labels.csv')
ss = pd.read_csv(BASE_DIR + '/sample_submission.csv')

# Player tracking data (train / test)
tr_tracking = pd.read_csv(BASE_DIR + '/train_player_tracking.csv')
te_tracking = pd.read_csv(BASE_DIR + '/test_player_tracking.csv')

# Baseline helmet detections (train / test)
tr_helmets = pd.read_csv(BASE_DIR + '/train_baseline_helmets.csv')
te_helmets = pd.read_csv(BASE_DIR + '/test_baseline_helmets.csv')

# Labels for the extra still images
img_labels = pd.read_csv(BASE_DIR + '/image_labels.csv')

In [None]:
def add_track_features(tracks, fps=59.94, snap_frame=10):
    """
    Add column features helpful for syncing with video data.

    The input frame is copied first, so the caller's data is left untouched.

    Parameters
    ----------
    tracks : pandas.DataFrame
        Player tracking rows. The columns ``gameKey``, ``playID``, ``time``,
        ``event`` and ``player`` are read.
    fps : float, default 59.94
        Video frame rate used to convert seconds into frames.
    snap_frame : int, default 10
        Frame number assigned to the moment of the ``ball_snap`` event
        (an offset of zero seconds maps to this frame).

    Returns
    -------
    pandas.DataFrame
        Copy of ``tracks`` with ``time`` parsed to datetimes and these
        columns added: ``game_play``, ``snap``, ``isSnap``, ``team``,
        ``snap_offset`` (seconds relative to the snap) and ``est_frame``
        (estimated video frame, int64).

    Notes
    -----
    Every play is expected to contain a ``ball_snap`` event. Without one the
    snap time is missing, the offset is NaN and the integer cast of
    ``est_frame`` fails.
    """
    tracks = tracks.copy()
    # Play identifier: "<gameKey>_<playID zero-padded to 6 digits>".
    tracks["game_play"] = (
        tracks["gameKey"].astype("str")
        + "_"
        + tracks["playID"].astype("str").str.zfill(6)
    )
    tracks["time"] = pd.to_datetime(tracks["time"])
    # First timestamp tagged "ball_snap" within each play.
    snap_dict = (
        tracks.query('event == "ball_snap"')
        .groupby("game_play")["time"]
        .first()
        .to_dict()
    )
    tracks["snap"] = tracks["game_play"].map(snap_dict)
    tracks["isSnap"] = tracks["snap"] == tracks["time"]
    # The first character of the player id encodes the side (H / V).
    tracks["team"] = tracks["player"].str[0].replace("H", "Home").replace("V", "Away")
    # total_seconds() returns float seconds on every pandas version.
    # astype("timedelta64[ms]") only produced floats before pandas 2.0; from
    # 2.0 on it keeps a timedelta dtype and the frame arithmetic below raises.
    tracks["snap_offset"] = (tracks["time"] - tracks["snap"]).dt.total_seconds()
    # Estimated video frame. int64 is requested explicitly because plain
    # "int" is 32-bit on some platforms, which would not match an int64
    # frame column when the two are used as merge keys.
    tracks["est_frame"] = (
        ((tracks["snap_offset"] * fps) + snap_frame).round().astype("int64")
    )
    return tracks


# Enrich both tracking tables (train first, then test) with the sync columns.
tr_tracking, te_tracking = (
    add_track_features(tr_tracking),
    add_track_features(te_tracking),
)


In [None]:
# Preview the first rows of the training labels.
labels.head()

In [None]:
# Preview three training tracking rows, including the added sync columns.
tr_tracking.head(3)

## Merge Process

There are several ways to merge the two tables; for example, which one (labels or tracking) should be the left side of a left join?

In this notebook, the label data is treated as the left side of the join.

The merge also follows the constraints below.

- skip sideline player (H00, V00)
- nearest time join (because tracking and label sampling frequency is different)


This notebook only merges the labels with the tracking data.
Since the test dataset has no label data, other data has to be used in place of the labels there, but I think the same strategy can be applied.

In [None]:
def merge_label_and_tracking(tracking_df, label_df):
    """
    Attach the nearest tracking sample to every helmet label (labels on the left).

    Labels are processed per (gameKey, playID, view, label) group. Each group
    is joined to the tracking rows of the same game, play and player by
    matching the label ``frame`` to the closest tracking ``est_frame``.

    Parameters
    ----------
    tracking_df : pandas.DataFrame
        Tracking rows with at least ``gameKey``, ``playID``, ``player`` and
        ``est_frame``.
    label_df : pandas.DataFrame
        Label rows with at least ``gameKey``, ``playID``, ``view``, ``label``,
        ``frame`` and ``video_frame``.

    Returns
    -------
    pandas.DataFrame
        Label columns followed by the tracking columns (the tracking key
        columns ``gameKey``, ``playID`` and ``player`` are not repeated),
        sorted by ``video_frame`` and ``label`` with a fresh index.

        * Labels ``H00`` / ``V00`` (sideline players) are dropped.
        * Labels whose player has no tracking rows are kept, with missing
          values in the tracking columns, as in a left join.
        * If nothing is left to merge, an empty frame with the same columns
          is returned.
    """
    key_cols = ["gameKey", "playID", "player"]
    sideline_labels = ("H00", "V00")

    # merge_asof requires the right-hand key to be sorted. Sort once with a
    # stable algorithm so rows sharing an est_frame keep their input order.
    ordered_tracking = tracking_df.sort_values("est_frame", kind="mergesort")

    # Split the tracking rows per (game, play, player) once. A dict lookup
    # avoids repeated scans of an unsorted MultiIndex and always yields a
    # DataFrame, even when a player has a single tracking row.
    tracking_by_player = {
        key: group.drop(columns=key_cols)
        for key, group in ordered_tracking.groupby(key_cols, sort=False)
    }

    df_list = []

    for key, _label_df in tqdm(label_df.groupby(["gameKey", "playID", "view", "label"])):
        game_key, play_id, _view, player = key

        # Sideline players are skipped entirely.
        if player in sideline_labels:
            continue

        tracking_data = tracking_by_player.get((game_key, play_id, player))
        if tracking_data is None:
            # No tracking rows for this player: keep the labels unmatched so
            # the result still behaves like a left join on the label table.
            df_list.append(_label_df)
            continue

        # Nearest-frame join: labels and tracking are sampled at different rates.
        merged_df = pd.merge_asof(
            _label_df.sort_values("frame"),
            tracking_data,
            left_on="frame",
            right_on="est_frame",
            direction="nearest",
        )
        df_list.append(merged_df)

    if df_list:
        all_merged_df = pd.concat(df_list)
    else:
        # pd.concat([]) raises, so build the empty result explicitly.
        tracking_cols = [col for col in tracking_df.columns if col not in key_cols]
        all_merged_df = pd.DataFrame(columns=[*label_df.columns, *tracking_cols])

    all_merged_df = all_merged_df.sort_values(["video_frame", "label"], ignore_index=True)

    return all_merged_df

In [None]:
# Join every non-sideline training label to its nearest tracking sample.
merged_df = merge_label_and_tracking(tr_tracking, labels)

In [None]:
# Preview the merged table.
merged_df.head()

In [None]:
# Fraction of missing values in each column of the merged table.
merged_df.isna().mean()

In [None]:
# Save the merged table as CSV, without the row index.
merged_df.to_csv("train_label_tracking_merged.csv", index=False)