## Libraries

In [None]:
import datetime as dt
import os
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as torchdata
import seaborn as sns

from pathlib import Path
from typing import Optional

from catalyst.core import Callback, CallbackOrder, IRunner
from catalyst.dl import SupervisedRunner
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, recall_score, precision_score, confusion_matrix
from sklearn.model_selection import GroupKFold
from sklearn.preprocessing import StandardScaler
from tqdm.notebook import tqdm


# Silence library warnings so the notebook output stays readable.
warnings.simplefilter("ignore")

# Use the fully qualified option key: the bare "max_columns" pattern is
# ambiguous on pandas versions that also define "styler.render.max_columns",
# where set_option raises OptionError for it.
pd.set_option("display.max_columns", 100)

## Data Loading

In [None]:
# Read the annotated per-player tracking table and preview its first rows.
TRACKING_CSV = "../input/nfl-impact-tracking-animation/train_player_tracking_annotated.csv"
tracking = pd.read_csv(TRACKING_CSV)
tracking.head()

In [None]:
# List the distinct values found in the "event" column.
tracking["event"].unique()

In [None]:
# Rows without an event get the empty string so they can be encoded as well.
tracking["event"] = tracking["event"].fillna("")

# Event vocabulary; the position of a label in this list is its integer id.
event_labels = [
    "",
    "ball_snap",
    "handoff",
    "tackle",
    "first_contact",
    "out_of_bounds",
    "play_action",
    "pass_forward",
    "pass_arrived",
    "pass_outcome_caught",
    "run",
    "touchdown",
    "penalty_flag",
]
events_to_id = {label: code for code, label in enumerate(event_labels)}

In [None]:
# Encode every event label as its integer id.
event_ids = tracking["event"].map(events_to_id)

# Series.map yields NaN for labels that are missing from the dict. Fail here
# with a clear message instead of letting NaN ids reach the model, which casts
# this column with .long() and feeds it to an embedding layer.
unmapped_events = tracking.loc[event_ids.isna(), "event"].unique()
if len(unmapped_events) > 0:
    raise ValueError(f"Events missing from events_to_id: {list(unmapped_events)}")

tracking["event_id"] = event_ids
tracking

## Feature Engineering

In [None]:
def prepare_relative_distances(df: pd.DataFrame):
    """Compute distance and orientation gaps between every pair of rows.

    Each row of ``df`` is compared with every *other* row of ``df`` (the
    caller passes one group at a time). For row ``i`` the ``k``-th output
    value refers to the ``k``-th other row in ``df`` order, i.e. row ``i``
    itself is skipped.

    Args:
        df: Frame with the columns ``index``, ``x``, ``y`` and ``o``.

    Returns:
        A frame with ``len(df)`` rows: the ``index`` column followed by
        ``dist0..dist{n-2}`` (Euclidean distance from ``x``/``y``) and
        ``od0..od{n-2}`` (absolute difference of ``o``), with ``n = len(df)``.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    n_rows = len(df)
    if n_rows == 0:
        raise ValueError("prepare_relative_distances needs at least one row")

    x = df["x"].values
    y = df["y"].values
    o = df["o"].values

    # Entry [i, j] compares row j against row i; broadcasting builds the
    # full n x n tables without a Python-level loop.
    dx = x[np.newaxis, :] - x[:, np.newaxis]
    dy = y[np.newaxis, :] - y[:, np.newaxis]
    pairwise_distance = np.sqrt(dx ** 2 + dy ** 2)
    # NOTE(review): if `o` is an angle in degrees this gap is not wrapped
    # (350 vs 10 gives 340, not 20) - confirm whether that is intended.
    pairwise_orientation_gap = np.abs(o[np.newaxis, :] - o[:, np.newaxis])

    # Drop the diagonal (a row compared with itself). Boolean selection walks
    # the matrix row by row, so each output row keeps the others in df order.
    off_diagonal = ~np.eye(n_rows, dtype=bool)
    distances_matrix = pairwise_distance[off_diagonal].reshape(n_rows, n_rows - 1)
    orientation_diff_matrix = pairwise_orientation_gap[off_diagonal].reshape(n_rows, n_rows - 1)

    n_others = n_rows - 1
    relative_distance_df = pd.concat([
        df[["index"]].reset_index(drop=True),
        pd.DataFrame(distances_matrix, columns=[f"dist{i}" for i in range(n_others)]),
        pd.DataFrame(orientation_diff_matrix, columns=[f"od{i}" for i in range(n_others)])
    ], axis=1)
    return relative_distance_df

In [None]:
# One pairwise-feature frame per (gameKey, playID, time) group, stacked into
# a single table with a fresh row index.
snapshot_groups = tracking.groupby(["gameKey", "playID", "time"])
relative_distances = [
    prepare_relative_distances(snapshot_df)
    for _, snapshot_df in tqdm(snapshot_groups)
]

relative_distances_df = pd.concat(relative_distances, axis=0).reset_index(drop=True)

In [None]:
# Preview the first rows of the pairwise distance / orientation table.
relative_distances_df.head()

In [None]:
# Attach the pairwise features to the tracking rows, matching on "index" and
# keeping every tracking row (left join).
tracking = pd.merge(tracking, relative_distances_df, how="left", on="index")

In [None]:
def prepare_relative_speed_and_acceleration(df: pd.DataFrame, n_neighbors: int = 21):
    """Row-to-row differences of the neighbour distance/orientation columns.

    Differences are taken between consecutive rows of ``df`` in the order
    given, so the caller is expected to pass the rows of one group at a time.
    They are plain first differences (not divided by a time step); the first
    row, which has no predecessor, and any other NaN are filled with 0.

    Args:
        df: Frame with ``index``, ``dist0..dist{n_neighbors-1}`` and
            ``od0..od{n_neighbors-1}`` columns.
        n_neighbors: Number of neighbour columns per feature family. The
            default of 21 matches the previously hard-coded value.

    Returns:
        A frame with ``len(df)`` rows: ``index`` followed by
        ``speed{i}`` (difference of ``dist{i}``), ``acc{i}`` (difference of
        ``speed{i}``) and ``od_diff{i}`` (difference of ``od{i}``).
    """
    neighbor_ids = range(n_neighbors)
    distance_columns = [f"dist{i}" for i in neighbor_ids]
    od_columns = [f"od{i}" for i in neighbor_ids]

    def _first_difference(frame: pd.DataFrame, prefix: str) -> pd.DataFrame:
        # Difference between consecutive rows, zero-filled, with a fresh row
        # index and columns renamed to `<prefix><i>`.
        delta = frame.diff().fillna(0).reset_index(drop=True)
        delta.columns = [f"{prefix}{i}" for i in neighbor_ids]
        return delta

    diff_df = _first_difference(df[distance_columns], "speed")
    # Acceleration is the difference of the already-differenced distances.
    acceleration_df = _first_difference(diff_df, "acc")
    od_diff_df = _first_difference(df[od_columns], "od_diff")

    relative_speed_and_acceleration = pd.concat([
        df[["index"]].reset_index(drop=True),
        diff_df,
        acceleration_df,
        od_diff_df
    ], axis=1)
    return relative_speed_and_acceleration

In [None]:
# One difference-feature frame per (gameKey, playID, player) group, stacked
# into a single table with a fresh row index.
player_groups = tracking.groupby(["gameKey", "playID", "player"])
relative_speeds = [
    prepare_relative_speed_and_acceleration(player_df)
    for _, player_df in tqdm(player_groups)
]

relative_speed_df = pd.concat(relative_speeds, axis=0).reset_index(drop=True)

In [None]:
# Attach the speed / acceleration / orientation-change features to the
# tracking rows, matching on "index" and keeping every tracking row.
tracking = pd.merge(tracking, relative_speed_df, how="left", on="index")

In [None]:
# Column names of each per-neighbour feature family, 21 columns per family.
feature_families = {
    prefix: [f"{prefix}{i}" for i in range(21)]
    for prefix in ("dist", "speed", "acc", "od", "od_diff")
}
distance_columns = feature_families["dist"]
speed_columns = feature_families["speed"]
acceleration_columns = feature_families["acc"]
od_columns = feature_families["od"]
od_diff_columns = feature_families["od_diff"]
tracking[distance_columns]

In [None]:
# Smallest of the neighbour distances in each row.
tracking["relative_distance"] = tracking[distance_columns].min(axis=1)

# Per row: the position, within `distance_columns`, of the column holding that
# minimum. reset_index() keeps the row labels in "index" and the positions in
# the unnamed column 0.
min_value_indices = tracking[distance_columns].idxmin(axis=1).map(
    lambda x: distance_columns.index(x)).reset_index()

speed_values = tracking[speed_columns].values
acceleration_values = tracking[acceleration_columns].values
od_values = tracking[od_columns].values
od_diff_values = tracking[od_diff_columns].values

# Take, for every row, the feature of the neighbour at the minimum-distance
# position. One fancy-indexing lookup per feature replaces a row-wise
# DataFrame.apply and addresses rows by position, so it no longer relies on
# the row labels doubling as array offsets.
row_positions = np.arange(len(tracking))
nearest_positions = min_value_indices[0].values

tracking["relative_speed"] = speed_values[row_positions, nearest_positions]
tracking["relative_acceleration"] = acceleration_values[row_positions, nearest_positions]
tracking["relative_orientation_difference"] = od_values[row_positions, nearest_positions]
tracking["relative_orientation_difference_diff"] = od_diff_values[row_positions, nearest_positions]

In [None]:
# Join the per-row nearest-neighbour table onto `tracking`: the left side is
# matched on its row index, the right side on its "index" column.
# NOTE(review): both frames appear to carry a column named "index" (`tracking`
# is merged on one in earlier cells) - confirm which "index" column survives
# this merge. The only new information added is the unnamed column 0, which no
# later visible cell reads.
tracking = tracking.merge(min_value_indices, left_index=True, right_on="index")

In [None]:
# Display the current `tracking` frame.
tracking

In [None]:
# Model table: identifiers, raw tracking signals, nearest-neighbour features,
# the encoded event and the `impact` target.
# .copy() makes X an independent frame, so the column assignments made on X
# in later cells do not operate on a slice of `tracking`.
X = tracking[[
    "index", "gameKey", "playID", "player",
    "x", "y", "s", "a", "dis", "o", "dir",
    "relative_distance", "relative_speed", "relative_acceleration",
    "relative_orientation_difference", "relative_orientation_difference_diff",
    "event_id",
    "impact"
]].copy()
X.head()

In [None]:
# Standardise the continuous features; the scaler is fit on all rows of X.
# NOTE(review): the scaler is fit before the cross-validation split, so
# validation rows contribute to the scaling statistics - confirm this is
# acceptable.
ss = StandardScaler()
# Continuous inputs that get scaled.
columns = ["x", "y", "s", "a", "dis", "o", "dir", "relative_distance", "relative_speed", "relative_acceleration",
           "relative_orientation_difference", "relative_orientation_difference_diff"]
# Listed separately from `columns`, so "event_id" is not scaled.
categoricals = ["event_id"]
transformed = ss.fit_transform(X[columns])
# Overwrite the raw values with their standardised counterparts.
X[columns] = transformed

In [None]:
# Preview X after scaling.
X.head()

## Datasets

In [None]:
class LSTMDataset(torchdata.Dataset):
    """Fixed-length sequences, one per (gameKey, playID, player) group.

    ``X`` is split into groups once at construction time. Each item is one
    group whose rows are zero-padded to ``maxlen``; ``mask`` marks the
    positions that hold real rows.
    """

    def __init__(self, X: pd.DataFrame, columns, maxlen=103):
        self.columns = columns
        self.maxlen = maxlen
        # Materialise the (key, frame) pairs so items can be fetched by position.
        self.X_list = list(X.groupby(["gameKey", "playID", "player"]))

    def __len__(self):
        return len(self.X_list)

    def __getitem__(self, idx: int):
        group_key, frame = self.X_list[idx]
        game_key, play_id, player = group_key
        n_steps = len(frame)

        # Target sequence, padded with zeros.
        targets = np.zeros(self.maxlen, dtype=np.float32)
        targets[:n_steps] = frame["impact"].values.reshape(-1).astype(np.float32)

        # Feature matrix of shape (maxlen, len(columns)), padded with zero rows.
        raw_features = frame[self.columns].values.astype(np.float32)
        features = np.zeros((self.maxlen, raw_features.shape[1]), dtype=np.float32)
        features[:n_steps, :] = raw_features

        # True for real rows, False for padding.
        mask = np.arange(self.maxlen) < n_steps

        # Values of the "index" column, so outputs can be mapped back to rows.
        row_ids = np.zeros(self.maxlen, dtype=int)
        row_ids[:n_steps] = frame["index"].values.reshape(-1)

        return {
            "game_key": game_key,
            "play_id": play_id,
            "player": player,
            "index": row_ids,
            "mask": mask,
            "targets": targets,
            "x": features
        }

## Model

In [None]:
class BiLSTMModel(nn.Module):
    """Bidirectional LSTM that emits one sigmoid score per sequence step.

    The last feature of every step is treated as a categorical id and replaced
    by a learned embedding; the remaining features are used as they are.
    """

    def __init__(self, input_size: int, hidden_size: int, num_layers: int, n_categories: int, n_emb: int):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=n_categories, embedding_dim=n_emb)
        # The LSTM consumes the continuous features plus the embedding vector.
        self.lstm = nn.LSTM(
            input_size=input_size + n_emb,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=True,
        )
        # Both directions are concatenated, hence 2 * hidden_size inputs.
        head_layers = [
            nn.Linear(2 * hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_size, 1),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        # x is indexed as [batch, step, feature]; the last feature is the category id.
        continuous = x[:, :, :-1]
        category_ids = x[:, :, -1].long()
        lstm_input = torch.cat([continuous, self.embedding(category_ids)], dim=2)
        hidden_states, _ = self.lstm(lstm_input)
        scores = torch.sigmoid(self.classifier(hidden_states))
        # Drop the trailing singleton dimension: one score per (batch, step).
        return scores.view(x.size(0), -1)

## Training Utilities

In [None]:
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    Also writes ``seed`` to the ``PYTHONHASHSEED`` environment variable, sets
    ``cudnn.deterministic`` to True and ``cudnn.benchmark`` to False.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)

    # The same seed goes to every generator used in this file.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)

    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
class F1Callback(Callback):
    """Metric callback that reports F1 at a fixed score threshold.

    Per batch, the F1 of that batch is written to
    ``state.batch_metrics[prefix]``. Per loader, the F1 over all batches seen
    since ``on_loader_start`` is written to ``state.epoch_metrics`` under
    ``"<loader>_epoch_<prefix>"``.
    """

    def __init__(self,
                 input_key: str="targets",
                 output_key: str="logits",
                 threshold: float = 0.5,
                 prefix: str = "f1"):
        super().__init__(CallbackOrder.Metric)

        self.prefix = prefix
        self.threshold = threshold
        self.input_key = input_key
        self.output_key = output_key

    def _binarize(self, scores):
        # Scores strictly above the threshold count as positive predictions.
        return (scores > self.threshold).astype(int)

    def on_loader_start(self, state: IRunner):
        # Fresh accumulators for every pass over a loader.
        self.prediction, self.target = [], []

    def on_batch_end(self, state: IRunner):
        y_true = state.input[self.input_key].detach().cpu().numpy().reshape(-1)
        y_score = state.output[self.output_key].detach().cpu().numpy().reshape(-1)

        self.prediction.append(y_score)
        self.target.append(y_true)

        # A batch whose targets sum to zero is scored 1.0 without calling f1_score.
        has_positive = y_true.sum() != 0
        if has_positive:
            batch_score = f1_score(y_true=y_true, y_pred=self._binarize(y_score))
        else:
            batch_score = 1.0
        state.batch_metrics[self.prefix] = batch_score

    def on_loader_end(self, state: IRunner):
        all_scores = np.concatenate(self.prediction, axis=0)
        all_targets = np.concatenate(self.target, axis=0)
        loader_score = f1_score(y_true=all_targets, y_pred=self._binarize(all_scores))

        loader_name = state.valid_loader if state.is_valid_loader else "train"
        state.epoch_metrics[loader_name + "_epoch_" + self.prefix] = loader_score
            
            
class AUCCallback(Callback):
    """Metric callback that reports ROC AUC.

    Per batch, the AUC of that batch is written to
    ``state.batch_metrics[prefix]``. Per loader, the AUC over all batches seen
    since ``on_loader_start`` is written to ``state.epoch_metrics`` under
    ``"<loader>_epoch_<prefix>"``.
    """

    def __init__(self,
                 input_key: str="targets",
                 output_key: str="logits",
                 prefix: str = "auc"):
        super().__init__(CallbackOrder.Metric)

        self.prefix = prefix
        self.input_key = input_key
        self.output_key = output_key

    def on_loader_start(self, state: IRunner):
        # Fresh accumulators for every pass over a loader.
        self.prediction, self.target = [], []

    def on_batch_end(self, state: IRunner):
        y_true = state.input[self.input_key].detach().cpu().numpy().reshape(-1)
        y_score = state.output[self.output_key].detach().cpu().numpy().reshape(-1)

        self.prediction.append(y_score)
        self.target.append(y_true)

        # A batch whose targets sum to zero is scored 1.0 without calling
        # roc_auc_score.
        # NOTE(review): only the all-zero case is guarded; a batch whose
        # targets are all ones would still reach roc_auc_score - confirm that
        # cannot happen.
        has_positive = y_true.sum() != 0
        if has_positive:
            batch_score = roc_auc_score(y_true=y_true, y_score=y_score)
        else:
            batch_score = 1.0
        state.batch_metrics[self.prefix] = batch_score

    def on_loader_end(self, state: IRunner):
        all_scores = np.concatenate(self.prediction, axis=0)
        all_targets = np.concatenate(self.target, axis=0)
        loader_score = roc_auc_score(y_true=all_targets, y_score=all_scores)

        loader_name = state.valid_loader if state.is_valid_loader else "train"
        state.epoch_metrics[loader_name + "_epoch_" + self.prefix] = loader_score

## 5-fold training

In [None]:
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Group key per play: every row of a (gameKey, playID) pair shares it, so
# GroupKFold never splits one play across train and validation.
X["group"] = X["gameKey"].map(str) + "_" + X["playID"].map(str)

# Out-of-fold predictions, one frame per fold, concatenated after the loop.
oof_folds = []
scores = 0.0

set_seed(42)

# Model input layout: continuous columns first, categorical event id last
# (BiLSTMModel.forward reads the category from the final feature slot).
new_columns = columns + ["event_id"]

gkf = GroupKFold(n_splits=5)
for fold, (trn_idx, val_idx) in enumerate(gkf.split(X, groups=X["group"])):
    print("*" * 100)
    print(f"Fold: {fold}")

    # split() yields positional row numbers, so they are applied with iloc;
    # loc is only equivalent while X carries a default RangeIndex.
    X_trn = X.iloc[trn_idx].reset_index(drop=True)
    X_val = X.iloc[val_idx].reset_index(drop=True)

    trn_dataset = LSTMDataset(X_trn, columns=new_columns)
    val_dataset = LSTMDataset(X_val, columns=new_columns)

    trn_loader = torchdata.DataLoader(trn_dataset, batch_size=128, shuffle=True)
    val_loader = torchdata.DataLoader(val_dataset, batch_size=256, shuffle=False)

    loaders = {
        "train": trn_loader,
        "valid": val_loader
    }

    # n_categories follows the event vocabulary so the embedding table cannot
    # fall out of sync with events_to_id (13 entries at the time of writing).
    model = BiLSTMModel(input_size=len(columns), hidden_size=64, num_layers=1,
                        n_categories=len(events_to_id), n_emb=8)
    # The model already applies a sigmoid, hence BCELoss on probabilities.
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
    callbacks = [
        AUCCallback(input_key="targets",
                    prefix="auc"),
        F1Callback(input_key="targets",
                   threshold=0.1,
                   prefix="f1_at_01"),
        F1Callback(input_key="targets",
                   threshold=0.3,
                   prefix="f1_at_03")
    ]

    # NOTE(review): the loss and the metric callbacks receive the full padded
    # sequences; only the out-of-fold extraction below applies `mask`.
    runner = SupervisedRunner(device=device,
                              input_key="x",
                              input_target_key="targets")
    runner.train(
        model=model,
        criterion=criterion,
        loaders=loaders,
        optimizer=optimizer,
        scheduler=scheduler,
        num_epochs=50,
        verbose=False,
        logdir=f"fold{fold}",
        callbacks=callbacks,
        main_metric="epoch_auc",
        minimize_metric=False)

    # Out-of-fold inference. eval() is set explicitly so dropout is disabled
    # whatever mode the runner left the model in.
    # NOTE(review): the model holds whatever weights runner.train left in it -
    # confirm whether the best checkpoint should be loaded first.
    model.eval()
    batch_predict = []
    batch_targets = []
    batch_indices = []
    for batch in val_loader:
        # Keep only real (non-padded) sequence positions.
        mask = batch["mask"]
        index = batch["index"][mask].numpy()
        batch_indices.append(index)

        targets = batch["targets"]
        batch_targets.append(targets[mask].numpy())

        x = batch["x"].to(device)
        with torch.no_grad():
            out = model(x).detach().cpu()

        batch_predict.append(out[mask].numpy())

    batch_predictions_array = np.concatenate(batch_predict)
    batch_targets_array = np.concatenate(batch_targets)
    batch_indices_array = np.concatenate(batch_indices)

    oof_folds.append(pd.DataFrame({
        "pred": batch_predictions_array,
        "targets": batch_targets_array,
        "index": batch_indices_array}))

# Same rows and per-fold row labels as repeated DataFrame.append calls, but
# pd.concat also exists on pandas >= 2.0, where DataFrame.append was removed.
oof = pd.concat(oof_folds)

## Evaluation

In [None]:
# Row-level out-of-fold metrics: AUC, then F1 at two score thresholds.
oof_pred, oof_true = oof["pred"], oof["targets"]

score = roc_auc_score(y_true=oof_true, y_score=oof_pred)
print(f"AUC: {score:.5f}")

for threshold in (0.1, 0.3):
    score = f1_score(y_true=oof_true, y_pred=(oof_pred > threshold))
    print(f"F1@{threshold}: {score:.5f}")

In [None]:
oof = oof.sort_values(by="index")

# X was built from `tracking` by column selection, so the two frames
# correspond row by row. Attach `time` to X by position *before* the merge:
# if the inner merge on "index" ever drops or reorders rows, each row still
# keeps its own timestamp. A positional concat after the merge would silently
# misalign them. A length mismatch raises here instead of passing unnoticed.
X_ = (
    X.assign(time=tracking["time"].values)
    .merge(oof, on="index")
    .reset_index(drop=True)
)

# Collapse to one row per (gameKey, playID, time): the group counts as
# positive if any row in it is, and its score is the highest row score.
time_level_prediction = X_.groupby(["gameKey", "playID", "time"]).agg({
    "targets": "max",
    "pred": "max"
})

In [None]:
# Display the aggregated targets and scores per (gameKey, playID, time).
time_level_prediction

In [None]:
# Metrics on the aggregated table: AUC, then F1 at two score thresholds.
agg_pred = time_level_prediction["pred"]
agg_true = time_level_prediction["targets"]

score = roc_auc_score(y_true=agg_true, y_score=agg_pred)
print(f"AUC: {score:.5f}")

for threshold in (0.1, 0.08):
    score = f1_score(y_true=agg_true, y_pred=(agg_pred > threshold))
    print(f"F1@{threshold}: {score:.5f}")

In [None]:
# Confusion matrix of the aggregated predictions at threshold 0.06, with the
# cells normalised over all samples, drawn as an annotated heatmap.
labels = ["0", "1"]
predicted_positive = time_level_prediction["pred"] > 0.06
cm = confusion_matrix(
    y_true=time_level_prediction["targets"],
    y_pred=predicted_positive,
    normalize="all",
)

fig, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(cm, annot=True, cmap='Blues', lw=0.5, xticklabels=labels, yticklabels=labels)
ax.set(xlabel='Predicted Label', ylabel='True Label', aspect='equal')