# Model Training: Returns For Loss

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import ipywidgets as widgets
import warnings
import datetime
import joblib
import json
import gc

In [None]:
from typing import List, Dict, Set, Any, Callable, Optional
from tqdm.notebook import tqdm
from io import StringIO
from IPython.display import HTML
from timeit import default_timer as timer
from collections import Counter

In [None]:
from sklearn.exceptions import ConvergenceWarning
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)

# Helpers

In [None]:
def show_feature_dist_plots(fields, df):
    """Draw one histogram per requested column on a two-column grid of axes.

    Args:
        fields: Sequence of ``(column, label)`` pairs; ``column`` is looked up
            in ``df`` and ``label`` is used as the x-axis label.
        df: DataFrame holding the columns to plot.
    """
    row_count = int(np.ceil(len(fields) / 2))
    figure, grid = plt.subplots(row_count, 2)
    # Dotted reference lines through zero on both axes.
    guide_style = dict(color="black", alpha=0.5, dashes=[1, 1])

    for panel, (column, label) in zip(grid.flatten(), fields):
        sns.histplot(df[column], ax=panel)
        panel.axvline(0, **guide_style)
        panel.axhline(0, **guide_style)
        panel.set_xlabel(label)
        panel.set_ylabel("Frequency")

    # Five inches of height per grid row.
    figure.set_size_inches(20, 5 * row_count)
    plt.show()


def show_scatter_plots(fields, df, Y, ylabel):
    """Draw a regression scatter of each requested column against ``Y``.

    Args:
        fields: Sequence of ``(column, label)`` pairs; ``column`` is looked up
            in ``df`` and ``label`` is used as the x-axis label.
        df: DataFrame holding the columns to plot on the x axis.
        Y: Values plotted on the y axis of every panel.
        ylabel: Label applied to the y axis of every panel.
    """
    row_count = int(np.ceil(len(fields) / 2))
    figure, grid = plt.subplots(row_count, 2)
    # Dotted reference lines through zero on both axes.
    guide_style = dict(color="black", alpha=0.5, dashes=[1, 1])
    fit_line_style = dict(color="red", dashes=[9, 1])
    point_style = dict(s=5, alpha=0.1, color="blue")

    for panel, (column, label) in zip(grid.flatten(), fields):
        sns.regplot(
            x=df[column],
            y=Y,
            ax=panel,
            line_kws=fit_line_style,
            scatter_kws=point_style,
        )
        panel.axvline(0, **guide_style)
        panel.axhline(0, **guide_style)
        panel.set_xlabel(label)
        panel.set_ylabel(ylabel)

    # Five inches of height per grid row.
    figure.set_size_inches(20, 5 * row_count)
    plt.show()

In [None]:
def plot_regression_distribution(Y_vals, ax):
    """Plot a histogram of ``Y_vals`` on ``ax`` with a fixed -110..110 x range.

    Ticks are placed every 10 units so panels drawn with this helper share
    the same x axis.
    """
    x_limit = 110
    tick_step = 10
    sns.histplot(Y_vals, ax=ax)
    ax.set_xlim(-x_limit, x_limit)
    # The stop value is exclusive, so go one step past the limit to include it.
    ax.set_xticks(np.arange(-x_limit, x_limit + tick_step, tick_step))
    ax.set_xlabel("Return Yards Gained")
    ax.set_ylabel("Frequency")

In [None]:
def show_binary_scores(Y_true, Y_pred):
    """Print accuracy, precision, recall, F1 and ROC AUC for binary predictions.

    All five metrics are computed from the same ``Y_true``/``Y_pred`` pair, so
    the ROC AUC reported here is based on the hard predictions passed in, not
    on predicted probabilities.
    """
    acc, prc, rec, f1s, auc = (
        metric(Y_true, Y_pred)
        for metric in (
            accuracy_score,
            precision_score,
            recall_score,
            f1_score,
            roc_auc_score,
        )
    )
    report = [
        f"Accuracy  = {acc:.3f}",
        f"Precision = {prc:.3f}",
        f"Recall    = {rec:.3f}",
        f"F1-Score  = {f1s:.3f}",
        f"ROC AUC   = {auc:.3f}",
    ]
    print("\n".join(report))
    # Trailing blank line separates the scores from whatever is shown next.
    print()


def plot_binary_dist(mdl, X, Yb, ax):
    """Overlay histograms of the predicted positive-class probability by actual label.

    Rows where ``Yb`` is False are drawn in blue ("Actual Gain") and rows where
    it is True in red ("Actual Loss"), using column 1 of ``predict_proba``.
    """
    shared_style = dict(binwidth=0.05, binrange=(0, 1), alpha=0.5)

    gain_probs = mdl.predict_proba(X[~Yb])[:, 1]
    sns.histplot(gain_probs, color="b", label="Actual Gain", ax=ax, **shared_style)

    loss_probs = mdl.predict_proba(X[Yb])[:, 1]
    sns.histplot(loss_probs, color="r", label="Actual Loss", ax=ax, **shared_style)

    ax.legend()
    ax.set_xlabel("Predicted Probability of Return For Zero or Loss")


def plot_score_scatter(mdl, X, Yy, ax):
    """Scatter the predicted positive-class probability against ``Yy`` with a fit line.

    Points are red where ``mdl.predict`` returns a truthy value and green
    otherwise; a dashed horizontal line marks ``y = 0``.
    """
    predicted_loss = mdl.predict(X)
    loss_probability = mdl.predict_proba(X)[:, 1]
    point_colors = ["red" if flag else "green" for flag in predicted_loss]

    sns.regplot(
        x=loss_probability,
        y=Yy,
        color="b",
        ax=ax,
        scatter_kws=dict(alpha=0.1, color=point_colors),
        line_kws=dict(color="blue"),
    )
    ax.axhline(0, color="black", alpha=0.5, dashes=[4, 1])
    ax.set_xlabel("Predicted Probability of Return For Zero or Loss")
    ax.set_ylabel("Actual Return Yards Gained")


def show_binary_plots(mdl, X, Yb, Yy):
    """Show the three diagnostic panels for a binary classifier side by side.

    Left to right: probability histograms by actual label, probability vs.
    ``Yy`` scatter, and the confusion matrix of ``mdl`` on ``X``/``Yb``.
    """
    figure, (dist_ax, scatter_ax, matrix_ax) = plt.subplots(1, 3)
    plot_binary_dist(mdl, X, Yb, ax=dist_ax)
    plot_score_scatter(mdl, X, Yy, ax=scatter_ax)
    plot_confusion_matrix(mdl, X, Yb, ax=matrix_ax)
    figure.set_size_inches(20, 4)
    plt.show()


def evaluate_binary_model(mdl, X, Yb, Yy):
    """Print the binary scores of ``mdl`` on ``X`` and then show its diagnostic plots."""
    predictions = mdl.predict(X)
    show_binary_scores(Yb, predictions)
    show_binary_plots(mdl, X, Yb, Yy)

# Load Data

In [None]:
# Directory holding the pre-processed inputs for this notebook.
DIR_VT = "../input/process-punt-return-decision-data"
# One row per return frame; the row and column counts are reported as a sanity check.
df_return_frames = pd.read_csv(f"{DIR_VT}/return_frames.csv")
print(f"Return frames has {df_return_frames.shape[0]:,d} rows and {df_return_frames.shape[1]:,d} cols.")

In [None]:
# The "players" column is read from CSV as JSON text; decode each cell into Python objects.
df_return_frames["players"] = df_return_frames["players"].apply(json.loads)

In [None]:
# Directory holding the original competition CSV files.
DIR = "../input/nfl-big-data-bowl-2022"
df_games = pd.read_csv(f"{DIR}/games.csv")
df_players = pd.read_csv(f"{DIR}/players.csv")
# Plays and PFF data are read from DIR_VT (our patched copies) rather than from DIR.
df_plays = pd.read_csv(f"{DIR_VT}/plays_patched.csv")
df_pff = pd.read_csv(f"{DIR_VT}/pff_patched.csv")

In [None]:
# Seed handed to estimators as ``random_state`` for reproducible runs.
SEED = 0
# Columns that together identify a play.
PLAY_KEYS = ["gameId", "playId"]
# Columns that together identify a frame: the play keys plus "original" and "frameId".
FRAME_KEYS = [*PLAY_KEYS, "original", "frameId"]

# Check Data

In [None]:
# Show which columns are available in the return-frame data.
df_return_frames.columns

In [None]:
# Count how many frames fall under each firstReturnableEvent value.
df_return_frames.firstReturnableEvent.value_counts()

In [None]:
# Keep only frames whose first returnable event is a landing, reception or downing
# of the punt, then compare the filtered row count with the full data set.
df_pre_return = df_return_frames[
    df_return_frames["firstReturnableEvent"].isin(
        ["punt_land", "punt_received", "punt_downed"]
    )
]
len(df_pre_return), len(df_return_frames)

In [None]:
# Choose the frame set used for modelling. NOTE: this binds df_data to the same
# object as df_return_frames (no copy), so columns assigned on df_data also
# appear on df_return_frames.
df_data = df_return_frames
# Alternative selections (each takes a copy):
# df_data = df_pre_return.copy()
# df_data = df_no_penalty_yards.copy()
len(df_data), len(df_return_frames)

# Engineer Features

In [None]:
# Sentinel distance: the closest-player searches start from this value, so it is
# returned when no matching player is closer than it.
MAX_DIST = 200
# y coordinates of the two sidelines (presumably yards; 53 1/3 matches a field width — confirm).
SIDELINE_MIN = 0
SIDELINE_MAX = 53.0 + (1.0 / 3.0)
# NOTE(review): presumably x coordinates of the two goal lines — confirm against the tracking data.
RECEIVING_GOAL_LINE = 10
KICKING_GOAL_LINE = 110


def distance(a: Dict, b: Dict) -> float:
    """Return the Euclidean distance between two mappings with ``"x"`` and ``"y"`` keys."""
    delta_y = a["y"] - b["y"]
    delta_x = a["x"] - b["x"]
    return np.sqrt(delta_y**2 + delta_x**2)


def closest_defender_distance(
    ball_x: float,
    ball_y: float,
    players: List[Dict],
    kickingTeam: str
) -> float:
    """Return the distance from the ball to the nearest kicking-team player.

    Args:
        ball_x: x coordinate of the ball.
        ball_y: y coordinate of the ball.
        players: Player mappings with ``"x"``, ``"y"`` and ``"teamCode"`` keys.
        kickingTeam: ``teamCode`` value identifying the kicking team.

    Returns:
        The smallest distance found, or ``MAX_DIST`` (as a float) when no
        kicking-team player is closer than ``MAX_DIST``.
    """
    ball = {"x": ball_x, "y": ball_y}
    # Use a float sentinel so the function always returns a float. np.vectorize
    # infers its output dtype from the first result, and an int sentinel there
    # would make every later distance get truncated to an integer.
    min_dist = float(MAX_DIST)
    for p in players:
        if p["teamCode"] == kickingTeam:
            d = distance(ball, p)
            if d < min_dist:
                min_dist = d
    return min_dist


def defenders_within_radius(
    ball_x: float,
    ball_y: float,
    players: List[Dict],
    kickingTeam: str,
    radius: int
) -> int:
    """Count kicking-team players strictly closer than ``radius`` to the ball.

    Args:
        ball_x: x coordinate of the ball.
        ball_y: y coordinate of the ball.
        players: Player mappings with ``"x"``, ``"y"`` and ``"teamCode"`` keys.
        kickingTeam: ``teamCode`` value identifying the kicking team.
        radius: Distance threshold (exclusive).
    """
    ball_position = {"x": ball_x, "y": ball_y}
    return sum(
        1
        for player in players
        if player["teamCode"] == kickingTeam
        and distance(ball_position, player) < radius
    )


def blockers_within_radius(
    ball_x: float,
    ball_y: float,
    players: List[Dict],
    kickingTeam: str,
    radius: int
) -> int:
    """Count non-kicking-team players near the ball whose x is at or beyond the ball's x.

    A player is counted when its ``teamCode`` differs from ``kickingTeam``, its
    ``"x"`` is greater than or equal to ``ball_x``, and it is strictly closer
    than ``radius`` to the ball.

    Args:
        ball_x: x coordinate of the ball.
        ball_y: y coordinate of the ball.
        players: Player mappings with ``"x"``, ``"y"`` and ``"teamCode"`` keys.
        kickingTeam: ``teamCode`` value identifying the kicking team.
        radius: Distance threshold (exclusive).
    """
    ball_position = {"x": ball_x, "y": ball_y}
    return sum(
        1
        for player in players
        if player["teamCode"] != kickingTeam
        and player["x"] >= ball_x
        and distance(ball_position, player) < radius
    )


def distance_to_sideline(ball_y: float) -> float:
    """Return the distance from ``ball_y`` to the nearer sideline.

    Args:
        ball_y: y coordinate of the ball.

    Returns:
        The smaller of the distances to ``SIDELINE_MIN`` and ``SIDELINE_MAX``,
        or ``0.0`` when the ball is on or outside either sideline.
    """
    d_bottom = ball_y - SIDELINE_MIN
    d_top = SIDELINE_MAX - ball_y
    # Out of bounds. Return a float rather than an int: np.vectorize infers its
    # output dtype from the first result, so an int here on the first row would
    # truncate every later distance to an integer.
    if d_bottom <= 0 or d_top <= 0:
        return 0.0
    # Get distance to closest sideline
    return float(min(d_bottom, d_top))


def speed_upfield(player: Optional[Dict]) -> float:
    """Return the ``sin(dir)`` component of a player's speed.

    Computed as ``s * sin(dir)`` with ``dir`` converted from degrees to
    radians. (Presumably this is the component along the field's x axis —
    confirm against the tracking data's angle convention.)

    Args:
        player: Mapping with ``"s"`` (speed) and ``"dir"`` (degrees) keys, or
            ``None``/NaN when there is no player.

    Returns:
        The signed speed component, or ``0.0`` when there is no player.
    """
    # Only run the NaN check on non-dict values, and return a float zero:
    # np.vectorize infers its output dtype from the first result, so an int 0
    # on the first row would truncate every later speed to an integer.
    if player is None or (not isinstance(player, dict) and pd.isna(player)):
        return 0.0
    angle_rads = np.deg2rad(player["dir"])
    speed = player["s"]
    return speed * np.sin(angle_rads)


def speed_lateral(player: Optional[Dict]) -> float:
    """Return the magnitude of the ``cos(dir)`` component of a player's speed.

    Computed as ``s * |cos(dir)|`` with ``dir`` converted from degrees to
    radians. (Presumably this is the component across the field — confirm
    against the tracking data's angle convention.)

    Args:
        player: Mapping with ``"s"`` (speed) and ``"dir"`` (degrees) keys, or
            ``None``/NaN when there is no player.

    Returns:
        The unsigned speed component, or ``0.0`` when there is no player.
    """
    # Only run the NaN check on non-dict values, and return a float zero:
    # np.vectorize infers its output dtype from the first result, so an int 0
    # on the first row would truncate every later speed to an integer.
    if player is None or (not isinstance(player, dict) and pd.isna(player)):
        return 0.0
    angle_rads = np.deg2rad(player["dir"])
    speed = player["s"]
    # Take absolute value to get lateral speed in either direction
    return speed * np.abs(np.cos(angle_rads))


def closest_defender_speed_upfield(
    ball_x: float,
    ball_y: float,
    players: List[Dict],
    kickingTeam: str
) -> float:
    """Return ``speed_upfield`` of the kicking-team player nearest the ball.

    Only players strictly closer than ``MAX_DIST`` are considered; when none
    qualifies, ``speed_upfield`` is called with ``None``.

    Args:
        ball_x: x coordinate of the ball.
        ball_y: y coordinate of the ball.
        players: Player mappings with position, ``"teamCode"``, ``"s"`` and
            ``"dir"`` keys.
        kickingTeam: ``teamCode`` value identifying the kicking team.
    """
    ball_position = {"x": ball_x, "y": ball_y}
    nearest, nearest_gap = None, MAX_DIST
    for candidate in (pl for pl in players if pl["teamCode"] == kickingTeam):
        gap = distance(ball_position, candidate)
        # Strict comparison keeps the first player seen when distances tie.
        if gap < nearest_gap:
            nearest, nearest_gap = candidate, gap
    return speed_upfield(nearest)


def closest_defender_speed_lateral(
    ball_x: float,
    ball_y: float,
    players: List[Dict],
    kickingTeam: str
) -> float:
    """Return ``speed_lateral`` of the kicking-team player nearest the ball.

    Only players strictly closer than ``MAX_DIST`` are considered; when none
    qualifies, ``speed_lateral`` is called with ``None``.

    Args:
        ball_x: x coordinate of the ball.
        ball_y: y coordinate of the ball.
        players: Player mappings with position, ``"teamCode"``, ``"s"`` and
            ``"dir"`` keys.
        kickingTeam: ``teamCode`` value identifying the kicking team.
    """
    ball_position = {"x": ball_x, "y": ball_y}
    nearest, nearest_gap = None, MAX_DIST
    for candidate in (pl for pl in players if pl["teamCode"] == kickingTeam):
        gap = distance(ball_position, candidate)
        # Strict comparison keeps the first player seen when distances tie.
        if gap < nearest_gap:
            nearest, nearest_gap = candidate, gap
    return speed_lateral(nearest)

In [None]:
# Element-wise wrappers so the per-row feature functions can be applied to whole columns.
# ``otypes`` is given explicitly: without it np.vectorize infers the output dtype
# from the first row's result (so an int result there would truncate every later
# float) and refuses to run on empty input.
vec_closest_defender_distance = np.vectorize(closest_defender_distance, otypes=[float])
vec_defenders_within_radius = np.vectorize(defenders_within_radius, otypes=[int])
vec_blockers_within_radius = np.vectorize(blockers_within_radius, otypes=[int])
vec_distance_to_sideline = np.vectorize(distance_to_sideline, otypes=[float])
vec_closest_defender_speed_upfield = np.vectorize(closest_defender_speed_upfield, otypes=[float])
vec_closest_defender_speed_lateral = np.vectorize(closest_defender_speed_lateral, otypes=[float])

In [None]:
# Main inputs
bx = df_data["ballX"]
by = df_data["ballY"]
p = df_data["players"]
kt = df_data["possessionTeam"]

In [None]:
# Defender and blocker features
df_data["closestDefenderDistance"] = vec_closest_defender_distance(bx, by, p, kt)
df_data["defendersWithinRadius"] = vec_defenders_within_radius(bx, by, p, kt, 2)
df_data["blockersWithinRadius"] = vec_blockers_within_radius(bx, by, p, kt, 5)

In [None]:
# Field position features
df_data["distanceToSideline"] = vec_distance_to_sideline(by)
df_data["distanceToOwnGoalLine"] = df_data["ballYardline"]
df_data["isInsideOwnEndzone"] = df_data["distanceToOwnGoalLine"] <= 0
df_data["isInsideOwn20"] = df_data["distanceToOwnGoalLine"] <= 20
df_data["isInsideOwn10"] = df_data["distanceToOwnGoalLine"] <= 10

In [None]:
# Returner features
df_data["closestDefenderSpeedUpfield"] = vec_closest_defender_speed_upfield(bx, by, p, kt)
df_data["closestDefenderSpeedLateral"] = vec_closest_defender_speed_lateral(bx, by, p, kt)

# Split Cross Validation Data

In [None]:
# Column holding the numeric target (return yards gained).
TARGET_VAL = "returnYardsGained"
# Column holding the boolean target used by the classifiers.
TARGET_BOOL = "isZeroOrLoss"

In [None]:
# Partition the data by its pre-assigned "split" label, then report the size
# and positive-class rate of each partition.
df_train, df_validate, df_test = (
    df_data[df_data["split"] == split_name]
    for split_name in ("train", "validate", "test")
)
print(f"Train:    {len(df_train):,d} frames")
print(f"Validate: {len(df_validate):,d} frames")
print(f"Test:     {len(df_test):,d} frames")
print()
print(f"Train:    {(df_train[TARGET_BOOL].mean() * 100):.1f}% zero or loss")
print(f"Validate: {(df_validate[TARGET_BOOL].mean() * 100):.1f}% zero or loss")
print(f"Test:     {(df_test[TARGET_BOOL].mean() * 100):.1f}% zero or loss")

In [None]:
# Compare the target distribution across train / validate / test, one panel each.
fig, axes = plt.subplots(1, 3)
for _split_frame, _split_ax in zip((df_train, df_validate, df_test), axes):
    plot_regression_distribution(_split_frame[TARGET_VAL], ax=_split_ax)
fig.set_size_inches(20, 4)
plt.show()

In [None]:
# Separate target variable from input variables for each split
# All inputs are based on the decision frame

# Feature columns fed to every model (and reused when writing predictions).
INPUT_COLS = [
    # Defender and blocker features
    "closestDefenderDistance",
    "defendersWithinRadius",
    "blockersWithinRadius",
    "closestDefenderSpeedUpfield",
    "closestDefenderSpeedLateral",
    # Field position features
    "ballYardline",
    "distanceToSideline",
    "isInsideOwnEndzone",
    "isInsideOwn20",
    "isInsideOwn10",
]

# Naming: Yy_* is the numeric target, Yb_* the boolean target, X_* the inputs.
Yy_train = df_train[TARGET_VAL]
Yb_train = df_train[TARGET_BOOL]
X_train = df_train[INPUT_COLS]

Yy_validate = df_validate[TARGET_VAL]
Yb_validate = df_validate[TARGET_BOOL]
X_validate = df_validate[INPUT_COLS]

Yy_test = df_test[TARGET_VAL]
Yb_test = df_test[TARGET_BOOL]
X_test = df_test[INPUT_COLS]

In [None]:
# (column, axis label) pairs for the exploratory plots. Pairs are grouped two
# per line break because the plot helpers lay panels out on a two-column grid.
fields = [
    ("receivingYardline", "Receiving Yard Line"),
    ("returnYardsGained", "Return Yards Gained"),
    
    ("kickLength", "Kick Length (yds)"),
    ("ballYardline", "Ball Yard Line"),
    
    ("closestDefenderDistance", "Closest Defender Distance (yds)"),
    ("distanceToSideline", "Distance To Sideline (yds)"),
    
    ("closestDefenderSpeedUpfield", "Closest Defender Speed Upfield (yds/sec)"),
    ("closestDefenderSpeedLateral", "Closest Defender Speed Lateral (yds/sec)"),
    
    ("defendersWithinRadius", "Defenders Within Radius (players)"),
    ("blockersWithinRadius", "Blockers Within Radius (players)"),
]

In [None]:
# Histogram of every listed field on the training split.
show_feature_dist_plots(fields, df_train)

In [None]:
# Each listed field against the numeric target on the training split.
show_scatter_plots(fields, df_train, Yy_train, ylabel="Return Yards Gained")

# Train Models

In [None]:
# Positive-class rate per split, repeated here as context for the model scores below.
print(f"Train:    {(df_train[TARGET_BOOL].mean() * 100):.1f}% zero or loss")
print(f"Validate: {(df_validate[TARGET_BOOL].mean() * 100):.1f}% zero or loss")
print(f"Test:     {(df_test[TARGET_BOOL].mean() * 100):.1f}% zero or loss")

In [None]:
# Benchmark against naive model
# Benchmark against naive model: a seeded DummyClassifier using the "stratified" strategy.
dummy_mdl = DummyClassifier(strategy="stratified", random_state=SEED)
dummy_mdl.fit(X_train, Yb_train)

In [None]:
# Score and plot the dummy baseline on the validation split.
evaluate_binary_model(dummy_mdl, X_validate, Yb_validate, Yy_validate)

In [None]:
# Second naive benchmark: score a constant prediction of True for every validation row.
show_binary_scores(Yb_validate, [True] * len(Yb_validate))

In [None]:
# Train an L2-penalised logistic regression with the saga solver and report
# the wall-clock training time.
s = timer()
mdl_lr = LogisticRegression(
    penalty="l2",
    solver="saga",
    random_state=SEED,
)
# Optional: wrap the fit in this block to silence ConvergenceWarning.
# with warnings.catch_warnings():
#     warnings.filterwarnings("ignore", category=ConvergenceWarning)
mdl_lr.fit(X_train, Yb_train)
print(f"Trained model in {timer()-s:.1f} secs.")

In [None]:
# Score and plot the logistic regression on the validation split.
evaluate_binary_model(mdl_lr, X_validate, Yb_validate, Yy_validate)

In [None]:
# Same logistic regression as above, but with class_weight="balanced".
s = timer()
mdl_lr_bal = LogisticRegression(
    penalty="l2",
    solver="saga",
    class_weight="balanced",
    random_state=SEED,
)
mdl_lr_bal.fit(X_train, Yb_train)
print(f"Trained model in {timer()-s:.1f} secs.")

In [None]:
# Score and plot the class-balanced logistic regression on the validation split.
evaluate_binary_model(mdl_lr_bal, X_validate, Yb_validate, Yy_validate)

In [None]:
# Train a 100-tree random forest and report the wall-clock training time.
s = timer()
mdl_rf = RandomForestClassifier(
    n_estimators=100,
    # Seeded like the other models so the fitted forest, its scores and the
    # saved predictions are reproducible across runs.
    random_state=SEED,
)
mdl_rf.fit(X_train, Yb_train)
print(f"Trained model in {timer()-s:.1f} secs.")

In [None]:
# Score and plot the random forest on the validation split.
evaluate_binary_model(mdl_rf, X_validate, Yb_validate, Yy_validate)

In [None]:
# Train a support vector classifier with probability estimates enabled and
# report the wall-clock training time.
s = timer()
mdl_svc = SVC(
    gamma="auto",
    probability=True,
    # The probability estimates shuffle the data internally; seed that step so
    # predict_proba output is reproducible across runs, like the other models.
    random_state=SEED,
)
mdl_svc.fit(X_train, Yb_train)
print(f"Trained model in {timer()-s:.1f} secs.")

In [None]:
# Score and plot the support vector classifier on the validation split.
evaluate_binary_model(mdl_svc, X_validate, Yb_validate, Yy_validate)

# Output Predictions

In [None]:
def set_predictions_output(df, name, model):
    """Add a model's predictions to ``df`` as two new columns and return ``df``.

    ``loss_<name>`` receives ``model.predict`` and ``prob_<name>`` receives
    column 1 of ``model.predict_proba``, both computed on ``df[INPUT_COLS]``.
    The frame is modified in place.
    """
    features = df[INPUT_COLS]
    loss_col, prob_col = f"loss_{name}", f"prob_{name}"
    df[loss_col] = model.predict(features)
    df[prob_col] = model.predict_proba(features)[:, 1]
    return df


def save_models_and_predictions(df, models):
    """Persist each model with joblib and write all their predictions to one CSV.

    Args:
        df: Frame with a boolean ``original`` column, a ``players`` column of
            JSON-serialisable objects, and every column in ``INPUT_COLS``.
            Only rows where ``original`` is truthy are written out.
        models: Iterable of ``(model_name, model)`` pairs. Each model is saved
            to ``models/<model_name>.joblib`` and contributes
            ``loss_<model_name>`` / ``prob_<model_name>`` columns.

    Side effects:
        Creates the ``models`` directory if needed, writes one ``.joblib`` file
        per model and writes ``predictions.csv`` in the working directory.
    """
    # Local import keeps this function self-contained.
    import os

    df_out = df[df.original].copy()
    # Convert JSON columns to strings
    df_out["players"] = df_out["players"].apply(json.dumps)
    # Create the output directory here instead of relying on a separate shell
    # cell having been run first.
    os.makedirs("models", exist_ok=True)
    # Go through each model
    for model_name, model in tqdm(models):
        # Save model
        model_outfile = f"models/{model_name}.joblib"
        joblib.dump(model, model_outfile)
        print(f"Saved model to file: {model_outfile}")
        # Add predictions to output
        df_out = set_predictions_output(df_out, model_name, model)
    # Save predictions
    outfile = "predictions.csv"
    df_out.to_csv(outfile, index=False)
    print(f"Wrote {(df_out.shape[0]):,d} rows and {df_out.shape[1]} cols to file: {outfile}")

In [None]:
# Shell command (IPython "!" syntax): make sure the directory the models are saved into exists.
!mkdir -p models

In [None]:
# (name, fitted model) pairs to persist; the name becomes the .joblib file stem
# and the suffix of the prediction columns.
chosen_models = [
    ("rf_binary", mdl_rf),
    ("lr_binary", mdl_lr),
    ("lr_bal_binary", mdl_lr_bal),
    ("svc_binary", mdl_svc),
]

In [None]:
# Save every chosen model and write their predictions for the full data set.
save_models_and_predictions(df_data, chosen_models)