# Process Punt Return Decision Data

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.animation import FuncAnimation
import ipywidgets as widgets
import datetime
import json
import gc

from typing import List, Dict, Set, Any, Callable, Optional
from tqdm.notebook import tqdm
from io import StringIO
from IPython.display import HTML
from timeit import default_timer as timer

In [None]:
"""
Methods to help with data IO, cleaning, and patches.
"""


def reorient_tracking_data(df: pd.DataFrame) -> pd.DataFrame:
    """Rotate every play whose playDirection is "right" so all plays run "left".

    For rows with ``playDirection == "right"``:
      * ``x`` / ``ballLandingX`` become ``120 - value``
      * ``y`` / ``ballLandingY`` become ``(53 + 1/3) - value``
      * ``o`` / ``dir`` are turned by 180 degrees (modulo 360)
      * ``playDirection`` is set to "left"
    Rows with any other direction are left as they are. Each column is only
    touched when it is present.

    Args:
        df: Tracking rows; must contain a ``playDirection`` column.

    Returns:
        A new DataFrame with the rotated values. The input frame is not
        modified, which matches reflect_tracking_data.
    """
    x_top = 120
    y_top = 53.0 + (1.0 / 3.0)
    # Work on a copy so the caller's frame is not silently rewritten.
    df = df.copy()
    plays_to_rotate = df.playDirection == "right"
    cols = set(df.columns)

    # Position columns (including our custom ballLanding columns) are mirrored
    # through the far edge of their axis.
    mirrored = (
        ("x", x_top),
        ("y", y_top),
        ("ballLandingX", x_top),
        ("ballLandingY", y_top),
    )
    for col, top in mirrored:
        if col in cols:
            df[col] = np.where(plays_to_rotate, (df[col] * -1) + top, df[col])

    # Angle columns are turned half a circle.
    for col in ("o", "dir"):
        if col in cols:
            df[col] = np.where(plays_to_rotate, (df[col] + 180) % 360, df[col])

    if "playDirection" in cols:
        df["playDirection"] = np.where(
            plays_to_rotate,
            "left",
            df["playDirection"]
        )
    return df


def reflect_tracking_data(df_original: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the tracking rows mirrored across the field's long axis.

    ``y`` and ``ballLandingY`` become ``(53 + 1/3) - value``. The angle columns
    ``o`` and ``dir`` have their sign flipped, 180 degrees added, and are then
    wrapped into [0, 360) with a modulo. Columns that are absent are skipped;
    every other column is copied unchanged.

    Args:
        df_original: Tracking rows. Not modified.

    Returns:
        A reflected copy of ``df_original``.
    """
    df = df_original.copy()
    y_top = 53.0 + (1.0 / 3.0)
    cols = set(df.columns)
    # Includes our custom column ballLandingY.
    for col in ("y", "ballLandingY"):
        if col in cols:
            df[col] = (df[col] * -1) + y_top
    # Flip the sign of the angle, then add 180 degrees,
    # then convert negatives to [0, 360) by using modulo.
    for col in ("o", "dir"):
        if col in cols:
            df[col] = ((df[col] * -1) + 180) % 360
    return df


def apply_pff_patches(df_pff: pd.DataFrame) -> pd.DataFrame:
    """Rewrite jersey labels in the PFF role columns using an embedded patch table.

    The embedded CSV lists ``role,pffJersey,trackingJersey`` rows. For each row,
    every occurrence of ``pffJersey`` inside the ``role`` column of the frame is
    replaced by ``trackingJersey``. Patches are applied in table order, so a
    later patch also sees the output of an earlier one for the same role.
    NOTE(review): puntRushers has both "NYJ 23 -> NYJ 37" and
    "NYJ 37 -> NYJ 43", which therefore chain — confirm this is intended.

    Args:
        df_pff: PFF scouting rows; must contain every role column named in the
            patch table. Not modified.

    Returns:
        A patched copy of ``df_pff``. Missing values stay missing.
    """
    raw_pff_patches = 'role,pffJersey,trackingJersey\ntackler,TEN 47,TEN 46\nmissedTackler,LA 38,LA 21\ngunners,TEN 47,TEN 46\ngunners,OAK 14,OAK 38\ngunners,NYJ 25,NYJ 40\ngunners,BUF 29,BUF 36\ngunners,NYJ 23,NYJ 37\nvises,CAR 20,CAR 41\nvises,CLE 25,CLE 35\nvises,LA 38,LA 21\nvises,MIA 30,MIA 23\nvises,NYG 23,NYG 37\nvises,NYJ 25,NYJ 40\nvises,BUF 29,BUF 36\nvises,OAK 14,OAK 38\nvises,SEA 24,SEA 8\nvises,TEN 47,TEN 46\npuntRushers,ATL 55,ATL 62\npuntRushers,DAL 59,DAL 53\npuntRushers,DEN 91,DEN 58\npuntRushers,IND 33,IND 36\npuntRushers,NO 27,NO 36\npuntRushers,NYJ 23,NYJ 37\npuntRushers,NYJ 37,NYJ 43\nspecialTeamsSafeties,BUF 29,BUF 36\nspecialTeamsSafeties,CAR 20,CAR 41\nspecialTeamsSafeties,IND 33,IND 36\nspecialTeamsSafeties,LA 38,LA 21\nspecialTeamsSafeties,MIA 30,MIA 23\nspecialTeamsSafeties,NYG 23,NYG 37\nspecialTeamsSafeties,SEA 24,SEA 8\nspecialTeamsSafeties,TEN 25,TEN 41\n'
    df_patches = pd.read_csv(StringIO(raw_pff_patches))
    df_out = df_pff.copy()
    for patch in df_patches.itertuples(index=False):
        # regex=False makes this a literal substring replace on every pandas
        # version; strings without a match come back unchanged, so no separate
        # "contains" guard is needed.
        df_out[patch.role] = df_out[patch.role].str.replace(
            patch.pffJersey,
            patch.trackingJersey,
            regex=False,
        )
    return df_out


def fill_missing_hangtimes(df_pff: pd.DataFrame) -> pd.DataFrame:
    """Fill missing hangTime values from a small hand-maintained table.

    Only rows whose ``hangTime`` is missing are changed; a row that already has
    a value keeps it even when the table has an entry for that play. Rows with
    no table entry stay missing. The input frame is not modified.
    """
    play_keys = ['gameId', 'playId']
    # (gameId, playId, hangTime)
    manual_hang_times = pd.DataFrame(
        [
            (2018092302, 3437, 4),
            (2018100711, 253, 1.8),
            (2018102111, 3651, 0.1),
            (2019090807, 1293, 2.8),
            (2019102006, 2301, 3.4),
            (2020100401, 211, 2.4),
        ],
        columns=play_keys + ['newHangTime'],
    ).set_index(play_keys)
    patched = df_pff.join(manual_hang_times, on=play_keys)
    observed = patched['hangTime']
    patched['hangTime'] = np.where(
        observed.notna(),
        observed,
        patched['newHangTime'],
    )
    # The helper column was only needed for the fill.
    return patched.drop(columns=['newHangTime'])


def get_tracking_jersey_patches(df: pd.DataFrame) -> np.ndarray:
    """Return the jerseyNumber column with one hard-coded correction applied.

    Rows with gameId 2020092004, team "away" and nflId 40657 get jersey
    number 40; all other rows keep their existing ``jerseyNumber``.

    Args:
        df: Tracking rows with ``gameId``, ``team``, ``nflId`` and
            ``jerseyNumber`` columns. Not modified.

    Returns:
        A NumPy array (not a Series) in the same row order as ``df``.
    """
    needs_patch = (
        (df["gameId"] == 2020092004)
        & (df["team"] == "away")
        & (df["nflId"] == 40657)
    )
    return np.where(needs_patch, 40, df["jerseyNumber"])


def read_tracking_data(seasons: List[int], progress=False, **kwargs) -> pd.DataFrame:
    """Load and concatenate the per-season tracking CSVs.

    Reads ``{DIR}/tracking{season}.csv`` for each season in chunks of 100,000
    rows and concatenates all chunks in season order.

    Args:
        seasons: Seasons to load, in the order they should appear.
        progress: When true, wrap each season's chunk reader in a tqdm bar
            labelled "<season> Season".
        **kwargs: Forwarded to ``pd.read_csv`` (e.g. ``usecols``).

    Returns:
        One DataFrame holding every chunk of every season.

    Raises:
        ValueError: From ``pd.concat`` when ``seasons`` is empty.
    """
    dfs = []
    for season in seasons:
        # Open one file at a time so only a single handle is held, and close
        # it even if reading fails part-way through.
        reader = pd.read_csv(
            f"{DIR}/tracking{season}.csv",
            iterator=True,
            chunksize=10**5,
            **kwargs
        )
        try:
            chunks = tqdm(reader, desc=f"{season} Season") if progress else reader
            dfs.extend(chunks)
        finally:
            reader.close()

    return pd.concat(dfs)

In [None]:
def get_kicking_yardline(play: pd.Series) -> int:
    """
    Express the line of scrimmage in yards from the kicking team's goal line.
        e.g. the kicking team's own 30 is 30
        e.g. midfield is always 50, whatever side is recorded
        e.g. the receiving team's 30 is 50 + (50 - 30) = 70
    """
    yardline = play.yardlineNumber
    if yardline == 50:
        return 50
    in_own_territory = play.possessionTeam == play.yardlineSide
    return yardline if in_own_territory else 50 + (50 - yardline)
    

def get_receiving_yardline(play: pd.Series) -> int:
    """
    Yard line where the kick comes down, as 100 minus the kicking-team
    yard line it reached. Returns None (despite the annotation) when
    kickLength is missing.
    """
    kick_length = play.kickLength
    return None if pd.isna(kick_length) else 100 - (play.kickingYardline + kick_length)


def get_return_result_yardline(play: pd.Series) -> int:
    """
    Yard line after the return: receivingYardline plus kickReturnYardage.
    A missing return yardage counts as no return; a missing
    receivingYardline yields None (despite the annotation).
    """
    start = play.receivingYardline
    if pd.isna(start):
        return None
    gained = play.kickReturnYardage
    return start if pd.isna(gained) else start + gained


def get_penalty_result_yardline(play: pd.Series) -> int:
    """
    Final spot of the ball in receiving-team units, computed from
    playResult (the net yards of the play, returns and penalties included).
        e.g. a kick from the kicking team's 10 (10 in kicking units,
            90 in receiving units) on a play that nets 30 yards for
            the kicking team ends on the kicking team's 40
            (40 in kicking units, 60 in receiving units)
    """
    net_kicking_yardline = play.kickingYardline + play.playResult
    return 100 - net_kicking_yardline

# Load Data

In [None]:
# Columns to load from the tracking CSVs; passed to read_tracking_data as
# `usecols` below. Note that the orientation column "o" is not loaded.
tracking_cols = [
    "gameId",
    "playId",
    "frameId",
    "playDirection",
    "time",
    "event",
    "x",
    "y",
    "s",
    "dir",
    "team",
    "nflId",
    "jerseyNumber",
    "position",
]

In [None]:
# Root folder of the input CSVs. read_tracking_data also reads this global,
# so it must be defined before that function is called.
DIR = "../input/nfl-big-data-bowl-2022"
df_games = pd.read_csv(f"{DIR}/games.csv")
df_plays = pd.read_csv(f"{DIR}/plays.csv")
df_players = pd.read_csv(f"{DIR}/players.csv")
df_pff = pd.read_csv(f"{DIR}/PFFScoutingData.csv")

# Apply Patches and Transformations

In [None]:
# Derive the yard-line columns row by row. Order matters: receivingYardline
# reads kickingYardline, and returnResultYardline reads receivingYardline.
df_plays["kickingYardline"] = df_plays.apply(get_kicking_yardline, axis="columns")
df_plays["receivingYardline"] = df_plays.apply(get_receiving_yardline, axis="columns")
df_plays["returnResultYardline"] = df_plays.apply(get_return_result_yardline, axis="columns")
df_plays["penaltyResultYardline"] = df_plays.apply(get_penalty_result_yardline, axis="columns")

In [None]:
# Correct jersey labels in the PFF role columns, then fill hang times that
# are missing using the hand-maintained table.
df_pff = apply_pff_patches(df_pff)
df_pff = fill_missing_hangtimes(df_pff)

In [None]:
# Load all three seasons of tracking data, restricted to tracking_cols.
df_tracking_raw = read_tracking_data(
    seasons=[2018, 2019, 2020],
    progress=True,
    usecols=tracking_cols
)
print(f"Tracking position data contains {len(df_tracking_raw):,d} records.\n")

In [None]:
# Apply the single hard-coded jersey-number correction to the tracking rows.
df_tracking_raw["jerseyNumber"] = get_tracking_jersey_patches(df_tracking_raw)

In [None]:
# Frames without an event carry the literal string "None"; turn it into a real
# None so that dropna() in the next step discards those rows.
df_tracking_raw["event"] = np.where(
    df_tracking_raw["event"] == "None",
    None,
    df_tracking_raw["event"]
)

In [None]:
# One row per distinct (game, play, event, frame): the tracking rows repeat
# the same event for every tracked object in a frame, so de-duplicate.
df_events = (
    df_tracking_raw
        [["gameId", "playId", "event", "frameId"]]
            .dropna()
            .drop_duplicates()
)

# Select Relevant Plays

In [None]:
# Columns that together identify one play; used as the join key throughout.
PLAY_KEYS = ["gameId", "playId"]

In [None]:
# Keep only plays whose special-teams play type is a punt.
df_plays_punts = df_plays[df_plays.specialTeamsPlayType == "Punt"]
print(f"Filtered from {len(df_plays):,d} plays to {len(df_plays_punts):,d} punts.")

In [None]:
# These are play results that could include returnable punts.
# Any specialTeamsResult not in this list is dropped from the punt plays.
included_result = [
    "Return",
    # Included, but only if returnable
    "Touchback",
    "Fair Catch",
    "Downed",
    # Included, but only if returnable
    "Out of Bounds",
    "Muffed",
]
# Boolean mask over the punt plays: True where the result is in the list above
is_included_result = df_plays_punts.specialTeamsResult.apply(lambda r: r in included_result)
df_plays_result = df_plays_punts[is_included_result]
print(f"Filtered from {len(df_plays_punts):,d} plays to {len(df_plays_result):,d} with included result.")

In [None]:
# A play counts as penalty-free when penaltyYards is missing or is not a
# non-zero amount (the negated comparison is also True for zero).
has_no_penalty_yards = (
    (df_plays_result.penaltyYards.isna())
    | (~(abs(df_plays_result.penaltyYards) > 0))
)
df_plays_no_penalty = df_plays_result[has_no_penalty_yards]
print(f"Filtered from {len(df_plays_result):,d} plays to {len(df_plays_no_penalty):,d} with no penalty.")

In [None]:
# These are events that should only happen on a returnable play
returnable_event = [
    "punt_received",
    # Does this mean the signal or the catch?
    # Can this happen if the ball goes out of bounds?
    "fair_catch",
    "punt_land",
    # Excluded, should be preceded by another returnable event
    # "out_of_bounds",
    "punt_downed",
    # Excluded, should be preceded by another returnable event
    # "touchback",
    "punt_muffed",
]
# Element-wise membership test over an array of event names; returns a boolean
# array. It is applied to whole event columns of the tracking data below.
vec_is_returnable_event = np.vectorize(lambda e: e in returnable_event)

In [None]:
# Plays that have at least one returnable event in the tracking data AND
# survived the punt / result / penalty filters above.
df_plays_returnable = (
    df_events
        [vec_is_returnable_event(df_events.event)]
        [PLAY_KEYS]
        .drop_duplicates()
        # Use inner join to only keep punt plays that have a returnable event
        .join(df_plays_no_penalty.set_index(PLAY_KEYS), on=PLAY_KEYS, how="inner")
)
print(f"Filtered from {len(df_plays_no_penalty):,d} plays to {len(df_plays_returnable):,d} returnable punts.")

In [None]:
# Inspect how the remaining plays are distributed across result types.
df_plays_returnable.specialTeamsResult.value_counts()

# Select Relevant Frames

In [None]:
# Play-level columns to carry onto every tracking row of the selected plays.
punt_cols = [
    "kickLength",
    "possessionTeam",
    "penaltyYards",
    "kickingYardline",
    "receivingYardline",
    "returnResultYardline",
    "penaltyResultYardline",
    "specialTeamsPlayType",
    "specialTeamsResult",
    "returnerId",
]
# Left join from plays to tracking: each play row fans out to all of its
# tracking rows (every frame and every tracked object).
df_punt_tracking = (
    df_plays_returnable
        [PLAY_KEYS + punt_cols]
        .join(df_tracking_raw.set_index(PLAY_KEYS), on=PLAY_KEYS)
)
print(f"From {len(df_plays_returnable):,d} plays, joined to {len(df_punt_tracking):,d} tracking records.")

In [None]:
# Flag every tracking row whose event is one of the returnable events.
df_punt_tracking["isReturnableEvent"] = vec_is_returnable_event(df_punt_tracking.event)

In [None]:
# Collect the distinct (play, event, frame) combinations for returnable events;
# the earliest frame per play is picked from these further down
df_returnable_event_frames = (
    df_punt_tracking
        [df_punt_tracking.isReturnableEvent]
        [PLAY_KEYS + ["event", "frameId"]]
        .dropna()
        .drop_duplicates()
)
# Ball position at every returnable-event frame; the position at the first
# such frame is what the pipeline calls the "landing" location
df_ball_location = (
    df_punt_tracking
        [
            (df_punt_tracking.isReturnableEvent)
            & (df_punt_tracking.team == "football")
        ]
        [PLAY_KEYS + ["frameId", "x", "y"]]
)
# Per play: the earliest returnable frame, then the event name and the ball
# position looked up at exactly that frame
df_first_event = (
    df_returnable_event_frames
        .groupby(PLAY_KEYS)
        ["frameId"].min()
        .reset_index()
        .join(
            df_returnable_event_frames.set_index(PLAY_KEYS + ["frameId"])["event"],
            on=(PLAY_KEYS + ["frameId"])
        )
        .join(
            df_ball_location.set_index(PLAY_KEYS + ["frameId"])[["x", "y"]],
            on=(PLAY_KEYS + ["frameId"])
        )
        .rename(columns={
            "frameId": "firstReturnableFrame",
            "event": "firstReturnableEvent",
            "x": "ballLandingX",
            "y": "ballLandingY",
        })

)
# Guards against either join matching more than one row for a play
assert len(df_first_event) == len(df_plays_returnable), "Result should have one event per play."

In [None]:
# Find the frame where the returner has to decide whether or not to return
# Assume decision period is one second (10 frames) before the first returnable event
# NOTE(review): nothing here keeps decisionFrame positive when the first
# returnable event falls within the first 10 frames — confirm the data never does
FRAMES_FOR_DECISION = 10
df_first_event["decisionFrame"] = df_first_event["firstReturnableFrame"] - FRAMES_FOR_DECISION

In [None]:
# Display the columns produced so far (inspection only).
df_first_event.columns

In [None]:
# Join tracking data on decision frame to get one frame per play.
# The tracking frameId index is matched against decisionFrame, which is then
# renamed to frameId so the result again has a frameId column.
df_decision_frame = (
    df_first_event
        .join(
            df_punt_tracking.set_index(PLAY_KEYS + ["frameId"]),
            on=PLAY_KEYS + ["decisionFrame"]
        )
        .rename(columns={"decisionFrame": "frameId"})
)
# NOTE(review): this first assert repeats the check made when df_first_event
# was built and does not involve df_decision_frame — confirm whether a check
# on df_decision_frame was intended here.
assert len(df_first_event) == len(df_plays_returnable), "Result should have one frame per play."
# 23 trackers = 22 players + the football
assert len(df_decision_frame) == (len(df_first_event) * 23), "Result should have 23 trackers per frame."
print(f"Filtered to {len(df_decision_frame):,d} rows of tracking data.")

In [None]:
# Join to game and PFF columns
game_cols = [
    "season",
    "week",
    "homeTeamAbbr",
    "visitorTeamAbbr",
]
pff_cols = [
    "hangTime",
]
# Game columns are looked up by gameId, PFF columns by (gameId, playId)
df_selected_frames = (
    df_decision_frame
        .join(df_games.set_index("gameId")[game_cols], on="gameId")
        .join(df_pff.set_index(PLAY_KEYS)[pff_cols], on=PLAY_KEYS)
)
assert len(df_selected_frames) == len(df_decision_frame), "Join should not explode."
# Relies on fill_missing_hangtimes having filled every gap for the selected plays
assert df_selected_frames["hangTime"].isna().sum() == 0, "Hang time field should have no nulls."

In [None]:
def get_team_code(team: str, home: str, away: str) -> Optional[str]:
    """Translate a tracking ``team`` value into a team abbreviation.

    Args:
        team: "home", "away" or "football" as found in the tracking data.
        home: Abbreviation of the home team.
        away: Abbreviation of the visiting team.

    Returns:
        None for the football, ``home`` for "home", and ``away`` for any
        other value.
    """
    if team == "football":
        return None
    if team == "home":
        return home
    return away

# Without otypes, np.vectorize infers the output dtype from the first row. If
# that row is a player the result is a string array and the football's None
# is silently turned into the text 'None'. An explicit object dtype keeps real
# None values and also lets the wrapper accept empty input.
vec_get_team_code = np.vectorize(get_team_code, otypes=[object])

In [None]:
# Add team code to tracking data, not just home/away.
# Each row's home/away flag is resolved against that game's two abbreviations.
df_selected_frames["teamCode"] = vec_get_team_code(
    df_selected_frames.team,
    df_selected_frames.homeTeamAbbr,
    df_selected_frames.visitorTeamAbbr
)

# Reflect Tracking Data

In [None]:
# Reorient tracking data so that the return team goal line is at x = 10:
# plays moving "right" are rotated so that every play runs "left"
df_reoriented = reorient_tracking_data(df_selected_frames)

In [None]:
# Double the dataset by reflecting each play across the long axis of the field.
# The "original" flag tells the two versions of a play apart.
df_reflected = reflect_tracking_data(df_reoriented)
df_reoriented["original"] = True
df_reflected["original"] = False
df_tracking_all = pd.concat([df_reoriented, df_reflected])
assert len(df_tracking_all) == len(df_reoriented) * 2, "Result should be doubled after reflection."
# These frames hold one row per tracked object, so the counts are rows, not plays.
print(f"Doubled dataset from {len(df_reoriented):,d} rows to {len(df_tracking_all):,d} rows.")

# Aggregate Tracking Data by Frame

In [None]:
# Columns that identify one frame of one version (original / reflected) of a play.
FRAME_KEYS = ["gameId", "playId", "original", "frameId"]
# Per-tracker columns that are packed into each frame's "players" list.
PLAYER_COLS = [
    "team",
    "teamCode",
    "jerseyNumber",
    "nflId",
    "position",
    "x",
    "y",
    "s",
    "dir",
]
# Set form of PLAYER_COLS for constant-time membership tests.
PLAYER_COLS_SET = set(PLAYER_COLS)
# x-coordinate of the receiving team's goal line after reorientation.
RECEIVING_GOAL_LINE = 10


def get_ball(trackers: List[Dict]) -> Dict:
    """Return the x/y position of the first "football" tracker, or None if there is none."""
    football = next((row for row in trackers if row["team"] == "football"), None)
    if football is None:
        return None
    return {"x": football["x"], "y": football["y"]}


def get_trackers_per_frame(df: pd.DataFrame) -> pd.DataFrame:
    """Collapse per-tracker rows into one row per frame.

    Rows are grouped by FRAME_KEYS. Within a frame, the row whose ``team`` is
    "football" supplies the ball position and every other row becomes a dict
    of its PLAYER_COLS values in that frame's ``players`` list.

    Args:
        df: Tracking rows containing FRAME_KEYS and PLAYER_COLS. Not modified;
            sorting is done on a copy.

    Returns:
        A DataFrame with columns FRAME_KEYS + ["players", "ballX", "ballY",
        "ballYardline"], where ``ballYardline`` is ``ballX`` minus
        RECEIVING_GOAL_LINE. A frame without a football row gets NaN ball
        coordinates, and empty input gives an empty frame with these columns.
    """
    # Sorting makes all rows of a frame contiguous for the single pass below.
    ordered = df.sort_values(by=FRAME_KEYS)
    records = ordered[FRAME_KEYS + PLAYER_COLS].to_dict(orient="records")
    missing = float("nan")
    output = []
    last_key = None
    frame_data = []
    ball_x = ball_y = missing
    for p in tqdm(records, total=len(records)):
        key = tuple(p[k] for k in FRAME_KEYS)
        if last_key is not None and key != last_key:
            # The previous frame is complete: emit it and start a new one.
            output.append((*last_key, frame_data, ball_x, ball_y))
            frame_data = []
            ball_x = ball_y = missing
        if p["team"] == "football":
            ball_x, ball_y = p["x"], p["y"]
        else:
            frame_data.append({k: v for k, v in p.items() if k in PLAYER_COLS_SET})
        last_key = key
    if last_key is not None:
        # Emit the final frame, which no key change follows.
        output.append((*last_key, frame_data, ball_x, ball_y))
    df_out = pd.DataFrame(output, columns=(FRAME_KEYS + ["players", "ballX", "ballY"]))
    # Get ball yardline in terms of receiving team yards,
    # where the receiving team goal line is 0 instead of x = 10
    df_out["ballYardline"] = df_out["ballX"] - RECEIVING_GOAL_LINE
    return df_out

In [None]:
# Pack the tracking rows into one row per frame, with a list of player dicts.
df_frames = get_trackers_per_frame(df_tracking_all)
# Two frames per play: the original and its reflection
assert len(df_frames) == len(df_plays_returnable) * 2, "Results should have one frame per play."
assert (df_frames["players"].apply(len) == 22).all(), "Every frame should have 22 players."
print(f"Aggregated tracking data into {len(df_frames):,d} frames.")

In [None]:
# Reduce the tracking rows to one row per (play, original) holding the
# non-player columns, then attach the packed players and ball columns.
df_tracking_frames = (
    df_tracking_all
        .drop(columns=PLAYER_COLS)
        .groupby(PLAY_KEYS + ["original"])
        .first()
        .reset_index()
        .join(df_frames.set_index(FRAME_KEYS), on=FRAME_KEYS)
)
assert len(df_tracking_frames) == len(df_plays_returnable) * 2, "Results should have one frame per play."
assert df_tracking_frames["players"].isna().sum() == 0, "All plays should match to tracking frames."

# Fill Returner ID

In [None]:
# Starting bound for the minimum searches below; a candidate must beat it.
MAX_DIST = 200


def distance(a: Dict, b: Dict) -> float:
    """Straight-line distance between two dicts that carry "x" and "y"."""
    dy = a["y"] - b["y"]
    dx = a["x"] - b["x"]
    return np.sqrt(dy**2 + dx**2)


def get_returner_furthest_back(players: List[Dict], kickingTeam: str) -> int:
    """nflId of the non-kicking-team player with the smallest x.

    Ties go to the player listed first. Fails if no player qualifies.
    """
    deepest = None
    deepest_x = MAX_DIST
    for candidate in players:
        if candidate["teamCode"] == kickingTeam:
            continue
        if candidate["x"] < deepest_x:
            deepest, deepest_x = candidate, candidate["x"]
    return int(deepest["nflId"])


def get_returner_closest_to_ball(
    players: List[Dict],
    kickingTeam: str,
    ball_x: float,
    ball_y: float
) -> int:
    """nflId of the non-kicking-team player nearest to (ball_x, ball_y).

    Ties go to the player listed first. Fails if no player qualifies.
    """
    target = {"x": ball_x, "y": ball_y}
    nearest = None
    nearest_dist = MAX_DIST
    for candidate in players:
        if candidate["teamCode"] == kickingTeam:
            continue
        gap = distance(target, candidate)
        if gap < nearest_dist:
            nearest, nearest_dist = candidate, gap
    return int(nearest["nflId"])


def get_first_returner(raw: str) -> Optional[int]:
    """Parse the first ID out of a ";"-separated returner string; None if missing."""
    if pd.isna(raw):
        return None
    first_id, _, _ = raw.partition(";")
    return int(first_id)


# Element-wise wrappers so the helpers can be called with whole DataFrame
# columns (one players list / team / ball position per row).
vec_get_returner_furthest_back = np.vectorize(get_returner_furthest_back)
vec_get_returner_closest_to_ball = np.vectorize(get_returner_closest_to_ball)

In [None]:
# From inspecting the 11 plays with more than one returner (mostly due to laterals or recoveries),
# the IDs are listed in the order in which the returners possessed the ball, so we can use the
# first returner as the returner who decided whether or not to return the punt
df_tracking_frames["returnerFirst"] = df_tracking_frames["returnerId"].apply(get_first_returner)
# When no returner IDs are listed in the play data, we will fall back to the player on the return team
# who was furthest back, i.e., the lowest x-value in the reoriented data, which puts the back of the
# return team's end zone at x = 0
df_tracking_frames["returnerFurthest"] = vec_get_returner_furthest_back(
    df_tracking_frames["players"],
    df_tracking_frames["possessionTeam"]
)
# We also computed the returner closest to the ball (using the ball position stored on this frame)
# for comparison. After comparing the candidates, we decided that the returner furthest back is the
# most reasonable default, accepting that it overlooks cases where no player is back deep for the
# return team. returnerClosest is kept as a column but is not used for returnerNflId.
df_tracking_frames["returnerClosest"] = vec_get_returner_closest_to_ball(
    df_tracking_frames["players"],
    df_tracking_frames["possessionTeam"],
    df_tracking_frames["ballX"],
    df_tracking_frames["ballY"],
)
# Prefer the listed returner; fall back to the furthest-back player where none is listed
df_tracking_frames["returnerNflId"] = (
    df_tracking_frames["returnerFirst"]
        .combine_first(df_tracking_frames["returnerFurthest"])
        .astype(int)
)
assert df_tracking_frames["returnerNflId"].isna().sum() == 0, "All plays should have a returner NFL ID."

# Create Target Variable

In [None]:
# Create yard line version of ball landing x-coordinate, after all transformations
# (subtracting the goal-line offset makes the receiving team's goal line yard 0)
df_tracking_frames["ballLandingYardline"] = df_tracking_frames["ballLandingX"] - RECEIVING_GOAL_LINE

In [None]:
# We filtered out penalty plays, but we named this column penaltyResultYardline because it uses
# the playResult value from the play data, which would include any penalty yards.
# Both yard lines are in receiving-team units, so the difference is the yardage
# gained between where the kick came down and where the play ended.
df_tracking_frames["returnYardsGained"] = (
    df_tracking_frames["penaltyResultYardline"] - df_tracking_frames["receivingYardline"]
)

In [None]:
def return_result(yards: float) -> str:
    """Label a yardage as "gain" (> 0), "loss" (< 0) or "zero" (anything else, NaN included)."""
    return "gain" if yards > 0 else ("loss" if yards < 0 else "zero")


# Categorical outcome of the return, plus the binary target: True when the
# return did not gain yards.
df_tracking_frames["returnOutcome"] = df_tracking_frames["returnYardsGained"].apply(return_result)
df_tracking_frames["isZeroOrLoss"] = df_tracking_frames["returnOutcome"] != "gain"

# Verify Positions and Yard Lines

In [None]:
# Spot check columns that have been reoriented and reflected
# (display only; nothing is modified here)
yard_cols = [
    "original",
    "specialTeamsResult",
    "firstReturnableEvent",
    "kickingYardline",
    "kickLength",
    "receivingYardline",
    "ballLandingX",
    "ballLandingY",
    "ballLandingYardline",
    "ballX",
    "ballY",
    "ballYardline",
]
df_tracking_frames[PLAY_KEYS + yard_cols].head()

In [None]:
# Verify correctness of columns that have been reflected
# Columns that must be identical in the original and the reflected version
same_cols = [
    "kickingYardline",
    "kickLength",
    "receivingYardline",
    "ballLandingX",
    "ballLandingYardline",
    "ballX",
    "ballYardline",
]
# Columns that must differ, because the reflection mirrors the y-axis
flip_cols = [
    "ballLandingY",
    "ballY",
]
check_cols = same_cols + flip_cols
df_check_original = df_tracking_frames[df_tracking_frames.original]
df_check_reflected = df_tracking_frames[~(df_tracking_frames.original.astype(bool))]
# Put both versions of each play side by side, with suffixed column names
df_check_compare = (
    df_check_original
        .set_index(PLAY_KEYS)
        [check_cols]
        .join(
            df_check_reflected.set_index(PLAY_KEYS)[check_cols],
            on=PLAY_KEYS,
            lsuffix="Original",
            rsuffix="Reflected",
        )
        .reset_index()
)
for col in same_cols:
    assert np.all(
        df_check_compare[f"{col}Original"] == df_check_compare[f"{col}Reflected"]
    ), f"Column {col} should have the same value after reflection."
# NOTE(review): a value lying exactly on the mirror line would be unchanged by
# the reflection and fail this check — presumably that does not occur in the data.
for col in flip_cols:
    assert np.all(
        df_check_compare[f"{col}Original"] != df_check_compare[f"{col}Reflected"]
    ), f"Column {col} should be flipped after reflection."

In [None]:
# Display the comparison with the play keys first, followed by all columns
# in alphabetical order (the play keys appear a second time in that part).
df_check_compare[PLAY_KEYS + list(sorted(df_check_compare.columns))].head()

In [None]:
# Check for punts where the ball was further down field at the decision frame than when it "landed",
# which is really based on the first returnable event.
# Only original (non-reflected) plays are checked, so each play is counted once.
df_back = df_tracking_frames[
    (df_tracking_frames.original)
    & (df_tracking_frames["ballLandingYardline"] > df_tracking_frames["ballYardline"])
].copy()
print(f"There are {len(df_back)} plays where the ball landing spot is downfield of the decision spot.")
# Negative for every row selected above, since landing > decision yardline
df_back["ballDiffYardline"] = df_back["ballYardline"] - df_back["ballLandingYardline"]

In [None]:
# The derived column ballDiffYardline is the difference between where the ball
# was at the decision time and where it was when it was first returnable.
# We expect this difference to be positive, as the ball should move closer to
# the returning team's end zone. In these cases, it is negative, possibly due to
# wobbling in the air, tracking error, or a punt that bounces back before the
# first returnable event. After checking all ten instances, all but two cases
# have a diff of less than a yard; one moves 2.4 yards back and the other moves
# 4.3 yards back, so this should not be a big issue.
df_back[PLAY_KEYS + ["ballLandingYardline", "ballYardline", "ballDiffYardline"]]

# Split Cross Validation Data

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# We need both versions of the same play to be in the same split,
# so the split is made over distinct plays rather than over frames
SEED = 0
SPLIT_COLS = ["gameId", "playId"]
df_split = df_tracking_frames[SPLIT_COLS].drop_duplicates()
# Split training data, hold data will become validation and test data
# (50% of plays go to train)
split_train, split_hold = train_test_split(
    df_split,
    test_size=0.5,
    shuffle=True,
    random_state=SEED,
)
# Split validation and test data
# (half of the held-out plays each, i.e. 25% / 25% of all plays)
split_validate, split_test = train_test_split(
    split_hold,
    test_size=0.5,
    shuffle=True,
    random_state=SEED,
)
# Presumably silences the chained-assignment warning that pandas raises when
# adding a column to the frames returned by train_test_split
with pd.option_context("mode.chained_assignment", None):
    split_train["split"] = "train"
    split_validate["split"] = "validate"
    split_test["split"] = "test"
df_split_all = pd.concat([
    split_train,
    split_validate,
    split_test,
])
print("Split of plays:")
print(f"Train:    {len(split_train):,d} plays")
print(f"Validate: {len(split_validate):,d} plays")
print(f"Test:     {len(split_test):,d} plays")
print()
print(f"Total:    {len(df_split_all):,d} plays")
print()
# Attach the split label to every frame; each play matches its original and
# its reflected frame, which is why the row count doubles
df_assigned = df_split_all.join(df_tracking_frames.set_index(SPLIT_COLS), on=SPLIT_COLS)
print("Split of records:")
df_train = df_assigned[df_assigned["split"] == "train"]
df_validate = df_assigned[df_assigned["split"] == "validate"]
df_test = df_assigned[df_assigned["split"] == "test"]
print(f"Train:    {len(df_train):,d} frames")
print(f"Validate: {len(df_validate):,d} frames")
print(f"Test:     {len(df_test):,d} frames")
print()
print(f"Total:    {len(df_assigned):,d} frames")
assert len(df_assigned) == len(df_split_all) * 2, "Result should be doubled due to reflected plays."

# Write Output

In [None]:
# Display the final column set before writing (inspection only).
df_assigned.columns

In [None]:
# Serialize each frame's list of player dicts to a JSON string so that it
# can be stored in a single CSV cell.
df_assigned["players"] = df_assigned["players"].apply(lambda o: json.dumps(o))

In [None]:
# Write the per-frame dataset, including the split label, to the working directory.
outfile = "return_frames.csv"
df_assigned.to_csv(outfile, index=False)
print(f"Wrote {df_assigned.shape[0]:,d} rows, {df_assigned.shape[1]:,d} cols to file: {outfile}")

In [None]:
# Write all plays (not only punts) together with the derived yard-line columns.
outfile = "plays_patched.csv"
df_plays.to_csv(outfile, index=False)
print(f"Wrote {df_plays.shape[0]:,d} rows, {df_plays.shape[1]:,d} cols to file: {outfile}")

In [None]:
# Write the PFF data after the jersey and hang-time patches.
outfile = "pff_patched.csv"
df_pff.to_csv(outfile, index=False)
print(f"Wrote {df_pff.shape[0]:,d} rows, {df_pff.shape[1]:,d} cols to file: {outfile}")