# PFF Players Missing From Tracking Data

While reconciling the PFF data and tracking data, I found two problems:

1. PFF data lists jersey numbers that do not exist in the tracking data for the corresponding play
2. Tracking data has one game where two teammates have the same jersey number on the same plays

This notebook identifies those cases and provides [patches](#patches) to fix the source data.

This data cleaning operation does not assess whether the PFF annotations themselves are valid; it only covers the cases where a PFF jersey number cannot be conclusively matched to a single jersey number in the tracking data.

I used [this notebook](https://www.kaggle.com/vingkan/finding-missing-players-2021-12-31) to manually view each of the affected plays and determine the correct jersey number present in the tracking data.

From looking at the data, here are some possible causes for the mismatches:

- Human error in recording the jersey number
- Players who changed their jersey number during the season (NGS and PFF may differ in which number they record)

In all the examples I looked at, each **missing player** had the same jersey number in the tracking data across all their plays.

In [None]:
from io import StringIO
from typing import List, Optional

import numpy as np
import pandas as pd
from tqdm.notebook import tqdm

In [None]:
# Folder that holds the competition's CSV files.
DIR = "../input/nfl-big-data-bowl-2022"

# Load the four reference tables in one pass; the file stem is the only
# thing that differs between them.
df_games, df_plays, df_players, df_pff = (
    pd.read_csv(f"{DIR}/{table}.csv")
    for table in ("games", "plays", "players", "PFFScoutingData")
)

In [None]:
# Columns emitted for every exploded PFF player-play-role record.
pff_jersey_out_cols = ["gameId", "playId", "teamCode", "jerseyNumber", "role"]

# PFF columns whose cells hold "; "-separated "<team> <jersey>" entries.
pff_jersey_cols = [
    "missedTackler", "assistTackler", "tackler",
    "gunners", "puntRushers", "specialTeamsSafeties", "vises",
]


def split_jersey_list(raw: str, **kwargs) -> List[str]:
    """Break a "; "-separated jersey string into its individual entries.

    Missing values (None / NaN) yield an empty list. Extra keyword
    arguments are accepted and ignored.
    """
    return [] if pd.isna(raw) else raw.split("; ")


def get_jersey_part(raw: str, part: int, **kwargs) -> Optional[str]:
    """Return one space-separated field of a single jersey entry.

    Args:
        raw: One entry such as "TEN 47", or a missing value (None / NaN).
        part: Index of the field to return after splitting on a single space.
        **kwargs: Accepted and ignored.

    Returns:
        The selected field as a string, or None when ``raw`` is missing.

    Raises:
        IndexError: If the entry has no field at position ``part``.
    """
    if pd.isna(raw):
        return None
    return raw.split(" ")[part]


def get_pff_player_jerseys(df_pff: pd.DataFrame) -> pd.DataFrame:
    """Flatten the PFF jersey-list columns into one row per player, play and role.

    Each column named in ``pff_jersey_cols`` is split into individual
    entries, exploded to one entry per row, and broken into ``teamCode`` and
    ``jerseyNumber``. Rows without an entry are dropped. The result holds
    the ``pff_jersey_out_cols`` columns with ``jerseyNumber`` cast to int.
    The input frame is not modified.
    """
    listed = df_pff.copy()
    for col_name in pff_jersey_cols:
        listed[col_name] = listed[col_name].apply(split_jersey_list)

    per_role = []
    for col_name in pff_jersey_cols:
        # Tag every row with the role before exploding so the tag is repeated
        # for each jersey in the list.
        exploded = listed.assign(role=col_name).explode(col_name)
        exploded["teamCode"] = exploded[col_name].apply(get_jersey_part, args=(0,))
        exploded["jerseyNumber"] = exploded[col_name].apply(get_jersey_part, args=(1,))
        present = exploded.dropna(subset=["teamCode", "jerseyNumber"])
        per_role.append(present[pff_jersey_out_cols])

    combined = pd.concat(per_role)
    combined["jerseyNumber"] = combined["jerseyNumber"].astype(int)
    return combined

In [None]:
def get_team_code(row: pd.Series, **kwargs) -> Optional[str]:
    """Look up the team code for one tracking row.

    The row's ``team`` value is used as the name of another field on the
    same row, and that field's value is returned.

    Args:
        row: A row with a ``team`` field plus a field named after each
            possible ``team`` value.
        **kwargs: Accepted and ignored.

    Returns:
        The value stored under the field named by ``team``, or None when
        ``team`` is missing or equal to "football".

    Raises:
        KeyError: If ``team`` names a field that the row does not have.
    """
    team = row["team"]
    if pd.isna(team) or team == "football":
        return None
    return row[team]


def augment_with_team_code(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with home/away team codes and a ``teamCode`` column.

    Reads the module-level ``df_games`` table to map each ``gameId`` to its
    ``homeTeamAbbr`` / ``visitorTeamAbbr`` (exposed as ``home`` / ``away``),
    then resolves each row's ``teamCode`` with ``get_team_code``.

    Args:
        df: Frame with ``gameId``, ``team`` and ``jerseyNumber`` columns.

    Returns:
        A new frame with ``jerseyNumber`` cast to float and the columns
        ``home``, ``away`` and ``teamCode`` appended. ``df`` itself is left
        untouched.
    """
    game_to_team_code = (
        df_games
        .rename(columns={
            "homeTeamAbbr": "home",
            "visitorTeamAbbr": "away",
        })
        [["gameId", "home", "away"]]
        .set_index("gameId")
    )
    # assign() and join() both return new frames, so the float cast no longer
    # leaks into the caller's DataFrame and there is no chained-assignment
    # warning to suppress.
    augmented = (
        df
        .assign(jerseyNumber=df["jerseyNumber"].astype(float))
        .join(game_to_team_code, on=["gameId"])
    )
    augmented["teamCode"] = augmented.apply(get_team_code, axis="columns")
    return augmented


def read_tracking_data(seasons: List[int], progress=False, **kwargs) -> pd.DataFrame:
    """Read the per-season tracking CSVs in chunks and concatenate them.

    Args:
        seasons: Seasons to load; each maps to ``{DIR}/tracking{season}.csv``.
        progress: When true, wrap each season's chunk reader in a tqdm bar
            labelled "<season> Season".
        **kwargs: Forwarded to ``pd.read_csv`` (for example ``usecols``).

    Returns:
        One DataFrame holding every chunk of every season, in season order.

    Raises:
        ValueError: From ``pd.concat`` when ``seasons`` is empty.
    """
    chunks = []
    for season in seasons:
        # Open one file at a time and let the context manager close it, so a
        # failure part-way through does not leave other readers open.
        with pd.read_csv(
            f"{DIR}/tracking{season}.csv",
            iterator=True,
            chunksize=10**5,
            **kwargs
        ) as reader:
            chunk_iter = tqdm(reader, desc=f"{season} Season") if progress else reader
            chunks.extend(chunk_iter)

    return pd.concat(chunks)

In [None]:
# Seasons that have a tracking file: 2018 through 2020 inclusive.
SEASONS = list(range(2018, 2021))

# Only the identifying columns are read from the tracking files.
tracking_cols = ["gameId", "playId", "team", "jerseyNumber", "nflId", "displayName", "position"]

In [None]:
# Load only the identifying columns for every season, then collapse repeated
# rows and drop any row that has a missing value in one of those columns.
df_tracking = read_tracking_data(
    seasons=SEASONS,
    progress=True,
    usecols=tracking_cols
).drop_duplicates().dropna()
print(f"Tracking data contains {len(df_tracking):,d} player-play records.\n")
df_tracking.head()

In [None]:
# Attach home/away abbreviations and a resolved teamCode to every tracking row.
df_tracking_players = augment_with_team_code(df_tracking)
print(f"Player data from tracking contains {len(df_tracking_players):,d} player-play records.\n")
df_tracking_players.head()

In [None]:
# Keys shared by the PFF and tracking player tables.
join_cols = [
    "gameId",
    "playId",
    "teamCode",
    "jerseyNumber"
]
df_pff_players = get_pff_player_jerseys(df_pff)
# Grouping by the join keys ignores the role, so len() of the GroupBy is the
# number of distinct player-play combinations.
df_pff_players_no_roles = df_pff_players.groupby(join_cols)
print(f"PFF data contains {len(df_pff_players):,d} player-play-role records.\n")
print(f"PFF data contains {len(df_pff_players_no_roles):,d} player-play records.\n")
df_pff_players.head()

In [None]:
# Attach tracking columns to each PFF record via the shared keys. The record
# counts printed below can be compared with the pre-join counts to see whether
# the join multiplied any rows.
df_pff_track = df_pff_players.join(df_tracking_players.set_index(join_cols), on=join_cols)
df_pff_track_no_roles = df_pff_track.groupby(join_cols)
print(f"Post-join, PFF player role data contains {len(df_pff_track):,d} player-play-role records.\n")
print(f"Post-join, PFF player role data contains {len(df_pff_track_no_roles):,d} player-play records.\n")
df_pff_track.head()

In [None]:
play_keys = ["gameId", "playId"]
play_cols = [*play_keys, "specialTeamsPlayType", "specialTeamsResult"]

# Count the distinct tracking nflIds matched to each PFF player-play key and
# keep only the keys that matched more than one.
df_pff_track_explosion = (
    df_pff_track[join_cols + ["nflId"]]
    .drop_duplicates()
    .groupby(join_cols)
    .count()
    .reset_index()
    .join(df_plays[play_cols].set_index(play_keys), on=play_keys)
    .rename(columns={"nflId": "tracking_matches"})
    .query("tracking_matches > 1")
    .sort_values(by="tracking_matches", ascending=False)
)
print(f"Post-join, {len(df_pff_track_explosion):,d} player-play records had multiple tracking matches.")
df_pff_track_explosion.head()

In [None]:
# Show the tracking rows for away-team jersey 44 on one play of game 2020092004.
df_tracking \
    .query("team == 'away' and jerseyNumber == 44 and gameId == 2020092004 and playId == 279")

In [None]:
# PFF records with no nflId after the join have no tracking counterpart; add
# the play type and result for context.
df_missing_from_pff = (
    df_pff_track[df_pff_track["nflId"].isna()]
    .join(df_plays[play_cols].set_index(play_keys), on=play_keys)
)
# NOTE(review): the season label is the first four digits of gameId — confirm
# that this matches the intended season for every game in the data.
with pd.option_context("mode.chained_assignment", None):
    df_missing_from_pff["season"] = df_missing_from_pff["gameId"].apply(
        lambda game_id: int(str(game_id)[:4])
    )
print(f"{len(df_missing_from_pff):,d} PFF player-play-role records are not found in tracking data.\n")
df_missing_from_pff.head()

In [None]:
# Spot-check one PFF record (LA 38, game 2019111711, play 3643) against the tracking-derived player table.
df_tracking_players.query("gameId == 2019111711 and playId == 3643 and teamCode == 'LA' and jerseyNumber == 38")

In [None]:
# Number of unmatched PFF records per special-teams play type.
df_missing_from_pff.groupby("specialTeamsPlayType")["playId"].count().reset_index()

In [None]:
# Number of unmatched PFF records per season.
df_missing_from_pff.groupby("season")["playId"].count().reset_index()

In [None]:
# Number of unmatched PFF records per season and play type.
df_missing_from_pff.groupby(["season", "specialTeamsPlayType"])["playId"].count().reset_index()

In [None]:
# Number of unmatched PFF records per season, play type and PFF role.
df_missing_from_pff.groupby(["season", "specialTeamsPlayType", "role"])["playId"].count().reset_index()

In [None]:
# List the unmatched records for 2020 punts in the 'vises' role.
df_missing_from_pff.query("season == 2020 and specialTeamsPlayType == 'Punt' and role == 'vises'")

In [None]:
# For the gunner and vise roles, rank each unmatched team/jersey pair by how
# many records it appears in.
(
    df_missing_from_pff
    .query("role == 'gunners' or role == 'vises'")
    .groupby(["teamCode", "jerseyNumber"])["playId"]
    .count()
    .reset_index()
    .rename(columns={"playId": "playCount"})
    .sort_values(by="playCount", ascending=False)
)

In [None]:
# Serialize the identifying columns of the unmatched records to an in-memory CSV.
out_cols = ["gameId", "playId", "teamCode", "jerseyNumber", "role"]
buffer = StringIO()
df_missing_from_pff.loc[:, out_cols].to_csv(buffer, index=False)

In [None]:
# Display the serialized CSV text (presumably so it can be copied into a string literal).
buffer.getvalue()

In [None]:
# CSV snapshot of the PFF player-play-role records that have no match in the
# tracking data. The literal is written one record per line using implicit
# string concatenation: a raw line break inside a quoted string is a syntax
# error and would split a record (previously "L" / "A" in an LA row).
raw_missing_players = (
    "gameId,playId,teamCode,jerseyNumber,role\n"
    "2019111711,3643,LA,38,missedTackler\n"
    "2018120600,1607,TEN,47,tackler\n"
    "2018120600,1607,TEN,47,gunners\n"
    "2018120600,3309,TEN,47,gunners\n"
    "2018122400,542,OAK,14,gunners\n"
    "2019090805,326,NYJ,25,gunners\n"
    "2019090805,666,BUF,29,gunners\n"
    "2019090805,1006,NYJ,23,gunners\n"
    "2019090805,1006,NYJ,25,gunners\n"
    "2019090805,1159,BUF,29,gunners\n"
    "2019090805,1436,NYJ,25,gunners\n"
    "2019090805,1802,NYJ,25,gunners\n"
    "2019090805,2441,NYJ,25,gunners\n"
    "2019090805,3298,NYJ,25,gunners\n"
    "2019090805,3724,NYJ,25,gunners\n"
    "2019090805,4435,BUF,29,gunners\n"
    "2018093011,1761,NO,27,puntRushers\n"
    "2019090805,4435,NYJ,23,puntRushers\n"
    "2019112400,3811,ATL,55,puntRushers\n"
    "2020091312,3370,DAL,59,puntRushers\n"
    "2020091312,3822,DAL,59,puntRushers\n"
    "2020092004,279,BUF,40,puntRushers\n"
    "2020092004,1033,BUF,40,puntRushers\n"
    "2020092004,1353,BUF,40,puntRushers\n"
    "2020092004,1862,BUF,40,puntRushers\n"
    "2020092004,3716,BUF,40,puntRushers\n"
    "2020092708,1811,IND,33,puntRushers\n"
    "2020092708,2190,IND,33,puntRushers\n"
    "2020092708,2556,IND,33,puntRushers\n"
    "2020092708,2665,NYJ,37,puntRushers\n"
    "2020092708,3050,IND,33,puntRushers\n"
    "2020092708,3271,NYJ,37,puntRushers\n"
    "2020092711,2619,DEN,91,puntRushers\n"
    "2020092711,3538,DEN,91,puntRushers\n"
    "2018090903,3713,MIA,30,specialTeamsSafeties\n"
    "2018091613,778,NYG,23,specialTeamsSafeties\n"
    "2018091613,1035,NYG,23,specialTeamsSafeties\n"
    "2018091613,1575,NYG,23,specialTeamsSafeties\n"
    "2018102100,3461,TEN,25,specialTeamsSafeties\n"
    "2019090805,2219,BUF,29,specialTeamsSafeties\n"
    "2019090805,2481,BUF,29,specialTeamsSafeties\n"
    "2019090805,3125,BUF,29,specialTeamsSafeties\n"
    "2019090805,3551,BUF,29,specialTeamsSafeties\n"
    "2019090805,4017,BUF,29,specialTeamsSafeties\n"
    "2019091507,36,SEA,24,specialTeamsSafeties\n"
    "2019091507,1491,SEA,24,specialTeamsSafeties\n"
    "2019091507,2703,SEA,24,specialTeamsSafeties\n"
    "2019091507,3074,SEA,24,specialTeamsSafeties\n"
    "2019091507,3732,SEA,24,specialTeamsSafeties\n"
    "2019111711,54,LA,38,specialTeamsSafeties\n"
    "2019111711,1153,LA,38,specialTeamsSafeties\n"
    "2019111711,1663,LA,38,specialTeamsSafeties\n"
    "2019111711,3643,LA,38,specialTeamsSafeties\n"
    "2020092708,3513,IND,33,specialTeamsSafeties\n"
    "2020100400,40,CAR,20,specialTeamsSafeties\n"
    "2020100400,613,CAR,20,specialTeamsSafeties\n"
    "2020100400,939,CAR,20,specialTeamsSafeties\n"
    "2018090903,1257,MIA,30,vises\n"
    "2018090903,3163,MIA,30,vises\n"
    "2018091613,2608,NYG,23,vises\n"
    "2018120600,1699,TEN,47,vises\n"
    "2018122400,744,OAK,14,vises\n"
    "2018122400,1634,OAK,14,vises\n"
    "2018122400,2467,OAK,14,vises\n"
    "2019090805,1159,NYJ,25,vises\n"
    "2019090805,4435,NYJ,25,vises\n"
    "2019091507,115,SEA,24,vises\n"
    "2019091507,315,SEA,24,vises\n"
    "2019091507,786,SEA,24,vises\n"
    "2019091507,1571,SEA,24,vises\n"
    "2019100700,3613,CLE,25,vises\n"
    "2019111711,1240,LA,38,vises\n"
    "2019111711,2587,LA,38,vises\n"
    "2019111711,2829,LA,38,vises\n"
    "2019111711,3159,LA,38,vises\n"
    "2019111711,3382,LA,38,vises\n"
    "2020100400,169,CAR,20,vises\n"
    "2020100400,698,CAR,20,vises\n"
    "2020100400,1096,CAR,20,vises\n"
)
pd.read_csv(StringIO(raw_missing_players))

# Patches

## Raw PFF Patches

```
role,pffJersey,trackingJersey\ntackler,TEN 47,TEN 46\nmissedTackler,LA 38,LA 21\ngunners,TEN 47,TEN 46\ngunners,OAK 14,OAK 38\ngunners,NYJ 25,NYJ 40\ngunners,BUF 29,BUF 36\ngunners,NYJ 23,NYJ 37\nvises,CAR 20,CAR 41\nvises,CLE 25,CLE 35\nvises,LA 38,LA 21\nvises,MIA 30,MIA 23\nvises,NYG 23,NYG 37\nvises,NYJ 25,NYJ 40\nvises,BUF 29,BUF 36\nvises,OAK 14,OAK 38\nvises,SEA 24,SEA 8\nvises,TEN 47,TEN 46\npuntRushers,ATL 55,ATL 62\npuntRushers,DAL 59,DAL 53\npuntRushers,DEN 91,DEN 58\npuntRushers,IND 33,IND 36\npuntRushers,NO 27,NO 36\npuntRushers,NYJ 23,NYJ 37\npuntRushers,NYJ 37,NYJ 43\nspecialTeamsSafeties,BUF 29,BUF 36\nspecialTeamsSafeties,CAR 20,CAR 41\nspecialTeamsSafeties,IND 33,IND 36\nspecialTeamsSafeties,LA 38,LA 21\nspecialTeamsSafeties,MIA 30,MIA 23\nspecialTeamsSafeties,NYG 23,NYG 37\nspecialTeamsSafeties,SEA 24,SEA 8\nspecialTeamsSafeties,TEN 25,TEN 41\n
```

## Tracking Data Patch

In game 2020092004 of the 2020 tracking data, both Deon Lacey and Tyler Matakevich of the Buffalo Bills are listed with jersey number 44. In that game, Matakevich wore 44, so Lacey should be number 40.

```
gameId = 2020092004
team = away
nflId = 40657
jerseyNumber = 40 (corrected)
```

In [None]:
# Role-specific jersey corrections: within the given PFF role column, the
# "pffJersey" entry is to be replaced by "trackingJersey". One record per line.
raw_pff_patches = (
    "role,pffJersey,trackingJersey\n"
    "tackler,TEN 47,TEN 46\n"
    "missedTackler,LA 38,LA 21\n"
    "gunners,TEN 47,TEN 46\n"
    "gunners,OAK 14,OAK 38\n"
    "gunners,NYJ 25,NYJ 40\n"
    "gunners,BUF 29,BUF 36\n"
    "gunners,NYJ 23,NYJ 37\n"
    "vises,CAR 20,CAR 41\n"
    "vises,CLE 25,CLE 35\n"
    "vises,LA 38,LA 21\n"
    "vises,MIA 30,MIA 23\n"
    "vises,NYG 23,NYG 37\n"
    "vises,NYJ 25,NYJ 40\n"
    "vises,BUF 29,BUF 36\n"
    "vises,OAK 14,OAK 38\n"
    "vises,SEA 24,SEA 8\n"
    "vises,TEN 47,TEN 46\n"
    "puntRushers,ATL 55,ATL 62\n"
    "puntRushers,DAL 59,DAL 53\n"
    "puntRushers,DEN 91,DEN 58\n"
    "puntRushers,IND 33,IND 36\n"
    "puntRushers,NO 27,NO 36\n"
    "puntRushers,NYJ 23,NYJ 37\n"
    "puntRushers,NYJ 37,NYJ 43\n"
    "specialTeamsSafeties,BUF 29,BUF 36\n"
    "specialTeamsSafeties,CAR 20,CAR 41\n"
    "specialTeamsSafeties,IND 33,IND 36\n"
    "specialTeamsSafeties,LA 38,LA 21\n"
    "specialTeamsSafeties,MIA 30,MIA 23\n"
    "specialTeamsSafeties,NYG 23,NYG 37\n"
    "specialTeamsSafeties,SEA 24,SEA 8\n"
    "specialTeamsSafeties,TEN 25,TEN 41\n"
)
df_pff_patches = pd.read_csv(StringIO(raw_pff_patches))
df_pff_patches.head()

In [None]:
def apply_pff_patches(df_pff: pd.DataFrame, df_patches: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the PFF data with the jersey patches applied.

    All patches for one role are applied in a single pass over that role's
    column, and a jersey is replaced only when it matches a whole
    "; "-separated entry. A replacement produced by one patch (NYJ 23 ->
    NYJ 37) is therefore never rewritten by another patch for the same role
    (NYJ 37 -> NYJ 43), and an entry is never matched as a substring of a
    longer one.

    Args:
        df_pff: PFF scouting data; each patched role must be a column whose
            cells are "; "-separated "<team> <jersey>" strings or missing.
        df_patches: Frame with ``role``, ``pffJersey`` and ``trackingJersey``
            columns. If a role lists the same ``pffJersey`` twice, the last
            row wins.

    Returns:
        A new DataFrame; ``df_pff`` is not modified. Cells that are not
        strings (for example NaN) are passed through unchanged.

    Raises:
        KeyError: If a patch names a role that is not a column of ``df_pff``.
    """
    separator = "; "

    # Collect the old -> new jersey mapping of every role up front so each
    # role column is rewritten exactly once.
    replacements_by_role = {}
    for role, old_jersey, new_jersey in zip(
        df_patches["role"], df_patches["pffJersey"], df_patches["trackingJersey"]
    ):
        replacements_by_role.setdefault(role, {})[old_jersey] = new_jersey

    def patch_cell(raw, replacements):
        # Missing cells carry no jerseys to patch.
        if not isinstance(raw, str):
            return raw
        return separator.join(
            replacements.get(jersey, jersey) for jersey in raw.split(separator)
        )

    df_out = df_pff.copy()
    for role, replacements in replacements_by_role.items():
        df_out[role] = [patch_cell(raw, replacements) for raw in df_out[role]]
    return df_out


def get_tracking_jersey_patches(df: pd.DataFrame) -> np.ndarray:
    """Return the jersey numbers of ``df`` with the one tracking correction applied.

    Rows for nflId 40657 on the away team of game 2020092004 get jersey
    number 40; every other row keeps its existing ``jerseyNumber``.

    Args:
        df: Tracking data with ``gameId``, ``team``, ``nflId`` and
            ``jerseyNumber`` columns.

    Returns:
        A NumPy array (not a Series) with one value per row of ``df``, in
        row order. ``df`` is not modified.
    """
    needs_patch = (
        (df["gameId"] == 2020092004)
        & (df["team"] == "away")
        & (df["nflId"] == 40657)
    )
    return np.where(needs_patch, 40, df["jerseyNumber"])

In [None]:
# Build a patched copy of the PFF data; df_pff itself is left as loaded.
df_pff_patched = apply_pff_patches(df_pff, df_pff_patches)

In [None]:
# Overwrite the tracking jersey numbers in place with the patched values.
df_tracking["jerseyNumber"] = get_tracking_jersey_patches(df_tracking)