# Setup

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans


Import data from
<https://www.kaggle.com/competitions/nfl-big-data-bowl-2024/data>

In [None]:
# Read the four lookup tables in one pass; the order matches the unpacking
# targets on the left.
tackles, plays, games, players = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/tackles.csv",
        "../data/plays.csv",
        "../data/games.csv",
        "../data/players.csv",
    )
)

# One tracking frame per week, weeks 1 through 9, kept as a list so later
# cells can work either per week or on the concatenation.
tracking_weeks = []
for week in range(1, 10):
    tracking_weeks.append(pd.read_csv(f"../data/tracking_week_{week}_filtered.csv"))

tackles


Join data and store in `agg`. This will be used for future analysis

In [None]:
# `players` is joined twice (once for the tackler, once for the ball carrier),
# so index it once and reuse the lookup.
players_by_id = players.set_index("nflId")

agg = (
    tackles
    # game-level columns for each tackle row
    .join(games.set_index("gameId"), on="gameId")
    # attributes of the player in the tackle row
    .join(players_by_id, on="nflId")
    # play-level columns; inner join drops tackle rows without a matching play
    .join(plays.set_index(["playId", "gameId"]), on=["playId", "gameId"], how="inner")
    # attributes of the ball carrier; overlapping names get the "_carrier" suffix
    .join(players_by_id, on="ballCarrierId", rsuffix="_carrier")
)
agg


As expected, this has the same number of rows as `tackles`, with more
columns.

What we have to analyze

In [None]:
# Show the column index of the joined frame so the available fields are visible.
agg.columns


``` example
Index(['gameId', 'playId', 'nflId', 'tackle', 'assist', 'forcedFumble',
       'pff_missedTackle', 'season', 'week', 'gameDate', 'gameTimeEastern',
       'homeTeamAbbr', 'visitorTeamAbbr', 'homeFinalScore',
       'visitorFinalScore', 'height', 'weight', 'birthDate', 'collegeName',
       'position', 'displayName', 'ballCarrierId', 'ballCarrierDisplayName',
       'playDescription', 'quarter', 'down', 'yardsToGo', 'possessionTeam',
       'defensiveTeam', 'yardlineSide', 'yardlineNumber', 'gameClock',
       'preSnapHomeScore', 'preSnapVisitorScore', 'passResult', 'passLength',
       'penaltyYards', 'prePenaltyPlayResult', 'playResult',
       'playNullifiedByPenalty', 'absoluteYardlineNumber', 'offenseFormation',
       'defendersInTheBox', 'passProbability', 'preSnapHomeTeamWinProbability',
       'preSnapVisitorTeamWinProbability', 'homeTeamWinProbabilityAdded',
       'visitorTeamWinProbilityAdded', 'expectedPoints', 'expectedPointsAdded',
       'foulName1', 'foulName2', 'foulNFLId1', 'foulNFLId2',
       'height_ball_carrier', 'weight_ball_carrier', 'birthDate_ball_carrier',
       'collegeName_ball_carrier', 'position_ball_carrier',
       'displayName_ball_carrier'],
      dtype='object')
```

# Simple graphs

In [None]:
# Default figure size for this and every later figure in the notebook.
plt.rcParams["figure.figsize"] = (6.4, 4.8)

# One bin per whole yard from 0 up to (not including) 39.
yard_bins = range(0, 40)

fig, ax = plt.subplots()
ax.hist(agg["yardsToGo"], bins=yard_bins, ec='C9', lw=1)
ax.set_title("Yards To Go")
ax.set_xlabel("yards")


In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for yards to go.
agg.loc[:, "yardsToGo"].describe()


Distance needed for a first down is quite frequently 10.

In [None]:
# `gameClock` is stored as "MM:SS" text (per the competition's data
# description — confirm against the CSV). Passing the raw strings to `hist`
# makes matplotlib treat them as categories, so bars are ordered by first
# appearance rather than by time. Convert to a numeric number of minutes first.
clock_parts = agg["gameClock"].str.split(":", expand=True).astype(float)
clock_minutes = (clock_parts[0] + clock_parts[1] / 60).dropna()

fig, ax = plt.subplots()
ax.hist(clock_minutes, bins=20, edgecolor='C9', linewidth=1)
ax.set_title("Game clock")
ax.set_xlabel("game clock (minutes)")


As time increases, the number of tackles decreases. It is not yet clear
whether this is due to games ending early or to some other reason.

In [None]:
# Rows where a tackle was recorded, counted per defensive team. The pie uses
# the non-null `gameId` count of each group as the slice size.
made_tackles = agg[agg["tackle"] == 1]
tackles_by_defense = made_tackles.groupby("defensiveTeam").count()
tackles_by_defense.plot.pie(y="gameId")


In [None]:
# Same breakdown as the defensive-team pie, grouped by the team in possession.
# Slice size is the non-null `gameId` count of each group.
made_tackles = agg[agg["tackle"] == 1]
tackles_by_offense = made_tackles.groupby("possessionTeam").count()
tackles_by_offense.plot.pie(y="gameId")


Interestingly, all teams seem to have roughly the same number of plays
with tackles

# Future plans

Use the height, weight, position, etc. of a player to predict whether they
made a tackle in a given game or across games.

# clusters

In [None]:
# Keep only the tracking columns used below and drop rows missing any of them.
all_tracking_select = (
    pd.concat(tracking_weeks)
    .loc[:, ["gameId", "playId", "nflId", "s", "a", "o", "dir"]]
    .dropna(how="any")
)
tracking_by_player = all_tracking_select.set_index(["gameId", "playId", "nflId"])

# Tackle rows only, with the ids needed to look up tackler and ball carrier.
agg_select = agg.loc[agg["tackle"] == 1, ["gameId", "playId", "nflId", "ballCarrierId", "pff_missedTackle"]]

# First join attaches the tackler's tracking rows (inner: tackles without
# tracking are dropped); second attaches the ball carrier's rows, whose
# columns get the "_carrier" suffix.
# NOTE(review): frameId is not part of the join key, so if the filtered
# tracking files hold more than one row per (gameId, playId, nflId), every
# tackler row is paired with every carrier row of that play — confirm this is
# intended.
positions = agg_select.join(tracking_by_player, on=["gameId", "playId", "nflId"], how="inner")
positions = positions.join(tracking_by_player, on=["gameId", "playId", "ballCarrierId"], rsuffix="_carrier")

speed_diff = positions["s"] - positions["s_carrier"]
# Wrap the direction difference into [-180, 180).
dir_diff = (180 + positions["dir"] - positions["dir_carrier"]) % 360 - 180

# use pff_missedTackle for whether or not the tackle was missed.
# The palette is a list, not a set: a set has no defined order, so the colour
# given to each hue level was not reproducible between kernel sessions, and a
# set is not one of the palette types seaborn documents. With a list, the
# sorted hue levels map to the colours in order (lowest level -> "black").
sns.scatterplot(
    x=speed_diff,
    y=dir_diff,
    hue=positions["pff_missedTackle"],
    palette=["black", "orange"],
    s=3,
)


In [None]:
# Per-week version of the tackler/carrier join: the tracking frames are kept
# separate so a mean speed difference can be computed for each week.
tracking_columns = ["gameId", "playId", "nflId", "s", "a", "o", "dir"]
player_key = ["gameId", "playId", "nflId"]
carrier_key = ["gameId", "playId", "ballCarrierId"]

tracking_select = [
    tracking_week.loc[:, tracking_columns].dropna(how="any")
    for tracking_week in tracking_weeks
]
agg_select = agg.loc[agg["tackle"] == 1, ["gameId", "playId", "nflId", "ballCarrierId", "pff_missedTackle"]]

positions = []
for weekly_tracking in tracking_select:
    weekly_lookup = weekly_tracking.set_index(player_key)
    # tackler rows first (inner join), then the carrier's rows with a suffix
    with_tackler = agg_select.join(weekly_lookup, on=player_key, how="inner")
    positions.append(with_tackler.join(weekly_lookup, on=carrier_key, rsuffix="_carrier"))

# Mean (tackler speed - carrier speed) for each week, in week order.
a = [float((weekly["s"] - weekly["s_carrier"]).mean()) for weekly in positions]


In [None]:
# Line plot of the per-week mean speed differences in `a`; the x axis is the
# position in the list (0 = first tracking week loaded).
fig, ax = plt.subplots()
ax.plot(a)


In [None]:
# NOTE(review): `joined` is not defined in any cell visible in this notebook,
# so this cell raises NameError on Restart & Run All unless it is created
# elsewhere. It appears to be a list of per-week DataFrames with "s", "a"
# and "tackle" columns — confirm and add the defining cell.
all_joined = pd.concat(joined)

# The palette is a list, not a set: a set has no defined order, so the colour
# given to each hue level was not reproducible, and a set is not one of the
# palette types seaborn documents. With a list, the sorted hue levels map to
# the colours in order (lowest level -> "gray").
sns.scatterplot(
    x=all_joined["s"],
    y=all_joined["a"],
    hue=all_joined['tackle'],
    palette=["gray", "red"],
    s=10,
)


In [None]:
# Rows of the first element of `joined` where a missed tackle is flagged.
# NOTE(review): `joined` is not defined in any cell visible in this notebook —
# confirm where it is created.
first_joined = joined[0]
first_joined[first_joined["pff_missedTackle"] == 1]


# pass length

In [None]:
# Pass lengths with missing values removed; kept for ad-hoc checks such as
# ((p > -5) & (p < 10)).sum()
p = plays["passLength"].dropna()

# Summary statistics for pass length (describe skips missing values itself).
plays["passLength"].describe()


In [None]:
# Distribution of pass length over 70 equal-width bins.
fig, ax = plt.subplots()
ax.hist(plays["passLength"], bins=70)


In [None]:
# Plays with ball-carrier attributes attached, grouped by pass length.
# `grouped`, `x` and `y` are reused by the next cell.
plays_with_carrier = plays.join(players.set_index("nflId"), on="ballCarrierId")
grouped = plays_with_carrier.groupby("passLength")

# Mean play result per distinct pass length; the index holds the pass lengths
# in the same sorted order that iterating over `grouped` would give.
mean_result_by_length = grouped["playResult"].mean()
x = mean_result_by_length.index.tolist()
y = mean_result_by_length.tolist()

fig, ax = plt.subplots()
ax.scatter(plays["passLength"], plays["playResult"], s=3)
ax.set_xlabel("passLength")
ax.set_ylabel("playResult")
# orange: mean play result at each pass length
ax.plot(x, y, color="orange")
# purple: reference line where playResult equals passLength
ax.plot((-10, 70), (-10, 70), color="purple")


In [None]:
# Scatter of (playResult - passLength) against passLength.
# An earlier variant of this cell coloured the points by
# ballCarrierDisplayName instead of expectedPointsAdded and did not scale the
# markers; it was superseded by the version below.
# Relies on `grouped` and `x` from the previous cell.

# Ball-carrier weight per play, used to scale marker size.
s = plays.join(players.set_index("nflId"), on="ballCarrierId", rsuffix="_carrier")["weight"]

# Per pass length: mean play result minus mean pass length.
y = [group.playResult.mean() - group.passLength.mean() for _, group in grouped]

# Adds a column to `plays` in place.
plays['yardsBeforeTackle'] = plays["playResult"] - plays["passLength"]

marker_size = (s - 150) / 8
plt.scatter(
    plays["passLength"],
    plays["yardsBeforeTackle"],
    s=marker_size,
    c=plays["expectedPointsAdded"],
    cmap='gray',
)
plt.xlabel("passLength")
plt.ylabel("playResult - passLength")
plt.plot(x, y, color="orange")


``` example
480
```

In [None]:
# Disabled experiment: a 24 x 20 grid with one passLength-vs-playResult
# scatter per entry of `player_run`.
# NOTE(review): `player_run` is not defined in any cell visible in this
# notebook — confirm where it came from before re-enabling.
# fig, axs = plt.subplots(24, 20, figsize = (100, 100))
# i = 0
# for name, df in player_run:
#     axs[i % 24, i // 24].scatter(df.passLength, df.playResult)
#     axs[i % 24, i // 24].set_title(name)
#     i += 1
# With the loop disabled, this only flushes any figure that is still open.
plt.show()
