In [1]:
!pip install --upgrade polars
!pip install nfl_data_py
!pip install --upgrade numpy

Collecting nfl_data_py
  Downloading nfl_data_py-0.3.0.tar.gz (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fastparquet>0.5 (from nfl_data_py)
  Downloading fastparquet-2023.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-snappy>0.5 (from nfl_data_py)
  Downloading python_snappy-0.6.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.9/55.9 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting cramjam>=2.3 (from fastparquet>0.5->nfl_data_py)
  Downloading cramjam-2.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [1]:
import os
import pandas as pd
import polars as pl
import nfl_data_py as nfl

# Mount Google Drive at /content/gdrive so the project files stored there are
# reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
# Work from the project's data folder so the relative `path` values used below resolve.
os.chdir("/content/gdrive/MyDrive/NFL_Challenge/NFL-GPT/NFL data")

In [3]:
# Show what is available in the current working directory.
os.listdir()

['chromedriver',
 '.DS_Store',
 'Contact Detection',
 'Punt Prediction',
 'Analytics',
 'Impact Detection',
 'data bowl 2021',
 'data bowl 2023',
 'data bowl 2022',
 'data bowl 2020',
 'asonty',
 'Highlights_NGS_2019',
 'Highlights_NGS_Prime',
 'geckodriver.log',
 'final_df.parquet']

**2021 Data Bowl**

In [4]:
# Folder (relative to the working directory) that every read/write below is prefixed with.
path = "data bowl 2021"
os.listdir(path)

['games.csv',
 'plays.csv',
 'tracking.gzip',
 'players.csv',
 'processed_df.parquet']

In [7]:
# Load the raw inputs. Note that `tracking.gzip` is read as a parquet file
# despite its extension; the other three tables are plain CSV.
data = pl.read_parquet(path+"/tracking.gzip")
plays = pl.read_csv(path+"/plays.csv")
players = pl.read_csv(path+"/players.csv")
games = pl.read_csv(path+"/games.csv")

In [8]:
# Long-format lookup: one row per (gameId, side), where `team` is the side
# ("home" / "visitor") and `TeamAbbr` is the club abbreviation.
game_adapted = (
    games
    .select(["gameId", "homeTeamAbbr", "visitorTeamAbbr"])
    .melt(
        id_vars="gameId",
        value_vars=["homeTeamAbbr", "visitorTeamAbbr"],
        variable_name="team",
        value_name="TeamAbbr",
    )
    # "homeTeamAbbr" -> "home", "visitorTeamAbbr" -> "visitor"
    .with_columns(pl.col("team").str.replace("TeamAbbr", ""))
)

# One row per play, labelled with the club in possession as the offense.
play_adapted = (
    plays
    .select(["gameId", "playId", "possessionTeam"])
    .rename({"possessionTeam": "TeamAbbr"})
    .with_columns(pl.lit("Offense").alias("OffDef"))
)

# Attach the side (home / visitor) of the offensive club to every play.
# Resulting column order: gameId, playId, TeamAbbr, OffDef, team.
game_play = play_adapted.join(
    game_adapted,
    on=["gameId", "TeamAbbr"],
    how="left",
)

In [9]:
import numpy as np

def get_defense(arr, ref):
  """Return the rows of `ref` for the side opposing the one given in `arr`.

  Args:
    arr: one play row, indexable by position; position 0 is compared against
      the `gameId` column and position 4 is the side of the row's team.
    ref: frame with `gameId` and `team` columns, supporting `.filter` and
      `.to_numpy`.

  Returns:
    The matching rows of `ref` as returned by `.to_numpy()`: the "visitor" rows
    of that game when `arr[4]` is "home", otherwise the "home" rows.
  """
  # Anything that is not exactly "home" is treated as the visiting side.
  opposing_side = "visitor" if arr[4] == "home" else "home"
  same_game = ref.filter(pl.col("gameId") == arr[0])
  return same_game.filter(pl.col("team") == opposing_side).to_numpy()

def get_second_team(arr, ref):
  """Pair a play row with a synthetic row describing the opposing team.

  Args:
    arr: one play row laid out as (gameId, playId, TeamAbbr, OffDef, team).
    ref: lookup passed through to `get_defense`; its rows are read as
      (gameId, team, TeamAbbr).

  Returns:
    A stacked array of two rows: `arr` itself, then a row with the same
    gameId/playId, the opponent's abbreviation, the label "Defense" and the
    opponent's side.
  """
  # Only the first matching opponent row is used.
  opponent = get_defense(arr, ref)[0]
  game_id, play_id = arr[0], arr[1]
  defense_row = np.array([game_id, play_id, opponent[2], "Defense", opponent[1]])
  return np.stack((arr, defense_row))

def get_new_arr(df, ref):
  """Expand every row of `df` into its row pair and flatten to a 2-D array.

  Args:
    df: frame exposing `.to_numpy()`; each row is passed to `get_second_team`.
    ref: lookup forwarded unchanged to `get_second_team`.

  Returns:
    2-D array in which the rows produced for each input row are laid out
    consecutively, in input order.
  """
  pairs = np.stack([get_second_team(row, ref) for row in df.to_numpy()])
  # Collapse (n_rows, rows_per_pair, n_fields) -> (n_rows * rows_per_pair, n_fields).
  flat_shape = (pairs.shape[0] * pairs.shape[1], pairs.shape[2])
  return pairs.reshape(flat_shape)

In [10]:
# Expand each offensive play row into an (offense, defense) pair of rows.
# NOTE: this cell rebinds `game_play`; re-running it without first re-running
# the cell that builds `game_play` would expand the already-expanded frame.
new_arr = get_new_arr(game_play, game_adapted)

# The numpy round-trip loses the column types, so each column is cast back
# explicitly before rebuilding the frame.
column_casts = {
    "gameId": int,
    "playId": int,
    "TeamAbbr": str,
    "OffDef": str,
    "team": str,
}

new_dict = {
    name: [row[idx] for row in new_arr]
    for idx, name in enumerate(game_play.columns)
}
new_dict.update({
    name: [cast(value) for value in new_dict[name]]
    for name, cast in column_casts.items()
})

game_play = pl.from_dict(new_dict)

In [11]:
# Tracking columns kept before joining the play metadata.
cols = ["gameId", "playId", "team", "nflId", "position", "frameId", "x", "y"]

new_data_final = (
    data
    # Drop the ball: only player rows are kept.
    .filter(pl.col("team") != "football")
    # Tracking data says "away" where the game lookup says "visitor".
    .with_columns(pl.col("team").str.replace("away", "visitor"))
    .select(cols)
    .join(game_play, on=["gameId", "playId", "team"], how="left")
    # Replace the home/visitor side by the club abbreviation.
    .drop("team")
    .rename({"TeamAbbr": "team"})
    .with_columns(pl.lit("Passing").alias("PlayType"))
    .select(["PlayType", "gameId", "playId", "team", "OffDef", "nflId", "position", "frameId", "x", "y"])
    # Keep odd-numbered frames only, i.e. every second frame.
    .filter(pl.col("frameId") % 2 == 1)
)

In [12]:
# Sanity check: (rows, columns) of the processed tracking frame.
new_data_final.shape

(8595356, 10)

In [13]:
# Cache the processed tracking frame so later cells can start from this file.
new_data_final.write_parquet(path+"/processed_df.parquet")

In [5]:
# Reload the cached processed frame; `data` now refers to it rather than to the raw tracking table.
data = pl.read_parquet(path+"/processed_df.parquet")

In [6]:
# Fetch rosters and play-by-play data through nfl_data_py for the listed seasons
# and convert the pandas results to polars.
# NOTE(review): `players` repeats the earlier CSV load — presumably so the
# notebook can resume from the cached parquet after a kernel restart; confirm.
years_to_get = [2017, 2018, 2019, 2020, 2021, 2022]
rosters = pl.from_pandas(nfl.import_rosters(years_to_get))
season_data = pl.from_pandas(nfl.import_pbp_data(years_to_get))
players = pl.read_csv(path+"/players.csv")

2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
Downcasting floats.


In [54]:
# Column layout of the cached processed frame.
data.columns

['PlayType',
 'gameId',
 'playId',
 'team',
 'OffDef',
 'nflId',
 'position',
 'frameId',
 'x',
 'y']

In [None]:
# NOTE: leftover scratch fragment, kept commented out. As code it had an unclosed
# parenthesis (SyntaxError under "Run All") and it is not attached to any frame.
# The same exclusion is applied inside the `updated_data` pipeline.
# filter((pl.col("gameId") != 2018093011) &
#        (~pl.col("nflId").is_in(["43336", "38852"])))

In [18]:
# Re-key every player from the tracking `nflId` to the roster `gsis_it_id`.
#   1. Collapse the frame-level rows into one row per player and play, keeping
#      frameId / x / y as lists.
#   2. Look up the player's display name in `players`.
#   3. Look up the 2018 roster entry by that name (left join, so names without a
#      roster match end up with a null id).
# `group_by` is used because the notebook installs the latest polars, where the
# old `groupby` spelling is no longer available.
updated_data = (
    data
    .group_by("PlayType", "gameId", "playId", "team", "OffDef", "nflId", "position")
    .agg([
        pl.col("frameId"),
        pl.col("x"),
        pl.col("y"),
    ])
    # Align the key dtype with `players.nflId` before joining.
    .with_columns(pl.col("nflId").cast(pl.Int64))
    .join(
        players
        .select("nflId", "displayName")
        .rename({"displayName": "player_name"}),
        on="nflId",
        how="left",
    )
    .join(
        rosters
        .filter(pl.col("season") == 2018)
        .select("team", "player_name", "gsis_it_id")
        .rename({"team": "check_team"}),
        on="player_name",
        how="left",
    )
    # From here on `nflId` holds the roster id (a string), not the tracking id.
    .drop("nflId")
    .rename({"gsis_it_id": "nflId"})
    # Exclude one game and two roster ids.
    # NOTE(review): presumably these are the cases that broke the one-row-per-
    # player check below (ambiguous name matches) — confirm.
    # `fill_null(False)` keeps rows whose id is null (unmatched names), as the
    # displayed output expects, even if `is_in` yields null for a null id.
    .filter(
        (pl.col("gameId") != 2018093011)
        & (~(pl.col("nflId").is_in(["43336", "38852"]).fill_null(False)))
    )
)

# Count how many rows each (gameId, playId, nflId) combination has in
# `updated_data`; the next cell inspects the distinct values of `NbOcc`.
# `group_by` replaces the old `groupby` spelling, which current polars releases
# no longer provide.
checks = (
    updated_data
    .select("gameId", "playId", "nflId")
    .group_by("gameId", "playId", "nflId")
    .count()
    .rename({"count": "NbOcc"})
)

In [19]:
# Distinct occurrence counts; a single value of 1 means no (gameId, playId, nflId) is duplicated.
checks.select("NbOcc").unique()

NbOcc
u32
1


In [27]:
# Number of rows of `updated_data` (one per player) for every play.
# `group_by` replaces the old `groupby` spelling, which current polars releases
# no longer provide.
nb_players_per_play = (
    updated_data
    .select("gameId", "playId", "nflId")
    .group_by("gameId", "playId")
    .count()
    .rename({"count": "nb_players_check"})
)

# Keep only plays with fewer than 23 player rows, then go back to one row per
# frame by exploding the per-player lists.
# NOTE(review): 23 presumably encodes "at most 11 players per side" — confirm.
final_data = (
    updated_data
    .join(
        nb_players_per_play,
        on=["gameId", "playId"],
        how="left",
    )
    .filter(pl.col("nb_players_check") < 23)
    .drop("nb_players_check")
    .explode(["frameId", "x", "y"])
)

In [31]:
# Restore the original column layout; this drops the helper columns
# (`player_name`, `check_team`) introduced by the joins.
final_data = final_data.select([
    "PlayType",
    "gameId",
    "playId",
    "team",
    "OffDef",
    "nflId",
    "position",
    "frameId",
    "x",
    "y",
])

In [32]:
# NOTE(review): this overwrites the same file that `data` is read from earlier.
# After this cell the file's `nflId` column holds roster `gsis_it_id` strings, so
# re-running the notebook from the reload cell will not reproduce these results.
final_data.write_parquet(path+"/processed_df.parquet")

In [33]:
# Frame-level rows that ended up without a roster id.
final_data.filter(pl.col("nflId").is_null())

PlayType,gameId,playId,team,OffDef,nflId,position,frameId,x,y
str,i64,i64,str,str,str,str,i64,f64,f64
"""Passing""",2018112504,3584,"""IND""","""Defense""",,"""CB""",1,84.29,10.34
"""Passing""",2018112504,3584,"""IND""","""Defense""",,"""CB""",3,84.29,10.36
"""Passing""",2018112504,3584,"""IND""","""Defense""",,"""CB""",5,84.29,10.36
"""Passing""",2018112504,3584,"""IND""","""Defense""",,"""CB""",7,84.28,10.37
"""Passing""",2018112504,3584,"""IND""","""Defense""",,"""CB""",9,84.28,10.36
"""Passing""",2018112504,3584,"""IND""","""Defense""",,"""CB""",11,84.26,10.35
"""Passing""",2018112504,3584,"""IND""","""Defense""",,"""CB""",13,84.22,10.44
"""Passing""",2018112504,3584,"""IND""","""Defense""",,"""CB""",15,84.17,10.52
"""Passing""",2018112504,3584,"""IND""","""Defense""",,"""CB""",17,84.04,10.7
"""Passing""",2018112504,3584,"""IND""","""Defense""",,"""CB""",19,83.77,11.04


In [34]:
# Same check at player/play level, where `player_name` is still available.
# NOTE(review): `check_team` is null too, which suggests the name had no match in
# the 2018 roster (e.g. differing name spelling) — confirm.
updated_data.filter(pl.col("nflId").is_null())

PlayType,gameId,playId,team,OffDef,position,frameId,x,y,player_name,check_team,nflId
str,i64,i64,str,str,str,list[i64],list[f64],list[f64],str,str,str
"""Passing""",2018112504,3584,"""IND""","""Defense""","""CB""","[1, 3, … 55]","[84.29, 84.29, … 67.75]","[10.34, 10.36, … 25.97]","""Kenny Moore II…",,
"""Passing""",2018102103,3611,"""IND""","""Defense""","""CB""","[1, 3, … 69]","[59.8, 59.79, … 51.05]","[35.8, 35.79, … 24.38]","""Kenny Moore II…",,
"""Passing""",2018102804,3017,"""SEA""","""Defense""","""SS""","[1, 3, … 73]","[99.67, 99.7, … 95.86]","[16.9, 16.92, … 26.58]","""Lano Hill""",,
"""Passing""",2018111110,1252,"""SEA""","""Defense""","""SS""","[1, 3, … 57]","[78.0, 78.29, … 77.37]","[28.74, 28.76, … 17.91]","""Lano Hill""",,
"""Passing""",2018120205,1575,"""IND""","""Defense""","""CB""","[1, 3, … 77]","[46.62, 46.6, … 21.55]","[10.2, 10.23, … 1.33]","""Kenny Moore II…",,
"""Passing""",2018102104,2512,"""JAX""","""Offense""","""RB""","[1, 3, … 57]","[50.62, 50.62, … 52.4]","[27.87, 27.88, … 27.21]","""Dave Williams""",,
"""Passing""",2018112506,719,"""NYG""","""Offense""","""WR""","[1, 3, … 43]","[18.27, 18.26, … 8.52]","[32.66, 32.67, … 28.55]","""Odell Beckham …",,
"""Passing""",2018120207,1454,"""NYG""","""Offense""","""WR""","[1, 3, … 53]","[70.29, 70.29, … 53.44]","[10.78, 10.78, … 5.28]","""Odell Beckham …",,
"""Passing""",2018091608,3988,"""IND""","""Defense""","""CB""","[1, 3, … 49]","[45.94, 45.93, … 53.55]","[17.41, 17.42, … 15.64]","""Kenny Moore II…",,
"""Passing""",2018120207,3855,"""NYG""","""Offense""","""WR""","[1, 3, … 51]","[78.14, 78.12, … 105.43]","[19.86, 19.89, … 13.04]","""Odell Beckham …",,
