In [1]:
!pip install polars



In [2]:
import os
import pandas as pd
import polars as pl

from google.colab import drive
# Mount Google Drive at /content/gdrive so the dataset folders are reachable.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Work from the shared data folder; the relative paths used in later cells
# (e.g. "data bowl 2023/...") are resolved against this directory.
os.chdir("/content/gdrive/MyDrive/NFL_Challenge/NFL-GPT/NFL data")

In [4]:
# Show the entries of the current working directory (the data root).
os.listdir()

['.DS_Store',
 'Contact Detection',
 'Punt Prediction',
 'Analytics',
 'Impact Detection',
 'data bowl 2021',
 'data bowl 2023',
 'data bowl 2022',
 'data bowl 2020']

**2023 Data Bowl**

In [5]:
# Folder holding the 2023 Data Bowl files, relative to the working directory.
path = "data bowl 2023"
os.listdir(path)

['nfl-big-data-bowl-2023.zip',
 'games.parquet',
 'plays.parquet',
 'tracks.parquet']

In [6]:
import zipfile
from zipfile import ZipFile

zip_file_path = 'data bowl 2023/nfl-big-data-bowl-2023.zip'

# Collect the archive's member names while it is open, then print them one
# per line once the handle has been released.
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    file_list = zip_ref.namelist()

for file_name in file_list:
    print(file_name)

games.csv
pffScoutingData.csv
players.csv
plays.csv
week1.csv
week2.csv
week3.csv
week4.csv
week5.csv
week6.csv
week7.csv
week8.csv


In [7]:
import zipfile

def get_csv_from_zip(zip_path, file):
  """Read one CSV member of a zip archive into a polars DataFrame.

  Args:
    zip_path: Path of the zip archive to open.
    file: Name of the CSV member inside the archive.

  Returns:
    The member parsed by ``pd.read_csv`` and converted with ``pl.from_pandas``.

  Raises:
    KeyError: If ``file`` is not a member of the archive (from ``ZipFile.open``).
  """
  # Context managers close both the archive and the member stream; the
  # previous version left the ZipFile handle open after returning.
  with zipfile.ZipFile(zip_path) as zf:
    with zf.open(file) as member:
      return pl.from_pandas(pd.read_csv(member))

In [26]:
games = pl.read_parquet(os.path.join(path, "games.parquet"))
plays = pl.read_parquet(os.path.join(path, "plays.parquet"))
tracks = pl.read_parquet(os.path.join(path, "tracks.parquet"))

# The directory listing above shows no extracted players.csv; the file is only
# listed as a member of the competition zip. Read the extracted copy when it
# exists, otherwise pull it straight from the archive so a fresh run works.
players_csv_path = os.path.join(path, "players.csv")
if os.path.exists(players_csv_path):
    players = pl.read_csv(players_csv_path)
else:
    players = get_csv_from_zip(zip_file_path, "players.csv")

In [36]:
# Player lookup used to attach a position to each tracking row. nflId is cast
# to Float64 because the tracks preview shows nflId as f64 and it is the join key.
players_to_merge = (
    players
    .select(["nflId", "officialPosition"])
    .rename({"officialPosition": "position"})
    .with_columns(pl.col("nflId").cast(pl.Float64))
)

In [28]:
# Long-format lookup: one row per (gameId, side) carrying that side's abbreviation.
game_adapted = (
    games
    .select(["gameId", "homeTeamAbbr", "visitorTeamAbbr"])
    .melt(
        id_vars="gameId",
        value_vars=["homeTeamAbbr", "visitorTeamAbbr"],
        variable_name="team",
        value_name="TeamAbbr",
    )
    # Strip the "TeamAbbr" suffix so `team` holds "home" / "visitor".
    .with_columns(pl.col("team").str.replace("TeamAbbr", ""))
)

# One row per play for the team in possession, labelled as the offense.
play_adapted = (
    plays
    .select(["gameId", "playId", "possessionTeam"])
    .rename({"possessionTeam": "TeamAbbr"})
    .with_columns(pl.lit("Offense").alias("OffDef"))
)

# Attach the home/visitor side of the possessing team to every play.
game_play = play_adapted.join(
    game_adapted,
    on=["gameId", "TeamAbbr"],
    how="left",
)

In [29]:
import numpy as np

def get_defense(arr, ref):
  """Return the rows of `ref` for the side opposing the one named in `arr`.

  Args:
    arr: Positional row; arr[0] is matched against the "gameId" column and
      arr[4] is the side label ("home" selects the visitor, anything else
      selects the home side).
    ref: Frame exposing "gameId" and "team" columns.

  Returns:
    The matching rows converted with ``to_numpy()``.
  """
  opposing_side = "visitor" if arr[4] == "home" else "home"
  same_game = ref.filter(pl.col("gameId") == arr[0])
  return same_game.filter(pl.col("team") == opposing_side).to_numpy()

def get_second_team(arr, ref):
  """Pair an offensive play row with a matching "Defense" row.

  The defensive row reuses arr[0] and arr[1] and takes its abbreviation and
  side from the first row returned by ``get_defense``.

  Returns:
    A stacked array whose first row is `arr` and second row is the defense.
  """
  opponent = get_defense(arr, ref)[0]
  game_id, play_id = arr[0], arr[1]
  # opponent is laid out as returned by get_defense: index 1 is the side,
  # index 2 the abbreviation.
  defense_row = np.array([game_id, play_id, opponent[2], "Defense", opponent[1]])
  return np.stack((arr, defense_row))

def get_new_arr(df, ref):
  """Expand every row of `df` into an offense/defense pair, flattened to 2-D.

  Each row of ``df.to_numpy()`` is passed to ``get_second_team``; the stacked
  result is reshaped so the pairs follow one another row by row.
  """
  pairs = [get_second_team(row, ref) for row in df.to_numpy()]
  stacked = np.stack(pairs)
  n_pairs, rows_per_pair, n_cols = stacked.shape
  return stacked.reshape(n_pairs * rows_per_pair, n_cols)

In [30]:
new_arr = get_new_arr(game_play, game_adapted)

# Turn the stacked rows back into per-column lists keyed by the frame's column names.
new_dict = {
    column: [row[position] for row in new_arr]
    for position, column in enumerate(game_play.columns)
}

# Normalise each column to a concrete Python type before rebuilding the frame.
new_dict.update({name: [int(v) for v in new_dict[name]] for name in ("gameId", "playId")})
new_dict.update({name: [str(v) for v in new_dict[name]] for name in ("TeamAbbr", "OffDef", "team")})

game_play = pl.from_dict(new_dict)

In [31]:
# After the rename, `team` holds the abbreviation and `HV` the home/visitor side.
game_play = game_play.rename({
    "team": "HV",
    "TeamAbbr": "team",
})

In [35]:
# Preview the tracking frame (column names, dtypes and sample rows).
tracks

gameId,playId,nflId,frameId,time,jerseyNumber,team,playDirection,x,y,s,a,dis,o,dir,event
i64,i64,f64,i64,str,f64,str,str,f64,f64,f64,f64,f64,f64,f64,str
2021090900,97,25511.0,1,"""2021-09-10T00:…",12.0,"""TB""","""right""",37.77,24.22,0.29,0.3,0.03,165.16,84.99,"""None"""
2021090900,97,25511.0,2,"""2021-09-10T00:…",12.0,"""TB""","""right""",37.78,24.22,0.23,0.11,0.02,164.33,92.87,"""None"""
2021090900,97,25511.0,3,"""2021-09-10T00:…",12.0,"""TB""","""right""",37.78,24.24,0.16,0.1,0.01,160.24,68.55,"""None"""
2021090900,97,25511.0,4,"""2021-09-10T00:…",12.0,"""TB""","""right""",37.73,24.25,0.15,0.24,0.06,152.13,296.85,"""None"""
2021090900,97,25511.0,5,"""2021-09-10T00:…",12.0,"""TB""","""right""",37.69,24.26,0.25,0.18,0.04,148.33,287.55,"""None"""
2021090900,97,25511.0,6,"""2021-09-10T00:…",12.0,"""TB""","""right""",37.64,24.26,0.35,0.53,0.05,144.42,282.72,"""ball_snap"""
2021090900,97,25511.0,7,"""2021-09-10T00:…",12.0,"""TB""","""right""",37.56,24.26,0.54,1.05,0.08,137.49,272.95,"""None"""
2021090900,97,25511.0,8,"""2021-09-10T00:…",12.0,"""TB""","""right""",37.47,24.25,0.8,1.85,0.09,131.95,267.49,"""None"""
2021090900,97,25511.0,9,"""2021-09-10T00:…",12.0,"""TB""","""right""",37.38,24.24,0.99,2.03,0.09,129.85,263.48,"""None"""
2021090900,97,25511.0,10,"""2021-09-10T00:…",12.0,"""TB""","""right""",37.27,24.23,1.19,1.82,0.11,123.79,263.77,"""None"""


In [37]:
cols = ["gameId", "playId", "team", "nflId", "position", "frameId", "x", "y"]

new_data_final = (
    tracks
    # Apply both row filters before joining so the joins run on less data.
    # Rows whose team is "football" and even-numbered frames are dropped
    # either way, and neither join changes `team` or `frameId`.
    .filter(pl.col("team") != "football")
    .filter(pl.col("frameId") % 2 == 1)
    .join(players_to_merge, on="nflId", how="left")
    # NOTE(review): the tracks preview shows team abbreviations (e.g. "TB"),
    # so this replace looks like a no-op for this dataset. Confirm before removing.
    .with_columns(pl.col("team").str.replace("away", "visitor"))
    .select(cols)
    # Adds OffDef (and HV, dropped right after) for each (game, play, team).
    .join(game_play, on=["gameId", "playId", "team"], how="left")
    .drop("HV")
    .with_columns(pl.lit("Passing").alias("PlayType"))
    .select(["PlayType", "gameId", "playId", "team", "OffDef", "nflId", "position", "frameId", "x", "y"])
)

In [38]:
# (rows, columns) of the processed frame.
new_data_final.shape

(4024108, 10)

In [39]:
# Persist the processed frame inside the 2023 data folder.
new_data_final.write_parquet(f"{path}/processed_df.parquet")

In [40]:
# Distinct (gameId, nflId) pairs present in the processed data.
(
    new_data_final
    .select(["gameId", "nflId"])
    .unique()
)

gameId,nflId
i64,f64
2021100312,43426.0
2021100300,45481.0
2021101800,47971.0
2021101711,35446.0
2021100309,44917.0
2021102407,53579.0
2021100310,46331.0
2021093000,52515.0
2021102407,47810.0
2021092607,44872.0
