In [1]:
!pip install polars



In [2]:
import os
import pandas as pd
import polars as pl

from google.colab import drive
# Mount Google Drive so the dataset folders are reachable under /content/gdrive.
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [3]:
# Work from the shared "NFL data" folder so the relative paths used below resolve.
os.chdir("/content/gdrive/MyDrive/NFL_Challenge/NFL-GPT/NFL data")

In [4]:
# List the entries of the current working directory.
os.listdir()

['.DS_Store',
 'Contact Detection',
 'Punt Prediction',
 'Analytics',
 'Impact Detection',
 'data bowl 2021',
 'data bowl 2023',
 'data bowl 2022',
 'data bowl 2020']

**2022 Data Bowl**

In [5]:
# Folder with the 2022 Data Bowl files; reused later when writing the output.
path = "data bowl 2022"
# Show what is already present in that folder.
os.listdir(path)

['nfl-big-data-bowl-2022.zip',
 'track2018.parquet',
 'games.parquet',
 'plays.parquet',
 'track2019.parquet',
 'track2020.parquet',
 'tracks.parquet',
 'processed_df.parquet']

In [6]:
import zipfile

zip_file_path = 'data bowl 2022/nfl-big-data-bowl-2022.zip'

# Read the member names of the archive; nothing is extracted.
with zipfile.ZipFile(zip_file_path, 'r') as archive:
    file_list = archive.namelist()

# The name list is a plain Python list, so it can be printed after the
# archive handle has been closed.
for file_name in file_list:
    print(file_name)

PFFScoutingData.csv
games.csv
players.csv
plays.csv
tracking2018.csv
tracking2019.csv
tracking2020.csv


In [7]:
# Load the three parquet tables used by the rest of the notebook.
games, plays, tracks = (
    pl.read_parquet(parquet_file)
    for parquet_file in (
        "data bowl 2022/games.parquet",
        "data bowl 2022/plays.parquet",
        "data bowl 2022/tracks.parquet",
    )
)

In [16]:
# Show the column names of the plays table.
plays.columns

['gameid',
 'playid',
 'playdescription',
 'quarter',
 'down',
 'yardstogo',
 'possessionteam',
 'specialteamsplaytype',
 'specialteamsresult',
 'kickerid',
 'returnerid',
 'kickblockerid',
 'yardlineside',
 'yardlinenumber',
 'gameclock',
 'penaltycodes',
 'penaltyjerseynumbers',
 'penaltyyards',
 'presnaphomescore',
 'presnapvisitorscore',
 'passresult',
 'kicklength',
 'kickreturnyardage',
 'playresult',
 'absoluteyardlinenumber']

In [8]:
# Long-format lookup: one row per (game, side) carrying that side's team
# abbreviation. Columns end up as gameId, team, TeamAbbr.
game_adapted = (
    games
    .select(["gameid", "hometeamabbr", "visitorteamabbr"])
    .melt(
        "gameid",
        ["hometeamabbr", "visitorteamabbr"],
        variable_name="team",
        value_name="TeamAbbr",
    )
    # "hometeamabbr" / "visitorteamabbr" -> "home" / "visitor"
    .with_columns(pl.col("team").str.replace("teamabbr", ""))
    .rename({"gameid": "gameId"})
)

# One row per play, with the possessing team labelled as the offense.
# Columns end up as gameId, playId, TeamAbbr, OffDef.
play_adapted = (
    plays
    .select(["gameid", "playid", "possessionteam"])
    .rename({"gameid": "gameId", "playid": "playId", "possessionteam": "TeamAbbr"})
    .with_columns(pl.lit("Offense").alias("OffDef"))
)

# Attach the home/visitor side of the offense to each play.
# Columns end up as gameId, playId, TeamAbbr, OffDef, team.
game_play = play_adapted.join(game_adapted, on=["gameId", "TeamAbbr"], how="left")

In [9]:
# Play-type lookup keyed by (gameId, playId); unique() removes duplicate rows.
to_merge_end = (
    plays
    .select(["gameid", "playid", "specialteamsplaytype"])
    .unique()
    .rename({"gameid": "gameId", "playid": "playId"})
)

In [10]:
# Cast both key columns to Int64 (presumably so they match the dtype of the
# keys they are joined against later -- confirm against the tracking frame).
to_merge_end = to_merge_end.with_columns(
    pl.col(["gameId", "playId"]).cast(pl.Int64)
)

In [11]:
import numpy as np

def get_defense(arr, ref):
  """Return the rows of `ref` for the side opposing the one named in `arr`.

  Args:
    arr: indexable row; arr[0] is matched against the "gameId" column and
      arr[4] is the side label of the row's own team.
    ref: frame with "gameId" and "team" columns supporting `.filter(...)`
      and `.to_numpy()`.

  Returns:
    `ref` filtered to the same game and the opposite side, converted with
    `.to_numpy()`. A side label of "home" selects "visitor" rows; any other
    label (including a missing one) selects "home" rows.
  """
  opponent_side = "visitor" if arr[4] == "home" else "home"
  return (ref
          .filter(pl.col("gameId") == arr[0])
          .filter(pl.col("team") == opponent_side)
          .to_numpy())

def get_second_team(arr, ref):
  """Pair an offense row with a synthesized row for the opposing team.

  Args:
    arr: offense row laid out as [gameId, playId, TeamAbbr, OffDef, team].
    ref: lookup passed through to `get_defense`.

  Returns:
    Array of shape (2, 5): `arr` followed by
    [arr[0], arr[1], opponent[2], "Defense", opponent[1]], where `opponent`
    is the first row returned by `get_defense`.
  """
  # Only the first matching opponent row is used; an empty result raises here.
  opponent = get_defense(arr, ref)[0]
  game_id, play_id = arr[0], arr[1]
  # NOTE: np.array over mixed ints and strings yields a string array, so the
  # ids in this row come out as strings.
  defense_row = np.array([game_id, play_id, opponent[2], "Defense", opponent[1]])
  return np.stack([arr, defense_row])

def get_new_arr(df, ref):
  """Expand every row of `df` into an (offense, defense) pair of rows.

  Args:
    df: frame exposing `.to_numpy()`; each resulting row is passed to
      `get_second_team`.
    ref: lookup passed through to `get_second_team`.

  Returns:
    2-D array in which the rows produced for each input row are laid out
    consecutively, in input order.
  """
  pairs = np.stack([get_second_team(row, ref) for row in df.to_numpy()])
  n_plays, rows_per_play, n_cols = pairs.shape
  # Flatten (plays, rows-per-play, columns) into (plays * rows-per-play, columns).
  return np.reshape(pairs, (n_plays * rows_per_play, n_cols))

In [12]:
# Expand each play into an offense row plus a defense row.
new_arr = get_new_arr(game_play, game_adapted)

# Split the 2-D array back into one list per column, keyed by column name.
new_dict = {
    col_name: [row[col_idx] for row in new_arr]
    for col_idx, col_name in enumerate(game_play.columns)
}

# Normalise the Python type of every column before rebuilding the frame:
# ids become ints, the remaining columns become strings.
for col_name, caster in (
    ("gameId", int),
    ("playId", int),
    ("TeamAbbr", str),
    ("OffDef", str),
    ("team", str),
):
    new_dict[col_name] = [caster(value) for value in new_dict[col_name]]

game_play = pl.from_dict(new_dict)

In [13]:
cols = ["gameId", "playId", "team", "nflId", "position", "frameId", "x", "y"]

# Build the final per-player tracking table:
#   1. drop the ball rows and keep only odd-numbered frames (applied before
#      the joins so both joins process fewer rows);
#   2. relabel "away" as "visitor" so the side matches the labels in game_play;
#   3. swap the home/visitor side for the team abbreviation and pick up OffDef;
#   4. attach the play type from to_merge_end as "PlayType".
# The previous version also added a literal "Passing" PlayType column that was
# dropped again before the real one was joined in, and used a temporary
# "filter" column for the parity test; both are removed with no change to the
# resulting columns or rows.
new_data_final = (
    tracks
    .filter(pl.col("team") != "football")
    .filter(pl.col("frameId") % 2 == 1)
    .with_columns(pl.col("team").str.replace("away", "visitor"))
    .select(cols)
    .join(game_play, on=["gameId", "playId", "team"], how="left")
    .drop("team")
    .rename({"TeamAbbr": "team"})
    .join(to_merge_end, on=["gameId", "playId"], how="left")
    .rename({"specialteamsplaytype": "PlayType"})
    .select(["PlayType", "gameId", "playId", "team", "OffDef", "nflId", "position", "frameId", "x", "y"])
)

In [14]:
# Sanity check: (rows, columns) of the final frame.
new_data_final.shape

(17695304, 10)

In [15]:
# Persist the processed frame inside the 2022 Data Bowl folder.
new_data_final.write_parquet(path+"/processed_df.parquet")