In [1]:
!pip install polars



In [2]:
import os
import pandas as pd
import polars as pl

from google.colab import drive
# Mount Google Drive at /content/gdrive; the data paths used below live under this mount point.
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [5]:
# Switch the working directory to the raw-data folder on the mounted Drive.
# Later cells use paths relative to this folder (e.g. "data bowl 2024/...").
os.chdir("/content/gdrive/MyDrive/NFL_Challenge/QB-GPT/data_preprocessing/0_raw/")

# List the entries available in the raw-data folder.
os.listdir()

['Contact Detection',
 'Punt Prediction',
 'Analytics',
 'Impact Detection',
 'data bowl 2021',
 'data bowl 2023',
 'data bowl 2022',
 'data bowl 2020',
 'asonty',
 'Highlights_NGS_2019',
 'Highlights_NGS_Prime',
 'data bowl 2024']

In [6]:
# Folder (relative to the current working directory) holding the dataset processed in this notebook.
path = "data bowl 2024"
# Show the files it contains.
os.listdir(path)

['games.csv',
 'players.csv',
 'plays.csv',
 'tackles.csv',
 'tracking_week_1.csv',
 'tracking_week_2.csv',
 'tracking_week_3.csv',
 'tracking_week_4.csv',
 'tracking_week_5.csv',
 'tracking_week_6.csv',
 'tracking_week_7.csv',
 'tracking_week_8.csv',
 'tracking_week_9.csv']

In [11]:
def read_file(path):
  """Load the CSV at `path` with pandas and hand it back as a polars DataFrame.

  path: location of the CSV file, passed straight to pandas.read_csv.
  Returns the result of pl.from_pandas on the parsed frame.
  """
  pandas_frame = pd.read_csv(path)
  return pl.from_pandas(pandas_frame)

# Build the combined tracking table once and cache it on disk.
# The guard tests for "tracks.json" -- the file this cell actually writes -- so the
# weekly CSVs are not re-read and re-written on every run. (The previous check
# looked for "tracks.parquet", which is never created, so it was always true.)
dir_entries = os.listdir(path)
if "tracks.json" not in dir_entries:
  # os.listdir returns names in arbitrary order; sort so the weekly files are
  # concatenated in a reproducible order.
  weekly_files = sorted(p for p in dir_entries if "tracking" in p)
  tracks = pl.concat([read_file(path + "/" + p) for p in weekly_files])
  tracks.write_json(path+ "/"+"tracks.json")

In [18]:
# The three small lookup tables are parsed by pandas and converted to polars.
games, plays, players = (
    pl.from_pandas(pd.read_csv(csv_file))
    for csv_file in (
        "data bowl 2024/games.csv",
        "data bowl 2024/plays.csv",
        "data bowl 2024/players.csv",
    )
)
# The combined tracking data is read back from the JSON file written earlier.
tracks = pl.read_json("data bowl 2024/tracks.json")

In [21]:
# Keep only the player id and position for the later join onto the tracking rows.
# nflId is cast to Float64 because the tracking table displays nflId as f64 and
# the join is keyed on that column.
players_to_merge = (
    players
    .select(["nflId", "position"])
    .with_columns(pl.col("nflId").cast(pl.Float64))
)

In [22]:
# Long-format lookup of each game's two teams: one row per (gameId, side).
# After stripping the "TeamAbbr" suffix the `team` column reads "home" / "visitor".
game_adapted = (
    games
    .select(["gameId", "homeTeamAbbr", "visitorTeamAbbr"])
    .melt(
        id_vars="gameId",
        value_vars=["homeTeamAbbr", "visitorTeamAbbr"],
        variable_name="team",
        value_name="TeamAbbr",
    )
    .with_columns(pl.col("team").str.replace("TeamAbbr", ""))
)

# One row per play for the team in possession, labelled as the offense.
play_adapted = (
    plays
    .select(["gameId", "playId", "possessionTeam"])
    .rename({"possessionTeam": "TeamAbbr"})
    .with_columns(pl.lit("Offense").alias("OffDef"))
)

# Attach the home/visitor side of the offense to every play.
game_play = play_adapted.join(game_adapted, on=["gameId", "TeamAbbr"], how="left")

In [23]:
import numpy as np

def get_defense(arr, ref):
  """Return the rows of `ref` for the other side in the same game as `arr`.

  arr: indexable play row; arr[0] is matched against the "gameId" column and
       arr[4] is the side flag of the row's own team.
  ref: frame with "gameId" and "team" columns that supports filter/to_numpy.
  Returns the matching rows converted with to_numpy().
  """
  # "home" looks up the visitor; any other flag looks up the home side.
  opponent_side = "visitor" if arr[4] == "home" else "home"
  same_game = ref.filter(pl.col("gameId") == arr[0])
  return same_game.filter(pl.col("team") == opponent_side).to_numpy()

def get_second_team(arr, ref):
  """Pair a play row with a matching "Defense" row for the opposing team.

  arr: play row; arr[0] and arr[1] are copied into the new row unchanged.
  ref: lookup passed through to get_defense.
  Returns a 2-row array: the original row followed by the defense row.
  An IndexError is raised if get_defense finds no opposing row.
  """
  opponent = get_defense(arr, ref)[0]
  game_id, play_id = arr[0], arr[1]
  # Positions 2 and 1 of the lookup row fill the third and fifth slots of the new row.
  # The mixed values are coerced by NumPy into a single dtype here.
  defense_row = np.array([game_id, play_id, opponent[2], "Defense", opponent[1]])
  return np.stack([arr, defense_row])

def get_new_arr(df, ref):
  """Expand every row of `df` into its row group and flatten to 2-D.

  df: frame exposing to_numpy(); each resulting row is passed to get_second_team.
  ref: lookup forwarded to get_second_team.
  Returns an array in which each input row's group occupies consecutive rows.
  """
  pairs = np.stack([get_second_team(row, ref) for row in df.to_numpy()])
  n_plays, rows_per_play, n_cols = pairs.shape
  # Collapse the (play, row-in-group) axes into a single row axis.
  return np.reshape(pairs, (n_plays * rows_per_play, n_cols))

In [24]:
# Expand every play into an offense row plus a defense row.
new_arr = get_new_arr(game_play, game_adapted)

# Turn the row-major array back into columns keyed by the original column names.
new_dict = {
    name: [row[idx] for row in new_arr]
    for idx, name in enumerate(game_play.columns)
}

# Cast each column back to a concrete Python type before rebuilding the frame:
# ids become ints, the remaining columns become strings.
new_dict.update({c: [int(v) for v in new_dict[c]] for c in ("gameId", "playId")})
new_dict.update({c: [str(v) for v in new_dict[c]] for c in ("TeamAbbr", "OffDef", "team")})

game_play = pl.from_dict(new_dict)

In [25]:
# Rename so the home/visitor flag lives in "HV" and the team abbreviation takes the "team" name,
# which is the key used when joining onto the tracking data.
# NOTE: this cell is not idempotent -- after one run "TeamAbbr" no longer exists, so
# re-running it without rebuilding game_play is expected to fail.
game_play = game_play.rename({"team" : "HV", "TeamAbbr": "team"})

In [35]:
# Inspect the columns available in the plays table.
plays.columns

['gameId',
 'playId',
 'ballCarrierId',
 'ballCarrierDisplayName',
 'playDescription',
 'quarter',
 'down',
 'yardsToGo',
 'possessionTeam',
 'defensiveTeam',
 'yardlineSide',
 'yardlineNumber',
 'gameClock',
 'preSnapHomeScore',
 'preSnapVisitorScore',
 'passResult',
 'passLength',
 'penaltyYards',
 'prePenaltyPlayResult',
 'playResult',
 'playNullifiedByPenalty',
 'absoluteYardlineNumber',
 'offenseFormation',
 'defendersInTheBox',
 'passProbability',
 'preSnapHomeTeamWinProbability',
 'preSnapVisitorTeamWinProbability',
 'homeTeamWinProbabilityAdded',
 'visitorTeamWinProbilityAdded',
 'expectedPoints',
 'expectedPointsAdded',
 'foulName1',
 'foulName2',
 'foulNFLId1',
 'foulNFLId2']

In [30]:
# Preview the per-play offense/defense team table.
game_play

gameId,playId,team,OffDef,HV
i64,i64,str,str,str
2022100908,3537,"""ATL""","""Offense""","""visitor"""
2022100908,3537,"""TB""","""Defense""","""home"""
2022091103,3126,"""PIT""","""Offense""","""visitor"""
2022091103,3126,"""CIN""","""Defense""","""home"""
2022091111,1148,"""LV""","""Offense""","""visitor"""
2022091111,1148,"""LAC""","""Defense""","""home"""
2022100212,2007,"""DEN""","""Offense""","""visitor"""
2022100212,2007,"""LV""","""Defense""","""home"""
2022091900,1372,"""BUF""","""Offense""","""home"""
2022091900,1372,"""TEN""","""Defense""","""visitor"""


In [29]:
# Preview the combined tracking table.
tracks

gameId,playId,nflId,displayName,frameId,time,jerseyNumber,club,playDirection,x,y,s,a,dis,o,dir,event
i64,i64,f64,str,i64,str,f64,str,str,f64,f64,f64,f64,f64,f64,f64,str
2022090800,56,35472.0,"""Rodger Saffold…",1,"""2022-09-08 20:…",76.0,"""BUF""","""left""",88.37,27.27,1.62,1.15,0.16,231.74,147.9,
2022090800,56,35472.0,"""Rodger Saffold…",2,"""2022-09-08 20:…",76.0,"""BUF""","""left""",88.47,27.13,1.67,0.61,0.17,230.98,148.53,"""pass_arrived"""
2022090800,56,35472.0,"""Rodger Saffold…",3,"""2022-09-08 20:…",76.0,"""BUF""","""left""",88.56,27.01,1.57,0.49,0.15,230.98,147.05,
2022090800,56,35472.0,"""Rodger Saffold…",4,"""2022-09-08 20:…",76.0,"""BUF""","""left""",88.64,26.9,1.44,0.89,0.14,232.38,145.42,
2022090800,56,35472.0,"""Rodger Saffold…",5,"""2022-09-08 20:…",76.0,"""BUF""","""left""",88.72,26.8,1.29,1.24,0.13,233.36,141.95,
2022090800,56,35472.0,"""Rodger Saffold…",6,"""2022-09-08 20:…",76.0,"""BUF""","""left""",88.8,26.7,1.15,1.42,0.12,234.48,139.41,"""pass_outcome_c…"
2022090800,56,35472.0,"""Rodger Saffold…",7,"""2022-09-08 20:…",76.0,"""BUF""","""left""",88.87,26.64,0.93,1.69,0.09,235.77,134.32,
2022090800,56,35472.0,"""Rodger Saffold…",8,"""2022-09-08 20:…",76.0,"""BUF""","""left""",88.91,26.59,0.68,1.74,0.07,240.0,131.01,
2022090800,56,35472.0,"""Rodger Saffold…",9,"""2022-09-08 20:…",76.0,"""BUF""","""left""",88.94,26.57,0.42,1.74,0.04,243.56,122.29,
2022090800,56,35472.0,"""Rodger Saffold…",10,"""2022-09-08 20:…",76.0,"""BUF""","""left""",88.95,26.58,0.14,1.83,0.01,246.07,85.87,


In [31]:
# Columns carried forward from the tracking data before the offense/defense label is attached.
cols = ["gameId", "playId", "team", "nflId", "position", "frameId", "x", "y"]

# NOTE(review): every row is tagged "Passing", yet `plays` has a passResult column --
# confirm that non-pass plays should not be labelled differently.
new_data_final = (
    tracks
    .rename({"club": "team"})
    # Add each player's position.
    .join(players_to_merge, on="nflId", how="left")
    # Drop the rows that track the ball rather than a player.
    .filter(pl.col("team") != "football")
    .with_columns(pl.col("team").str.replace("away", "visitor"))
    .select(cols)
    # Label each tracked team as Offense or Defense for the play.
    .join(game_play, on=["gameId", "playId", "team"], how="left")
    .drop("HV")
    .with_columns(pl.lit("Passing").alias("PlayType"))
    .select(["PlayType", "gameId", "playId", "team", "OffDef", "nflId", "position", "frameId", "x", "y"])
    # Keep only the odd-numbered frames.
    .filter(pl.col("frameId") % 2 == 1)
)

In [33]:
# Sanity check: list the distinct PlayType values present in the final table.
new_data_final.select("PlayType").unique()

PlayType
str
"""Passing"""


In [41]:
# Persist the processed table as parquet inside the dataset folder.
new_data_final.write_parquet(path+"/processed_df.parquet")