In [1]:
!pip install polars nfl_data_py

Collecting nfl_data_py
  Downloading nfl_data_py-0.3.0.tar.gz (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fastparquet>0.5 (from nfl_data_py)
  Downloading fastparquet-2023.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-snappy>0.5 (from nfl_data_py)
  Downloading python_snappy-0.6.1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.9/55.9 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
Collecting cramjam>=2.3 (from fastparquet>0.5->nfl_data_py)
  Downloading cramjam-2.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
import os
import pandas as pd
import polars as pl
import nfl_data_py as nfl

from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
os.chdir("/content/gdrive/MyDrive/NFL_Challenge/NFL-GPT/NFL data")

In [4]:
os.listdir()

['chromedriver',
 '.DS_Store',
 'Contact Detection',
 'Punt Prediction',
 'Analytics',
 'Impact Detection',
 'data bowl 2021',
 'data bowl 2023',
 'data bowl 2022',
 'data bowl 2020',
 'asonty',
 'Highlights_NGS_2019',
 'Highlights_NGS_Prime',
 'geckodriver.log']

In [5]:
season_2019 = nfl.import_pbp_data([2019, 2020])

2019 done.
2020 done.
Downcasting floats.


In [6]:
def format_ref(df):
  selected = df[["old_game_id", "play_id", "play_type", "home_team", "away_team", "posteam", "posteam_type"]].melt(id_vars = ["old_game_id",	"play_id",	"play_type", "posteam",	"posteam_type"], var_name = "team", value_name = "TeamAbbr")
  selected["team"] = [v.replace("_team", "") for v in selected["team"]]
  selected = selected[(selected["play_type"] != "no_play") & (selected["play_type"].notnull())].reset_index(drop = True)
  selected["old_game_id"] = selected["old_game_id"].astype(int)
  selected["play_id"] = selected["play_id"].astype(int)

  selected["to_merge"] = selected["posteam"] == selected["TeamAbbr"]
  selected = selected.drop("posteam", axis = "columns").rename({"to_merge": "possTeam"}, axis = "columns")
  selected["OffDef"] = ["Offense" if v is True else "Defense" for v in selected["possTeam"]]
  selected = selected.drop(["posteam_type",	"team"], axis = "columns").rename({"TeamAbbr": "team", "old_game_id" : "game_id"}, axis = "columns")
  return selected

ref_season = format_ref(season_2019)

In [7]:
ref_season

Unnamed: 0,game_id,play_id,play_type,team,possTeam,OffDef
0,2019090804,36,kickoff,MIN,False,Defense
1,2019090804,51,pass,MIN,False,Defense
2,2019090804,79,run,MIN,False,Defense
3,2019090804,100,run,MIN,False,Defense
4,2019090804,121,punt,MIN,False,Defense
...,...,...,...,...,...,...
166649,2021020700,4256,pass,KC,True,Offense
166650,2021020700,4280,pass,KC,True,Offense
166651,2021020700,4307,qb_kneel,KC,False,Defense
166652,2021020700,4328,qb_kneel,KC,False,Defense


In [None]:
ref_season[ref_season["game_id"] == 2020010501]

Unnamed: 0,game_id,play_id,play_type,team,possTeam,OffDef
39915,2020010501,37,kickoff,PHI,False,Defense
39916,2020010501,76,pass,PHI,False,Defense
39917,2020010501,100,run,PHI,False,Defense
39918,2020010501,137,pass,PHI,False,Defense
39919,2020010501,159,punt,PHI,False,Defense
...,...,...,...,...,...,...
123377,2020010501,3917,run,SEA,True,Offense
123378,2020010501,3955,pass,SEA,True,Offense
123379,2020010501,3996,qb_kneel,SEA,True,Offense
123380,2020010501,4017,qb_kneel,SEA,True,Offense


**2019 concat**

In [8]:
from datetime import datetime

def convert_time(time):
    return datetime.strptime(time, "%Y-%m-%d %H:%M:%S.%f")

In [12]:
path = "Highlights_NGS_2019"

from datetime import datetime

def insert_frame_id(df):
  df.insert(1, "frameId", range(1, df.shape[0]+1))
  return df

def get_data_from_csv(path, reference):
  print(path)
  data = pd.read_csv(path, index_col = "Unnamed: 0")
  filtered = data[data["displayName"] != "ball"].reset_index(drop = True)
  filtered["time"] = filtered["time"].apply(convert_time)

  possible_frames = filtered[["play_id", "time"]].drop_duplicates().sort_values("time").reset_index(drop = True)
  frame_dfs = [possible_frames[possible_frames["play_id"] == v] for v in possible_frames["play_id"].unique()]
  frame_dfs = [insert_frame_id(d) for d in frame_dfs]
  final_df = pd.concat(frame_dfs).reset_index(drop = True)

  final = pd.merge(filtered, final_df, on = ["play_id", "time"], how = "left")
  final["game_id"] = final["game_id"].astype(int)

  if path == "Highlights_NGS_2019/Highlight_19_post.csv":
    final["possTeam"] = final["IsOnOffense"]

  final = final.merge(reference, on = ["game_id", "play_id", "possTeam"], how = "left")[["play_type", "game_id", "play_id", "team", "OffDef", "gsisId", "position", "frameId", "x", "y"]].rename({"play_type": "PlayType", "game_id": "gameId", "play_id": "playId"}, axis = "columns")

  if path == "Highlights_NGS_2019/Highlight_19_post.csv":
    final = final.dropna().reset_index(drop = True)

  final["PlayType"] = final["PlayType"].replace({"kickoff": "Kickoff",
                                                "pass": "Passing",
                                                "run": "Rushing",
                                                "punt": "Punt",
                                                "extra_point": "Extra Point",
                                                "qb_kneel": "Kneel",
                                                "field_goal": "Field Goal",
                                                "qb_spike": "Spike"})



  print(path)
  print(sum(final.isna().sum()))

  return pl.from_pandas(final)

In [13]:
to_read_2019 = os.listdir(path)
avoid = ["Highlight_19_week1.csv", "Highlight_19_week21.csv", "processed_df.parquet"]

to_read_2019 = [v for v in to_read_2019 if v not in avoid]

new_data_final = pl.concat([get_data_from_csv("Highlights_NGS_2019/"+path, ref_season) for path in to_read_2019])
new_data_final = (new_data_final.
                  with_columns(
                      (pl.col("frameId")%2).alias("filter")).
                  filter(pl.col("filter")==1).
                  drop("filter"))

Highlights_NGS_2019/Highlight_19_week5.csv
Highlights_NGS_2019/Highlight_19_week5.csv
0
Highlights_NGS_2019/Highlight_19_week18.csv
Highlights_NGS_2019/Highlight_19_week18.csv
0
Highlights_NGS_2019/Highlight_19_week13.csv
Highlights_NGS_2019/Highlight_19_week13.csv
0
Highlights_NGS_2019/Highlight_19_week15.csv
Highlights_NGS_2019/Highlight_19_week15.csv
0
Highlights_NGS_2019/Highlight_19_week6.csv
Highlights_NGS_2019/Highlight_19_week6.csv
0
Highlights_NGS_2019/Highlight_19_week4.csv
Highlights_NGS_2019/Highlight_19_week4.csv
0
Highlights_NGS_2019/Highlight_19_week20.csv
Highlights_NGS_2019/Highlight_19_week20.csv
0
Highlights_NGS_2019/Highlight_19_week8.csv
Highlights_NGS_2019/Highlight_19_week8.csv
0
Highlights_NGS_2019/Highlight_19_week12.csv
Highlights_NGS_2019/Highlight_19_week12.csv
0
Highlights_NGS_2019/Highlight_19_week2.csv
Highlights_NGS_2019/Highlight_19_week2.csv
0
Highlights_NGS_2019/Highlight_19_week17.csv
Highlights_NGS_2019/Highlight_19_week17.csv
0
Highlights_NGS_2019/

In [14]:
new_data_final.shape

(935095, 10)

In [17]:
years_to_get = [2017, 2018, 2019, 2020, 2021, 2022]
rosters = pl.from_pandas(nfl.import_rosters(years_to_get))

In [29]:
new_data_final = (new_data_final.
                  with_columns(pl.col("gameId").cast(pl.Utf8).str.slice(0, 4).cast(pl.Int32).alias("season")).
                  join((rosters.
                        select("season", "player_id", "gsis_it_id").
                        rename({"player_id" : "gsisId", "gsis_it_id": "nflId"})),
                       on = ["season", "gsisId"],
                       how = "left").
                  drop("gsisId").
                  select("PlayType", "gameId", "playId", "team", "OffDef", "nflId", "position", "frameId", "x", "y"))

In [30]:
new_data_final.write_parquet(path+"/processed_df.parquet")

In [None]:
path

'Highlights_NGS_2019'