In [1]:
!pip install --upgrade polars
!pip install nfl_data_py

Collecting polars
  Downloading polars-0.18.15-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.6/19.6 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: polars
  Attempting uninstall: polars
    Found existing installation: polars 0.17.3
    Uninstalling polars-0.17.3:
      Successfully uninstalled polars-0.17.3
Successfully installed polars-0.18.15
Collecting nfl_data_py
  Downloading nfl_data_py-0.3.0.tar.gz (1.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting fastparquet>0.5 (from nfl_data_py)
  Downloading fastparquet-2023.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m16.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollec

In [2]:
import os
import pandas as pd
import polars as pl
import nfl_data_py as nfl

# Mount Google Drive at /content/gdrive; the data folder that the notebook
# changes into next lives underneath this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Work from the "NFL data" folder on the mounted drive: later cells use paths
# such as "asonty" that are relative to this directory.
os.chdir("/content/gdrive/MyDrive/NFL_Challenge/NFL-GPT/NFL data")

In [4]:
# Show the entries of the current working directory.
os.listdir()

['chromedriver',
 '.DS_Store',
 'Contact Detection',
 'Punt Prediction',
 'Analytics',
 'Impact Detection',
 'data bowl 2021',
 'data bowl 2023',
 'data bowl 2022',
 'data bowl 2020',
 'asonty',
 'Highlights_NGS_2019',
 'Highlights_NGS_Prime',
 'geckodriver.log',
 'final_df.parquet']

**2023 Data Bowl**

In [5]:
# Folder (relative to the working directory) holding the per-play .tsv files.
# `path` is reused by later cells to list the inputs and to write the result.
path = "asonty"
# Peek at the first ten entries only; the folder holds several hundred files.
os.listdir(path)[0:10]

['2018_IND_2018102808_2525.tsv',
 '2018_ARI_2018111807_973.tsv',
 '2018_CIN_2018121604_4089.tsv',
 '2018_DEN_2018123013_1861.tsv',
 '2018_JAX_2018121606_1977.tsv',
 '2018_BUF_2018112501_875.tsv',
 '2018_CLE_2018122305_3615.tsv',
 '2018_CIN_2018090902_2824.tsv',
 '2018_DET_2018093002_3466.tsv',
 '2018_CLE_2018122305_1246.tsv']

In [6]:
import csv
import numpy as np

def get_data(path):
  """Read one tab-separated file into a polars DataFrame.

  The first row of the file supplies the column names and each following row
  becomes one record. Values are not parsed here: every cell stays the string
  that csv.reader produced.

  Args:
    path: Location of the .tsv file to read.

  Returns:
    A polars DataFrame converted from the pandas frame built from the rows.
  """
  # newline="" is what the csv module requires of the file object so that
  # line endings inside quoted fields are handled by the reader itself.
  with open(path, newline="") as file:
    rows = list(csv.reader(file, delimiter="\t"))

  header, records = rows[0], rows[1:]
  # Build the frame straight from the row lists. Going through np.array()
  # first copied every cell into a fixed-width unicode array (each cell as
  # wide as the longest string in the file) without changing the result.
  df = pd.DataFrame(records, columns=header)
  return pl.from_pandas(df)

# Raw tracking columns kept by default when a file is processed; this list is
# the default column selection of process_data and data_extract.
to_keep = [
    'gameId', 'playId', 'playType', 'teamAbbr', 'possessionFlag',
    'gsisId', 'position', 'frame', 'x', 'y',
]

def process_data(df, selected_cols = to_keep):
  """Clean one raw tracking frame and return it in a fixed column layout.

  Steps, in order:
    1. Keep only rows whose displayName is not "ball", then keep only
       `selected_cols`.
    2. Cast gameId / playId / frame to Int64 (via Float64) and
       possessionFlag / x / y to Float64.
    3. Rename playType -> PlayType, teamAbbr -> team, frame -> frameId.
    4. Replace possessionFlag with an OffDef label: "Offense" where the flag
       equals 1, "Defense" otherwise.
    5. Keep only rows with an odd frameId.
    6. Rewrite the raw play_type_* labels in PlayType with short names
       (sacks are labelled "Passing").

  Args:
    df: polars DataFrame as returned by get_data; it must contain a
      displayName column in addition to `selected_cols`.
    selected_cols: Raw column names to keep before cleaning.

  Returns:
    A polars DataFrame with the columns PlayType, gameId, playId, team,
    OffDef, gsisId, position, frameId, x, y.
  """
  # Step 1: player rows only, restricted to the requested raw columns.
  players = df.filter(pl.col("displayName") != "ball").select(selected_cols)

  # Step 2 + 3: numeric casts, then the renames. Integer-like columns go
  # through Float64 before Int64, exactly as the values arrive as strings.
  integer_casts = [pl.col(name).cast(pl.Float64).cast(pl.Int64)
                   for name in ("gameId", "playId", "frame")]
  float_casts = [pl.col(name).cast(pl.Float64)
                 for name in ("possessionFlag", "x", "y")]
  typed = (players
           .with_columns(integer_casts + float_casts)
           .rename({"playType": "PlayType",
                    "teamAbbr": "team",
                    "frame": "frameId"}))

  # Step 4: derive the side label, after which the flag is no longer needed.
  side_label = (pl.when(pl.col("possessionFlag") == 1)
                .then(pl.lit("Offense"))
                .otherwise(pl.lit("Defense"))
                .alias("OffDef"))
  labelled = typed.with_columns(side_label).drop("possessionFlag")

  # Step 5: odd frames only.
  odd_frames = labelled.filter((pl.col("frameId") % 2) == 1)

  # Step 6: apply the label rewrites one after another, in this order.
  play_type = pl.col("PlayType")
  for raw_label, short_label in (("play_type_pass", "Passing"),
                                 ("play_type_rush", "Rushing"),
                                 ("play_type_sack", "Passing"),
                                 ("play_type_kickoff", "Kickoff"),
                                 ("play_type_punt", "Punt"),
                                 ("play_type_two_point_conversion", "two_point")):
    play_type = play_type.str.replace(raw_label, short_label)

  return (odd_frames
          .with_columns(play_type)
          .select("PlayType", "gameId", "playId", "team", "OffDef",
                  "gsisId", "position", "frameId", "x", "y"))

def data_extract(path, cols = to_keep):
  """Load one .tsv file with get_data and clean it with process_data.

  Args:
    path: Location of the .tsv file to read.
    cols: Raw column names handed to process_data as its column selection.

  Returns:
    The cleaned polars DataFrame produced by process_data.
  """
  return process_data(get_data(path), cols)

In [None]:
from tqdm import tqdm

# Entries of the data folder that are not per-play inputs:
# "processed_df.parquet" is the file this notebook writes at the end, and
# "ngs_highlights_index.tsv" is presumably an index of the plays rather than
# tracking data (NOTE(review): confirm).
index_path = ["ngs_highlights_index.tsv", "processed_df.parquet"]

# Only .tsv files are handed to the tab-separated reader, so stray entries in
# the folder cannot break the run. os.listdir returns entries in arbitrary
# order; sorting makes the row order of the result reproducible.
to_read = sorted(p for p in os.listdir(path)
                 if p.endswith(".tsv") and p not in index_path)

# Read from the same folder that was listed (previously the prefix was
# hard-coded separately from `path`). The context manager closes the progress
# bar even when a file fails to parse.
with tqdm(to_read, desc="reading", ncols=100) as progress_bar:
  new_data_final = pl.concat(
      [data_extract(os.path.join(path, file_name)) for file_name in progress_bar])

reading: 100%|████████████████████████████████████████████████████| 561/561 [04:05<00:00,  2.29it/s]


In [None]:
# (rows, columns) of the concatenated tracking data.
new_data_final.shape

(2186234, 10)

In [None]:
# Seasons for which roster tables are requested from nfl_data_py.
years_to_get = [2017, 2018, 2019, 2020, 2021, 2022]
# Converted to polars so it can be joined to the tracking data, which uses it
# to look up player ids per season.
rosters = pl.from_pandas(nfl.import_rosters(years_to_get))

In [None]:
# gameId has the form YYYYMMDDnn (e.g. 2018102808), so the calendar year and
# month of the game can be taken from it with integer arithmetic.
game_year = pl.col("gameId") // 1_000_000
game_month = (pl.col("gameId") // 10_000) % 100

# An NFL season runs into the following January/February, so games played in
# those months belong to the previous year's season. Using the calendar year
# alone would join them against the wrong roster year.
season_expr = (pl.when(game_month <= 2)
               .then(game_year - 1)
               .otherwise(game_year)
               .cast(pl.Int32)
               .alias("season"))

# One row per (season, player) on the roster side: a left join against a
# roster table with repeated player-season rows would multiply tracking rows.
roster_ids = (rosters.
              select("season", "player_id", "gsis_it_id").
              rename({"player_id" : "gsisId", "gsis_it_id": "nflId"}).
              unique(subset = ["season", "gsisId"], keep = "first"))

# Swap gsisId for nflId; players without a roster match keep a null nflId.
# NOTE: this cell consumes the gsisId column, so it cannot be re-run without
# re-running the cell that builds new_data_final first.
new_data_final = (new_data_final.
                  with_columns(season_expr).
                  join(roster_ids,
                       on = ["season", "gsisId"],
                       how = "left").
                  drop("gsisId").
                  select("PlayType", "gameId", "playId", "team", "OffDef", "nflId", "position", "frameId", "x", "y"))

In [None]:
# Persist the result inside the data folder itself; the cell that lists the
# input files excludes this file name so it is never read back as an input.
new_data_final.write_parquet(path+"/processed_df.parquet")