In [4]:
import os
import pandas as pd
import numpy as np
import polars as pl
import nfl_data_py as nfl

env = "local"

In [5]:
if env == "local":
    os.chdir("/Users/samuel/Documents/GitHub/QB-GPT/")
else:
    from google.colab import drive
    drive.mount('/content/gdrive')
    os.chdir("/content/gdrive/MyDrive/NFL_Challenge/NFL-GPT/NFL data")

In [10]:
years_to_get = [2017, 2018, 2019, 2020, 2021, 2022]
season_data = pl.from_pandas(nfl.import_pbp_data(years_to_get))
rosters = pl.from_pandas(nfl.import_seasonal_rosters(years_to_get))

2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
Downcasting floats.


In [8]:
plays_index = pl.read_parquet("index/plays_index.parquet")
positions_index = pl.read_parquet("index/positions_index.parquet")
OL_df = pl.DataFrame({"position" : ["OL"],
                      "position_ID" : [28],
                      "Cat" : ["Pos"]})

new_positions_index = pl.concat([positions_index, OL_df])

In [11]:
team_index = (season_data.
              select("home_team").
              rename({"home_team" : "team"}).
              unique().
              with_columns(pl.when(pl.col("team") == "OAK").
                           then(pl.lit("LV")).
                           otherwise(pl.col("team")).
                           alias("team")).
              with_columns(pl.arange(0, 32).alias("team_ID")))

season,season_ID
i64,i32
2017,0
2018,1
2019,2
2020,3
2021,4
2022,5


In [12]:
yards_index = (pl.DataFrame({"yards_gained" : range(-99, 100), 
                            "yard_ID" : range(0, 199)}).
                with_columns(pl.col("yards_gained").cast(pl.Int32)).
                with_columns(pl.col("yard_ID").cast(pl.Int32)))

In [21]:
season_index = (pl.DataFrame({"season" : [2017, 2018, 2019, 2020, 2021, 2022], 
                            "season_ID" : [0, 1, 2, 3, 4, 5]}).
                with_columns(pl.col("season").cast(pl.Int64)).
                with_columns(pl.col("season_ID").cast(pl.Int32)))

In [22]:
spec_data = (season_data.
             select("season", "old_game_id", "play_id", "home_team", "away_team", "posteam", "defteam", "down", "offense_players", "defense_players", "play_type", "yards_gained").
             filter(pl.col("play_type").is_not_null()).
             filter(pl.col("play_type") != "no_play").
             filter(pl.col("offense_players") != "").
             with_columns(pl.when(pl.col("down").is_null()).
                          then(pl.lit(0.0)).
                          otherwise(pl.col("down")).
                          alias("down")).
             filter(pl.col("play_type").
                    is_in(["run", "pass"])).
             with_columns(pl.when(pl.col("yards_gained") > 0).
                          then(pl.lit(1.0)).
                          otherwise(pl.lit(0.0)).
                          alias("Success")).
             melt(id_vars = ["season", "old_game_id", "play_id", "home_team", "away_team", "posteam", "defteam", "down", "play_type", "yards_gained", "Success"], 
                  value_vars = ["offense_players", "defense_players"],
                  variable_name = "team",
                  value_name = "players").
             with_columns(pl.col("players").str.split(";")).
             explode("players").
             rename({"players" : "player_id"}).
             join(rosters.
                  select("season", "player_id", "position").
                  with_columns(pl.col("season").cast(pl.Int64)).
                  unique(),
                  how = "left",
                  on = ["season", "player_id"]).
             with_columns(pl.col("team").str.replace("_players", "")).
             drop("player_id").
             join(new_positions_index.
                  drop("Cat"), 
                  on = "position",
                  how = "left").
             drop("position").
             group_by("season", "old_game_id", "play_id", "home_team", "away_team", "posteam", "defteam", "down", "play_type", "yards_gained", "Success", "team").
             agg(pl.col("position_ID")).
             rename({"team" : "OffDef"}).
             with_columns(pl.when(pl.col("OffDef") == "offense").
                          then(pl.col("posteam")).
                          otherwise(pl.col("defteam")).
                          alias("team")).
             drop("home_team", "away_team", "posteam", "defteam").
             with_columns(pl.when(pl.col("team") == "OAK").
                          then(pl.lit("LV")).
                          otherwise(pl.col("team")).
                          alias("team")).
             with_columns(pl.when(pl.col("OffDef") == "offense").
                          then(pl.lit(1)).
                          otherwise(pl.lit(0)).
                          alias("OffDef_ID")).
             drop("OffDef").
             join(plays_index.
                  rename({"PlayType" : "play_type"}),
                  on = "play_type",
                  how = "left").
             drop("play_type").
             join(team_index,
                  on = "team",
                  how = "left").
             drop("team").
             with_columns(pl.col("down").cast(pl.Int32).alias("down_ID")).
             drop("down").
             rename({"old_game_id" : "gameId",
                     "play_id" : "playId"}).
              with_columns(pl.col("position_ID").list.lengths().alias("Length")).
              filter(pl.col("Length") == 11).
              drop("Length").
              with_columns(pl.col("gameId").cast(pl.Int32)).
              with_columns(pl.col("playId").cast(pl.Int32)).
              with_columns(pl.col("yards_gained").cast(pl.Int32)).
              join(yards_index,
                   on = "yards_gained",
                   how = "left").
              drop("yards_gained").
              join(season_index, 
                   on = "season",
                   how = "left").
              drop("season"))

In [23]:
new_data = (spec_data.
            select("gameId", "playId", "OffDef_ID").
            unique().
            group_by("gameId", "playId").
            count().
            filter(pl.col("count") == 2).
            drop("count").
            join(spec_data,
                 on = ["gameId", "playId"],
                 how = "left"))

In [43]:
new_data.select("down_ID").unique()

down_ID
i32
0
1
2
3
4


In [27]:
from sklearn.model_selection import train_test_split

train_test_df = (new_data.
                 select("season_ID", "gameId", "playId").
                 unique()).to_pandas()

train, test = train_test_split(train_test_df, test_size= 0.3, stratify = train_test_df["season_ID"].to_numpy())

In [28]:
train_data = (pl.from_pandas(train).
              join(new_data,
                   on = ["season_ID", "gameId", "playId"],
                   how = "left"))

test_data = (pl.from_pandas(test).
             join(new_data,
                  on = ["season_ID", "gameId", "playId"],
                  how = "left"))

In [30]:
pos_val = 0
scrim_val = 99
start_val = 1983

In [33]:
train_seq_dict = {str(row["gameId"]) + "_" + str(row["playId"]) + "_" + str(row["OffDef_ID"]) : 
    {"input_ids" : [11164 for i in range(len(row["position_ID"]))],
     "position_ids": row["position_ID"],
     "OffDef" : [row["OffDef_ID"] for i in range(len(row["position_ID"]))],
     "token_type_ids" : [0 for i in range(len(row["position_ID"]))],
     "pos_ids" : [pos_val for i in range(len(row["position_ID"]))],
     "team_ID" : [row["team_ID"] for i in range(len(row["position_ID"]))],
     "start_ids" : [start_val for i in range(len(row["position_ID"]))],
     "scrim_ids" : [scrim_val for i in range(len(row["position_ID"]))],
     "attention_mask" : [1 for i in range(len(row["position_ID"]))],
     "down_ID" : [row["down_ID"] for i in range(len(row["position_ID"]))],
     "season_ID" : [row["season_ID"] for i in range(len(row["position_ID"]))],
     "yard_ID" : row["yard_ID"],
     "Success" : row["Success"],
     "playId" : row["playId"],
     "gameId" : row["gameId"]} for row in train_data.iter_rows(named=True)}

test_seq_dict = {str(row["gameId"]) + "_" + str(row["playId"]) + "_" + str(row["OffDef_ID"]) : 
    {"input_ids" : [11164 for i in range(len(row["position_ID"]))],
     "position_ids": row["position_ID"],
     "OffDef" : [row["OffDef_ID"] for i in range(len(row["position_ID"]))],
     "token_type_ids" : [0 for i in range(len(row["position_ID"]))],
     "pos_ids" : [pos_val for i in range(len(row["position_ID"]))],
     "team_ID" : [row["team_ID"] for i in range(len(row["position_ID"]))],
     "start_ids" : [start_val for i in range(len(row["position_ID"]))],
     "scrim_ids" : [scrim_val for i in range(len(row["position_ID"]))],
     "attention_mask" : [1 for i in range(len(row["position_ID"]))],
     "down_ID" : [row["down_ID"] for i in range(len(row["position_ID"]))],
     "season_ID" : [row["season_ID"] for i in range(len(row["position_ID"]))],
     "yard_ID" : row["yard_ID"],
     "Success" : row["Success"],
     "playId" : row["playId"],
     "gameId" : row["gameId"]} for row in test_data.iter_rows(named=True)}

In [34]:
train_common_keys = [str(row["gameId"]) + "_" + str(row["playId"]) for row in train_data.iter_rows(named=True)]
train_off_keys = [v + "_1" for v in train_common_keys]
train_def_keys = [v + "_0" for v in train_common_keys]

test_common_keys = [str(row["gameId"]) + "_" + str(row["playId"]) for row in test_data.iter_rows(named=True)]
test_off_keys = [v + "_1" for v in test_common_keys]
test_def_keys = [v + "_0" for v in test_common_keys]

train_off_seq = [train_seq_dict[v] for v in train_off_keys]
train_def_seq = [train_seq_dict[v] for v in train_def_keys]

test_off_seq = [test_seq_dict[v] for v in test_off_keys]
test_def_seq = [test_seq_dict[v] for v in test_def_keys]

In [35]:
from tqdm import tqdm

def compile_seq(list_of_trajs):
    merged_dict = {k : [] for k in list_of_trajs[0].keys()}

    with tqdm(total=len(list_of_trajs)) as pbar:
      for d in list_of_trajs:
        for key, value in d.items():
          merged_dict[key] += [value]
        pbar.update(1)
        
    merged_dict = {k: np.array(v) for k,v in merged_dict.items()}
    return merged_dict

In [36]:
train_OFF = compile_seq(train_off_seq)
train_DEF = compile_seq(train_def_seq)

test_OFF = compile_seq(test_off_seq)
test_DEF = compile_seq(test_def_seq)

100%|██████████| 279612/279612 [00:00<00:00, 952239.92it/s]
100%|██████████| 279612/279612 [00:00<00:00, 1044583.33it/s]
100%|██████████| 119836/119836 [00:00<00:00, 942791.63it/s]
100%|██████████| 119836/119836 [00:00<00:00, 1023052.20it/s]


In [37]:
train = {"off" : train_OFF,
         "def" : train_DEF}

test = {"off" : test_OFF,
         "def" : test_DEF}

In [38]:
import tensorflow as tf

train_labels = train_OFF["yard_ID"]
train_dataset = tf.data.Dataset.from_tensor_slices((train, train_labels))

test_labels = test_OFF["yard_ID"]
test_dataset = tf.data.Dataset.from_tensor_slices((test, test_labels))

tf.data.Dataset.save(train_dataset, "data_models/play_pred_categ/train_play_prediction_categ")
tf.data.Dataset.save(test_dataset, "data_models/play_pred_categ/test_play_prediction_categ")

In [39]:
train_labels = train_OFF["Success"]
train_dataset = tf.data.Dataset.from_tensor_slices((train, train_labels))

test_labels = test_OFF["Success"]
test_dataset = tf.data.Dataset.from_tensor_slices((test, test_labels))

tf.data.Dataset.save(train_dataset, "data_models/play_pred_binary/train_play_prediction_binary")
tf.data.Dataset.save(test_dataset, "data_models/play_pred_binary/test_play_prediction_binary")