In [1]:
import os
import pandas as pd
import numpy as np
import polars as pl
import nfl_data_py as nfl

env = "local"

In [2]:
if env == "local":
    os.chdir("/Users/samuel/Documents/GitHub/QB-GPT/")
else:
    from google.colab import drive
    drive.mount('/content/gdrive')
    os.chdir("/content/gdrive/MyDrive/NFL_Challenge/NFL-GPT/NFL data")

In [15]:
years_to_get = [2017, 2018, 2019, 2020, 2021, 2022, 2023]
season_data = pl.from_pandas(nfl.import_pbp_data(years_to_get))
rosters = pl.from_pandas(nfl.import_seasonal_rosters(years_to_get))

2017 done.
2018 done.
2019 done.
2020 done.
2021 done.
2022 done.
2023 done.
Downcasting floats.


In [44]:
plays_index = pl.read_parquet("index/plays_index.parquet")
positions_index = pl.read_parquet("index/positions_index.parquet")
OL_df = pl.DataFrame({"position" : ["OL"],
                      "position_ID" : [28],
                      "Cat" : ["Pos"]})

new_positions_index = pl.concat([positions_index, OL_df])

In [5]:
team_index = (season_data.
              select("home_team").
              rename({"home_team" : "team"}).
              unique().
              with_columns(pl.when(pl.col("team") == "OAK").
                           then(pl.lit("LV")).
                           otherwise(pl.col("team")).
                           alias("team")).
              with_columns(pl.arange(0, 32).alias("team_ID")))

In [6]:
yards_index = (pl.DataFrame({"yards_gained" : range(-99, 100), 
                            "yard_ID" : range(0, 199)}).
                with_columns(pl.col("yards_gained").cast(pl.Int32)).
                with_columns(pl.col("yard_ID").cast(pl.Int32)))

In [21]:
season_index = (pl.DataFrame({"season" : [2017, 2018, 2019, 2020, 2021, 2022, 2023], 
                            "season_ID" : [0, 1, 2, 3, 4, 5, 6]}).
                with_columns(pl.col("season").cast(pl.Int64)).
                with_columns(pl.col("season_ID").cast(pl.Int32)))

In [22]:
rosters_index = (rosters.
                 select("season", "team", "position", "player_name", "jersey_number", "player_id").
                 unique().
                 with_columns(pl.col("team").str.replace("OAK", "LV")).
                 with_columns(pl.when(pl.col("jersey_number").is_null()).
                              then(pl.lit(0.0)).
                              otherwise(pl.col("jersey_number")).
                              alias("jersey_number")))

index_max = rosters_index.shape[0]

rosters_index = (rosters_index.
                 with_columns(pl.arange(0, index_max).alias("player_ID")))

In [34]:
index_max

21505

In [138]:
spec_data = (season_data.
             select("season", "old_game_id", "play_id", "home_team", "away_team", "posteam", "defteam", "down", "offense_players", "defense_players", "play_type", "yards_gained").
             filter(pl.col("play_type").is_not_null()).
             filter(pl.col("play_type") != "no_play").
             filter(pl.col("offense_players") != "").
             with_columns(pl.when(pl.col("down").is_null()).
                          then(pl.lit(0.0)).
                          otherwise(pl.col("down")).
                          alias("down")).
             filter(pl.col("play_type").
                    is_in(["run", "pass"])).
             with_columns(pl.when(pl.col("yards_gained") > 0).
                          then(pl.lit(1.0)).
                          otherwise(pl.lit(0.0)).
                          alias("Success")).
             melt(id_vars = ["season", "old_game_id", "play_id", "home_team", "away_team", "posteam", "defteam", "down", "play_type", "yards_gained", "Success"], 
                  value_vars = ["offense_players", "defense_players"],
                  variable_name = "team",
                  value_name = "players").
             with_columns(pl.col("players").str.split(";")).
             explode("players").
             rename({"players" : "player_id"}).
             join(rosters.
                  select("season", "player_id", "position").
                  with_columns(pl.col("season").cast(pl.Int64)).
                  unique(),
                  how = "left",
                  on = ["season", "player_id"]).
             with_columns(pl.col("team").str.replace("_players", "")).
             rename({"team" : "OffDef"}).
             with_columns(pl.when(pl.col("OffDef") == "offense").
                          then(pl.col("posteam")).
                          otherwise(pl.col("defteam")).
                          alias("team")).
             drop("home_team", "away_team", "posteam", "defteam").
             with_columns(pl.when(pl.col("team") == "OAK").
                          then(pl.lit("LV")).
                          otherwise(pl.col("team")).
                          alias("team")).
             join(rosters_index.
                  select("season", "team", "player_id", "player_ID").
                  with_columns(pl.col("season").cast(pl.Int64)),
                  on = ["season", "team", "player_id"],
                  how = "left").
             with_columns(pl.col("player_ID").cumcount().over("player_ID").alias("count")).
             with_columns(pl.when(pl.col("count") < 25).
              then(pl.lit(index_max)).
              otherwise(pl.col("player_ID")).
              alias("player_ID")).
             drop("player_id"))

In [139]:
spec_data = (spec_data.
               join(new_positions_index.
                    drop("Cat"), 
                    on = "position",
                    how = "left").
               drop("position").
               group_by("season", "old_game_id", "play_id", "team", "OffDef", "down", "play_type", "yards_gained", "Success").
               agg(pl.col("position_ID"),
                    pl.col("player_ID")).
               with_columns(pl.when(pl.col("OffDef") == "offense").
                              then(pl.lit(1)).
                              otherwise(pl.lit(0)).
                              alias("OffDef_ID")).
               drop("OffDef").
               join(plays_index.
                    rename({"PlayType" : "play_type"}),
                    on = "play_type",
                    how = "left").
               drop("play_type").
               join(team_index,
                    on = "team",
                    how = "left").
               drop("team").
               with_columns(pl.col("down").cast(pl.Int32).alias("down_ID")).
               drop("down").
               rename({"old_game_id" : "gameId",
                         "play_id" : "playId"}).
               with_columns(pl.col("position_ID").list.lengths().alias("Length")).
               filter(pl.col("Length") == 11).
               drop("Length").
               with_columns(pl.col("gameId").cast(pl.Int32)).
               with_columns(pl.col("playId").cast(pl.Int32)).
               with_columns(pl.col("yards_gained").cast(pl.Int32)).
               join(season_index, 
                    on = "season",
                    how = "left").
               drop("season"))

In [140]:
new_data = (spec_data.
            select("gameId", "playId", "OffDef_ID").
            unique().
            group_by("gameId", "playId").
            count().
            filter(pl.col("count") == 2).
            drop("count").
            join(spec_data,
                 on = ["gameId", "playId"],
                 how = "left"))

In [141]:
new_data.select("down_ID").unique()

down_ID
i32
0
1
2
3
4


In [142]:
from sklearn.model_selection import train_test_split

train_test_df = (new_data.
                 select("season_ID", "gameId", "playId").
                 unique()).to_pandas()

train, test = train_test_split(train_test_df, test_size= 0.3, stratify = train_test_df["season_ID"].to_numpy())

In [143]:
train_data = (pl.from_pandas(train).
              join(new_data,
                   on = ["season_ID", "gameId", "playId"],
                   how = "left"))

test_data = (pl.from_pandas(test).
             join(new_data,
                  on = ["season_ID", "gameId", "playId"],
                  how = "left"))

In [144]:
test_data.select("position_ID").to_series().to_list()[0]

[2, 4, 2, 12, 2, 12, 4, 2, 4, 2, 2]

In [145]:
pos_val = 0
scrim_val = 99
start_val = 1032

In [146]:
train_data.null_count()

season_ID,gameId,playId,yards_gained,Success,position_ID,player_ID,OffDef_ID,PlayType_ID,team_ID,down_ID
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0


In [147]:
test_data.columns

['season_ID',
 'gameId',
 'playId',
 'yards_gained',
 'Success',
 'position_ID',
 'player_ID',
 'OffDef_ID',
 'PlayType_ID',
 'team_ID',
 'down_ID']

In [161]:
train_seq_dict = {str(row["gameId"]) + "_" + str(row["playId"]) + "_" + str(row["OffDef_ID"]) : 
    {"input_ids" : [10877 for i in range(len(row["position_ID"]))],
     "player_ids": row["player_ID"],
     "position_ids": row["position_ID"],
     "OffDef" : [row["OffDef_ID"] for i in range(len(row["position_ID"]))],
     "token_type_ids" : [0 for i in range(len(row["position_ID"]))],
     "pos_ids" : [pos_val for i in range(len(row["position_ID"]))],
     "team_ID" : [row["team_ID"] for i in range(len(row["position_ID"]))],
     "start_ids" : [start_val for i in range(len(row["position_ID"]))],
     "scrim_ids" : [scrim_val for i in range(len(row["position_ID"]))],
     "attention_mask" : [1 for i in range(len(row["position_ID"]))],
     "PlayType" : [row["PlayType_ID"] for i in range(len(row["position_ID"]))],
     "down_ID" : [row["down_ID"] for i in range(len(row["position_ID"]))],
     "season_ID" : [row["season_ID"] for i in range(len(row["position_ID"]))],
     "yard_gained" : float(row["yards_gained"]),
     "Success" : row["Success"],
     "playId" : row["playId"],
     "gameId" : row["gameId"]} for row in train_data.iter_rows(named=True)}

test_seq_dict = {str(row["gameId"]) + "_" + str(row["playId"]) + "_" + str(row["OffDef_ID"]) : 
    {"input_ids" : [10877 for i in range(len(row["position_ID"]))],
     "player_ids": row["player_ID"],
     "position_ids": row["position_ID"],
     "OffDef" : [row["OffDef_ID"] for i in range(len(row["position_ID"]))],
     "token_type_ids" : [0 for i in range(len(row["position_ID"]))],
     "pos_ids" : [pos_val for i in range(len(row["position_ID"]))],
     "team_ID" : [row["team_ID"] for i in range(len(row["position_ID"]))],
     "start_ids" : [start_val for i in range(len(row["position_ID"]))],
     "scrim_ids" : [scrim_val for i in range(len(row["position_ID"]))],
     "attention_mask" : [1 for i in range(len(row["position_ID"]))],
     "PlayType" : [row["PlayType_ID"] for i in range(len(row["position_ID"]))],
     "down_ID" : [row["down_ID"] for i in range(len(row["position_ID"]))],
     "season_ID" : [row["season_ID"] for i in range(len(row["position_ID"]))],
     "yard_gained" : float(row["yards_gained"]),
     "Success" : row["Success"],
     "playId" : row["playId"],
     "gameId" : row["gameId"]} for row in test_data.iter_rows(named=True)}

In [162]:
train_common_keys = [str(row["gameId"]) + "_" + str(row["playId"]) for row in train_data.iter_rows(named=True)]
train_off_keys = [v + "_1" for v in train_common_keys]
train_def_keys = [v + "_0" for v in train_common_keys]

test_common_keys = [str(row["gameId"]) + "_" + str(row["playId"]) for row in test_data.iter_rows(named=True)]
test_off_keys = [v + "_1" for v in test_common_keys]
test_def_keys = [v + "_0" for v in test_common_keys]

train_off_seq = [train_seq_dict[v] for v in train_off_keys]
train_def_seq = [train_seq_dict[v] for v in train_def_keys]

test_off_seq = [test_seq_dict[v] for v in test_off_keys]
test_def_seq = [test_seq_dict[v] for v in test_def_keys]

In [163]:
from tqdm import tqdm

def compile_seq(list_of_trajs):
    merged_dict = {k : [] for k in list_of_trajs[0].keys()}

    with tqdm(total=len(list_of_trajs)) as pbar:
      for d in list_of_trajs:
        for key, value in d.items():
          merged_dict[key] += [value]
        pbar.update(1)
        
    merged_dict = {k: np.array(v) for k,v in merged_dict.items()}
    return merged_dict

In [164]:
train_OFF = compile_seq(train_off_seq)
train_DEF = compile_seq(train_def_seq)

test_OFF = compile_seq(test_off_seq)
test_DEF = compile_seq(test_def_seq)

  0%|          | 0/290864 [00:00<?, ?it/s]

100%|██████████| 290864/290864 [00:00<00:00, 821083.23it/s]
100%|██████████| 290864/290864 [00:00<00:00, 855368.60it/s]
100%|██████████| 124656/124656 [00:00<00:00, 638953.17it/s]
100%|██████████| 124656/124656 [00:00<00:00, 835255.94it/s]


In [165]:
train_OFF = {k : v.astype(float) for k,v in train_OFF.items()}
train_DEF = {k : v.astype(float) for k,v in train_DEF.items()}

test_OFF = {k : v.astype(float) for k,v in test_OFF.items()}
test_DEF = {k : v.astype(float) for k,v in test_DEF.items()}

In [166]:
train = {"off" : train_OFF,
         "def" : train_DEF}

test = {"off" : test_OFF,
         "def" : test_DEF}

In [167]:
import tensorflow as tf

train_labels = train_OFF["yard_gained"]
train_dataset = tf.data.Dataset.from_tensor_slices((train, train_labels))

test_labels = test_OFF["yard_gained"]
test_dataset = tf.data.Dataset.from_tensor_slices((test, test_labels))

tf.data.Dataset.save(train_dataset, "data_models/Helenos_categ/train_play_prediction_reg")
tf.data.Dataset.save(test_dataset, "data_models/Helenos_categ/test_play_prediction_reg")

In [168]:
train_labels = train_OFF["Success"]
train_dataset = tf.data.Dataset.from_tensor_slices((train, train_labels))

test_labels = test_OFF["Success"]
test_dataset = tf.data.Dataset.from_tensor_slices((test, test_labels))

tf.data.Dataset.save(train_dataset, "data_models/Helenos_binary/train_play_prediction_binary")
tf.data.Dataset.save(test_dataset, "data_models/Helenos_binary/test_play_prediction_binary")

In [38]:
yards_index

yards_gained,yard_ID
i32,i32
-99,0
-98,1
-97,2
-96,3
-95,4
-94,5
-93,6
-92,7
-91,8
-90,9


In [37]:
from collections import Counter
Counter(train_OFF["yard_ID"])

Counter({99: 67236,
         101: 18886,
         102: 18054,
         103: 17420,
         104: 15910,
         100: 15838,
         105: 13688,
         106: 11574,
         107: 9948,
         108: 9930,
         110: 6820,
         109: 6302,
         98: 5710,
         111: 5418,
         112: 4684,
         113: 4162,
         97: 3884,
         114: 3366,
         115: 3080,
         116: 2616,
         96: 2520,
         117: 2314,
         118: 2044,
         119: 1870,
         95: 1860,
         92: 1654,
         120: 1592,
         94: 1530,
         91: 1500,
         93: 1446,
         121: 1334,
         90: 1186,
         122: 1162,
         123: 1090,
         124: 984,
         89: 794,
         125: 738,
         126: 676,
         127: 628,
         128: 542,
         129: 480,
         88: 452,
         130: 428,
         131: 394,
         132: 382,
         133: 360,
         134: 286,
         135: 272,
         87: 266,
         137: 266,
         136: 242,
  

In [27]:
189040/(90572+189040)

0.6760797104559175