In [40]:
import os
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import polars as pl

# Execution-environment switch read by the next cell: "local" changes into the
# developer checkout; any other value takes the Google Colab branch.
env = "local"

In [41]:
# Move into the project root so the relative paths used later in the notebook
# ("data_models/...", "models/...") resolve. The location can be overridden with
# the QBGPT_ROOT environment variable; when it is unset the original hard-coded
# locations are used, so existing setups keep working unchanged.
if env == "local":
    project_root = os.environ.get("QBGPT_ROOT", "/Users/samuel/Documents/GitHub/QB-GPT/")
else:
    # Any value other than "local" is treated as Google Colab: mount Drive first.
    from google.colab import drive
    drive.mount('/content/gdrive')
    project_root = os.environ.get(
        "QBGPT_ROOT", "/content/gdrive/MyDrive/NFL_Challenge/NFL-GPT/NFL data"
    )

os.chdir(project_root)

In [42]:
# Sanity check: show the contents of the current working directory.
os.listdir(os.curdir)

['data_models',
 '.DS_Store',
 'app',
 'LICENSE',
 'models',
 'README.md',
 '.gitignore',
 '.gitattributes',
 'data_preprocessing',
 'index',
 '.git',
 'notebooks']

In [43]:
def _dataset_length(dataset):
    """Return the number of elements in a finite ``tf.data.Dataset``.

    The dataset's cardinality is used when TensorFlow knows it; otherwise the
    elements are counted by streaming through the dataset once. Neither path
    materialises a Python list of indices.

    Raises:
        ValueError: if the dataset reports infinite cardinality.
    """
    cardinality = int(dataset.cardinality().numpy())
    if cardinality >= 0:
        return cardinality
    if cardinality == tf.data.INFINITE_CARDINALITY:
        raise ValueError("Cannot compute the length of an infinite dataset")
    # Unknown cardinality: fall back to a single streaming count.
    return int(dataset.reduce(np.int64(0), lambda count, _: count + 1).numpy())


training_data = tf.data.Dataset.load("data_models/Helenos/train_data")
testing_data = tf.data.Dataset.load("data_models/Helenos/test_data")

train_length = _dataset_length(training_data)
test_length = _dataset_length(testing_data)

print("Train length is : ", str(train_length))
print("Test length is : ", str(test_length))

batch_size = 32

# The shuffle buffer is as large as the dataset itself, then elements are
# grouped into batches of `batch_size`.
training_data = training_data.shuffle(train_length).batch(batch_size)
testing_data = testing_data.shuffle(test_length).batch(batch_size)

Train length is :  141673
Test length is :  60718


In [44]:
from models.modeling.QBGPT.models import QBGPT, LargeQBGPT

# Vocabulary / output sizes passed to every QB-GPT variant.
moves_to_pred = 10876
input_size = 10878
starts_size = 1033
scrimmage_size = 100
positions_id = 29

temp_ids = 52
off_def_size = 2
token_type_size = 3
play_type_size = 9

# Keyword arguments common to all four variants; only the embedding/hidden
# widths (and the class used for the large model) differ between them.
qbgpt_shared_kwargs = dict(
    input_vocab_size=input_size,
    positional_vocab_size=temp_ids,
    position_vocab_size=positions_id,
    start_vocab_size=starts_size,
    scrimmage_vocab_size=scrimmage_size,
    offdef_vocab_size=off_def_size,
    type_vocab_size=token_type_size,
    playtype_vocab_size=play_type_size,
    to_pred_size=moves_to_pred,
)

# Instantiated in the same order as before: large, medium, small, tiny.
model_large = LargeQBGPT(embedding_dim=256, hidden_dim=256, **qbgpt_shared_kwargs)
model_medium = QBGPT(embedding_dim=256, hidden_dim=256, **qbgpt_shared_kwargs)
model_small = QBGPT(embedding_dim=128, hidden_dim=128, **qbgpt_shared_kwargs)
model_tiny = QBGPT(embedding_dim=64, hidden_dim=64, **qbgpt_shared_kwargs)



In [45]:
# Restore each QB-GPT variant from its checkpoint prefix, smallest model first.
for qb_model, checkpoint_prefix in (
    (model_tiny, "models/modeling/QBGPT/weights/model_tiny/QBGPT"),
    (model_small, "models/modeling/QBGPT/weights/model_small/QBGPT"),
    (model_medium, "models/modeling/QBGPT/weights/model_medium/QBGPT"),
    (model_large, "models/modeling/QBGPT/weights/model_large/QBGPT"),
):
    load_status = qb_model.load_weights(checkpoint_prefix)

# Echo the status object of the last load (the large model) as the cell output.
load_status

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x331103150>

In [46]:
from models.modeling.StratFormer.models import StratEncoder

# Arguments shared by every StratEncoder; each encoder only differs in its
# hidden width and in which QB-GPT model supplies the base encoder.
strat_shared_kwargs = dict(
    num_spec_token=1,
    team_vocab_size=32,
    player_vocab_size=7229,
    season_vocab_size=7,
    down_vocab_size=5,
)

tiny_encoder = StratEncoder(hidden_dim=64, base_encoder=model_tiny.Encoder, **strat_shared_kwargs)
small_encoder = StratEncoder(hidden_dim=128, base_encoder=model_small.Encoder, **strat_shared_kwargs)
medium_encoder = StratEncoder(hidden_dim=256, base_encoder=model_medium.Encoder, **strat_shared_kwargs)
large_encoder = StratEncoder(hidden_dim=256, base_encoder=model_large.Encoder, **strat_shared_kwargs)



In [47]:
# Restore each StratFormer encoder from its checkpoint path, smallest first.
for strat_encoder, checkpoint_path in (
    (tiny_encoder, "models/modeling/StratFormer/weights/stratformer_tiny/StratFormer/"),
    (small_encoder, "models/modeling/StratFormer/weights/stratformer_small/StratFormer/"),
    (medium_encoder, "models/modeling/StratFormer/weights/stratformer_medium/StratFormer/"),
    (large_encoder, "models/modeling/StratFormer/weights/stratformer_large/StratFormer/"),
):
    strat_load_status = strat_encoder.load_weights(checkpoint_path)

# Echo the status object of the last load (the large encoder) as the cell output.
strat_load_status

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x2c26f9bd0>

## Encoding

### Offense and defense encoding

In [48]:
def convert_success(value):
    """Map ``value`` element-wise to an int32 flag: 1 where it is non-zero, else 0.

    NOTE(review): every non-zero entry maps to 1, negative ones included —
    confirm this is the intended definition of "success".
    """
    is_nonzero = value != 0
    return tf.cast(is_nonzero, dtype=tf.int32)

In [51]:
from tqdm import tqdm
import gc

encodings = []

# Encode every training batch with the tiny StratFormer encoder and keep the
# slice at index 0 along axis 1 of its output for both sides of the play.
# NOTE(review): index 0 is presumably the special token (num_spec_token=1) — confirm.
for batch in tqdm(training_data, desc="Processing", total=len(training_data), ncols=100):
    offense_inputs = batch["off"]
    defense_inputs = batch["def"]

    off_encoding = tiny_encoder(offense_inputs)[:, 0, :]
    def_encoding = tiny_encoder(defense_inputs)[:, 0, :]

    # Force a garbage-collection pass after each batch.
    _ = gc.collect()

    yards_gained = offense_inputs["yards_gained"]
    batch_record = {
        "gameId": offense_inputs["gameId"],
        "playId": offense_inputs["playId"],
        "Off": off_encoding,
        "Def": def_encoding,
        "playtype": offense_inputs["PlayType"],
        "Success": convert_success(yards_gained),
        "yards_gained": yards_gained,
    }
    encodings.append(batch_record)

Processing: 100%|███████████████████████████████████████████████| 4428/4428 [10:25<00:00,  7.08it/s]


In [53]:
test_encodings = []

# Same encoding pass as for the training data, applied to the test batches.
# Identifiers, play type and yards are all read from the offense-side inputs.
for batch in tqdm(testing_data, desc="Processing", total=len(testing_data), ncols=100):
    offense_inputs = batch["off"]
    defense_inputs = batch["def"]

    off_encoding = tiny_encoder(offense_inputs)[:, 0, :]
    def_encoding = tiny_encoder(defense_inputs)[:, 0, :]

    # Force a garbage-collection pass after each batch.
    _ = gc.collect()

    yards_gained = offense_inputs["yards_gained"]
    batch_record = {
        "gameId": offense_inputs["gameId"],
        "playId": offense_inputs["playId"],
        "Off": off_encoding,
        "Def": def_encoding,
        "playtype": offense_inputs["PlayType"],
        "Success": convert_success(yards_gained),
        "yards_gained": yards_gained,
    }
    test_encodings.append(batch_record)

Processing: 100%|███████████████████████████████████████████████| 1898/1898 [04:25<00:00,  7.16it/s]


In [54]:
def unbatch_dict(d : dict):
    """Split one batched encoding dict into a list of per-row dicts.

    Args:
        d: mapping with the keys "gameId", "playId", "Off", "Def", "playtype",
            "Success" and "yards_gained". Values may be tensors or arrays;
            each is converted to a NumPy array once. "Off" and "Def" are
            joined along axis 1, so they must be at least 2-D.

    Returns:
        A list with one dict per row of ``d["Success"]``, holding the row's
        identifiers, the concatenated Off/Def vector ("concatenated_vector"),
        their difference Off - Def ("sub_vector"), the smallest distinct value
        found in the row's "playtype" entry, the success flag and the yards.
    """
    # Convert every field exactly once; doing this inside the per-row loop
    # repeated the full conversion for each row of the batch.
    off = np.asarray(d["Off"])
    defense = np.asarray(d["Def"])
    game_ids = np.asarray(d["gameId"])
    play_ids = np.asarray(d["playId"])
    playtypes = np.asarray(d["playtype"])
    success = np.asarray(d["Success"])
    yards = np.asarray(d["yards_gained"])

    conc_vector = np.concatenate([off, defense], axis=1)
    sub_vector = off - defense

    l_of_b = [{"gameId" : game_ids[i],
               "playId" : play_ids[i],
               "concatenated_vector" : conc_vector[i],
               "sub_vector" : sub_vector[i],
               # np.unique returns sorted distinct values, so [0] is the smallest one.
               "playtype" : np.unique(playtypes[i])[0],
               "Success" : success[i],
               "yards" : yards[i]} for i in range(success.shape[0])]
    return l_of_b

def append_id(d, i):
    """Return a shallow copy of ``d`` with an extra ``'id'`` entry set to ``i``.

    The input mapping itself is left unmodified.
    """
    tagged = d.copy()
    tagged.update({'id': i})
    return tagged

In [55]:
from tqdm import tqdm

def compile_seq(list_of_trajs):
    """Merge a list of per-row dicts into one dict of stacked NumPy arrays.

    Args:
        list_of_trajs: sequence of dicts. The keys of the first dict define
            the output keys; a later dict carrying a key that the first one
            lacks raises ``KeyError``.

    Returns:
        A dict mapping each key to ``np.array`` of the values collected for
        that key, in input order. An empty input yields an empty dict instead
        of raising ``IndexError``.
    """
    if len(list_of_trajs) == 0:
        return {}

    merged_dict = {k : [] for k in list_of_trajs[0].keys()}

    # tqdm infers the total from len(list_of_trajs) and advances once per item.
    for d in tqdm(list_of_trajs):
        for key, value in d.items():
            merged_dict[key].append(value)

    return {k: np.array(v) for k, v in merged_dict.items()}

In [56]:
# Expand every batched record into per-row dicts and flatten them into a
# single list, first for the training encodings and then for the test ones.
encodings_unb = [row for batched in encodings for row in unbatch_dict(batched)]
print(len(encodings_unb))

test_encodings_unb = [row for batched in test_encodings for row in unbatch_dict(batched)]
print(len(test_encodings_unb))

141673
60718


In [57]:
# Tag every row with its position in the list as a running 'id'.
encodings_unb = [append_id(row, idx) for idx, row in enumerate(encodings_unb)]
test_encodings_unb = [append_id(row, idx) for idx, row in enumerate(test_encodings_unb)]

In [58]:
# Collapse the per-row dicts into one dict of arrays per split.
train_dataset = compile_seq(list_of_trajs=encodings_unb)
test_dataset = compile_seq(list_of_trajs=test_encodings_unb)

100%|██████████| 141673/141673 [00:00<00:00, 1407229.71it/s]
100%|██████████| 60718/60718 [00:00<00:00, 1623951.83it/s]


In [59]:
# Wrap each dict of arrays in a tf.data.Dataset (one element per row) and
# write it to disk under data_models/Helenos.
train_total = tf.data.Dataset.from_tensor_slices(train_dataset)
train_total.save("data_models/Helenos/encoded_train_tiny")

test_total = tf.data.Dataset.from_tensor_slices(test_dataset)
test_total.save("data_models/Helenos/encoded_test_tiny")