In [1]:
import os
import pandas as pd
import tensorflow as tf
import tensorflow_probability as tfp
import numpy as np
import polars as pl

# Execution-environment switch read by the next cell: "local" changes into the
# local checkout, any other value takes the Google Colab branch.
env = "local"

In [2]:
# Move into the project root so the relative paths used by later cells
# ("data_models/...", "models/...") resolve.
if env != "local":
    # Colab: mount Google Drive first, then switch to the shared data folder.
    from google.colab import drive
    drive.mount('/content/gdrive')
    os.chdir("/content/gdrive/MyDrive/NFL_Challenge/NFL-GPT/NFL data")
else:
    # NOTE(review): machine-specific absolute path; adjust for other checkouts.
    os.chdir("/Users/samuel/Documents/GitHub/QB-GPT/")

In [3]:
# Sanity check: display the entries of the current working directory.
os.listdir()

['data_models',
 '.DS_Store',
 'app',
 'LICENSE',
 'models',
 'README.md',
 '.gitignore',
 '.gitattributes',
 'indexv2',
 'data_preprocessing',
 'index',
 '.git',
 'notebooks']

In [4]:
def _dataset_length(dataset):
    """Return the number of elements of a finite `tf.data.Dataset`.

    The static cardinality is used when TensorFlow reports one. When it is
    reported as a negative sentinel (unknown), the elements are counted lazily
    in a single pass, without materialising a list of indices, and an empty
    dataset yields 0 instead of raising.
    """
    cardinality = int(dataset.cardinality())
    if cardinality >= 0:
        return cardinality
    return sum(1 for _ in dataset)


# Load the saved train / test splits from disk.
training_data = tf.data.Dataset.load("data_models/Helenos/train_data")
testing_data = tf.data.Dataset.load("data_models/Helenos/test_data")

train_length = _dataset_length(training_data)
test_length = _dataset_length(testing_data)

print("Train length is : ", str(train_length))
print("Test length is : ", str(test_length))

batch_size = 32

# The shuffle buffer spans the whole split, then rows are grouped into batches.
training_data = training_data.shuffle(train_length).batch(batch_size)
testing_data = testing_data.shuffle(test_length).batch(batch_size)

Train length is :  144074
Test length is :  61746


In [5]:
from models.modeling.QBGPT.models import QBGPT, LargeQBGPT, XLargeQBGPT
from models.modeling.QBGPT.losses_and_metrics import CustomSparseCategoricalAccuracy, CustomTopKAccuracy, CustomSparseCategoricalCrossentropy

# Sizes passed to the QBGPT constructors below.
moves_to_pred = 11170
input_size = 11172
starts_size = 1954
scrimmage_size = 100
positions_id = 29

temp_ids = 52
off_def_size = 2
token_type_size = 3
play_type_size = 9

# Keyword arguments shared by every model size; only `embedding_dim` and
# `hidden_dim` (and the class for the large variant) differ between them.
qbgpt_shared_kwargs = dict(
    input_vocab_size=input_size,
    positional_vocab_size=temp_ids,
    position_vocab_size=positions_id,
    start_vocab_size=starts_size,
    scrimmage_vocab_size=scrimmage_size,
    offdef_vocab_size=off_def_size,
    type_vocab_size=token_type_size,
    playtype_vocab_size=play_type_size,
    num_heads=3,
    diag_masks=True,
    to_pred_size=moves_to_pred,
)

model_large = LargeQBGPT(embedding_dim=128, hidden_dim=128, **qbgpt_shared_kwargs)

model_medium = QBGPT(embedding_dim=256, hidden_dim=256, **qbgpt_shared_kwargs)

model_small = QBGPT(embedding_dim=128, hidden_dim=128, **qbgpt_shared_kwargs)

model_tiny = QBGPT(embedding_dim=64, hidden_dim=64, **qbgpt_shared_kwargs)

In [6]:
# Restore the v2 QBGPT checkpoint for each model size.
# The value of the last call is what the cell displays as its output.
model_tiny.load_weights("models/modeling/QBGPT/weights/model_tinyv2/QBGPT")
model_small.load_weights("models/modeling/QBGPT/weights/model_smallv2/QBGPT")
model_medium.load_weights("models/modeling/QBGPT/weights/model_mediumv2/QBGPT")
model_large.load_weights("models/modeling/QBGPT/weights/model_largev2/QBGPT")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x296ab7850>

In [7]:
from models.modeling.StratFormer.models import StratEncoder

# Keyword arguments common to every StratEncoder; each encoder only differs by
# its `hidden_dim` and by the QBGPT encoder it wraps as `base_encoder`.
strat_shared_kwargs = dict(
    num_spec_token=1,
    team_vocab_size=32,
    player_vocab_size=7229,
    season_vocab_size=7,
    down_vocab_size=5,
)

tiny_encoder = StratEncoder(hidden_dim=64,
                            base_encoder=model_tiny.Encoder,
                            **strat_shared_kwargs)

small_encoder = StratEncoder(hidden_dim=128,
                             base_encoder=model_small.Encoder,
                             **strat_shared_kwargs)

medium_encoder = StratEncoder(hidden_dim=256,
                              base_encoder=model_medium.Encoder,
                              **strat_shared_kwargs)

# NOTE(review): `model_large` was built with hidden_dim=128 while this encoder
# uses hidden_dim=256; the other three sizes match their base model. Confirm
# this is intentional.
large_encoder = StratEncoder(hidden_dim=256,
                             base_encoder=model_large.Encoder,
                             **strat_shared_kwargs)

In [8]:
# Restore the StratFormer checkpoints.
# The encoding cells further down run `tiny_encoder` and save their results as
# `encoded_*_tiny`, so the tiny checkpoint has to be restored here; without it
# the layers owned by that StratEncoder keep their initial values.
tiny_encoder.load_weights("models/modeling/StratFormer/weights/stratformer_tiny/StratFormer/")
#small_encoder.load_weights("models/modeling/StratFormer/weights/stratformer_small/StratFormer/")
medium_encoder.load_weights("models/modeling/StratFormer/weights/stratformer_medium/StratFormer/")
#large_encoder.load_weights("models/modeling/StratFormer/weights/stratformer_large/StratFormer/")

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x296bec5d0>

## Encoding

### Off Def encoding

In [9]:
def convert_success(value):
    """Map `value` to an int32 flag: 1 where it is non-zero, 0 where it is zero.

    NOTE(review): negative values (e.g. a loss of yards) also map to 1 —
    confirm that "non-zero" rather than "positive" is the intended definition.
    """
    is_nonzero = value != 0
    return tf.cast(is_nonzero, dtype="int32")

In [10]:
from tqdm import tqdm
import gc

# One record per batch of the training split; rows are split out later.
# NOTE(review): make sure the StratFormer checkpoint for `tiny_encoder` has
# been loaded before running this cell.
encodings = []

batch_iterator = tqdm(training_data, desc="Processing", total=len(training_data), ncols=100)
for batch in batch_iterator:
    offense = batch["off"]

    # Keep index 0 along axis 1 of each encoder output
    # (presumably the special token, given num_spec_token=1 — TODO confirm).
    off_encoding = tiny_encoder(offense)[:, 0, :]
    def_encoding = tiny_encoder(batch["def"])[:, 0, :]

    # Force a garbage-collection pass after every batch.
    gc.collect()

    yards_gained = offense["yards_gained"]
    batch_record = {
        "gameId": offense["gameId"],
        "playId": offense["playId"],
        "Off": off_encoding,
        "Def": def_encoding,
        "playtype": offense["PlayType"],
        "Success": convert_success(yards_gained),
        "yards_gained": yards_gained,
    }
    encodings.append(batch_record)

Processing:   0%|                                                          | 0/4503 [00:00<?, ?it/s]

Processing: 100%|███████████████████████████████████████████████| 4503/4503 [08:18<00:00,  9.03it/s]


In [11]:
# One record per batch of the test split; rows are split out later.
# NOTE(review): make sure the StratFormer checkpoint for `tiny_encoder` has
# been loaded before running this cell.
test_encodings = []

batch_iterator = tqdm(testing_data, desc="Processing", total=len(testing_data), ncols=100)
for batch in batch_iterator:
    offense = batch["off"]

    # Keep index 0 along axis 1 of each encoder output
    # (presumably the special token, given num_spec_token=1 — TODO confirm).
    off_encoding = tiny_encoder(offense)[:, 0, :]
    def_encoding = tiny_encoder(batch["def"])[:, 0, :]

    # Force a garbage-collection pass after every batch.
    gc.collect()

    yards_gained = offense["yards_gained"]
    batch_record = {
        "gameId": offense["gameId"],
        "playId": offense["playId"],
        "Off": off_encoding,
        "Def": def_encoding,
        "playtype": offense["PlayType"],
        "Success": convert_success(yards_gained),
        "yards_gained": yards_gained,
    }
    test_encodings.append(batch_record)

Processing: 100%|███████████████████████████████████████████████| 1930/1930 [03:55<00:00,  8.20it/s]


In [12]:
def unbatch_dict(d : dict):
    """Split one batched encoding record into a list of per-row records.

    Parameters
    ----------
    d : dict
        Batched record with the keys "gameId", "playId", "Off", "Def",
        "playtype", "Success" and "yards_gained". Every value must be
        convertible with `np.asarray` and share the same leading (batch)
        dimension; "Off" and "Def" must be 2-D with matching shapes.

    Returns
    -------
    list of dict
        One dict per row with the keys "gameId", "playId",
        "concatenated_vector" (Off followed by Def), "sub_vector" (Off - Def),
        "playtype", "Success" and "yards".
    """
    # Convert every column to NumPy exactly once instead of once per row.
    off = np.asarray(d["Off"])
    defense = np.asarray(d["Def"])
    game_ids = np.asarray(d["gameId"])
    play_ids = np.asarray(d["playId"])
    playtypes = np.asarray(d["playtype"])
    success = np.asarray(d["Success"])
    yards = np.asarray(d["yards_gained"])

    conc_vector = np.concatenate([off, defense], axis=1)
    sub_vector = off - defense

    l_of_b = [{"gameId" : game_ids[i],
               "playId" : play_ids[i],
               "concatenated_vector" : conc_vector[i],
               "sub_vector" : sub_vector[i],
               # np.unique returns sorted values, so this keeps the smallest
               # value present in the row's play-type entry.
               "playtype" : np.unique(playtypes[i])[0],
               "Success" : success[i],
               "yards" : yards[i]} for i in range(success.shape[0])]
    return l_of_b

def append_id(d, i):
    """Return a shallow copy of `d` with the key 'id' set to `i`.

    The input mapping is left untouched.
    """
    tagged = d.copy()
    tagged.update({'id': i})
    return tagged

In [13]:
from tqdm import tqdm

def compile_seq(list_of_trajs):
    """Merge a list of per-row dicts into one dict of stacked NumPy arrays.

    Parameters
    ----------
    list_of_trajs : sequence of dict
        Row records. The keys of the first record define the output columns;
        every record must only contain those keys.

    Returns
    -------
    dict
        Maps each key to `np.array` of the values collected across all
        records, in input order. An empty input returns an empty dict.
    """
    # Nothing to merge: there is no first record to take the keys from.
    if len(list_of_trajs) == 0:
        return {}

    merged_dict = {k : [] for k in list_of_trajs[0].keys()}

    # tqdm wraps the iteration to show one progress tick per record.
    for d in tqdm(list_of_trajs):
        for key, value in d.items():
            merged_dict[key].append(value)

    return {k: np.array(v) for k, v in merged_dict.items()}

In [14]:
# Expand every batched record into its rows and flatten into a single list.
encodings_unb = [row for batch_record in encodings for row in unbatch_dict(batch_record)]
print(len(encodings_unb))

test_encodings_unb = [row for batch_record in test_encodings for row in unbatch_dict(batch_record)]
print(len(test_encodings_unb))

144074
61746


In [15]:
# Tag every row with its position in the flattened list under the key 'id'.
encodings_unb = [append_id(row, idx) for idx, row in enumerate(encodings_unb)]
test_encodings_unb = [append_id(row, idx) for idx, row in enumerate(test_encodings_unb)]

In [16]:
# Turn the lists of row dicts into column-wise dicts of NumPy arrays
# (one array per key), for the train and test splits respectively.
train_dataset = compile_seq(encodings_unb)
test_dataset = compile_seq(test_encodings_unb)

100%|██████████| 144074/144074 [00:00<00:00, 1566106.84it/s]
100%|██████████| 61746/61746 [00:00<00:00, 1651983.76it/s]


In [17]:
# Wrap the column dicts as tf.data datasets (one element per row) and write
# them to disk next to the source splits.
train_total = tf.data.Dataset.from_tensor_slices(train_dataset)
train_total.save("data_models/Helenos/encoded_train_tiny")

test_total = tf.data.Dataset.from_tensor_slices(test_dataset)
test_total.save("data_models/Helenos/encoded_test_tiny")