In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf

# Execution-environment switch: "local" takes the local-checkout branches below;
# any other value takes the branches that clone the repo and mount Google Drive (Colab).
env = "local"

In [2]:
if env != "local":
  !git clone https://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@github.com/samchaineau/QB-GPT.git
  import sys
  sys.path.append("/content/QB-GPT/")

In [3]:
# Move into the project root so the relative paths used by later cells resolve.
# Set the QBGPT_ROOT environment variable to point at a different checkout
# without editing the notebook; the defaults are the original locations.
if env == "local":
    os.chdir(os.environ.get("QBGPT_ROOT", "/Users/samuel/Documents/GitHub/QB-GPT/"))
else:
    from google.colab import drive
    drive.mount('/content/gdrive')
    os.chdir(os.environ.get("QBGPT_ROOT", "/content/gdrive/MyDrive/NFL_Challenge/QB-GPT/"))

In [7]:
# Load the saved test split.
testing_data = tf.data.Dataset.load("data_models/QBGPT/test_tokens_NFL_GPT_v2")

# Number of examples, used as the shuffle buffer size so the shuffle covers the whole split.
# Ask the dataset for its cardinality instead of materialising a list of every index;
# negative values mean unknown/infinite, in which case fall back to a single counting pass.
test_length = int(testing_data.cardinality().numpy())
if test_length < 0:
    test_length = sum(1 for _ in testing_data)
if test_length == 0:
    raise ValueError("The test dataset is empty: nothing to evaluate.")

batch_size = 32

testing_data = testing_data.shuffle(test_length).batch(batch_size)

In [8]:
from models.modeling.QBGPT.models import QBGPT, LargeQBGPT, XLargeQBGPT
from models.modeling.QBGPT.losses_and_metrics import CustomSparseCategoricalAccuracy, CustomTopKAccuracy, CustomSparseCategoricalCrossentropy

# Vocabulary / output sizes passed to every model constructor below.
moves_to_pred = 11170
input_size = 11172
starts_size = 1954
scrimmage_size = 100
positions_id = 29

temp_ids = 52
off_def_size = 2
token_type_size = 3
play_type_size = 9

# Keyword arguments that are identical for the four models; only the class and
# the embedding/hidden widths differ between them.
shared_model_kwargs = dict(
    input_vocab_size = input_size,
    positional_vocab_size = temp_ids,
    position_vocab_size = positions_id,
    start_vocab_size = starts_size,
    scrimmage_vocab_size = scrimmage_size,
    offdef_vocab_size = off_def_size,
    type_vocab_size = token_type_size,
    playtype_vocab_size = play_type_size,
    num_heads = 3,
    diag_masks = True,
    to_pred_size = moves_to_pred,
)

# NOTE(review): the "large" model uses 128-wide embeddings while "medium" uses 256;
# presumably this matches the trained checkpoints - confirm before changing.
model_large = LargeQBGPT(embedding_dim = 128, hidden_dim = 128, **shared_model_kwargs)

model_medium = QBGPT(embedding_dim = 256, hidden_dim = 256, **shared_model_kwargs)

model_small = QBGPT(embedding_dim = 128, hidden_dim = 128, **shared_model_kwargs)

model_tiny = QBGPT(embedding_dim = 64, hidden_dim = 64, **shared_model_kwargs)



In [9]:
# Checkpoint prefix for each model, restored in the order tiny -> small -> medium -> large.
checkpoint_by_model = [
    (model_tiny, "models/modeling/QBGPT/weights/model_tinyv2/QBGPT"),
    (model_small, "models/modeling/QBGPT/weights/model_smallv2/QBGPT"),
    (model_medium, "models/modeling/QBGPT/weights/model_mediumv2/QBGPT"),
    (model_large, "models/modeling/QBGPT/weights/model_largev2/QBGPT"),
]
load_statuses = [network.load_weights(checkpoint_prefix)
                 for network, checkpoint_prefix in checkpoint_by_model]
# Keep the last load status as the cell's displayed value.
load_statuses[-1]

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x29681c150>

## Evaluation protocol

In [10]:
# One accumulator per model; the inference loop appends one entry per test batch.
tiny_prediction, small_prediction, medium_prediction, large_prediction = [], [], [], []

# Labels and the input fields collected alongside the predictions.
trues, time, playtype, positions = [], [], [], []

In [11]:
import gc
from tqdm import tqdm

# Each model is paired with the list that collects its per-batch argmax predictions.
model_sinks = [
  (model_tiny, tiny_prediction),
  (model_small, small_prediction),
  (model_medium, medium_prediction),
  (model_large, large_prediction),
]

for batch in tqdm(testing_data, desc="Processing", total=len(testing_data), ncols=100):
  # batch[0] holds the model inputs, batch[1] the labels.
  features = batch[0]

  for network, sink in model_sinks:
    # Most likely token at every position (argmax over the last axis).
    predictions = tf.argmax(network(features), axis = -1)
    # Collect garbage after every forward pass, as the original cell did.
    _ = gc.collect()
    sink.append(predictions)

  trues.append(batch[1])
  time.append(features["pos_ids"])
  playtype.append(features["PlayType"])
  positions.append(features["position_ids"])

Processing: 100%|███████████████████████████████████████████████| 1575/1575 [24:36<00:00,  1.07it/s]


In [12]:
from app.tools import tokenizer
import polars as pl

# Parquet index files handed to the tokenizer, keyed by constructor argument name.
index_files = {
    "moves_index": "indexv2/moves_index.parquet",
    "play_index": "indexv2/plays_index.parquet",
    "positions_index": "indexv2/positions_index.parquet",
    "scrimmage_index": "indexv2/scrimmage_index.parquet",
    "starts_index": "indexv2/starts_index.parquet",
    "time_index": "indexv2/time_index.parquet",
}

QBGPT_tokenizer = tokenizer(**index_files)

def rmse(a : np.ndarray, b : np.ndarray):
    """Row-wise Euclidean distance between two equally-shaped 2-D arrays.

    Despite the name, no mean is taken: for every row the squared differences
    are summed along axis 1 and the square root of that sum is returned.

    Parameters
    ----------
    a, b : array-like with at least two dimensions
        Values to compare; plain nested lists are accepted and converted.

    Returns
    -------
    np.ndarray
        One distance per row.
    """
    # Accept array-likes (e.g. nested lists), not only ndarrays.
    a = np.asarray(a)
    b = np.asarray(b)
    return np.sqrt(np.sum((a - b)**2, axis = 1))
    
def model_rmse(df : pl.DataFrame, tokenizer):
    """Per-row distance between decoded labels and decoded predictions.

    The "label" and "pred" columns of ``df`` are each decoded with
    ``tokenizer.decode(..., type="moves")`` and the two decoded arrays are
    passed to ``rmse``. Returns one value per row of ``df``.
    """
    def _decoded(column_name):
        # Column -> Python list of token ids -> decoded values as an array.
        token_ids = df.select(column_name).to_series().to_list()
        return np.array(tokenizer.decode(token_ids, type = "moves"))

    return rmse(_decoded("label"), _decoded("pred"))

In [13]:
def _flatten_batches(batches):
    """Stack a list of per-batch arrays row-wise and flatten the result to 1-D."""
    return np.vstack(batches).flatten()


# Columns that are identical for every model: stacked once instead of once per model.
_eval_labels = _flatten_batches(trues)
_eval_time = _flatten_batches(time)
_eval_playtype = _flatten_batches(playtype)
_eval_positions = _flatten_batches(positions)


def _build_eval_df(model_predictions):
    """Build the evaluation frame for one model.

    Parameters
    ----------
    model_predictions : list
        Per-batch prediction arrays collected for the model.

    Returns
    -------
    (pl.DataFrame, np.ndarray)
        The frame with columns label, pred, time, playtype, positions,
        Correct (1.0 when pred equals label, else 0.0) and RMSE, together
        with the raw array returned by ``model_rmse`` for that frame.
    """
    frame = (pl.DataFrame({"label" : _eval_labels,
                           "pred" : _flatten_batches(model_predictions),
                           "time" : _eval_time,
                           "playtype" : _eval_playtype,
                           "positions" : _eval_positions}).
             # Rows labelled -100 are excluded (presumably the ignore/padding label - confirm).
             filter(pl.col("label") != -100).
             with_columns((pl.col("label") == pl.col("pred")).cast(pl.Float32).alias("Correct")))

    distances = model_rmse(frame, tokenizer=QBGPT_tokenizer)
    return frame.with_columns(pl.Series(distances).alias("RMSE")), distances


tiny_eval_df, tiny_rmse = _build_eval_df(tiny_prediction)
small_eval_df, small_rmse = _build_eval_df(small_prediction)
medium_eval_df, medium_rmse = _build_eval_df(medium_prediction)
large_eval_df, large_rmse = _build_eval_df(large_prediction)

In [14]:
# Evaluation frame of each model, keyed by the name used as the column header
# in the comparison tables.
eval_df_dict = dict(
    tiny_model=tiny_eval_df,
    small_model=small_eval_df,
    medium_model=medium_eval_df,
    large_model=large_eval_df,
)

In [15]:
def average_per_cat(metric : str, cat : str, model_name : str, eval_df : pl.DataFrame):
    """Mean of ``metric`` for every distinct value of ``cat`` in one model's frame.

    Returns a frame sorted by ``cat`` with the columns ``cat``, ``metric`` and
    a constant "Model" column holding ``model_name``.
    """
    # Keep only the two relevant columns so the group mean applies to `metric` alone.
    per_group_mean = eval_df.select(cat, metric).group_by(cat).mean()
    ordered = per_group_mean.sort(cat)
    return ordered.with_columns(pl.lit(model_name).alias("Model"))
    
def model_comparison_per_cat(metric : str, cat : str, eval_dict : dict):
    """Side-by-side table of ``metric`` averaged per ``cat`` value, one column per model.

    ``eval_dict`` maps a model name to its evaluation frame. The per-model
    averages are stacked and pivoted so that rows are ``cat`` values and
    columns are the model names.
    """
    per_model_frames = []
    for model_name, model_df in eval_dict.items():
        per_model_frames.append(average_per_cat(metric, cat, model_name, model_df))

    stacked = pl.concat(per_model_frames)
    return stacked.pivot(values= metric, columns="Model", index=cat)

In [16]:
# Columns of the evaluation frames along which the models are compared.
cat_to_evaluate = ["playtype", "time", "positions"]

# One comparison table per category, for accuracy ("Correct") and for "RMSE".
ac_model_comparisons = {}
rmse_model_comparisons = {}
for category in cat_to_evaluate:
    ac_model_comparisons[category] = model_comparison_per_cat("Correct", category, eval_dict=eval_df_dict)
    rmse_model_comparisons[category] = model_comparison_per_cat("RMSE", category, eval_dict=eval_df_dict)

In [20]:
# Categories for which an accuracy comparison table was built.
ac_model_comparisons.keys()

dict_keys(['playtype', 'time', 'positions'])

In [21]:
# Accuracy comparison for the "positions" category: one row per position id, one column per model.
ac_model_comparisons["positions"]

positions,tiny_model,small_model,medium_model,large_model
i64,f32,f32,f32,f32
0,0.686088,0.710431,0.778349,0.701892
1,0.355464,0.373017,0.519051,0.36067
2,0.330067,0.341761,0.490186,0.331653
3,0.50009,0.523961,0.639078,0.510583
4,0.575532,0.606186,0.697874,0.591549
5,0.620273,0.645542,0.732785,0.63634
6,0.353114,0.366891,0.526154,0.350927
7,0.365508,0.386154,0.528785,0.371036
8,0.686713,0.711184,0.775771,0.70236
9,0.33237,0.350805,0.509836,0.337392


In [22]:
# Persist every comparison table as parquet, one file per category and metric.
acc_dir = "models/modeling/QBGPT/evaluations/v2/acc"
rmse_dir = "models/modeling/QBGPT/evaluations/v2/rmse"

# write_parquet does not create missing parent directories, so make sure they exist.
os.makedirs(acc_dir, exist_ok=True)
os.makedirs(rmse_dir, exist_ok=True)

for cat, comp_df in ac_model_comparisons.items():
    comp_df.write_parquet(f"{acc_dir}/{cat}_acc_eval.parquet")

for cat, comp_df in rmse_model_comparisons.items():
    comp_df.write_parquet(f"{rmse_dir}/{cat}_rmse_eval.parquet")

In [20]:
# List the contents of the evaluations directory.
# NOTE(review): the saved output of this cell looks like a listing of the repository root
# and its execution count is out of sequence - re-run on a fresh kernel to refresh it.
os.listdir("models/modeling/QBGPT/evaluations/")

['data_models',
 '.DS_Store',
 'app',
 'LICENSE',
 'models',
 'README.md',
 '.gitignore',
 '.gitattributes',
 'data_preprocessing',
 'index',
 '.git',
 'notebooks']