In [6]:
import os
import pandas as pd
import numpy as np
import polars as pl

env = "local"

In [7]:
# Select the working directory so that the relative paths used in the rest of
# the notebook (data_preprocessing/, index/, models/, data_models/) resolve.
if env == "local":
    # NOTE(review): machine-specific absolute path — adjust when running elsewhere.
    os.chdir("/Users/samuel/Documents/GitHub/QB-GPT/")
else:
    # Any value of `env` other than "local" takes the Google Colab route:
    # mount Google Drive, then move into the project folder stored there.
    from google.colab import drive
    drive.mount('/content/gdrive')
    os.chdir("/content/gdrive/MyDrive/NFL_Challenge/NFL-GPT/NFL data")

In [8]:
os.listdir()

['data_models',
 '.DS_Store',
 'app',
 'LICENSE',
 'models',
 'README.md',
 '.gitignore',
 '.gitattributes',
 'data_preprocessing',
 'index',
 '.git',
 'notebooks']

In [9]:
data = pl.read_parquet("data_preprocessing/1_mapped/mapped_df.parquet")

In [10]:
# Largest ID of each vocabulary.
zones_max = 10875  # last Zone_ID displayed by moves.tail()
starts_max = 1031  # Start_ID displayed by starts.tail()
scrimmage_max = 99
positions_max = 28  # NOTE(review): pos.tail() ends at position_ID 27 — confirm

# Token prepended to the first window of each trajectory (zones_max + 1).
start_token = 10876

# Padding values, one per input feature.
pad_input = 10877  # zones_max + 2; fills input_ids
pad_pos_id = 51  # fills pos_ids
pad_position_id = 28  # fills position_ids; same value as positions_max
pad_scrimmage_id = 99  # fills scrim_ids; same value as scrimmage_max — NOTE(review): confirm it cannot collide with a real scrimmage ID
pad_start_id = 1032  # starts_max + 1; fills start_ids

In [11]:
pos = pl.read_parquet("index/positions_index.parquet")
pos.tail()

position,position_ID,Cat
str,i64,str
"""SAF""",23,"""Pos"""
"""SS""",24,"""Pos"""
"""T""",25,"""Pos"""
"""TE""",26,"""Pos"""
"""WR""",27,"""Pos"""


In [12]:
moves = pl.read_parquet("index/moves_index.parquet")
moves.tail()

x,y,Zone_ID,Cat
i64,i64,i64,str
75,-14,10871,"""Moves"""
75,-1,10872,"""Moves"""
76,-39,10873,"""Moves"""
76,-1,10874,"""Moves"""
77,-39,10875,"""Moves"""


In [13]:
starts = pl.read_parquet("index/starts_index.parquet")
starts.tail()

Starting_x,Starting_y,Start_ID,Cat
i64,i64,i32,str
-21,21,1031,"""Start"""
-21,22,1031,"""Start"""
-21,23,1031,"""Start"""
-21,24,1031,"""Start"""
-21,25,1031,"""Start"""


In [14]:
pad_input

10877

## DataFrame to a list of dicts

In [15]:
def sort_list_and_reproduce(a, b):
    """Sort ``a`` in ascending order and reorder ``b`` the same way.

    The two sequences are paired element by element, the pairs are sorted on
    the value coming from ``a`` only, and the pairs are then split back into
    two lists. The sort is stable, so elements of ``b`` whose keys in ``a``
    are equal keep their original relative order.

    Args:
        a: Sequence of sortable keys.
        b: Sequence of values aligned with ``a``.

    Returns:
        A tuple ``(sorted_a, sorted_b)`` of two new lists; the inputs are not
        modified. Two empty lists are returned when there is nothing to pair
        (either input is empty).
    """
    # Sort on the key from ``a`` only, so values of ``b`` are never compared.
    paired_lists = sorted(zip(a, b), key=lambda pair: pair[0])

    # ``zip(*[])`` yields nothing to unpack, so the empty case is handled
    # explicitly instead of raising ValueError.
    if not paired_lists:
        return [], []

    # Unpack the sorted pairs into the two reordered sequences.
    sorted_a, sorted_b = zip(*paired_lists)

    return list(sorted_a), list(sorted_b)

def sort_seq(x : dict):
  """Return a shallow copy of ``x`` with its frames in ascending ``pos_ids`` order.

  ``input_ids`` is reordered together with ``pos_ids`` so the two lists stay
  aligned; every other key is carried over unchanged.
  """
  ordered_pos, ordered_ids = sort_list_and_reproduce(x["pos_ids"], x["input_ids"])

  result = x.copy()
  result.update({"pos_ids": ordered_pos, "input_ids": ordered_ids})
  return result

In [16]:
# One dict per row of `data`. Frame_ID and Zone_ID are renamed to pos_ids and
# input_ids; they are zipped together and sliced by the cells below, so each is
# expected to hold a list per row. The remaining columns are copied under
# their own names.
seq_dict = [{"pos_ids" : row["Frame_ID"],
             "input_ids": row["Zone_ID"],
             "start_ID" : row["Start_ID"],
             "scrimmage_ID" : row["Scrimmage_ID"],
             "position_ID" : row["position_ID"],
             "side_ID" : row["side_ID"],
             "OffDef_ID" : row["OffDef_ID"],
             "PlayType_ID" : row["PlayType_ID"],
             "nflId" : row["nflId"],
             "playId" : row["playId"],
             "gameId" : row["gameId"]} for row in data.iter_rows(named=True)]

In [17]:
# Put the frames of every sequence in ascending pos_ids order (input_ids follow).
seq_dict = [sort_seq(d) for d in seq_dict]

In [18]:
def pad_and_truncate_lists(A, B, max_size, pad_step, n, A_pad, B_pad):
    """Cut two aligned lists into fixed-size, right-padded windows.

    Windows of ``max_size`` elements start every ``pad_step`` elements, at
    offsets ``0, pad_step, 2 * pad_step, ...`` below the original length of
    ``A``. A window is kept only if it holds at least ``n`` elements of ``A``
    that differ from ``A_pad``; kept windows that run past the end of the data
    are completed with the padding values.

    Args:
        A: Reference list; its length drives the windowing and its content
            decides whether a window is kept.
        B: List aligned with ``A``; it is sliced at the same offsets.
        max_size: Length of every returned window.
        pad_step: Stride between two consecutive window starts.
        n: Minimum number of non-padding elements of ``A`` a window must hold.
        A_pad: Padding value for windows of ``A``.
        B_pad: Padding value for windows of ``B``.

    Returns:
        A tuple ``(windows_A, windows_B)`` of two lists of lists. The input
        lists are left untouched.
    """
    current_length = len(A)

    # Pad copies rather than the arguments themselves: the caller's lists must
    # not grow as a side effect of windowing. Both lists receive the padding
    # length computed from ``A``.
    padding_length = max(max_size - current_length, 0)
    padded_A = list(A) + [A_pad] * padding_length
    padded_B = list(B) + [B_pad] * padding_length

    result_A, result_B = [], []
    for start in range(0, current_length, pad_step):
        window_A = padded_A[start:start + max_size]
        window_B = padded_B[start:start + max_size]

        # Count the elements that are real data, i.e. different from A_pad.
        real_count = sum(1 for value in window_A if value != A_pad)
        if real_count < n:
            continue

        # Trailing windows can be shorter than max_size; complete them.
        if len(window_A) < max_size:
            window_A.extend([A_pad] * (max_size - len(window_A)))
            window_B.extend([B_pad] * (max_size - len(window_B)))
        result_A.append(window_A)
        result_B.append(window_B)

    return result_A, result_B

def assign_values_dict(d, key_a, val_a, key_b, val_b, i):
  """Return a shallow copy of ``d`` with two entries set and ``"Traj"`` set to ``i``.

  ``d`` itself is not modified.
  """
  updated = d.copy()
  updated.update({key_a: val_a, key_b: val_b, "Traj": i})
  return updated

def process_seq(x : dict, max_length, pad_step, min_size, pad_input, pad_pos):
  """Split one sequence dict into a list of fixed-length window dicts.

  ``pos_ids`` and ``input_ids`` are windowed together by
  ``pad_and_truncate_lists``; each kept window yields a copy of the sequence
  dict holding that window's two lists and its index under ``"Traj"``.
  """
  source = x.copy()
  windows_pos, windows_ids = pad_and_truncate_lists(
      A = source["pos_ids"],
      B = source["input_ids"],
      max_size = max_length,
      pad_step = pad_step,
      n = min_size,
      A_pad = pad_pos,
      B_pad = pad_input,
  )

  chunks = []
  for idx, (pos_window, ids_window) in enumerate(zip(windows_pos, windows_ids)):
    chunks.append(assign_values_dict(source, "pos_ids", pos_window, "input_ids", ids_window, idx))

  return chunks

In [19]:
# Cut every sequence into windows of 21 frames starting every 12 frames; a
# window is kept only if at least 6 of its pos_ids differ from pad_pos_id.
seq_dict = [process_seq(seq, max_length = 21, pad_step = 12, min_size = 6, pad_input = pad_input, pad_pos = pad_pos_id) for seq in seq_dict]

In [20]:
# Flatten the per-sequence lists of windows into one flat list of window dicts.
seq_dict = [sub for l in seq_dict for sub in l]

## List of dicts to a dict keyed by game, play, trajectory window and OffDef side

In [27]:
from tqdm import tqdm

def merge_dicts_and_create_lists(list_of_dicts):
    """Group the values of several dicts by key, showing a progress bar.

    Args:
        list_of_dicts: Sequence of dicts to merge.

    Returns:
        A dict mapping every key met to the list of values stored under that
        key, in the order the dicts are visited.
    """
    merged_dict = {}

    for d in tqdm(list_of_dicts):
        for key, value in d.items():
            # Append in place: rebuilding the list with ``+`` for every value
            # copies the whole group each time, so cost grows quadratically
            # with the group size.
            merged_dict.setdefault(key, []).append(value)

    return merged_dict

def merge_dicts_and_create_lists_add(list_of_dicts):
    """Merge dicts by accumulating, with ``+=``, the values that share a key.

    With list values this concatenates the lists in the order the dicts are
    visited.

    Args:
        list_of_dicts: Iterable of dicts whose values support ``+=``.

    Returns:
        A new dict with one accumulated value per key. None of the input
        dicts' values is modified.
    """
    import copy

    merged_dict = {}

    for d in list_of_dicts:
        for key, value in d.items():
            if key in merged_dict:
                merged_dict[key] += value
            else:
                # Start from a shallow copy: storing ``value`` itself would let
                # the later ``+=`` extend the first input dict's list in place.
                merged_dict[key] = copy.copy(value)

    return merged_dict

In [28]:
# Wrap every window dict under the key "<gameId>_<playId>_<Traj>_<OffDef_ID>".
seq_dict = [{"_".join([str(d["gameId"]), str(d["playId"]), str(d["Traj"]), str(d["OffDef_ID"])]): d} for d in seq_dict]
# Group the window dicts that share a key into one list per key.
seq_dict  = merge_dicts_and_create_lists(seq_dict)
# Keep at most the first 11 window dicts of every key.
# NOTE(review): presumably 11 players per side — confirm.
seq_dict = {k : v[:11] for k,v in seq_dict.items()}

100%|██████████| 2348029/2348029 [00:01<00:00, 1333431.64it/s]


In [35]:
seq_dict["2018110405_3362_0_0"][2]

{'pos_ids': [1,
  2,
  3,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  12,
  13,
  14,
  15,
  16,
  17,
  18,
  19,
  20,
  21],
 'input_ids': [5081,
  5081,
  5081,
  5081,
  5081,
  5081,
  5081,
  5081,
  5081,
  5081,
  4999,
  4915,
  4832,
  4745,
  4661,
  4578,
  4491,
  4404,
  4238,
  4154,
  3987],
 'start_ID': 840,
 'scrimmage_ID': 63,
 'position_ID': 24,
 'side_ID': 0,
 'OffDef_ID': 0,
 'PlayType_ID': 4,
 'nflId': 43533,
 'playId': 3362,
 'gameId': 2018110405,
 'Traj': 0}

## Sequence builder

In [41]:
def create_labels(ids, input_pad, max_zone_id = 10875, ignore_index = -100):
  """Build next-token labels for a list of input IDs.

  The label at position ``t`` is the input ID at position ``t + 1``; the last
  position has no successor. Every label outside ``0..max_zone_id`` (and the
  last position) is replaced by ``ignore_index``.

  Args:
    ids: List of input token IDs.
    input_pad: Unused; kept so existing callers keep working.
    max_zone_id: Largest ID kept as a label.
    ignore_index: Value written wherever no label applies.

  Returns:
    A list of labels, one per element of ``ids`` (a one-element list when
    ``ids`` is empty).
  """
  valid_ids = range(0, max_zone_id + 1)
  # Shift left by one so each position is labelled with its successor.
  shifted = ids[1:] + [ignore_index]
  return [v if v in valid_ids else ignore_index for v in shifted]

def create_single_traj(d : dict, input_pad):
  """Expand one window dict into per-token feature lists.

  The window with ``Traj == 0`` is prefixed with the start token
  (``start_token``, position 0, token type 0); every other token has token
  type 1. The scalar attributes of the window (start, scrimmage, position,
  side, play type, OffDef) are repeated once per token.

  Args:
    d: Window dict holding ``pos_ids``, ``input_ids``, ``Traj``, ``start_ID``,
      ``scrimmage_ID``, ``position_ID``, ``side_ID``, ``PlayType_ID`` and
      ``OffDef_ID``.
    input_pad: Padding token; positions holding it get attention mask 0.

  Returns:
    A dict of equally long lists. All lists are newly built, so modifying them
    never affects the lists stored in ``d``.
  """
  x = d.copy()

  # Only the first window of a trajectory starts with the start token.
  if x["Traj"] == 0:
    pos_ids = [0] + x["pos_ids"]
    input_ids = [start_token] + x["input_ids"]
    token_type_ids = [0] + [1 for _ in x["input_ids"]]
  else:
    # Copy instead of aliasing: callers concatenate these lists with ``+=``,
    # which would otherwise extend the lists stored in the source dict.
    pos_ids = list(x["pos_ids"])
    input_ids = list(x["input_ids"])
    token_type_ids = [1 for _ in x["input_ids"]]

  length = len(input_ids)

  def _repeat(key):
    # Broadcast a scalar attribute of the window to every token.
    return [x[key] for _ in range(length)]

  single_traj = {"input_ids" : input_ids,
                 "position_ids" : _repeat("position_ID"),
                 "scrim_ids" : _repeat("scrimmage_ID"),
                 "start_ids" : _repeat("start_ID"),
                 "pos_ids" : pos_ids,
                 "side_ids" : _repeat("side_ID"),
                 "token_type_ids" : token_type_ids,
                 "OffDef" : _repeat("OffDef_ID"),
                 "PlayType" : _repeat("PlayType_ID"),
                 "labels" : create_labels(input_ids, input_pad),
                 "attention_mask" : [1 if v != input_pad else 0 for v in input_ids]}

  return single_traj

def padd_to_length(team_traj,
                   max_len,
                   input_pad = pad_input,
                   pos_id_pad = pad_pos_id,
                   position_id_pad = pad_position_id,
                   scrim_id_pad = pad_scrimmage_id,
                   start_id_pad = pad_start_id,
                   labels_pad = -100,
                   attention_pad = 0):
  """Right-pad every feature list of a merged trajectory dict up to ``max_len``.

  ``side_ids``, ``OffDef`` and ``PlayType`` are padded with their own first
  value, ``token_type_ids`` with 2, and the other features with the matching
  ``*_pad`` argument. A list already at least ``max_len`` long is returned
  unchanged: nothing is truncated.

  Args:
    team_traj: Dict of feature lists (``input_ids``, ``side_ids``, ``pos_ids``,
      ``position_ids``, ``start_ids``, ``scrim_ids``, ``OffDef``,
      ``token_type_ids``, ``PlayType``, ``labels``, ``attention_mask``).
    max_len: Target length.
    input_pad, pos_id_pad, position_id_pad, scrim_id_pad, start_id_pad,
    labels_pad, attention_pad: Padding value of the corresponding feature.

  Returns:
    A new dict with new, padded lists; ``team_traj`` and its lists are left
    unchanged.
  """
  def _missing(key):
    # Slots to add for ``key``; zero or a negative number means no padding.
    return max_len - len(team_traj[key])

  # input_ids, pos_ids, labels and attention_mask are all sized from the
  # length of input_ids; the other features are sized from their own length.
  to_pad = _missing("input_ids")

  paddings = {
      "input_ids" : [input_pad] * to_pad,
      # The first element is only read when padding is actually needed.
      "side_ids" : [team_traj["side_ids"][0] for _ in range(_missing("side_ids"))],
      "pos_ids" : [pos_id_pad] * to_pad,
      "position_ids" : [position_id_pad] * _missing("position_ids"),
      "start_ids" : [start_id_pad] * _missing("start_ids"),
      "scrim_ids" : [scrim_id_pad] * _missing("scrim_ids"),
      "OffDef" : [team_traj["OffDef"][0] for _ in range(_missing("OffDef"))],
      "token_type_ids" : [2] * _missing("token_type_ids"),
      "PlayType" : [team_traj["PlayType"][0] for _ in range(_missing("PlayType"))],
      "labels" : [labels_pad] * to_pad,
      "attention_mask" : [attention_pad] * to_pad,
  }

  padded_traj = team_traj.copy()
  for key, padding in paddings.items():
    # ``+`` builds a new list; ``+=`` on the shallow copy would extend the
    # caller's list in place.
    padded_traj[key] = team_traj[key] + padding

  return padded_traj


def create_team_traj(list_dicts, input_pad, max_len = 256):
  """Build the padded feature arrays of one group of window dicts.

  Each window dict is expanded with ``create_single_traj``, the resulting
  feature lists are concatenated in order, padded to ``max_len`` and
  converted to NumPy arrays.

  Args:
    list_dicts: Window dicts stored under one key of the grouped sequences.
    input_pad: Padding token, used both for the attention mask and for the
      padding appended to ``input_ids``.
    max_len: Length the concatenated features are padded to.

  Returns:
    A dict mapping each feature name to a NumPy array.
  """
  trajs = [create_single_traj(d, input_pad) for d in list_dicts]
  merged = merge_dicts_and_create_lists_add(trajs)
  # Forward input_pad so the appended padding matches the token the attention
  # mask was computed against, instead of relying on the global default.
  merged = padd_to_length(merged, max_len, input_pad = input_pad)
  return {k : np.array(v) for k,v in merged.items()}

In [42]:
pad_input

10877

In [43]:
# One row per key of seq_dict, with the play type read from the first window
# dict stored under that key. Building the frame from Python records, instead
# of a mixed str/int NumPy array, keeps PlayType_ID numeric rather than
# coercing it to a string.
plays = [(key, windows[0]["PlayType_ID"]) for key, windows in seq_dict.items()]
plays_df = pd.DataFrame(plays, columns = ["game_play_id", "PlayType_ID"]).drop_duplicates().reset_index(drop = True)

In [44]:
from sklearn.model_selection import train_test_split

# A fixed seed makes the 80/20 split reproducible across kernel restarts, so
# the split files and the datasets written below always describe the same keys.
SPLIT_SEED = 42
train, test = train_test_split(plays_df, test_size = 0.2, random_state = SPLIT_SEED)

train_games = train["game_play_id"].to_numpy()
test_games = test["game_play_id"].to_numpy()

In [45]:
print(len(train_games))

205851


In [46]:
print(len(test_games))

51463


In [47]:
# Persist the keys (and play type) of each split as ";"-separated CSV files.
train.to_csv("models/modeling/QBGPT/splits/train_split.csv", sep = ";", index = False)
test.to_csv("models/modeling/QBGPT/splits/test_split.csv", sep = ";", index = False)

In [48]:
train_dict, test_dict = {}, {}

# Build the padded feature arrays of every key, train split first, with one
# progress bar per split.
for split_keys, split_store in ((train_games, train_dict), (test_games, test_dict)):
    for play_key in tqdm(split_keys):
        split_store[play_key] = create_team_traj(seq_dict[play_key], pad_input)

100%|██████████| 205851/205851 [00:38<00:00, 5414.94it/s]
100%|██████████| 51463/51463 [00:10<00:00, 4980.54it/s]


In [49]:
def compile_traj(list_of_trajs):
    """Collect, field by field, the values of a list of trajectory dicts.

    The fields are taken from the first dict; the result maps each field to
    the list of the values found under it, in input order. A progress bar
    tracks the dicts processed.
    """
    field_names = list(list_of_trajs[0].keys())
    collected = {name: [] for name in field_names}

    with tqdm(total=len(list_of_trajs)) as progress:
        for traj in list_of_trajs:
            for name, values in traj.items():
                collected[name].append(values)
            progress.update(1)

    return collected

In [50]:
# Switch from {key -> {field -> array}} to {field -> [one array per key]};
# the keys themselves are dropped here.
train_dict = compile_traj(list(train_dict.values()))
test_dict = compile_traj(list(test_dict.values()))

100%|██████████| 205851/205851 [00:00<00:00, 713400.63it/s]
100%|██████████| 51463/51463 [00:00<00:00, 752481.12it/s]


In [51]:
# Stack the arrays of every field row-wise into a single array per field.
train_dict = {key : np.vstack(value) for key,value in train_dict.items()}
test_dict = {key : np.vstack(value) for key,value in test_dict.items()}

In [52]:
import tensorflow as tf

# Each dataset element is a (features, labels) pair. The "labels" array is
# also left inside the feature dict, so it appears twice in every element.
train_labels = train_dict["labels"]
train_dataset = tf.data.Dataset.from_tensor_slices((train_dict, train_labels))

test_labels = test_dict["labels"]
test_dataset = tf.data.Dataset.from_tensor_slices((test_dict, test_labels))

In [53]:
see = np.unique(train_dict["labels"].flatten())
see

array([ -100,     0,     1, ..., 10871, 10873, 10875])

In [54]:
see = np.unique(train_dict["start_ids"].flatten())
see

array([   0,    1,    2, ..., 1030, 1031, 1032])

In [55]:
see = np.unique(test_dict["labels"].flatten())
see

array([ -100,     7,     9, ..., 10869, 10872, 10874])

In [56]:
train_dataset

<_TensorSliceDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.int64, name=None), 'position_ids': TensorSpec(shape=(256,), dtype=tf.int64, name=None), 'scrim_ids': TensorSpec(shape=(256,), dtype=tf.int64, name=None), 'start_ids': TensorSpec(shape=(256,), dtype=tf.int64, name=None), 'pos_ids': TensorSpec(shape=(256,), dtype=tf.int64, name=None), 'side_ids': TensorSpec(shape=(256,), dtype=tf.int64, name=None), 'token_type_ids': TensorSpec(shape=(256,), dtype=tf.int64, name=None), 'OffDef': TensorSpec(shape=(256,), dtype=tf.int64, name=None), 'PlayType': TensorSpec(shape=(256,), dtype=tf.int64, name=None), 'labels': TensorSpec(shape=(256,), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.int64, name=None)}, TensorSpec(shape=(256,), dtype=tf.int64, name=None))>

In [57]:
# Write both datasets to disk, under paths relative to the working directory
# selected at the top of the notebook.
tf.data.Dataset.save(train_dataset, "data_models/QBGPT/train_tokens_NFL_GPT")
tf.data.Dataset.save(test_dataset, "data_models/QBGPT/test_tokens_NFL_GPT")