In [3]:
import os
import pandas as pd
import numpy as np
import polars as pl

# Execution environment switch: "local" uses a local checkout, any other value takes the Google Colab branch.
env = "local"

In [4]:
# Move into the project root so the relative paths used by later cells resolve.
# NOTE(review): the local branch is an absolute, user-specific path; other users must edit it.
if env == "local":
    os.chdir("/Users/samuel/Documents/GitHub/QB-GPT/")
else:
    # Any non-"local" value is treated as Colab: mount Google Drive, then cd into the data folder.
    from google.colab import drive
    drive.mount('/content/gdrive')
    os.chdir("/content/gdrive/MyDrive/NFL_Challenge/NFL-GPT/NFL data")

In [5]:
# Sanity check: show the contents of the current working directory.
os.listdir()

['data_models',
 'test.gif',
 '.DS_Store',
 'app',
 'LICENSE',
 'test_1.gif',
 'models',
 'README.md',
 'test_1_true.gif',
 '.gitignore',
 '.gitattributes',
 'indexv2',
 'data_preprocessing',
 'index',
 '.git',
 'generation.gif',
 'notebooks']

In [6]:
# Load the mapped dataset (path is relative to the working directory set earlier).
data = pl.read_parquet("data_preprocessing/1_mapped/mapped_df_v2.parquet")

In [7]:
# Largest ID of each vocabulary (presumably inclusive upper bounds - TODO confirm against the index files).
zones_max, starts_max = 11221, 2111
scrimmage_max, positions_max = 99, 28

# Special token placed in front of the first window of a trajectory: first ID past the start vocabulary.
start_token = starts_max + 1

# Padding IDs, one per input stream.
pad_input = zones_max + 1      # first ID past the zone vocabulary
pad_pos_id = 51                # padding value for frame positions
pad_position_id = 28           # NOTE(review): equals positions_max - confirm it cannot collide with a real position ID
pad_scrimmage_id = 99          # NOTE(review): equals scrimmage_max - confirm it cannot collide with a real scrimmage ID
pad_start_id = starts_max + 2  # the ID right after start_token

In [8]:
# Load the moves index (columns shown in the output: x, y, Zone_ID, Cat) and peek at its last rows.
moves = pl.read_parquet("indexv2/moves_index.parquet")
moves.tail()

x,y,Zone_ID,Cat
i64,i64,i64,str
75,-39,11216,"""Moves"""
75,-1,11217,"""Moves"""
76,-39,11218,"""Moves"""
76,-1,11219,"""Moves"""
77,-39,11220,"""Moves"""


In [9]:
# Load the starts index (columns shown in the output: Starting_x, Starting_y, Start_ID, Cat) and peek at its last rows.
starts = pl.read_parquet("indexv2/starts_index.parquet")
starts.tail()

Starting_x,Starting_y,Start_ID,Cat
i64,i64,i32,str
74,1,2111,"""Start"""
74,2,2111,"""Start"""
74,3,2111,"""Start"""
75,1,2111,"""Start"""
75,8,2111,"""Start"""


In [10]:
# Display the padding ID used for input tokens.
pad_input

11222

## DataFrame to a list of dicts

In [11]:
def sort_list_and_reproduce(a, b):
    """Sort `a` ascending and reorder `b` with the same permutation.

    Args:
        a: sequence of sortable keys.
        b: sequence aligned with `a`; if the lengths differ, `zip` silently
           drops the surplus elements of the longer one.

    Returns:
        (sorted_a, sorted_b) as two new lists. The sort is stable, so elements
        with equal keys keep their original relative order. Empty input
        yields two empty lists.
    """
    # Sort the (key, value) pairs on the key only, so values are never compared
    paired = sorted(zip(a, b), key=lambda pair: pair[0])

    # zip(*[]) produces nothing to unpack, so empty input needs its own exit
    if not paired:
        return [], []

    sorted_a, sorted_b = zip(*paired)
    return list(sorted_a), list(sorted_b)

def sort_seq(x : dict):
  """Return a shallow copy of `x` whose "pos_ids" are sorted ascending.

  "input_ids" is reordered with the same permutation so both lists stay
  aligned. All other entries are carried over untouched and `x` itself is
  not modified.
  """
  ordered_pos, ordered_ids = sort_list_and_reproduce(x["pos_ids"], x["input_ids"])
  return {**x, "pos_ids": ordered_pos, "input_ids": ordered_ids}

In [12]:
# One dict per row of `data`, renaming source columns to the keys used downstream.
# Frame_ID and Zone_ID are zipped and sorted together in the next step, so each is
# presumably a per-row list - TODO confirm against the parquet schema.
seq_dict = [
    {
        out_key: row[column]
        for out_key, column in (
            ("pos_ids", "Frame_ID"),
            ("input_ids", "Zone_ID"),
            ("start_ID", "Start_ID"),
            ("scrimmage_ID", "Scrimmage_ID"),
            ("position_ID", "position_ID"),
            ("side_ID", "side_ID"),
            ("OffDef_ID", "OffDef_ID"),
            ("PlayType_ID", "PlayType_ID"),
            ("nflId", "nflId"),
            ("playId", "playId"),
            ("gameId", "gameId"),
        )
    }
    for row in data.iter_rows(named=True)
]

In [13]:
# Order every sequence by its pos_ids, keeping input_ids aligned.
seq_dict = [sort_seq(d) for d in seq_dict]

In [14]:
def pad_and_truncate_lists(A, B, max_size, pad_step, n, A_pad, B_pad):
    """Cut two aligned lists into fixed-size, possibly overlapping windows.

    Windows start at 0, pad_step, 2 * pad_step, ... while the start index is
    below len(A). Each window holds `max_size` elements; a window that runs
    past the end of the data is filled up with the padding values.

    Args:
        A, B: aligned input lists (`len(A)` drives the windowing). They are
            never modified.
        max_size: length of every returned window.
        pad_step: stride between consecutive window starts.
        n: minimum number of elements of A's window that differ from `A_pad`
            for the window to be kept.
        A_pad, B_pad: padding values for A and B.

    Returns:
        (windows_A, windows_B): two lists of equally long lists.
    """
    current_length = len(A)

    # Work on padded copies so the caller's lists are left untouched
    missing = max(max_size - current_length, 0)
    seq_a = list(A) + [A_pad] * missing
    seq_b = list(B) + [B_pad] * missing

    result_A, result_B = [], []
    for start in range(0, current_length, pad_step):
        window_a = seq_a[start:start + max_size]
        window_b = seq_b[start:start + max_size]

        # Real elements are recognised by value: anything equal to A_pad counts as padding
        real_count = sum(1 for value in window_a if value != A_pad)
        if real_count < n:
            continue

        # A window starting near the end of the data is short: fill it up
        if len(window_a) < max_size:
            window_a.extend([A_pad] * (max_size - len(window_a)))
            window_b.extend([B_pad] * (max_size - len(window_b)))
        result_A.append(window_a)
        result_B.append(window_b)

    return result_A, result_B

def assign_values_dict(d, key_a, val_a, key_b, val_b, i):
  """Return a shallow copy of `d` with two entries replaced and "Traj" set to `i`.

  `d` itself is left unchanged.
  """
  updated = d.copy()
  updated.update({key_a: val_a, key_b: val_b, "Traj": i})
  return updated

def process_seq(x : dict, max_length, pad_step, min_size, pad_input, pad_pos):
  """Split one sequence dict into a list of fixed-length window dicts.

  "pos_ids" and "input_ids" are windowed together (window length
  `max_length`, stride `pad_step`, at least `min_size` real positions per
  window). Every returned dict is a copy of `x` holding one window plus a
  "Traj" entry with the window's index.
  """
  base = x.copy()
  pos_windows, id_windows = pad_and_truncate_lists(A = base["pos_ids"], B = base["input_ids"], max_size = max_length, pad_step = pad_step, n = min_size, A_pad = pad_pos, B_pad = pad_input)

  return [
    assign_values_dict(base, "pos_ids", pos_window, "input_ids", id_window, idx)
    for idx, (pos_window, id_window) in enumerate(zip(pos_windows, id_windows))
  ]

In [15]:
# Window every sequence: length 21, a new window every 12 steps, keeping windows with at least 6 non-pad positions.
seq_dict = [process_seq(seq, max_length = 21, pad_step = 12, min_size = 6, pad_input = pad_input, pad_pos = pad_pos_id) for seq in seq_dict]

In [16]:
# Flatten the list of per-sequence window lists into one flat list of window dicts.
seq_dict = [sub for l in seq_dict for sub in l]

## List of dicts to a dict keyed by game, play, trajectory window and side

In [17]:
from tqdm import tqdm

def merge_dicts_and_create_lists(list_of_dicts):
    """Group the values of many dicts by key.

    Args:
        list_of_dicts: sized collection of dicts.

    Returns:
        dict mapping every key seen to the list of its values, in input order.
        A progress bar is shown while merging.
    """
    merged_dict = {}

    with tqdm(total=len(list_of_dicts)) as pbar:
      for d in list_of_dicts:
        for key, value in d.items():
          # Append in place: `old_list + [value]` would copy the whole list on every insert
          merged_dict.setdefault(key, []).append(value)
        pbar.update(1)

    return merged_dict

def merge_dicts_and_create_lists_add(list_of_dicts):
    """Merge dicts by combining the values of shared keys with `+=`.

    For list values this concatenates them in input order.

    Args:
        list_of_dicts: iterable of dicts whose values support `+=`.

    Returns:
        A new dict. The input dicts and their values are left untouched.
    """
    import copy  # local import keeps this cell self-contained

    merged_dict = {}

    for d in list_of_dicts:
        for key, value in d.items():
            if key in merged_dict:
                merged_dict[key] += value
            else:
                # Store a shallow copy: keeping the reference would let the in-place
                # `+=` above grow the first input dict's own list
                merged_dict[key] = copy.copy(value)

    return merged_dict

In [18]:
# Wrap every window in a one-entry dict keyed by "<gameId>_<playId>_<Traj>_<OffDef_ID>".
seq_dict = [{"_".join([str(d["gameId"]), str(d["playId"]), str(d["Traj"]), str(d["OffDef_ID"])]): d} for d in seq_dict]
# Group the windows that share a key into one list per key (seq_dict becomes a dict from here on).
seq_dict  = merge_dicts_and_create_lists(seq_dict)
# Keep at most the first 11 windows per key (presumably one per player on a side - TODO confirm).
seq_dict = {k : v[:11] for k,v in seq_dict.items()}

100%|██████████| 2680360/2680360 [00:03<00:00, 709673.65it/s] 


## Sequence builder

In [19]:
def create_labels(ids, input_pad):
  """Build next-token labels for a list of token IDs.

  The labels are `ids` shifted left by one with -100 appended at the end.
  Any label outside range(0, input_pad) is replaced by -100 as well.
  """
  valid_ids = range(0, input_pad)
  shifted = ids[1:] + [-100]
  return [tok if tok in valid_ids else -100 for tok in shifted]

def create_single_traj(d : dict, input_pad):
  """Expand one window dict into aligned per-token feature lists.

  For the first window of a trajectory (Traj == 0) the module-level
  `start_token` is put in front of the inputs, with position 0 and token
  type 0. Other windows are used as they are. In both cases the last step
  is dropped from every list, so inputs and their shifted labels have the
  same length.

  Args:
    d: window dict with "pos_ids", "input_ids", "Traj" and the scalar
       metadata entries read below.
    input_pad: padding ID of the input stream. It is masked out in
       "attention_mask" and bounds the valid label range.

  Returns:
    dict of equally long lists, one per model input / bookkeeping field.
  """
  x = d.copy()
  tokens = x["input_ids"]

  # Only the first window of a trajectory carries the start token
  if x["Traj"] == 0:
    pos_ids = [0] + x["pos_ids"]
    input_ids = [start_token] + tokens
    token_type_ids = [0] + [1] * len(tokens)
  else:
    pos_ids = x["pos_ids"]
    input_ids = tokens
    token_type_ids = [1] * len(tokens)

  # Every output list keeps all steps but the last one
  kept = len(input_ids) - 1

  def repeat(value):
    # Broadcast a per-window scalar to one entry per kept step
    return [value] * kept

  return {"input_ids" : input_ids[:-1],
          "position_ids" : repeat(x["position_ID"]),
          "scrim_ids" : repeat(x["scrimmage_ID"]),
          "start_ids" : repeat(x["start_ID"]),
          "pos_ids" : pos_ids[:-1],
          "side_ids" : repeat(x["side_ID"]),
          "token_type_ids" : token_type_ids[:-1],
          "OffDef" : repeat(x["OffDef_ID"]),
          "PlayType" : repeat(x["PlayType_ID"]),
          "labels" : create_labels(input_ids, input_pad)[:-1],
          "attention_mask" : [0 if v == input_pad else 1 for v in input_ids][:-1],
          "Traj" : repeat(x["Traj"]),
          "nflId" : repeat(x["nflId"]),
          "playId" : repeat(x["playId"]),
          "gameId" : repeat(x["gameId"])}

def padd_to_length(team_traj, 
                   max_len, 
                   input_pad = pad_input, 
                   pos_id_pad = pad_pos_id, 
                   position_id_pad = pad_position_id,
                   scrim_id_pad = pad_scrimmage_id, 
                   start_id_pad = pad_start_id, 
                   labels_pad = -100, 
                   attention_pad = 0):
  """Right-pad every feature list of a merged trajectory up to `max_len`.

  Args:
    team_traj: dict of lists as produced by merging single trajectories.
      It is not modified: the result holds new lists.
    max_len: target length of every list.
    input_pad, pos_id_pad, position_id_pad, scrim_id_pad, start_id_pad,
    labels_pad, attention_pad: fill values of the corresponding streams.

  Returns:
    A new dict with the same keys. "side_ids", "OffDef", "PlayType", "Traj",
    "playId" and "gameId" are filled with their own first element,
    "token_type_ids" with 2 and "nflId" with -1000. Lists that are already
    `max_len` or longer are returned unpadded and untruncated.
  """
  def missing(key):
    # Number of entries a stream lacks to reach max_len (never negative)
    return max(max_len - len(team_traj[key]), 0)

  # These streams are all padded by the amount "input_ids" is short of max_len
  n_tokens = missing("input_ids")

  # (key, number of pad entries, fill value). The fill is wrapped in a lambda so that
  # "first element" fills are only looked up when padding is actually needed.
  plan = (
    ("input_ids", n_tokens, lambda: input_pad),
    ("side_ids", missing("side_ids"), lambda: team_traj["side_ids"][0]),
    ("pos_ids", n_tokens, lambda: pos_id_pad),
    ("position_ids", missing("position_ids"), lambda: position_id_pad),
    ("start_ids", missing("start_ids"), lambda: start_id_pad),
    ("scrim_ids", missing("scrim_ids"), lambda: scrim_id_pad),
    ("OffDef", missing("OffDef"), lambda: team_traj["OffDef"][0]),
    ("token_type_ids", missing("token_type_ids"), lambda: 2),
    ("PlayType", missing("PlayType"), lambda: team_traj["PlayType"][0]),
    ("labels", n_tokens, lambda: labels_pad),
    ("attention_mask", n_tokens, lambda: attention_pad),
    ("Traj", n_tokens, lambda: team_traj["Traj"][0]),
    ("nflId", n_tokens, lambda: -1000),
    ("playId", n_tokens, lambda: team_traj["playId"][0]),
    ("gameId", n_tokens, lambda: team_traj["gameId"][0]),
  )

  padded_traj = team_traj.copy()
  for key, count, fill in plan:
    tail = [fill()] * count if count > 0 else []
    # Build a fresh list: an in-place `+=` on the shallow copy would also grow the caller's list
    padded_traj[key] = list(team_traj[key]) + tail

  return padded_traj


def create_team_traj(list_dicts, input_pad, max_len = 256):
  """Build one padded model sample from the window dicts grouped under a key.

  Each window is expanded with `create_single_traj`, the resulting lists are
  concatenated in order, and every stream is right-padded to `max_len`.

  Args:
    list_dicts: window dicts belonging to the same group.
    input_pad: padding ID of the input stream, used both when expanding the
      windows and when padding the merged sequence so the two always agree.
    max_len: length every stream is padded to (defaults to 256).

  Returns:
    dict mapping each stream name to a numpy array.
  """
  trajs = [create_single_traj(d, input_pad) for d in list_dicts]
  merged = merge_dicts_and_create_lists_add(trajs)
  merged = padd_to_length(merged, max_len, input_pad = input_pad)
  return {k : np.array(v) for k,v in merged.items()}

In [20]:
# Display the input padding ID that is passed to create_team_traj in a later cell.
pad_input

11222

In [21]:
# One row per grouped key, paired with the PlayType_ID of the first window stored under it.
# NOTE(review): np.array on rows mixing a str key with another value coerces everything to one
# dtype, so PlayType_ID is presumably held as text from here on - confirm.
plays = np.array([[v, seq_dict[v][0]["PlayType_ID"]] for v in seq_dict.keys()])
# "game_play_id" holds the seq_dict keys; duplicates are dropped and the index reset.
plays_df = pd.DataFrame(plays, columns = ["game_play_id", "PlayType_ID"]).drop_duplicates().reset_index(drop = True)

In [22]:
from sklearn.model_selection import train_test_split

# Fixed seed so the 80/20 split, and the CSVs written from it, can be reproduced after a kernel restart.
SPLIT_SEED = 42

# NOTE(review): rows are split per grouped key, so keys that differ only in window index or
# OffDef_ID can land on different sides of the split - confirm this is intended.
train, test = train_test_split(plays_df, test_size = 0.2, random_state = SPLIT_SEED)

train_games = train["game_play_id"].to_numpy()
test_games = test["game_play_id"].to_numpy()

In [23]:
# Number of keys in the training split.
print(len(train_games))

233862


In [24]:
# Number of keys in the test split.
print(len(test_games))

58466


In [25]:
# Persist both splits as ';'-separated CSV files without the index column.
train.to_csv("models/modeling/QBGPT/splits/train_split.csv", sep = ";", index = False)
test.to_csv("models/modeling/QBGPT/splits/test_split.csv", sep = ";", index = False)

In [26]:
# Build one padded sample per key for each split; tqdm shows one progress bar per split.
train_dict = {
    key: create_team_traj(seq_dict[key], pad_input)
    for key in tqdm(train_games, total=len(train_games))
}

test_dict = {
    key: create_team_traj(seq_dict[key], pad_input)
    for key in tqdm(test_games, total=len(test_games))
}

100%|██████████| 233862/233862 [00:54<00:00, 4291.24it/s]
100%|██████████| 58466/58466 [00:14<00:00, 4072.02it/s]


In [27]:
def compile_traj(list_of_trajs):
    """Collect many trajectory dicts into one dict of lists.

    Args:
        list_of_trajs: list of dicts sharing the same keys.

    Returns:
        dict mapping every key of the first dict to the list of the values
        found under that key, in input order. An empty input yields {}.

    Raises:
        KeyError: if a later dict holds a key missing from the first one.
    """
    # Nothing to take the key set from
    if len(list_of_trajs) == 0:
        return {}

    merged_dict = {k : [] for k in list_of_trajs[0].keys()}

    with tqdm(total=len(list_of_trajs)) as pbar:
      for d in list_of_trajs:
        for key, value in d.items():
          merged_dict[key].append(value)
        pbar.update(1)

    return merged_dict

In [28]:
# Drop the keys and regroup the samples by stream: each stream name maps to a list with one entry per sample.
train_dict = compile_traj(list(train_dict.values()))
test_dict = compile_traj(list(test_dict.values()))

100%|██████████| 233862/233862 [00:00<00:00, 594797.88it/s]
100%|██████████| 58466/58466 [00:00<00:00, 703478.54it/s]


In [29]:
# Stack each stream's per-sample arrays row-wise into a single array (one row per sample).
train_dict = {key : np.vstack(value) for key,value in train_dict.items()}
test_dict = {key : np.vstack(value) for key,value in test_dict.items()}

In [30]:
# Long-format view of the training arrays: every stream is flattened into one column.
train_check = pl.DataFrame({k : v.flatten() for k,v in train_dict.items()})

In [31]:
# Distinct Start_ID values present in the mapped data, as a Python list.
ui = (data.
 select("Start_ID").
 unique().
 to_series().
 to_list())

In [32]:
import tensorflow as tf

# Each dataset element is a (features dict, labels) pair sliced along the first axis.
# Note that the "labels" array is used as the target and also stays inside the features dict.
train_labels = train_dict["labels"]
train_dataset = tf.data.Dataset.from_tensor_slices((train_dict, train_labels))

test_labels = test_dict["labels"]
test_dataset = tf.data.Dataset.from_tensor_slices((test_dict, test_labels))

In [33]:
# Distinct label values in the training set (-100 marks positions without a target).
see = np.unique(train_dict["labels"].flatten())
see

array([ -100,     0,     2, ..., 11218, 11219, 11220])

In [34]:
# Distinct start IDs in the training set, including the padding value.
see = np.unique(train_dict["start_ids"].flatten())
see

array([   0,    1,    2, ..., 2110, 2111, 2113])

In [35]:
# Distinct label values in the test set.
see = np.unique(test_dict["labels"].flatten())
see

array([ -100,     1,     7, ..., 11207, 11208, 11213])

In [36]:
# Write both tf.data datasets to disk under data_models/QBGPT/.
tf.data.Dataset.save(train_dataset, "data_models/QBGPT/train_tokens_NFL_GPT_v2")
tf.data.Dataset.save(test_dataset, "data_models/QBGPT/test_tokens_NFL_GPT_v2")

In [37]:
# Save the same test dataset a second time under an "_eval" path.
tf.data.Dataset.save(test_dataset, "data_models/QBGPT/test_tokens_NFL_GPT_v2_eval")

In [39]:
# Long-format view of the training arrays, built the same way as train_check, used by the checks below.
essai = pl.DataFrame({k: v.flatten() for k,v in train_dict.items()})

In [41]:
# Count the ignored labels (-100) per pos_ids value, sorted by ascending count.
# NOTE(review): newer polars releases rename GroupBy.count to len - check the installed version.
(essai.
 select("pos_ids", "labels").
 group_by("pos_ids", "labels").
 count().
 filter(pl.col("labels") == -100).
 sort("count"))

pos_ids,labels,count
i64,i64,u32
10,-100,117
11,-100,245
12,-100,895
13,-100,4041
47,-100,5664
46,-100,6345
48,-100,6381
49,-100,6660
38,-100,7336
39,-100,7976


In [128]:
# Same check, filtering first: number of ignored labels (-100) per pos_ids value, unsorted.
(essai.
 filter(pl.col("labels") == -100).
 select("pos_ids").
 group_by("pos_ids").
 count())

pos_ids,count
i64,u32
48,6174
24,24720
16,12744
40,8053
32,36197
25,27727
41,9584
33,16253
49,6495
17,14349
