In [1]:
import os
import pandas as pd
import numpy as np
import polars as pl

In [2]:
os.listdir()

['positions_index.parquet',
 'final_df.parquet',
 '.DS_Store',
 'train_play_prediction_binary',
 'scrimmage_index.parquet',
 'starts_index.parquet',
 'index.parquet',
 'train_split.csv',
 'test_play_prediction_binary',
 '4_play_pred.ipynb',
 'moves_index.parquet',
 '2_dataset_mapping.ipynb',
 'time_index.parquet',
 'mapped_df.parquet',
 'class_weights.parquet',
 'test_split.csv',
 'test_play_prediction_categ',
 'train_test_split.csv',
 'train_play_prediction_categ',
 '3_sequence_creation.ipynb',
 'plays_index.parquet']

In [3]:
data = pl.read_parquet("mapped_df.parquet")

In [4]:
index = pl.read_parquet("index.parquet")
index

Cat,ID
str,i32
"""Pos""",0
"""Moves""",0
"""Start""",0
"""Scrimm""",0
"""Scrimm""",1
"""Start""",1
"""Pos""",1
"""Moves""",1
"""Pos""",2
"""Start""",2


In [5]:
zones_max = 11164
starts_max = 1983
scrimmage_max = 99
positions_max = 28

start_token = 11164

pad_input = 11165
pad_pos_id = 51
pad_position_id = 28
pad_scrimmage_id = 99
pad_start_id = 1983

In [6]:
pad_input

11165

## Dataframe to a list of dicts

In [7]:
def sort_list_and_reproduce(a, b):
    # Pair elements from lists A and B
    paired_lists = list(zip(a, b))

    # Sort the paired lists based on list A
    paired_lists.sort(key=lambda x: x[0])

    # Unpack the sorted pairs into new lists A and B
    sorted_a, sorted_b = zip(*paired_lists)

    return list(sorted_a), list(sorted_b)

def sort_seq(x : dict):
  new_dict = x.copy()
  new_pos, new_ids = sort_list_and_reproduce(new_dict["pos_ids"], new_dict["input_ids"])
  new_dict["pos_ids"] = new_pos
  new_dict["input_ids"] = new_ids
  return new_dict

In [8]:
seq_dict = [{"pos_ids" : row["Frame_ID"],
             "input_ids": row["Zone_ID"],
             "start_ID" : row["Start_ID"],
             "scrimmage_ID" : row["Scrimmage_ID"],
             "position_ID" : row["position_ID"],
             "side_ID" : row["side_ID"],
             "OffDef_ID" : row["OffDef_ID"],
             "PlayType_ID" : row["PlayType_ID"],
             "nflId" : row["nflId"],
             "playId" : row["playId"],
             "gameId" : row["gameId"]} for row in data.iter_rows(named=True)]

del data

In [9]:
seq_dict = [sort_seq(d) for d in seq_dict]

In [10]:
def pad_and_truncate_lists(A, B, max_size, pad_step, n, A_pad, B_pad):
    # Determine the current length of lists A and B
    current_length = len(A)

    # If the lists are smaller than max_size, pad them with -100
    if current_length < max_size:
        padding_length = max_size - current_length
        A.extend([A_pad] * padding_length)
        B.extend([B_pad] * padding_length)

    result_A, result_B = [], []
    for i in range(0, current_length, pad_step):
        truncated_A = A[i:i + max_size]
        truncated_B = B[i:i + max_size]

        # Count the number of elements that are different from -100
        non_negative_count = len([x for x in truncated_A if x != A_pad])

        if non_negative_count >= n:
            if len(truncated_A) < max_size:
                truncated_A.extend([A_pad] * (max_size - len(truncated_A)))
                truncated_B.extend([B_pad] * (max_size - len(truncated_B)))
            result_A.append(truncated_A)
            result_B.append(truncated_B)

    return result_A, result_B

def assign_values_dict(d, key_a, val_a, key_b, val_b, i):
  new_dict = d.copy()
  new_dict[key_a] = val_a
  new_dict[key_b] = val_b
  new_dict["Traj"] = i

  return new_dict

def process_seq(x : dict, max_length, pad_step, min_size, pad_input, pad_pos):
  new_dict = x.copy()
  new_pos, new_ids = pad_and_truncate_lists(A = new_dict["pos_ids"], B = new_dict["input_ids"], max_size = max_length, pad_step = pad_step, n = min_size, A_pad = pad_pos, B_pad = pad_input)
  list_new_dicts = [assign_values_dict(new_dict, "pos_ids", new_pos[i], "input_ids", new_ids[i], i) for i in range(len(new_pos))]

  return list_new_dicts

In [11]:
seq_dict = [process_seq(seq, max_length = 21, pad_step = 12, min_size = 6, pad_input = pad_input, pad_pos = pad_pos_id) for seq in seq_dict]

In [12]:
seq_dict = [sub for l in seq_dict for sub in l]

## List of dicts to a dict of games, plays and traj

In [13]:
from tqdm import tqdm

def merge_dicts_and_create_lists(list_of_dicts):
    merged_dict = {}

    with tqdm(total=len(list_of_dicts)) as pbar:
      for d in list_of_dicts:
        for key, value in d.items():
          merged_dict[key] = merged_dict.get(key, []) + [value]
        pbar.update(1)

    return merged_dict

def merge_dicts_and_create_lists_add(list_of_dicts):
    merged_dict = {}

    for d in list_of_dicts:
        for key, value in d.items():
            if key in merged_dict:
                merged_dict[key] += value
            else:
                merged_dict[key] = value

    return merged_dict

In [14]:
seq_dict = [{"_".join([str(d["gameId"]), str(d["playId"]), str(d["Traj"]), str(d["OffDef_ID"])]): d} for d in seq_dict]
seq_dict  = merge_dicts_and_create_lists(seq_dict)
seq_dict = {k : v[:11] for k,v in seq_dict.items()}

100%|██████████| 2348029/2348029 [00:01<00:00, 1450428.75it/s]


## Sequence builder

In [15]:
def create_labels(ids, input_pad):
  labels = ids[1:] + [-100]
  labels = [-100 if v not in range(0,11164+1) else v for v in labels]
  return labels

def labels_traj_spec(final_dict):
  minimum_time_step = min([v for v in final_dict["pos_ids"] if v !=0])
  if minimum_time_step > 1:
    new_labels = [-100 if final_dict["pos_ids"][i] in [0, minimum_time_step] else final_dict["labels"][i] for i in range(len(final_dict["pos_ids"]))]
  else:
    new_labels = final_dict["labels"]
  final_dict["labels"] = new_labels
  return final_dict

def create_single_traj(x : dict, input_pad):
  
  
  pos_ids = [0] + x["pos_ids"]
  input_ids = [start_token] + x["input_ids"]
  token_type_ids = [0] + [1 for v in x["input_ids"]]
  length = len(x["input_ids"])
  start_ids = [x["start_ID"] for v in range(length)]
  scrim_ids = [x["scrimmage_ID"] for v in range(length)]
  position_ids = [x["position_ID"] for v in range(length)]
  side_ids = [x["side_ID"] for v in range(length)]
  playtype_ids = [x["PlayType_ID"] for v in range(length)]
  team_ids = [x["OffDef_ID"] for v in range(length)]

  single_traj = {"input_ids" : input_ids,
                 "position_ids" : position_ids,
                 "scrim_ids" : scrim_ids,
                 "start_ids" : start_ids,
                 "pos_ids" : pos_ids,
                 "side_ids" : side_ids,
                 "token_type_ids" : token_type_ids,
                 "OffDef" : team_ids,
                 "PlayType" : playtype_ids,
                 "labels" : create_labels(input_ids, input_pad),
                 "attention_mask" : [1 if v != input_pad else 0 for v in input_ids]}

  return single_traj

def padd_to_length(team_traj, 
                   max_len, 
                   input_pad = pad_input, 
                   pos_id_pad = pad_pos_id, 
                   position_id_pad = pad_position_id,
                   scrim_id_pad = pad_scrimmage_id, 
                   start_id_pad = pad_start_id, 
                   labels_pad = -100, 
                   attention_pad = 0):
  current_length = len(team_traj["input_ids"])
  current_length_sides = len(team_traj["side_ids"])
  current_length_positions = len(team_traj["position_ids"])
  current_length_start = len(team_traj["start_ids"])
  current_length_scrim = len(team_traj["scrim_ids"])
  current_length_types = len(team_traj["token_type_ids"])
  current_length_OffDef = len(team_traj["OffDef"])
  current_length_PlayType = len(team_traj["PlayType"])

  to_pad = max_len - current_length
  to_pad_sides = max_len - current_length_sides
  to_pad_OffDef = max_len - current_length_OffDef
  to_pad_position = max_len - current_length_positions
  to_pad_start = max_len - current_length_start
  to_pad_scrim = max_len - current_length_scrim
  to_pad_types = max_len - current_length_types
  to_pad_PlayType = max_len - current_length_PlayType

  padded_input = [input_pad for i in range(to_pad)]
  padded_pos = [pos_id_pad for i in range(to_pad)]
  padded_position = [position_id_pad for i in range(to_pad_position)]
  padded_start = [start_id_pad for i in range(to_pad_start)]
  padded_scrim = [scrim_id_pad for i in range(to_pad_scrim)]
  padded_labels = [labels_pad for i in range(to_pad)]
  padded_attention = [attention_pad for i in range(to_pad)]
  padded_sides = [team_traj["side_ids"][0] for i in range(to_pad_sides)]
  padded_token_types = [3 for i in range(to_pad_types)]
  padded_OffDef = [team_traj["OffDef"][0] for i in range(to_pad_OffDef)]
  padded_PlayType = [team_traj["PlayType"][0] for i in range(to_pad_PlayType)]

  padded_traj = team_traj.copy()

  padded_traj["input_ids"] += padded_input
  padded_traj["side_ids"] += padded_sides
  padded_traj["pos_ids"] += padded_pos
  padded_traj["position_ids"] += padded_position
  padded_traj["start_ids"] += padded_start
  padded_traj["scrim_ids"] += padded_scrim
  padded_traj["OffDef"] += padded_OffDef
  padded_traj["token_type_ids"] += padded_token_types
  padded_traj["PlayType"] += padded_PlayType
  padded_traj["labels"] += padded_labels
  padded_traj["attention_mask"] += padded_attention

  return padded_traj


def create_team_traj(list_dicts, input_pad):
  trajs = [create_single_traj(d, input_pad) for d in list_dicts]

  merged = merge_dicts_and_create_lists_add(trajs)
  merged = padd_to_length(merged, 256)
  merged = labels_traj_spec(merged)

  return {k : np.array(v) for k,v in merged.items()}

In [16]:
pad_input

11165

In [17]:
plays = np.array([[v, seq_dict[v][0]["PlayType_ID"]] for v in seq_dict.keys()])
plays_df = pd.DataFrame(plays, columns = ["game_play_id", "PlayType_ID"]).drop_duplicates().reset_index(drop = True)

In [18]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(plays_df, test_size = 0.2 )

train_games = train["game_play_id"].to_numpy()
test_games = test["game_play_id"].to_numpy()

In [19]:
print(len(train_games))

205851


In [20]:
print(len(test_games))

51463


In [21]:
train.to_csv("train_split.csv", sep = ";", index = False)
test.to_csv("test_split.csv", sep = ";", index = False)

In [22]:
train_dict = {}
test_dict = {}

with tqdm(total=len(train_games)) as pbar:
    for key in train_games:
      train_dict[key] = create_team_traj(seq_dict[key], pad_input)
      pbar.update(1)  # Update the progress bar

with tqdm(total=len(test_games)) as pbar:
    for key in test_games:
      test_dict[key] = create_team_traj(seq_dict[key], pad_input)
      pbar.update(1)  # Update the progress bar

100%|██████████| 205851/205851 [00:38<00:00, 5367.96it/s]
100%|██████████| 51463/51463 [00:09<00:00, 5299.33it/s]


In [40]:
see = create_single_traj(seq_dict["2018120902_1430_0_1"][0], pad_input)
see["input_ids"]

[11164,
 5517,
 5517,
 5517,
 5517,
 5517,
 5517,
 5517,
 5517,
 5435,
 5436,
 5352,
 5270,
 5182,
 5010,
 4925,
 4842,
 4680,
 4599,
 4514,
 4428,
 4346]

In [41]:
see["labels"]

[5517,
 5517,
 5517,
 5517,
 5517,
 5517,
 5517,
 5517,
 5435,
 5436,
 5352,
 5270,
 5182,
 5010,
 4925,
 4842,
 4680,
 4599,
 4514,
 4428,
 4346,
 -100]

In [25]:
see = create_team_traj(seq_dict["2018120902_1430_0_1"], pad_input)
see.keys()

dict_keys(['input_ids', 'position_ids', 'scrim_ids', 'start_ids', 'pos_ids', 'side_ids', 'token_type_ids', 'OffDef', 'PlayType', 'labels', 'attention_mask'])

In [42]:
see["pos_ids"]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

In [27]:
see["labels"]

array([5517, 5517, 5517, 5517, 5517, 5517, 5517, 5517, 5435, 5436, 5352,
       5270, 5182, 5010, 4925, 4842, 4680, 4599, 4514, 4428, 4346, -100,
       5517, 5517, 5517, 5517, 5517, 5517, 5517, 5517, 5517, 5517, 5435,
       5435, 5352, 5269, 5182, 5182, 5095, 5011, 4926, 4844, 4845, -100,
       5517, 5517, 5517, 5517, 5517, 5517, 5517, 5517, 5517, 5435, 5435,
       5352, 5353, 5271, 5272, 5273, 5187, 5188, 5190, 5191, 5193, -100,
       5517, 5517, 5517, 5517, 5517, 5517, 5517, 5598, 5598, 5681, 5681,
       5763, 5848, 5931, 5931, 6014, 6015, 6103, 6104, 6104, 6105, -100,
       5517, 5517, 5517, 5517, 5517, 5517, 5517, 5517, 5517, 5435, 5351,
       5268, 5269, 5181, 5094, 5009, 4925, 4843, 4763, 4682, 4602, -100,
       5517, 5517, 5517, 5517, 5517, 5517, 5517, 5517, 5516, 5516, 5433,
       5432, 5347, 5346, 5345, 5428, 5427, 5425, 5424, 5423, 5338, -100,
       -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
       -100, -100, -100, -100, -100, -100, -100, -1

In [28]:
see["position_ids"]

array([27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
       27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
       27, 27, 27, 27, 27, 27, 27, 27, 26, 26, 26, 26, 26, 26, 26, 26, 26,
       26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 20, 20, 20, 20, 20,
       20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 27,
       27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27, 27,
       27, 27, 27, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21,
       21, 21, 21, 21, 21, 21, 21, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
       28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 28,
       28, 28, 28, 28, 28

In [29]:
see["token_type_ids"]

array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [30]:
see["start_ids"]

array([ 978,  978,  978,  978,  978,  978,  978,  978,  978,  978,  978,
        978,  978,  978,  978,  978,  978,  978,  978,  978,  978,  955,
        955,  955,  955,  955,  955,  955,  955,  955,  955,  955,  955,
        955,  955,  955,  955,  955,  955,  955,  955,  955,  941,  941,
        941,  941,  941,  941,  941,  941,  941,  941,  941,  941,  941,
        941,  941,  941,  941,  941,  941,  941,  941,  938,  938,  938,
        938,  938,  938,  938,  938,  938,  938,  938,  938,  938,  938,
        938,  938,  938,  938,  938,  938,  938,  946,  946,  946,  946,
        946,  946,  946,  946,  946,  946,  946,  946,  946,  946,  946,
        946,  946,  946,  946,  946,  946,  648,  648,  648,  648,  648,
        648,  648,  648,  648,  648,  648,  648,  648,  648,  648,  648,
        648,  648,  648,  648,  648, 1983, 1983, 1983, 1983, 1983, 1983,
       1983, 1983, 1983, 1983, 1983, 1983, 1983, 1983, 1983, 1983, 1983,
       1983, 1983, 1983, 1983, 1983, 1983, 1983, 19

In [31]:
see["scrim_ids"]

array([49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49,
       49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49,
       49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49,
       49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49,
       49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49,
       49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49,
       49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49, 49,
       49, 49, 49, 49, 49, 49, 49, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
       99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
       99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
       99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
       99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
       99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
       99, 99, 99, 99, 99

In [32]:
def compile_traj(list_of_trajs):
    merged_dict = {k : [] for k in list_of_trajs[0].keys()}

    with tqdm(total=len(list_of_trajs)) as pbar:
      for d in list_of_trajs:
        for key, value in d.items():
          merged_dict[key] += [value]
        pbar.update(1)

    return merged_dict

In [33]:
train_dict = compile_traj(list(train_dict.values()))
test_dict = compile_traj(list(test_dict.values()))

100%|██████████| 205851/205851 [00:00<00:00, 751283.83it/s]
100%|██████████| 51463/51463 [00:00<00:00, 1042332.70it/s]


In [34]:
train_dict = {key : np.vstack(value) for key,value in train_dict.items()}
test_dict = {key : np.vstack(value) for key,value in test_dict.items()}

In [35]:
import tensorflow as tf

train_labels = train_dict["labels"]
train_dataset = tf.data.Dataset.from_tensor_slices((train_dict, train_labels))

test_labels = test_dict["labels"]
test_dataset = tf.data.Dataset.from_tensor_slices((test_dict, test_labels))

In [36]:
see = np.unique(train_dict["labels"].flatten())

In [37]:
see

array([ -100,     0,     1, ..., 11161, 11162, 11163])

In [38]:
see = np.unique(test_dict["labels"].flatten())
see

array([ -100,     7,     9, ..., 11138, 11140, 11144])

In [39]:
tf.data.Dataset.save(train_dataset, "train_tokens_NFL_GPT")
tf.data.Dataset.save(test_dataset, "test_tokens_NFL_GPT")