In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    full_paths = (os.path.join(root, name) for name in files)
    for path in full_paths:
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install pytorch_tabnet; the TabNetRegressor imported later in this notebook
# comes from this package. Runs as a shell command in the notebook session.
!pip install pytorch_tabnet

In [None]:
# ============================
# Data paths
# ============================
# Folder with the weekly training CSVs (input_2023_wXX.csv / output_2023_wXX.csv).
DIRECTORIO_DATOS = "/kaggle/input/nfl-big-data-bowl-2026-prediction/train"
# Competition root folder; test_input.csv and test.csv are read from here.
DIRECTORIO_TEST = "/kaggle/input/nfl-big-data-bowl-2026-prediction"


def cargar_entradas_y_salidas(directorio: str, semanas=range(1, 19)):
    """
    Load and concatenate the weekly input (pre-release) and output
    (post-release) CSV files found in ``directorio``.

    Files are looked up as ``input_2023_wXX.csv`` and ``output_2023_wXX.csv``
    (two-digit week number). A week whose file does not exist is skipped;
    input and output files are checked independently of each other.

    Parameters
    ----------
    directorio : str
        Folder that contains the weekly CSV files.
    semanas : iterable of int, optional
        Week numbers to try, in the order they should be concatenated.
        Defaults to weeks 1 through 18.

    Returns
    -------
    datos_entrada : DataFrame
        All pre-release frames of every available week, with a fresh
        0..n-1 index.
    datos_salida : DataFrame
        All post-release frames (the rows to be predicted) of every
        available week, with a fresh 0..n-1 index.

    Raises
    ------
    FileNotFoundError
        If no input file, or no output file, exists for any requested week.
        The message names the directory that was actually searched.
    """
    # Materialize once so a one-shot iterable can be traversed for both prefixes.
    semanas = list(semanas)

    def _leer_semanas(prefijo: str):
        """Concatenate every existing ``<prefijo>_2023_wXX.csv`` in week order."""
        tablas = []
        for semana in semanas:
            ruta = os.path.join(directorio, f"{prefijo}_2023_w{semana:02d}.csv")
            if os.path.exists(ruta):
                tablas.append(pd.read_csv(ruta))

        if not tablas:
            # Report the real search location instead of a hard-coded constant name.
            raise FileNotFoundError(
                f"No se encontraron archivos {prefijo}_2023_wXX.csv en {directorio}."
            )
        return pd.concat(tablas, ignore_index=True)

    datos_entrada = _leer_semanas("input")
    datos_salida = _leer_semanas("output")

    return datos_entrada, datos_salida


def cargar_test_pre_release(directorio_test: str):
    """
    Load the pre-release test file ``test_input.csv`` from ``directorio_test``.

    Parameters
    ----------
    directorio_test : str
        Folder expected to contain ``test_input.csv``.

    Returns
    -------
    DataFrame
        Contents of ``test_input.csv`` as parsed by ``pandas.read_csv``.

    Raises
    ------
    FileNotFoundError
        If ``test_input.csv`` is not present; the message names the
        directory that was actually searched.
    """
    ruta_test_input = os.path.join(directorio_test, "test_input.csv")
    if not os.path.exists(ruta_test_input):
        # Report the real search location instead of a hard-coded constant name.
        raise FileNotFoundError(
            f"No se encontró test_input.csv en {directorio_test}."
        )
    return pd.read_csv(ruta_test_input)


# ============================
# Main data load
# ============================
# Training frames (pre-release inputs and post-release outputs) for all weeks found.
entrada_, salida_ = cargar_entradas_y_salidas(DIRECTORIO_DATOS)
# Pre-release frames of the test set.
entrada_test_ = cargar_test_pre_release(DIRECTORIO_TEST)

# Quick sanity check: row and column counts of each loaded frame.
print("=== Estructura general de los DataFrames ===")
print(f"· Entrenamiento pre-release  : filas = {entrada_.shape[0]:,} | columnas = {entrada_.shape[1]}")
print(f"· Entrenamiento post-release : filas = {salida_.shape[0]:,} | columnas = {salida_.shape[1]}")
print(f"· Test pre-release           : filas = {entrada_test_.shape[0]:,} | columnas = {entrada_test_.shape[1]}")
print()

In [None]:
# Summary of the pre-release training frame (columns, dtypes, non-null counts).
entrada_.info()

In [None]:
# Peek at the first three rows of the pre-release training frame.
entrada_.head(3)

# Preprocesamiento para dataset tabular base

## Solo con los jugadores `player_to_predict`

In [None]:
# Keep only the rows flagged as prediction targets (player_to_predict is True).
es_objetivo = entrada_["player_to_predict"] == True  # noqa: E712 - elementwise comparison
entrada_players = entrada_.loc[es_objetivo].copy()

print(f"jugadores objetivo = {entrada_players.shape}")



## Elegir un frame de referencia por jugador

* Tomamos, por defecto, el último frame pre-release de cada combinación (game_id, play_id, nfl_id). Ese será el “snapshot” que ve TabNet.
* Ahora `entrada_ref` tiene una fila por jugador a predecir.

In [None]:
id_cols = ["game_id", "play_id", "nfl_id"]
col_frame = "frame_id"   # name of the frame counter column in this dataset

# Order each player's frames by frame number, then keep the final row of every
# (game, play, player) group: the last frame recorded before the release.
frames_ordenados = entrada_players.sort_values(by=[*id_cols, col_frame])
entrada_ref = frames_ordenados.groupby(id_cols, as_index=False).tail(1)

print("Frames de referencia:", entrada_ref.shape)
entrada_ref.head()


## Bloque multi-frame

In [None]:
# =====================================
# 3. Build the multi-frame targets
#    target_cols order: (x_t0, y_t0, x_t1, y_t1, ..., x_tK, y_tK)
# =====================================

id_cols = ["game_id", "play_id", "nfl_id"]

# Largest number of future frames found in the data for any target row
T_MAX = int(entrada_players["num_frames_output"].max())
print("T_MAX detectado (máx num_frames_output):", T_MAX)

# Sort by ids and frame, then number each player's output frames 0, 1, 2, ...
salida_sorted = salida_.sort_values(id_cols + ["frame_id"])
salida_sorted["t_idx"] = salida_sorted.groupby(id_cols).cumcount()

# Keep only frames inside the horizon [0 .. T_MAX-1]
salida_sorted = salida_sorted[salida_sorted["t_idx"] < T_MAX].copy()

# Long -> wide: one row per (game, play, player), one column per (coordinate, t_idx)
targets_wide = salida_sorted.pivot_table(
    index=id_cols,
    columns="t_idx",
    values=["x", "y"]
)

# Flatten the column MultiIndex: ('x', 0) -> 'x_t0'
targets_wide.columns = [
    f"{var}_t{t}" for (var, t) in targets_wide.columns
]
targets_wide = targets_wide.reset_index()

# pivot_table emits every x column first and every y column afterwards.
# Build the target list explicitly as interleaved (x_t, y_t) pairs so that
# positions 2*t and 2*t + 1 of the target matrix always belong to frame t.
pasos = sorted(
    int(c[len("x_t"):]) for c in targets_wide.columns if c.startswith("x_t")
)
x_cols = [f"x_t{t}" for t in pasos]
y_cols = [f"y_t{t}" for t in pasos]
faltantes = [c for c in y_cols if c not in targets_wide.columns]
assert not faltantes, f"Faltan columnas y para algunos frames: {faltantes[:5]}"

target_cols = [col for par in zip(x_cols, y_cols) for col in par]

print("targets_wide shape:", targets_wide.shape)
print("Ejemplo de target_cols:", target_cols[:10])

# Pad missing frames row-wise, separately per coordinate so an x value can
# never be propagated into a y column (or vice versa). Trajectories shorter
# than T_MAX therefore repeat their last known position.
# DataFrame.ffill/bfill replace the deprecated fillna(method=...) form.
targets_wide[x_cols] = targets_wide[x_cols].ffill(axis=1).bfill(axis=1)
targets_wide[y_cols] = targets_wide[y_cols].ffill(axis=1).bfill(axis=1)

# =====================================
# 4. Final merge: pre-release features + multi-frame targets
# =====================================

# Inner join: only players present in both the snapshot and the targets remain.
df_train_raw = entrada_ref.merge(
    targets_wide,
    on=id_cols,
    how="inner"
)

print("df_train_raw multi-frame:", df_train_raw.shape)
print(df_train_raw.columns[:25])


## Definir df_features y df_targets

In [None]:
# ============================
# Define X (features) and y (targets) for the multi-frame setup
# ============================

# Identifier columns: kept out of the feature matrix
id_cols = ["game_id", "play_id", "nfl_id"]

# Everything that must NOT be used as a feature: the ids, the targets
# themselves and a few descriptive player columns.
cols_no_features = [
    *id_cols,
    *target_cols,
    "player_name",
    "player_birth_date",
    "player_height",
    "player_to_predict",
]

df_targets  = df_train_raw.loc[:, target_cols].copy()
# errors="ignore": listed columns that are absent from the frame are skipped.
df_features = df_train_raw.drop(columns=cols_no_features, errors="ignore").copy()

feature_cols = list(df_features.columns)

print("Nº features:", len(feature_cols))
print("Nº targets:", len(target_cols))
print("Ejemplo feature_cols:", feature_cols[:10])
print("Ejemplo target_cols:", target_cols[:10])


## Marcar columnas categóricas y numéricas

In [None]:
# ============================
# Categorical vs numeric columns
# ============================

# Categorical columns we would like to use, when present among the features
cat_candidatas = ("play_direction", "player_position", "player_side", "player_role")
cat_cols = [col for col in cat_candidatas if col in feature_cols]

# Every remaining feature is treated as numeric
num_cols = [col for col in feature_cols if col not in cat_cols]

print("Categóricas:", cat_cols)
print("Numéricas  :", num_cols)


## Imputar NaNs

In [None]:
# ============================
# NaN imputation
# ============================

# Numeric columns: cast to float32, then fill gaps with that column's median
for columna in num_cols:
    serie = df_features[columna].astype("float32")
    df_features[columna] = serie.fillna(serie.median())

# Categorical columns: missing values become the literal label 'Unknown'
for columna in cat_cols:
    df_features[columna] = df_features[columna].fillna("Unknown")


## Codificar categóricas + cat_idxs y cat_dims

In [None]:
from sklearn.preprocessing import LabelEncoder

# ============================
# One LabelEncoder per categorical column
# ============================

# Fitted encoders are stored by column name so the same mapping can be
# re-applied to the test features later.
label_encoders = {}
for columna in cat_cols:
    codificador = LabelEncoder().fit(df_features[columna].astype(str))
    df_features[columna] = codificador.transform(df_features[columna].astype(str))
    label_encoders[columna] = codificador

# ============================
# cat_idxs and cat_dims, following the column order of feature_cols
# ============================

# (position, name) of every categorical feature inside feature_cols
posiciones_cat = [(i, col) for i, col in enumerate(feature_cols) if col in cat_cols]

cat_idxs = [i for i, _ in posiciones_cat]
cat_dims = [df_features[col].nunique() for _, col in posiciones_cat]

print("cat_idxs:", cat_idxs)
print("cat_dims:", cat_dims)



## X, y y split (ahora con y multi-frame)

In [None]:
from sklearn.model_selection import train_test_split
import numpy as np

# ============================
# Final matrices
# ============================

X = df_features[feature_cols].to_numpy().astype("float32")   # (N, n_features)
y = df_targets[target_cols].to_numpy().astype("float32")     # (N, n_targets)

# 80/20 random hold-out split with a fixed seed
particion = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = particion

print("X_train:", X_train.shape, "y_train:", y_train.shape)
print("X_valid:", X_valid.shape, "y_valid:", y_valid.shape)


In [None]:
# Display the target table (one column per coordinate and future frame).
df_targets[target_cols]

In [None]:
# Display the feature table after imputation and label encoding.
df_features[feature_cols]

In [None]:
# Display the float32 training matrix passed to the model.
X_train

In [None]:
# Spread of how many future frames each target row asks for.
frames_a_predecir = entrada_players["num_frames_output"]
for etiqueta, valor in (
    ("Mínimo:", frames_a_predecir.min()),
    ("Promedio:", frames_a_predecir.mean()),
    ("Máximo:", frames_a_predecir.max()),
):
    print(etiqueta, valor)


# Definir y entrenar TabNet (multi-output)

In [None]:
#!pip install pytorch-tabnet -q

from pytorch_tabnet.tab_model import TabNetRegressor
import torch

# Multi-output TabNet regressor: one output per column of y_train.
# NOTE(review): parameter meanings below follow the pytorch-tabnet docs — confirm
# against the installed version.
reg = TabNetRegressor(
    # Widths of the decision (n_d) and attention (n_a) representations
    n_d=16,
    n_a=16,
    # Number of sequential decision steps and the mask relaxation factor
    n_steps=4,
    gamma=1.5,
    n_independent=2,
    n_shared=2,
    # Weight of the sparsity regularization term
    lambda_sparse=1e-4,

    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=1e-3),

    mask_type="sparsemax",
    # Positions and cardinalities of the label-encoded categorical features,
    # as computed from feature_cols earlier in the notebook
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=2
)

# Train while monitoring RMSE on both the training and the validation split.
# NOTE(review): per the pytorch-tabnet docs the last eval_set entry ("valid")
# drives early stopping with the given patience — confirm.
reg.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_name=["train", "valid"],
    eval_metric=["rmse"],
    max_epochs=200,
    patience=30,
    batch_size=1024,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False
)


In [None]:
# ============================
# SUBMISSION: prediction over test_input + test.csv
# ============================

# Load test.csv (mock file with the structure of the future real test set);
# it lists the (game_id, play_id, nfl_id, frame_id) rows that get scored.
ruta_test_csv = os.path.join(DIRECTORIO_TEST, "test.csv")
test_df = pd.read_csv(ruta_test_csv)

id_cols = ["game_id", "play_id", "nfl_id"]

# 1) Pre-release snapshot for test (same recipe as for train)
entrada_test_players = entrada_test_.query("player_to_predict == True").copy()

entrada_test_ref = (
    entrada_test_players
      .sort_values(id_cols + ["frame_id"])
      .groupby(id_cols, as_index=False)
      .tail(1)   # last pre-release frame
)

print("entrada_test_ref:", entrada_test_ref.shape)

# 2) Build df_test_features with the SAME columns, in the same order, as train
df_test_features = entrada_test_ref[feature_cols].copy()

# === Test imputation: train medians for numerics, 'Unknown' for categoricals ===
for c in num_cols:
    df_test_features[c] = df_test_features[c].astype("float32")
    df_test_features[c] = df_test_features[c].fillna(df_features[c].median())

for c in cat_cols:
    le = label_encoders[c]
    conocidas = set(le.classes_)
    # Categories never seen in training go to 'Unknown' when the encoder knows
    # that label; otherwise fall back to the encoder's first class.
    respaldo = "Unknown" if "Unknown" in conocidas else le.classes_[0]
    # Compare as strings, because the encoder was fitted on astype(str) values.
    valores = df_test_features[c].fillna("Unknown").astype(str)
    valores = valores.where(valores.isin(conocidas), respaldo)
    df_test_features[c] = le.transform(valores)

# 3) X_test matrix
X_test = df_test_features[feature_cols].values.astype("float32")

# 4) Multi-frame prediction
y_test_pred = reg.predict(X_test)  # shape = (n_test_players, n_targets)

# 5) Wide prediction table, columns named like target_cols, plus the ids
pred_multi = pd.DataFrame(y_test_pred, columns=target_cols)
for c in id_cols:
    pred_multi[c] = entrada_test_ref[c].values

# 6) Expand to long format: one row per (player, frame_id).
#    Columns are looked up by name, so the result does not depend on how
#    target_cols happens to be ordered.
n_targets = len(target_cols)
n_frames = n_targets // 2  # (x_tk, y_tk) pairs
n_jugadores = len(pred_multi)

x_pred = pred_multi[[f"x_t{t}" for t in range(n_frames)]].to_numpy()
y_pred = pred_multi[[f"y_t{t}" for t in range(n_frames)]].to_numpy()

# Row-major flattening yields player 0 frames 1..n, then player 1, ... which
# matches repeating each id n_frames times and tiling the frame numbers.
pred_long = pd.DataFrame({
    "game_id": np.repeat(pred_multi["game_id"].to_numpy().astype("int64"), n_frames),
    "play_id": np.repeat(pred_multi["play_id"].to_numpy().astype("int64"), n_frames),
    "nfl_id":  np.repeat(pred_multi["nfl_id"].to_numpy().astype("int64"), n_frames),
    # t_idx 0 -> frame_id 1
    "frame_id": np.tile(np.arange(1, n_frames + 1), n_jugadores),
    "x": x_pred.reshape(-1).astype("float64"),
    "y": y_pred.reshape(-1).astype("float64"),
})

print("pred_long (toda la trayectoria):", pred_long.shape)

# 7) Keep only the frames that are actually evaluated (those listed in test.csv)
submission = test_df.merge(
    pred_long,
    on=["game_id", "play_id", "nfl_id", "frame_id"],
    how="left"
)

# 8) Save the final submission file
submission = submission[["game_id", "play_id", "nfl_id", "frame_id", "x", "y"]]

# A left join leaves NaN where no prediction exists (unknown player or a
# frame_id beyond n_frames); make that visible instead of failing silently.
sin_prediccion = int(submission[["x", "y"]].isna().any(axis=1).sum())
if sin_prediccion:
    print(f"AVISO: {sin_prediccion} filas de test.csv quedaron sin predicción (x/y = NaN)")

submission.to_csv("submission.csv", index=False)

print("submission listo:", submission.shape)
submission.head()


## Métricas multi-frame

In [None]:
from sklearn.metrics import mean_squared_error

# Predict every validation row at once; result is (n_valid, n_targets)
y_pred_valid = reg.predict(X_valid)

# Global RMSE: all coordinates of all frames pooled into a single vector
mse_global = mean_squared_error(y_valid.ravel(), y_pred_valid.ravel())
rmse_global = np.sqrt(mse_global)

print("RMSE Global multi-frame:", rmse_global)


In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# ============================
# GLOBAL METRICS (all frames and coordinates pooled)
# ============================

# Flatten everything: (N, n_targets) -> (N * n_targets,)
y_true_flat = y_valid.reshape(-1)
y_pred_flat = y_pred_valid.reshape(-1)

# MSE
mse_global = mean_squared_error(y_true_flat, y_pred_flat)

# RMSE
rmse_global = np.sqrt(mse_global)

# MAE
mae_global = mean_absolute_error(y_true_flat, y_pred_flat)

# MAPE. The denominator is floored on its magnitude, so values that are zero,
# negative or close to -epsilon cannot produce a division by (almost) zero.
epsilon = 1e-7
denominador = np.maximum(np.abs(y_valid), epsilon)
mape_global = np.mean(np.abs(y_valid - y_pred_valid) / denominador) * 100

# R²
r2_global = r2_score(y_true_flat, y_pred_flat)

print("====== MÉTRICAS GLOBALES ======")
print(f"MSE Global : {mse_global:.4f}")
print(f"RMSE Global: {rmse_global:.4f}")
print(f"MAE Global : {mae_global:.4f}")
print(f"MAPE Global: {mape_global:.4f}%")
print(f"R² Global  : {r2_global:.4f}")


# ============================
# (OPTIONAL) RMSE per frame
# ============================

n_targets = y_valid.shape[1]
assert n_targets == len(target_cols), "y_valid no coincide con target_cols"
assert n_targets % 2 == 0, "Se espera pares (x_t, y_t) por frame"

n_frames = n_targets // 2

# Column position of every target by name. The columns of y_valid follow
# target_cols, whose order comes from a pivot and is not guaranteed to be
# interleaved, so positions 2*t / 2*t + 1 must not be assumed.
pos_col = {nombre: i for i, nombre in enumerate(target_cols)}

rmse_por_frame = []

for t in range(n_frames):
    # indices of x_t and y_t inside the target vector
    idx_x = pos_col[f"x_t{t}"]
    idx_y = pos_col[f"y_t{t}"]

    y_true_xy = y_valid[:, [idx_x, idx_y]]
    y_pred_xy = y_pred_valid[:, [idx_x, idx_y]]

    mse_t = mean_squared_error(
        y_true_xy.reshape(-1),
        y_pred_xy.reshape(-1)
    )
    rmse_t = np.sqrt(mse_t)
    rmse_por_frame.append(rmse_t)

print("\n====== RMSE POR FRAME (x_t, y_t juntos) ======")
for t, rmse_t in enumerate(rmse_por_frame):
    print(f"Frame t={t}: RMSE = {rmse_t:.4f}")


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# ============================
# Official RMSE per future frame
# ============================

rmse_oficial = []

n_targets = y_valid.shape[1]
assert n_targets == len(target_cols), "y_valid no coincide con target_cols"
n_frames = n_targets // 2

# Locate each coordinate by column name: the columns of y_valid follow
# target_cols, whose order is not guaranteed to be interleaved (x_t, y_t).
pos_col = {nombre: i for i, nombre in enumerate(target_cols)}

for t in range(n_frames):
    idx_x = pos_col[f"x_t{t}"]
    idx_y = pos_col[f"y_t{t}"]

    x_true = y_valid[:, idx_x]
    y_true = y_valid[:, idx_y]
    x_pred = y_pred_valid[:, idx_x]
    y_pred = y_pred_valid[:, idx_y]

    # Squared x and y errors summed per row, averaged over rows and halved,
    # i.e. the mean squared error over both coordinates of frame t.
    mse_t = ((x_true - x_pred)**2 + (y_true - y_pred)**2).mean() / 2
    rmse_t = np.sqrt(mse_t)

    rmse_oficial.append(rmse_t)

# ============================
# Plot
# ============================

plt.figure(figsize=(10,5))
plt.plot(range(n_frames), rmse_oficial, marker='o', color='purple', linewidth=2)
plt.xlabel("Frame futuro (t)")
plt.ylabel("RMSE oficial")
plt.title("RMSE oficial (métrica del concurso) por frame")
plt.grid(True, linestyle="--", alpha=0.4)
plt.show()
