In [7]:
# ==== Imports básicos ====
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# ==== Listar archivos disponibles (útil para verificar rutas) ====
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        #print(os.path.join(dirname, filename))
        pass

## Carga de datos de entrenamiento
La base de datos de entrenamiento tiene input y outputs, estos vienen datos por semana y primero es necesario concatenerlas, después es necesario que se manejen muestras que coicidan en el input como en el output, en cuanto a juego y jugada, esto elimina por ejemplo jugadas que tenían entrada pero no salida.

In [8]:
import pandas as pd

# ===== 1. Cargar INPUT (todas las semanas) =====
def load_training_input(data_path='/kaggle/input/nfl-big-data-bowl-2026-prediction/train'):
    all_data = []
    for week in range(1, 19):
        file_path = f'{data_path}/input_2023_w{week:02d}.csv'
        try:
            df = pd.read_csv(file_path)
            all_data.append(df)
        except FileNotFoundError:
            pass
    train_df = pd.concat(all_data, ignore_index=True)
    return train_df

# ===== 2. Cargar OUTPUT (todas las semanas) =====
def load_training_output(data_path='/kaggle/input/nfl-big-data-bowl-2026-prediction/train'):
    all_outputs = []
    for week in range(1, 19):
        file_path = f'{data_path}/output_2023_w{week:02d}.csv'
        try:
            df_out = pd.read_csv(file_path)
            all_outputs.append(df_out)
        except FileNotFoundError:
            pass
    output_df = pd.concat(all_outputs, ignore_index=True)
    return output_df

# ===== 3. Normalizar columnas =====
def normalize(df):
    df = df.copy()
    if 'nflId' in df.columns:
        df = df.rename(columns={'nflId': 'nfl_id'})
    for col in ['game_id','play_id','frame_id','nfl_id']:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce').astype('Int64')
    return df

# ===== 4. Cargar datasets =====
train_df = normalize(load_training_input())
output_df = normalize(load_training_output())

# ===== 5. Filtro estricto: jugadas presentes en ambos =====
key_cols = ['game_id', 'play_id']
if 'frame_id' in train_df.columns and 'frame_id' in output_df.columns:
    key_cols.append('frame_id')
if 'nfl_id' in train_df.columns and 'nfl_id' in output_df.columns:
    key_cols.append('nfl_id')

# Intersección estricta
input_keys  = train_df[key_cols].drop_duplicates()
output_keys = output_df[key_cols].drop_duplicates()
common_keys = input_keys.merge(output_keys, on=key_cols, how='inner')

# Filtra ambos datasets para que contengan solo las jugadas que existen en los dos.
train_f  = train_df.merge(common_keys, on=key_cols, how='inner')
output_f = output_df.merge(common_keys, on=key_cols, how='inner')

# Unir ambos para ver todas las columnas
train_unido = train_f.merge(output_f, on=key_cols, how='inner', suffixes=('_in', '_out'))

# ===== 6. Mostrar dimensiones y una tabla =====
print(f"\n Dataset final tras filtrado estricto:")
print(f"Filas: {train_unido.shape[0]:,} | Columnas: {train_unido.shape[1]:,}\n")

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
from IPython.display import display
display(train_unido.head(10))



 Dataset final tras filtrado estricto:
Filas: 560,426 | Columnas: 25



Unnamed: 0,game_id,play_id,player_to_predict,nfl_id,frame_id,play_direction,absolute_yardline_number,player_name,player_height,player_weight,player_birth_date,player_position,player_side,player_role,x_in,y_in,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y,x_out,y_out
0,2023090700,101,True,46137,1,right,42,Justin Reid,6-1,204,1997-02-15,SS,Defense,Defensive Coverage,51.32,20.69,0.31,0.49,79.43,267.68,21,63.259998,-0.22,56.22,17.28
1,2023090700,101,True,46137,2,right,42,Justin Reid,6-1,204,1997-02-15,SS,Defense,Defensive Coverage,51.35,20.66,0.36,0.74,118.07,268.66,21,63.259998,-0.22,56.63,16.88
2,2023090700,101,True,46137,3,right,42,Justin Reid,6-1,204,1997-02-15,SS,Defense,Defensive Coverage,51.39,20.63,0.44,0.76,130.89,269.78,21,63.259998,-0.22,57.06,16.46
3,2023090700,101,True,46137,4,right,42,Justin Reid,6-1,204,1997-02-15,SS,Defense,Defensive Coverage,51.43,20.61,0.48,0.62,134.5,269.78,21,63.259998,-0.22,57.48,16.02
4,2023090700,101,True,46137,5,right,42,Justin Reid,6-1,204,1997-02-15,SS,Defense,Defensive Coverage,51.48,20.58,0.54,0.44,129.79,269.06,21,63.259998,-0.22,57.91,15.56
5,2023090700,101,True,46137,6,right,42,Justin Reid,6-1,204,1997-02-15,SS,Defense,Defensive Coverage,51.58,20.63,0.61,0.29,99.58,274.0,21,63.259998,-0.22,58.34,15.1
6,2023090700,101,True,46137,7,right,42,Justin Reid,6-1,204,1997-02-15,SS,Defense,Defensive Coverage,51.65,20.62,0.69,0.49,98.72,274.9,21,63.259998,-0.22,58.75,14.57
7,2023090700,101,True,46137,8,right,42,Justin Reid,6-1,204,1997-02-15,SS,Defense,Defensive Coverage,51.73,20.6,0.87,1.01,95.98,277.78,21,63.259998,-0.22,59.14,14.01
8,2023090700,101,True,46137,9,right,42,Justin Reid,6-1,204,1997-02-15,SS,Defense,Defensive Coverage,51.82,20.59,0.99,0.91,97.72,279.15,21,63.259998,-0.22,59.51,13.41
9,2023090700,101,True,46137,10,right,42,Justin Reid,6-1,204,1997-02-15,SS,Defense,Defensive Coverage,51.92,20.58,1.14,1.01,98.11,278.45,21,63.259998,-0.22,59.86,12.8


In [9]:
train_unido.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560426 entries, 0 to 560425
Data columns (total 25 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   game_id                   560426 non-null  Int64  
 1   play_id                   560426 non-null  Int64  
 2   player_to_predict         560426 non-null  bool   
 3   nfl_id                    560426 non-null  Int64  
 4   frame_id                  560426 non-null  Int64  
 5   play_direction            560426 non-null  object 
 6   absolute_yardline_number  560426 non-null  int64  
 7   player_name               560426 non-null  object 
 8   player_height             560426 non-null  object 
 9   player_weight             560426 non-null  int64  
 10  player_birth_date         560426 non-null  object 
 11  player_position           560426 non-null  object 
 12  player_side               560426 non-null  object 
 13  player_role               560426 non-null  o

## EDA 

In [None]:
from ydata_profiling import ProfileReport
profile = ProfileReport(train_unido, title="Informe de Profiling de Datos", explorative=True)
profile.to_file("Informe_EDA_Final.html")

In [10]:
#Matriz de correlación para variables numéricas, primero se retiran las categóricas 
trainCopy = train_unido.copy()
trainCopy.drop(
    [
        'player_to_predict',      # bool
        'player_position',        # object
        'player_side',            # object
        'player_role',            # object
        'play_direction',         # object
        'player_birth_date',      # object
        'player_height',         # object
        'player_name'
        
    ],
    axis=1,
    inplace=True
)

corr_matrix = trainCopy.corr() #función para calcular las correlaciones entre las variables
corr_matrix.style.background_gradient(cmap='coolwarm')

Unnamed: 0,game_id,play_id,nfl_id,frame_id,absolute_yardline_number,player_weight,x_in,y_in,s,a,dir,o,num_frames_output,ball_land_x,ball_land_y,x_out,y_out
game_id,1.0,-0.048175,0.025198,0.008903,0.000164,-0.005746,-0.004143,0.002838,0.007819,0.000392,0.00629,0.000704,0.012725,-0.013137,-0.003882,-0.010195,0.000853
play_id,-0.048175,1.0,0.013401,0.035027,-0.007978,-0.026695,-0.007105,0.003692,0.011448,-0.009505,-0.001405,-0.005558,0.061729,-0.003743,0.004816,-0.007267,0.002976
nfl_id,0.025198,0.013401,1.0,0.009506,-0.002079,-0.145233,-0.001568,0.015818,0.006906,0.015428,-0.004925,-0.006817,0.01439,-0.002359,0.011786,-0.002479,0.012666
frame_id,0.008903,0.035027,0.009506,1.0,-0.009145,-0.105705,-0.011126,-0.004293,0.706158,0.30976,0.002231,-0.003294,0.555645,-0.013171,-0.003167,-0.012872,-0.007298
absolute_yardline_number,0.000164,-0.007978,-0.002079,-0.009145,1.0,0.005745,0.95528,-0.001018,-0.006181,0.002596,0.088358,-0.030327,-0.021019,0.739103,-0.001903,0.810612,-0.002459
player_weight,-0.005746,-0.026695,-0.145233,-0.105705,0.005745,1.0,0.005079,0.003264,-0.114079,-0.010854,0.001372,0.008817,-0.180128,0.006906,0.003375,0.004938,0.002679
x_in,-0.004143,-0.007105,-0.001568,-0.011126,0.95528,0.005079,1.0,-0.000561,-0.008643,0.001722,-0.010141,0.146453,-0.024689,0.842526,-0.000691,0.910853,-0.002075
y_in,0.002838,0.003692,0.015818,-0.004293,-0.001018,0.003264,-0.000561,1.0,-0.00463,-0.002852,-0.000554,0.003752,-0.004991,0.003285,0.714359,0.0024,0.83097
s,0.007819,0.011448,0.006906,0.706158,-0.006181,-0.114079,-0.008643,-0.00463,1.0,0.427121,0.006307,-0.009489,0.408127,-0.012381,-0.004255,-0.011853,-0.008771
a,0.000392,-0.009505,0.015428,0.30976,0.002596,-0.010854,0.001722,-0.002852,0.427121,1.0,-0.001,-0.003072,0.089969,0.000108,-0.002912,0.00044,-0.004433


In [2]:
"""from cuml.utils import show_versions
show_versions()"""

# Tratamiento de variables categóricas

In [11]:
#FUNCIONES PARA LA ALTURA Y LA DIRECCIÓN
#Altura del jugador, de pies-pulgada a pulgadas
data = train_unido.copy()
def parse_height(height_str):
    if pd.isna(height_str):
        return np.nan
    try:
        feet, inches = map(int, str(height_str).split('-'))
        return feet * 12 + inches
    except:
        return np.nan


#Dirección, todos los de izquierda pasan a derecha 
# Crear una máscara para las jugadas que van a la izquierda
left_mask = train_unido['play_direction'] == 'left'

# Contar cuántas se van a normalizar
num_flipped = left_mask.sum()

# Invertir coordenadas X e Y
train_unido.loc[left_mask, 'x_in'] = 120 - train_unido.loc[left_mask, 'x_in']
train_unido.loc[left_mask, 'y_in'] = 53.3 - train_unido.loc[left_mask, 'y_in']

# Rotar los ángulos de dirección y orientación
train_unido.loc[left_mask, 'dir'] = (train_unido.loc[left_mask, 'dir'] + 180) % 360
train_unido.loc[left_mask, 'o'] = (train_unido.loc[left_mask, 'o'] + 180) % 360

# Si existen las columnas del aterrizaje del balón, también se normalizan
if 'ball_land_x' in train_unido.columns:
    train_unido.loc[left_mask, 'ball_land_x'] = 120 - train_unido.loc[left_mask, 'ball_land_x']
    train_unido.loc[left_mask, 'ball_land_y'] = 53.3 - train_unido.loc[left_mask, 'ball_land_y']



In [12]:
#Llama a la funciones y se hace get dummies para las categóricas que en este caso son: player side y player role 

train_unido['player_height'] = train_unido['player_height'].apply(parse_height)
train_unido = pd.get_dummies(train_unido, columns=['player_side','player_role'], prefix='pos') # SOLO CORRER UNA VEZ


In [20]:
#Quita las variables que no son predictorias y la posición de izquierda ya se pasó a derecha
train_unido.drop(['game_id', 'play_id','player_to_predict','nfl_id','frame_id','player_name','player_birth_date','player_position','play_direction'], axis=1, inplace=True)
train_unido.info()
train_unido.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560426 entries, 0 to 560425
Data columns (total 17 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   absolute_yardline_number  560426 non-null  int64  
 1   player_height             560426 non-null  int64  
 2   player_weight             560426 non-null  int64  
 3   x_in                      560426 non-null  float64
 4   y_in                      560426 non-null  float64
 5   s                         560426 non-null  float64
 6   a                         560426 non-null  float64
 7   dir                       560426 non-null  float64
 8   o                         560426 non-null  float64
 9   ball_land_x               560426 non-null  float64
 10  ball_land_y               560426 non-null  float64
 11  x_out                     560426 non-null  float64
 12  y_out                     560426 non-null  float64
 13  pos_Defense               560426 non-null  i

Unnamed: 0,absolute_yardline_number,player_height,player_weight,x_in,y_in,s,a,dir,o,ball_land_x,ball_land_y,x_out,y_out,pos_Defense,pos_Offense,pos_Defensive Coverage,pos_Targeted Receiver
0,42,73,204,51.32,20.69,0.31,0.49,79.43,267.68,63.259998,-0.22,56.22,17.28,1,0,1,0
1,42,73,204,51.35,20.66,0.36,0.74,118.07,268.66,63.259998,-0.22,56.63,16.88,1,0,1,0
2,42,73,204,51.39,20.63,0.44,0.76,130.89,269.78,63.259998,-0.22,57.06,16.46,1,0,1,0
3,42,73,204,51.43,20.61,0.48,0.62,134.5,269.78,63.259998,-0.22,57.48,16.02,1,0,1,0
4,42,73,204,51.48,20.58,0.54,0.44,129.79,269.06,63.259998,-0.22,57.91,15.56,1,0,1,0


In [22]:
#train_unido.drop(['num_frames_output'],axis=1, inplace= True)

In [23]:
train_unido = train_unido.astype({col: int for col in train_unido.select_dtypes('bool').columns})

train_unido.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560426 entries, 0 to 560425
Data columns (total 17 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   absolute_yardline_number  560426 non-null  int64  
 1   player_height             560426 non-null  int64  
 2   player_weight             560426 non-null  int64  
 3   x_in                      560426 non-null  float64
 4   y_in                      560426 non-null  float64
 5   s                         560426 non-null  float64
 6   a                         560426 non-null  float64
 7   dir                       560426 non-null  float64
 8   o                         560426 non-null  float64
 9   ball_land_x               560426 non-null  float64
 10  ball_land_y               560426 non-null  float64
 11  x_out                     560426 non-null  float64
 12  y_out                     560426 non-null  float64
 13  pos_Defense               560426 non-null  i

In [24]:
data = train_unido.copy()

# Partición de la base de datos para entrenamiento, validación y testeo

In [38]:
#Librerías para los modelos 
# Modelos lineales
from sklearn.linear_model import LinearRegression, Lasso, ElasticNet, SGDRegressor, BayesianRidge

# Modelos kernel y procesos gaussianos
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor

# Máquinas de soporte vectorial
from sklearn.svm import SVR  # Support Vector Regressor

# Ensambles (árboles de decisión)
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# XGBoost
from xgboost import XGBRegressor

# Utilidades comunes
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

In [27]:
data.describe()

Unnamed: 0,absolute_yardline_number,player_height,player_weight,x_in,y_in,s,a,dir,o,ball_land_x,ball_land_y,x_out,y_out,pos_Defense,pos_Offense,pos_Defensive Coverage,pos_Targeted Receiver
count,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0
mean,60.395003,72.792733,208.459748,61.64163,26.669856,1.724788,1.760978,131.784221,214.676432,71.593486,26.531144,60.311604,26.604846,0.714785,0.285215,0.714785,0.285215
std,23.094026,2.086369,21.723023,23.249446,10.139412,1.790043,1.478373,87.769936,89.044003,24.785883,16.482464,25.247203,13.428138,0.451517,0.451517,0.451517,0.451517
min,11.0,66.0,153.0,5.23,2.85,0.0,0.0,0.0,0.0,7.8,-4.030002,0.02,0.33,0.0,0.0,0.0,0.0
25%,41.0,71.0,193.0,42.75,18.37,0.31,0.54,74.56,108.52,51.510002,11.36,43.08,14.92,0.0,0.0,0.0,0.0
50%,60.0,73.0,203.0,58.46,26.48,1.13,1.43,104.67,257.67,69.709999,26.17,60.13,26.42,1.0,0.0,1.0,0.0
75%,79.0,74.0,220.0,78.56,35.01,2.6,2.69,170.21,278.44,90.949997,41.84,77.34,38.33,1.0,1.0,1.0,1.0
max,109.0,81.0,358.0,116.33,51.76,9.88,16.75,360.0,360.0,125.849998,57.21,120.83,53.72,1.0,1.0,1.0,1.0


In [39]:
#Partición de la base de datos 

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Guardar los nombres de columnas e índice antes de escalar
cols = data.columns
idx = data.index

# Escalar solo los valores numéricos
data_scaled = scaler.fit_transform(data)

# Reconstruir el DataFrame con nombres e índice originales
data = pd.DataFrame(data_scaled, columns=cols, index=idx)

data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 560426 entries, 0 to 560425
Data columns (total 17 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   absolute_yardline_number  560426 non-null  float64
 1   player_height             560426 non-null  float64
 2   player_weight             560426 non-null  float64
 3   x_in                      560426 non-null  float64
 4   y_in                      560426 non-null  float64
 5   s                         560426 non-null  float64
 6   a                         560426 non-null  float64
 7   dir                       560426 non-null  float64
 8   o                         560426 non-null  float64
 9   ball_land_x               560426 non-null  float64
 10  ball_land_y               560426 non-null  float64
 11  x_out                     560426 non-null  float64
 12  y_out                     560426 non-null  float64
 13  pos_Defense               560426 non-null  f

Unnamed: 0,absolute_yardline_number,player_height,player_weight,x_in,y_in,s,a,dir,o,ball_land_x,ball_land_y,x_out,y_out,pos_Defense,pos_Offense,pos_Defensive Coverage,pos_Targeted Receiver
count,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0,560426.0
mean,2.9617250000000003e-17,1.2171470000000001e-17,-7.302884e-18,1.2171470000000001e-17,3.245726e-17,-1.1360040000000002e-17,-2.190865e-17,-6.085737e-18,2.0285789999999998e-19,2.758867e-17,-1.866293e-17,1.4605770000000002e-17,-2.840011e-17,2.353152e-17,-2.353152e-17,2.353152e-17,-2.353152e-17
std,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001,1.000001
min,-2.138867,-3.25577,-2.553042,-2.426367,-2.349236,-0.9635464,-1.191161,-1.501475,-2.410905,-2.573785,-1.854163,-2.388053,-1.956702,-1.583074,-0.6316822,-1.583074,-0.6316822
25%,-0.8398285,-0.8592604,-0.7116764,-0.8125633,-0.8185744,-0.790366,-0.8258942,-0.6519803,-1.192181,-0.8102799,-0.9204423,-0.682516,-0.870177,-1.583074,-0.6316822,-1.583074,-0.6316822
50%,-0.01710414,0.09934364,-0.2513349,-0.1368477,-0.01872454,-0.3322759,-0.2238804,-0.3089241,0.4828355,-0.07599037,-0.02191082,-0.007193028,-0.01376562,0.6316822,-0.6316822,0.6316822,-0.6316822
75%,0.8056202,0.5786456,0.5312457,0.7276898,0.8225479,0.4889343,0.6284089,0.4378015,0.7160912,0.7809497,0.9287974,0.6744672,0.8731787,0.6316822,1.583074,0.6316822,1.583074
max,2.104659,3.93376,6.883958,2.352246,2.474519,4.55588,10.13887,2.600161,1.632044,2.189011,1.861304,2.397036,2.019281,0.6316822,1.583074,0.6316822,1.583074


In [40]:
from sklearn.model_selection import train_test_split
X = data.drop(columns=['x_out', 'y_out'])
y = data[['x_out', 'y_out']]


#60% entrenamiento 20% validación 20% testeo 

X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42)


# ENTRENAMIENTO DE LOS MODELOS

In [41]:
#FUNCIÓN PARA CÁCULO DE LAS MÉTRICAS 
def metricas_evaluacion(y_true, y_pred):
    """
    Calcula métricas de evaluación para un modelo de regresión:
    MAE, MSE, RMSE, R2 y MAPE (tanto global como por variable).
    """
    # ---- Métricas globales ----
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    mape = np.mean(np.abs((y_true - y_pred) / np.where(y_true == 0, 1e-10, y_true))) * 100

    # ---- Métricas por variable ----
    mae_por_columna = mean_absolute_error(y_true, y_pred, multioutput='raw_values')
    mse_por_columna = mean_squared_error(y_true, y_pred, multioutput='raw_values')
    r2_por_columna = r2_score(y_true, y_pred, multioutput='raw_values')
    mape_por_columna = np.mean(
        np.abs((y_true - y_pred) / np.where(y_true == 0, 1e-10, y_true)),
        axis=0
    ) * 100

    # ---- Impresión de resultados ----
    print("📈 Métricas por variable:")
    for i in range(len(mae_por_columna)):
        print(f"Variable {i+1}: MAE={mae_por_columna[i]:.4f}, "
              f"MSE={mse_por_columna[i]:.4f}, "
              f"R2={r2_por_columna[i]:.4f}, "
              f"MAPE={mape_por_columna.iloc[i]:.2f}%")

    
    print("\n📊 Métricas globales del modelo:")
    print(f"MAE  (Mean Absolute Error):       {mae:.4f}")
    print(f"MSE  (Mean Squared Error):        {mse:.4f}")
    print(f"RMSE (Root Mean Squared Error):   {rmse:.4f}")
    print(f"R²   (Coeficiente de determinación): {r2:.4f}")
    print(f"MAPE (Mean Absolute Percentage Error): {mape:.2f}%")

    # ---- Retorno estructurado ----
    return {
        "MAE_global": mae,
        "MSE_global": mse,
        "RMSE_global": rmse,
        "R2_global": r2,
        "MAPE_global": mape,
        "MAE_por_variable": mae_por_columna,
        "MSE_por_variable": mse_por_columna,
        "R2_por_variable": r2_por_columna,
        "MAPE_por_variable": mape_por_columna
    }


Linear Regression

In [42]:
Linear = LinearRegression(fit_intercept = True, n_jobs= 2)
Linear.fit(X_train, y_train) #Entrenamiento
y_pred = Linear.predict(X_val)

linear_metricas = metricas_evaluacion(y_val,y_pred)


[2025-10-15 04:38:07.337] [CUML] [info] Unused keyword parameter: n_jobs during cuML estimator initialization


  return func(*args, **kwargs)


📈 Métricas por variable:
Variable 1: MAE=0.4826, MSE=0.3446, R2=0.6564, MAPE=407.91%
Variable 2: MAE=0.8761, MSE=1.0016, R2=0.0004, MAPE=102.55%

📊 Métricas globales del modelo:
MAE  (Mean Absolute Error):       0.6794
MSE  (Mean Squared Error):        0.6731
RMSE (Root Mean Squared Error):   0.8204
R²   (Coeficiente de determinación): 0.3284
MAPE (Mean Absolute Percentage Error): 255.23%


In [43]:
print("Coeficientes:", Linear.coef_.shape)
print("Interceptos:", Linear.intercept_)


Coeficientes: (15, 2)
Interceptos: [ 1.14412654e-03 -9.41343160e-05]


Lasso

Optimización bayesiana

In [None]:
#pip install optuna

In [44]:

from sklearn.multioutput import MultiOutputRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import make_scorer
import optuna

In [45]:
from bayes_opt import BayesianOptimization

In [47]:
from sklearn.model_selection import KFold
from skopt import BayesSearchCV
from skopt.space import Real, Integer

laasso=  Lasso(alpha=0.1, max_iter=10000, random_state=42) #alpha = hiperparámetro de penalización de la norma

# Usamos el modelo Lasso de scikit-learn
modelo_base = Lasso(random_state=42, max_iter=10000)
search_space = {'alpha': Real(1e-6, 100.0, prior='log-uniform', name='alpha')}

# --- Configuración del optimizador bayesiano ---
opt = BayesSearchCV(
    estimator=laasso,
    search_spaces=search_space,
    n_iter=30,              # número de iteraciones (puedes aumentar)
    cv=5,                   # validación cruzada de 5 pliegues
    scoring='r2',           # métrica a maximizar
    random_state=42,
    n_jobs=-1               # usa todos los núcleos disponibles
)

# --- Entrenamiento ---
opt.fit(X_train, y_train)


# --- Resultados ---
print("Mejor valor de alpha:", opt.best_params_['alpha'])
print("Mejor R² promedio (CV):", opt.best_score_)

AssertionError: Error in `RandomForestRegressor.__init__`!. Positional arguments for estimators (that derive from `Base`) have been deprecated but parameters 'n_estimators, criterion, max_depth, min_samples_split, min_samples_leaf, min_weight_fraction_leaf, max_features, max_leaf_nodes, min_impurity_decrease, bootstrap, oob_score, n_jobs, random_state, verbose, warm_start, min_variance' can still be used as positional arguments. Please specify all parameters after `self` as keyword only by using the `*` argument

In [2]:
alpha_lasso = opt.best_params_['alpha']
Laso = Lasso(alpha=alpha_lasso, max_iter=10000, random_state=42)
y_pred1 = laasso.predict(X_val)
lasso_metricas = metricas_evaluacion(y_val,y_pred1)

NameError: name 'opt' is not defined

In [48]:
laasso=  Lasso(alpha=0.1, max_iter=10000, random_state=42) #alpha = hiperparámetro de penalización de la norma



laasso.fit(X_train, y_train)
y_pred1 = laasso.predict(X_val)
lasso_metricas = metricas_evaluacion(y_val,y_pred1)

[2025-10-15 04:43:45.145] [CUML] [info] Unused keyword parameter: random_state during cuML estimator initialization


ValueError: Expected 1 columns but got 2 columns.