In [175]:
import mlflow
import mlflow.sklearn
import optuna
import numpy as np
import pandas as pd
import os
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import cross_val_score, TimeSeriesSplit, train_test_split
from sklearn.metrics import root_mean_squared_error

from boruta import BorutaPy

import lightgbm as lgb
import warnings

warnings.filterwarnings("ignore", category=FutureWarning, module="sklearn.utils.deprecation")


In [180]:
ABT_path = os.path.join('..', 'data', 'ABTs', 'principal_ABT.csv')
ABT = pd.read_csv(ABT_path, index_col= 0)
ABT.head()

ABT = ABT.astype(float)

In [181]:
X = ABT.drop(columns= ['target'])
y = ABT['target']

print(X.shape, y.shape)

(2738, 24) (2738,)


In [183]:
#Separemos un conjunto de test para la los resultados finales
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.1, shuffle= False)

#Unas verificaciones para garantizar la calidad de los datos
print(f'Tamano train: {X_train.shape, y_train.shape}')
print(f'Tamano test: {X_test.shape, y_test.shape}')
print(f'Fecha inicio train {X_train.index.min()} ' , f'Fecha final train {X_train.index.max()} ')
print(f'Fecha inicio test {X_test.index.min()} ' , f'Fecha final test {X_test.index.max()} ')

Tamano train: ((2464, 24), (2464,))
Tamano test: ((274, 24), (274,))
Fecha inicio train 2014-04-17  Fecha final train 2024-01-31 
Fecha inicio test 2024-02-01  Fecha final test 2025-03-06 


In [166]:
def objective_lgb(trial, X, y):
    # Definir hiperparámetros de LightGBM a optimizar, con rangos pensados para reducir sobreajuste
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 25, 30),
        'learning_rate': trial.suggest_float('learning_rate', 1e-8, 1e-5, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 10, 20),
        'max_depth': trial.suggest_int('max_depth', 3, 5),
        'min_child_samples': trial.suggest_int('min_child_samples', 10, 50),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.9, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.9, 1.0),
        'random_state': 42,
        'verbosity': -1,
    }
    
    # Utilizamos TimeSeriesSplit con 3 splits
    tscv = TimeSeriesSplit(n_splits=3)
    
    train_rmse_list = []
    val_rmse_list = []
    
    # Para cada fold, aplicamos Boruta y entrenamos el modelo LightGBM en las features seleccionadas
    for train_index, val_index in tscv.split(X):
        X_train_fold = X.iloc[train_index]
        y_train_fold = y.iloc[train_index]
        X_val_fold = X.iloc[val_index]
        y_val_fold = y.iloc[val_index]
        
        # Instanciar un estimador base para Boruta
        base_estimator = lgb.LGBMRegressor(**param, eval_set= [(X_val_fold, y_val_fold)], eval_metric='rmse')
        boruta_selector = BorutaPy(estimator=base_estimator, n_estimators='auto', verbose=0, random_state=42)
        
        # BorutaPy requiere arrays numpy
        boruta_selector.fit(X_train_fold.values, y_train_fold.values)
        
        # Transformar los conjuntos usando la máscara de features seleccionadas
        X_train_sel = boruta_selector.transform(X_train_fold.values)
        X_val_sel = boruta_selector.transform(X_val_fold.values)
        
        # Si Boruta selecciona menos de 1 feature, usamos todas (para evitar errores)
        if X_train_sel.shape[1] < 1:
            X_train_sel = X_train_fold.values
            X_val_sel = X_val_fold.values
        
        # Entrenar el modelo final en el fold con las features seleccionadas
        final_model = lgb.LGBMRegressor(**param)
        final_model.fit(X_train_sel, y_train_fold.values)
        
        # Calcular métricas de entrenamiento y validación
        y_train_pred = final_model.predict(X_train_sel)
        y_val_pred = final_model.predict(X_val_sel)
        
        rmse_train = root_mean_squared_error(y_train_fold, y_train_pred)
        rmse_val = root_mean_squared_error(y_val_fold, y_val_pred)
        
        train_rmse_list.append(rmse_train)
        val_rmse_list.append(rmse_val)
    
    # Promediar las métricas en los folds
    avg_train_rmse = np.mean(train_rmse_list)
    avg_val_rmse = np.mean(val_rmse_list)
    
    # Imprimir las métricas para seguimiento
    print(f"Avg Train RMSE: {avg_train_rmse:.4f} | Avg Validation RMSE: {avg_val_rmse:.4f}")
    
    # Registrar el experimento en MLflow (cada trial se registra como un run anidado)
    with mlflow.start_run(nested=True):
        mlflow.log_params(param)
        mlflow.log_metric("avg_train_rmse", avg_train_rmse)
        mlflow.log_metric("avg_val_rmse", avg_val_rmse)
        # No se loguea el modelo aquí ya que se entrena en cada fold; 
        # el objetivo es optimizar la métrica de validación.
    
    # Optuna busca minimizar el error de validación
    return avg_val_rmse

In [167]:
study = optuna.create_study(direction="minimize", sampler= optuna.samplers.TPESampler(seed= 42))
study.optimize(lambda trial: objective_lgb(trial, X_train, y_train), n_trials= 50)

[I 2025-03-12 23:00:16,788] A new study created in memory with name: no-name-010db96b-5da4-49fb-9d71-c27a4e72bd3b
[I 2025-03-12 23:00:34,048] Trial 0 finished with value: 0.8540299501209924 and parameters: {'n_estimators': 27, 'learning_rate': 7.114476009343412e-06, 'num_leaves': 18, 'max_depth': 4, 'min_child_samples': 16, 'subsample': 0.5779972601681014, 'colsample_bytree': 0.5290418060840998, 'reg_alpha': 0.9866176145774935, 'reg_lambda': 0.9601115011743209}. Best is trial 0 with value: 0.8540299501209924.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:00:45,447] Trial 1 finished with value: 0.8541128962786567 and parameters: {'n_estimators': 29, 'learning_rate': 1.1527987128232397e-08, 'num_leaves': 20, 'max_depth': 5, 'min_child_samples': 18, 'subsample': 0.5909124836035503, 'colsample_bytree': 0.5917022549267169, 'reg_alpha': 0.9304242242959538, 'reg_lambda': 0.9524756431632238}. Best is trial 0 with value: 0.8540299501209924.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:00:50,103] Trial 2 finished with value: 0.8541121302256922 and parameters: {'n_estimators': 27, 'learning_rate': 7.476312062252278e-08, 'num_leaves': 16, 'max_depth': 3, 'min_child_samples': 21, 'subsample': 0.6831809216468459, 'colsample_bytree': 0.728034992108518, 'reg_alpha': 0.9785175961393013, 'reg_lambda': 0.919967378215836}. Best is trial 0 with value: 0.8540299501209924.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:00:53,999] Trial 3 finished with value: 0.8541054830686298 and parameters: {'n_estimators': 28, 'learning_rate': 5.987474910461391e-07, 'num_leaves': 10, 'max_depth': 4, 'min_child_samples': 16, 'subsample': 0.5325257964926398, 'colsample_bytree': 0.9744427686266666, 'reg_alpha': 0.996563203307456, 'reg_lambda': 0.9808397348116461}. Best is trial 0 with value: 0.8540299501209924.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:01:10,897] Trial 4 finished with value: 0.8541128180326428 and parameters: {'n_estimators': 26, 'learning_rate': 1.963434157293331e-08, 'num_leaves': 17, 'max_depth': 4, 'min_child_samples': 15, 'subsample': 0.7475884550556351, 'colsample_bytree': 0.5171942605576092, 'reg_alpha': 0.9909320402078782, 'reg_lambda': 0.9258779981600017}. Best is trial 0 with value: 0.8540299501209924.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:01:23,675] Trial 5 finished with value: 0.8541120013681658 and parameters: {'n_estimators': 28, 'learning_rate': 8.612579192594878e-08, 'num_leaves': 15, 'max_depth': 4, 'min_child_samples': 17, 'subsample': 0.9847923138822793, 'colsample_bytree': 0.8875664116805573, 'reg_alpha': 0.9939498941564189, 'reg_lambda': 0.9894827350427648}. Best is trial 0 with value: 0.8540299501209924.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:01:35,216] Trial 6 finished with value: 0.854037384351373 and parameters: {'n_estimators': 28, 'learning_rate': 5.829384542994731e-06, 'num_leaves': 10, 'max_depth': 3, 'min_child_samples': 11, 'subsample': 0.6626651653816322, 'colsample_bytree': 0.6943386448447411, 'reg_alpha': 0.9271349031773896, 'reg_lambda': 0.9828737509151929}. Best is trial 0 with value: 0.8540299501209924.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:01:38,161] Trial 7 finished with value: 0.8541121968416152 and parameters: {'n_estimators': 27, 'learning_rate': 6.963114377829277e-08, 'num_leaves': 15, 'max_depth': 3, 'min_child_samples': 42, 'subsample': 0.5372753218398854, 'colsample_bytree': 0.9934434683002586, 'reg_alpha': 0.9772244769296657, 'reg_lambda': 0.9198715681534173}. Best is trial 0 with value: 0.8540299501209924.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:01:52,013] Trial 8 finished with value: 0.8540828317383925 and parameters: {'n_estimators': 25, 'learning_rate': 2.795015916508332e-06, 'num_leaves': 17, 'max_depth': 5, 'min_child_samples': 41, 'subsample': 0.5370223258670452, 'colsample_bytree': 0.6792328642721364, 'reg_alpha': 0.911586905952513, 'reg_lambda': 0.9863103425875593}. Best is trial 0 with value: 0.8540299501209924.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:02:07,220] Trial 9 finished with value: 0.8541118015824413 and parameters: {'n_estimators': 28, 'learning_rate': 9.83318193364489e-08, 'num_leaves': 10, 'max_depth': 3, 'min_child_samples': 23, 'subsample': 0.864803089169032, 'colsample_bytree': 0.8187787356776066, 'reg_alpha': 0.9887212742576327, 'reg_lambda': 0.9472214925161949}. Best is trial 0 with value: 0.8540299501209924.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:02:27,636] Trial 10 finished with value: 0.8540982464774242 and parameters: {'n_estimators': 30, 'learning_rate': 1.1295831475598717e-06, 'num_leaves': 20, 'max_depth': 5, 'min_child_samples': 31, 'subsample': 0.8451235367845726, 'colsample_bytree': 0.5075721185513784, 'reg_alpha': 0.9623376518629465, 'reg_lambda': 0.9570547864881683}. Best is trial 0 with value: 0.8540299501209924.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:02:39,415] Trial 11 finished with value: 0.853998373363062 and parameters: {'n_estimators': 26, 'learning_rate': 9.48596696206155e-06, 'num_leaves': 13, 'max_depth': 3, 'min_child_samples': 11, 'subsample': 0.6446679138197002, 'colsample_bytree': 0.6424665890870148, 'reg_alpha': 0.9427112305633332, 'reg_lambda': 0.9720346261592522}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:03:02,864] Trial 12 finished with value: 0.8540228966973121 and parameters: {'n_estimators': 25, 'learning_rate': 8.268327119558118e-06, 'num_leaves': 13, 'max_depth': 4, 'min_child_samples': 29, 'subsample': 0.6372424866412405, 'colsample_bytree': 0.623712088801301, 'reg_alpha': 0.9477154944241006, 'reg_lambda': 0.9656116398580863}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:03:12,688] Trial 13 finished with value: 0.8540934336481923 and parameters: {'n_estimators': 25, 'learning_rate': 1.7834642619505332e-06, 'num_leaves': 13, 'max_depth': 3, 'min_child_samples': 31, 'subsample': 0.6768177452233212, 'colsample_bytree': 0.62077746649017, 'reg_alpha': 0.9497085393610852, 'reg_lambda': 0.9685504608128934}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:03:21,148] Trial 14 finished with value: 0.854026767673155 and parameters: {'n_estimators': 26, 'learning_rate': 7.870708682262812e-06, 'num_leaves': 12, 'max_depth': 4, 'min_child_samples': 50, 'subsample': 0.7655620151757302, 'colsample_bytree': 0.6388986074458943, 'reg_alpha': 0.9483838487541159, 'reg_lambda': 0.9383873396049074}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:03:26,534] Trial 15 finished with value: 0.8541084171789025 and parameters: {'n_estimators': 26, 'learning_rate': 3.930754316088763e-07, 'num_leaves': 13, 'max_depth': 3, 'min_child_samples': 26, 'subsample': 0.6273430605696253, 'colsample_bytree': 0.780927629058066, 'reg_alpha': 0.9306244257299805, 'reg_lambda': 0.9977266465595078}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:03:47,127] Trial 16 finished with value: 0.8540803185401374 and parameters: {'n_estimators': 25, 'learning_rate': 3.0434192157839695e-06, 'num_leaves': 12, 'max_depth': 5, 'min_child_samples': 37, 'subsample': 0.7464345575530974, 'colsample_bytree': 0.5848724076879059, 'reg_alpha': 0.960595207297969, 'reg_lambda': 0.9013179214380924}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:03:59,604] Trial 17 finished with value: 0.8539987915579883 and parameters: {'n_estimators': 26, 'learning_rate': 9.502939786216104e-06, 'num_leaves': 14, 'max_depth': 4, 'min_child_samples': 11, 'subsample': 0.8214237152952777, 'colsample_bytree': 0.6601698676289585, 'reg_alpha': 0.9036304813232119, 'reg_lambda': 0.9716969732202975}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:04:21,138] Trial 18 finished with value: 0.8541010101417829 and parameters: {'n_estimators': 26, 'learning_rate': 1.0078069494363213e-06, 'num_leaves': 14, 'max_depth': 4, 'min_child_samples': 10, 'subsample': 0.873069154903134, 'colsample_bytree': 0.8264423346529436, 'reg_alpha': 0.9061689197153427, 'reg_lambda': 0.9753330171531137}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:04:30,340] Trial 19 finished with value: 0.854071281352498 and parameters: {'n_estimators': 26, 'learning_rate': 3.4469470984197844e-06, 'num_leaves': 12, 'max_depth': 3, 'min_child_samples': 12, 'subsample': 0.9578793112692217, 'colsample_bytree': 0.7396765886951213, 'reg_alpha': 0.9179409570043513, 'reg_lambda': 0.9724444164039545}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:04:42,907] Trial 20 finished with value: 0.8541101567188609 and parameters: {'n_estimators': 27, 'learning_rate': 2.4426041717654863e-07, 'num_leaves': 14, 'max_depth': 5, 'min_child_samples': 20, 'subsample': 0.810520979183597, 'colsample_bytree': 0.6684438589168324, 'reg_alpha': 0.9011620961627553, 'reg_lambda': 0.9993842785049709}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:04:57,917] Trial 21 finished with value: 0.8540248062450019 and parameters: {'n_estimators': 25, 'learning_rate': 8.063911124823062e-06, 'num_leaves': 13, 'max_depth': 4, 'min_child_samples': 27, 'subsample': 0.7158658160830185, 'colsample_bytree': 0.5774771233568973, 'reg_alpha': 0.9398650864670016, 'reg_lambda': 0.9646853965364719}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:05:20,308] Trial 22 finished with value: 0.854063614226304 and parameters: {'n_estimators': 25, 'learning_rate': 4.36897480352893e-06, 'num_leaves': 14, 'max_depth': 4, 'min_child_samples': 36, 'subsample': 0.6217410544077034, 'colsample_bytree': 0.638214636112402, 'reg_alpha': 0.9621290403683519, 'reg_lambda': 0.9681081369249528}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:05:46,467] Trial 23 finished with value: 0.8540093964570031 and parameters: {'n_estimators': 26, 'learning_rate': 8.87779949986047e-06, 'num_leaves': 11, 'max_depth': 4, 'min_child_samples': 13, 'subsample': 0.808044516839866, 'colsample_bytree': 0.7072958782974688, 'reg_alpha': 0.9156841177074695, 'reg_lambda': 0.9440915436973509}. Best is trial 11 with value: 0.853998373363062.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:06:12,939] Trial 24 finished with value: 0.8539967713363718 and parameters: {'n_estimators': 26, 'learning_rate': 9.959567566960228e-06, 'num_leaves': 11, 'max_depth': 4, 'min_child_samples': 13, 'subsample': 0.8087899997380243, 'colsample_bytree': 0.7075378225748107, 'reg_alpha': 0.9179794126492945, 'reg_lambda': 0.94174963331253}. Best is trial 24 with value: 0.8539967713363718.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:06:22,785] Trial 25 finished with value: 0.8540886427922363 and parameters: {'n_estimators': 27, 'learning_rate': 1.9693277174858437e-06, 'num_leaves': 11, 'max_depth': 3, 'min_child_samples': 14, 'subsample': 0.9145942595193255, 'colsample_bytree': 0.7728444381343298, 'reg_alpha': 0.9265686236998097, 'reg_lambda': 0.9326416302609251}. Best is trial 24 with value: 0.8539967713363718.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:06:28,663] Trial 26 finished with value: 0.8540616615947164 and parameters: {'n_estimators': 26, 'learning_rate': 4.6469487438846495e-06, 'num_leaves': 12, 'max_depth': 5, 'min_child_samples': 23, 'subsample': 0.7947316440163442, 'colsample_bytree': 0.5632613446856178, 'reg_alpha': 0.9193520715522847, 'reg_lambda': 0.9532607165232241}. Best is trial 24 with value: 0.8539967713363718.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:06:47,261] Trial 27 finished with value: 0.8540923058236146 and parameters: {'n_estimators': 26, 'learning_rate': 1.7257826402336034e-06, 'num_leaves': 11, 'max_depth': 4, 'min_child_samples': 10, 'subsample': 0.919072366796272, 'colsample_bytree': 0.6665321339891265, 'reg_alpha': 0.9001558318227407, 'reg_lambda': 0.9384856213078567}. Best is trial 24 with value: 0.8539967713363718.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:07:04,313] Trial 28 finished with value: 0.854099771105223 and parameters: {'n_estimators': 27, 'learning_rate': 1.0856347167858428e-06, 'num_leaves': 16, 'max_depth': 3, 'min_child_samples': 19, 'subsample': 0.710690940966355, 'colsample_bytree': 0.7667406804487104, 'reg_alpha': 0.9400247786229275, 'reg_lambda': 0.9776380259322753}. Best is trial 24 with value: 0.8539967713363718.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:07:17,817] Trial 29 finished with value: 0.854049437947909 and parameters: {'n_estimators': 27, 'learning_rate': 5.179924784204019e-06, 'num_leaves': 14, 'max_depth': 4, 'min_child_samples': 14, 'subsample': 0.8285589199362634, 'colsample_bytree': 0.8266823056276795, 'reg_alpha': 0.9100055250650881, 'reg_lambda': 0.9600241164751386}. Best is trial 24 with value: 0.8539967713363718.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:07:32,022] Trial 30 finished with value: 0.8539958975208366 and parameters: {'n_estimators': 26, 'learning_rate': 9.6854741858785e-06, 'num_leaves': 11, 'max_depth': 3, 'min_child_samples': 10, 'subsample': 0.7757969736292901, 'colsample_bytree': 0.7068795194347864, 'reg_alpha': 0.937824840212183, 'reg_lambda': 0.9925732691225597}. Best is trial 30 with value: 0.8539958975208366.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:07:48,389] Trial 31 finished with value: 0.8539987611769702 and parameters: {'n_estimators': 26, 'learning_rate': 9.437615592205908e-06, 'num_leaves': 11, 'max_depth': 3, 'min_child_samples': 12, 'subsample': 0.7668287903074249, 'colsample_bytree': 0.7154287166657751, 'reg_alpha': 0.9340216931040392, 'reg_lambda': 0.993192121339292}. Best is trial 30 with value: 0.8539958975208366.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:08:06,735] Trial 32 finished with value: 0.8540538994962589 and parameters: {'n_estimators': 26, 'learning_rate': 4.948871439921811e-06, 'num_leaves': 11, 'max_depth': 3, 'min_child_samples': 17, 'subsample': 0.7733198997181396, 'colsample_bytree': 0.714654215237974, 'reg_alpha': 0.9385615510168296, 'reg_lambda': 0.9941835605400718}. Best is trial 30 with value: 0.8539958975208366.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:08:17,315] Trial 33 finished with value: 0.8540398054479708 and parameters: {'n_estimators': 27, 'learning_rate': 5.914743449497574e-06, 'num_leaves': 11, 'max_depth': 3, 'min_child_samples': 13, 'subsample': 0.7243513604984584, 'colsample_bytree': 0.7469788395904894, 'reg_alpha': 0.9345873993319739, 'reg_lambda': 0.9897148777912507}. Best is trial 30 with value: 0.8539958975208366.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:08:35,810] Trial 34 finished with value: 0.854077092426372 and parameters: {'n_estimators': 29, 'learning_rate': 2.709357427615371e-06, 'num_leaves': 10, 'max_depth': 3, 'min_child_samples': 16, 'subsample': 0.7845194789596304, 'colsample_bytree': 0.7040393883066481, 'reg_alpha': 0.9232650216309382, 'reg_lambda': 0.9927988525524077}. Best is trial 30 with value: 0.8539958975208366.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:08:44,831] Trial 35 finished with value: 0.8540732174437146 and parameters: {'n_estimators': 25, 'learning_rate': 3.485551782780602e-06, 'num_leaves': 12, 'max_depth': 3, 'min_child_samples': 22, 'subsample': 0.5691986641997921, 'colsample_bytree': 0.5433680700613227, 'reg_alpha': 0.9547502538983011, 'reg_lambda': 0.9942028102699215}. Best is trial 30 with value: 0.8539958975208366.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:09:01,391] Trial 36 finished with value: 0.8541127625195214 and parameters: {'n_estimators': 27, 'learning_rate': 2.206341681481931e-08, 'num_leaves': 11, 'max_depth': 3, 'min_child_samples': 10, 'subsample': 0.692998701288098, 'colsample_bytree': 0.8009144282529858, 'reg_alpha': 0.9434361871925012, 'reg_lambda': 0.9812342861027725}. Best is trial 30 with value: 0.8539958975208366.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:09:08,196] Trial 37 finished with value: 0.8539953143131754 and parameters: {'n_estimators': 26, 'learning_rate': 9.784404493530382e-06, 'num_leaves': 10, 'max_depth': 3, 'min_child_samples': 18, 'subsample': 0.7424854810677134, 'colsample_bytree': 0.8534789074290605, 'reg_alpha': 0.9319706282747817, 'reg_lambda': 0.9847793339597594}. Best is trial 37 with value: 0.8539953143131754.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:09:14,524] Trial 38 finished with value: 0.8540304245002553 and parameters: {'n_estimators': 29, 'learning_rate': 6.178852766063578e-06, 'num_leaves': 19, 'max_depth': 3, 'min_child_samples': 18, 'subsample': 0.6533641813057806, 'colsample_bytree': 0.87724768159598, 'reg_alpha': 0.9234222426552057, 'reg_lambda': 0.9860814333549764}. Best is trial 37 with value: 0.8539953143131754.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:09:20,724] Trial 39 finished with value: 0.8541058703712103 and parameters: {'n_estimators': 26, 'learning_rate': 6.110390040833219e-07, 'num_leaves': 10, 'max_depth': 3, 'min_child_samples': 15, 'subsample': 0.5905729205695264, 'colsample_bytree': 0.9358886958536207, 'reg_alpha': 0.9559896223052045, 'reg_lambda': 0.9223861466649126}. Best is trial 37 with value: 0.8539953143131754.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:09:27,824] Trial 40 finished with value: 0.8541105886958639 and parameters: {'n_estimators': 27, 'learning_rate': 1.9571340937055968e-07, 'num_leaves': 10, 'max_depth': 3, 'min_child_samples': 20, 'subsample': 0.7416764950741466, 'colsample_bytree': 0.8570986384736295, 'reg_alpha': 0.9675137684120998, 'reg_lambda': 0.984788750344846}. Best is trial 37 with value: 0.8539953143131754.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:09:38,657] Trial 41 finished with value: 0.8539932070634068 and parameters: {'n_estimators': 26, 'learning_rate': 9.89517587604896e-06, 'num_leaves': 10, 'max_depth': 3, 'min_child_samples': 12, 'subsample': 0.761636885611244, 'colsample_bytree': 0.7280669024814017, 'reg_alpha': 0.9334249209243208, 'reg_lambda': 0.9788958090626914}. Best is trial 41 with value: 0.8539932070634068.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:09:50,932] Trial 42 finished with value: 0.8540401398087555 and parameters: {'n_estimators': 26, 'learning_rate': 6.099048806600267e-06, 'num_leaves': 10, 'max_depth': 3, 'min_child_samples': 16, 'subsample': 0.8496816075439898, 'colsample_bytree': 0.731915892954884, 'reg_alpha': 0.9321389484627511, 'reg_lambda': 0.9798256325745593}. Best is trial 41 with value: 0.8539932070634068.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:09:56,493] Trial 43 finished with value: 0.8540019882437657 and parameters: {'n_estimators': 25, 'learning_rate': 9.843805468452728e-06, 'num_leaves': 10, 'max_depth': 3, 'min_child_samples': 14, 'subsample': 0.8875257322176258, 'colsample_bytree': 0.9353499010526168, 'reg_alpha': 0.9430980849423967, 'reg_lambda': 0.9764555779163848}. Best is trial 41 with value: 0.8539932070634068.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:10:02,244] Trial 44 finished with value: 0.854065062174978 and parameters: {'n_estimators': 26, 'learning_rate': 3.9602837021885415e-06, 'num_leaves': 12, 'max_depth': 3, 'min_child_samples': 12, 'subsample': 0.696334211550028, 'colsample_bytree': 0.7550870772599837, 'reg_alpha': 0.9272057732996193, 'reg_lambda': 0.9508171408892675}. Best is trial 41 with value: 0.8539932070634068.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:10:26,410] Trial 45 finished with value: 0.8540270542208086 and parameters: {'n_estimators': 30, 'learning_rate': 6.268879281805829e-06, 'num_leaves': 11, 'max_depth': 3, 'min_child_samples': 17, 'subsample': 0.7339643289218374, 'colsample_bytree': 0.6889609231427354, 'reg_alpha': 0.9369443908260432, 'reg_lambda': 0.9879658435516105}. Best is trial 41 with value: 0.8539932070634068.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


[I 2025-03-12 23:10:31,003] Trial 46 finished with value: 0.8540868224657355 and parameters: {'n_estimators': 28, 'learning_rate': 2.0468376675928117e-06, 'num_leaves': 10, 'max_depth': 3, 'min_child_samples': 10, 'subsample': 0.6683577341700875, 'colsample_bytree': 0.6062007504955429, 'reg_alpha': 0.9436078566670193, 'reg_lambda': 0.9556794568166225}. Best is trial 41 with value: 0.8539932070634068.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:10:38,767] Trial 47 finished with value: 0.8541125341448033 and parameters: {'n_estimators': 25, 'learning_rate': 4.490611624887259e-08, 'num_leaves': 13, 'max_depth': 3, 'min_child_samples': 15, 'subsample': 0.7885639964898657, 'colsample_bytree': 0.646185763659324, 'reg_alpha': 0.9128210172448359, 'reg_lambda': 0.9455676761852684}. Best is trial 41 with value: 0.8539932070634068.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:10:47,185] Trial 48 finished with value: 0.8540823680597492 and parameters: {'n_estimators': 27, 'learning_rate': 2.5402937505487923e-06, 'num_leaves': 10, 'max_depth': 3, 'min_child_samples': 25, 'subsample': 0.758307986801766, 'colsample_bytree': 0.8012516648698133, 'reg_alpha': 0.9248510535726246, 'reg_lambda': 0.9113062829253293}. Best is trial 41 with value: 0.8539932070634068.


Avg Train RMSE: 0.5011 | Avg Validation RMSE: 0.8541


[I 2025-03-12 23:11:07,856] Trial 49 finished with value: 0.8540295333344282 and parameters: {'n_estimators': 26, 'learning_rate': 7.054763710151258e-06, 'num_leaves': 12, 'max_depth': 4, 'min_child_samples': 18, 'subsample': 0.61544324995944, 'colsample_bytree': 0.6845807675385147, 'reg_alpha': 0.9534261738365837, 'reg_lambda': 0.9320707152444673}. Best is trial 41 with value: 0.8539932070634068.


Avg Train RMSE: 0.5010 | Avg Validation RMSE: 0.8540


In [168]:
best_params =  {'n_estimators': 26, 'learning_rate': 9.89517587604896e-06, 'num_leaves': 10, 'max_depth': 3, 'min_child_samples': 12, 'subsample': 0.761636885611244, 'colsample_bytree': 0.7280669024814017, 'reg_alpha': 0.9334249209243208, 'reg_lambda': 0.9788958090626914}

base_estimator = lgb.LGBMRegressor(**best_params)
boruta_selector = BorutaPy(estimator=base_estimator, n_estimators='auto', verbose=0, random_state=42)

boruta_selector.fit(X_train.values, y_train.values)
X_train_selected = boruta_selector.transform(X_train.values)
X_test_selected = boruta_selector.transform(X_test.values)

best_model = lgb.LGBMRegressor(**best_params)
best_model.fit(X_train_selected, y_train)

y_pred_train = best_model.predict(X_train_selected)
train_rmse = root_mean_squared_error(np.expm1(y_train), np.expm1(y_pred_train))
print("Train RMSE:", train_rmse)

y_pred = best_model.predict(X_test_selected)
test_rmse = root_mean_squared_error(np.expm1(y_test), np.expm1(y_pred))
print("Test RMSE:", test_rmse)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000643 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11256
[LightGBM] [Info] Number of data points in the train set: 1172, number of used features: 48
[LightGBM] [Info] Start training from score 1.693565
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000290 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11256
[LightGBM] [Info] Number of data points in the train set: 1172, number of used features: 48
[LightGBM] [Info] Start training from score 1.693565
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000268 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 11256
[LightGBM] [Info] Number of data points in the train set: 1172, number of used features: 48
[LightGBM] [Info] Start tra