All previous training was performed on the box-cox transformation of the target variable `UltimateIncurredClaimCost`. This trial will predict on the log instead.

In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import (
    FunctionTransformer,
    PowerTransformer,
    MinMaxScaler,
    OneHotEncoder,
    Binarizer,
    StandardScaler
)


from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.decomposition import PCA


import mlflow
import mlflow.sklearn

from matplotlib import pyplot as plt
import seaborn as sns
import optuna

In [2]:
dataset_train = pd.read_csv('../data/processed/dataset_train.csv', index_col=0)
dataset_test = pd.read_csv('../data/processed/dataset_test.csv', index_col=0)

In [3]:
# column names
cols_event_embeddings = [f'event_feat_{i}' for i in range(768)]
cols_event_dummies = ['BACK STRAIN', 'CLEANING INSTRUMENTS', 'CLEANING LEFT SHOULDER SPLINTER', 'CUT WITH KNIFE', 'DEALING CARDS RIGHT TENDON SYNOVITIS', 'FELL', 'FELL FROM LADDER', 'FELL OFF LADDER', 'FRACTURED LEFT WRIST FRACTURE', 'GRINDING FOREIGN BODY', 'GRINDING STEEL BEAM INFECTION', 'HIT AIR HOSE', 'HIT ELBOW', 'HIT FLOOR SHEET', 'INJURY', 'INJURY LIFTING', 'INJURY LIFTING STRAIN', 'JAMMED RIGHT HAND PUNCTURE', 'KNIFE LACERATION', 'KNIFE SLIPPED', 'LACERATION', 'LEFT LEG SNAKE BITE', 'LIFTING', 'LIFTING BOXES', 'LIFTING PATIENT', 'LIFTING TYRES', 'LOWER BACK STRAIN', 'MOTOR VEHICLE COLLISION', 'PREPARING PIPES FOR PAINTING PUNCTURE', 'PULLING CABLES', 'REDBACK SPIDER BITE', 'SLIPPED AND FELL', 'SLIPPED ON FLOOR', 'SLIPPED ON ROLLER', 'SLIPPED ON STAIRS', 'SLIPPED ON WET FLOOR', 'SLIPPED USING LATHE', 'SORTING ALUMINIUM BARS STRAIN', 'SPRAINED RIGHT ANKLE FRACTURE', 'STRAIN', 'STRAIN LIFTING PARTS', 'STRAIN LIFTING STRAIN', 'STRUCK KNIFE LACERATED', 'STRUCK KNIFE LACERATION', 'STRUCK PALLET', 'STRUCK WITH KNIFE', 'USING AIR HOSE STRAIN', 'USING DRILL']
cols_body_parts_embeddings = [f'body_feat_{i}' for i in range(768)]

In [4]:
def train_cv(model, X, y, metric = 'rmse', kfold = 5):

    if metric != 'rmse':
        raise NotImplemented

    scores = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')

    mse_scores = -scores
    rmse_scores = np.sqrt(mse_scores)

    return rmse_scores.mean(), rmse_scores.std()

In [5]:
log_transformer = Pipeline([
    ('log', FunctionTransformer(np.log, validate=False)),
    ('scaler', StandardScaler()) 
])

# gender to bool 
def gender_to_bool(gender_column):
    """
    - Fill missing values with 'M'
    - Replace 'U' with 'M'
    - Return a boolean-ish column: 1 for 'M', 0 otherwise
    """
    g = pd.Series(gender_column.squeeze(), dtype=str).fillna('M').replace('U', 'M')
    is_male = (g == 'M').astype(int)
    return is_male.values.reshape(-1, 1)

gender_transformer = FunctionTransformer(gender_to_bool, validate=False)

# hours worked per week -> to buckets -> to one_hot
def bucket_hours_worked(dtt_array):
    return pd.cut(
        dtt_array.squeeze(), 
        bins=[-np.inf, 37, 41, np.inf],
        labels=["<=37", "37-41", ">41"]
    ).astype(str).values.reshape(-1, 1)

hours_worked_bucketer = FunctionTransformer(bucket_hours_worked, validate=False)
hours_worked_encoder = OneHotEncoder(drop='first')
hours_worked_pipeline = Pipeline([
    ('bucketizer', hours_worked_bucketer),
    ('encoder', hours_worked_encoder)
])

# DaysToReport (DateReported - DateTimeOfAccident) -> to buckets -> to one_hot
def bucket_days_to_report(dtt_array):
    return pd.cut(
        dtt_array.squeeze(), 
        bins=[-np.inf, 80, 300, 500, np.inf],
        labels=["<=80", "80-300", "300-500", ">500"]
    ).astype(str).values.reshape(-1, 1)

days_to_report_bucketer = FunctionTransformer(bucket_days_to_report, validate=False)
days_to_report_encoder = OneHotEncoder(drop='first')
days_to_report_pipeline = Pipeline([
    ('bucketizer', days_to_report_bucketer),
    ('encoder', days_to_report_encoder)
])

# DaysWorkedPerWeek -> 1 if equals 5, 0 in any other case 
def days_worked_binarize(days_array):
    # Ensure we handle arrays or DataFrames by squeezing to 1D
    days = days_array.squeeze()
    binarized = (days == 5).astype(int)
    # Return as 2D array: (n_samples x 1)
    return binarized.values.reshape(-1, 1) if isinstance(days, pd.Series) else binarized.reshape(-1, 1)

days_worked_transformer = FunctionTransformer(days_worked_binarize, validate=False)

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('log_inc', log_transformer, ['InitialIncurredCalimsCost', 'WeeklyWages']),
        ('minmax_scaler', MinMaxScaler(), ['Age', 'YearAccident']),
        ('gender_bool', gender_transformer, ['Gender']),
        ('hww_bool_onehot', hours_worked_pipeline, ['HoursWorkedPerWeek']),
        ('dtt_bool_onehot', days_to_report_pipeline, ['DaysToReport']),
        ('has_dependent_bool', Binarizer(threshold=0), ['DependentChildren']),
        ('worked_five_days_bool', days_worked_transformer, ['DaysWorkedPerWeek']),
        ('onehot', OneHotEncoder(drop='first'), ['MaritalStatus', 'PartTimeFullTime']),
        ('pca_event_embeddings', PCA(n_components=250), cols_event_embeddings),
        ('passthrough_event_dummies', 'passthrough', cols_event_dummies),
        ('pca_body_parts_embeddings', PCA(n_components=80), cols_body_parts_embeddings),
        ('std_scaler', StandardScaler(), ['event_median_cost'])
    ],
    remainder='drop'
)

In [7]:
regressor_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_model', LGBMRegressor(force_col_wise=True))
])

model = TransformedTargetRegressor(
    regressor=regressor_pipeline,
    transformer=FunctionTransformer(np.log1p, inverse_func=np.expm1, validate=False) # here's the change
)

In [8]:

scores = cross_val_score(
    model, 
    dataset_train, 
    dataset_train['UltimateIncurredClaimCost'], 
    cv=5, 
    scoring='neg_mean_squared_error'
)
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)

print(f"CV MSE:  {mse_scores.mean():.3f}  (+/- {mse_scores.std():.3f})")
print(f"CV RMSE: {rmse_scores.mean():.3f}  (+/- {rmse_scores.std():.3f})")

[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 8.064355




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 8.067515




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 8.063679




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 8.066131




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 8.062133
CV MSE:  569802735.060  (+/- 51080152.463)
CV RMSE: 23844.840  (+/- 1107.398)




In [9]:
X_train = dataset_train.drop(columns=['UltimateIncurredClaimCost'])
y_train = dataset_train['UltimateIncurredClaimCost']

def run_experiment(
        experiment_name, 
        run_name, 
        regressor_object, 
        kfold=5, 
        save_model=True
    ):
    """
    Runs an ML experiment with cross-validation and logs metrics to MLflow,
    tracking overall and segmented performance for targets above and below 100,000.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.
    - run_name (str): Name of the MLflow run.
    - regressor_object: The regression model.
    - kfold (int): Number of folds for cross-validation.
    - save_model (bool): Whether to save the trained model in MLflow.

    Logs:
    - Overall CV MSE, RMSE, and MAE
    - MSE, RMSE, and MAE for y_true > 100,000
    - MSE, RMSE, and MAE for y_true ≤ 100,000
    """

    # Define pipeline with preprocessing and model
    regressor_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('linear_model', regressor_object)
    ])

    model = TransformedTargetRegressor(
        regressor=regressor_pipeline,
        transformer=PowerTransformer(method='box-cox', standardize=False)
    )

    # Set MLflow experiment
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name=run_name):
        # Log hyperparameters
        params = model.get_params(deep=True)
        for param_name, param_value in params.items():
            if isinstance(param_value, (str, int, float, bool, type(None))):
                mlflow.log_param(param_name, param_value)
            else:
                mlflow.log_param(param_name, str(param_value)[:5000])
        
        mlflow.log_param("n_features_in", X_train.shape[1])

        # Perform cross-validation with predictions
        y_pred = cross_val_predict(model, X_train, y_train, cv=kfold)

        # Compute overall errors
        overall_mse = mean_squared_error(y_train, y_pred)
        overall_rmse = np.sqrt(overall_mse)
        overall_mae = mean_absolute_error(y_train, y_pred)

        # Compute errors for high and low y_true
        high_mask = y_train > 100000

        high_mse = mean_squared_error(y_train[high_mask], y_pred[high_mask])
        high_rmse = np.sqrt(high_mse)
        high_mae = mean_absolute_error(y_train[high_mask], y_pred[high_mask])
        
        low_mse = mean_squared_error(y_train[~high_mask], y_pred[~high_mask])
        low_rmse = np.sqrt(low_mse)
        low_mae = mean_absolute_error(y_train[~high_mask], y_pred[~high_mask])
    
        print(f"RMSE Overall:  {overall_rmse:.3f} | MAE: {overall_mae:.3f}")
        print(f"RMSE for low y_true (<= 100k): {low_rmse:.3f} | MAE: {low_mae:.3f}")
        print(f"RMSE for high y_true (> 100k): {high_rmse:.3f} | MAE: {high_mae:.3f}")
        

        # Log overall metrics
        mlflow.log_metric("cv_mse", overall_mse)
        mlflow.log_metric("cv_rmse", overall_rmse)
        mlflow.log_metric("cv_mae", overall_mae)

        # Log segmented metrics
        mlflow.log_metric("cv_mse_high", high_mse)
        mlflow.log_metric("cv_rmse_high", high_rmse)
        mlflow.log_metric("cv_mae_high", high_mae)
        mlflow.log_metric("cv_mse_low", low_mse)
        mlflow.log_metric("cv_rmse_low", low_rmse)
        mlflow.log_metric("cv_mae_low", low_mae)

        # Fit final model on full dataset
        if save_model:
            model.fit(X_train, y_train)
            mlflow.sklearn.log_model(model, artifact_path="models")
    
    return overall_rmse


In [10]:

def objective(trial):
    """Objective function for Optuna to optimize XGBoost hyperparameters."""
    
    # Suggest hyperparameters to optimize
    params = {
        "num_leaves": trial.suggest_int("num_leaves", 20, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.1, 0.4, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 600, step=100),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 13.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 13.0),
        "boosting_type": "gbdt",
        "objective": "regression",
        "force_col_wise": True,
        "metric": "rmse",
    }

    rmse = run_experiment(
        'Optuna LightGBM',
        'LightGBM optimization on Log(UltimateIncurredClaimCost)',
        LGBMRegressor(**params),
        kfold=5,
        save_model=False
    )

    return rmse


In [11]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=35)

# Print the best result
print(f"Best trial: {study.best_trial.params}")


[I 2025-03-02 19:22:19,272] A new study created in memory with name: no-name-13e4c4fd-9d3a-40ac-941d-b2dc0d652dd2


[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:22:48,675] Trial 0 finished with value: 24046.17142301382 and parameters: {'num_leaves': 321, 'max_depth': 4, 'learning_rate': 0.2173588778912527, 'n_estimators': 600, 'min_child_weight': 7, 'subsample': 0.8712045177654963, 'colsample_bytree': 0.9299045959979824, 'lambda_l1': 0.43277545545087626, 'lambda_l2': 11.433431860112103}. Best is trial 0 with value: 24046.17142301382.


RMSE Overall:  24046.171 | MAE: 5833.586
RMSE for low y_true (<= 100k): 10076.973 | MAE: 3493.365
RMSE for high y_true (> 100k): 162792.524 | MAE: 133103.188
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:23:07,638] Trial 1 finished with value: 23980.395026350405 and parameters: {'num_leaves': 34, 'max_depth': 7, 'learning_rate': 0.2305688032163441, 'n_estimators': 200, 'min_child_weight': 8, 'subsample': 0.8453661761076805, 'colsample_bytree': 0.826516929038223, 'lambda_l1': 3.3346052749154897, 'lambda_l2': 10.96407939215328}. Best is trial 1 with value: 23980.395026350405.


RMSE Overall:  23980.395 | MAE: 5769.738
RMSE for low y_true (<= 100k): 9804.286 | MAE: 3429.017
RMSE for high y_true (> 100k): 163160.293 | MAE: 133066.560
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:23:25,467] Trial 2 finished with value: 23987.951018518015 and parameters: {'num_leaves': 429, 'max_depth': 8, 'learning_rate': 0.14226040193127687, 'n_estimators': 100, 'min_child_weight': 6, 'subsample': 0.8766226764515677, 'colsample_bytree': 0.663045335355746, 'lambda_l1': 11.269668265714138, 'lambda_l2': 1.8394219699285363}. Best is trial 1 with value: 23980.395026350405.


RMSE Overall:  23987.951 | MAE: 5645.503
RMSE for low y_true (<= 100k): 9347.593 | MAE: 3263.797
RMSE for high y_true (> 100k): 164672.471 | MAE: 135171.242
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:23:36,505] Trial 3 finished with value: 23911.28615618333 and parameters: {'num_leaves': 338, 'max_depth': 3, 'learning_rate': 0.29449186212231054, 'n_estimators': 300, 'min_child_weight': 5, 'subsample': 0.852532133636825, 'colsample_bytree': 0.7146482627915585, 'lambda_l1': 4.177433657836201, 'lambda_l2': 6.998494753004205}. Best is trial 3 with value: 23911.28615618333.


RMSE Overall:  23911.286 | MAE: 5764.204
RMSE for low y_true (<= 100k): 9766.603 | MAE: 3423.150
RMSE for high y_true (> 100k): 162720.877 | MAE: 133079.078
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:25:45,591] Trial 4 finished with value: 24106.44638161159 and parameters: {'num_leaves': 351, 'max_depth': 12, 'learning_rate': 0.14679841401890864, 'n_estimators': 300, 'min_child_weight': 2, 'subsample': 0.7019547926935064, 'colsample_bytree': 0.8121880560638655, 'lambda_l1': 0.91156335064735, 'lambda_l2': 10.805683194649705}. Best is trial 3 with value: 23911.28615618333.


RMSE Overall:  24106.446 | MAE: 5852.180
RMSE for low y_true (<= 100k): 10015.016 | MAE: 3493.350
RMSE for high y_true (> 100k): 163492.659 | MAE: 134133.796
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:28:10,666] Trial 5 finished with value: 24236.037737008373 and parameters: {'num_leaves': 341, 'max_depth': 13, 'learning_rate': 0.10632917571341347, 'n_estimators': 500, 'min_child_weight': 10, 'subsample': 0.6125489978857293, 'colsample_bytree': 0.6339139350768911, 'lambda_l1': 4.638004432089427, 'lambda_l2': 4.7058689894253485}. Best is trial 3 with value: 23911.28615618333.


RMSE Overall:  24236.038 | MAE: 5782.164
RMSE for low y_true (<= 100k): 9808.775 | MAE: 3388.829
RMSE for high y_true (> 100k): 165224.577 | MAE: 135940.306
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:28:35,943] Trial 6 finished with value: 24079.361449737793 and parameters: {'num_leaves': 268, 'max_depth': 10, 'learning_rate': 0.2084862812350318, 'n_estimators': 100, 'min_child_weight': 10, 'subsample': 0.8966741140273038, 'colsample_bytree': 0.6346145956221393, 'lambda_l1': 9.33307349336199, 'lambda_l2': 1.3712501697070527}. Best is trial 3 with value: 23911.28615618333.


RMSE Overall:  24079.361 | MAE: 5752.791
RMSE for low y_true (<= 100k): 9581.759 | MAE: 3374.072
RMSE for high y_true (> 100k): 164679.414 | MAE: 135116.096
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:29:04,931] Trial 7 finished with value: 23966.9893333147 and parameters: {'num_leaves': 73, 'max_depth': 12, 'learning_rate': 0.1413440403939243, 'n_estimators': 200, 'min_child_weight': 7, 'subsample': 0.6352520822001202, 'colsample_bytree': 0.9042381140964697, 'lambda_l1': 11.015159629902353, 'lambda_l2': 5.853106680750194}. Best is trial 3 with value: 23911.28615618333.


RMSE Overall:  23966.989 | MAE: 5677.857
RMSE for low y_true (<= 100k): 9395.901 | MAE: 3303.254
RMSE for high y_true (> 100k): 164353.607 | MAE: 134817.277
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:29:15,830] Trial 8 finished with value: 23933.631498237577 and parameters: {'num_leaves': 43, 'max_depth': 3, 'learning_rate': 0.34804866962294057, 'n_estimators': 300, 'min_child_weight': 6, 'subsample': 0.9325703463553032, 'colsample_bytree': 0.6724087249564572, 'lambda_l1': 12.330819326250332, 'lambda_l2': 5.287835184987618}. Best is trial 3 with value: 23911.28615618333.


RMSE Overall:  23933.631 | MAE: 5748.682
RMSE for low y_true (<= 100k): 9642.796 | MAE: 3395.901
RMSE for high y_true (> 100k): 163303.337 | MAE: 133701.343
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:29:43,303] Trial 9 finished with value: 23998.546804273486 and parameters: {'num_leaves': 197, 'max_depth': 6, 'learning_rate': 0.15966237775433262, 'n_estimators': 300, 'min_child_weight': 3, 'subsample': 0.7949271838173432, 'colsample_bytree': 0.792466748806339, 'lambda_l1': 5.037861419448196, 'lambda_l2': 11.451621056199812}. Best is trial 3 with value: 23911.28615618333.


RMSE Overall:  23998.547 | MAE: 5754.088
RMSE for low y_true (<= 100k): 9755.493 | MAE: 3404.857
RMSE for high y_true (> 100k): 163466.872 | MAE: 133513.703
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:30:40,270] Trial 10 finished with value: 24717.302932099243 and parameters: {'num_leaves': 479, 'max_depth': 15, 'learning_rate': 0.3482595029058139, 'n_estimators': 500, 'min_child_weight': 4, 'subsample': 0.505217360795704, 'colsample_bytree': 0.5023774577062934, 'lambda_l1': 7.860961722310554, 'lambda_l2': 8.27438848638246}. Best is trial 3 with value: 23911.28615618333.


RMSE Overall:  24717.303 | MAE: 6233.217
RMSE for low y_true (<= 100k): 10451.165 | MAE: 3803.132
RMSE for high y_true (> 100k): 167021.502 | MAE: 138389.956
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:30:52,306] Trial 11 finished with value: 23987.7445870306 and parameters: {'num_leaves': 177, 'max_depth': 3, 'learning_rate': 0.3830780202744597, 'n_estimators': 400, 'min_child_weight': 5, 'subsample': 0.9947574179548043, 'colsample_bytree': 0.6933457198752895, 'lambda_l1': 12.710206046388533, 'lambda_l2': 7.911148607358308}. Best is trial 3 with value: 23911.28615618333.


RMSE Overall:  23987.745 | MAE: 5781.300
RMSE for low y_true (<= 100k): 9680.862 | MAE: 3418.016
RMSE for high y_true (> 100k): 163620.278 | MAE: 134305.155
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:31:13,727] Trial 12 finished with value: 24119.186358025418 and parameters: {'num_leaves': 151, 'max_depth': 5, 'learning_rate': 0.2856997611388309, 'n_estimators': 400, 'min_child_weight': 1, 'subsample': 0.9832606519083076, 'colsample_bytree': 0.5458687202473926, 'lambda_l1': 7.9279759957648634, 'lambda_l2': 3.7205608735370217}. Best is trial 3 with value: 23911.28615618333.


RMSE Overall:  24119.186 | MAE: 5876.279
RMSE for low y_true (<= 100k): 10118.013 | MAE: 3531.071
RMSE for high y_true (> 100k): 163251.663 | MAE: 133417.111
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:31:22,137] Trial 13 finished with value: 23897.187626683113 and parameters: {'num_leaves': 106, 'max_depth': 3, 'learning_rate': 0.2848108212816855, 'n_estimators': 200, 'min_child_weight': 5, 'subsample': 0.9357648937673344, 'colsample_bytree': 0.7258722469184484, 'lambda_l1': 2.7767599683743263, 'lambda_l2': 7.567746651210365}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23897.188 | MAE: 5736.096
RMSE for low y_true (<= 100k): 9655.204 | MAE: 3394.958
RMSE for high y_true (> 100k): 162967.530 | MAE: 133055.590
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:31:36,529] Trial 14 finished with value: 23996.269108750366 and parameters: {'num_leaves': 113, 'max_depth': 5, 'learning_rate': 0.2759737430723077, 'n_estimators': 200, 'min_child_weight': 4, 'subsample': 0.7756480989237078, 'colsample_bytree': 0.7439031488726381, 'lambda_l1': 2.245041208816383, 'lambda_l2': 8.425925006348217}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23996.269 | MAE: 5814.202
RMSE for low y_true (<= 100k): 9909.663 | MAE: 3474.318
RMSE for high y_true (> 100k): 162943.192 | MAE: 133065.522
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:32:21,824] Trial 15 finished with value: 24080.153176879197 and parameters: {'num_leaves': 246, 'max_depth': 9, 'learning_rate': 0.27127674250116107, 'n_estimators': 200, 'min_child_weight': 4, 'subsample': 0.8074977926652704, 'colsample_bytree': 0.9851014834235814, 'lambda_l1': 5.507716544786024, 'lambda_l2': 7.156387599162866}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  24080.153 | MAE: 5901.789
RMSE for low y_true (<= 100k): 9930.637 | MAE: 3543.494
RMSE for high y_true (> 100k): 163557.961 | MAE: 134154.336
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:32:29,290] Trial 16 finished with value: 23924.810273811832 and parameters: {'num_leaves': 403, 'max_depth': 3, 'learning_rate': 0.3034138202478118, 'n_estimators': 100, 'min_child_weight': 8, 'subsample': 0.9382297531594881, 'colsample_bytree': 0.7426644464556609, 'lambda_l1': 2.6517984429184533, 'lambda_l2': 9.113945022051768}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23924.810 | MAE: 5729.328
RMSE for low y_true (<= 100k): 9556.507 | MAE: 3374.605
RMSE for high y_true (> 100k): 163507.475 | MAE: 133787.605
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:33:03,190] Trial 17 finished with value: 24212.828353593628 and parameters: {'num_leaves': 254, 'max_depth': 6, 'learning_rate': 0.24493517056022882, 'n_estimators': 400, 'min_child_weight': 5, 'subsample': 0.7168603456934586, 'colsample_bytree': 0.604570726277496, 'lambda_l1': 4.09080376872981, 'lambda_l2': 3.6000465413807063}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  24212.828 | MAE: 5918.721
RMSE for low y_true (<= 100k): 10192.216 | MAE: 3562.709
RMSE for high y_true (> 100k): 163767.535 | MAE: 134047.136
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:33:23,401] Trial 18 finished with value: 23948.199804857435 and parameters: {'num_leaves': 114, 'max_depth': 5, 'learning_rate': 0.1744027623183586, 'n_estimators': 300, 'min_child_weight': 3, 'subsample': 0.9344157559716724, 'colsample_bytree': 0.8773803410705047, 'lambda_l1': 6.567266571622337, 'lambda_l2': 9.460343719054237}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23948.200 | MAE: 5717.374
RMSE for low y_true (<= 100k): 9609.677 | MAE: 3361.803
RMSE for high y_true (> 100k): 163527.638 | MAE: 133821.781
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:33:51,834] Trial 19 finished with value: 24088.349796263956 and parameters: {'num_leaves': 307, 'max_depth': 7, 'learning_rate': 0.1881626917741441, 'n_estimators': 200, 'min_child_weight': 7, 'subsample': 0.8263387631307275, 'colsample_bytree': 0.7110168581940385, 'lambda_l1': 1.5191837374971178, 'lambda_l2': 6.664618537049732}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  24088.350 | MAE: 5817.074
RMSE for low y_true (<= 100k): 9943.577 | MAE: 3463.031
RMSE for high y_true (> 100k): 163582.053 | MAE: 133838.374
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:34:51,312] Trial 20 finished with value: 24377.982042283315 and parameters: {'num_leaves': 394, 'max_depth': 10, 'learning_rate': 0.32284395538354643, 'n_estimators': 500, 'min_child_weight': 2, 'subsample': 0.7432764308086246, 'colsample_bytree': 0.59428453822122, 'lambda_l1': 6.171554743759076, 'lambda_l2': 9.653540053367516}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  24377.982 | MAE: 6033.606
RMSE for low y_true (<= 100k): 10360.982 | MAE: 3655.727
RMSE for high y_true (> 100k): 164546.702 | MAE: 135351.215
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:34:58,401] Trial 21 finished with value: 23945.421427795256 and parameters: {'num_leaves': 420, 'max_depth': 3, 'learning_rate': 0.31047993668685364, 'n_estimators': 100, 'min_child_weight': 9, 'subsample': 0.9301325989442587, 'colsample_bytree': 0.7548247300438464, 'lambda_l1': 2.7782949904536776, 'lambda_l2': 12.51566233078642}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23945.421 | MAE: 5742.093
RMSE for low y_true (<= 100k): 9589.768 | MAE: 3388.379
RMSE for high y_true (> 100k): 163568.662 | MAE: 133745.544
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:35:06,592] Trial 22 finished with value: 23924.221103601758 and parameters: {'num_leaves': 497, 'max_depth': 4, 'learning_rate': 0.39359847387132124, 'n_estimators': 100, 'min_child_weight': 8, 'subsample': 0.9701512523278695, 'colsample_bytree': 0.7455345748190655, 'lambda_l1': 3.3906639733877144, 'lambda_l2': 9.664701769669115}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23924.221 | MAE: 5771.853
RMSE for low y_true (<= 100k): 9707.551 | MAE: 3428.042
RMSE for high y_true (> 100k): 163018.075 | MAE: 133236.697
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:35:17,670] Trial 23 finished with value: 23981.376784777138 and parameters: {'num_leaves': 490, 'max_depth': 4, 'learning_rate': 0.36442748453802426, 'n_estimators': 200, 'min_child_weight': 6, 'subsample': 0.9686226450654853, 'colsample_bytree': 0.7701104680008323, 'lambda_l1': 4.114581674861977, 'lambda_l2': 7.161559693684393}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23981.377 | MAE: 5818.477
RMSE for low y_true (<= 100k): 9843.644 | MAE: 3473.181
RMSE for high y_true (> 100k): 163039.363 | MAE: 133364.131
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:35:25,949] Trial 24 finished with value: 23908.693517445747 and parameters: {'num_leaves': 459, 'max_depth': 4, 'learning_rate': 0.25525647473848356, 'n_estimators': 100, 'min_child_weight': 5, 'subsample': 0.8859641899540083, 'colsample_bytree': 0.8512087651315926, 'lambda_l1': 1.8057433860408303, 'lambda_l2': 9.743585213077246}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23908.694 | MAE: 5701.081
RMSE for low y_true (<= 100k): 9512.411 | MAE: 3349.015
RMSE for high y_true (> 100k): 163516.750 | MAE: 133614.889
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:35:54,853] Trial 25 finished with value: 24137.27131771903 and parameters: {'num_leaves': 449, 'max_depth': 6, 'learning_rate': 0.25044590625667423, 'n_estimators': 300, 'min_child_weight': 5, 'subsample': 0.84958896055518, 'colsample_bytree': 0.8381447609324484, 'lambda_l1': 1.712059499065929, 'lambda_l2': 5.898514723626924}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  24137.271 | MAE: 5907.308
RMSE for low y_true (<= 100k): 10253.525 | MAE: 3568.208
RMSE for high y_true (> 100k): 162939.587 | MAE: 133115.967
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:36:05,864] Trial 26 finished with value: 23960.366658081693 and parameters: {'num_leaves': 375, 'max_depth': 4, 'learning_rate': 0.2603670214356507, 'n_estimators': 200, 'min_child_weight': 4, 'subsample': 0.895252334292457, 'colsample_bytree': 0.8411019039695077, 'lambda_l1': 0.1364929702245099, 'lambda_l2': 7.4204724251389305}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23960.367 | MAE: 5775.324
RMSE for low y_true (<= 100k): 9789.411 | MAE: 3434.834
RMSE for high y_true (> 100k): 163045.866 | MAE: 133059.547
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:36:15,651] Trial 27 finished with value: 23938.67647143942 and parameters: {'num_leaves': 221, 'max_depth': 5, 'learning_rate': 0.23048721957081916, 'n_estimators': 100, 'min_child_weight': 3, 'subsample': 0.9034394432401188, 'colsample_bytree': 0.7155317469266241, 'lambda_l1': 1.6669475590518592, 'lambda_l2': 0.19279041695623}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23938.676 | MAE: 5721.240
RMSE for low y_true (<= 100k): 9597.376 | MAE: 3365.945
RMSE for high y_true (> 100k): 163489.696 | MAE: 133810.606
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:36:26,635] Trial 28 finished with value: 23940.067463906253 and parameters: {'num_leaves': 298, 'max_depth': 3, 'learning_rate': 0.3247551623193609, 'n_estimators': 300, 'min_child_weight': 5, 'subsample': 0.7725903969739459, 'colsample_bytree': 0.8614592995088225, 'lambda_l1': 3.5268035903199277, 'lambda_l2': 10.405069347744346}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23940.067 | MAE: 5772.117
RMSE for low y_true (<= 100k): 9777.027 | MAE: 3424.037
RMSE for high y_true (> 100k): 162921.084 | MAE: 133469.165
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:36:50,492] Trial 29 finished with value: 24050.408341641 and parameters: {'num_leaves': 449, 'max_depth': 4, 'learning_rate': 0.19707758425512178, 'n_estimators': 600, 'min_child_weight': 6, 'subsample': 0.8626989537548623, 'colsample_bytree': 0.9362848426990169, 'lambda_l1': 1.0641754454667378, 'lambda_l2': 12.360244728693}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  24050.408 | MAE: 5808.133
RMSE for low y_true (<= 100k): 10021.444 | MAE: 3471.143
RMSE for high y_true (> 100k): 163013.456 | MAE: 132901.995
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:37:18,079] Trial 30 finished with value: 24129.243273435553 and parameters: {'num_leaves': 366, 'max_depth': 7, 'learning_rate': 0.22159968749035136, 'n_estimators': 200, 'min_child_weight': 7, 'subsample': 0.825821800290908, 'colsample_bytree': 0.7796886933660786, 'lambda_l1': 0.26870672175240484, 'lambda_l2': 4.386430347009877}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  24129.243 | MAE: 5889.025
RMSE for low y_true (<= 100k): 10146.704 | MAE: 3537.868
RMSE for high y_true (> 100k): 163237.127 | MAE: 133753.349
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:37:26,311] Trial 31 finished with value: 23959.19053549558 and parameters: {'num_leaves': 498, 'max_depth': 4, 'learning_rate': 0.3923931534089749, 'n_estimators': 100, 'min_child_weight': 8, 'subsample': 0.9638192602219182, 'colsample_bytree': 0.7180535888707499, 'lambda_l1': 3.333254581517827, 'lambda_l2': 8.979871609976982}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23959.191 | MAE: 5784.541
RMSE for low y_true (<= 100k): 9735.123 | MAE: 3437.715
RMSE for high y_true (> 100k): 163212.977 | MAE: 133413.355
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:37:34,595] Trial 32 finished with value: 23922.216941204893 and parameters: {'num_leaves': 465, 'max_depth': 4, 'learning_rate': 0.30020665844702343, 'n_estimators': 100, 'min_child_weight': 5, 'subsample': 0.9544047197153724, 'colsample_bytree': 0.7994152998573533, 'lambda_l1': 2.3359147860187846, 'lambda_l2': 9.759088564776599}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23922.217 | MAE: 5732.857
RMSE for low y_true (<= 100k): 9600.137 | MAE: 3379.658
RMSE for high y_true (> 100k): 163347.386 | MAE: 133708.277
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:37:44,726] Trial 33 finished with value: 24001.63338782774 and parameters: {'num_leaves': 459, 'max_depth': 5, 'learning_rate': 0.29263719527852544, 'n_estimators': 100, 'min_child_weight': 5, 'subsample': 0.8736351149481225, 'colsample_bytree': 0.8044262249923608, 'lambda_l1': 2.2863020460534926, 'lambda_l2': 11.207361134834079}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  24001.633 | MAE: 5771.017
RMSE for low y_true (<= 100k): 9728.695 | MAE: 3419.253
RMSE for high y_true (> 100k): 163578.785 | MAE: 133668.400
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 19:37:51,948] Trial 34 finished with value: 23950.09829262799 and parameters: {'num_leaves': 286, 'max_depth': 3, 'learning_rate': 0.23927907890245503, 'n_estimators': 100, 'min_child_weight': 6, 'subsample': 0.9060713240803953, 'colsample_bytree': 0.923849896286125, 'lambda_l1': 0.9867816446774444, 'lambda_l2': 8.463786108033023}. Best is trial 13 with value: 23897.187626683113.


RMSE Overall:  23950.098 | MAE: 5691.124
RMSE for low y_true (<= 100k): 9473.517 | MAE: 3325.941
RMSE for high y_true (> 100k): 163974.491 | MAE: 134318.306
Best trial: {'num_leaves': 106, 'max_depth': 3, 'learning_rate': 0.2848108212816855, 'n_estimators': 200, 'min_child_weight': 5, 'subsample': 0.9357648937673344, 'colsample_bytree': 0.7258722469184484, 'lambda_l1': 2.7767599683743263, 'lambda_l2': 7.567746651210365}


In [12]:
import optuna.visualization as vis

fig = vis.plot_optimization_history(study)
fig.show()


In [13]:
fig = vis.plot_param_importances(study)
fig.show()


In [None]:
best_params = study.best_trial.params


regressor_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_model', LGBMRegressor(**best_params))
])

model = TransformedTargetRegressor(
    regressor=regressor_pipeline,
    transformer=FunctionTransformer(np.log1p, inverse_func=np.expm1, validate=False) # here's the change
)

### XGBoost

In [20]:

def objective(trial):
    """Objective function for Optuna to optimize XGBoost hyperparameters."""
    
    # Suggest hyperparameters to optimize
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "max_depth": trial.suggest_int("max_depth", 2, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 0.0, 1.0),
        "reg_lambda": trial.suggest_float("reg_lambda", 0.0, 10.0),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
    }

    rmse = run_experiment(
        'Optuna XGBoost',
        'XGBoost optimization - on Log of UltimateIncurredClaimCost',
        XGBRegressor(**params),
        kfold=5,
        save_model=False
    )

    return rmse


In [21]:
study_xgb = optuna.create_study(direction="minimize")
study_xgb.optimize(objective, n_trials=35)

# Print the best result
print(f"Best trial: {study_xgb.best_trial.params}")


[I 2025-03-02 20:02:33,759] A new study created in memory with name: no-name-7fc531c1-2df7-410b-8f2e-015558e1cb5c
[I 2025-03-02 20:02:53,823] Trial 0 finished with value: 24013.52109264384 and parameters: {'n_estimators': 700, 'max_depth': 8, 'learning_rate': 0.08097393278513144, 'min_child_weight': 2, 'subsample': 0.8562247610452776, 'colsample_bytree': 0.5743978699746811, 'reg_alpha': 0.5660395870750784, 'reg_lambda': 5.839643012137597, 'gamma': 1.041963720305303}. Best is trial 0 with value: 24013.52109264384.


RMSE Overall:  24013.521 | MAE: 5564.914
RMSE for low y_true (<= 100k): 9180.294 | MAE: 3161.824
RMSE for high y_true (> 100k): 165389.156 | MAE: 136253.601


[I 2025-03-02 20:03:11,892] Trial 1 finished with value: 23894.654023240546 and parameters: {'n_estimators': 600, 'max_depth': 3, 'learning_rate': 0.0545607974513588, 'min_child_weight': 4, 'subsample': 0.5997266818951921, 'colsample_bytree': 0.6200245970039228, 'reg_alpha': 0.5234410440583043, 'reg_lambda': 1.5719019228242648, 'gamma': 1.2846430894957357}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  23894.654 | MAE: 5650.191
RMSE for low y_true (<= 100k): 9341.575 | MAE: 3284.943
RMSE for high y_true (> 100k): 163938.173 | MAE: 134280.841


[I 2025-03-02 20:03:19,679] Trial 2 finished with value: 24083.851770668534 and parameters: {'n_estimators': 200, 'max_depth': 3, 'learning_rate': 0.16225365738753686, 'min_child_weight': 1, 'subsample': 0.6715091542567694, 'colsample_bytree': 0.8566876247069521, 'reg_alpha': 0.9858854291063264, 'reg_lambda': 2.4777832294415614, 'gamma': 3.835949532663505}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  24083.852 | MAE: 5638.040
RMSE for low y_true (<= 100k): 9112.962 | MAE: 3211.509
RMSE for high y_true (> 100k): 166156.270 | MAE: 137601.517


[I 2025-03-02 20:03:43,263] Trial 3 finished with value: 24022.188497368155 and parameters: {'n_estimators': 900, 'max_depth': 7, 'learning_rate': 0.046309216172979485, 'min_child_weight': 7, 'subsample': 0.9120773562091782, 'colsample_bytree': 0.7740849528019549, 'reg_alpha': 0.4968687429424459, 'reg_lambda': 7.956329758677665, 'gamma': 1.8667580896007134}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  24022.188 | MAE: 5524.591
RMSE for low y_true (<= 100k): 9018.336 | MAE: 3101.340
RMSE for high y_true (> 100k): 165942.530 | MAE: 137309.662


[I 2025-03-02 20:03:58,541] Trial 4 finished with value: 24103.467406109598 and parameters: {'n_estimators': 200, 'max_depth': 10, 'learning_rate': 0.1673317697137576, 'min_child_weight': 4, 'subsample': 0.6162020052942114, 'colsample_bytree': 0.6164257438971772, 'reg_alpha': 0.694004018976115, 'reg_lambda': 6.751623060239763, 'gamma': 0.41009283366507443}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  24103.467 | MAE: 5732.836
RMSE for low y_true (<= 100k): 9513.219 | MAE: 3342.828
RMSE for high y_true (> 100k): 165090.314 | MAE: 135710.021


[I 2025-03-02 20:04:15,695] Trial 5 finished with value: 24015.463749857776 and parameters: {'n_estimators': 900, 'max_depth': 2, 'learning_rate': 0.2325564821529539, 'min_child_weight': 10, 'subsample': 0.7331008299267221, 'colsample_bytree': 0.9797470288277499, 'reg_alpha': 0.5457878045310046, 'reg_lambda': 4.606505279924539, 'gamma': 3.4760786357992606}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  24015.464 | MAE: 5687.917
RMSE for low y_true (<= 100k): 9331.512 | MAE: 3301.162
RMSE for high y_true (> 100k): 164943.941 | MAE: 135488.226


[I 2025-03-02 20:04:40,976] Trial 6 finished with value: 24032.07750998332 and parameters: {'n_estimators': 1000, 'max_depth': 10, 'learning_rate': 0.08501514172265764, 'min_child_weight': 4, 'subsample': 0.6686728272524329, 'colsample_bytree': 0.6095036228365355, 'reg_alpha': 0.95772012417224, 'reg_lambda': 2.1639911513616594, 'gamma': 1.06750067727367}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  24032.078 | MAE: 5577.784
RMSE for low y_true (<= 100k): 9222.421 | MAE: 3170.283
RMSE for high y_true (> 100k): 165410.972 | MAE: 136506.319


[I 2025-03-02 20:05:04,585] Trial 7 finished with value: 24226.67713397915 and parameters: {'n_estimators': 200, 'max_depth': 8, 'learning_rate': 0.027228466716406836, 'min_child_weight': 8, 'subsample': 0.5223164853419815, 'colsample_bytree': 0.5632840317946828, 'reg_alpha': 0.38009127407056764, 'reg_lambda': 1.3010255881658395, 'gamma': 0.43394857627609695}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  24226.677 | MAE: 5528.488
RMSE for low y_true (<= 100k): 9124.844 | MAE: 3074.419
RMSE for high y_true (> 100k): 167267.050 | MAE: 138989.569


[I 2025-03-02 20:05:17,594] Trial 8 finished with value: 24158.420312504993 and parameters: {'n_estimators': 400, 'max_depth': 6, 'learning_rate': 0.049482796008316184, 'min_child_weight': 9, 'subsample': 0.6184941023801354, 'colsample_bytree': 0.5339708778554677, 'reg_alpha': 0.7886809679625977, 'reg_lambda': 9.940711972448375, 'gamma': 4.47595931102372}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  24158.420 | MAE: 5633.324
RMSE for low y_true (<= 100k): 9175.999 | MAE: 3198.381
RMSE for high y_true (> 100k): 166566.631 | MAE: 138054.273


[I 2025-03-02 20:05:30,037] Trial 9 finished with value: 24030.30559884656 and parameters: {'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.24946974170749608, 'min_child_weight': 10, 'subsample': 0.9265839423242443, 'colsample_bytree': 0.8065051254390246, 'reg_alpha': 0.027730301007578673, 'reg_lambda': 2.43880186005321, 'gamma': 2.7451744611679594}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  24030.306 | MAE: 5560.393
RMSE for low y_true (<= 100k): 9052.045 | MAE: 3135.190
RMSE for high y_true (> 100k): 165907.801 | MAE: 137451.623


[I 2025-03-02 20:05:58,681] Trial 10 finished with value: 24043.247895665216 and parameters: {'n_estimators': 700, 'max_depth': 4, 'learning_rate': 0.011018819826484677, 'min_child_weight': 5, 'subsample': 0.5072818115413829, 'colsample_bytree': 0.6895021158712298, 'reg_alpha': 0.10184168283527029, 'reg_lambda': 0.16770523586077357, 'gamma': 2.3042440137314655}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  24043.248 | MAE: 5563.961
RMSE for low y_true (<= 100k): 9080.130 | MAE: 3143.497
RMSE for high y_true (> 100k): 165928.185 | MAE: 137197.449


[I 2025-03-02 20:06:16,171] Trial 11 finished with value: 23928.959415988804 and parameters: {'n_estimators': 700, 'max_depth': 5, 'learning_rate': 0.09167684621704694, 'min_child_weight': 1, 'subsample': 0.8267303209754834, 'colsample_bytree': 0.6823138616244081, 'reg_alpha': 0.33059416408051606, 'reg_lambda': 4.763613721543579, 'gamma': 1.470004378029721}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  23928.959 | MAE: 5587.277
RMSE for low y_true (<= 100k): 9197.930 | MAE: 3201.134
RMSE for high y_true (> 100k): 164655.445 | MAE: 135354.292


[I 2025-03-02 20:06:40,238] Trial 12 finished with value: 23936.48424786695 and parameters: {'n_estimators': 700, 'max_depth': 5, 'learning_rate': 0.02903270616920913, 'min_child_weight': 2, 'subsample': 0.8196141132260764, 'colsample_bytree': 0.693824089467267, 'reg_alpha': 0.2563534890929571, 'reg_lambda': 4.203285146932837, 'gamma': 1.562568222480964}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  23936.484 | MAE: 5563.623
RMSE for low y_true (<= 100k): 9127.316 | MAE: 3168.291
RMSE for high y_true (> 100k): 164929.491 | MAE: 135830.387


[I 2025-03-02 20:06:53,649] Trial 13 finished with value: 23991.794464772018 and parameters: {'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.0920785455255769, 'min_child_weight': 3, 'subsample': 0.7733187788805009, 'colsample_bytree': 0.674222903409908, 'reg_alpha': 0.29192785615874794, 'reg_lambda': 3.9500351751416938, 'gamma': 2.755881618161821}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  23991.794 | MAE: 5595.027
RMSE for low y_true (<= 100k): 9126.301 | MAE: 3190.342
RMSE for high y_true (> 100k): 165377.031 | MAE: 136370.389


[I 2025-03-02 20:07:12,370] Trial 14 finished with value: 23968.363815560053 and parameters: {'n_estimators': 700, 'max_depth': 2, 'learning_rate': 0.02669966705165006, 'min_child_weight': 1, 'subsample': 0.7462453289487656, 'colsample_bytree': 0.5029353680913522, 'reg_alpha': 0.40158156280131535, 'reg_lambda': 7.784469228655197, 'gamma': 1.1384097880097}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  23968.364 | MAE: 5732.747
RMSE for low y_true (<= 100k): 9363.679 | MAE: 3357.542
RMSE for high y_true (> 100k): 164464.679 | MAE: 134904.894


[I 2025-03-02 20:07:24,346] Trial 15 finished with value: 23987.151814844583 and parameters: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.10854203760827558, 'min_child_weight': 6, 'subsample': 0.8480119304039502, 'colsample_bytree': 0.8746488543537292, 'reg_alpha': 0.16189371040662287, 'reg_lambda': 0.3700647680379614, 'gamma': 2.034459977411272}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  23987.152 | MAE: 5572.356
RMSE for low y_true (<= 100k): 9069.596 | MAE: 3163.059
RMSE for high y_true (> 100k): 165509.333 | MAE: 136598.556


[I 2025-03-02 20:07:51,016] Trial 16 finished with value: 23955.93718205105 and parameters: {'n_estimators': 600, 'max_depth': 4, 'learning_rate': 0.013306832309566351, 'min_child_weight': 3, 'subsample': 0.9886256114089376, 'colsample_bytree': 0.7293478359168791, 'reg_alpha': 0.693629863247543, 'reg_lambda': 3.349176237133057, 'gamma': 0.1038683923122099}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  23955.937 | MAE: 5554.631
RMSE for low y_true (<= 100k): 9105.591 | MAE: 3155.460
RMSE for high y_true (> 100k): 165151.076 | MAE: 136030.160


[I 2025-03-02 20:08:12,892] Trial 17 finished with value: 23947.18699648095 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.06343094319933937, 'min_child_weight': 5, 'subsample': 0.5560582517214927, 'colsample_bytree': 0.63956689393484, 'reg_alpha': 0.4314348319136392, 'reg_lambda': 5.3933121956824515, 'gamma': 1.3085026794811299}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  23947.187 | MAE: 5586.393
RMSE for low y_true (<= 100k): 9219.416 | MAE: 3200.650
RMSE for high y_true (> 100k): 164736.843 | MAE: 135331.637


[I 2025-03-02 20:08:24,097] Trial 18 finished with value: 24010.06321223244 and parameters: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.13442708534517103, 'min_child_weight': 3, 'subsample': 0.6856057945503465, 'colsample_bytree': 0.7448954701875119, 'reg_alpha': 0.2698466899585369, 'reg_lambda': 1.5342459785444142, 'gamma': 3.166464703530127}. Best is trial 1 with value: 23894.654023240546.


RMSE Overall:  24010.063 | MAE: 5632.895
RMSE for low y_true (<= 100k): 9159.250 | MAE: 3227.389
RMSE for high y_true (> 100k): 165424.800 | MAE: 136452.948


[I 2025-03-02 20:08:53,258] Trial 19 finished with value: 23881.496515518007 and parameters: {'n_estimators': 600, 'max_depth': 5, 'learning_rate': 0.04077753010699781, 'min_child_weight': 6, 'subsample': 0.5938922674853023, 'colsample_bytree': 0.6487153486610783, 'reg_alpha': 0.6533362034505681, 'reg_lambda': 6.4550817460181, 'gamma': 0.5938765010835201}. Best is trial 19 with value: 23881.496515518007.


RMSE Overall:  23881.497 | MAE: 5637.354
RMSE for low y_true (<= 100k): 9413.342 | MAE: 3279.134
RMSE for high y_true (> 100k): 163608.402 | MAE: 133885.830


[I 2025-03-02 20:09:06,182] Trial 20 finished with value: 24225.691749846283 and parameters: {'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.01877006922928169, 'min_child_weight': 7, 'subsample': 0.5906428985496706, 'colsample_bytree': 0.6350671369447168, 'reg_alpha': 0.8469280690963699, 'reg_lambda': 6.250920664924595, 'gamma': 0.5845672579095808}. Best is trial 19 with value: 23881.496515518007.


RMSE Overall:  24225.692 | MAE: 5577.147
RMSE for low y_true (<= 100k): 9071.227 | MAE: 3117.556
RMSE for high y_true (> 100k): 167417.680 | MAE: 139338.524


[I 2025-03-02 20:09:25,968] Trial 21 finished with value: 23957.35825592702 and parameters: {'n_estimators': 600, 'max_depth': 5, 'learning_rate': 0.042776902403246886, 'min_child_weight': 6, 'subsample': 0.5694142908706138, 'colsample_bytree': 0.6769085319966592, 'reg_alpha': 0.6394675554524278, 'reg_lambda': 7.344298553318478, 'gamma': 1.6641293529085557}. Best is trial 19 with value: 23881.496515518007.


RMSE Overall:  23957.358 | MAE: 5573.853
RMSE for low y_true (<= 100k): 9144.090 | MAE: 3176.283
RMSE for high y_true (> 100k): 165046.779 | MAE: 135962.317


[I 2025-03-02 20:09:52,709] Trial 22 finished with value: 23904.542772513592 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.05733385532792108, 'min_child_weight': 4, 'subsample': 0.7096154666885992, 'colsample_bytree': 0.5855872002822691, 'reg_alpha': 0.5967650945208628, 'reg_lambda': 9.019315522168512, 'gamma': 0.7737130017361084}. Best is trial 19 with value: 23881.496515518007.


RMSE Overall:  23904.543 | MAE: 5597.534
RMSE for low y_true (<= 100k): 9299.302 | MAE: 3225.053
RMSE for high y_true (> 100k): 164148.582 | MAE: 134621.552


[I 2025-03-02 20:10:45,025] Trial 23 finished with value: 23941.34355045516 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.0378990436261574, 'min_child_weight': 4, 'subsample': 0.6271683239920274, 'colsample_bytree': 0.5805574061952886, 'reg_alpha': 0.6075309331081387, 'reg_lambda': 8.90544803704389, 'gamma': 0.018529567473039465}. Best is trial 19 with value: 23881.496515518007.


RMSE Overall:  23941.344 | MAE: 5682.909
RMSE for low y_true (<= 100k): 9575.911 | MAE: 3323.522
RMSE for high y_true (> 100k): 163579.749 | MAE: 133994.859


[I 2025-03-02 20:11:08,933] Trial 24 finished with value: 23947.65023935795 and parameters: {'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.055281396550839045, 'min_child_weight': 5, 'subsample': 0.6962350302659572, 'colsample_bytree': 0.532056749371324, 'reg_alpha': 0.7621554873888281, 'reg_lambda': 8.746450258142664, 'gamma': 0.7569002690931992}. Best is trial 19 with value: 23881.496515518007.


RMSE Overall:  23947.650 | MAE: 5576.682
RMSE for low y_true (<= 100k): 9237.341 | MAE: 3190.884
RMSE for high y_true (> 100k): 164685.955 | MAE: 135324.970


[I 2025-03-02 20:11:37,766] Trial 25 finished with value: 23958.42236104703 and parameters: {'n_estimators': 800, 'max_depth': 7, 'learning_rate': 0.06847720856228895, 'min_child_weight': 7, 'subsample': 0.5434312770477611, 'colsample_bytree': 0.5930791457295328, 'reg_alpha': 0.47162716562694784, 'reg_lambda': 8.948818342308186, 'gamma': 0.6301332467113275}. Best is trial 19 with value: 23881.496515518007.


RMSE Overall:  23958.422 | MAE: 5639.565
RMSE for low y_true (<= 100k): 9396.181 | MAE: 3265.498
RMSE for high y_true (> 100k): 164283.544 | MAE: 134749.850


[I 2025-03-02 20:12:09,933] Trial 26 finished with value: 23876.745382761994 and parameters: {'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.03463735262663482, 'min_child_weight': 6, 'subsample': 0.6505482542856182, 'colsample_bytree': 0.6439850382002905, 'reg_alpha': 0.8846660757196081, 'reg_lambda': 9.906891446537806, 'gamma': 0.8975397424137239}. Best is trial 26 with value: 23876.745382761994.


RMSE Overall:  23876.745 | MAE: 5622.156
RMSE for low y_true (<= 100k): 9346.064 | MAE: 3259.398
RMSE for high y_true (> 100k): 163779.668 | MAE: 134117.401


[I 2025-03-02 20:12:35,503] Trial 27 finished with value: 23959.276107216185 and parameters: {'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.034431730133079694, 'min_child_weight': 6, 'subsample': 0.6208145439757378, 'colsample_bytree': 0.6486507253856435, 'reg_alpha': 0.9195750368928796, 'reg_lambda': 3.1125321885140345, 'gamma': 2.0464935298516505}. Best is trial 26 with value: 23876.745382761994.


RMSE Overall:  23959.276 | MAE: 5597.222
RMSE for low y_true (<= 100k): 9167.371 | MAE: 3203.910
RMSE for high y_true (> 100k): 164991.953 | MAE: 135754.110


[I 2025-03-02 20:12:42,884] Trial 28 finished with value: 26236.853642264574 and parameters: {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.019274074562730663, 'min_child_weight': 8, 'subsample': 0.6506700277675769, 'colsample_bytree': 0.7180629805517835, 'reg_alpha': 0.8693115298105631, 'reg_lambda': 6.8749789934348025, 'gamma': 4.968456106162377}. Best is trial 26 with value: 23876.745382761994.


RMSE Overall:  26236.854 | MAE: 6237.314
RMSE for low y_true (<= 100k): 10337.887 | MAE: 3496.337
RMSE for high y_true (> 100k): 179756.673 | MAE: 155301.490


[I 2025-03-02 20:13:05,762] Trial 29 finished with value: 24016.720926699476 and parameters: {'n_estimators': 900, 'max_depth': 2, 'learning_rate': 0.01826270589803773, 'min_child_weight': 8, 'subsample': 0.5826399788101869, 'colsample_bytree': 0.7924549386193036, 'reg_alpha': 0.7502404021332595, 'reg_lambda': 5.696898466806424, 'gamma': 0.9731610840291927}. Best is trial 26 with value: 23876.745382761994.


RMSE Overall:  24016.721 | MAE: 5655.356
RMSE for low y_true (<= 100k): 9198.995 | MAE: 3253.778
RMSE for high y_true (> 100k): 165358.378 | MAE: 136261.823


[I 2025-03-02 20:13:28,680] Trial 30 finished with value: 23897.128832563612 and parameters: {'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.03306752125979177, 'min_child_weight': 6, 'subsample': 0.7724981074368483, 'colsample_bytree': 0.5578353795066514, 'reg_alpha': 0.8370986972035989, 'reg_lambda': 9.65348944851665, 'gamma': 0.1804613723330839}. Best is trial 26 with value: 23876.745382761994.


RMSE Overall:  23897.129 | MAE: 5616.703
RMSE for low y_true (<= 100k): 9294.333 | MAE: 3245.138
RMSE for high y_true (> 100k): 164104.095 | MAE: 134590.949


[I 2025-03-02 20:13:50,710] Trial 31 finished with value: 23916.76279389541 and parameters: {'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.03104935105830938, 'min_child_weight': 6, 'subsample': 0.787610431197953, 'colsample_bytree': 0.5601163191933873, 'reg_alpha': 0.853515723493062, 'reg_lambda': 9.80235152458675, 'gamma': 0.2341421479296364}. Best is trial 26 with value: 23876.745382761994.


RMSE Overall:  23916.763 | MAE: 5615.785
RMSE for low y_true (<= 100k): 9283.704 | MAE: 3240.392
RMSE for high y_true (> 100k): 164295.117 | MAE: 134798.187


[I 2025-03-02 20:14:11,144] Trial 32 finished with value: 23954.39905935979 and parameters: {'n_estimators': 600, 'max_depth': 3, 'learning_rate': 0.023270295848974297, 'min_child_weight': 5, 'subsample': 0.6504789667960138, 'colsample_bytree': 0.5085860500382017, 'reg_alpha': 0.6855763160351099, 'reg_lambda': 8.116357241480848, 'gamma': 0.9150520321743434}. Best is trial 26 with value: 23876.745382761994.


RMSE Overall:  23954.399 | MAE: 5656.538
RMSE for low y_true (<= 100k): 9281.444 | MAE: 3274.660
RMSE for high y_true (> 100k): 164605.442 | MAE: 135191.596


[I 2025-03-02 20:14:29,593] Trial 33 finished with value: 23901.80091575552 and parameters: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.04128026750917067, 'min_child_weight': 7, 'subsample': 0.7229352240743268, 'colsample_bytree': 0.6471512056840519, 'reg_alpha': 0.997613720227212, 'reg_lambda': 9.499244744529639, 'gamma': 0.3056608408283233}. Best is trial 26 with value: 23876.745382761994.


RMSE Overall:  23901.801 | MAE: 5594.231
RMSE for low y_true (<= 100k): 9238.167 | MAE: 3214.157
RMSE for high y_true (> 100k): 164314.118 | MAE: 135031.180


[I 2025-03-02 20:14:49,099] Trial 34 finished with value: 23931.152415190332 and parameters: {'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.03625338336894041, 'min_child_weight': 6, 'subsample': 0.5962015178126024, 'colsample_bytree': 0.5466067078968203, 'reg_alpha': 0.5312398566360711, 'reg_lambda': 8.213262925171396, 'gamma': 1.319258309749315}. Best is trial 26 with value: 23876.745382761994.


RMSE Overall:  23931.152 | MAE: 5617.001
RMSE for low y_true (<= 100k): 9268.045 | MAE: 3239.262
RMSE for high y_true (> 100k): 164459.163 | MAE: 134926.972
Best trial: {'n_estimators': 1000, 'max_depth': 4, 'learning_rate': 0.03463735262663482, 'min_child_weight': 6, 'subsample': 0.6505482542856182, 'colsample_bytree': 0.6439850382002905, 'reg_alpha': 0.8846660757196081, 'reg_lambda': 9.906891446537806, 'gamma': 0.8975397424137239}


In [22]:


fig = vis.plot_optimization_history(study_xgb)
fig.show()


In [23]:
fig = vis.plot_param_importances(study_xgb)
fig.show()


In [25]:
fig = vis.plot_slice(study_xgb)
fig.show()


### Train best LightGBM and best XGBoost and output predictions for test

In [26]:
# lightGBM
best_params = study.best_trial.params

regressor_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_model', LGBMRegressor(**best_params))
])

model = TransformedTargetRegressor(
    regressor=regressor_pipeline,
    transformer=FunctionTransformer(np.log1p, inverse_func=np.expm1, validate=False) # here's the change
)

In [27]:
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.027076 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85037
[LightGBM] [Info] Number of data points in the train set: 53999, number of used features: 395
[LightGBM] [Info] Start training from score 8.064763


In [28]:
y_test_pred = model.predict(dataset_test)
y_test_pred




X does not have valid feature names, but LGBMRegressor was fitted with feature names



array([ 6752.83340441,  2586.85431844, 22547.83284088, ...,
        6585.69552794,  7010.17951367,   406.86264687], shape=(36000,))

In [29]:
submission_df = pd.DataFrame(zip(dataset_test.index, y_test_pred), columns=['ClaimNumber', 'UltimateIncurredClaimCost'])

print(submission_df.shape)
submission_df.head()

(36000, 2)


Unnamed: 0,ClaimNumber,UltimateIncurredClaimCost
0,WC8145235,6752.833404
1,WC2005111,2586.854318
2,WC6899143,22547.832841
3,WC5502023,456.162911
4,WC4785156,3134.128018


In [30]:
submission_df.to_csv('../data/output/submission_lgbm_on_log.csv', index=False)

In [35]:
# XGBoost
best_params = study_xgb.best_trial.params

regressor_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_model', XGBRegressor(**best_params))
])

model = TransformedTargetRegressor(
    regressor=regressor_pipeline,
    transformer=FunctionTransformer(np.log1p, inverse_func=np.expm1, validate=False) # here's the change
)

In [36]:
model.fit(X_train, y_train)

In [37]:
y_test_pred = model.predict(dataset_test)
y_test_pred

array([ 6294.9546 ,  2672.6099 , 21258.408  , ...,  5585.009  ,
        7332.1455 ,   391.37875], shape=(36000,), dtype=float32)

In [38]:
submission_df = pd.DataFrame(zip(dataset_test.index, y_test_pred), columns=['ClaimNumber', 'UltimateIncurredClaimCost'])

print(submission_df.shape)
submission_df.head()

(36000, 2)


Unnamed: 0,ClaimNumber,UltimateIncurredClaimCost
0,WC8145235,6294.95459
1,WC2005111,2672.609863
2,WC6899143,21258.408203
3,WC5502023,348.934723
4,WC4785156,2853.67749


In [39]:
submission_df.to_csv('../data/output/submission_xgb_on_log.csv', index=False)