In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import (
    FunctionTransformer,
    PowerTransformer,
    MinMaxScaler,
    OneHotEncoder,
    Binarizer,
    StandardScaler
)


from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error
from lightgbm import LGBMRegressor
from sklearn.decomposition import PCA


import mlflow
import mlflow.sklearn

from matplotlib import pyplot as plt
import seaborn as sns
import optuna

In [2]:
dataset_train = pd.read_csv('../data/processed/dataset_train.csv', index_col=0)
dataset_test = pd.read_csv('../data/processed/dataset_test.csv', index_col=0)

In [3]:
# column names
cols_event_embeddings = [f'event_feat_{i}' for i in range(768)]
cols_event_dummies = ['BACK STRAIN', 'CLEANING INSTRUMENTS', 'CLEANING LEFT SHOULDER SPLINTER', 'CUT WITH KNIFE', 'DEALING CARDS RIGHT TENDON SYNOVITIS', 'FELL', 'FELL FROM LADDER', 'FELL OFF LADDER', 'FRACTURED LEFT WRIST FRACTURE', 'GRINDING FOREIGN BODY', 'GRINDING STEEL BEAM INFECTION', 'HIT AIR HOSE', 'HIT ELBOW', 'HIT FLOOR SHEET', 'INJURY', 'INJURY LIFTING', 'INJURY LIFTING STRAIN', 'JAMMED RIGHT HAND PUNCTURE', 'KNIFE LACERATION', 'KNIFE SLIPPED', 'LACERATION', 'LEFT LEG SNAKE BITE', 'LIFTING', 'LIFTING BOXES', 'LIFTING PATIENT', 'LIFTING TYRES', 'LOWER BACK STRAIN', 'MOTOR VEHICLE COLLISION', 'PREPARING PIPES FOR PAINTING PUNCTURE', 'PULLING CABLES', 'REDBACK SPIDER BITE', 'SLIPPED AND FELL', 'SLIPPED ON FLOOR', 'SLIPPED ON ROLLER', 'SLIPPED ON STAIRS', 'SLIPPED ON WET FLOOR', 'SLIPPED USING LATHE', 'SORTING ALUMINIUM BARS STRAIN', 'SPRAINED RIGHT ANKLE FRACTURE', 'STRAIN', 'STRAIN LIFTING PARTS', 'STRAIN LIFTING STRAIN', 'STRUCK KNIFE LACERATED', 'STRUCK KNIFE LACERATION', 'STRUCK PALLET', 'STRUCK WITH KNIFE', 'USING AIR HOSE STRAIN', 'USING DRILL']
cols_body_parts_embeddings = [f'body_feat_{i}' for i in range(768)]

In [4]:
def train_cv(model, X, y, metric = 'rmse', kfold = 5):

    if metric != 'rmse':
        raise NotImplemented

    scores = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')

    mse_scores = -scores
    rmse_scores = np.sqrt(mse_scores)

    return rmse_scores.mean(), rmse_scores.std()

In [5]:
log_transformer = Pipeline([
    ('log', FunctionTransformer(np.log, validate=False)),
    ('scaler', StandardScaler()) 
])

# gender to bool 
def gender_to_bool(gender_column):
    """
    - Fill missing values with 'M'
    - Replace 'U' with 'M'
    - Return a boolean-ish column: 1 for 'M', 0 otherwise
    """
    g = pd.Series(gender_column.squeeze(), dtype=str).fillna('M').replace('U', 'M')
    is_male = (g == 'M').astype(int)
    return is_male.values.reshape(-1, 1)

gender_transformer = FunctionTransformer(gender_to_bool, validate=False)

# hours worked per week -> to buckets -> to one_hot
def bucket_hours_worked(dtt_array):
    return pd.cut(
        dtt_array.squeeze(), 
        bins=[-np.inf, 37, 41, np.inf],
        labels=["<=37", "37-41", ">41"]
    ).astype(str).values.reshape(-1, 1)

hours_worked_bucketer = FunctionTransformer(bucket_hours_worked, validate=False)
hours_worked_encoder = OneHotEncoder(drop='first')
hours_worked_pipeline = Pipeline([
    ('bucketizer', hours_worked_bucketer),
    ('encoder', hours_worked_encoder)
])

# DaysToReport (DateReported - DateTimeOfAccident) -> to buckets -> to one_hot
def bucket_days_to_report(dtt_array):
    return pd.cut(
        dtt_array.squeeze(), 
        bins=[-np.inf, 80, 300, 500, np.inf],
        labels=["<=80", "80-300", "300-500", ">500"]
    ).astype(str).values.reshape(-1, 1)

days_to_report_bucketer = FunctionTransformer(bucket_days_to_report, validate=False)
days_to_report_encoder = OneHotEncoder(drop='first')
days_to_report_pipeline = Pipeline([
    ('bucketizer', days_to_report_bucketer),
    ('encoder', days_to_report_encoder)
])

# DaysWorkedPerWeek -> 1 if equals 5, 0 in any other case 
def days_worked_binarize(days_array):
    # Ensure we handle arrays or DataFrames by squeezing to 1D
    days = days_array.squeeze()
    binarized = (days == 5).astype(int)
    # Return as 2D array: (n_samples x 1)
    return binarized.values.reshape(-1, 1) if isinstance(days, pd.Series) else binarized.reshape(-1, 1)

days_worked_transformer = FunctionTransformer(days_worked_binarize, validate=False)

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('log_inc', log_transformer, ['InitialIncurredCalimsCost', 'WeeklyWages']),
        ('minmax_scaler', MinMaxScaler(), ['Age', 'YearAccident']),
        ('gender_bool', gender_transformer, ['Gender']),
        ('hww_bool_onehot', hours_worked_pipeline, ['HoursWorkedPerWeek']),
        ('dtt_bool_onehot', days_to_report_pipeline, ['DaysToReport']),
        ('has_dependent_bool', Binarizer(threshold=0), ['DependentChildren']),
        ('worked_five_days_bool', days_worked_transformer, ['DaysWorkedPerWeek']),
        ('onehot', OneHotEncoder(drop='first'), ['MaritalStatus', 'PartTimeFullTime']),
        ('pca_event_embeddings', PCA(n_components=250), cols_event_embeddings),
        ('passthrough_event_dummies', 'passthrough', cols_event_dummies),
        ('pca_body_parts_embeddings', PCA(n_components=80), cols_body_parts_embeddings),
        ('std_scaler', StandardScaler(), ['event_median_cost'])
    ],
    remainder='drop'
)

In [None]:
regressor_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_model', LGBMRegressor(force_col_wise=True))
])

model = TransformedTargetRegressor(
    regressor=regressor_pipeline,
    transformer=PowerTransformer(method='box-cox', standardize=False)
)

In [8]:
dataset_train

Unnamed: 0,DateTimeOfAccident,DateReported,Age,Gender,MaritalStatus,DependentChildren,DependentsOther,WeeklyWages,PartTimeFullTime,HoursWorkedPerWeek,...,body_feat_758,body_feat_759,body_feat_760,body_feat_761,body_feat_762,body_feat_763,body_feat_764,body_feat_765,body_feat_766,body_feat_767
WC8285054,2002-04-09 07:00:00+00:00,2002-07-05 00:00:00+00:00,48.0,M,M,0.0,0.0,500.00,F,38.0,...,0.068680,0.025073,0.005782,-0.038884,0.058474,-0.020736,0.029686,-0.014542,-0.063414,0.004848
WC6982224,1999-01-07 11:00:00+00:00,1999-01-20 00:00:00+00:00,43.0,F,M,0.0,0.0,509.34,F,37.5,...,0.088765,0.020062,0.012920,-0.024838,0.031098,-0.021392,-0.002665,-0.011949,-0.061116,-0.003289
WC5481426,1996-03-25 00:00:00+00:00,1996-04-14 00:00:00+00:00,30.0,M,U,0.0,0.0,709.10,F,38.0,...,0.064991,0.032580,0.008295,-0.054072,0.052691,-0.002578,0.047455,-0.003457,-0.033384,-0.000423
WC9775968,2005-06-22 13:00:00+00:00,2005-07-22 00:00:00+00:00,41.0,M,S,0.0,0.0,555.46,F,38.0,...,0.070748,0.021465,0.012034,-0.043719,0.070600,0.006588,-0.030409,-0.009374,-0.076178,0.045278
WC2634037,1990-08-29 08:00:00+00:00,1990-09-27 00:00:00+00:00,36.0,M,M,0.0,0.0,377.10,F,38.0,...,0.079619,0.021140,0.043047,-0.006216,0.047252,-0.019845,0.000466,-0.033875,-0.028523,-0.011660
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
WC9370727,2004-08-21 18:00:00+00:00,2004-09-08 00:00:00+00:00,32.0,F,S,0.0,0.0,500.00,F,38.0,...,0.062220,0.018751,0.004911,-0.018006,0.007490,-0.031072,0.042807,-0.027719,-0.058011,-0.024113
WC8396269,2002-04-28 09:00:00+00:00,2002-09-03 00:00:00+00:00,20.0,F,S,0.0,0.0,500.00,F,40.0,...,0.077031,0.006508,0.023904,-0.006732,0.057064,-0.015958,0.007836,-0.026323,-0.068814,0.009221
WC3609528,1992-02-28 09:00:00+00:00,1992-03-18 00:00:00+00:00,19.0,M,S,0.0,0.0,283.00,F,40.0,...,0.049300,-0.024605,0.006685,-0.008855,0.019896,-0.023859,0.005520,-0.028199,-0.028116,-0.006528
WC5038565,1995-01-10 07:00:00+00:00,1995-01-31 00:00:00+00:00,24.0,M,S,0.0,0.0,200.00,F,38.0,...,0.051226,0.010259,-0.005054,-0.001850,-0.006115,-0.033084,0.029406,-0.032876,-0.057922,-0.025492


In [9]:

scores = cross_val_score(
    model, 
    dataset_train, 
    dataset_train['UltimateIncurredClaimCost'], 
    cv=5, 
    scoring='neg_mean_squared_error'
)
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)

print(f"CV MSE:  {mse_scores.mean():.3f}  (+/- {mse_scores.std():.3f})")
print(f"CV RMSE: {rmse_scores.mean():.3f}  (+/- {rmse_scores.std():.3f})")

[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370
CV MSE:  571107743.684  (+/- 52125504.997)
CV RMSE: 23871.174  (+/- 1129.060)




In [10]:
X_train = dataset_train.drop(columns=['UltimateIncurredClaimCost'])
y_train = dataset_train['UltimateIncurredClaimCost']

def run_experiment(
        experiment_name, 
        run_name, 
        regressor_object, 
        kfold=5, 
        save_model=True
    ):
    """
    Runs an ML experiment with cross-validation and logs metrics to MLflow,
    tracking overall and segmented performance for targets above and below 100,000.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.
    - run_name (str): Name of the MLflow run.
    - regressor_object: The regression model.
    - kfold (int): Number of folds for cross-validation.
    - save_model (bool): Whether to save the trained model in MLflow.

    Logs:
    - Overall CV MSE, RMSE, and MAE
    - MSE, RMSE, and MAE for y_true > 100,000
    - MSE, RMSE, and MAE for y_true ≤ 100,000
    """

    # Define pipeline with preprocessing and model
    regressor_pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('linear_model', regressor_object)
    ])

    model = TransformedTargetRegressor(
        regressor=regressor_pipeline,
        transformer=PowerTransformer(method='box-cox', standardize=False)
    )

    # Set MLflow experiment
    mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name=run_name):
        # Log hyperparameters
        params = model.get_params(deep=True)
        for param_name, param_value in params.items():
            if isinstance(param_value, (str, int, float, bool, type(None))):
                mlflow.log_param(param_name, param_value)
            else:
                mlflow.log_param(param_name, str(param_value)[:5000])
        
        mlflow.log_param("n_features_in", X_train.shape[1])

        # Perform cross-validation with predictions
        y_pred = cross_val_predict(model, X_train, y_train, cv=kfold)

        # Compute overall errors
        overall_mse = mean_squared_error(y_train, y_pred)
        overall_rmse = np.sqrt(overall_mse)
        overall_mae = mean_absolute_error(y_train, y_pred)

        # Compute errors for high and low y_true
        high_mask = y_train > 100000

        high_mse = mean_squared_error(y_train[high_mask], y_pred[high_mask])
        high_rmse = np.sqrt(high_mse)
        high_mae = mean_absolute_error(y_train[high_mask], y_pred[high_mask])
        
        low_mse = mean_squared_error(y_train[~high_mask], y_pred[~high_mask])
        low_rmse = np.sqrt(low_mse)
        low_mae = mean_absolute_error(y_train[~high_mask], y_pred[~high_mask])
    
        print(f"RMSE Overall:  {overall_rmse:.3f} | MAE: {overall_mae:.3f}")
        print(f"RMSE for low y_true (<= 100k): {low_rmse:.3f} | MAE: {low_mae:.3f}")
        print(f"RMSE for high y_true (> 100k): {high_rmse:.3f} | MAE: {high_mae:.3f}")
        

        # Log overall metrics
        mlflow.log_metric("cv_mse", overall_mse)
        mlflow.log_metric("cv_rmse", overall_rmse)
        mlflow.log_metric("cv_mae", overall_mae)

        # Log segmented metrics
        mlflow.log_metric("cv_mse_high", high_mse)
        mlflow.log_metric("cv_rmse_high", high_rmse)
        mlflow.log_metric("cv_mae_high", high_mae)
        mlflow.log_metric("cv_mse_low", low_mse)
        mlflow.log_metric("cv_rmse_low", low_rmse)
        mlflow.log_metric("cv_mae_low", low_mae)

        # Fit final model on full dataset
        if save_model:
            model.fit(X_train, y_train)
            mlflow.sklearn.log_model(model, artifact_path="models")
    
    return overall_rmse


In [11]:

def objective(trial):
    """Objective function for Optuna to optimize XGBoost hyperparameters."""
    
    # Suggest hyperparameters to optimize
    params = {
        #"num_leaves": trial.suggest_int("num_leaves", 20, 500),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "learning_rate": trial.suggest_float("learning_rate", 0.09, 0.4, log=True),
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 20),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 10.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 10.0),
        "boosting_type": "gbdt",
        "objective": "regression",
        "force_col_wise": True,
        "metric": "rmse",
    }

    rmse = run_experiment(
        'Optuna LightGBM',
        'LightGBM optimization',
        LGBMRegressor(**params),
        kfold=5,
        save_model=False
    )

    return rmse


In [12]:
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=35)

# Print the best result
print(f"Best trial: {study.best_trial.params}")


[I 2025-03-02 18:43:25,613] A new study created in memory with name: no-name-81154639-d46b-4444-89d0-2c17f57139ad


[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:44:23,358] Trial 0 finished with value: 23983.244624546845 and parameters: {'max_depth': 14, 'learning_rate': 0.12885072465808287, 'n_estimators': 1000, 'min_child_weight': 7, 'subsample': 0.5141082823736073, 'colsample_bytree': 0.741327173181324, 'lambda_l1': 7.942096909928619, 'lambda_l2': 2.977233533779348}. Best is trial 0 with value: 23983.244624546845.


RMSE Overall:  23983.245 | MAE: 5780.766
RMSE for low y_true (<= 100k): 9774.816 | MAE: 3432.583
RMSE for high y_true (> 100k): 163279.608 | MAE: 133483.384
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:44:45,831] Trial 1 finished with value: 23983.11465233243 and parameters: {'max_depth': 15, 'learning_rate': 0.21863555571940108, 'n_estimators': 300, 'min_child_weight': 20, 'subsample': 0.566642206823366, 'colsample_bytree': 0.9051280832122182, 'lambda_l1': 8.869057241899329, 'lambda_l2': 6.707122934148144}. Best is trial 1 with value: 23983.11465233243.


RMSE Overall:  23983.115 | MAE: 5743.117
RMSE for low y_true (<= 100k): 9631.953 | MAE: 3383.889
RMSE for high y_true (> 100k): 163739.623 | MAE: 134046.426
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:44:56,472] Trial 2 finished with value: 23999.85250042222 and parameters: {'max_depth': 14, 'learning_rate': 0.27135319214539444, 'n_estimators': 100, 'min_child_weight': 7, 'subsample': 0.8180419077972416, 'colsample_bytree': 0.5777426106868437, 'lambda_l1': 5.1356219667828515, 'lambda_l2': 1.0644079613652313}. Best is trial 1 with value: 23983.11465233243.


RMSE Overall:  23999.853 | MAE: 5755.346
RMSE for low y_true (<= 100k): 9677.867 | MAE: 3399.797
RMSE for high y_true (> 100k): 163728.213 | MAE: 133858.598
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:45:04,179] Trial 3 finished with value: 23924.15147951742 and parameters: {'max_depth': 4, 'learning_rate': 0.2928749936708286, 'n_estimators': 100, 'min_child_weight': 16, 'subsample': 0.9043634155245212, 'colsample_bytree': 0.7147813438088808, 'lambda_l1': 2.4516650754091565, 'lambda_l2': 7.100085364277814}. Best is trial 3 with value: 23924.15147951742.


RMSE Overall:  23924.151 | MAE: 5736.033
RMSE for low y_true (<= 100k): 9665.416 | MAE: 3387.037
RMSE for high y_true (> 100k): 163153.609 | MAE: 133482.850
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:45:44,159] Trial 4 finished with value: 24002.079542376265 and parameters: {'max_depth': 12, 'learning_rate': 0.21714970630665492, 'n_estimators': 600, 'min_child_weight': 15, 'subsample': 0.669960692599745, 'colsample_bytree': 0.9832036320114681, 'lambda_l1': 5.361959787147869, 'lambda_l2': 2.3099452006353216}. Best is trial 3 with value: 23924.15147951742.


RMSE Overall:  24002.080 | MAE: 5841.156
RMSE for low y_true (<= 100k): 9904.804 | MAE: 3496.098
RMSE for high y_true (> 100k): 163006.644 | MAE: 133373.867
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:46:03,331] Trial 5 finished with value: 23862.926693452253 and parameters: {'max_depth': 10, 'learning_rate': 0.09590855249809059, 'n_estimators': 200, 'min_child_weight': 5, 'subsample': 0.7165726002125996, 'colsample_bytree': 0.9960603943441494, 'lambda_l1': 0.42213821897392423, 'lambda_l2': 7.850669836868018}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  23862.927 | MAE: 5603.022
RMSE for low y_true (<= 100k): 9304.513 | MAE: 3231.245
RMSE for high y_true (> 100k): 163796.788 | MAE: 134588.757
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:46:56,757] Trial 6 finished with value: 24007.836997327573 and parameters: {'max_depth': 9, 'learning_rate': 0.11503972896145787, 'n_estimators': 1000, 'min_child_weight': 19, 'subsample': 0.6031092390991818, 'colsample_bytree': 0.5086492039038086, 'lambda_l1': 7.125593924827976, 'lambda_l2': 6.59842307278784}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  24007.837 | MAE: 5803.550
RMSE for low y_true (<= 100k): 9913.107 | MAE: 3456.893
RMSE for high y_true (> 100k): 163026.152 | MAE: 133423.179
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:47:06,084] Trial 7 finished with value: 23974.177551149474 and parameters: {'max_depth': 5, 'learning_rate': 0.12166567336745643, 'n_estimators': 100, 'min_child_weight': 6, 'subsample': 0.7200974369215347, 'colsample_bytree': 0.5686802943018593, 'lambda_l1': 7.756439199344609, 'lambda_l2': 7.779931227077293}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  23974.178 | MAE: 5642.636
RMSE for low y_true (<= 100k): 9325.965 | MAE: 3261.582
RMSE for high y_true (> 100k): 164628.066 | MAE: 135132.906
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:47:40,403] Trial 8 finished with value: 24384.51334796267 and parameters: {'max_depth': 13, 'learning_rate': 0.3836031081380503, 'n_estimators': 500, 'min_child_weight': 11, 'subsample': 0.5919979352669787, 'colsample_bytree': 0.9958736712530989, 'lambda_l1': 2.4819749915142006, 'lambda_l2': 9.591036290399522}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  24384.513 | MAE: 6132.605
RMSE for low y_true (<= 100k): 10755.375 | MAE: 3792.862
RMSE for high y_true (> 100k): 163218.693 | MAE: 133376.197
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:48:40,675] Trial 9 finished with value: 24023.987113127838 and parameters: {'max_depth': 8, 'learning_rate': 0.17343065428308233, 'n_estimators': 1000, 'min_child_weight': 3, 'subsample': 0.9866953896675671, 'colsample_bytree': 0.8710946292676356, 'lambda_l1': 5.366392424141364, 'lambda_l2': 8.318545624620215}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  24023.987 | MAE: 5850.294
RMSE for low y_true (<= 100k): 9989.499 | MAE: 3508.618
RMSE for high y_true (> 100k): 162904.277 | MAE: 133199.066
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:49:10,686] Trial 10 finished with value: 23929.79071639841 and parameters: {'max_depth': 11, 'learning_rate': 0.101183096460023, 'n_estimators': 400, 'min_child_weight': 1, 'subsample': 0.8041729391572938, 'colsample_bytree': 0.8341872132304209, 'lambda_l1': 0.5764750745208156, 'lambda_l2': 4.686383181300215}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  23929.791 | MAE: 5706.240
RMSE for low y_true (<= 100k): 9661.545 | MAE: 3354.946
RMSE for high y_true (> 100k): 163211.872 | MAE: 133578.039
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:49:19,201] Trial 11 finished with value: 23926.881242414525 and parameters: {'max_depth': 3, 'learning_rate': 0.3671701925377869, 'n_estimators': 200, 'min_child_weight': 13, 'subsample': 0.9775618414788, 'colsample_bytree': 0.7191138011656809, 'lambda_l1': 0.9099914228212082, 'lambda_l2': 5.120832470876403}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  23926.881 | MAE: 5784.030
RMSE for low y_true (<= 100k): 9840.887 | MAE: 3449.595
RMSE for high y_true (> 100k): 162604.401 | MAE: 132738.998
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:49:59,928] Trial 12 finished with value: 24077.630605380204 and parameters: {'max_depth': 7, 'learning_rate': 0.16582573061788158, 'n_estimators': 700, 'min_child_weight': 16, 'subsample': 0.8735715060037933, 'colsample_bytree': 0.6821753751104394, 'lambda_l1': 2.788438694939502, 'lambda_l2': 9.996940005069435}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  24077.631 | MAE: 5852.847
RMSE for low y_true (<= 100k): 10071.595 | MAE: 3509.481
RMSE for high y_true (> 100k): 163067.918 | MAE: 133293.514
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:50:19,139] Trial 13 finished with value: 24164.74881917839 and parameters: {'max_depth': 5, 'learning_rate': 0.29242327485448044, 'n_estimators': 300, 'min_child_weight': 10, 'subsample': 0.8995308023670932, 'colsample_bytree': 0.8040066001810885, 'lambda_l1': 2.4766731090087455, 'lambda_l2': 5.48420412445619}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  24164.749 | MAE: 5916.638
RMSE for low y_true (<= 100k): 10380.616 | MAE: 3578.708
RMSE for high y_true (> 100k): 162727.373 | MAE: 133061.680
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:50:31,161] Trial 14 finished with value: 23907.868899830588 and parameters: {'max_depth': 10, 'learning_rate': 0.09436636860716799, 'n_estimators': 100, 'min_child_weight': 4, 'subsample': 0.7340741609124537, 'colsample_bytree': 0.6637863621794857, 'lambda_l1': 0.06360182462587716, 'lambda_l2': 8.158149082928414}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  23907.869 | MAE: 5587.282
RMSE for low y_true (<= 100k): 9260.716 | MAE: 3210.373
RMSE for high y_true (> 100k): 164293.980 | MAE: 134852.125
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:50:54,236] Trial 15 finished with value: 23935.597151500675 and parameters: {'max_depth': 10, 'learning_rate': 0.10231215317074795, 'n_estimators': 300, 'min_child_weight': 3, 'subsample': 0.6963974213575781, 'colsample_bytree': 0.6458479747756705, 'lambda_l1': 0.19908147636274304, 'lambda_l2': 8.792472077388188}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  23935.597 | MAE: 5674.980
RMSE for low y_true (<= 100k): 9610.849 | MAE: 3324.537
RMSE for high y_true (> 100k): 163421.669 | MAE: 133500.507
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:51:10,594] Trial 16 finished with value: 23868.97572225675 and parameters: {'max_depth': 7, 'learning_rate': 0.0914778136748221, 'n_estimators': 200, 'min_child_weight': 5, 'subsample': 0.7709756488155141, 'colsample_bytree': 0.6217059644236205, 'lambda_l1': 3.8711112157315886, 'lambda_l2': 4.010579398627382}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  23868.976 | MAE: 5624.800
RMSE for low y_true (<= 100k): 9411.194 | MAE: 3268.382
RMSE for high y_true (> 100k): 163513.902 | MAE: 133775.269
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:51:55,957] Trial 17 finished with value: 24008.333011210594 and parameters: {'max_depth': 7, 'learning_rate': 0.14569560958011368, 'n_estimators': 800, 'min_child_weight': 9, 'subsample': 0.8016687987586186, 'colsample_bytree': 0.6167539544797879, 'lambda_l1': 3.9510746527316174, 'lambda_l2': 3.4398405790287137}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  24008.333 | MAE: 5813.207
RMSE for low y_true (<= 100k): 9985.854 | MAE: 3477.234
RMSE for high y_true (> 100k): 162788.577 | MAE: 132851.801
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:52:30,919] Trial 18 finished with value: 23915.436400909857 and parameters: {'max_depth': 7, 'learning_rate': 0.09003371713997294, 'n_estimators': 500, 'min_child_weight': 1, 'subsample': 0.6577457759036539, 'colsample_bytree': 0.9296456823854494, 'lambda_l1': 3.9483293541487754, 'lambda_l2': 4.103545119932238}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  23915.436 | MAE: 5666.875
RMSE for low y_true (<= 100k): 9526.597 | MAE: 3306.340
RMSE for high y_true (> 100k): 163526.446 | MAE: 134041.230
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:52:59,270] Trial 19 finished with value: 24038.174407562787 and parameters: {'max_depth': 9, 'learning_rate': 0.1451524574090323, 'n_estimators': 400, 'min_child_weight': 5, 'subsample': 0.7714255210860502, 'colsample_bytree': 0.7913573575163179, 'lambda_l1': 1.453736695323396, 'lambda_l2': 0.799451382768245}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  24038.174 | MAE: 5762.272
RMSE for low y_true (<= 100k): 9846.699 | MAE: 3425.295
RMSE for high y_true (> 100k): 163491.945 | MAE: 132855.437
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:53:15,729] Trial 20 finished with value: 23970.350065197934 and parameters: {'max_depth': 11, 'learning_rate': 0.10932467237664849, 'n_estimators': 200, 'min_child_weight': 8, 'subsample': 0.8447977348467888, 'colsample_bytree': 0.5058670045138022, 'lambda_l1': 6.401525933666015, 'lambda_l2': 5.681421544467383}. Best is trial 5 with value: 23862.926693452253.


RMSE Overall:  23970.350 | MAE: 5666.565
RMSE for low y_true (<= 100k): 9500.342 | MAE: 3303.750
RMSE for high y_true (> 100k): 164053.963 | MAE: 134164.948
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:53:33,361] Trial 21 finished with value: 23853.022562628455 and parameters: {'max_depth': 10, 'learning_rate': 0.09134879752884997, 'n_estimators': 200, 'min_child_weight': 4, 'subsample': 0.7406110320807305, 'colsample_bytree': 0.6527923965773857, 'lambda_l1': 1.5854842816691632, 'lambda_l2': 8.833659109656566}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23853.023 | MAE: 5615.321
RMSE for low y_true (<= 100k): 9403.465 | MAE: 3262.643
RMSE for high y_true (> 100k): 163409.119 | MAE: 133562.416
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:53:49,838] Trial 22 finished with value: 23887.439448845646 and parameters: {'max_depth': 8, 'learning_rate': 0.13400553889592817, 'n_estimators': 200, 'min_child_weight': 5, 'subsample': 0.7656698223651479, 'colsample_bytree': 0.5995308389710214, 'lambda_l1': 1.4954135113628837, 'lambda_l2': 9.217553539589007}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23887.439 | MAE: 5685.289
RMSE for low y_true (<= 100k): 9627.620 | MAE: 3337.679
RMSE for high y_true (> 100k): 162977.127 | MAE: 133356.735
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:54:16,664] Trial 23 finished with value: 23884.695405752853 and parameters: {'max_depth': 6, 'learning_rate': 0.09350482776322516, 'n_estimators': 400, 'min_child_weight': 3, 'subsample': 0.6726528258845504, 'colsample_bytree': 0.7790942778223344, 'lambda_l1': 3.774825063757851, 'lambda_l2': 7.470485593504138}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23884.695 | MAE: 5679.230
RMSE for low y_true (<= 100k): 9535.179 | MAE: 3326.703
RMSE for high y_true (> 100k): 163250.178 | MAE: 133618.080
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:54:33,536] Trial 24 finished with value: 23953.967410649682 and parameters: {'max_depth': 9, 'learning_rate': 0.10974750642690526, 'n_estimators': 200, 'min_child_weight': 2, 'subsample': 0.7334621078176972, 'colsample_bytree': 0.5584001136239846, 'lambda_l1': 1.5989453348025777, 'lambda_l2': 6.168286056278208}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23953.967 | MAE: 5666.533
RMSE for low y_true (<= 100k): 9519.638 | MAE: 3307.008
RMSE for high y_true (> 100k): 163860.490 | MAE: 133985.971
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:54:55,938] Trial 25 finished with value: 23892.029030320216 and parameters: {'max_depth': 11, 'learning_rate': 0.10127260813510691, 'n_estimators': 300, 'min_child_weight': 5, 'subsample': 0.6449016736590364, 'colsample_bytree': 0.6327544953789176, 'lambda_l1': 2.9649946290844458, 'lambda_l2': 1.8144777901302218}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23892.029 | MAE: 5666.543
RMSE for low y_true (<= 100k): 9588.387 | MAE: 3318.718
RMSE for high y_true (> 100k): 163140.089 | MAE: 133349.695
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:55:13,265] Trial 26 finished with value: 23871.190551663687 and parameters: {'max_depth': 10, 'learning_rate': 0.11917616530894631, 'n_estimators': 200, 'min_child_weight': 11, 'subsample': 0.7621706106518572, 'colsample_bytree': 0.6976311221591945, 'lambda_l1': 1.1165961984365846, 'lambda_l2': 8.822202868423425}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23871.191 | MAE: 5647.521
RMSE for low y_true (<= 100k): 9483.526 | MAE: 3299.161
RMSE for high y_true (> 100k): 163304.399 | MAE: 133359.730
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:55:39,495] Trial 27 finished with value: 24073.174760453123 and parameters: {'max_depth': 12, 'learning_rate': 0.1493512832586195, 'n_estimators': 400, 'min_child_weight': 7, 'subsample': 0.7039941853156342, 'colsample_bytree': 0.543314901701833, 'lambda_l1': 1.8871554288543453, 'lambda_l2': 0.07620691752046849}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  24073.175 | MAE: 5813.946
RMSE for low y_true (<= 100k): 10029.547 | MAE: 3468.025
RMSE for high y_true (> 100k): 163172.391 | MAE: 133393.562
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:56:17,846] Trial 28 finished with value: 23893.45183377283 and parameters: {'max_depth': 8, 'learning_rate': 0.10235023504128188, 'n_estimators': 600, 'min_child_weight': 9, 'subsample': 0.8480887150059457, 'colsample_bytree': 0.7529913510217863, 'lambda_l1': 4.431430644751475, 'lambda_l2': 4.272069812072836}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23893.452 | MAE: 5717.441
RMSE for low y_true (<= 100k): 9705.378 | MAE: 3377.394
RMSE for high y_true (> 100k): 162774.998 | MAE: 132977.598
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:56:40,120] Trial 29 finished with value: 23908.90388612706 and parameters: {'max_depth': 12, 'learning_rate': 0.1309189166335609, 'n_estimators': 300, 'min_child_weight': 7, 'subsample': 0.501247159704613, 'colsample_bytree': 0.7489672081831672, 'lambda_l1': 9.950091867420355, 'lambda_l2': 2.9622599387055653}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23908.904 | MAE: 5675.396
RMSE for low y_true (<= 100k): 9471.342 | MAE: 3318.939
RMSE for high y_true (> 100k): 163648.051 | MAE: 133827.961
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:57:30,980] Trial 30 finished with value: 23971.982427211493 and parameters: {'max_depth': 6, 'learning_rate': 0.09231745157909571, 'n_estimators': 900, 'min_child_weight': 4, 'subsample': 0.6331119941062888, 'colsample_bytree': 0.672846519559568, 'lambda_l1': 3.26588829707755, 'lambda_l2': 3.6380093741949704}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23971.982 | MAE: 5759.556
RMSE for low y_true (<= 100k): 9910.647 | MAE: 3422.555
RMSE for high y_true (> 100k): 162741.827 | MAE: 132854.076
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:57:48,253] Trial 31 finished with value: 23882.49480164726 and parameters: {'max_depth': 10, 'learning_rate': 0.1224753816249068, 'n_estimators': 200, 'min_child_weight': 12, 'subsample': 0.7707044146912992, 'colsample_bytree': 0.7063925812901664, 'lambda_l1': 0.9732192704080953, 'lambda_l2': 8.773043243768084}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23882.495 | MAE: 5662.945
RMSE for low y_true (<= 100k): 9535.849 | MAE: 3313.197
RMSE for high y_true (> 100k): 163230.219 | MAE: 133450.688
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:58:05,413] Trial 32 finished with value: 23902.149466206698 and parameters: {'max_depth': 10, 'learning_rate': 0.11023736057534292, 'n_estimators': 200, 'min_child_weight': 13, 'subsample': 0.7585393067196997, 'colsample_bytree': 0.5970551698522931, 'lambda_l1': 1.9633662413198514, 'lambda_l2': 8.973808528197504}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23902.149 | MAE: 5659.098
RMSE for low y_true (<= 100k): 9521.499 | MAE: 3304.300
RMSE for high y_true (> 100k): 163434.979 | MAE: 133721.484
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:58:17,154] Trial 33 finished with value: 23909.40903619132 and parameters: {'max_depth': 15, 'learning_rate': 0.11973168903348029, 'n_estimators': 100, 'min_child_weight': 9, 'subsample': 0.7982394633839943, 'colsample_bytree': 0.6987844809849639, 'lambda_l1': 0.8042821660851418, 'lambda_l2': 8.109573468009915}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23909.409 | MAE: 5612.096
RMSE for low y_true (<= 100k): 9322.263 | MAE: 3243.300
RMSE for high y_true (> 100k): 164117.004 | MAE: 134435.755
[LightGBM] [Info] Total Bins 85015
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.909642




[LightGBM] [Info] Total Bins 85024
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.914034




[LightGBM] [Info] Total Bins 85026
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.945550




[LightGBM] [Info] Total Bins 85029
[LightGBM] [Info] Number of data points in the train set: 43199, number of used features: 395
[LightGBM] [Info] Start training from score 5.959808




[LightGBM] [Info] Total Bins 85028
[LightGBM] [Info] Number of data points in the train set: 43200, number of used features: 395
[LightGBM] [Info] Start training from score 5.940370


[I 2025-03-02 18:58:28,857] Trial 34 finished with value: 23903.81890669048 and parameters: {'max_depth': 9, 'learning_rate': 0.09993424828472007, 'n_estimators': 100, 'min_child_weight': 6, 'subsample': 0.6967379799988431, 'colsample_bytree': 0.6479796728670014, 'lambda_l1': 1.220999422387717, 'lambda_l2': 7.232289905627115}. Best is trial 21 with value: 23853.022562628455.


RMSE Overall:  23903.819 | MAE: 5593.308
RMSE for low y_true (<= 100k): 9273.440 | MAE: 3219.726
RMSE for high y_true (> 100k): 164222.296 | MAE: 134677.207
Best trial: {'max_depth': 10, 'learning_rate': 0.09134879752884997, 'n_estimators': 200, 'min_child_weight': 4, 'subsample': 0.7406110320807305, 'colsample_bytree': 0.6527923965773857, 'lambda_l1': 1.5854842816691632, 'lambda_l2': 8.833659109656566}


In [13]:
import optuna.visualization as vis

fig = vis.plot_optimization_history(study)
fig.show()


In [15]:
fig = vis.plot_param_importances(study)
fig.show()


In [16]:
best_params = study.best_trial.params


regressor_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('linear_model', LGBMRegressor(**best_params))
])

model = TransformedTargetRegressor(
    regressor=regressor_pipeline,
    transformer=PowerTransformer(method='box-cox', standardize=False)
)

In [17]:
model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.018197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 85037
[LightGBM] [Info] Number of data points in the train set: 53999, number of used features: 395
[LightGBM] [Info] Start training from score 5.933758


In [18]:
y_test_pred = model.predict(dataset_test)
y_test_pred




X does not have valid feature names, but LGBMRegressor was fitted with feature names



array([ 6976.63844136,  2392.09955833, 20966.5537356 , ...,
        7474.87164317,  7381.86565926,   380.62672658], shape=(36000,))

In [19]:
submission_df = pd.DataFrame(zip(dataset_test.index, y_test_pred), columns=['ClaimNumber', 'UltimateIncurredClaimCost'])

print(submission_df.shape)
submission_df.head()

(36000, 2)


Unnamed: 0,ClaimNumber,UltimateIncurredClaimCost
0,WC8145235,6976.638441
1,WC2005111,2392.099558
2,WC6899143,20966.553736
3,WC5502023,308.760132
4,WC4785156,2855.345134


In [20]:
submission_df.to_csv('../data/output/submission_lgbm.csv', index=False)