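This time I am going to ignore the outliers: the few claims with an ultimate cost above 500,000 are removed from the training data before fitting.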

In [1]:
import pandas as pd
import numpy as np

from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import (
    FunctionTransformer,
    PowerTransformer,
    MinMaxScaler,
    OneHotEncoder,
    Binarizer,
    StandardScaler
)


from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, KFold, cross_val_predict
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.decomposition import PCA


import mlflow
import mlflow.sklearn

from matplotlib import pyplot as plt
import seaborn as sns
import optuna

In [2]:
# Load the pre-processed train/test splits; the first CSV column becomes the index.
dataset_train = pd.read_csv('../data/processed/dataset_train.csv', index_col=0)
dataset_test = pd.read_csv('../data/processed/dataset_test.csv', index_col=0)

In [3]:
# column names
# 768 columns named event_feat_0 ... event_feat_767 (presumably text embeddings of the event description — confirm against the preprocessing step)
cols_event_embeddings = [f'event_feat_{i}' for i in range(768)]
# Columns named after event phrases; they are passed through to the model untouched (presumably 0/1 indicator flags — verify)
cols_event_dummies = ['BACK STRAIN', 'CLEANING INSTRUMENTS', 'CLEANING LEFT SHOULDER SPLINTER', 'CUT WITH KNIFE', 'DEALING CARDS RIGHT TENDON SYNOVITIS', 'FELL', 'FELL FROM LADDER', 'FELL OFF LADDER', 'FRACTURED LEFT WRIST FRACTURE', 'GRINDING FOREIGN BODY', 'GRINDING STEEL BEAM INFECTION', 'HIT AIR HOSE', 'HIT ELBOW', 'HIT FLOOR SHEET', 'INJURY', 'INJURY LIFTING', 'INJURY LIFTING STRAIN', 'JAMMED RIGHT HAND PUNCTURE', 'KNIFE LACERATION', 'KNIFE SLIPPED', 'LACERATION', 'LEFT LEG SNAKE BITE', 'LIFTING', 'LIFTING BOXES', 'LIFTING PATIENT', 'LIFTING TYRES', 'LOWER BACK STRAIN', 'MOTOR VEHICLE COLLISION', 'PREPARING PIPES FOR PAINTING PUNCTURE', 'PULLING CABLES', 'REDBACK SPIDER BITE', 'SLIPPED AND FELL', 'SLIPPED ON FLOOR', 'SLIPPED ON ROLLER', 'SLIPPED ON STAIRS', 'SLIPPED ON WET FLOOR', 'SLIPPED USING LATHE', 'SORTING ALUMINIUM BARS STRAIN', 'SPRAINED RIGHT ANKLE FRACTURE', 'STRAIN', 'STRAIN LIFTING PARTS', 'STRAIN LIFTING STRAIN', 'STRUCK KNIFE LACERATED', 'STRUCK KNIFE LACERATION', 'STRUCK PALLET', 'STRUCK WITH KNIFE', 'USING AIR HOSE STRAIN', 'USING DRILL']
# 768 columns named body_feat_0 ... body_feat_767 (presumably embeddings of the injured body part — confirm)
cols_body_parts_embeddings = [f'body_feat_{i}' for i in range(768)]

In [4]:
# Count the training claims whose ultimate incurred cost exceeds 500,000.
(dataset_train['UltimateIncurredClaimCost'] > 5e5).sum()

np.int64(13)

In [None]:
# Removing values over 500000 erases 13 rows.
# `<=` keeps a claim of exactly 500,000, so this filter is the exact complement
# of the `> 5e5` count shown in the previous cell.
dataset_train = dataset_train[dataset_train['UltimateIncurredClaimCost'] <= 5e5]

In [6]:
def train_cv(model, X, y, metric = 'rmse', kfold = 5):
    """
    Cross-validate ``model`` and summarise the per-fold RMSE.

    Parameters:
    - model: estimator handed to ``cross_val_score``.
    - X, y: features and target.
    - metric (str): only ``'rmse'`` is supported.
    - kfold: forwarded to ``cross_val_score`` as ``cv``.

    Returns:
    - (mean, std) of the per-fold RMSE values.

    Raises:
    - NotImplementedError: if ``metric`` is anything other than ``'rmse'``.
    """
    if metric != 'rmse':
        # NotImplemented is a comparison sentinel, not an exception; raising it
        # produces a confusing TypeError. NotImplementedError is the intended signal.
        raise NotImplementedError(
            f"metric {metric!r} is not supported; only 'rmse' is implemented"
        )

    # scikit-learn returns negated MSE so that higher is better; undo the sign.
    neg_mse_scores = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error')
    rmse_scores = np.sqrt(-neg_mse_scores)

    return rmse_scores.mean(), rmse_scores.std()

In [7]:
# Natural log followed by standardisation.
# NOTE(review): np.log yields -inf for zeros; assumes the columns it is applied to are strictly positive — confirm.
log_transformer = Pipeline(steps=[
    ('log', FunctionTransformer(func=np.log, validate=False)),
    ('scaler', StandardScaler()),
])

# gender to bool 
def gender_to_bool(gender_column):
    """
    Encode gender as a single 0/1 column.

    - Missing values are treated as 'M'
    - 'U' is treated as 'M'
    - Returns 1 for 'M', 0 otherwise

    Parameters:
    - gender_column: single-column DataFrame, Series or array of gender codes.

    Returns:
    - Integer ndarray of shape (n_samples, 1).
    """
    # Work on a flat object array and test for missing values *before* any
    # string cast: casting first can turn NaN into the literal 'nan', which
    # would then never be filled with 'M'.
    codes = pd.Series(np.asarray(gender_column, dtype=object).ravel(), dtype=object)
    is_male = codes.isna() | codes.isin(['M', 'U'])
    return is_male.astype(int).to_numpy().reshape(-1, 1)

# Wrap the encoder so it can be used as a ColumnTransformer step.
gender_transformer = FunctionTransformer(func=gender_to_bool, validate=False)

# hours worked per week -> to buckets -> to one_hot
def bucket_hours_worked(dtt_array):
    """
    Bucket weekly hours into "<=37", "37-41" and ">41" (right-closed intervals).

    Parameters:
    - dtt_array: single-column DataFrame, Series or array of hours per week.

    Returns:
    - Object ndarray of shape (n_samples, 1) holding the bucket label of each
      row; missing values are labelled 'nan'.
    """
    # Flatten instead of squeeze(): squeeze() collapses a single-row input to
    # a scalar, which pd.cut rejects, so one-row predictions used to fail.
    hours = np.asarray(dtt_array).ravel()
    buckets = pd.cut(
        hours,
        bins=[-np.inf, 37, 41, np.inf],
        labels=["<=37", "37-41", ">41"]
    )
    labels = np.array(buckets.astype(object), dtype=object)
    # Give missing values an explicit string label so the output is all strings.
    labels[pd.isna(labels)] = 'nan'
    return labels.reshape(-1, 1)

# Bucket the hours first, then one-hot encode the bucket labels (first level dropped).
hours_worked_bucketer = FunctionTransformer(func=bucket_hours_worked, validate=False)
hours_worked_encoder = OneHotEncoder(drop='first')
hours_worked_pipeline = Pipeline(steps=[
    ('bucketizer', hours_worked_bucketer),
    ('encoder', hours_worked_encoder),
])

# DaysToReport (DateReported - DateTimeOfAccident) -> to buckets -> to one_hot
def bucket_days_to_report(dtt_array):
    """
    Bucket the reporting delay (in days) into "<=80", "80-300", "300-500"
    and ">500" (right-closed intervals).

    Parameters:
    - dtt_array: single-column DataFrame, Series or array of day counts.

    Returns:
    - Object ndarray of shape (n_samples, 1) holding the bucket label of each
      row; missing values are labelled 'nan'.
    """
    # Flatten instead of squeeze(): squeeze() collapses a single-row input to
    # a scalar, which pd.cut rejects, so one-row predictions used to fail.
    days = np.asarray(dtt_array).ravel()
    buckets = pd.cut(
        days,
        bins=[-np.inf, 80, 300, 500, np.inf],
        labels=["<=80", "80-300", "300-500", ">500"]
    )
    labels = np.array(buckets.astype(object), dtype=object)
    # Give missing values an explicit string label so the output is all strings.
    labels[pd.isna(labels)] = 'nan'
    return labels.reshape(-1, 1)

# Bucket the reporting delay first, then one-hot encode the bucket labels (first level dropped).
# NOTE(review): OneHotEncoder defaults to handle_unknown='error', so a bucket
# that is absent from the fitting data will raise at transform time.
days_to_report_bucketer = FunctionTransformer(func=bucket_days_to_report, validate=False)
days_to_report_encoder = OneHotEncoder(drop='first')
days_to_report_pipeline = Pipeline(steps=[
    ('bucketizer', days_to_report_bucketer),
    ('encoder', days_to_report_encoder),
])

# DaysWorkedPerWeek -> 1 if equals 5, 0 in any other case 
def days_worked_binarize(days_array):
    """
    Flag rows where the number of days worked per week equals 5.

    Parameters:
    - days_array: single-column DataFrame, Series or ndarray of day counts.

    Returns:
    - Integer ndarray of shape (n_samples, 1): 1 where the value is 5, else 0.
    """
    flat = days_array.squeeze()
    flags = (flat == 5).astype(int)
    if isinstance(flat, pd.Series):
        # Unwrap the pandas container so the reshape happens on a plain ndarray.
        flags = flags.values
    return flags.reshape(-1, 1)

# Wrap the binarizer so it can be used as a ColumnTransformer step.
days_worked_transformer = FunctionTransformer(func=days_worked_binarize, validate=False)

In [8]:
# Feature-engineering stage shared by the models in this notebook.
# Each entry is (step name, transformer, input columns); any column not listed
# is discarded because of remainder='drop'.
preprocessor = ColumnTransformer(
    transformers=[
        # log + standardise the monetary columns.
        # NOTE(review): 'Calims' looks like a typo but must match the CSV header exactly — confirm before renaming.
        ('log_inc', log_transformer, ['InitialIncurredCalimsCost', 'WeeklyWages']),
        # rescale to the [0, 1] range seen during fit
        ('minmax_scaler', MinMaxScaler(), ['Age', 'YearAccident']),
        # 1 for 'M' (missing and 'U' count as 'M'), 0 otherwise
        ('gender_bool', gender_transformer, ['Gender']),
        # bucketed weekly hours, one-hot encoded
        ('hww_bool_onehot', hours_worked_pipeline, ['HoursWorkedPerWeek']),
        # bucketed reporting delay, one-hot encoded
        ('dtt_bool_onehot', days_to_report_pipeline, ['DaysToReport']),
        # 1 when the value is greater than 0, else 0
        ('has_dependent_bool', Binarizer(threshold=0), ['DependentChildren']),
        # 1 when exactly five days are worked per week, else 0
        ('worked_five_days_bool', days_worked_transformer, ['DaysWorkedPerWeek']),
        # one-hot encode the categorical columns, dropping the first level of each
        ('onehot', OneHotEncoder(drop='first'), ['MaritalStatus', 'PartTimeFullTime']),
        # compress the 768 event_feat_* columns to 250 principal components
        ('pca_event_embeddings', PCA(n_components=250), cols_event_embeddings),
        # event phrase columns are forwarded unchanged
        ('passthrough_event_dummies', 'passthrough', cols_event_dummies),
        # compress the 768 body_feat_* columns to 80 principal components
        ('pca_body_parts_embeddings', PCA(n_components=80), cols_body_parts_embeddings),
        # standardise event_median_cost
        # NOTE(review): if this column was derived from the target, check it was computed without the validation folds.
        ('std_scaler', StandardScaler(), ['event_median_cost'])
    ],
    remainder='drop'
)

In [9]:
# Baseline: untuned XGBoost on top of the shared preprocessor.
# The step keeps the name 'linear_model' even though it holds a tree ensemble.
regressor_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('linear_model', XGBRegressor()),
])

# Fit on a Box-Cox-transformed target; predictions are mapped back to the original scale.
model = TransformedTargetRegressor(
    transformer=PowerTransformer(method='box-cox', standardize=False),
    regressor=regressor_pipeline,
)

In [10]:

# Pass only the feature columns as X. The preprocessor's remainder='drop'
# already discards the target, but dropping it explicitly keeps the CV
# leak-free even if the column selection changes later.
scores = cross_val_score(
    model,
    dataset_train.drop(columns=['UltimateIncurredClaimCost']),
    dataset_train['UltimateIncurredClaimCost'],
    cv=5,
    scoring='neg_mean_squared_error'
)
# scikit-learn reports negated MSE so that greater is better; flip the sign back.
mse_scores = -scores
rmse_scores = np.sqrt(mse_scores)

print(f"CV MSE:  {mse_scores.mean():.3f}  (+/- {mse_scores.std():.3f})")
print(f"CV RMSE: {rmse_scores.mean():.3f}  (+/- {rmse_scores.std():.3f})")

CV MSE:  499651363.017  (+/- 39383780.495)
CV RMSE: 22335.797  (+/- 873.800)


As expected, even the out-of-the-box version of XGBoost achieves a lower error than the previous attempts.

In [11]:
# Target vector and feature frame read as globals by run_experiment.
y_train = dataset_train['UltimateIncurredClaimCost']
X_train = dataset_train.drop(columns=['UltimateIncurredClaimCost'])

def run_experiment(
        experiment_name,
        run_name,
        regressor_object,
        kfold=5,
        save_model=True
    ):
    """
    Cross-validate a regressor inside the shared preprocessing pipeline and
    record the run in MLflow.

    The regressor is wrapped in a Pipeline behind the module-level
    ``preprocessor`` and trained on a Box-Cox-transformed target. Errors are
    computed from out-of-fold predictions on the module-level ``X_train`` /
    ``y_train``, both overall and separately for targets above and at or
    below 100,000.

    Parameters:
    - experiment_name (str): Name of the MLflow experiment.
    - run_name (str): Name of the MLflow run.
    - regressor_object: The regression model placed at the end of the pipeline.
    - kfold (int): Passed to ``cross_val_predict`` as ``cv``.
    - save_model (bool): If True, refit on the full training data and log the
      fitted model to MLflow.

    Logs:
    - Every model parameter (non-primitive values as strings cut to 5000 chars)
    - cv_mse / cv_rmse / cv_mae for all rows
    - the same three metrics with ``_high`` (y_true > 100,000) and ``_low``
      (y_true <= 100,000) suffixes

    Returns:
    - The overall cross-validated RMSE.
    """

    def _errors(y_true, y_hat):
        # (MSE, RMSE, MAE) for one slice of the data.
        mse = mean_squared_error(y_true, y_hat)
        return mse, np.sqrt(mse), mean_absolute_error(y_true, y_hat)

    # Preprocessing + estimator, trained on a Box-Cox-transformed target
    model = TransformedTargetRegressor(
        regressor=Pipeline([
            ('preprocessor', preprocessor),
            ('linear_model', regressor_object),
        ]),
        transformer=PowerTransformer(method='box-cox', standardize=False),
    )

    mlflow.set_experiment(experiment_name)

    with mlflow.start_run(run_name=run_name):
        # Primitive parameter values are logged as-is, everything else as a truncated string
        primitive_types = (str, int, float, bool, type(None))
        for key, value in model.get_params(deep=True).items():
            if not isinstance(value, primitive_types):
                value = str(value)[:5000]
            mlflow.log_param(key, value)

        mlflow.log_param("n_features_in", X_train.shape[1])

        # Out-of-fold predictions for every training row
        y_pred = cross_val_predict(model, X_train, y_train, cv=kfold)

        overall_mse, overall_rmse, overall_mae = _errors(y_train, y_pred)

        # Split the errors by target size
        is_high = y_train > 100000
        high_mse, high_rmse, high_mae = _errors(y_train[is_high], y_pred[is_high])
        low_mse, low_rmse, low_mae = _errors(y_train[~is_high], y_pred[~is_high])

        print(f"RMSE Overall:  {overall_rmse:.3f} | MAE: {overall_mae:.3f}")
        print(f"RMSE for low y_true (<= 100k): {low_rmse:.3f} | MAE: {low_mae:.3f}")
        print(f"RMSE for high y_true (> 100k): {high_rmse:.3f} | MAE: {high_mae:.3f}")

        # Overall metrics first, then the segmented ones
        metrics_in_order = [
            ("cv_mse", overall_mse),
            ("cv_rmse", overall_rmse),
            ("cv_mae", overall_mae),
            ("cv_mse_high", high_mse),
            ("cv_rmse_high", high_rmse),
            ("cv_mae_high", high_mae),
            ("cv_mse_low", low_mse),
            ("cv_rmse_low", low_rmse),
            ("cv_mae_low", low_mae),
        ]
        for metric_name, metric_value in metrics_in_order:
            mlflow.log_metric(metric_name, metric_value)

        # Optionally refit on all training rows and store the fitted model
        if save_model:
            model.fit(X_train, y_train)
            mlflow.sklearn.log_model(model, artifact_path="models")

    return overall_rmse


In [12]:

def objective(trial):
    """
    Optuna objective: sample XGBoost hyperparameters from ``trial``, evaluate
    them with ``run_experiment`` and return the cross-validated RMSE.
    """
    # dict(...) evaluates keyword arguments left to right, so the suggestions
    # are requested from the trial in the order written here.
    search_space = dict(
        n_estimators=trial.suggest_int("n_estimators", 100, 1000, step=100),
        max_depth=trial.suggest_int("max_depth", 2, 10),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        min_child_weight=trial.suggest_int("min_child_weight", 1, 10),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        reg_alpha=trial.suggest_float("reg_alpha", 0.0, 1.0),
        reg_lambda=trial.suggest_float("reg_lambda", 0.0, 10.0),
        gamma=trial.suggest_float("gamma", 0.0, 5.0),
    )

    # Every trial is logged as its own MLflow run; the model itself is not saved.
    return run_experiment(
        experiment_name='Optuna XGBoost - no outliers',
        run_name='XGBoost optimization',
        regressor_object=XGBRegressor(**search_space),
        kfold=5,
        save_model=False,
    )


In [13]:
# Seed the sampler so the sequence of suggested hyperparameters (and therefore
# the best trial) is reproducible when the notebook is re-run.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=35)

# Print the best result
print(f"Best trial: {study.best_trial.params}")


[I 2025-03-02 18:02:57,614] A new study created in memory with name: no-name-a5fd74d6-a3b7-4974-9e03-c000325bda38
[I 2025-03-02 18:03:16,067] Trial 0 finished with value: 22068.84161051241 and parameters: {'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.05530492523604442, 'min_child_weight': 5, 'subsample': 0.8945426311715513, 'colsample_bytree': 0.56843185328282, 'reg_alpha': 0.5536031355636127, 'reg_lambda': 7.666209318263942, 'gamma': 3.953700546676619}. Best is trial 0 with value: 22068.84161051241.


RMSE Overall:  22068.842 | MAE: 5462.042
RMSE for low y_true (<= 100k): 9149.135 | MAE: 3173.861
RMSE for high y_true (> 100k): 150724.319 | MAE: 131583.146


[I 2025-03-02 18:03:34,217] Trial 1 finished with value: 21967.684197337636 and parameters: {'n_estimators': 800, 'max_depth': 6, 'learning_rate': 0.10651476051010814, 'min_child_weight': 2, 'subsample': 0.5872701543423388, 'colsample_bytree': 0.8842607026647225, 'reg_alpha': 0.3247646639166898, 'reg_lambda': 8.731439600303757, 'gamma': 2.3993869122683216}. Best is trial 1 with value: 21967.684197337636.


RMSE Overall:  21967.684 | MAE: 5418.965
RMSE for low y_true (<= 100k): 9037.466 | MAE: 3142.340
RMSE for high y_true (> 100k): 150265.671 | MAE: 130903.132


[I 2025-03-02 18:04:03,604] Trial 2 finished with value: 22043.37413065177 and parameters: {'n_estimators': 900, 'max_depth': 9, 'learning_rate': 0.0244694316952387, 'min_child_weight': 1, 'subsample': 0.5867588583826903, 'colsample_bytree': 0.8456542095932117, 'reg_alpha': 0.2137328826130912, 'reg_lambda': 6.119093515902776, 'gamma': 2.613877294731091}. Best is trial 1 with value: 21967.684197337636.


RMSE Overall:  22043.374 | MAE: 5376.165
RMSE for low y_true (<= 100k): 8981.667 | MAE: 3078.292
RMSE for high y_true (> 100k): 151069.958 | MAE: 132031.490


[I 2025-03-02 18:04:12,750] Trial 3 finished with value: 22037.794148419125 and parameters: {'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.2190143743379237, 'min_child_weight': 10, 'subsample': 0.6141006427957785, 'colsample_bytree': 0.6087609486478807, 'reg_alpha': 0.9127230203624705, 'reg_lambda': 6.428236287502665, 'gamma': 4.174116538058875}. Best is trial 1 with value: 21967.684197337636.


RMSE Overall:  22037.794 | MAE: 5513.132
RMSE for low y_true (<= 100k): 9203.263 | MAE: 3237.058
RMSE for high y_true (> 100k): 150287.118 | MAE: 130966.942


[I 2025-03-02 18:04:28,877] Trial 4 finished with value: 22297.936278959987 and parameters: {'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.014738805666152238, 'min_child_weight': 4, 'subsample': 0.9510445139186946, 'colsample_bytree': 0.7538919549321168, 'reg_alpha': 0.42039613219298066, 'reg_lambda': 2.727516639129063, 'gamma': 4.271688347316819}. Best is trial 1 with value: 21967.684197337636.


RMSE Overall:  22297.936 | MAE: 5371.026
RMSE for low y_true (<= 100k): 9011.188 | MAE: 3024.402
RMSE for high y_true (> 100k): 153056.524 | MAE: 134713.467


[I 2025-03-02 18:04:43,969] Trial 5 finished with value: 22137.13257330044 and parameters: {'n_estimators': 300, 'max_depth': 4, 'learning_rate': 0.01807952868948732, 'min_child_weight': 7, 'subsample': 0.8084832459565772, 'colsample_bytree': 0.6400612598434379, 'reg_alpha': 0.0821774916698973, 'reg_lambda': 1.4738027414213717, 'gamma': 3.710539010057676}. Best is trial 1 with value: 21967.684197337636.


RMSE Overall:  22137.133 | MAE: 5409.873
RMSE for low y_true (<= 100k): 9061.102 | MAE: 3099.730
RMSE for high y_true (> 100k): 151577.025 | MAE: 132741.515


[I 2025-03-02 18:04:52,255] Trial 6 finished with value: 21982.276506782106 and parameters: {'n_estimators': 100, 'max_depth': 7, 'learning_rate': 0.11384986781781738, 'min_child_weight': 9, 'subsample': 0.5580417296428011, 'colsample_bytree': 0.8736370030842543, 'reg_alpha': 0.5117779728819585, 'reg_lambda': 1.6180852134447132, 'gamma': 1.794115678081536}. Best is trial 1 with value: 21967.684197337636.


RMSE Overall:  21982.277 | MAE: 5405.111
RMSE for low y_true (<= 100k): 9030.926 | MAE: 3122.113
RMSE for high y_true (> 100k): 150407.033 | MAE: 131240.548


[I 2025-03-02 18:05:12,069] Trial 7 finished with value: 21982.990133687297 and parameters: {'n_estimators': 1000, 'max_depth': 6, 'learning_rate': 0.1457116894831324, 'min_child_weight': 9, 'subsample': 0.7142126481436597, 'colsample_bytree': 0.8258553341230326, 'reg_alpha': 0.292873039621521, 'reg_lambda': 7.787038549394808, 'gamma': 2.873239607257193}. Best is trial 1 with value: 21967.684197337636.


RMSE Overall:  21982.990 | MAE: 5411.914
RMSE for low y_true (<= 100k): 9054.669 | MAE: 3134.279
RMSE for high y_true (> 100k): 150334.190 | MAE: 130951.735


[I 2025-03-02 18:05:21,819] Trial 8 finished with value: 21969.159039996608 and parameters: {'n_estimators': 300, 'max_depth': 5, 'learning_rate': 0.17591897581868884, 'min_child_weight': 4, 'subsample': 0.8408845836419407, 'colsample_bytree': 0.725848266165589, 'reg_alpha': 0.8777512281681858, 'reg_lambda': 6.976652378917115, 'gamma': 2.8986964647858326}. Best is trial 1 with value: 21967.684197337636.


RMSE Overall:  21969.159 | MAE: 5434.725
RMSE for low y_true (<= 100k): 9070.536 | MAE: 3161.334
RMSE for high y_true (> 100k): 150167.913 | MAE: 130740.654


[I 2025-03-02 18:05:53,802] Trial 9 finished with value: 21998.52569491219 and parameters: {'n_estimators': 900, 'max_depth': 9, 'learning_rate': 0.024141627540025554, 'min_child_weight': 2, 'subsample': 0.6944153435273086, 'colsample_bytree': 0.7994139952822198, 'reg_alpha': 0.13269873382891229, 'reg_lambda': 7.572640908567346, 'gamma': 2.156708410193719}. Best is trial 1 with value: 21967.684197337636.


RMSE Overall:  21998.526 | MAE: 5368.360
RMSE for low y_true (<= 100k): 8990.748 | MAE: 3080.301
RMSE for high y_true (> 100k): 150672.795 | MAE: 131482.750


[I 2025-03-02 18:06:13,763] Trial 10 finished with value: 21872.584315956265 and parameters: {'n_estimators': 700, 'max_depth': 2, 'learning_rate': 0.06988017595492438, 'min_child_weight': 2, 'subsample': 0.5107856485527618, 'colsample_bytree': 0.9942620245240273, 'reg_alpha': 0.6560989608470564, 'reg_lambda': 9.988529998275391, 'gamma': 0.3699052056976946}. Best is trial 10 with value: 21872.584315956265.


RMSE Overall:  21872.584 | MAE: 5529.735
RMSE for low y_true (<= 100k): 9462.503 | MAE: 3313.672
RMSE for high y_true (> 100k): 148028.365 | MAE: 127675.823


[I 2025-03-02 18:06:33,416] Trial 11 finished with value: 21871.778486307026 and parameters: {'n_estimators': 700, 'max_depth': 2, 'learning_rate': 0.06706925446555424, 'min_child_weight': 2, 'subsample': 0.500118978152109, 'colsample_bytree': 0.9830524357474193, 'reg_alpha': 0.6976761426332289, 'reg_lambda': 9.977351292557234, 'gamma': 0.14941077072108033}. Best is trial 11 with value: 21871.778486307026.


RMSE Overall:  21871.778 | MAE: 5520.349
RMSE for low y_true (<= 100k): 9439.895 | MAE: 3304.293
RMSE for high y_true (> 100k): 148101.224 | MAE: 127666.032


[I 2025-03-02 18:06:50,978] Trial 12 finished with value: 21896.478006252157 and parameters: {'n_estimators': 600, 'max_depth': 2, 'learning_rate': 0.053269953552490146, 'min_child_weight': 3, 'subsample': 0.5010821950509589, 'colsample_bytree': 0.9896650613257924, 'reg_alpha': 0.7400879156319108, 'reg_lambda': 9.60262772373104, 'gamma': 0.03251918984173097}. Best is trial 11 with value: 21871.778486307026.


RMSE Overall:  21896.478 | MAE: 5502.257
RMSE for low y_true (<= 100k): 9324.258 | MAE: 3268.928
RMSE for high y_true (> 100k): 148708.567 | MAE: 128600.020


[I 2025-03-02 18:07:11,721] Trial 13 finished with value: 21872.405150391 and parameters: {'n_estimators': 700, 'max_depth': 2, 'learning_rate': 0.059177731122720637, 'min_child_weight': 1, 'subsample': 0.5151059435977651, 'colsample_bytree': 0.978115024606921, 'reg_alpha': 0.6869111108806304, 'reg_lambda': 9.897379626252569, 'gamma': 0.0609692523105414}. Best is trial 11 with value: 21871.778486307026.


RMSE Overall:  21872.405 | MAE: 5514.002
RMSE for low y_true (<= 100k): 9387.590 | MAE: 3291.855
RMSE for high y_true (> 100k): 148289.551 | MAE: 127995.459


[I 2025-03-02 18:07:28,436] Trial 14 finished with value: 21935.48794498943 and parameters: {'n_estimators': 500, 'max_depth': 2, 'learning_rate': 0.0383638658625009, 'min_child_weight': 1, 'subsample': 0.6590748699612663, 'colsample_bytree': 0.9279982954590396, 'reg_alpha': 0.7546015837753228, 'reg_lambda': 4.367482602127472, 'gamma': 0.9007593908035632}. Best is trial 11 with value: 21871.778486307026.


RMSE Overall:  21935.488 | MAE: 5495.093
RMSE for low y_true (<= 100k): 9222.797 | MAE: 3245.218
RMSE for high y_true (> 100k): 149378.433 | MAE: 129504.804


[I 2025-03-02 18:07:43,578] Trial 15 finished with value: 21869.44442915948 and parameters: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.08175440361141084, 'min_child_weight': 6, 'subsample': 0.7613552529295636, 'colsample_bytree': 0.9512894233479394, 'reg_alpha': 0.6334183097712719, 'reg_lambda': 8.900655701094166, 'gamma': 1.1579676935923378}. Best is trial 15 with value: 21869.44442915948.


RMSE Overall:  21869.444 | MAE: 5458.544
RMSE for low y_true (<= 100k): 9261.516 | MAE: 3225.278
RMSE for high y_true (> 100k): 148701.429 | MAE: 128552.804


[I 2025-03-02 18:07:58,194] Trial 16 finished with value: 21859.813298464473 and parameters: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.08932960022856933, 'min_child_weight': 7, 'subsample': 0.7794678867893635, 'colsample_bytree': 0.9252586533220631, 'reg_alpha': 0.6056550012858848, 'reg_lambda': 4.830070248663193, 'gamma': 1.2379156701115268}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  21859.813 | MAE: 5452.334
RMSE for low y_true (<= 100k): 9243.349 | MAE: 3220.208
RMSE for high y_true (> 100k): 148684.263 | MAE: 128483.746


[I 2025-03-02 18:08:12,114] Trial 17 finished with value: 21876.95462036436 and parameters: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.10364914333995225, 'min_child_weight': 7, 'subsample': 0.7799343550129755, 'colsample_bytree': 0.9176591196173307, 'reg_alpha': 0.5799871773568667, 'reg_lambda': 4.166947877419478, 'gamma': 1.3234861856821656}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  21876.955 | MAE: 5465.951
RMSE for low y_true (<= 100k): 9271.230 | MAE: 3230.527
RMSE for high y_true (> 100k): 148730.056 | MAE: 128679.198


[I 2025-03-02 18:08:23,088] Trial 18 finished with value: 21876.363795707137 and parameters: {'n_estimators': 400, 'max_depth': 5, 'learning_rate': 0.26544104141208513, 'min_child_weight': 7, 'subsample': 0.857185283238507, 'colsample_bytree': 0.5061872578775527, 'reg_alpha': 0.9972628859331508, 'reg_lambda': 5.379115512562503, 'gamma': 1.0675922957838824}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  21876.364 | MAE: 5521.330
RMSE for low y_true (<= 100k): 9339.258 | MAE: 3289.120
RMSE for high y_true (> 100k): 148490.396 | MAE: 128557.366


[I 2025-03-02 18:08:38,746] Trial 19 finished with value: 22534.186319630808 and parameters: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.01017014772205781, 'min_child_weight': 6, 'subsample': 0.7530592531866515, 'colsample_bytree': 0.7187555436074761, 'reg_alpha': 0.4019475328917279, 'reg_lambda': 3.1672368976637806, 'gamma': 1.5980136323529923}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  22534.186 | MAE: 5403.887
RMSE for low y_true (<= 100k): 9081.336 | MAE: 3016.723
RMSE for high y_true (> 100k): 154760.235 | MAE: 136980.776


[I 2025-03-02 18:08:45,585] Trial 20 finished with value: 22659.647212286203 and parameters: {'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.04090895634062447, 'min_child_weight': 8, 'subsample': 0.9742920448936326, 'colsample_bytree': 0.9259396878378807, 'reg_alpha': 0.8305789286135719, 'reg_lambda': 0.019550390450806532, 'gamma': 0.8771679153327243}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  22659.647 | MAE: 5425.234
RMSE for low y_true (<= 100k): 9085.208 | MAE: 3015.176
RMSE for high y_true (> 100k): 155772.423 | MAE: 138264.024


[I 2025-03-02 18:09:08,419] Trial 21 finished with value: 21861.103097973762 and parameters: {'n_estimators': 700, 'max_depth': 3, 'learning_rate': 0.07931529835224439, 'min_child_weight': 5, 'subsample': 0.7124411422483973, 'colsample_bytree': 0.9498568886316814, 'reg_alpha': 0.6231967576014658, 'reg_lambda': 8.75889211660226, 'gamma': 0.5208317884106287}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  21861.103 | MAE: 5491.260
RMSE for low y_true (<= 100k): 9440.451 | MAE: 3276.426
RMSE for high y_true (> 100k): 148010.790 | MAE: 127569.564


[I 2025-03-02 18:09:28,691] Trial 22 finished with value: 21878.213482379142 and parameters: {'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.0864043252241174, 'min_child_weight': 5, 'subsample': 0.7057166104741744, 'colsample_bytree': 0.9364670473923316, 'reg_alpha': 0.44110424025227735, 'reg_lambda': 8.574548783027861, 'gamma': 0.6475140588213777}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  21878.213 | MAE: 5484.647
RMSE for low y_true (<= 100k): 9350.622 | MAE: 3257.310
RMSE for high y_true (> 100k): 148466.267 | MAE: 128252.168


[I 2025-03-02 18:09:42,293] Trial 23 finished with value: 22064.791356004112 and parameters: {'n_estimators': 600, 'max_depth': 3, 'learning_rate': 0.14645864518498145, 'min_child_weight': 6, 'subsample': 0.7533585768445136, 'colsample_bytree': 0.8849891640398009, 'reg_alpha': 0.606426673393346, 'reg_lambda': 5.570592598386893, 'gamma': 4.95948137744373}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  22064.791 | MAE: 5473.691
RMSE for low y_true (<= 100k): 9060.670 | MAE: 3177.869
RMSE for high y_true (> 100k): 150985.364 | MAE: 132015.951


[I 2025-03-02 18:10:05,301] Trial 24 finished with value: 21897.884434277836 and parameters: {'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.042178324181261745, 'min_child_weight': 6, 'subsample': 0.6623604265357363, 'colsample_bytree': 0.9487139781564986, 'reg_alpha': 0.8029028161148553, 'reg_lambda': 8.639086201478884, 'gamma': 1.642318032369448}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  21897.884 | MAE: 5424.920
RMSE for low y_true (<= 100k): 9105.138 | MAE: 3169.674
RMSE for high y_true (> 100k): 149466.642 | MAE: 129730.707


[I 2025-03-02 18:10:19,968] Trial 25 finished with value: 21861.353720601976 and parameters: {'n_estimators': 400, 'max_depth': 3, 'learning_rate': 0.13215056678438256, 'min_child_weight': 4, 'subsample': 0.776724677468858, 'colsample_bytree': 0.7791842406657532, 'reg_alpha': 0.6244658975265931, 'reg_lambda': 4.575381702158063, 'gamma': 0.5882566248107315}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  21861.354 | MAE: 5511.880
RMSE for low y_true (<= 100k): 9470.446 | MAE: 3298.305
RMSE for high y_true (> 100k): 147907.214 | MAE: 127520.836


[I 2025-03-02 18:10:37,650] Trial 26 finished with value: 21920.478341864284 and parameters: {'n_estimators': 400, 'max_depth': 10, 'learning_rate': 0.14732208102607755, 'min_child_weight': 4, 'subsample': 0.8084884966478612, 'colsample_bytree': 0.7766958490036804, 'reg_alpha': 0.4805950804664292, 'reg_lambda': 4.371428656625712, 'gamma': 0.5391489122363708}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  21920.478 | MAE: 5475.540
RMSE for low y_true (<= 100k): 9222.844 | MAE: 3226.052
RMSE for high y_true (> 100k): 149254.573 | MAE: 129463.987


[I 2025-03-02 18:10:45,713] Trial 27 finished with value: 21912.550111645523 and parameters: {'n_estimators': 200, 'max_depth': 5, 'learning_rate': 0.21253750441024796, 'min_child_weight': 5, 'subsample': 0.8523114043953435, 'colsample_bytree': 0.6636633658066793, 'reg_alpha': 0.5337177736073679, 'reg_lambda': 3.578896202085646, 'gamma': 1.9992798934351463}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  21912.550 | MAE: 5458.960
RMSE for low y_true (<= 100k): 9177.984 | MAE: 3207.167
RMSE for high y_true (> 100k): 149341.634 | MAE: 129574.449


[I 2025-03-02 18:10:58,242] Trial 28 finished with value: 21871.711771516788 and parameters: {'n_estimators': 400, 'max_depth': 4, 'learning_rate': 0.12082946349941874, 'min_child_weight': 8, 'subsample': 0.8900145867525515, 'colsample_bytree': 0.8391061722732704, 'reg_alpha': 0.35105274985644325, 'reg_lambda': 4.880315240222966, 'gamma': 1.3506323875852166}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  21871.712 | MAE: 5444.770
RMSE for low y_true (<= 100k): 9162.797 | MAE: 3198.276
RMSE for high y_true (> 100k): 149056.810 | MAE: 129268.120


[I 2025-03-02 18:11:21,833] Trial 29 finished with value: 21872.168896436004 and parameters: {'n_estimators': 600, 'max_depth': 7, 'learning_rate': 0.08384386735124234, 'min_child_weight': 3, 'subsample': 0.8005072149943173, 'colsample_bytree': 0.8032882674264918, 'reg_alpha': 0.5473638192618513, 'reg_lambda': 2.2745509002846704, 'gamma': 0.49619654003972447}. Best is trial 16 with value: 21859.813298464473.


RMSE Overall:  21872.169 | MAE: 5470.701
RMSE for low y_true (<= 100k): 9300.543 | MAE: 3238.756
RMSE for high y_true (> 100k): 148589.617 | MAE: 128492.191


[I 2025-03-02 18:11:46,078] Trial 30 finished with value: 21837.696781241513 and parameters: {'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.05014956038356393, 'min_child_weight': 5, 'subsample': 0.9151164631672968, 'colsample_bytree': 0.6933136371098397, 'reg_alpha': 0.7416196778055544, 'reg_lambda': 6.125025399538723, 'gamma': 0.7719121966811282}. Best is trial 30 with value: 21837.696781241513.


RMSE Overall:  21837.697 | MAE: 5474.491
RMSE for low y_true (<= 100k): 9309.531 | MAE: 3250.236
RMSE for high y_true (> 100k): 148273.722 | MAE: 128072.111


[I 2025-03-02 18:12:12,303] Trial 31 finished with value: 21848.958728602527 and parameters: {'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.03376991108567134, 'min_child_weight': 5, 'subsample': 0.7240760515313382, 'colsample_bytree': 0.6626718935404237, 'reg_alpha': 0.7513806298367297, 'reg_lambda': 5.858869402231063, 'gamma': 0.7604768013431561}. Best is trial 30 with value: 21837.696781241513.


RMSE Overall:  21848.959 | MAE: 5484.150
RMSE for low y_true (<= 100k): 9323.414 | MAE: 3259.386
RMSE for high y_true (> 100k): 148318.740 | MAE: 128109.810


[I 2025-03-02 18:12:39,889] Trial 32 finished with value: 21854.2984329634 and parameters: {'n_estimators': 800, 'max_depth': 3, 'learning_rate': 0.031341442334261886, 'min_child_weight': 5, 'subsample': 0.7252977216831145, 'colsample_bytree': 0.6766400047581759, 'reg_alpha': 0.766871006970713, 'reg_lambda': 6.33145659456738, 'gamma': 0.8538224067957675}. Best is trial 30 with value: 21837.696781241513.


RMSE Overall:  21854.298 | MAE: 5477.005
RMSE for low y_true (<= 100k): 9287.281 | MAE: 3247.324
RMSE for high y_true (> 100k): 148487.742 | MAE: 128373.640


[I 2025-03-02 18:13:07,893] Trial 33 finished with value: 21835.750209636237 and parameters: {'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.030232416594797168, 'min_child_weight': 5, 'subsample': 0.9348382433357434, 'colsample_bytree': 0.6783730222530824, 'reg_alpha': 0.7620160424974837, 'reg_lambda': 6.173470917476139, 'gamma': 0.8172031120662108}. Best is trial 33 with value: 21835.750209636237.


RMSE Overall:  21835.750 | MAE: 5453.946
RMSE for low y_true (<= 100k): 9248.083 | MAE: 3224.448
RMSE for high y_true (> 100k): 148469.456 | MAE: 128340.562


[I 2025-03-02 18:13:35,321] Trial 34 finished with value: 21833.27461769771 and parameters: {'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.030891804250286765, 'min_child_weight': 5, 'subsample': 0.9432993103831395, 'colsample_bytree': 0.6808937148959778, 'reg_alpha': 0.9409491829357446, 'reg_lambda': 6.2499627510090665, 'gamma': 0.8376028736074165}. Best is trial 34 with value: 21833.27461769771.


RMSE Overall:  21833.275 | MAE: 5450.320
RMSE for low y_true (<= 100k): 9232.365 | MAE: 3219.282
RMSE for high y_true (> 100k): 148502.941 | MAE: 128421.808
Best trial: {'n_estimators': 800, 'max_depth': 4, 'learning_rate': 0.030891804250286765, 'min_child_weight': 5, 'subsample': 0.9432993103831395, 'colsample_bytree': 0.6808937148959778, 'reg_alpha': 0.9409491829357446, 'reg_lambda': 6.2499627510090665, 'gamma': 0.8376028736074165}


In [21]:
import optuna.visualization as vis

# Plot the objective value of each trial to see how the search progressed.
fig = vis.plot_optimization_history(study)
fig.show()


In [22]:
# Plot Optuna's estimate of how much each hyperparameter influenced the objective.
fig = vis.plot_param_importances(study)
fig.show()


In [24]:
# Rebuild the full pipeline around XGBoost configured with the best Optuna trial.
best_params = study.best_trial.params

regressor_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('linear_model', XGBRegressor(**best_params)),
])

# Same Box-Cox target transformation as used during the search.
model = TransformedTargetRegressor(
    transformer=PowerTransformer(method='box-cox', standardize=False),
    regressor=regressor_pipeline,
)

In [25]:
# Fit the tuned model on the full (outlier-filtered) training set.
model.fit(X_train, y_train)

In [26]:
# Predict on the test set; the target transformation is inverted by the model, so values are on the original cost scale.
y_test_pred = model.predict(dataset_test)
y_test_pred

array([ 5994.1753 ,  2736.7083 , 19513.99   , ...,  6074.552  ,
        7215.8286 ,   417.86932], shape=(36000,), dtype=float32)

In [27]:
# Pair each test-set index value (the claim number) with its predicted cost.
submission_df = pd.DataFrame(
    data=zip(dataset_test.index, y_test_pred),
    columns=['ClaimNumber', 'UltimateIncurredClaimCost'],
)

print(submission_df.shape)
submission_df.head()

(36000, 2)


Unnamed: 0,ClaimNumber,UltimateIncurredClaimCost
0,WC8145235,5994.175293
1,WC2005111,2736.708252
2,WC6899143,19513.990234
3,WC5502023,359.31665
4,WC4785156,3069.980469


In [None]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission_df.to_csv('../data/output/submission_xgb-no_outliers2.csv', index=False)