# XG Boost

this time let's test with XG Boost model.

In [1]:
# import required libraries
import sys, os
import numpy as np
import pandas as pd
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

local_module_path = os.path.abspath(os.path.join(os.path.dirname(os.getcwd()),'preprocess'))
sys.path.append(local_module_path)
from pscript import preprocess

In [2]:
# load the dataset
train = pd.read_csv("../../../dataset/train.csv")
test = pd.read_csv("../../../dataset/test.csv")

In [3]:
# preprocess the datasets
ptrain = preprocess(train)
ptest = preprocess(test)

In [4]:
# ready dataset for training and testing
rtrain = ptrain.drop(['id','accident_risk'], axis=1)
target = ptrain['accident_risk']

rtest = ptest.drop('id', axis=1)

In [5]:
# split the dataset into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(rtrain, target, train_size=0.8, test_size=0.2, random_state=42)

# convert data to dmatrix format
dtrain = xgb.DMatrix(X_train, label=y_train)
dvalid = xgb.DMatrix(X_valid, label=y_valid)

In [None]:
def objective(trial):
    # Hyperparameter
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'verbosity': 0, # To keep the output clean
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True), # L2 regularization
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),   # L1 regularization
    }
    
    # training setup
    bst = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=1000,
        evals=[(dvalid, 'validation')],
        early_stopping_rounds=50,
        verbose_eval=False
    )
    
    # Prediction uses the returned booster (bst)
    preds = bst.predict(dvalid)
    rmse = np.sqrt(mean_squared_error(y_valid, preds))
    
    return rmse

In [None]:
# Study and optimization
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)

print("--- Optimization Finished ---")
print(f"Best RMSE: {study.best_value}")
print(f"Best params: {study.best_params}")

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print(f"  Value (RMSE): {trial.value:.4f}")
print("  Best Hyperparameters: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2025-10-05 17:02:09,930] A new study created in memory with name: no-name-86b07cb1-a1f0-4ed9-b49e-f868d7e5b912
[I 2025-10-05 17:02:47,256] Trial 0 finished with value: 0.05629594711079217 and parameters: {'learning_rate': 0.011911379310656748, 'max_depth': 9, 'subsample': 0.6957464208368207, 'colsample_bytree': 0.8441560339987217, 'lambda': 2.7000054608983084e-06, 'alpha': 4.105961437279038}. Best is trial 0 with value: 0.05629594711079217.
[I 2025-10-05 17:02:57,476] Trial 1 finished with value: 0.0563056891180674 and parameters: {'learning_rate': 0.1468372681725936, 'max_depth': 5, 'subsample': 0.7244355104932281, 'colsample_bytree': 0.9233554203628653, 'lambda': 0.8870612859172206, 'alpha': 5.3030337569945e-05}. Best is trial 0 with value: 0.05629594711079217.
[I 2025-10-05 17:03:13,497] Trial 2 finished with value: 0.056180110275803344 and parameters: {'learning_rate': 0.0215938474013395, 'max_depth': 9, 'subsample': 0.6541033638956458, 'colsample_bytree': 0.8460855420918171, 'l

--- Optimization Finished ---
Best RMSE: 0.05614240163281179
Best params: {'learning_rate': 0.016252336249992037, 'max_depth': 8, 'subsample': 0.8337512565597658, 'colsample_bytree': 0.9460166804966047, 'lambda': 1.175196719626886e-05, 'alpha': 4.232127098936453e-08}
Number of finished trials:  100
Best trial:
  Value (RMSE): 0.0561
  Best Hyperparameters: 
    learning_rate: 0.016252336249992037
    max_depth: 8
    subsample: 0.8337512565597658
    colsample_bytree: 0.9460166804966047
    lambda: 1.175196719626886e-05
    alpha: 4.232127098936453e-08


In [8]:
# Train the final model with the best hyperparameters
best_params = trial.params
final_model = xgb.XGBRegressor(**best_params)
final_model.fit(rtrain, target)

In [13]:
fig = optuna.visualization.plot_optimization_history(study)
fig.show()

In [9]:
# predict the values
preds = final_model.predict(rtest)

In [10]:
prediction = []
for x in preds:
    prediction.append(round(x,3))

In [11]:
result = pd.DataFrame({
    'id': ptest['id'],
    'accident_risk': prediction
})

In [12]:
result.to_csv("../submissions/submission06.csv", index=False)