In [21]:
#!/usr/bin/env python
# coding: utf-8
# (c) Charles Le Losq, Clément Ferraina 2023
# see embedded licence file
# iVisc 1.0

#
# Library Loading
#
import pandas as pd # manipulate dataframes
import matplotlib.pyplot as plt # plotting
import numpy as np
np.random.seed(167) # fix numpy random seed for reproducibility (must be *called*; assigning to it would overwrite the function and seed nothing)

import time, os

# local imports
import gpvisc.utils as utils


# import sklearn utils
from sklearn.metrics import root_mean_squared_error, median_absolute_error, r2_score

from sklearn.preprocessing import StandardScaler

# import joblib (save and load models)
import joblib

# import xgboost
import xgboost as xgb
import optuna
#
# Helper function for performance evaluation
#


# Data loading

We use utils.data_loader to load the data for training the black box models.

In [22]:
# Data loading
# utils.data_loader() builds the dataset container `ds`; the cells below read
# its TPX_* arrays as model inputs and its y_* arrays as regression targets.
print("Loading the viscosity datasets...")
ds = utils.data_loader()
print("Loaded.")

Loading the viscosity datasets...
Loaded.


## further data preparation

We now train the algorithms on the train-valid splits for final training. For tuning of hyperparameters, see the other notebook.

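We thus fit the scaler on the training split only, and use it to transform both the training and validation splits. (The scaler is re-fitted on the combined train-valid data further below, where the effect of the split ratio is examined.)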

In [23]:
# Standardise the model inputs.
# Only the first `stop_col` columns of the TPX arrays are scaled and fed to the
# model. The scaler statistics come from the training split alone and are then
# applied unchanged to the validation split.
stop_col = 14

scaler = StandardScaler()
scaler.fit(ds.TPX_train[:, :stop_col])

# Store the scaled arrays on `ds` so that later cells can reach them.
ds.TPX_valid_scaled = scaler.transform(ds.TPX_valid[:, :stop_col])
ds.TPX_train_scaled = scaler.transform(ds.TPX_train[:, :stop_col])

# XGBoost regressor

In [24]:
# Baseline gradient-boosted tree regressor with hand-picked hyperparameters,
# fitted on the scaled training split.
clf_xgb = xgb.XGBRegressor(
    booster="gbtree",
    n_estimators=1000,
    max_depth=6,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
)

clf_xgb.fit(ds.TPX_train_scaled, ds.y_train)


In [52]:
# Objective function for Optuna
def objective(trial):
    """Train one XGBoost candidate and return its validation RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial handle used to sample the hyperparameters of this candidate.

    Returns
    -------
    float
        Root-mean-squared error between ``ds.y_valid`` and the predictions
        made on ``ds.TPX_valid_scaled``. Lower is better.
    """
    # Hyperparameter search space; the last three entries are held fixed.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 15),
        "eta": trial.suggest_float("eta", 0.01, 0.3),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "booster": "gbtree",
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
    }

    # Train on the scaled training split. No eval_set is passed: no early
    # stopping is configured and the per-round evaluation history is never
    # read, so evaluating the validation set at every boosting round would
    # only add cost without changing the fitted model.
    model = xgb.XGBRegressor(**params)
    model.fit(ds.TPX_train_scaled, ds.y_train)

    # Score the candidate on the validation split (not the test split, which
    # is not used in this function).
    preds = model.predict(ds.TPX_valid_scaled)
    return root_mean_squared_error(ds.y_valid, preds)

In [53]:
# Create Optuna study and optimize
# The sampler is seeded explicitly: the default sampler draws from an unseeded
# generator, so without this the sequence of trials (and therefore the best
# hyperparameters) would change from one run of the notebook to the next.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=167),
)
study.optimize(objective, n_trials=50)

[I 2024-12-13 13:55:40,372] A new study created in memory with name: no-name-795ef0b6-73a8-4474-9a39-7c34862963b3
[I 2024-12-13 13:55:41,212] Trial 0 finished with value: 0.7089840868125094 and parameters: {'n_estimators': 989, 'max_depth': 9, 'eta': 0.11884194147008048, 'subsample': 0.822691231576558, 'colsample_bytree': 0.5998445051406167}. Best is trial 0 with value: 0.7089840868125094.
[I 2024-12-13 13:55:41,595] Trial 1 finished with value: 0.798126207163645 and parameters: {'n_estimators': 377, 'max_depth': 10, 'eta': 0.2789908482200552, 'subsample': 0.6209565518744313, 'colsample_bytree': 0.598629588554922}. Best is trial 0 with value: 0.7089840868125094.
[I 2024-12-13 13:55:41,822] Trial 2 finished with value: 0.8493584875270159 and parameters: {'n_estimators': 122, 'max_depth': 10, 'eta': 0.10497662833416331, 'subsample': 0.9354478309471576, 'colsample_bytree': 0.5363553142404258}. Best is trial 0 with value: 0.7089840868125094.
[I 2024-12-13 13:55:42,030] Trial 3 finished wit

In [54]:
# Best hyperparameters
# Report the sampled parameters of the best trial and its objective value,
# i.e. the validation RMSE returned by `objective`.
print("Best hyperparameters: ", study.best_params)
print("Best RMSE: ", study.best_value)

Best hyperparameters:  {'n_estimators': 961, 'max_depth': 9, 'eta': 0.052690169767493036, 'subsample': 0.7852755611321516, 'colsample_bytree': 0.9436479498986826}
Best RMSE:  0.5326383652128739


## Determine the effect of the validation dataset size

In other words, is an 80/10/10 split a sound choice?

In [56]:
from sklearn.model_selection import train_test_split

# Fractions of the combined train+valid pool assigned to training; the
# remainder of the pool is held out as the validation part for that run.
split_ratios = [0.6, 0.7, 0.8, 0.9, 0.95]
results = []

# We first build the scaled train/validation pool used as the working dataset.
# NOTE(review): this rebinds the global `scaler` (previously fitted on the
# training split only) and fits it on the whole pool *before* splitting.
stop_col = 14
scaler = StandardScaler()
scaler.fit(ds.TPX_train_valid[:, :stop_col])
ds.TPX_train_valid_scaled = scaler.transform(ds.TPX_train_valid[:, :stop_col])

for ratio in split_ratios:
    # Same random_state for every ratio, so only the train fraction varies.
    # NOTE(review): this is a purely random row-wise split; if
    # utils.data_loader() splits by group (e.g. by composition), the scores
    # here are not directly comparable with the Optuna validation RMSE —
    # TODO confirm.
    X_train, X_valid, y_train, y_valid = train_test_split(
        ds.TPX_train_valid_scaled,
        ds.y_train_valid,
        train_size=ratio,
        random_state=42,
    )

    # Refit a fresh model with the best hyperparameters found by the study.
    clf_xgb = xgb.XGBRegressor(**study.best_params)
    clf_xgb.fit(X_train, y_train)

    # Error on the data the model was fitted on ...
    y_pred_train = clf_xgb.predict(X_train)
    rmse_train = root_mean_squared_error(y_train, y_pred_train)

    # ... and on the held-out part of the pool.
    y_pred = clf_xgb.predict(X_valid)
    rmse = root_mean_squared_error(y_valid, y_pred)

    results.append((ratio, rmse_train, rmse))

# Print one summary line per train fraction
for ratio, rmse_train, rmse_valid in results:
    print("Train Fraction: {:.2f}, Train RMSE: {:.2f}, Validation RMSE: {:.3f}".format(ratio, rmse_train, rmse_valid))

Train Fraction: 0.60, Train RMSE: 0.16, Validation RMSE: 0.417
Train Fraction: 0.70, Train RMSE: 0.17, Validation RMSE: 0.407
Train Fraction: 0.80, Train RMSE: 0.18, Validation RMSE: 0.382
Train Fraction: 0.90, Train RMSE: 0.19, Validation RMSE: 0.359
Train Fraction: 0.95, Train RMSE: 0.19, Validation RMSE: 0.340
