In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error

In [2]:
# load data
df = pd.read_csv("../data/processed/contra_contacts_processed.csv")
df.drop(columns=["DATE", "AGE_CATEGORY"], inplace=True)
df.head()

Unnamed: 0,AGE,DURATION_MIN,CONTACT_COUNT_TOTAL,SYMPTOMATIC,SYMPTOM_COUNT
0,21.0,17.8,5,True,4
1,86.0,9.3,0,False,0
2,12.0,6.5,3,False,0
3,93.0,2.6,0,False,0
4,24.0,9.1,4,True,3


In [7]:
# setting seed
seed = 42

# define target and predictor columns
target_col = "CONTACT_COUNT_TOTAL"
X = df.loc[:, df.columns != target_col]
y = df.loc[:, target_col]

In [8]:
# defining optuna optimizations study
def objective(trial, data=X, target=y):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=seed
    )

    params = {
        "lambda": trial.suggest_float("lambda", 1e-3, 10.0),
        "alpha": trial.suggest_float("alpha", 1e-3, 10.0),
        "colsample_bytree": trial.suggest_categorical(
            "colsample_bytree", [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
        ),
        "subsample": trial.suggest_categorical(
            "subsample", [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]
        ),
        "learning_rate": trial.suggest_categorical(
            "learning_rate", [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]
        ),
        "n_estimators": 10000,
        "max_depth": trial.suggest_categorical("max_depth", [5, 7, 9, 11, 13, 15, 17]),
        "random_state": trial.suggest_categorical("random_state", [2020]),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 300),
    }

    model = xgb.XGBRegressor(**params)

    model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

    y_pred = model.predict(X_test)

    rmse = root_mean_squared_error(y_test, y_pred)

    return rmse

In [9]:
# running optuna optimizations study
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=50)
print("Number of finished trials:", len(study.trials))
print("Best trial:", study.best_trial.params)

[I 2024-02-19 14:05:32,249] A new study created in memory with name: no-name-d56bcdc4-2b7e-41b8-9f67-f6b283bdac75
[I 2024-02-19 14:06:04,961] Trial 0 finished with value: 2.2416799535379983 and parameters: {'lambda': 6.970411210243374, 'alpha': 0.6819516557395895, 'colsample_bytree': 1.0, 'subsample': 0.7, 'learning_rate': 0.008, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 167}. Best is trial 0 with value: 2.2416799535379983.
[I 2024-02-19 14:07:08,642] Trial 1 finished with value: 2.307081604454937 and parameters: {'lambda': 8.048048387429079, 'alpha': 9.808825429721319, 'colsample_bytree': 0.8, 'subsample': 0.6, 'learning_rate': 0.008, 'max_depth': 11, 'random_state': 2020, 'min_child_weight': 40}. Best is trial 0 with value: 2.2416799535379983.
[I 2024-02-19 14:08:46,893] Trial 2 finished with value: 2.4622087219792412 and parameters: {'lambda': 2.911287763590079, 'alpha': 3.936835194932718, 'colsample_bytree': 0.8, 'subsample': 0.5, 'learning_rate': 0.016, 'max_depth'

Number of finished trials: 50
Best trial: {'lambda': 8.58600777460456, 'alpha': 5.322150421377522, 'colsample_bytree': 0.7, 'subsample': 0.5, 'learning_rate': 0.008, 'max_depth': 5, 'random_state': 2020, 'min_child_weight': 296}


In [10]:
# visualize optimization history
optuna.visualization.plot_optimization_history(study)

In [11]:
# final model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
final_model = xgb.XGBRegressor(**study.best_trial.params)
final_model.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    verbose=0,
)
y_pred = final_model.predict(X_test)
rmse = root_mean_squared_error(y_test, y_pred)
print(f"the rmse of the final model is {rmse}")

the rmse of the final model is 2.492829116048651


In [15]:
# saving final model
final_model.save_model("../models/model.json")

sources:

- https://forecastegy.com/posts/xgboost-hyperparameter-tuning-with-optuna/
- https://medium.com/optuna/using-optuna-to-optimize-xgboost-hyperparameters-63bfcdfd3407
- https://www.kaggle.com/code/hamzaghanmi/xgboost-catboost-using-optuna?scriptVersionId=94510532
