In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor
import optuna
import mlflow

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df = pd.read_csv('../usdvnd/usdvnd_cleaned_features_regression.csv')
df.set_index('Ngày', inplace=True)
df.index = pd.to_datetime(df.index)
# Tách X và y
# X sẽ bỏ cột 'Target', 'Ngày' và các cột giá gốc (đã có lag/MA)
X = df.drop(columns=['Target'])
y = df['Target']

In [3]:
split_point = int(len(X) - 30)

X_train, y_train = X.iloc[:split_point], y.iloc[:split_point]
X_test, y_test = X.iloc[split_point:], y.iloc[split_point:]

In [4]:
# ==============================================
# 3. Define Optuna objective function with MLflow
# ==============================================
def objective(trial):
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 200, 1000),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
        "gamma": trial.suggest_float("gamma", 0.0, 5.0),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10.0, log=True),
        "random_state": 42,
        "n_jobs": -1,
        "tree_method": "hist",
    }

    with mlflow.start_run(nested=True):
        model = XGBRegressor(**params)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        rmse = float(np.sqrt(mean_squared_error(y_test, y_pred)))
        mae = float(mean_absolute_error(y_test, y_pred))
        r2 = float(r2_score(y_test, y_pred))

        # Log hyperparameters + metrics
        mlflow.log_params(params)
        mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})

    return rmse

In [5]:
# ==============================================
# 4. Run Optuna study with MLflow
# ==============================================
# Force MLflow to always use the root project mlruns folder
mlflow.set_tracking_uri("../mlruns")
mlflow.set_experiment("xgboost_regression")

study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=15)

print("Best params:", study.best_trial.params)

  return FileStore(store_uri, store_uri)
2025/11/11 09:37:11 INFO mlflow.tracking.fluent: Experiment with name 'xgboost_regression' does not exist. Creating a new experiment.
[I 2025-11-11 09:37:11,585] A new study created in memory with name: no-name-8caef1e3-e74b-45d4-a692-3d938b63ae6e
[I 2025-11-11 09:37:19,659] Trial 0 finished with value: 30.7041870409775 and parameters: {'n_estimators': 685, 'max_depth': 6, 'learning_rate': 0.04226144446736835, 'subsample': 0.9946692801338723, 'colsample_bytree': 0.5234544364617055, 'min_child_weight': 9, 'gamma': 1.9379237437629389, 'reg_alpha': 0.09938017016832794, 'reg_lambda': 0.007210660251612904}. Best is trial 0 with value: 30.7041870409775.
[I 2025-11-11 09:37:25,122] Trial 1 finished with value: 22.934800277227335 and parameters: {'n_estimators': 620, 'max_depth': 6, 'learning_rate': 0.16657702494224907, 'subsample': 0.8308043710873154, 'colsample_bytree': 0.988338989377777, 'min_child_weight': 4, 'gamma': 2.8472831958494496, 'reg_alpha'

Best params: {'n_estimators': 783, 'max_depth': 8, 'learning_rate': 0.01751708363990651, 'subsample': 0.71963559631887, 'colsample_bytree': 0.5844740997706666, 'min_child_weight': 7, 'gamma': 3.304437960581069, 'reg_alpha': 0.014416922382707913, 'reg_lambda': 0.05242709068319506}


In [7]:
# ==============================================
# 5. Train final model with best params and log to MLflow
# ==============================================
best_params = study.best_trial.params
best_model = XGBRegressor(**best_params)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Final tuned model performance:")
print("MAE:", mae)
print("RMSE:", rmse)
print("R2:", r2)

# Log final model
with mlflow.start_run(run_name="best_xgboost_model"):
    mlflow.log_params(best_params)
    mlflow.log_metrics({"rmse": rmse, "mae": mae, "r2": r2})
    mlflow.xgboost.log_model(best_model, name="model")

Final tuned model performance:
MAE: 15.5958984375
RMSE: 19.079874024445136
R2: -0.5671089808370431


