In [1]:
import time
import os
from memory_profiler import memory_usage
import xgboost as xgb
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, log_loss
import optuna
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


os.makedirs("plots", exist_ok=True)

  from .autonotebook import tqdm as notebook_tqdm


### Prepare data

In [2]:
X, y = load_breast_cancer(return_X_y=True, as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

### Plots

In [3]:
def plot_feature_importance(model, booster, top_n=20):
    importance_type = "gain" if booster != "gblinear" else "weight"
    try:
        imp = model.get_score(importance_type=importance_type)
        if not imp:
            print(f"No feature importance for booster: {booster}")
            return
        keys, vals = zip(*sorted(imp.items(), key=lambda kv: kv[1], reverse=True)[:top_n])
        plt.figure(figsize=(8, 6))
        plt.barh(range(len(vals))[::-1], vals)
        plt.yticks(range(len(keys))[::-1], keys)
        plt.xlabel(importance_type.capitalize())
        plt.title(f"Top-{top_n} Feature Importance ({booster})")
        plt.tight_layout()
        plt.savefig(f"plots/feature_importance_{booster}.png", dpi=150)
        plt.close()
    except xgb.core.XGBoostError as e:
        print(f"Skipping feature importance for {booster}: {e}")


def plot_learning_curves(evals_res: dict, booster: str):
    metric_names = {
        "logloss": "Log Loss",
        "error": "Classification Error"
    }
    for metric, train_vals in evals_res["train"].items():
        val_vals = evals_res["val"][metric]
        plt.figure(figsize=(6, 4))
        plt.plot(train_vals, label="Train")
        plt.plot(val_vals, label="Validation")
        plt.xlabel("Iteration")
        plt.ylabel(metric_names.get(metric, metric))
        plt.title(f"{metric_names.get(metric, metric)} – {booster}")
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"plots/{metric}_{booster}.png", dpi=150)
        plt.close()

### Boosters

In [4]:
base = dict(
    objective="binary:logistic",
    eval_metric=["logloss", "error"],
    eta=0.1,
    seed=42,
    verbosity=0,
)

boosters = ["gbtree", "dart", "gblinear"]
results = []

### Custom booster runner

In [5]:
def run_booster(booster: str):
    params = base | {"booster": booster}

    t0 = time.perf_counter()
    mem0 = memory_usage(-1, 0.1, 1)[0]

    evals_res = {}
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=300,
        evals=[(dtrain, "train"), (dtest, "val")],
        early_stopping_rounds=30,
        evals_result=evals_res
    )

    mem1 = memory_usage(-1, 0.1, 1)[0]
    dt = time.perf_counter() - t0
    y_prob = model.predict(dtest)
    y_pred = (y_prob > 0.5).astype(int)

    results.append(
        dict(
            booster=booster,
            best_iter=model.best_iteration + 1,
            accuracy=accuracy_score(y_test, y_pred),
            logloss=log_loss(y_test, y_prob),
            time_sec=dt,
            memory_mb=mem1 - mem0,
        )
    )

    plot_learning_curves(evals_res, booster)
    plot_feature_importance(model, booster)

    return model

In [6]:
models = {b: run_booster(b) for b in boosters}

df = pd.DataFrame(results).round(4)

[0]	train-logloss:0.57707	train-error:0.37363	val-logloss:0.58244	val-error:0.36842
[1]	train-logloss:0.50971	train-error:0.37363	val-logloss:0.52329	val-error:0.36842
[2]	train-logloss:0.45239	train-error:0.05714	val-logloss:0.47286	val-error:0.06140
[3]	train-logloss:0.40310	train-error:0.01758	val-logloss:0.43077	val-error:0.05263
[4]	train-logloss:0.36106	train-error:0.01538	val-logloss:0.39545	val-error:0.05263
[5]	train-logloss:0.32563	train-error:0.01099	val-logloss:0.36584	val-error:0.04386
[6]	train-logloss:0.29400	train-error:0.01099	val-logloss:0.34128	val-error:0.06140
[7]	train-logloss:0.26637	train-error:0.01319	val-logloss:0.32020	val-error:0.07018
[8]	train-logloss:0.24237	train-error:0.00879	val-logloss:0.29809	val-error:0.06140
[9]	train-logloss:0.22114	train-error:0.00879	val-logloss:0.28105	val-error:0.07018
[10]	train-logloss:0.20238	train-error:0.00879	val-logloss:0.26342	val-error:0.07018
[11]	train-logloss:0.18527	train-error:0.00879	val-logloss:0.25034	val-erro

In [7]:
df

Unnamed: 0,booster,best_iter,accuracy,logloss,time_sec,memory_mb
0,gbtree,6,0.9474,0.1243,2.1319,4.6055
1,dart,6,0.9474,0.1243,2.1701,1.2773
2,gblinear,37,0.9737,0.0698,2.1683,0.1367


### Memory Behavior of XGBoost Boosters

#### `gbtree`
- Builds and stores all trees sequentially.
- Each new tree is added to the model.
- As training progresses, all trees accumulate in memory.
- This can result in higher memory usage, especially with many boosting rounds.

#### `dart` (Dropout Additive Regression Trees)
- Randomly **drops some of the previously built trees** when training a new one.
- In each iteration, only a **subset of trees** is used for gradient calculation.
- This reduces the amount of active data in memory at any given time.
- Does **not** maintain a fully additive model like `gbtree`, which can result in lower peak memory usage.

#### `gblinear`
- Does **not** use trees at all.
- Relies on a **linear model** trained with coordinate descent or other linear optimization methods.
- Can sometimes use **more memory temporarily** due to matrix operations, especially with high-dimensional feature spaces.

### Optuna-tunning

In [None]:
def objective(trial):
    p = dict(
        objective="binary:logistic",
        eval_metric="error",
        booster="gbtree",
        eta=trial.suggest_float("eta", 0.01, 0.3, log=True),
        max_depth=trial.suggest_int("max_depth", 3, 10),
        subsample=trial.suggest_float("subsample", 0.5, 1.0),
        colsample_bytree=trial.suggest_float("colsample_bytree", 0.5, 1.0),
        lambda_=trial.suggest_float("lambda", 1e-3, 10, log=True),
        alpha=trial.suggest_float("alpha", 1e-3, 10, log=True),
        verbosity=0,
        seed=42,
    )
    bst = xgb.train(p, dtrain, num_boost_round=200,
                    evals=[(dtest, "val")],
                    early_stopping_rounds=20)
    y_pred = (bst.predict(dtest) > 0.5).astype(int)
    return 1 - accuracy_score(y_test, y_pred)

optuna.logging.set_verbosity(optuna.logging.WARNING)
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=30)

print("\nBest params Optuna:", study.best_trial.params)

best_params = base | study.best_trial.params
best_model = xgb.train(
    best_params, dtrain, num_boost_round=300,
    evals=[(dtest, "val")], early_stopping_rounds=30
)

print(f"Accuracy tuned model: "
      f"{accuracy_score(y_test, (best_model.predict(dtest) > 0.5).astype(int)):.4f}")


[0]	val-error:0.36842
[1]	val-error:0.36842
[2]	val-error:0.07895
[3]	val-error:0.05263
[4]	val-error:0.05263
[5]	val-error:0.06140
[6]	val-error:0.06140
[7]	val-error:0.05263
[8]	val-error:0.05263
[9]	val-error:0.05263
[10]	val-error:0.04386
[11]	val-error:0.04386
[12]	val-error:0.05263
[13]	val-error:0.04386
[14]	val-error:0.05263
[15]	val-error:0.05263
[16]	val-error:0.05263
[17]	val-error:0.05263
[18]	val-error:0.04386
[19]	val-error:0.04386
[20]	val-error:0.04386
[21]	val-error:0.04386
[22]	val-error:0.04386
[23]	val-error:0.04386
[24]	val-error:0.04386
[25]	val-error:0.04386
[26]	val-error:0.04386
[27]	val-error:0.04386
[28]	val-error:0.05263
[29]	val-error:0.04386
[0]	val-error:0.36842
[1]	val-error:0.05263
[2]	val-error:0.05263
[3]	val-error:0.03509
[4]	val-error:0.03509
[5]	val-error:0.04386
[6]	val-error:0.03509
[7]	val-error:0.03509
[8]	val-error:0.04386
[9]	val-error:0.04386
[10]	val-error:0.03509
[11]	val-error:0.04386
[12]	val-error:0.04386
[13]	val-error:0.04386
[14]	val

### Tuned!