# 02 · Train XGBoost Model
Train an XGBoost binary classifier on `OUTCOME`, evaluate metrics, and save artifacts.

In [0]:
import kagglehub, pandas as pd, numpy as np
import xgboost as xgb
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss, brier_score_loss

def prepare_categoricals_for_xgb(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *X* with text/categorical columns made XGBoost-ready.

    Object- and string-typed columns are converted to pandas ``category``
    dtype. Every categorical column (converted here or already categorical on
    input) gets an explicit ``"__NA__"`` category, and its missing values are
    replaced with that label. All other columns are returned unchanged,
    including any NaNs they contain.

    The input frame is never mutated. The function is idempotent: a column
    that already carries the ``"__NA__"`` category is left with a single such
    category instead of raising.

    Args:
        X: Feature frame to prepare.

    Returns:
        A new DataFrame with the same columns and index as *X*.
    """
    na_label = "__NA__"
    Xc = X.copy()
    for c in Xc.columns:
        col = Xc[c]
        is_categorical = isinstance(col.dtype, pd.CategoricalDtype)

        # Plain object columns and pandas' dedicated string dtypes both hold
        # text and need the same categorical encoding.
        if not is_categorical and (
            pd.api.types.is_object_dtype(col.dtype)
            or pd.api.types.is_string_dtype(col.dtype)
        ):
            col = col.astype("category")
            is_categorical = True

        if is_categorical:
            # add_categories() raises ValueError if the label already exists,
            # e.g. when this helper is applied twice to the same frame.
            if na_label not in col.cat.categories:
                col = col.cat.add_categories([na_label])
            Xc[c] = col.fillna(na_label)
    return Xc

# --- Load data ---------------------------------------------------------------
path = kagglehub.dataset_download("sagnik1511/car-insurance-data")
# Sort so the chosen file is deterministic if the download holds several CSVs,
# and fail with a readable message instead of a bare IndexError when none exist.
csv_candidates = sorted(Path(path).rglob("*.csv"))
if not csv_candidates:
    raise FileNotFoundError(f"No CSV file found under {path!r}")
csv = csv_candidates[0]
df = pd.read_csv(csv)

# --- Build target and features -----------------------------------------------
target = "OUTCOME"
# Map common textual encodings of a binary label to 0/1. Values not covered by
# the mapping fall back to the raw column, which must then be castable to int.
y = (df[target].astype(str).str.lower()
        .map({"1":1,"0":0,"yes":1,"no":0,"true":1,"false":0})
        .fillna(df[target]).astype(int))
X = df.drop(columns=[target])
X = prepare_categoricals_for_xgb(X)

# --- Split: train / test, then fit / validation ------------------------------
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Early stopping needs its own validation rows. Reusing the test split for it
# would let the stopping round be tuned on the very data the metrics are
# reported on, which biases those metrics optimistically.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_tr, y_tr, test_size=0.2, random_state=42, stratify=y_tr
)

# Class-imbalance weight computed on the rows the model is actually fitted on;
# max(pos, 1) guards against division by zero if there are no positives.
pos, neg = int(y_fit.sum()), len(y_fit)-int(y_fit.sum())
scale_pos_weight = float(neg/max(pos,1))

params = {
    "objective": "binary:logistic",
    # With several metrics, early stopping monitors the last one (logloss).
    "eval_metric": ["auc","logloss"],
    "tree_method": "hist",
    "learning_rate": 0.05,
    "max_depth": 6,
    "subsample": 0.9,
    "colsample_bytree": 0.9,
    "lambda": 2.0,
    "alpha": 0.0,
    "scale_pos_weight": scale_pos_weight,
    "random_state": 42
}

dtr = xgb.DMatrix(X_fit, label=y_fit, enable_categorical=True)
dval = xgb.DMatrix(X_val, label=y_val, enable_categorical=True)
dte = xgb.DMatrix(X_te, label=y_te, enable_categorical=True)

# --- Train --------------------------------------------------------------------
# Early stopping watches the last entry in `evals`, i.e. the validation split.
booster = xgb.train(params, dtr, num_boost_round=1200,
                    evals=[(dtr,"train"), (dval,"valid")],
                    early_stopping_rounds=80, verbose_eval=False)

# --- Evaluate on the untouched test split ------------------------------------
# xgb.train returns the model from the last round, not the best one, so restrict
# prediction to the best iteration explicitly. Reuse `dte`; labels stored in a
# DMatrix are ignored by predict().
p = booster.predict(dte, iteration_range=(0, booster.best_iteration + 1))
# Keep probabilities strictly inside (0, 1) so the log-loss stays finite.
p_clip = np.clip(p, 1e-7, 1-1e-7)

metrics = {
    "roc_auc": float(roc_auc_score(y_te, p)),
    "pr_auc": float(average_precision_score(y_te, p)),
    "logloss": float(log_loss(y_te, p_clip)),
    "brier": float(brier_score_loss(y_te, p_clip))
}
metrics

## Save model & metadata

In [0]:
import json

# Artifacts are written to a `models` directory next to the notebook's parent.
out = Path("../models")
out.mkdir(exist_ok=True)

model_file = out / "car_insurance_xgb.json"
meta_file = out / "car_insurance_xgb_meta.json"

# Persist the trained booster in XGBoost's JSON model format.
booster.save_model(str(model_file))

# Sidecar metadata: where the data came from, what was predicted, which
# feature columns were used, the label distribution and the computed metrics.
meta = {
    "path": str(csv),
    "target": target,
}
meta["features"] = X.columns.tolist()
meta["class_balance"] = y.value_counts().to_dict()
meta["metrics"] = metrics

meta_json = json.dumps(meta, indent=2)
meta_file.write_text(meta_json, encoding="utf-8")
print("Saved model & meta to ../models/")