## 12. Confusion matrix visualization  
Raw counts and row-normalized ratios highlight which cover types the model confuses most often. Diagonal dominance indicates good performance; off-diagonal hot spots may suggest a need for more data or feature engineering for those specific classes.

In [None]:
# 12. Confusion matrix

cm = confusion_matrix(val_pd.label, pred_labels)  # or sample_batch.label if used

# Raw counts: one cell per (true, predicted) pair.
sns.heatmap(cm, annot=True, fmt="d", cmap="viridis")
plt.title("Confusion Matrix with Counts")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Row-normalize so each row (true class) sums to 1. A true class with no
# samples has a zero row total; a plain division would fill that row with NaN
# and emit a runtime warning, so those rows are left at 0 instead.
row_totals = cm.sum(axis=1, keepdims=True)
cm_norm = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)
sns.heatmap(cm_norm, annot=True, fmt=".2f", cmap="viridis")
plt.title("Normalized Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

### 13. CPU batch inference with Ray Data  

Use **Ray Data** for scalable, parallel inference.  
Each actor loads the trained model once and processes data batches in parallel,  
providing better throughput than ad-hoc remote tasks and avoiding repeated model loads.  

In [None]:
# 13. CPU batch inference with Ray Data 

# Assumes: val_ds, feature_columns, best_ckpt already defined.

class XGBPredictor:
    """Callable batch predictor that keeps one XGBoost model per instance.

    The model is restored from the checkpoint in the constructor, so repeated
    calls on the same instance reuse it instead of reloading it.
    """

    def __init__(self, ckpt, feature_cols):
        """Remember the feature columns and restore the model from ``ckpt``."""
        self.feature_cols = feature_cols
        self.model = RayTrainReportCallback.get_model(ckpt)

    def __call__(self, batch: pd.DataFrame) -> pd.DataFrame:
        """Return predicted and true labels for one batch as int32 columns."""
        features = batch[self.feature_cols]
        scores = self.model.predict(xgb.DMatrix(features))
        # NOTE(review): argmax over axis=1 assumes predict() returns one score
        # per class for every row (2-D output) — confirm the training objective.
        predicted = np.argmax(scores, axis=1).astype(np.int32)
        actual = batch["label"].astype(np.int32)
        return pd.DataFrame({"pred": predicted, "label": actual})

# Use an ActorPoolStrategy instead of compute="actors"; XGBPredictor is passed
# as a class so it is constructed with fn_constructor_args and then called
# once per batch.
pred_ds = val_ds.map_batches(
    XGBPredictor,
    fn_constructor_args=(best_ckpt, feature_columns),
    batch_format="pandas",
    compute=ActorPoolStrategy(),
    num_cpus=1,                    # per-actor CPU; tune as needed
)


def _batch_accuracy_counts(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce one prediction batch to a single row of (correct, n) counts."""
    hits = (df["pred"].to_numpy() == df["label"].to_numpy()).sum()
    return pd.DataFrame({"correct": [int(hits)], "n": [int(len(df))]})


# Aggregate accuracy without collecting predictions to the driver.
# Ray Datasets are lazy, and each consuming call such as `sum` executes the
# whole plan again. Materializing the small per-batch counts once keeps the
# two sums below from running the inference pipeline twice.
stats_ds = pred_ds.map_batches(
    _batch_accuracy_counts,
    batch_format="pandas",
).materialize()

correct = int(stats_ds.sum("correct"))
n = int(stats_ds.sum("n"))
print(f"Validation accuracy (Ray Data inference): {correct / n:.3f}")


### 14. Feature-importance diagnostics  
XGBoost’s built-in `get_score(importance_type="gain")` reports each feature's average gain across all the splits that use it. Visualizing the top 15 helps connect model behavior back to domain knowledge. For example, elevation and soil type often dominate forest-cover prediction.

In [None]:
# 14. Gain‑based feature importance
# Map of feature name -> gain score as reported by the booster.
importances = booster.get_score(importance_type="gain")

# Keep the 15 highest-gain features, largest first. Building the two tuples
# directly (rather than unpacking zip(*...)) also works when the booster
# reports no features at all, where the unpack would raise ValueError.
top_features = sorted(importances, key=importances.get, reverse=True)[:15]
keys = tuple(top_features)
gains = tuple(importances[name] for name in top_features)

positions = range(len(gains))
plt.barh(positions, gains)
plt.yticks(positions, keys)
plt.gca().invert_yaxis()  # put the highest-gain feature at the top
plt.title("Top-15 Feature Importances (gain)")
plt.xlabel("Average gain")
plt.show()