In [None]:
import xgboost as xgb
import matplotlib.pyplot as plt
import mlflow
import pandas as pd

from pathlib import Path
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree
from sklearn.metrics import accuracy_score

In [None]:
from config.config_data import DATA_PATH

In [None]:
# Point MLflow at the local tracking directory *before* looking anything up.
# Otherwise the lookup below queries whatever tracking location happens to be
# configured at that moment instead of the store the runs are read from.
mlflow.set_tracking_uri("../mlruns")

# Get the experiment details
experiment_name = "Tuning - XGB"
experiment = mlflow.get_experiment_by_name(experiment_name)

# Stop here with an explicit message when the experiment is missing: the
# following cells need `experiment_id` and would otherwise fail with an
# unrelated NameError.
if experiment is None:
    raise ValueError(f"Experiment '{experiment_name}' not found.")

experiment_id = experiment.experiment_id
print(f"Experiment ID for '{experiment_name}': {experiment_id}")

In [None]:
# Use the local MLflow directory as the tracking store
mlflow.set_tracking_uri("../mlruns")

# Fetch the runs of the experiment as a DataFrame and rank them by the
# 'val_AUC' metric, highest value first
runs = mlflow.search_runs(experiment_ids=[experiment_id])
sorted_runs = runs.sort_values("metrics.val_AUC", ascending=False)

# The first row of the ranking is the run with the highest AUC
best_run = sorted_runs.iloc[0]

In [None]:
# Look the best run up by its id and keep the parameters logged for it
best_run_id = best_run["run_id"]
best_run_info = mlflow.get_run(best_run_id)
best_hyperparameters = best_run_info.data.params

In [None]:
# Names of the logged hyperparameters that must be converted to numbers
# before they are handed to XGBoost (MLflow returns parameter values as text).
float_hyperparameters = ["eta", "colsample_bytree"]
int_hyperparameters = ["n_estimators", "max_depth", "alpha"]

# Create the XGBoost model
best_xgb_model = xgb.XGBClassifier()

# Set the hyperparameters, casting the numeric ones to float or int.
# The three cases have to be mutually exclusive (if / elif / else): with two
# independent `if` statements a float parameter would be cast correctly and
# then immediately overwritten with its raw string value by the `else` branch.
for param_name, param_value in best_hyperparameters.items():
    if param_name in float_hyperparameters:
        param_value = float(param_value)
    elif param_name in int_hyperparameters:
        param_value = int(param_value)
    # Every other parameter is passed through unchanged.
    best_xgb_model.set_params(**{param_name: param_value})

In [None]:
# The four CSV files live in the directory that contains DATA_PATH; the
# location is prefixed with ".." before it is handed to pandas.
data_dir = ".." / Path(DATA_PATH.parent)

X_train = pd.read_csv(data_dir / "train_X.csv")
y_train = pd.read_csv(data_dir / "train_y.csv")
X_test = pd.read_csv(data_dir / "test_X.csv")
y_test = pd.read_csv(data_dir / "test_y.csv")

In [None]:
# Fit the XGBoost model with the selected hyperparameters on the training
# features (X_train) and training labels (y_train)
best_xgb_model.fit(
    X_train,
    y_train,
)

In [None]:
# Train a Decision Tree as a surrogate model to approximate XGBoost predictions.
# The depth is capped so the resulting tree stays small enough to read.
max_depth = 5

# Fix the random seed: scikit-learn shuffles the candidate features at every
# split, so splits with identical improvement can be resolved differently from
# one run to the next, which would change the plotted tree and importances.
decision_tree_surrogate = DecisionTreeClassifier(max_depth=max_depth, random_state=42)

# The surrogate is fitted on the XGBoost model's own predictions for the
# training data, not on y_train.
decision_tree_surrogate.fit(X_train, best_xgb_model.predict(X_train))

In [None]:
# Predict the test data with the decision tree surrogate
surrogate_preds = decision_tree_surrogate.predict(X_test)

# Score those predictions against y_test and report the result as a percentage
surrogate_accuracy = accuracy_score(y_test, surrogate_preds)
accuracy_report = "Decision Tree Surrogate Model Accuracy: {:.2f}%".format(
    surrogate_accuracy * 100
)
print(accuracy_report)

#### Tree Visualization

In [None]:
# Plot the Decision Tree structure with readability enhancements
plt.figure(figsize=(12, 6))
plot_tree(
    decision_tree_surrogate,
    filled=True,
    # Pass a plain list: scikit-learn releases that validate this parameter
    # as a list reject a pandas Index.
    feature_names=list(X_train.columns),
    # NOTE(review): class names are applied to the classes in ascending order;
    # confirm that the lower label means "Warning" and the higher "Citation".
    class_names=["Warning", "Citation"],
)
plt.show()

#### Feature Importance Plot

In [None]:
# Column names and the importance the surrogate tree assigns to each of them
feature_names = X_train.columns
feature_importance = decision_tree_surrogate.feature_importances_

# Positions of the features ordered from highest to lowest importance:
# argsort gives ascending order, which is then reversed
ascending_idx = feature_importance.argsort()
sorted_idx = ascending_idx[::-1]

In [None]:
# Bar chart of the surrogate tree's feature importances in the order given
# by sorted_idx, one bar per column of X_train
n_features = X_train.shape[1]
bar_positions = range(n_features)
tick_labels = [feature_names[idx] for idx in sorted_idx]

plt.figure(figsize=(12, 6))
plt.bar(bar_positions, feature_importance[sorted_idx], align="center")
plt.xticks(bar_positions, tick_labels, rotation=90)
plt.title("Feature Importance Analysis")
plt.xlabel("Feature")
plt.ylabel("Feature Importance")
plt.show()