In [None]:
import ast
from pathlib import Path

import matplotlib.pyplot as plt
import mlflow
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import plot_tree

In [None]:
from config.config_data import DATA_PATH

In [None]:
# TODO: Change experiment_id to model name
experiment_id = "721849622709339297"
# Column (as named in the mlflow.search_runs DataFrame) used to rank the runs
ranking_metric = "metrics.val_AUC"

# Set the tracking URI to your local MLflow directory
mlflow.set_tracking_uri(Path(DATA_PATH.parent.parent / "mlruns"))

# Fetch the runs of the experiment (search_runs returns one DataFrame row per run)
runs = mlflow.search_runs(experiment_ids=[experiment_id])

# Fail with an actionable message instead of a bare IndexError/KeyError further down
if runs.empty:
    raise ValueError(f"No MLflow runs found for experiment {experiment_id!r}")
if ranking_metric not in runs.columns:
    raise ValueError(
        f"No run in experiment {experiment_id!r} logged {ranking_metric!r}"
    )

# Runs that never logged the metric hold NaN in that column; they cannot be ranked
scored_runs = runs.dropna(subset=[ranking_metric])
if scored_runs.empty:
    raise ValueError(
        f"Every run in experiment {experiment_id!r} has a missing {ranking_metric!r}"
    )

# Sort the runs by the 'val_AUC' metric, best first
sorted_runs = scored_runs.sort_values(by=ranking_metric, ascending=False)

# Get the run with the highest AUC
best_run = sorted_runs.iloc[0]

In [None]:
# Look up the full record of the best run and pull out its logged parameters

best_run_id = best_run["run_id"]
best_run_info = mlflow.get_run(best_run_id)
best_hyperparameters = best_run_info.data.params

In [None]:
# Set the best model


def _parse_mlflow_param(raw_value):
    """Convert a parameter value read back from MLflow into a Python value.

    MLflow stores parameter values as strings, so numbers, booleans and
    ``None`` come back as ``"6"``, ``"True"``, ``"None"``. Values that are
    not Python literals (e.g. ``"gbtree"``) are returned unchanged.
    """
    if not isinstance(raw_value, str):
        return raw_value
    try:
        return ast.literal_eval(raw_value)
    except (ValueError, SyntaxError):
        pass
    # literal_eval rejects the float spellings nan/inf, so handle them explicitly
    if raw_value.strip().lower() in {"nan", "inf", "-inf"}:
        return float(raw_value)
    return raw_value


# Typed copy of the logged hyperparameters; the original string dict is left untouched
typed_hyperparameters = {
    param_name: _parse_mlflow_param(param_value)
    for param_name, param_value in best_hyperparameters.items()
}

best_xgb_model = xgb.XGBClassifier()
best_xgb_model.set_params(**typed_hyperparameters)

In [None]:
# The CSV files are read from the grandparent directory of DATA_PATH
dataset_dir = Path(DATA_PATH.parent.parent)

X_train = pd.read_csv(dataset_dir / "train_X.csv")
X_test = pd.read_csv(dataset_dir / "test_X.csv")
y_test = pd.read_csv(dataset_dir / "test_y.csv")

In [None]:
# Train a Decision Tree as a surrogate model to approximate XGBoost predictions

# best_xgb_model has only had its hyperparameters set; XGBClassifier.predict
# raises NotFittedError until fit() (or load_model()) has been called, so the
# model is trained here before its predictions are used as surrogate targets.
# NOTE(review): assumes the training labels are stored as "train_y.csv" next to
# "train_X.csv", mirroring "test_y.csv" — confirm the file name.
y_train = pd.read_csv(Path(DATA_PATH.parent.parent / "train_y.csv"))
best_xgb_model.fit(X_train, y_train)

# Targets for the surrogate are the XGBoost predictions, not the true labels
xgb_train_preds = best_xgb_model.predict(X_train)

# Fixed seed so the fitted tree (and the plots derived from it) is reproducible
surrogate_seed = 42
decision_tree_surrogate = DecisionTreeClassifier(random_state=surrogate_seed)
decision_tree_surrogate.fit(X_train, xgb_train_preds)

In [None]:
# Predict on the test features with the fitted decision tree surrogate

surrogate_preds = decision_tree_surrogate.predict(X=X_test)

In [None]:
# Score the surrogate's test predictions against the labels loaded into y_test.
# Note: this is accuracy versus those labels, not agreement with the XGBoost model.

surrogate_accuracy = accuracy_score(y_true=y_test, y_pred=surrogate_preds)
accuracy_pct = surrogate_accuracy * 100
print(f"Decision Tree Surrogate Model Accuracy: {accuracy_pct:.2f}%")

#### Feature Importance Plot

In [None]:
# Importance of each input feature according to the fitted surrogate tree
feature_importance = decision_tree_surrogate.feature_importances_

# Prefer the column names scikit-learn recorded when the tree was fitted on a
# DataFrame; fall back to positional labels when that attribute is absent.
fitted_feature_names = getattr(decision_tree_surrogate, "feature_names_in_", None)
if fitted_feature_names is not None:
    feature_names = [str(name) for name in fitted_feature_names]
else:
    feature_names = [f"Feature {i}" for i in range(len(feature_importance))]

# Horizontal bar chart: one bar per feature
importance_fig, importance_ax = plt.subplots()
importance_ax.barh(feature_names, feature_importance)
importance_ax.set_xlabel("Feature Importance")
importance_ax.set_ylabel("Feature Name")
importance_ax.set_title("Feature Importance Plot")
plt.show()

#### Tree Visualization

In [None]:
# Draw the fitted surrogate tree on a dedicated 10x8 inch figure
tree_fig, tree_ax = plt.subplots(figsize=(10, 8))
plot_tree(
    decision_tree_surrogate,
    feature_names=feature_names,
    class_names=["Class 0", "Class 1"],
    filled=True,
    ax=tree_ax,
)
plt.show()