In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from pathlib import Path
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier, plot_tree

from src.modeling.create_data_split import split_data
from src.utils.models_pkl import load_pickle

In [None]:
from config.config_data import DATA_PATH
from config.config_modeling import CAT_COLS, TRAIN_SIZE, TEST_FROM_VAL, RANDOM_STATE

In [None]:
MODEL_PATH = Path("..") / ".." / "models" / "XGB.pkl"
DATA_PATH = "../.." / Path(DATA_PATH.parent / "data.csv")

In [None]:
model = load_pickle(MODEL_PATH)

data = pd.read_csv(DATA_PATH)

split_data = split_data(
    cols=CAT_COLS,
    df=data,
    train_size=TRAIN_SIZE,
    test_size=TEST_FROM_VAL,
    random_state=RANDOM_STATE,
)

In [None]:
X_train = split_data["train"][0]
y_train = split_data["train"][1]
X_test = split_data["test"][0]
y_test = split_data["test"][1]

In [None]:
# Train the XGBoost model on the training split.
# NOTE(review): `model` was just loaded from XGB.pkl; calling fit here replaces
# whatever fitted state the pickle carried — confirm the refit is intended.
model.fit(X_train, y_train)

#### Linear Regression as a Surrogate Model

In [None]:
# Surrogate target: the XGBoost model's own output on the training features
# (not the true labels), so the linear model learns to mimic XGBoost.
xgb_train_predictions = model.predict(X_train)

# fit() returns the estimator itself, so construction and fitting can be chained.
linear_reggression_surrogate = LinearRegression().fit(X_train, xgb_train_predictions)

In [None]:
from sklearn.linear_model import LassoCV

# A surrogate must be trained on the black-box model's output, not on the
# ground-truth labels, so the lasso is fit against the XGBoost predictions
# (the same target the plain linear surrogate uses).
xgb_train_predictions = model.predict(X_train)

# Cross-validate the L1 penalty strength over a small logarithmic grid.
lasso = LassoCV(alphas=[0.01, 0.1, 1.0, 10.0, 100.0], cv=5)
lasso.fit(X_train, xgb_train_predictions)

# Get the optimal alpha value
optimal_alpha = lasso.alpha_
print(optimal_alpha)

# After cross-validation LassoCV refits on the full training data with alpha_,
# so the fitted object already is the lasso surrogate. Previously this name was
# bound to an unregularised LinearRegression fit on y_train, which ignored the
# selected alpha entirely and made the "lasso" columns/plots plain OLS results.
linear_regression_surrogate_lasso = lasso
lasso_coefficients = lasso.coef_

In [None]:
# Fidelity check: how closely does each surrogate reproduce the XGBoost output
# on the training features?
xgb_train_predictions = model.predict(X_train)  # reference values, computed once

surrogate_predictions = linear_reggression_surrogate.predict(X_train)
lasso_surrogate_predictions = linear_regression_surrogate_lasso.predict(X_train)

# Root-mean-squared error of each surrogate against the XGBoost predictions
rmse = np.sqrt(mean_squared_error(xgb_train_predictions, surrogate_predictions))
lasso_rmse = np.sqrt(mean_squared_error(xgb_train_predictions, lasso_surrogate_predictions))

print("Linear Regression Surrogate Model RMSE: {:.4f}".format(rmse))
print("Lasso Linear Regression Surrogate Model RMSE: {:.4f}".format(lasso_rmse))

In [None]:
# Column labels of the training features (X_train is expected to be a DataFrame)
feature_names = X_train.columns

# Fitted weights of the two surrogate models, one entry per feature
coef = linear_reggression_surrogate.coef_
coef_lasso = linear_regression_surrogate_lasso.coef_

In [None]:
# Tabulate the weights of both surrogates, one row per feature
coefficient_table = pd.DataFrame(
    {"Feature Name": feature_names, "LR Coefficients": coef, "Lasso Coefficients": coef_lasso}
)

# Rank rows by |LR coefficient|, largest first, without adding a helper column
ranked_rows = coefficient_table["LR Coefficients"].abs().sort_values(ascending=False).index
importance_df = coefficient_table.loc[ranked_rows]

# Global display option: floats are rendered with two decimals from here on
pd.options.display.float_format = "{:.2f}".format

print("Feature Importances (Ordered by Absolute Coefficient Value):")
print(importance_df)

In [None]:
# Plot the coefficients of both surrogates as grouped horizontal bars.
# The two series used to be drawn at the same y positions, so the opaque
# linear-regression bars painted over the lasso bars; each feature now gets
# two adjacent bars instead.
fig, ax = plt.subplots(figsize=(10, 10))

positions = np.arange(len(feature_names))  # one slot per feature
bar_height = 0.4  # two bars per slot, leaving a small gap between features

ax.barh(
    positions - bar_height / 2,
    coef_lasso,
    height=bar_height,
    alpha=0.7,
    label="Lasso Regression",
    color="green",
)
ax.barh(positions + bar_height / 2, coef, height=bar_height, label="Linear Regression")

# Numeric positions are used for placement, so restore the feature labels.
ax.set_yticks(positions)
ax.set_yticklabels(feature_names)

ax.set_title("Feature Importance Comparison")
# Signed coefficients are plotted, not their absolute values.
ax.set_xlabel("Coefficient Value")
ax.legend()
plt.show()

#### Decision Tree as a Surrogate Model

In [None]:
# Define a range of maximum depths to test
max_depths = list(range(2, 30))
cv_scores = []

# The tree is a surrogate for XGBoost, so depth is selected by how well the
# tree reproduces the XGBoost predictions (fidelity), not the true labels.
# This matches the target used when the final surrogate tree is fitted.
surrogate_target = model.predict(X_train)

# Iterate over different max_depth values and perform cross-validation
for max_depth in max_depths:
    # Seed the tree so the curve is reproducible across runs
    # (assumes RANDOM_STATE is a valid scikit-learn random_state — it is the
    # same constant passed to split_data).
    decision_tree_surrogate = DecisionTreeClassifier(
        max_depth=max_depth, random_state=RANDOM_STATE
    )
    scores = cross_val_score(
        decision_tree_surrogate, X_train, surrogate_target, cv=5, scoring="accuracy"
    )
    mean_accuracy = np.mean(scores)
    cv_scores.append(mean_accuracy)

# Plot the mean cross-validation scores vs. max depth
plt.figure(figsize=(10, 3))
plt.plot(max_depths, cv_scores, color="green")
plt.xlabel("Max Depth")
plt.ylabel("Mean Cross-Validation Accuracy")
plt.title("Surrogate Model Performance vs. Max Depth (Cross-Validation)")
plt.grid(True)
plt.show()

In [None]:
# Train a Decision Tree as a surrogate model to approximate XGBoost predictions
# Depth chosen by hand; a shallow tree keeps the surrogate readable when plotted.
max_depth = 5

# random_state pins the tie-breaking between equally good splits, so the fitted
# tree (and everything derived from it below) is reproducible across runs.
decision_tree_surrogate = DecisionTreeClassifier(max_depth=max_depth, random_state=RANDOM_STATE)
# The target is the XGBoost output on the training features, not y_train.
decision_tree_surrogate.fit(X_train, model.predict(X_train))

In [None]:
# Evaluate the surrogate model's performance on the held-out test features
decision_tree_surrogate_preds = decision_tree_surrogate.predict(X_test)

# Fidelity: share of test rows where the tree agrees with the XGBoost model.
# This is the measure of surrogate quality; accuracy against y_test alone mixes
# the surrogate's approximation error with XGBoost's own prediction error.
surrogate_fidelity = accuracy_score(model.predict(X_test), decision_tree_surrogate_preds)
print("Decision Tree Surrogate Model Fidelity to XGBoost: {:.2f}%".format(surrogate_fidelity * 100))

# Accuracy against the true test labels, kept for reference
surrogate_accuracy = accuracy_score(y_test, decision_tree_surrogate_preds)
print("Decision Tree Surrogate Model Accuracy: {:.2f}%".format(surrogate_accuracy * 100))

In [None]:
# Decision Tree Visualization
# The figure is very wide so the leaves of the depth-limited tree stay legible.
plt.figure(figsize=(80, 20))
plot_tree(
    decision_tree_surrogate,
    filled=True,
    # Pass a plain list: some scikit-learn releases validate this parameter as
    # `list or None` and reject a pandas Index.
    feature_names=list(X_train.columns),
    # NOTE(review): labels are applied in ascending class order — presumably
    # class 0 = "Warning" and class 1 = "Citation"; confirm against the target encoding.
    class_names=["Warning", "Citation"],
    fontsize=12,
)
plt.show()

In [None]:
feature_names = X_train.columns
feature_importance = decision_tree_surrogate.feature_importances_

# Index order that ranks the importances from smallest to largest; plotted with
# barh this places the most important feature at the top of the chart.
sorted_idx = np.argsort(feature_importance)

In [None]:
# Feature Importance Plot: one horizontal bar per feature of the surrogate tree,
# drawn in the order given by sorted_idx.
bar_positions = range(len(sorted_idx))
ordered_labels = list(feature_names[sorted_idx])

plt.figure(figsize=(8, 10))
plt.barh(bar_positions, feature_importance[sorted_idx], color="green")
plt.yticks(bar_positions, ordered_labels)
plt.xlabel("Feature Importance")
plt.ylabel("Feature")
plt.title("Feature Importance Analysis")
plt.show()