In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import shap
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
from itertools import cycle


In [None]:
# Load the harmonized cat/dog dataset from a CSV in the working directory
DATA_PATH = "df_cat_dog_harmonized.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Data preprocessing - derive a regrouped outcome type by remapping outcome_type_harmonized
def map_outcome_group(value):
    """Collapse a harmonized outcome type into a coarse outcome group.

    The input is converted with ``str()``, stripped and lower-cased before
    the lookup, so casing and surrounding whitespace do not matter.

    Returns:
        "adopted" for "adoption", "non-adopted" for the rescue / foster /
        return-style outcomes listed below, and "other" for anything else
        (including missing values, which stringify to an unknown token).
    """
    group_by_outcome = {
        "adoption": "adopted",
        "rescue": "non-adopted",
        "foster": "non-adopted",
        "return to owner": "non-adopted",
        "foster to adopt": "non-adopted",
        "return to rescue": "non-adopted",
        "rtf": "non-adopted",
    }
    normalized = str(value).strip().lower()
    return group_by_outcome.get(normalized, "other")

# Element-wise regrouping of the harmonized outcome type
df["outcome_type_harmonized_regrouped"] = df["outcome_type_harmonized"].map(map_outcome_group)

In [None]:
# --- Preprocessing ---
def encode_categorical(df, categorical_cols):
    """Label-encode the given columns and return the fitted encoders.

    Each column is cast to ``str`` before fitting, and the encoded values
    are written back into ``df`` (the frame passed in is modified).

    Returns:
        (df, encoders) where ``encoders`` maps column name -> fitted
        LabelEncoder.
    """
    fitted = {}
    for name in categorical_cols:
        encoder = LabelEncoder()
        as_text = df[name].astype(str)
        df[name] = encoder.fit_transform(as_text)
        fitted[name] = encoder
    return df, fitted

def transform_categorical(df, categorical_cols, encoders):
    """Encode columns with previously fitted label encoders.

    Values are stringified first, mirroring the ``astype(str)`` applied when
    the encoders were fitted in ``encode_categorical``; without it, non-string
    inputs could never match the (string) classes. Values that are not among
    an encoder's classes are encoded as -1. Columns without an entry in
    ``encoders`` are left untouched.

    The lookup table is built once per column instead of calling
    ``le.transform`` for every single row.

    Args:
        df: frame to encode; the listed columns are overwritten in place.
        categorical_cols: column names to encode.
        encoders: mapping column name -> fitted encoder exposing ``classes_``.

    Returns:
        The same ``df`` object with integer-encoded columns.
    """
    for col in categorical_cols:
        if col not in encoders:
            continue
        le = encoders[col]
        # A label encoder's code for a class is its position in classes_.
        code_by_class = {str(cls): code for code, cls in enumerate(le.classes_)}
        df[col] = df[col].astype(str).map(code_by_class).fillna(-1).astype(int)
    return df

def scale_numerical(df, numerical_cols, scaler=None):
    """Standardize numerical columns on a copy of ``df``.

    The columns are cast to float32 first. When ``scaler`` is None a new
    StandardScaler is fitted on these columns; otherwise the supplied scaler
    is only applied.

    Returns:
        (scaled_df, scaler) - the scaled copy and the scaler that was used.
    """
    scaled = df.copy()
    scaled[numerical_cols] = scaled[numerical_cols].astype(np.float32)
    if scaler is None:
        scaler = StandardScaler()
        values = scaler.fit_transform(scaled[numerical_cols])
    else:
        values = scaler.transform(scaled[numerical_cols])
    scaled.loc[:, numerical_cols] = values
    return scaled, scaler

def encode_booleans(df, boolean_cols):
    """Convert yes/no style columns to integer 0/1 flags.

    Values are compared as trimmed, lower-cased text, so "yes", " Yes ",
    True, 1 and 1.0 all become 1; everything else (including "no" and
    missing values) becomes 0. Accepting already-encoded 0/1 values makes the
    function idempotent: re-running it on its own output no longer zeroes the
    column.

    Args:
        df: frame to encode; the listed columns are overwritten in place.
        boolean_cols: column names holding yes/no style values.

    Returns:
        The same ``df`` object with integer 0/1 columns.
    """
    truthy_tokens = {'yes', 'true', '1', '1.0'}
    for col in boolean_cols:
        tokens = df[col].astype(str).str.strip().str.lower()
        df[col] = tokens.isin(truthy_tokens).astype(int)
    return df

def train_xgb_classifier(X_train, y_train, **kwargs):
    """Fit an XGBClassifier and return it.

    ``use_label_encoder=False`` and ``eval_metric='logloss'`` are defaults
    only: any keyword passed by the caller overrides them. Previously passing
    either keyword raised a TypeError for a duplicated argument.
    NOTE(review): for multi-class targets a caller presumably wants
    ``eval_metric='mlogloss'`` - confirm against the XGBoost documentation.

    Args:
        X_train: training features.
        y_train: training labels.
        **kwargs: forwarded to the XGBClassifier constructor.

    Returns:
        The fitted model.
    """
    params = {'use_label_encoder': False, 'eval_metric': 'logloss'}
    params.update(kwargs)
    model = XGBClassifier(**params)
    model.fit(X_train, y_train)
    return model

def train_xgb_regressor(X_train, y_train, **kwargs):
    """Fit an XGBRegressor built from ``kwargs`` and return the fitted model."""
    regressor = XGBRegressor(**kwargs)
    regressor.fit(X_train, y_train)
    return regressor

def evaluate_regression(y_true, y_pred):
    """Return MAE, RMSE and R2 for a set of regression predictions.

    Returns:
        dict with keys 'MAE', 'RMSE' and 'R2'.
    """
    squared_error = mean_squared_error(y_true, y_pred)
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(squared_error),
        'R2': r2_score(y_true, y_pred),
    }

def evaluate_classification(y_true, y_pred, target_names=['non-adopted', 'adopted']):
    """Build a classification report as a DataFrame.

    The report dict produced by ``classification_report`` is transposed so
    each row is one entry of the report and the columns are its metrics.
    """
    metrics_by_label = classification_report(
        y_true,
        y_pred,
        target_names=target_names,
        output_dict=True,
    )
    return pd.DataFrame(metrics_by_label).T

def train_logistic_classification(X_train, y_train, **kwargs):
    """Fit a LogisticRegression model and return it.

    ``max_iter=1000`` is a default only; a caller-supplied ``max_iter``
    overrides it. Previously passing ``max_iter`` raised a TypeError for a
    duplicated keyword argument.

    Args:
        X_train: training features.
        y_train: training labels.
        **kwargs: forwarded to the LogisticRegression constructor.

    Returns:
        The fitted model.
    """
    kwargs.setdefault('max_iter', 1000)
    model = LogisticRegression(**kwargs)
    model.fit(X_train, y_train)
    return model

def train_rf_classifier(X_train, y_train, **kwargs):
    """Fit a RandomForestClassifier built from ``kwargs`` and return the fitted model."""
    forest = RandomForestClassifier(**kwargs)
    forest.fit(X_train, y_train)
    return forest

def train_rf_regressor(X_train, y_train, **kwargs):
    """Fit a RandomForestRegressor built from ``kwargs`` and return the fitted model."""
    forest = RandomForestRegressor(**kwargs)
    forest.fit(X_train, y_train)
    return forest


# --- Step 1: Preprocessing
# Yes/no style columns; they are also treated as categorical features below.
boolean_cols = ['Is_returned', 'is_mix']

categorical_cols = [
    'animal_type',
    'primary_breed_harmonized',
    'primary_color_harmonized',
    'sex',
    'intake_type_harmonized',
    'has_name',
    *boolean_cols,
]

numerical_cols = [
    'Num_returned',
    'age_months',
    'stay_length_days',
    'min_height',
    'max_height',
    'min_weight',
    'max_weight',
    'min_expectancy',
    'max_expectancy',
    'grooming_frequency_value',
    'shedding_value',
    'energy_level_value',
    'trainability_value',
    'demeanor_value',
]

# Classification target and the full feature list (categorical first, then numerical)
target_col_cls = 'adopt_label'
features_cls = [*categorical_cols, *numerical_cols]

# --- Data Preparation ---
# NOTE: this step rebinds and mutates `df` (in-place encoding, then row filtering).
# Reload the CSV before re-running it; otherwise the label encoders are refit on
# values that are already integer codes.
df = encode_booleans(df, boolean_cols)
df, encoders = encode_categorical(df, categorical_cols)

# Build the binary adoption target (1 = adopted, 0 = non-adopted).
# The label is read from 'outcome_type_harmonized_grouped' when the dataset provides
# it; otherwise fall back to the regrouped column derived earlier in this notebook,
# which uses the same 'adopted' / 'non-adopted' vocabulary.
# NOTE(review): confirm which of the two groupings is the intended label source.
outcome_group_col = 'outcome_type_harmonized_grouped'
if outcome_group_col not in df.columns:
    outcome_group_col = 'outcome_type_harmonized_regrouped'
df[target_col_cls] = df[outcome_group_col].map({'adopted': 1, 'non-adopted': 0})

# Rows whose outcome group is neither 'adopted' nor 'non-adopted' have a NaN target
# and are dropped here together with rows missing any feature. Once the NaNs are
# gone the target can be stored as an integer instead of a float.
df = df.dropna(subset=features_cls + [target_col_cls]).astype({target_col_cls: int})

# Temporal split by outcome year: train 2018-2023, validation 2024, test 2025
train_df = df[df['outcome_year'].between(2018, 2023)]
val_df = df[df['outcome_year'] == 2024]
test_df = df[df['outcome_year'] == 2025]

# Scale numericals: the scaler is fitted on the training split only and then
# re-used for validation and test
train_df, scaler = scale_numerical(train_df, numerical_cols)
val_df, _ = scale_numerical(val_df, numerical_cols, scaler)
test_df, _ = scale_numerical(test_df, numerical_cols, scaler)

# --- Step 2a: Train adoption classifiers (XGBoost and logistic regression) ---
# NOTE(review): features_cls includes 'stay_length_days'; presumably that value is
# only known once the outcome has happened - confirm this is not target leakage.
X_train_cls = train_df[features_cls]
y_train_cls = train_df[target_col_cls]
X_val_cls = val_df[features_cls]
y_val_cls = val_df[target_col_cls]
X_test_cls = test_df[features_cls]
y_test_cls = test_df[target_col_cls]

# --- XGBoost classifier ---
clf_model = train_xgb_classifier(X_train_cls, y_train_cls, max_depth=6, n_estimators=150, learning_rate=0.05)

# --- Logistic regression classifier ---
logclf_model = train_logistic_classification(X_train_cls, y_train_cls, solver='lbfgs', C=1.0)

# --- Step 3a: Evaluate adopt score prediction ---
# --- XGBoost classifier: validation and test reports ---
y_pred_val_cls = clf_model.predict(X_val_cls)
print("\n Val Classification Report for adoption score (XGBoost):")
print(evaluate_classification(y_val_cls, y_pred_val_cls))

y_pred_test_cls = clf_model.predict(X_test_cls)
print("\n Test Classification Report for adoption score (XGBoost):")
print(evaluate_classification(y_test_cls, y_pred_test_cls))

# --- Logistic regression classifier: validation and test reports ---
y_pred_val_logclf = logclf_model.predict(X_val_cls)
print("\n Validation Classification Report for adoption score (Logistic):")
print(evaluate_classification(y_val_cls, y_pred_val_logclf))

y_pred_test_logclf = logclf_model.predict(X_test_cls)
print("\n Test Classification Report for adoption score (Logistic):")
print(evaluate_classification(y_test_cls, y_pred_test_logclf))

# Probability of the positive class (column 1) on the test split, for both models
y_pred_proba = clf_model.predict_proba(X_test_cls)[:, 1]
logclf_proba = logclf_model.predict_proba(X_test_cls)[:, 1]

# Store the probabilities as 0-100 scores rounded to one decimal; a row counts as a
# predicted adoption when its XGBoost score is at least 50
test_df = test_df.copy()
test_df["xgb_predicted_adopt_score"] = np.round(y_pred_proba * 100, 1).astype(np.float32)
test_df["log_predicted_adopt_score"] = np.round(logclf_proba * 100, 1).astype(np.float32)
test_df["is_adopted_prediction"] = test_df["xgb_predicted_adopt_score"] >= 50
combined_df = test_df.copy()

# --- Step 2b: Train regressors for stay_length_days using XGBoost and Random Forest ---
# The regression features are the classification features without the target itself.
# Note that these frames are sliced from `df`, i.e. the numerical columns are NOT the
# standardized ones used by the adoption classifiers.
features_reg = [col for col in features_cls if col != 'stay_length_days']
df_reg = df.dropna(subset=features_reg + ['stay_length_days'])

# Same temporal split as the classifiers: train 2018-2023, validation 2024, test 2025
train_df_reg = df_reg[df_reg['outcome_year'].between(2018, 2023)]
val_df_reg = df_reg[df_reg['outcome_year'] == 2024]
test_df_reg = df_reg[df_reg['outcome_year'] == 2025]

X_train_reg = train_df_reg[features_reg]
y_train_reg = train_df_reg['stay_length_days']
X_val_reg = val_df_reg[features_reg]
y_val_reg = val_df_reg['stay_length_days']
X_test_reg = test_df_reg[features_reg]
y_test_reg = test_df_reg['stay_length_days']

# XGBoost
reg_model = train_xgb_regressor(X_train_reg, y_train_reg, max_depth=6, n_estimators=100, learning_rate=0.05)
# Random Forest
reg_model_rf = train_rf_regressor(X_train_reg, y_train_reg, n_estimators=100, max_depth=10, random_state=42)

# --- Step 3b: Evaluate regression ---
# --- XGBoost regression evaluation ---
y_pred_val_reg = reg_model.predict(X_val_reg)
print("\n Val Regression Metrics for stay_length (XGBoost):")
print(evaluate_regression(y_val_reg, y_pred_val_reg))

y_pred_test_reg = reg_model.predict(X_test_reg)
print("\n Test Regression Metrics for stay_length (XGBoost):")
print(evaluate_regression(y_test_reg, y_pred_test_reg))

# --- Random Forest regression evaluation ---
y_pred_val_reg_rf = reg_model_rf.predict(X_val_reg)
print("\n Val Regression Metrics for stay_length (Random Forest):")
print(evaluate_regression(y_val_reg, y_pred_val_reg_rf))

y_pred_test_reg_rf = reg_model_rf.predict(X_test_reg)
print("\n Test Regression Metrics for stay_length (Random Forest):")
print(evaluate_regression(y_test_reg, y_pred_test_reg_rf))

# --- Step 3d: Store the test predictions of both models (rounded to one decimal) ---
# 'predicted_stay_length_days' duplicates the XGBoost column and is kept under its
# historical name.
test_df_reg = test_df_reg.copy()
test_df_reg["predicted_stay_length_days"] = np.round(y_pred_test_reg, 1).astype(np.float32)
test_df_reg["predicted_stay_length_days_xgb"] = np.round(y_pred_test_reg, 1).astype(np.float32)
test_df_reg["predicted_stay_length_days_rf"] = np.round(y_pred_test_reg_rf, 1).astype(np.float32)

# Keep stay-length predictions only for test rows that the adoption classifier
# predicted as adopted. The later left-join onto the full test frame then yields NaN
# for every other row. (No join is done here: combined_df is rebuilt from test_df in
# the combine step, so joining at this point would be discarded.)
mask = combined_df["is_adopted_prediction"].astype(bool)
test_df_reg = test_df_reg[test_df_reg.index.isin(combined_df[mask].index)]
# --- Step 4: Non-Adopted Outcome Subtype Classification using Random Forest and XGBoost ---
# Map each harmonized non-adopted outcome to one of three subtypes
# (rescue / foster / return_to_owner); outcomes not listed here map to NaN.
non_adopted_subtypes = {
    "rescue": "rescue",
    "foster": "foster",
    "return to owner": "return_to_owner",
    "foster to adopt": "foster",
    "return to rescue": "rescue",
    "rtf": "foster"
}
df["non_adopted_label"] = df["outcome_type_harmonized"].map(non_adopted_subtypes)

# Keep only non-adopted rows that have every feature and a subtype label
df_non_adopted = df[df['outcome_type_harmonized_regrouped'] == 'non-adopted'].copy()
df_non_adopted = df_non_adopted.dropna(subset=categorical_cols + numerical_cols + ['non_adopted_label'])

# Temporal split: train 2018-2023, validation 2024 (the test split is built below)
train_df_reg_na = df_non_adopted[df_non_adopted['outcome_year'].between(2018, 2023)]
val_df_reg_na = df_non_adopted[df_non_adopted['outcome_year'] == 2024]

# --- Test rows the XGBoost adoption classifier scored below 50 ---
test_df_with_scores = test_df.copy()
test_df_with_scores = test_df_with_scores[test_df_with_scores["xgb_predicted_adopt_score"] < 50]

# The subtype test set is the 2025 non-adopted rows restricted to those low-score
# rows, i.e. rows that are both truly non-adopted and predicted as non-adopted
test_df_reg_na = df_non_adopted[df_non_adopted['outcome_year'] == 2025]
test_df_reg_na = test_df_reg_na[test_df_reg_na.index.isin(test_df_with_scores.index)]


# Features come from `df_non_adopted` (sliced from `df`), not from the scaled frames
X_train_na = train_df_reg_na[categorical_cols + numerical_cols]
y_train_na = train_df_reg_na["non_adopted_label"]
X_val_na = val_df_reg_na[categorical_cols + numerical_cols]
y_val_na = val_df_reg_na["non_adopted_label"]
X_test_na = test_df_reg_na[categorical_cols + numerical_cols]
y_test_na = test_df_reg_na["non_adopted_label"]

# --- Encode labels for non-adopted subtypes ---
# The encoder is fitted on the labels of all three splits together
label_encoder = LabelEncoder()
label_encoder.fit(pd.concat([y_train_na, y_val_na, y_test_na], axis=0))

y_train_encoded = label_encoder.transform(y_train_na)
y_val_encoded = label_encoder.transform(y_val_na)
y_test_encoded = label_encoder.transform(y_test_na)

# --- Random Forest Classifier for Non-adopted Subtypes ---
# Trained on the string labels (not the encoded ones), so its predictions are strings
rf_na_model = train_rf_classifier(X_train_na, y_train_na, n_estimators=100, max_depth=10, random_state=42, class_weight="balanced")
# Validation/Test performance
y_pred_val_na = rf_na_model.predict(X_val_na)
y_pred_test_na = rf_na_model.predict(X_test_na)

print("\n Val Set Classification Report (Random Forest):")
print(classification_report(y_val_na, y_pred_val_na))
print("\n Test Set Classification Report (Random Forest):")
print(classification_report(y_test_na, y_pred_test_na))

# --- XGBoost Classifier for Non-Adopted Subtypes ---
# Trained on the integer-encoded labels, so its predictions are integer codes
xgb_na_model = train_xgb_classifier(X_train_na, y_train_encoded, max_depth=10, n_estimators=100, learning_rate=0.05, random_state=42)
y_pred_val_xgb_na = xgb_na_model.predict(X_val_na)
y_pred_test_xgb_na = xgb_na_model.predict(X_test_na)
y_proba_test_xgb_na = xgb_na_model.predict_proba(X_test_na)

print("\n Val Set Classification Report (XGBoost):")
print(classification_report(y_val_encoded, y_pred_val_xgb_na, target_names=label_encoder.classes_))
print("\n Test Set Classification Report (XGBoost):")
print(classification_report(y_test_encoded, y_pred_test_xgb_na, target_names=label_encoder.classes_))


# --- Step 5: Combine all test-set predictions into one frame ---

# Attach the predicted non-adopted subtype of both models to the subtype test rows;
# the XGBoost integer predictions are converted back to label strings
test_df_reg_na = test_df_reg_na.copy()
test_df_reg_na["rf_predicted_non_adopted_label"] = y_pred_test_na
test_df_reg_na["xgb_predicted_non_adopted_label"] = label_encoder.inverse_transform(y_pred_test_xgb_na)

# One probability column per subtype class, named xgb_proba_<class>
for i, class_label in enumerate(label_encoder.classes_):
    test_df_reg_na[f"xgb_proba_{class_label}"] = y_proba_test_xgb_na[:, i]

# Start again from the scored test frame; everything else is joined on the index
combined_df = test_df.copy()

# Merge predicted stay lengths (rows without a prediction in test_df_reg get NaN)
combined_df = combined_df.join(
    test_df_reg[["predicted_stay_length_days_xgb", "predicted_stay_length_days_rf"]]
)

# Merge predicted non-adopted subtypes (rows outside test_df_reg_na get NaN)
combined_df = combined_df.join(
    test_df_reg_na[[
        "rf_predicted_non_adopted_label",
        "xgb_predicted_non_adopted_label"
    ] + [f"xgb_proba_{cls}" for cls in label_encoder.classes_]],
    how="left"
)

# Final combined_df now contains:
# - Adoption scores from XGBoost and Logistic Regression
# - Stay length predictions from XGBoost and Random Forest
# - Subtype predictions from RF and XGBoost
# - XGBoost probability distribution over subtype classes

# Preview combined_df
print("\n Final Combined DataFrame (Sample):")
display(combined_df.head())



In [None]:
# Inspect which columns the combined prediction frame contains
combined_df.columns

In [None]:
# Columns shown when previewing the combined predictions
display_columns = [
    # identification and categorical descriptors
    "animal_id",
    "has_name",
    "animal_type",
    "primary_breed_harmonized",
    "primary_color_harmonized",
    "sex",
    "intake_type_harmonized",
    "shelter",
    # adoption scores and stay-length prediction
    "xgb_predicted_adopt_score",
    "log_predicted_adopt_score",
    "predicted_stay_length_days_xgb",
    # non-adopted subtype prediction and class probabilities
    "xgb_predicted_non_adopted_label",
    "xgb_proba_foster",
    "xgb_proba_rescue",
    "xgb_proba_return_to_owner",
    # selected numerical features and the adoption decision
    "Num_returned",
    "age_months",
    "stay_length_days",
    "is_adopted_prediction",
]

preview = combined_df[display_columns]
preview.head(10)


In [None]:
# --- Step 6: Decode categorical values in combined_df ---
categorical_to_decode = [
    "animal_type", "primary_breed_harmonized", "primary_color_harmonized",
    "sex", "intake_type_harmonized", 'has_name', 'Is_returned', 'is_mix'
]

for col in categorical_to_decode:
    if col not in encoders or col not in combined_df.columns:
        continue
    codes = combined_df[col]
    # Only integer-typed columns still hold label codes. A column that was already
    # decoded (e.g. this cell was run before) is left as it is.
    if not pd.api.types.is_integer_dtype(codes):
        continue
    le = encoders[col]
    in_range = codes.between(0, len(le.classes_) - 1)
    # Build the decoded column separately and replace the whole column at once,
    # instead of writing strings into an integer column through .loc (a
    # dtype-incompatible assignment that recent pandas versions deprecate).
    decoded = codes.astype(object)
    decoded[in_range] = le.inverse_transform(codes[in_range].to_numpy())
    combined_df[col] = decoded

In [None]:
# --- Step 7: Decode numerical values in combined_df ---
numerical_cols_decodable = [col for col in numerical_cols if col in combined_df.columns]

# Always un-scale from test_df, which still holds the standardized values, rather
# than from combined_df itself. This keeps the cell idempotent: re-running it no
# longer applies the inverse transform a second time. The scaler was fitted on all
# numerical_cols in this order, so the full set is passed to it.
scaled_source = test_df[numerical_cols]
inversed_matrix = scaler.inverse_transform(scaled_source)
inversed_frame = pd.DataFrame(inversed_matrix, index=scaled_source.index, columns=numerical_cols)

# Replace the columns with the inverse-transformed values (aligned on the index)
for col in numerical_cols_decodable:
    combined_df[col] = inversed_frame[col]

# --- Round adopt scores to integers (0-100 scale) ---
# These are the column names actually created when the scores were stored; the
# previous names did not exist, so the rounding was silently skipped.
score_cols = ["xgb_predicted_adopt_score", "log_predicted_adopt_score"]
for col in score_cols:
    if col in combined_df.columns:
        combined_df[col] = combined_df[col].round(0).astype("float64").astype("Int64")

In [None]:
# Preview the same display columns again, now with decoded values
combined_df[display_columns].head(10)

In [None]:
# --- SHAP for Test Data ---
# The training features are passed to the explainer as background data
explainer = shap.Explainer(clf_model, X_train_cls)
shap_values = explainer(X_test_cls)

# Add SHAP values as separate <feature>_SHAP columns. They are attached through the
# test-frame index rather than by row position, so each value lands on the row it
# was computed for.
shap_array = shap_values.values
feature_names = shap_values.feature_names
shap_frame = pd.DataFrame(
    shap_array,
    index=X_test_cls.index,
    columns=[f"{feature}_SHAP" for feature in feature_names],
)
for shap_col in shap_frame.columns:
    combined_df[shap_col] = shap_frame[shap_col]

# --- Visualize SHAP for the first test example ---
print("SHAP Waterfall Plot for Test Example:")
shap.plots.waterfall(shap_values[0])
plt.show()


In [None]:
# Inspect the columns again; the <feature>_SHAP columns should now be present
combined_df.columns

In [None]:
# Summary plot of the test-set SHAP values against the test features
shap.summary_plot(shap_values, X_test_cls)
plt.show()

In [None]:
#shap.summary_plot(shap_values_train, X_train_cls)
#plt.show()

In [None]:
# --- Step 8: SHAP Waterfall Plot for Specific Animal ---
animal_id = "A1354267"  # Replace with your desired animal ID

# Use the final decoded dataframe
decoded_df = combined_df.copy()

# Get all matching rows for the specified animal ID
matching_rows = decoded_df[decoded_df['animal_id'].astype(str) == animal_id]

if matching_rows.empty:
    print(f"No records found for Animal ID: {animal_id}")
else:
    # SHAP columns and the feature each one belongs to
    shap_cols = [col for col in decoded_df.columns if col.endswith('_SHAP')]
    feature_names = [col[:-len('_SHAP')] for col in shap_cols]

    # Base value: the explainer's expected value, which is in the same units as the
    # SHAP values. The mean of the 0-100 adoption score is on a different scale and
    # must not be mixed with them.
    # NOTE(review): the explainer is presumably a tree explainer with raw
    # (log-odds) output - confirm against the SHAP documentation.
    base_value = float(np.ravel(explainer.expected_value)[0])

    for idx, row in matching_rows.iterrows():
        # A separate local name keeps the global `shap_values` Explanation intact
        # for the other SHAP cells.
        row_shap_values = row[shap_cols].to_numpy(dtype=float)
        feature_values = row[feature_names].values

        # Create SHAP explanation object showing the decoded feature values
        shap_explanation = shap.Explanation(
            values=row_shap_values,
            base_values=base_value,
            data=feature_values,
            feature_names=feature_names
        )

        # Print info and plot
        print(f"\n🐾 Waterfall Plot for Animal ID: {animal_id} (Index: {idx})")
        shap.plots.waterfall(shap_explanation)
        plt.show()


In [None]:
animal_id = "A1354267"  # Replace with any valid animal_id

# Feature lists (not used by the report below; kept so the names stay defined)
numerical_features_clf = [
    'Num_returned', 'age_months', 'stay_length_days', 'min_height', 'max_height',
    'min_weight', 'max_weight', 'min_expectancy', 'max_expectancy',
    'grooming_frequency_value', 'shedding_value', 'energy_level_value',
    'trainability_value', 'demeanor_value'
]
categorical_columns = [
    'animal_type', 'primary_breed_harmonized', 'primary_color_harmonized',
    'sex', 'intake_type_harmonized', 'Is_returned', 'has_name', 'is_mix', 'shelter'
]
all_feature_names = numerical_features_clf + categorical_columns

# Filter rows for the animal_id
animal_rows = decoded_df[decoded_df["animal_id"].astype(str) == animal_id]

if animal_rows.empty:
    print(f"No records found for Animal ID: {animal_id}")
else:
    # SHAP columns and the feature each one belongs to
    shap_cols = [col for col in decoded_df.columns if col.endswith('_SHAP')]
    features = [col[:-len('_SHAP')] for col in shap_cols]

    for idx, row in animal_rows.iterrows():
        adoption_score = row['xgb_predicted_adopt_score']
        predicted_stay = row.get('predicted_stay_length_days_rf', np.nan)

        # A separate local name keeps the global `shap_values` Explanation intact
        # for the other SHAP cells; the float cast allows numeric ranking.
        row_shap_values = row[shap_cols].to_numpy(dtype=float)
        feature_values = row[features].values

        shap_df = pd.DataFrame({
            "feature": features,
            "value": feature_values,
            "shap": row_shap_values
        })

        # Top 3 contributors in the direction of the adoption decision: the largest
        # positive SHAP values when the score is at least 50, otherwise the most
        # negative ones.
        if adoption_score >= 50:
            top_contributors = shap_df[shap_df["shap"] > 0].nlargest(3, "shap")
        else:
            top_contributors = shap_df[shap_df["shap"] < 0].nsmallest(3, "shap")

        print(f"\n===============================")
        print(f"Animal ID: {animal_id} | Record Index: {idx}")
        print(f"Adoption Score: {adoption_score:.2f}")
        print(f"Predicted Stay Length: {predicted_stay:.2f}")

        print("\n🔍 Top 3 SHAP Contributors:")
        for _, row_contrib in top_contributors.iterrows():
            impact = "↑" if row_contrib["shap"] > 0 else "↓"
            print(f"  • {row_contrib['feature']} = {row_contrib['value']} ({impact} impact of {row_contrib['shap']:.2f})")


In [None]:
# --- Multiclass ROC / AUC for the non-adopted subtype classifiers ---
# One-vs-rest indicator matrix of the encoded test labels
n_classes = len(label_encoder.classes_)
y_test_binarized = label_binarize(y_test_encoded, classes=range(n_classes))

# Class probabilities and encoded predictions of both models
y_score_rf = rf_na_model.predict_proba(X_test_na)
y_pred_rf_encoded = label_encoder.transform(y_pred_test_na)
y_pred_xgb_encoded = y_pred_test_xgb_na  # Already encoded since XGB was trained on y_train_encoded
y_score_xgb = y_proba_test_xgb_na

# Per-class ROC curve and AUC for the Random Forest
fpr, tpr, roc_auc = {}, {}, {}
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_score_rf[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot one curve per class plus the chance diagonal
plt.figure(figsize=(10, 7))
colors = cycle(['aqua', 'darkorange', 'cornflowerblue', 'green', 'red', 'purple'])
for (i, class_name), color in zip(enumerate(label_encoder.classes_), colors):
    curve_label = f"{class_name} (AUC = {roc_auc[i]:.2f})"
    plt.plot(fpr[i], tpr[i], color=color, lw=2, label=curve_label)

plt.plot([0, 1], [0, 1], 'k--', lw=2)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve - Non-Adopted Outcome Classifier (RF)')
plt.legend(loc="lower right")
plt.grid(True)
plt.tight_layout()
plt.show()

# --- Random Forest: report and macro AUC ---
auc_rf = roc_auc_score(y_test_binarized, y_score_rf, average='macro', multi_class='ovr')
print("\n Random Forest Classification Report:")
print(classification_report(y_test_encoded, y_pred_rf_encoded, target_names=label_encoder.classes_))
print(f"Random Forest Macro AUC: {auc_rf:.3f}")

# --- XGBoost: report and macro AUC ---
auc_xgb = roc_auc_score(y_test_binarized, y_score_xgb, average='macro', multi_class='ovr')
print("\n XGBoost Classification Report:")
print(classification_report(y_test_encoded, y_pred_xgb_encoded, target_names=label_encoder.classes_))
print(f"XGBoost Macro AUC: {auc_xgb:.3f}")

In [None]:
# Final look at the columns of the combined prediction frame
combined_df.columns