In [None]:
#Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
import warnings
import os

In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [None]:
# Mount Google Drive at /content/drive (Colab-only) so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the dataset CSV from the mounted Drive folder
# (presumably already feature-encoded, judging by the file name - confirm).
df_encoded=pd.read_csv('/content/drive/My Drive/encoded_data/df_encoded.csv')

In [None]:
# Feature matrix: every column except the target. Target: the 'Approved_Flag' column.
x = df_encoded.drop(columns=['Approved_Flag'])
y = df_encoded['Approved_Flag']

In [None]:
# Column names, dtypes and non-null counts for the full frame (target included).
df_encoded.info()

In [None]:
# Turn the target labels into integer codes. The fitted encoder is kept because
# later cells use label_encoder.classes_ to name the rows of the reports.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [None]:
# Hold out 20% of the rows as the test split (fixed seed; no stratification is requested here).
X_train, X_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

In [None]:
# Column names, dtypes and non-null counts for the feature matrix only.
x.info()

In [None]:
# Number of rows per target class (shows how balanced the classes are).
y.value_counts()

In [None]:
## Logistic regression: fit

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that
# same fitted transform to both splits so no test information is used.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression

# Baseline multinomial logistic regression on the standardised training features.
# NOTE(review): recent scikit-learn releases deprecate `multi_class` - confirm
# against the installed version before upgrading.
lr_model = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    max_iter=1000,
    random_state=42,
)
lr_model.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the baseline model on the held-out test split.
from sklearn.metrics import classification_report, accuracy_score

y_pred = lr_model.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred))
# target_names maps the integer codes back to the original class labels.
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

In [None]:
# Coefficients and Odds Ratios
# NOTE: coef_[0] selects only the first row of the coefficient matrix. For a
# multiclass model that is the row for the first class (encoded label 0), so the
# table below does not describe the other classes.
coefficients = lr_model.coef_[0]
odds_ratios = np.exp(coefficients)


# Display feature importance using coefficients and odds ratios
# (features were standardised, so each coefficient is per one standard deviation).
feature_importance = pd.DataFrame({
    'Feature': x.columns,
    'Coefficient': coefficients,
    'Odds Ratio': odds_ratios
})
print("\nFeature Importance (Coefficient and Odds Ratio):")
print(feature_importance.sort_values(by='Coefficient', ascending=False))

In [None]:
# Permutation Importance
# Scores each feature by how much the baseline model's test score changes when
# that feature's column is shuffled (30 repeats, fixed seed, all CPU cores).
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import RFE  # imported but not used in this cell
perm_importance = permutation_importance(lr_model, X_test_scaled, y_test, n_repeats=30, random_state=42, n_jobs=-1)
perm_importance_df = pd.DataFrame({
    'Feature': x.columns,
    'Importance Mean': perm_importance.importances_mean,
    'Importance Std': perm_importance.importances_std
})
print("\nPermutation Importance:")
print(perm_importance_df.sort_values(by='Importance Mean', ascending=False))

In [None]:
# Same configuration as the baseline model, plus class_weight='balanced'.
lr_model_w = LogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    class_weight='balanced',  # Automatically adjusts for class imbalance
    max_iter=1000,
    random_state=42
)
lr_model_w.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the class-weighted ('balanced') model on the held-out test split.
from sklearn.metrics import classification_report, accuracy_score

y_pred_w = lr_model_w.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred_w))
print(classification_report(y_test, y_pred_w, target_names=label_encoder.classes_))

In [None]:
# Hand-picked weight per encoded class label; label 2 (P3) gets the largest weight.
custom_weights = {0: 1.5, 1: 0.8, 2: 2.0, 3: 1.2}  # Try increasing weight for P3 (label 2)
# Model is only constructed here; it is fitted in the next cell. No random_state is set.
lr_custom = LogisticRegression(multi_class='multinomial', solver='lbfgs', class_weight=custom_weights, max_iter=1000)

In [None]:
# Fit the custom-weighted model, then evaluate it on the held-out test split.
from sklearn.metrics import classification_report, accuracy_score
lr_custom.fit(X_train_scaled, y_train)
y_pred_custom = lr_custom.predict(X_test_scaled)
print("Accuracy:", accuracy_score(y_test, y_pred_custom))
print(classification_report(y_test, y_pred_custom, target_names=label_encoder.classes_))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Store different weight combinations to test.
# Each dict maps an encoded class label (0-3) to its class weight.
weight_options = [
    {0: 1, 1: 1, 2: 1, 3: 1},               # baseline (no weighting)
    {0: 1.5, 1: 0.7, 2: 2.5, 3: 1.5},       # trial 1
    {0: 1.2, 1: 0.9, 2: 3.0, 3: 1.8},       # trial 2 (boost P3/P4 more)
    {0: 1.0, 1: 1.0, 2: 2.0, 3: 2.0},       # trial 3 (balanced emphasis)
    {0: 1.3, 1: 0.8, 2: 2.8, 3: 1.6},       # trial 4
]

In [None]:
from sklearn.model_selection import train_test_split

# Fit one multinomial logistic regression per candidate weight mapping and print
# its test accuracy and per-class report. `model` and `y_pred` are overwritten on
# every pass, so after the loop they belong to the last mapping only.
for i, weights in enumerate(weight_options):
    print(f"\n🔎 Trial {i + 1} with weights: {weights}")
    model = LogisticRegression(
        multi_class='multinomial',
        solver='lbfgs',
        max_iter=1000,
        class_weight=weights,
    ).fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Weight mapping carried forward (label 2 / P3 weighted highest).
custom_weights = {0: 1.5, 1: 0.8, 2: 2.0, 3: 1.2}

In [None]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Pipeline: standardise features, then L1-penalised multinomial logistic regression.
# NOTE: despite the name, this pipeline has no PolynomialFeatures step - only
# scaling and the classifier are included.
poly_logreg_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('logreg', LogisticRegression(
        max_iter=1000,
        solver='saga',          # supports L1
        penalty='l1',           # L1 for feature selection
        class_weight={0: 1.5, 1: 0.8, 2: 2.0, 3: 1.2},  # best weight trial
        multi_class='multinomial',
        random_state=42
    ))
])

In [None]:
# Fit on the unscaled training split (the pipeline scales internally) and
# predict the test split. This overwrites the notebook-level y_pred.
poly_logreg_pipeline.fit(X_train, y_train)
y_pred = poly_logreg_pipeline.predict(X_test)

In [None]:
# Test-split metrics for the pipeline. No target_names are passed here, so the
# report rows are labelled with the integer class codes.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

In [None]:
from sklearn.model_selection import cross_val_score

# 5-Fold Cross Validation
# Runs on the full data (x, y_encoded), not only the training split; the
# pipeline is re-fitted inside each fold, scaler included.
cv_scores = cross_val_score(
    poly_logreg_pipeline, x, y_encoded,
    cv=5,
    scoring='accuracy',
    n_jobs=-1
)

print("Cross-Validation Accuracy Scores:", cv_scores)
print("Mean Accuracy:", cv_scores.mean())
print("Standard Deviation:", cv_scores.std())


In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import itertools
from tqdm import tqdm

# Define parameter grid (5 * 4 * 4 * 3 * 3 = 720 combinations; each one is fitted once below)
param_dist = {
    'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.9],
    'learning_rate': [1, 0.1, 0.01, 0.001],
    'max_depth': [3, 5, 8, 10],
    'reg_alpha': [1, 10, 100],
    'n_estimators': [10, 50, 100]
}

# Generate all combinations as a list of {param_name: value} dicts
keys, values = zip(*param_dist.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

# Split dataset, stratified on the original labels (uses `x` and `y_encoded` defined earlier).
# NOTE: this overwrites X_train / X_test / y_train / y_test. X_train_scaled and
# X_test_scaled are not recomputed here and still refer to the earlier split.
X_train, X_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42, stratify=y)

results = []

# Loop over combinations
for params in tqdm(param_combinations, desc="Running Grid Search"):
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=4,
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42,
        **params
    )
    model.fit(X_train, y_train)

    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # NOTE(review): combinations are ranked on test-split metrics, so the top
    # rows are optimistic estimates of generalisation.
    test_acc = accuracy_score(y_test, y_pred_test)
    train_acc = accuracy_score(y_train, y_pred_train)
    f1 = f1_score(y_test, y_pred_test, average='weighted')

    results.append({
        **params,
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'f1_score': f1
    })

# Convert to DataFrame (one row per parameter combination)
results_df = pd.DataFrame(results)

# Optional: save to CSV (written to the current working directory)
results_df.to_csv("xgboost_grid_search_results.csv", index=False)

# Show top 5 combinations by weighted F1 on the test split
print(results_df.sort_values(by='f1_score', ascending=False).head())

In [None]:
# Mount Google Drive (repeats the mount done near the top of the notebook)
# before the results are written there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Persist the stratified grid-search results to Google Drive.
results_df.to_csv('/content/drive/MyDrive/xgboost_grid_search_results.csv', index=False)

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Choose one of the good ones, e.g., index 580
# Hyperparameters are hard-coded here, not read from results_df.
best_params = {
    'colsample_bytree': 0.9,
    'learning_rate': 1.0,
    'max_depth': 3,
    'reg_alpha': 10,
    'n_estimators': 50,
    'objective': 'multi:softprob',  # softprob so predict_proba is the native output
    'num_class': 4,
    'eval_metric': 'mlogloss',
    'use_label_encoder': False,
    'random_state': 42
}

# Fit on the current (unscaled) training split.
xgb_best = XGBClassifier(**best_params)
xgb_best.fit(X_train, y_train)

In [None]:
# Class probabilities for the test split; the arg-max over columns is the predicted class index.
y_probs = xgb_best.predict_proba(X_test)
y_preds = y_probs.argmax(axis=1)

print(classification_report(y_test, y_preds))
print(confusion_matrix(y_test, y_preds))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay

# Plot the confusion matrix of the arg-max predictions from the previous cell.
ConfusionMatrixDisplay.from_predictions(y_test, y_preds, cmap="Blues")
plt.title("Confusion Matrix - XGBoost Best Model")
plt.show()

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

# Define threshold evaluation function
def evaluate_class2_thresholds(y_probs, y_true, thresholds=np.arange(0.3, 0.61, 0.05)):
    """Sweep a probability cut-off for class 2 and report the resulting F1 scores.

    For every threshold, a sample is labelled 2 whenever its class-2
    probability is strictly greater than the threshold; otherwise the
    arg-max class of its probability row is kept.

    Parameters
    ----------
    y_probs : iterable of probability rows
        One row per sample; position 2 holds the class-2 probability.
    y_true : array-like
        True labels. Must support elementwise ``== 2`` (e.g. a NumPy array).
    thresholds : iterable of float
        Cut-offs to try. Default: 0.30 to 0.60 in steps of 0.05.

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns ``threshold``, ``macro_f1``,
        ``weighted_f1`` and ``class2_f1`` (binary F1 of class 2 vs rest).
    """
    def _label(row, cutoff):
        # Class 2 wins outright once it clears the cut-off; else plain arg-max.
        return 2 if row[2] > cutoff else np.argmax(row)

    rows = []
    for cutoff in thresholds:
        preds = [_label(row, cutoff) for row in y_probs]
        rows.append({
            'threshold': cutoff,
            'macro_f1': f1_score(y_true, preds, average='macro'),
            'weighted_f1': f1_score(y_true, preds, average='weighted'),
            'class2_f1': f1_score((y_true == 2), (np.array(preds) == 2)),
        })

    return pd.DataFrame(rows)

# Predict probabilities for the test split with the current xgb_best model
y_probs = xgb_best.predict_proba(X_test)
# Ground truth as a plain array (unwraps .values when y_test is a pandas object)
y_true = y_test.values if hasattr(y_test, "values") else y_test

# Evaluate thresholds (default grid of the helper defined above)
threshold_results_df = evaluate_class2_thresholds(y_probs, y_true)

# Plotting: one line per F1 variant as a function of the class-2 threshold
plt.figure(figsize=(10, 6))
plt.plot(threshold_results_df['threshold'], threshold_results_df['macro_f1'], label='Macro F1')
plt.plot(threshold_results_df['threshold'], threshold_results_df['weighted_f1'], label='Weighted F1')
plt.plot(threshold_results_df['threshold'], threshold_results_df['class2_f1'], label='Class 2 F1', linestyle='--')
plt.xlabel("Threshold for Class 2")
plt.ylabel("F1 Score")
plt.title("Threshold Optimization for Class 2")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Show top results, best class-2 F1 first
print(threshold_results_df.sort_values(by='class2_f1', ascending=False).head())

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import itertools
from tqdm import tqdm

# Define parameter grid (5 * 4 * 4 * 3 * 3 = 720 combinations; each one is fitted once below)
param_dist = {
    'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.9],
    'learning_rate': [1, 0.1, 0.01, 0.001],
    'max_depth': [3, 5, 8, 10],
    'reg_alpha': [1, 10, 100],
    'n_estimators': [10, 50, 100]
}

# Generate all combinations as a list of {param_name: value} dicts
keys, values = zip(*param_dist.items())
param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

# Split dataset WITHOUT stratification (uses `x` and `y_encoded` defined earlier).
# This overwrites X_train / X_test / y_train / y_test.
X_train, X_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

results = []

# Loop over combinations
for params in tqdm(param_combinations, desc="Running Grid Search"):
    model = XGBClassifier(
        objective='multi:softmax',
        num_class=4,
        eval_metric='mlogloss',
        use_label_encoder=False,
        random_state=42,
        **params
    )
    model.fit(X_train, y_train)

    y_pred_test = model.predict(X_test)
    y_pred_train = model.predict(X_train)

    # NOTE(review): combinations are ranked on test-split metrics, so the top
    # rows are optimistic estimates of generalisation.
    test_acc = accuracy_score(y_test, y_pred_test)
    train_acc = accuracy_score(y_train, y_pred_train)
    f1 = f1_score(y_test, y_pred_test, average='weighted')

    results.append({
        **params,
        'train_accuracy': train_acc,
        'test_accuracy': test_acc,
        'f1_score': f1
    })

# Convert to DataFrame (kept separate from the stratified run's `results_df`)
results_df_1 = pd.DataFrame(results)

# Save THIS run's results. Previously `results_df` (the stratified run) was
# written under the "without_stratify" file name and printed below.
results_df_1.to_csv("xgboost_grid_search_results_without_stratify.csv", index=False)

# Show top 5 combinations of the non-stratified run by weighted F1
print(results_df_1.sort_values(by='f1_score', ascending=False).head())

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Best parameters from grid search (you can change if you prefer another row)
# Values are hard-coded; `objective` and `num_class` are not set in this dict.
best_params = {
    'colsample_bytree': 0.9,
    'learning_rate': 1.0,
    'max_depth': 3,
    'reg_alpha': 10,
    'n_estimators': 100,
    'random_state': 42,
    'use_label_encoder': False,
    'eval_metric': 'mlogloss'
}

# Train final model
# NOTE(review): X_train_scaled / X_test_scaled were produced by the StandardScaler
# cell from the split that existed at that point, while y_train / y_test come from
# the most recent train_test_split. The rows only line up if both splits are
# identical - confirm before trusting these metrics.
xgb_best = XGBClassifier(**best_params)
xgb_best.fit(X_train_scaled, y_train)

# Predictions: class probabilities, then arg-max over columns for the class index
y_probs = xgb_best.predict_proba(X_test_scaled)
y_preds = y_probs.argmax(axis=1)

# Evaluation
print("Classification Report:\n", classification_report(y_test, y_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_preds))

In [None]:
!pip install optuna

In [None]:
import optuna
import xgboost as xgb
from sklearn.metrics import accuracy_score

def objective(trial):
    """Optuna objective: fit an XGBoost classifier and return validation accuracy.

    Hyperparameters are sampled from ``trial``. The model is fitted on 80% of
    the notebook-level training split (``X_train`` / ``y_train``) and scored
    on the remaining 20%, so the held-out test split is never used for model
    selection.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Accuracy on the internal validation split (the study maximises it).
    """
    params = {
        'objective': 'multi:softprob',
        'num_class': len(np.unique(y_train)),
        'eval_metric': 'mlogloss',
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10),
        'n_estimators': trial.suggest_int('n_estimators', 50, 300)
    }

    # Carve a validation set out of the training data. Scoring trials against
    # X_test / y_test would leak test information into the chosen parameters.
    # The fixed seed gives every trial the same validation rows.
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
    )

    # random_state makes the row/column subsampling repeatable, so trials
    # differ only by their hyperparameters.
    model = xgb.XGBClassifier(**params, use_label_encoder=False, random_state=42)
    model.fit(X_fit, y_fit, eval_set=[(X_val, y_val)], verbose=False)
    preds = model.predict(X_val)
    return accuracy_score(y_val, preds)

# Seed the sampler so the sequence of suggested hyperparameters is repeatable
# from run to run. TPE is Optuna's default sampler, so only the seeding changes.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

In [None]:
# Per-feature importance scores of the most recently fitted xgb_best model
# (array order follows the training columns).
xgb_best.feature_importances_

In [None]:
!pip install catboost

In [None]:
!pip install lightgbm catboost xgboost

from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier

# === 1️⃣ Base models with your tuned/best parameters ===
# XGBoost base learner. Hyperparameters are hard-coded
# (presumably copied from an Optuna run - confirm).
xgb_model = xgb.XGBClassifier(
    learning_rate=0.14568258239424609,
    max_depth=6,
    subsample=0.7290186302174095,
    colsample_bytree=0.5876436160089679,
    reg_alpha=4.2192212447413855,
    reg_lambda=8.431802769532,
    n_estimators=264,
    use_label_encoder=False,
    eval_metric="mlogloss",
    random_state=42
)

# LightGBM base learner with hand-set hyperparameters.
lgb_model = lgb.LGBMClassifier(
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    n_estimators=300,
    random_state=42
)

# CatBoost base learner with hand-set hyperparameters; verbose=0 silences training logs.
cat_model = CatBoostClassifier(
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    iterations=300,
    eval_metric='Accuracy',
    random_state=42,
    verbose=0
)

# === 2️⃣ Stacking setup ===
def stacking_train_predict(X, y, X_test, n_splits=5):
    """Build meta-features for stacking from the three notebook-level base models.

    Runs stratified K-fold over ``(X, y)``. In every fold each base model
    (``xgb_model``, ``lgb_model``, ``cat_model``) is re-fitted on the training
    part, its class probabilities for the validation part are stored as
    out-of-fold predictions, and its class probabilities for ``X_test`` are
    averaged over the folds.

    Parameters
    ----------
    X : array-like, one row per training sample
        Training features. DataFrames are accepted and converted to ndarrays.
    y : array-like
        Training labels, aligned with ``X``.
    X_test : array-like
        Test features with the same columns as ``X``.
    n_splits : int, default 5
        Number of stratified folds.

    Returns
    -------
    (oof_train, test_meta) : tuple of numpy.ndarray
        Each has ``3 * n_classes`` columns, ordered XGBoost, LightGBM, CatBoost.
        ``oof_train`` has one row per row of ``X``; ``test_meta`` one per row
        of ``X_test``.

    Notes
    -----
    The global base models are fitted in place; after the call they hold the
    fit from the last fold.
    """
    # The fold indices from StratifiedKFold are positional. Converting up front
    # keeps `X[train_idx]` / `y[train_idx]` correct for DataFrame / Series input
    # too, and makes fit and predict see the same plain-array input.
    X = np.asarray(X)
    y = np.asarray(y)
    X_test = np.asarray(X_test)

    n_classes = len(np.unique(y))
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

    base_models = (xgb_model, lgb_model, cat_model)
    # One out-of-fold block and one fold-averaged test block per base model.
    oof_blocks = [np.zeros((len(X), n_classes)) for _ in base_models]
    test_blocks = [np.zeros((len(X_test), n_classes)) for _ in base_models]

    # Cross-validation loop
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"Fold {fold+1}")

        X_fold_train, X_fold_val = X[train_idx], X[val_idx]
        y_fold_train = y[train_idx]

        for base_model, oof_block, test_block in zip(base_models, oof_blocks, test_blocks):
            base_model.fit(X_fold_train, y_fold_train)
            # Each training row is predicted once, by a model that never saw it.
            oof_block[val_idx] = base_model.predict_proba(X_fold_val)
            # In-place accumulation of the mean test prediction over the folds.
            test_block += base_model.predict_proba(X_test) / n_splits

    # Stack the per-model blocks side by side for the meta learner.
    oof_train = np.hstack(oof_blocks)
    test_meta = np.hstack(test_blocks)

    return oof_train, test_meta

# === 3️⃣ Train stacked model ===
# Convert X_train, X_test to numpy arrays if they are DataFrames
# (the stacking helper indexes rows by position).
X_train_np = np.array(X_train)
X_test_np = np.array(X_test)
y_train_np = np.array(y_train)

# Out-of-fold meta-features for the training rows, fold-averaged ones for the test rows.
oof_train, test_meta = stacking_train_predict(X_train_np, y_train_np, X_test_np, n_splits=5)

# Meta learner: logistic regression on the stacked class probabilities.
meta_model = LogisticRegression(max_iter=1000, random_state=42)
meta_model.fit(oof_train, y_train_np)

# === 4️⃣ Final predictions ===
final_preds = meta_model.predict(test_meta)

print("Accuracy:", accuracy_score(y_test, final_preds))
print("\nClassification Report:\n", classification_report(y_test, final_preds))
print("Confusion Matrix:\n", confusion_matrix(y_test, final_preds))

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, average_precision_score, f1_score
import xgboost as xgb

# ✅ Best parameters from your tuning
# Fresh XGBoost model with the same hard-coded hyperparameters as the stacking cell.
xgb_model = xgb.XGBClassifier(
    learning_rate=0.14568258239424609,
    max_depth=6,
    subsample=0.7290186302174095,
    colsample_bytree=0.5876436160089679,
    reg_alpha=4.2192212447413855,
    reg_lambda=8.431802769532,
    n_estimators=264,
    use_label_encoder=False,
    eval_metric="mlogloss",
    random_state=42
)

# Train on the current (unscaled) training split
xgb_model.fit(X_train, y_train)

# ----------------------------
# Pick a class for PR curve (e.g., minority class 2)
# ----------------------------
class_of_interest = 2
y_true_binary = (y_test == class_of_interest).astype(int)  # one-vs-rest target: 1 for class_of_interest, 0 otherwise

# Probabilities for that class
y_proba = xgb_model.predict_proba(X_test)[:, class_of_interest]

# ----------------------------
# Precision–Recall curve
# ----------------------------
precision, recall, thresholds = precision_recall_curve(y_true_binary, y_proba)
pr_auc = average_precision_score(y_true_binary, y_proba)

plt.figure(figsize=(6, 6))
plt.plot(recall, precision, label=f'PR AUC (class {class_of_interest}) = {pr_auc:.3f}')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"Precision–Recall Curve for Class {class_of_interest}")
plt.legend()
plt.grid(True)
plt.show()

# ----------------------------
# Find the best threshold (maximizing F1-score)
# ----------------------------
# precision[i] / recall[i] are the scores of the predictions `y_proba >= thresholds[i]`
# (both arrays carry one extra trailing point that has no threshold). F1 for
# every threshold therefore follows directly from them, instead of one
# f1_score call per threshold over the whole test set.
# The eps floor gives F1 = 0 where precision + recall == 0.
f1_scores = (
    2 * precision[:-1] * recall[:-1]
    / np.maximum(precision[:-1] + recall[:-1], np.finfo(float).eps)
)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
print(f"Best Threshold for class {class_of_interest}: {best_threshold:.3f}")
print(f"Best F1-score: {f1_scores[best_idx]:.3f}")


In [None]:
# ----------------------------
# Same one-vs-rest PR analysis, this time for class 0
# ----------------------------
class_of_interest = 0
y_true_binary = (y_test == class_of_interest).astype(int)  # one-vs-rest target: 1 for class_of_interest, 0 otherwise

# Probabilities for that class
y_proba = xgb_model.predict_proba(X_test)[:, class_of_interest]

# ----------------------------
# Precision–Recall curve
# ----------------------------
precision, recall, thresholds = precision_recall_curve(y_true_binary, y_proba)
pr_auc = average_precision_score(y_true_binary, y_proba)

plt.figure(figsize=(6, 6))
plt.plot(recall, precision, label=f'PR AUC (class {class_of_interest}) = {pr_auc:.3f}')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"Precision–Recall Curve for Class {class_of_interest}")
plt.legend()
plt.grid(True)
plt.show()

# ----------------------------
# Find the best threshold (maximizing F1-score)
# ----------------------------
# precision[i] / recall[i] are the scores of the predictions `y_proba >= thresholds[i]`
# (both arrays carry one extra trailing point that has no threshold). F1 for
# every threshold therefore follows directly from them, instead of one
# f1_score call per threshold over the whole test set.
# The eps floor gives F1 = 0 where precision + recall == 0.
f1_scores = (
    2 * precision[:-1] * recall[:-1]
    / np.maximum(precision[:-1] + recall[:-1], np.finfo(float).eps)
)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
print(f"Best Threshold for class {class_of_interest}: {best_threshold:.3f}")
print(f"Best F1-score: {f1_scores[best_idx]:.3f}")


In [None]:
# ----------------------------
# Same one-vs-rest PR analysis, this time for class 3
# ----------------------------
class_of_interest = 3
y_true_binary = (y_test == class_of_interest).astype(int)  # one-vs-rest target: 1 for class_of_interest, 0 otherwise

# Probabilities for that class
y_proba = xgb_model.predict_proba(X_test)[:, class_of_interest]

# ----------------------------
# Precision–Recall curve
# ----------------------------
precision, recall, thresholds = precision_recall_curve(y_true_binary, y_proba)
pr_auc = average_precision_score(y_true_binary, y_proba)

plt.figure(figsize=(6, 6))
plt.plot(recall, precision, label=f'PR AUC (class {class_of_interest}) = {pr_auc:.3f}')
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"Precision–Recall Curve for Class {class_of_interest}")
plt.legend()
plt.grid(True)
plt.show()

# ----------------------------
# Find the best threshold (maximizing F1-score)
# ----------------------------
# precision[i] / recall[i] are the scores of the predictions `y_proba >= thresholds[i]`
# (both arrays carry one extra trailing point that has no threshold). F1 for
# every threshold therefore follows directly from them, instead of one
# f1_score call per threshold over the whole test set.
# The eps floor gives F1 = 0 where precision + recall == 0.
f1_scores = (
    2 * precision[:-1] * recall[:-1]
    / np.maximum(precision[:-1] + recall[:-1], np.finfo(float).eps)
)
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]
print(f"Best Threshold for class {class_of_interest}: {best_threshold:.3f}")
print(f"Best F1-score: {f1_scores[best_idx]:.3f}")


In [None]:
import numpy as np

# Your per-class thresholds (encoded class label -> minimum probability).
# Values are hard-coded. NOTE(review): presumably copied from the "Best Threshold"
# lines printed by the PR-curve cells; no cell in view computes one for class 1 - confirm.
# This rebinds `thresholds`, which earlier held the precision_recall_curve array.
thresholds = {
    0: 0.256,
    1: 0.396,
    2: 0.169,
    3: 0.389
}

# Predicted probabilities, one row per test sample and one column per class
y_scores = xgb_model.predict_proba(X_test)

# Apply per-class thresholds
y_pred_custom = []
for probs in y_scores:
    # Classes whose probability reaches their own threshold
    passed = [i for i, p in enumerate(probs) if p >= thresholds[i]]
    if passed:
        # Pick the one with highest probability among those passing threshold
        chosen = max(passed, key=lambda i: probs[i])
    else:
        # Fallback to normal argmax if no threshold passed
        chosen = np.argmax(probs)
    y_pred_custom.append(chosen)

# Overwrites the y_pred_custom produced earlier by the custom-weighted logistic regression.
y_pred_custom = np.array(y_pred_custom)

from sklearn.metrics import classification_report, confusion_matrix
# NOTE(review): these metrics are computed on X_test / y_test; if the thresholds
# were chosen on that same split, they are optimistic - confirm.
print("Classification Report with Custom Thresholds:")
print(classification_report(y_test, y_pred_custom))
print("Confusion Matrix with Custom Thresholds:")
print(confusion_matrix(y_test, y_pred_custom))

In [None]:
!pip install shap

In [None]:
import xgboost as xgb
import shap
import pandas as pd
import numpy as np

# =========================
# 1. Model to explain: the tuned XGBoost classifier fitted above
# =========================
model = xgb_model

# =========================
# 2. Compute SHAP values on the training features
# =========================
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X_train)

# =========================
# 3. Normalise both output layouts to one 2-D array per class
# =========================
if isinstance(shap_values, list):
    # Old SHAP version output: a list with one array per class
    per_class_shap = list(shap_values)
elif isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
    # New SHAP version output: (n_samples, n_features, n_classes)
    per_class_shap = [shap_values[:, :, k] for k in range(shap_values.shape[2])]
else:
    raise ValueError(f"Unexpected SHAP output shape: {type(shap_values)}, shape={np.shape(shap_values)}")

# =========================
# 4. Per-class ranking and summary plot
# =========================
for class_idx, class_shap in enumerate(per_class_shap):
    print(f"\nTop 10 important features for Class {class_idx}:")
    # Mean absolute SHAP value per feature = global importance for this class.
    shap_df = pd.DataFrame({
        "feature": X_train.columns,
        "mean_abs_shap": np.abs(class_shap).mean(axis=0)
    }).sort_values("mean_abs_shap", ascending=False).head(10)
    print(shap_df)

    # show=False leaves the figure open so a title can be added. It must then
    # be shown explicitly, otherwise every class is drawn onto the same figure.
    shap.summary_plot(class_shap, X_train, show=False)
    plt.title(f"SHAP summary - Class {class_idx}")
    plt.show()
