1. Load the Preprocessed Data

In [None]:
import numpy as np

# Load the preprocessed train/test split from the NPZ archive.
# NOTE(review): the path has no directory separator; it may have been meant as
# 'data/app_domain_train_test_sets.npz' -- confirm against the actual file layout.
data_path = 'dataapp_domain_train_test_sets.npz'

# np.load on an .npz returns a lazy NpzFile that keeps the archive open, so read
# the arrays inside a context manager to release the file handle afterwards.
# NOTE(review): allow_pickle=True lets NumPy unpickle object arrays, which can run
# arbitrary code. Keep it only if this archive is trusted and really holds object
# arrays; otherwise switch it to False.
with np.load(data_path, allow_pickle=True) as data:
    # Extract the training and testing sets (each lookup reads the array into memory)
    X_train = data['X_train']
    X_test = data['X_test']
    y_train = data['y_train']
    y_test = data['y_test']

2. Hyperparameter Tuning via GridSearchCV and Imbalanced-learn Pipeline

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

def _resampled_pipeline(sampler_step, sampler, classifier):
    """Return an imblearn Pipeline: one resampling step followed by the classifier."""
    return Pipeline(steps=[(sampler_step, sampler), ('classifier', classifier)])


# Resamplers (seeded) used to rebalance the classes inside each CV training fold
over_sampler = SMOTE(random_state=42)
under_sampler = RandomUnderSampler(random_state=42)

# Candidate classifiers
rf = RandomForestClassifier(random_state=42)
ridge = RidgeClassifier()

# Search spaces; keys are prefixed with the pipeline step name 'classifier'
param_grid_rf = {
    'classifier__n_estimators': [100, 200],
    'classifier__max_depth': [10, 20],
    'classifier__class_weight': [None, 'balanced'],
}
param_grid_ridge = {
    'classifier__alpha': [1.0, 0.1, 0.01],
    'classifier__class_weight': [None, 'balanced'],
}

# One pipeline per (resampler, classifier) combination
pipeline_rf_over = _resampled_pipeline('oversample', over_sampler, rf)
pipeline_rf_under = _resampled_pipeline('undersample', under_sampler, rf)
pipeline_ridge_over = _resampled_pipeline('oversample', over_sampler, ridge)
pipeline_ridge_under = _resampled_pipeline('undersample', under_sampler, ridge)

# (pipeline, search space, display name) for every configuration to tune
grids = [
    (pipeline_rf_over, param_grid_rf, 'Random Forest with Over-sampling'),
    (pipeline_rf_under, param_grid_rf, 'Random Forest with Under-sampling'),
    (pipeline_ridge_over, param_grid_ridge, 'Ridge with Over-sampling'),
    (pipeline_ridge_under, param_grid_ridge, 'Ridge with Under-sampling'),
]

# Run a 5-fold grid search (scored by ROC AUC) per configuration and keep the
# refitted best estimator under its display name.
best_estimators = {}
for candidate, search_space, label in grids:
    grid_search = GridSearchCV(candidate, search_space, scoring='roc_auc', cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_estimators[label] = grid_search.best_estimator_
    print(f"Best parameters for {label}: {grid_search.best_params_}")
    print(f"Best ROC AUC for {label}: {grid_search.best_score_}")


3. Evaluate the Best Model

In [None]:
from sklearn.metrics import confusion_matrix, roc_auc_score, classification_report
from sklearn.model_selection import cross_val_score

# Select the best model by cross-validated ROC AUC on the TRAINING data only.
# Choosing the winner by its test-set score would leak the test set into model
# selection and make the metrics reported below optimistically biased.
# cross_val_score clones each pipeline, so the fitted estimators stay untouched.
cv_roc_auc = {
    name: cross_val_score(
        model, X_train, y_train, scoring='roc_auc', cv=5, n_jobs=-1
    ).mean()
    for name, model in best_estimators.items()
}
best_model_name = max(cv_roc_auc, key=cv_roc_auc.get)
best_model = best_estimators[best_model_name]
print(f"Selected model: {best_model_name} (CV ROC AUC: {cv_roc_auc[best_model_name]})")

# Predict hard labels on the test set (used for the confusion matrix and report)
y_pred = best_model.predict(X_test)

# ROC AUC needs a continuous ranking score, not hard labels. Use the decision
# function when the final estimator has one (e.g. RidgeClassifier), otherwise
# the predicted probability of the positive class (e.g. RandomForestClassifier).
if hasattr(best_model, 'decision_function'):
    y_score = best_model.decision_function(X_test)
else:
    y_score = best_model.predict_proba(X_test)[:, 1]

# Compute confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Compute ROC AUC score from the continuous scores
roc_auc = roc_auc_score(y_test, y_score)
print(f"ROC AUC: {roc_auc}")

# Ensure ROC AUC < 0.82
# NOTE(review): the 0.82 threshold is not explained anywhere in this notebook -- confirm its origin.
if roc_auc >= 0.82:
    print("Warning: ROC AUC score is greater than or equal to 0.82. Model might be overfitting.")

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


4. Feature Importance

4.1 Global Feature Importance Using SHAP

In [None]:
import shap

# Build an explainer for the final estimator of the selected pipeline, using the
# training data as the background distribution, and explain the test set.
explainer = shap.Explainer(best_model['classifier'], X_train)
shap_values = explainer(X_test)

# Depending on the model and shap version, a classifier can yield one set of
# attributions per class, i.e. values shaped (samples, features, classes).
# summary_plot expects a 2-D (samples, features) matrix, so keep only the
# attributions for the positive class (index 1) in that case.
if shap_values.values.ndim == 3:
    shap_values = shap_values[:, :, 1]

# Global feature importance
shap.summary_plot(shap_values, X_test)


4.2 Local Feature Importance Using SHAP

In [None]:
# Local explanation for a single instance (first row of the test set)
shap.initjs()

# force_plot's legacy signature wants a scalar base value and a plain 1-D array
# of attributions, not an Explanation slice, so unpack both from the row.
first_row = shap_values[0]
row_values = np.asarray(first_row.values)
row_base = np.asarray(first_row.base_values)

# Per-class output, shaped (features, classes): explain the positive class (index 1).
if row_values.ndim == 2:
    row_values = row_values[:, 1]
    row_base = row_base[1]

shap.force_plot(row_base.item(), row_values, X_test[0, :])
