In [57]:
import os

In [58]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml\\AI-powered-Bank-Product-Recommender-Chatbot'

In [59]:
os.chdir('../.')

In [60]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml'

In [61]:
project_dir = "C:/Users/RICH-FILES/Desktop/ml/AI-powered-Bank-Product-Recommender-Chatbot"
os.chdir(project_dir)

In [62]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class FeatureImportanceConfig:
    """Configuration for feature importance analysis.
    """
    root_dir: Path
    grid_search_model: Path
    training_data_path: Path
    test_data_path: Path
    feature_importance_file: Path
    target_column: str
    
    
    
    

In [63]:
from BankProducts.constants import *
from BankProducts.utils.common import read_yaml, create_directories
from BankProducts import logger

In [64]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])
        
    def get_feature_importance_config(self) -> FeatureImportanceConfig:
        config = self.config.feature_importance 
        schema = self.schema.target_column
        params= self.params
        
        create_directories([self.config.artifacts_root])
       
        
        feature_importance_config = FeatureImportanceConfig(
            root_dir=Path(config.root_dir),
            grid_search_model=Path(config.grid_search_model),
            training_data_path=Path(config.training_data_path),
            test_data_path=Path(config.test_data_path),
            feature_importance_file=Path(config.feature_importance_file),
            target_column= schema.name
            
        )
        logger.info(f"Feature Importance Config: {feature_importance_config}")
        return feature_importance_config
        
        

In [65]:
import pandas as pd

In [66]:
class FeatureImportance:
    def __init__(self, config: FeatureImportanceConfig):
        self.config = config
        self.pipeline = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.processor = None

    def feature_importances(self):
        import joblib
        import shap
        import pandas as pd
        import numpy as np
        import matplotlib.pyplot as plt
        import os

        test_data = pd.read_csv(self.config.test_data_path)
        test_x = test_data.drop(self.config.target_column, axis=1)

        pipeline = joblib.load(self.config.grid_search_model)

        preprocessor = pipeline.named_steps['preprocessor']
        model = pipeline.named_steps['classifier']

        # Transform test data using preprocessor
        X_processed = preprocessor.transform(test_x)

        # Get feature names after preprocessing
        try:
            feature_names = preprocessor.get_feature_names_out()
        except AttributeError:
            num_features = preprocessor.transformers_[0][2]
            cat_encoder = preprocessor.transformers_[1][1]
            cat_features = cat_encoder.get_feature_names_out(preprocessor.transformers_[1][2])
            feature_names = np.concatenate([num_features, cat_features])

        X_df = pd.DataFrame(
            X_processed.toarray() if hasattr(X_processed, 'toarray') else X_processed,
            columns=feature_names
        )

        print(X_df.columns)

        # Create SHAP explainer
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_df)

        # Handle multiclass vs binary/regression
        if isinstance(shap_values, list) and isinstance(shap_values[0], np.ndarray):
            # Multiclass classification (shap_values is a list of arrays)
            print("Multiclass classification detected.")
            
            # Average absolute SHAP values across all classes
            shap_array = np.abs(np.array(shap_values))  # shape: (n_classes, n_samples, n_features)
            shap_mean = shap_array.mean(axis=0)         # shape: (n_samples, n_features)
            shap_df = pd.DataFrame(shap_mean, columns=X_df.columns)

            # Optional: Save summary plots for each class
            for i, class_shap in enumerate(shap_values):
                shap.summary_plot(class_shap, X_df, show=False)
                plt.title(f"SHAP Summary - Class {i}")
                plt.savefig(f"{self.config.feature_importance_file.stem}_class_{i}.png", bbox_inches='tight')
                plt.close()

        else:
            # Binary classification or regression
            print("Binary classification or regression detected.")
            shap_df = pd.DataFrame(shap_values, columns=X_df.columns)

            shap.summary_plot(shap_values, X_df, show=False)
            plt.savefig(f"{self.config.feature_importance_file.stem}.png", bbox_inches='tight')
            plt.close()

        # Compute mean absolute SHAP values
        shap_abs_mean = shap_df.abs().mean().sort_values(ascending=False)

        # Select top N important features
        top_n = 10
        top_features = shap_abs_mean.head(top_n).index.tolist()

        print("Top Important Features:")
        print(top_features)

        # Save to JSON
        os.makedirs(self.config.feature_importance_file.parent, exist_ok=True)
        shap_abs_mean.to_json(self.config.feature_importance_file)

    def important_features(self):
        
        """Compute and save SHAP feature importance for the model."""
        # Compute feature importances robustly for multiclass
        import joblib
        import shap
        import pandas as pd
        import numpy as np

        test_data = pd.read_csv(self.config.test_data_path)
        test_x = test_data.drop(self.config.target_column, axis=1)

        pipeline = joblib.load(self.config.grid_search_model)
        preprocessor = pipeline.named_steps['preprocessor']
        model = pipeline.named_steps['classifier']

        X_processed = preprocessor.transform(test_x)
        try:
            feature_names = preprocessor.get_feature_names_out()
        except AttributeError:
            num_features = preprocessor.transformers_[0][2]
            cat_encoder = preprocessor.transformers_[1][1]
            cat_features = cat_encoder.get_feature_names_out(preprocessor.transformers_[1][2])
            feature_names = np.concatenate([num_features, cat_features])

        X_df = pd.DataFrame(
            X_processed.toarray() if hasattr(X_processed, 'toarray') else X_processed,
            columns=feature_names
        )

        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_df)

        # Handle multiclass
        if isinstance(shap_values, np.ndarray) and shap_values.ndim == 3:
            # shape: (n_samples, n_features, n_classes)
            # Take mean absolute SHAP value across classes for each feature
            shap_abs = np.abs(shap_values).mean(axis=2)  # shape: (n_samples, n_features)
            shap_df = pd.DataFrame(shap_abs, columns=X_df.columns)
            shap_importance = shap_df.mean().sort_values(ascending=False)
        elif isinstance(shap_values, list) and isinstance(shap_values[0], np.ndarray):
            # shape: (n_classes, n_samples, n_features)
            shap_array = np.abs(np.array(shap_values))  # (n_classes, n_samples, n_features)
            shap_abs = shap_array.mean(axis=0)  # mean over classes -> (n_samples, n_features)
            shap_df = pd.DataFrame(shap_abs, columns=X_df.columns)
            shap_importance = shap_df.mean().sort_values(ascending=False)
        else:
            shap_df = pd.DataFrame(shap_values, columns=X_df.columns)
            shap_importance = shap_df.abs().mean().sort_values(ascending=False)

        # Print top important features
        
        print("Top Important Features:")
        print(shap_importance.head(10))

        # Optionally save to JSON
        import os
        os.makedirs(self.config.feature_importance_file.parent, exist_ok=True)
        shap_importance.to_json(self.config.feature_importance_file)

        logger.info(f"Feature importance saved to {self.config.feature_importance_file}")
        return shap_importance



In [67]:
try:
    config = ConfigurationManager()
    feature_importance_config = config.get_feature_importance_config()
    feature_imp = FeatureImportance(config = feature_importance_config)
    logger.info(f"Feature Importance Config: {feature_importance_config}")
    feature_imp.important_features()
    
    logger.info(f"Feature importance saved to {feature_importance_config.feature_importance_file}")
except Exception as e:
    logger.exception(f"An error occurred: {e}")
    raise e 


[2025-05-28 19:12:48,944: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-28 19:12:48,950: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-28 19:12:48,956: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-28 19:12:48,960: INFO: common: created directory at: artifacts]
[2025-05-28 19:12:48,964: INFO: common: created directory at: artifacts]
[2025-05-28 19:12:48,965: INFO: 1520614144: Feature Importance Config: FeatureImportanceConfig(root_dir=WindowsPath('artifacts/feature_importance'), grid_search_model=WindowsPath('artifacts/model_training/grid_search_model.joblib'), training_data_path=WindowsPath('artifacts/data_transformation/train_data.csv'), test_data_path=WindowsPath('artifacts/data_transformation/test_data.csv'), feature_importance_file=WindowsPath('artifacts/feature_importance/feature_importance.json'), target_column='product_name')]
[2025-05-28 19:12:48,967: INFO: 1239298184: Feature Importance Config: FeatureImpor