In [188]:
import os

In [189]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml\\AI-powered-Bank-Product-Recommender-Chatbot'

In [190]:
os.chdir('../.')

In [191]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml'

In [192]:
project_dir = "C:/Users/RICH-FILES/Desktop/ml/AI-powered-Bank-Product-Recommender-Chatbot"
os.chdir(project_dir)

In [193]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class FeatureImportanceConfig:
    """Configuration for feature importance analysis.
    """
    root_dir: Path
    grid_search_model: Path
    training_data_path: Path
    test_data_path: Path
    feature_importance_file: Path
    
    
    
    

In [194]:
from BankProducts.constants import *
from BankProducts.utils.common import read_yaml, create_directories
from BankProducts import logger

In [195]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])
        
    def get_feature_importance_config(self) -> FeatureImportanceConfig:
        config = self.config.feature_importance 
        
        create_directories([self.config.artifacts_root])
       
        
        feature_importance_config = FeatureImportanceConfig(
            root_dir=Path(config.root_dir),
            grid_search_model=Path(config.grid_search_model),
            training_data_path=Path(config.training_data_path),
            test_data_path=Path(config.test_data_path),
            feature_importance_file=Path(config.feature_importance_file)
        )
        logger.info(f"Feature Importance Config: {feature_importance_config}")
        return feature_importance_config
        
        

In [196]:

import pandas as pd
import shap

In [197]:
class FeatureImportance:
    def __init__(self, config: FeatureImportanceConfig):
        self.config = config
        self.pipeline = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.processor = None

    def load_data(self):
        """Load training and test data."""
        self.X_train = pd.read_csv(self.config.training_data_path)
        self.X_test = pd.read_csv(self.config.test_data_path)
        logger.info("Data loaded successfully.")
        
        
    def load_model(self):
        import joblib
        import logging
        logger = logging.getLogger(__name__)

        # Load pipeline
        self.pipeline = joblib.load(self.config.grid_search_model)

        # Extract steps
        self.model = self.pipeline.named_steps['classifier']
        self.processor = self.pipeline.named_steps['preprocessor']

        # Transform training data
        X_transformed = self.processor.transform(self.X_train)
        if hasattr(X_transformed, "toarray"):
            self.X_train = X_transformed.toarray()
        else:
            self.X_train = X_transformed

        # Get feature names
        self.feature_names = self.processor.get_feature_names_out()

        logger.info("Model loaded successfully.")

    def shap_analysis(self, top_n: int = 20):
        import shap
        import pandas as pd
        import numpy as np
        import matplotlib.pyplot as plt

        explainer = shap.TreeExplainer(self.model)
        shap_values = explainer.shap_values(self.X_train)

        # Handle multiclass SHAP (shape: [samples, features, classes])
        if isinstance(shap_values, list) or len(shap_values.shape) == 3:
            # Take absolute SHAP, mean over classes, then mean over samples
            shap_values_mean = np.abs(shap_values).mean(axis=0).mean(axis=0)  # shape: (features,)
        else:
            shap_values_mean = np.abs(shap_values).mean(axis=0)  # binary or regression

        # Create Series for feature importances
        feature_importance = pd.Series(shap_values_mean, index=self.feature_names).sort_values(ascending=False)

        # Logging
        logger.info(f"Top {top_n} important features by SHAP values:")
        for i, (feature, score) in enumerate(feature_importance.head(top_n).items(), start=1):
            logger.info(f"{i}. {feature}: {score:.4f}")

        # SHAP summary plot
        shap.summary_plot(shap_values, self.X_train, feature_names=self.feature_names, show=False)
        plt.tight_layout()
        plt.savefig("artifacts/feature_importance/shap_summary_plot.png", dpi=300)
        plt.close()

        # Bar chart for top N
        top_features = feature_importance.head(top_n)
        plt.figure(figsize=(10, 6))
        top_features.plot(kind="barh", color="skyblue")
        plt.gca().invert_yaxis()
        plt.title(f"Top {top_n} SHAP Feature Importances")
        plt.xlabel("Mean |SHAP value|")
        plt.tight_layout()
        plt.savefig("artifacts/feature_importance/shap_top_features_bar.png", dpi=300)
        plt.close()

        return feature_importance



                
        #def mape(y_true, y_pred):
            #y_true, y_pred = np.array(y_true), np.array(y_pred)
            #return np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 1)))

        #mape_value = mape(y_encoded, y_pred_selected) * 100
        #print(f"MAPE with selected features: {mape_value:.2f} %")

        #logger.info("Top important features selected using SHAP.")
        return feature_importance
    

In [198]:
try:
    config = ConfigurationManager()
    feature_importance_config = config.get_feature_importance_config()
    feature_imp = FeatureImportance(config = feature_importance_config)
    feature_imp.load_data()
    feature_imp.load_model()
    feature_importance = feature_imp.shap_analysis()
    # Save as CSV or JSON depending on your config
    feature_importance.to_csv(feature_importance_config.feature_importance_file, header=True)
    logger.info(f"Feature importance saved to {feature_importance_config.feature_importance_file}")
except Exception as e:
    logger.exception(f"An error occurred: {e}")
    raise e 


[2025-05-26 22:34:07,498: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-26 22:34:07,503: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-26 22:34:07,508: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-26 22:34:07,509: INFO: common: created directory at: artifacts]
[2025-05-26 22:34:07,511: INFO: common: created directory at: artifacts]
[2025-05-26 22:34:07,512: INFO: 2574497744: Feature Importance Config: FeatureImportanceConfig(root_dir=WindowsPath('artifacts/feature_importance'), grid_search_model=WindowsPath('artifacts/model_training/grid_search_model.joblib'), training_data_path=WindowsPath('artifacts/data_transformation/train_data.csv'), test_data_path=WindowsPath('artifacts/data_transformation/test_data.csv'), feature_importance_file=WindowsPath('artifacts/feature_importance/feature_importance.json'))]
[2025-05-26 22:34:07,527: INFO: 535375605: Data loaded successfully.]
[2025-05-26 22:34:07,582: INFO: 535375605: 

[2025-05-26 22:34:09,330: ERROR: 3472536603: An error occurred: Length of values (5) does not match length of index (464)]
Traceback (most recent call last):
  File "C:\Users\RICH-FILES\AppData\Local\Temp\ipykernel_3968\3472536603.py", line 7, in <module>
    feature_importance = feature_imp.shap_analysis()
  File "C:\Users\RICH-FILES\AppData\Local\Temp\ipykernel_3968\535375605.py", line 59, in shap_analysis
    feature_importance = pd.Series(shap_values_mean, index=self.feature_names).sort_values(ascending=False)
  File "c:\Users\RICH-FILES\anacoda4\envs\bankprod\lib\site-packages\pandas\core\series.py", line 575, in __init__
    com.require_length_match(data, index)
  File "c:\Users\RICH-FILES\anacoda4\envs\bankprod\lib\site-packages\pandas\core\common.py", line 573, in require_length_match
    raise ValueError(
ValueError: Length of values (5) does not match length of index (464)


ValueError: Length of values (5) does not match length of index (464)