In [78]:
import os

In [79]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml\\AI-powered-Bank-Product-Recommender-Chatbot'

In [80]:
# Move the working directory one level up from wherever the kernel currently is.
# NOTE(review): this is relative to the current directory, so re-running the
# cell climbs one more level each time.
os.chdir(os.pardir)

In [81]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml'

In [82]:
# Pin the working directory to the project root so that the relative paths
# used below (e.g. config/config.yaml, artifacts/...) resolve.
# Set BANK_PRODUCTS_PROJECT_DIR to run the notebook from another checkout;
# when it is unset the original author's location is used unchanged.
project_dir = os.environ.get(
    "BANK_PRODUCTS_PROJECT_DIR",
    "C:/Users/RICH-FILES/Desktop/ml/AI-powered-Bank-Product-Recommender-Chatbot",
)
os.chdir(project_dir)

In [83]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class FeatureImportanceConfig:
    """Immutable settings for the SHAP feature-importance stage.

    Instances are built by ``ConfigurationManager.get_feature_importance_config``
    from the ``feature_importance`` section of the config file plus the
    ``target_column`` entry of the schema file. ``frozen=True`` means the
    fields cannot be reassigned after construction.
    """
    # Value of the stage's ``root_dir`` config entry (not read by the code in this notebook).
    root_dir: Path
    # Serialized pipeline; loaded with ``joblib.load`` and expected to expose
    # the ``preprocessor`` and ``log_regression`` named steps.
    grid_search_model: Path
    # Value of the ``training_data_path`` config entry (not read by the code in this notebook).
    training_data_path: Path
    # CSV read with ``pd.read_csv``; SHAP values are computed on its feature columns.
    test_data_path: Path
    # JSON file the per-feature importances are written to; the plots are
    # saved in a ``plots`` folder next to it.
    feature_importance_file: Path
    # Name of the label column, dropped from the test data before transforming it.
    target_column: str
    
    
    
    

In [84]:
from BankProducts.constants import *
from BankProducts.utils.common import read_yaml, create_directories
from BankProducts import logger

In [85]:
class ConfigurationManager:
    """Load the project's YAML files and build per-stage configuration objects.

    The config, params and schema files are read once in ``__init__`` and kept
    on the instance; the artifacts root directory named in the config file is
    created at the same time.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Read the three YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the main config file.
            params_filepath: Location of the parameters file.
            schema_filepath: Location of the schema file.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_feature_importance_config(self) -> FeatureImportanceConfig:
        """Build the configuration for the feature-importance stage.

        Reads the ``feature_importance`` section of the config file and the
        ``target_column`` entry of the schema file, and makes sure the stage's
        own output directory exists.

        Returns:
            FeatureImportanceConfig: Paths converted to ``Path`` objects plus
            the target column name.
        """
        stage_cfg = self.config.feature_importance
        target_entry = self.schema.target_column

        # The artifacts root is already created in __init__; what this stage
        # needs is its own output directory.
        create_directories([stage_cfg.root_dir])

        feature_importance_config = FeatureImportanceConfig(
            root_dir=Path(stage_cfg.root_dir),
            grid_search_model=Path(stage_cfg.grid_search_model),
            training_data_path=Path(stage_cfg.training_data_path),
            test_data_path=Path(stage_cfg.test_data_path),
            feature_importance_file=Path(stage_cfg.feature_importance_file),
            target_column=target_entry.name,
        )
        logger.info(f"Feature Importance Config: {feature_importance_config}")
        return feature_importance_config
        
        

In [86]:
import pandas as pd

In [87]:
class FeatureImportance:
    """SHAP-based feature importance for the trained pipeline.

    The pipeline is loaded from ``config.grid_search_model``, the test set from
    ``config.test_data_path``; importances are written as JSON to
    ``config.feature_importance_file`` and plots to a ``plots`` folder next to
    that file.
    """

    def __init__(self, config: "FeatureImportanceConfig"):
        self.config = config

    @staticmethod
    def _to_sample_feature_array(shap_values):
        """Normalise explainer output to one array with samples on axis 0.

        Args:
            shap_values: Either a single array, or a list/tuple holding one
                ``(samples, features)`` array per class (the layout returned
                by older SHAP releases for multi-output models).

        Returns:
            numpy.ndarray: The input as an array. A per-class list becomes
            ``(samples, features, classes)``; a one-element list becomes its
            single ``(samples, features)`` array.
        """
        import numpy as np

        if isinstance(shap_values, (list, tuple)):
            per_class = [np.asarray(values) for values in shap_values]
            if len(per_class) == 1:
                return per_class[0]
            # Put the class axis last so both SHAP layouts look the same downstream.
            return np.stack(per_class, axis=-1)
        return np.asarray(shap_values)

    @staticmethod
    def _mean_abs_importance(shap_array, feature_names):
        """Rank features by mean absolute SHAP value.

        Args:
            shap_array: ``(samples, features)`` or
                ``(samples, features, classes)`` SHAP values.
            feature_names: One label per feature column.

        Returns:
            pandas.Series: Mean |SHAP| per feature (averaged over classes
            first when a class axis is present), sorted in descending order.
        """
        import numpy as np
        import pandas as pd

        magnitudes = np.abs(shap_array)
        if magnitudes.ndim == 3:
            magnitudes = magnitudes.mean(axis=2)
        per_sample = pd.DataFrame(magnitudes, columns=feature_names)
        return per_sample.mean().sort_values(ascending=False)

    @staticmethod
    def _save_beeswarm(shap_matrix, features, output_path, title=None):
        """Draw a SHAP beeswarm for a 2-D SHAP matrix and save it.

        Args:
            shap_matrix: ``(samples, features)`` SHAP values.
            features: Feature values matching ``shap_matrix`` column for column.
            output_path: Where the PNG is written.
            title: Optional title placed on the plot.

        Returns:
            The ``output_path`` that was written.
        """
        import matplotlib.pyplot as plt
        import shap

        opened = plt.figure(figsize=(12, 6))
        try:
            shap.summary_plot(shap_matrix, features, plot_type="dot", show=False)
            drawn = plt.gcf()
            if title is not None:
                plt.title(title)
            drawn.savefig(output_path, bbox_inches='tight')
        finally:
            # Close the figure that was drawn on as well as the one opened
            # here, so no empty figure is left behind for the notebook to show.
            plt.close(plt.gcf())
            plt.close(opened)
        return output_path

    def important_feature(self):
        """Compute SHAP importances on the test set, then save them and the plots.

        Side effects: prints the ten most important features, writes the
        importances as JSON, writes a bar plot, and writes the beeswarm plot(s).
        With a single-output model one ``shap_beeswarm_plot.png`` is written.
        With a multiclass model one ``shap_beeswarm_plot_class_<i>.png`` per
        class is written instead, because a dot summary plot needs a 2-D
        ``(samples, features)`` matrix.

        Returns:
            pandas.Series: Mean |SHAP| per transformed feature, sorted in
            descending order.
        """
        # Imported lazily: these are heavy and only needed when the stage runs.
        import joblib
        import matplotlib.pyplot as plt
        import numpy as np
        import pandas as pd
        import shap

        logger.info("Starting SHAP feature importance calculation...")

        # Load data
        test_data = pd.read_csv(self.config.test_data_path)
        test_x = test_data.drop(columns=[self.config.target_column])

        # Load model and preprocessor.
        # NOTE(review): joblib.load unpickles the file; only load artifacts
        # produced by this project.
        pipeline = joblib.load(self.config.grid_search_model)
        preprocessor = pipeline.named_steps['preprocessor']
        model = pipeline.named_steps['log_regression']

        # Transform data (densify if the transformer returns a sparse matrix)
        X_processed = preprocessor.transform(test_x)
        if hasattr(X_processed, 'toarray'):
            X_processed = X_processed.toarray()

        # Get feature names
        try:
            feature_names = preprocessor.get_feature_names_out()
        except AttributeError:
            # Fallback when the preprocessor has no get_feature_names_out.
            # Assumes transformer 0 is numeric and transformer 1 is a
            # categorical encoder - TODO confirm against the training code.
            num_features = preprocessor.transformers_[0][2]
            cat_encoder = preprocessor.transformers_[1][1]
            cat_features = cat_encoder.get_feature_names_out(preprocessor.transformers_[1][2])
            feature_names = np.concatenate([num_features, cat_features])

        X_df = pd.DataFrame(X_processed, columns=feature_names)

        # SHAP Explainer
        explainer = shap.LinearExplainer(model, X_df, feature_perturbation="interventional")
        shap_array = self._to_sample_feature_array(explainer.shap_values(X_df))

        # Handle binary or multiclass
        shap_importance = self._mean_abs_importance(shap_array, X_df.columns)

        # Show top features
        print("Top Important Features:")
        print(shap_importance.head(10))

        # Create folder for plots
        plots_dir = self.config.feature_importance_file.parent / "plots"
        plots_dir.mkdir(parents=True, exist_ok=True)

        # 1. Bar Plot
        bar_fig, bar_ax = plt.subplots(figsize=(10, 6))
        try:
            shap_importance.head(20).plot(kind='barh', ax=bar_ax)
            bar_ax.set_title("Top 20 Feature Importances (Mean SHAP Value)")
            bar_ax.invert_yaxis()  # largest importance at the top
            bar_fig.tight_layout()
            bar_plot_path = plots_dir / "shap_bar_plot.png"
            bar_fig.savefig(bar_plot_path)
        finally:
            plt.close(bar_fig)

        # 2. Beeswarm Plot(s)
        if shap_array.ndim == 3:
            # A 3-D array handed to summary_plot is not drawn as a per-feature
            # beeswarm, so plot each class's 2-D slice separately.
            n_classes = shap_array.shape[2]
            class_labels = getattr(model, "classes_", None)
            if class_labels is None or len(class_labels) != n_classes:
                class_labels = list(range(n_classes))
            beeswarm_paths = [
                self._save_beeswarm(
                    shap_array[:, :, class_idx],
                    X_df,
                    plots_dir / f"shap_beeswarm_plot_class_{class_idx}.png",
                    title=f"SHAP beeswarm - class {class_labels[class_idx]}",
                )
                for class_idx in range(n_classes)
            ]
        else:
            beeswarm_paths = [
                self._save_beeswarm(shap_array, X_df, plots_dir / "shap_beeswarm_plot.png")
            ]

        # Save SHAP importance to JSON
        shap_importance.to_json(self.config.feature_importance_file)
        logger.info(f"SHAP values saved to {self.config.feature_importance_file}")
        logger.info(f"SHAP bar plot saved to {bar_plot_path}")
        for beeswarm_path in beeswarm_paths:
            logger.info(f"SHAP beeswarm plot saved to {beeswarm_path}")

        return shap_importance



In [88]:
# Run the feature-importance stage end to end: build the stage config from the
# YAML files, compute the SHAP importances, and log where the result was saved.
try:
    config = ConfigurationManager()
    feature_importance_config = config.get_feature_importance_config()
    feature_imp = FeatureImportance(config = feature_importance_config)

    feature_imp.important_feature()

    logger.info(f"Feature importance saved to {feature_importance_config.feature_importance_file}")
except Exception as e:
    # Record the failure with its traceback, then let it propagate so the cell
    # still fails visibly.
    logger.exception(f"An error occurred: {e}")
    raise


[2025-06-13 23:33:03,337: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-06-13 23:33:03,375: INFO: common: yaml file: params.yaml loaded successfully]


[2025-06-13 23:33:03,399: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-06-13 23:33:03,406: INFO: common: created directory at: artifacts]
[2025-06-13 23:33:03,409: INFO: common: created directory at: artifacts]
[2025-06-13 23:33:03,411: INFO: 1520614144: Feature Importance Config: FeatureImportanceConfig(root_dir=WindowsPath('artifacts/feature_importance'), grid_search_model=WindowsPath('artifacts/model_training/grid_search_model.joblib'), training_data_path=WindowsPath('artifacts/data_transformation/train_data.csv'), test_data_path=WindowsPath('artifacts/data_transformation/test_data.csv'), feature_importance_file=WindowsPath('artifacts/feature_importance/feature_importance.json'), target_column='recommendedoffer')]
[2025-06-13 23:33:03,414: INFO: 1332980845: Starting SHAP feature importance calculation...]




Top Important Features:
num__monthlyincome                       23.776206
cat__productcategory_Savings Account      5.618137
cat__productcategory_Credit Card          4.943945
cat__productcategory_Mortgage             3.873505
cat__productcategory_Loan                 3.554549
cat__productcategory_Checking Account     3.444262
cat__most_used_channel_ATM                2.506376
cat__most_used_channel_Branch             2.152710
cat__productsubcategory_Student           1.488896
cat__most_used_channel_Mobile             1.483481
dtype: float64
[2025-06-13 23:33:08,891: INFO: 1332980845: SHAP values saved to artifacts\feature_importance\feature_importance.json]
[2025-06-13 23:33:08,892: INFO: 1332980845: SHAP bar plot saved to artifacts\feature_importance\plots\shap_bar_plot.png]
[2025-06-13 23:33:08,893: INFO: 1332980845: SHAP beeswarm plot saved to artifacts\feature_importance\plots\shap_beeswarm_plot.png]
[2025-06-13 23:33:08,896: INFO: 516678016: Feature importance saved to artifacts

<Figure size 1200x600 with 0 Axes>