In [145]:
import os

In [146]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml\\AI-powered-Bank-Product-Recommender-Chatbot'

In [147]:
os.chdir('../.')

In [148]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml'

In [149]:
project_dir = "C:/Users/RICH-FILES/Desktop/ml/AI-powered-Bank-Product-Recommender-Chatbot"
os.chdir(project_dir)

In [150]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class FeatureImportanceConfig:
    """Configuration for feature importance analysis.
    """
    root_dir: Path
    grid_search_model: Path
    training_data_path: Path
    test_data_path: Path
    feature_importance_file: Path
    target_column: str
    
    
    
    

In [151]:
from BankProducts.constants import *
from BankProducts.utils.common import read_yaml, create_directories
from BankProducts import logger

In [152]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])
        
    def get_feature_importance_config(self) -> FeatureImportanceConfig:
        config = self.config.feature_importance 
        schema = self.schema.target_column
        params= self.params
        
        create_directories([self.config.artifacts_root])
       
        
        feature_importance_config = FeatureImportanceConfig(
            root_dir=Path(config.root_dir),
            grid_search_model=Path(config.grid_search_model),
            training_data_path=Path(config.training_data_path),
            test_data_path=Path(config.test_data_path),
            feature_importance_file=Path(config.feature_importance_file),
            target_column= schema.name
            
        )
        logger.info(f"Feature Importance Config: {feature_importance_config}")
        return feature_importance_config
        
        

In [153]:
import pandas as pd

In [154]:
class FeatureImportance:
    def __init__(self, config: FeatureImportanceConfig):
        self.config = config
        self.pipeline = None
        self.X_train = None
        self.X_test = None
        self.y_train = None
        self.y_test = None
        self.processor = None

    def load_data(self):
        """Load training and test data."""
        self.X_train = pd.read_csv(self.config.training_data_path)
        self.X_test = pd.read_csv(self.config.test_data_path)
        logger.info("Data loaded successfully.")
        
        
    def load_model(self):
        import joblib
        import logging
        logger = logging.getLogger(__name__)

        # Load pipeline
        self.pipeline = joblib.load(self.config.grid_search_model)

        # Extract steps
        self.model = self.pipeline.named_steps['classifier']
        self.processor = self.pipeline.named_steps['preprocessor']

        # Transform training data
        X_transformed = self.processor.transform(self.X_train)
        if hasattr(X_transformed, "toarray"):
            self.X_train = X_transformed.toarray()
        else:
            self.X_train = X_transformed

        # Get feature names
        self.feature_names = self.processor.get_feature_names_out()

        logger.info("Model loaded successfully.")

    def shap_analysis(self, top_n: int = 20):
        import shap
        import pandas as pd
        import numpy as np
        import matplotlib.pyplot as plt

        explainer = shap.TreeExplainer(self.model)
        shap_values = explainer.shap_values(self.X_train)

        # Handle multiclass SHAP values
        if isinstance(shap_values, list):
            # Multiclass: average SHAP across classes
            shap_array = np.array([np.abs(class_shap).mean(axis=0) for class_shap in shap_values])  # shape: (n_classes, n_features)
            shap_values_mean = shap_array.mean(axis=0)  # shape: (n_features,)
            # For plotting, use class 0 or sum over all
            plot_values = shap_values[0]  # you can also use np.mean(shap_values, axis=0)
        else:
            # Binary classification or regression
            shap_values_mean = np.abs(shap_values).mean(axis=0)
            plot_values = shap_values

        feature_importance = pd.Series(shap_values_mean, index=self.feature_names).sort_values(ascending=False)

        logger.info(f"Top {top_n} important features by SHAP values:")
        for i, (feature, score) in enumerate(feature_importance.head(top_n).items(), start=1):
            logger.info(f"{i}. {feature}: {score:.4f}")

        # SHAP summary plot
        shap.summary_plot(plot_values, self.X_train, feature_names=self.feature_names, show=False)
        plt.tight_layout()
        plt.savefig("artifacts/feature_importance/shap_summary_plot.png", dpi=300)
        plt.close()

        # Bar chart for top N
        top_features = feature_importance.head(top_n)
        plt.figure(figsize=(10, 6))
        top_features.plot(kind="barh", color="skyblue")
        plt.gca().invert_yaxis()
        plt.title(f"Top {top_n} SHAP Feature Importances")
        plt.xlabel("Mean |SHAP value|")
        plt.tight_layout()
        plt.savefig("artifacts/feature_importance/shap_top_features_bar.png", dpi=300)
        plt.close()

        return feature_importance
    
    
    def feature_importance(self):
        import joblib  # Fix: import joblib
        import shap
        import pandas as pd
        import numpy as np
        import matplotlib.pyplot as plt
        
        
        
        test_data = pd.read_csv(self.config.test_data_path)
        
        test_x = test_data.drop(self.config.target_column, axis=1)
        
        # change the month column to string
        #test_x['month'] = test_x['month'].astype('str')
        
        
        pipeline = joblib.load(self.config.grid_search_model)
        
         # Extract preprocessor and model from pipeline
        preprocessor = pipeline.named_steps['preprocessor']
        model = pipeline.named_steps['classifier']

        # Transform test data using preprocessor
        X_processed = preprocessor.transform(test_x)

        # Get feature names after preprocessing
        try:
            feature_names = preprocessor.get_feature_names_out()
        except AttributeError:
            num_features = preprocessor.transformers_[0][2]
            cat_encoder = preprocessor.transformers_[1][1]
            cat_features = cat_encoder.get_feature_names_out(preprocessor.transformers_[1][2])
            feature_names = np.concatenate([num_features, cat_features])

        # Convert processed features to DataFrame
        X_df = pd.DataFrame(
            X_processed.toarray() if hasattr(X_processed, 'toarray') else X_processed,
            columns=feature_names
        )
        
        print(X_df.columns)
        
         
        
        # Create SHAP explainer and compute SHAP values
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_df)

        # Generate SHAP plot
        shap.summary_plot(shap_values, X_df, show=False)
        #buf = BytesIO()
        #plt.savefig(buf, format="png", bbox_inches='tight')
        #plt.close()
        #buf.seek(0)

        # Encode image to base64
        #img_base64 = base64.b64encode(buf.getvalue()).decode('utf-8')
        #return img_base64
        
        # For regression or binary classification
        shap_df = pd.DataFrame(shap_values, columns=X_df.columns)
        shap_abs_mean = shap_df.abs().mean().sort_values(ascending=False)


        # 6. Select top N important features
        top_n = 10  # You can change this to any number or use a threshold
        top_features = shap_abs_mean.head(top_n).index.tolist()

        print("Top Important Features:")
        print(top_features)
        
        
    def feature_importances(self):
        import joblib
        import shap
        import pandas as pd
        import numpy as np
        import matplotlib.pyplot as plt
        import os

        test_data = pd.read_csv(self.config.test_data_path)
        test_x = test_data.drop(self.config.target_column, axis=1)

        pipeline = joblib.load(self.config.grid_search_model)

        preprocessor = pipeline.named_steps['preprocessor']
        model = pipeline.named_steps['classifier']

        # Transform test data using preprocessor
        X_processed = preprocessor.transform(test_x)

        # Get feature names after preprocessing
        try:
            feature_names = preprocessor.get_feature_names_out()
        except AttributeError:
            num_features = preprocessor.transformers_[0][2]
            cat_encoder = preprocessor.transformers_[1][1]
            cat_features = cat_encoder.get_feature_names_out(preprocessor.transformers_[1][2])
            feature_names = np.concatenate([num_features, cat_features])

        X_df = pd.DataFrame(
            X_processed.toarray() if hasattr(X_processed, 'toarray') else X_processed,
            columns=feature_names
        )

        print(X_df.columns)

        # Create SHAP explainer
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_df)

        # Handle multiclass vs binary/regression
        if isinstance(shap_values, list) and isinstance(shap_values[0], np.ndarray):
            # Multiclass classification (shap_values is a list of arrays)
            print("Multiclass classification detected.")
            
            # Average absolute SHAP values across all classes
            shap_array = np.abs(np.array(shap_values))  # shape: (n_classes, n_samples, n_features)
            shap_mean = shap_array.mean(axis=0)         # shape: (n_samples, n_features)
            shap_df = pd.DataFrame(shap_mean, columns=X_df.columns)

            # Optional: Save summary plots for each class
            for i, class_shap in enumerate(shap_values):
                shap.summary_plot(class_shap, X_df, show=False)
                plt.title(f"SHAP Summary - Class {i}")
                plt.savefig(f"{self.config.feature_importance_file.stem}_class_{i}.png", bbox_inches='tight')
                plt.close()

        else:
            # Binary classification or regression
            print("Binary classification or regression detected.")
            shap_df = pd.DataFrame(shap_values, columns=X_df.columns)

            shap.summary_plot(shap_values, X_df, show=False)
            plt.savefig(f"{self.config.feature_importance_file.stem}.png", bbox_inches='tight')
            plt.close()

        # Compute mean absolute SHAP values
        shap_abs_mean = shap_df.abs().mean().sort_values(ascending=False)

        # Select top N important features
        top_n = 10
        top_features = shap_abs_mean.head(top_n).index.tolist()

        print("Top Important Features:")
        print(top_features)

        # Save to JSON
        os.makedirs(self.config.feature_importance_file.parent, exist_ok=True)
        shap_abs_mean.to_json(self.config.feature_importance_file)



In [155]:
try:
    config = ConfigurationManager()
    feature_importance_config = config.get_feature_importance_config()
    feature_imp = FeatureImportance(config = feature_importance_config)
    feature_imp.feature_importances()
    #feature_imp.load_model()
    #feature_importance = feature_imp.shap_analysis()
    # Save as CSV or JSON depending on your config
    # Convert Series to DataFrame before saving
    #feature_importance_df = feature_importance.reset_index()
    #feature_importance_df.columns = ['feature', 'importance']
    #feature_importance_df.to_csv(feature_importance_config.feature_importance_file, index=False)
    logger.info(f"Feature importance saved to {feature_importance_config.feature_importance_file}")
except Exception as e:
    logger.exception(f"An error occurred: {e}")
    raise e 


[2025-05-27 21:42:44,900: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-27 21:42:44,907: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-27 21:42:44,911: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-27 21:42:44,912: INFO: common: created directory at: artifacts]
[2025-05-27 21:42:44,913: INFO: common: created directory at: artifacts]
[2025-05-27 21:42:44,914: INFO: 1520614144: Feature Importance Config: FeatureImportanceConfig(root_dir=WindowsPath('artifacts/feature_importance'), grid_search_model=WindowsPath('artifacts/model_training/grid_search_model.joblib'), training_data_path=WindowsPath('artifacts/data_transformation/train_data.csv'), test_data_path=WindowsPath('artifacts/data_transformation/test_data.csv'), feature_importance_file=WindowsPath('artifacts/feature_importance/feature_importance.json'), target_column='product_name')]


Index(['num__age', 'num__annual_income', 'num__credit_score',
       'cat__gender_Female', 'cat__gender_Male',
       'cat__occupation_Academic librarian',
       'cat__occupation_Accommodation manager',
       'cat__occupation_Accountant, chartered',
       'cat__occupation_Accountant, chartered certified',
       'cat__occupation_Accountant, chartered public finance',
       ...
       'cat__occupation_Water quality scientist',
       'cat__occupation_Web designer', 'cat__marital_status_Divorced',
       'cat__marital_status_Married', 'cat__marital_status_Single',
       'cat__financial_goals_Education', 'cat__financial_goals_Home Ownership',
       'cat__financial_goals_Retirement', 'cat__financial_goals_Savings',
       'cat__financial_goals_Travel'],
      dtype='object', length=464)
Binary classification or regression detected.
[2025-05-27 21:42:45,522: ERROR: 2208610099: An error occurred: Must pass 2-d input. shape=(758, 464, 5)]
Traceback (most recent call last):
  File "C:\Us

ValueError: Must pass 2-d input. shape=(758, 464, 5)