In [45]:
import os

In [46]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml\\AI-powered-Bank-Product-Recommender-Chatbot'

In [47]:
os.chdir("../.")

In [48]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml'

In [49]:
project_dir = "C:/Users/RICH-FILES/Desktop/ml/AI-powered-Bank-Product-Recommender-Chatbot"
os.chdir(project_dir)

In [50]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelEvaluationConfig:
    root_dir: Path
    test_data_path: Path
    model_path: Path
    metric_file_name: Path
    target_column: str
    params: dict[str, str]
    
     
    

In [51]:
from BankProducts.constants import *
from BankProducts.utils.common  import read_yaml, create_directories
from BankProducts   import logger



In [52]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        config = self.config.model_evaluation   
        params = self.params.random_forest
        schema =  self.schema.target_column

        create_directories([self.config.artifacts_root])

        model_evaluation_config = ModelEvaluationConfig(
            root_dir=Path(config.root_dir),
            test_data_path=Path(config.test_data_path),
            model_path=Path(config.model_path),
            metric_file_name=Path(config.metric_file_name),
            target_column=schema.name,
            params=params
            
           
            
        )

        return model_evaluation_config

In [53]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import pandas as pd
from BankProducts import logger
import seaborn as sns
from matplotlib import pyplot as plt
import tempfile
import mlflow
import joblib
from sklearn.preprocessing import LabelEncoder
from BankProducts.utils.common import save_json
import numpy as np
import shap
from urllib.parse import urlparse
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.filterwarnings("ignore", category=UserWarning, module='_distutils_hack')



In [None]:
class ModelEvaluation:
    def __init__(self, config):
        self.config = config

    def eval_metrics(self, actual, pred):
        accuracy = accuracy_score(actual, pred)
        precision = precision_score(actual, pred, average='weighted')
        recall = recall_score(actual, pred, average='weighted')
        f1 = f1_score(actual, pred, average='weighted')
        return accuracy, precision, recall, f1

    def log_confusion_matrix(self, actual, predicted, class_names):
        cm = confusion_matrix(actual, predicted)
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap="Blues",
                    xticklabels=class_names, yticklabels=class_names)
        plt.xlabel("Predicted")
        plt.ylabel("Actual")
        plt.title("Confusion Matrix")
        temp_img_path = tempfile.NamedTemporaryFile(suffix=".png", delete=False).name
        plt.savefig(temp_img_path)
        plt.close()
        mlflow.log_artifact(temp_img_path, artifact_path="confusion_matrix")

    def log_classification_report(self, actual, predicted, class_names):
        report = classification_report(actual, predicted, target_names=class_names)
        temp_txt_path = tempfile.NamedTemporaryFile(suffix=".txt", delete=False).name
        with open(temp_txt_path, "w") as f:
            f.write(report)
        mlflow.log_artifact(temp_txt_path, artifact_path="bank_products_recommender")

    def log_into_mlflow(self):
        test_data = pd.read_csv(self.config.test_data_path)
        test_x = test_data.drop(columns=[self.config.target_column])
        test_y = test_data[self.config.target_column]

        # Encode the target variable
        le = LabelEncoder()
        test_y_encoded = le.fit_transform(test_y)

        logger.info("Loading model from path: %s", self.config.model_path)
        pipeline = joblib.load(self.config.model_path)

        tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme
        mlflow.set_experiment("Product Recommender")

        if mlflow.active_run():
            mlflow.end_run()

        with mlflow.start_run():
            predicted = pipeline.predict(test_x)

            accuracy, precision, recall, f1 = self.eval_metrics(test_y_encoded, predicted)

            scores = {
                "model_name": "random_classifier",
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1_score": f1
            }
            
            # Ensure directory exists
            Path(self.config.metric_file_name).parent.mkdir(parents=True, exist_ok=True)

            save_json(Path(self.config.metric_file_name), data=scores)
            
            logger.info("Metrics saved to: %s", self.config.metric_file_name)

            mlflow.log_metric("accuracy", accuracy)
            mlflow.log_metric("precision", precision)
            mlflow.log_metric("recall", recall)
            mlflow.log_metric("f1_score", f1)

            class_names = le.classes_
            self.log_confusion_matrix(test_y_encoded, predicted, class_names)
            self.log_classification_report(test_y_encoded, predicted, class_names)

            if tracking_url_type_store != "file":
                mlflow.sklearn.log_model(pipeline, "pipeline", registered_model_name="product recommender")
            else:
                mlflow.sklearn.log_model(pipeline, "pipeline")
              
            logger.info("mlflow model logged successfully.")
 
    def feature_importance(self):
        test_data = pd.read_csv(self.config.test_data_path)
        test_x = test_data.drop(columns=[self.config.target_column])
        test_y = test_data[self.config.target_column]

        pipeline = joblib.load(self.config.model_path)

        
        preprocessor = pipeline.named_steps['preprocessor']
        model = pipeline.named_steps['classifier']
        X_processed = preprocessor.transform(test_x)

        try:
            feature_names = preprocessor.get_feature_names_out()
        except AttributeError:
            num_features = preprocessor.transformers_[0][2]
            cat_encoder = preprocessor.transformers_[1][1]
            cat_features = cat_encoder.get_feature_names_out(preprocessor.transformers_[1][2])
            feature_names = np.concatenate([num_features, cat_features])

        X_df = pd.DataFrame(
            X_processed.toarray() if hasattr(X_processed, 'toarray') else X_processed,
            columns=feature_names
        )

        logger.info("Generating SHAP values...")
        explainer = shap.TreeExplainer(model)
        shap_values = explainer.shap_values(X_df)

        shap.summary_plot(shap_values, X_df, show=False)

        # Mean absolute SHAP values
        if isinstance(shap_values, list):
            shap_array = np.abs(np.array(shap_values)).mean(axis=0)
        else:
            shap_array = np.abs(shap_values).mean(axis=0)

        feature_importance = pd.DataFrame(
            list(zip(X_df.columns, shap_array)),
            columns=['Feature', 'Mean Absolute SHAP Value']
        ).sort_values(by='Mean Absolute SHAP Value', ascending=False)

        logger.info("Feature importance:\n%s", feature_importance)

        # Select top N features
        n_top_features = 5
        selected_features = feature_importance['Feature'].head(n_top_features).tolist()

        X_selected = test_x[selected_features]

        # Re-encode target variable
        le = LabelEncoder()
        y_encoded = le.fit_transform(test_y)

        # Train on selected features
        model_selected = RandomForestClassifier(random_state=42)
        model_selected.fit(X_selected, y_encoded)

        y_pred_selected = model_selected.predict(X_selected)

        # MAPE (for classification, not common, but preserved from original)
        def mape(y_true, y_pred):
            y_true, y_pred = np.array(y_true), np.array(y_pred)
            return np.mean(np.abs((y_true - y_pred) / np.maximum(y_true, 1)))

        mape_value = mape(y_encoded, y_pred_selected) * 100
        print(f"MAPE with selected features: {mape_value:.2f} %")

        logger.info("Top important features selected using SHAP.")
        return selected_features

In [55]:
try:
    config = ConfigurationManager()
    model_evaluation_config = config.get_model_evaluation_config()
    eval = ModelEvaluation(config=model_evaluation_config)
    eval.log_into_mlflow()
    #eval.feature_importance()
except Exception as e:
    print(f"An error occurred: {e}")
    
    
    

[2025-05-26 14:13:35,578: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-26 14:13:35,597: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-26 14:13:35,622: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-26 14:13:35,624: INFO: common: created directory at: artifacts]
[2025-05-26 14:13:35,625: INFO: common: created directory at: artifacts]
[2025-05-26 14:13:35,637: INFO: 1283244525: Loading model from path: artifacts\model_training\model.joblib]
[2025-05-26 14:13:36,056: INFO: common: json file saved at: artifacts\model_evaluation\metrics.json]
[2025-05-26 14:13:36,057: INFO: 1283244525: Metrics saved to: artifacts\model_evaluation\metrics.json]
An error occurred: No module named 'distutils._modified'
