In [57]:
import os

In [58]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml\\AI-powered-Bank-Product-Recommender-Chatbot'

In [59]:
os.chdir("../.")

In [60]:
%pwd

'C:\\Users\\RICH-FILES\\Desktop\\ml'

In [61]:
project_dir = "C:/Users/RICH-FILES/Desktop/ml/AI-powered-Bank-Product-Recommender-Chatbot"
os.chdir(project_dir)

In [62]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainingConfig:
    """Configuration for model training.
    """
    model_name: str
    root_dir: Path
    test_data_dir: Path
    train_data_dir: Path
    criterion: str
    max_features: int
    min_samples_split: int
    min_samples_leaf: int
    n_estimators: int
    max_depth: int
    random_state: int
    class_weight: str
    n_jobs: int
    target_column: str

    

In [63]:
from BankProducts.constants import *
from BankProducts.utils.common import read_yaml, create_directories

In [64]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):

        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])


    def get_model_training_config(self) -> ModelTrainingConfig:
        config = self.config.model_training
        params = self.params.random_forest
        schema =  self.schema.target_column

        create_directories([self.config.artifacts_root])

        model_training_config = ModelTrainingConfig(
            root_dir=Path(config.root_dir),
            train_data_dir = Path(config.train_data_dir),
            test_data_dir = Path(config.test_data_dir),
            model_name = config.model_name,
            criterion = params.criterion,
            max_features = params.max_features,
            min_samples_split = params.min_samples_split,
            min_samples_leaf = params.min_samples_leaf,
            n_estimators = params.n_estimators, 
            max_depth = params.max_depth,
            random_state = params.random_state,
            class_weight = params.class_weight,
            n_jobs = params.n_jobs,
            target_column = schema.name
            
        )

        return model_training_config

In [65]:
import pandas as pd
import os
from BankProducts import logger
from sklearn.ensemble import RandomForestClassifier
import joblib
from pathlib import Path

from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer



In [None]:
class ModelTraining:
    def __init__(self, config: ModelTrainingConfig):
        self.config = config
        self.model = None

    def train(self):
        logger.info("Loading training data")
        train_data = pd.read_csv(self.config.train_data_dir)
        X_train = train_data.drop(columns=[self.config.target_column])
        y_train = train_data[self.config.target_column]
        
        # encode the target variable if it's categorical
        if y_train.dtype == 'object' or y_train.dtype.name == 'category':
            logger.info("Encoding target variable")
            label_encoder = LabelEncoder()
            y_train = label_encoder.fit_transform(y_train)
        else:
            logger.info("Target variable is already numeric, no encoding needed")   
        logger.info("Loading validation data")
        
        
        logger.info("Training the model")
        # Identify categorical and numerical features
        categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
        numerical_features = X_train.select_dtypes(include=['number']).columns.tolist()
      
        preprocessor = ColumnTransformer(
            transformers=[
                ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
                ('num', SimpleImputer(strategy='mean'), numerical_features)
            ],
            remainder='passthrough'  # Keep other columns as they are
        )

        # Initialize the RandomForestClassifier with the provided configuration
        model = RandomForestClassifier(
            criterion=self.config.criterion,
            max_features=self.config.max_features,
            min_samples_split=self.config.min_samples_split,
            min_samples_leaf=self.config.min_samples_leaf,
            n_estimators=self.config.n_estimators,  
            max_depth=self.config.max_depth,
            random_state=self.config.random_state,
            class_weight=self.config.class_weight,
            n_jobs=self.config.n_jobs
        )

        # Create the pipeline combining preprocessing, scaling, and modeling
        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('scaler', StandardScaler()),   # Optional: Only applies to numerical after preprocessing
            ('classifier', model)
        ])
        
        # Fit the pipeline to the training data

        pipeline.fit(X_train, y_train)

        logger.info("Saving the trained model")
        
                
        # Create directory if it doesn't exist
        model_dir = self.config.root_dir
        os.makedirs(model_dir, exist_ok=True)
        
        print("Model directory:", model_dir)
        print("Directory exists?", os.path.exists(model_dir))
        print("Is a directory?", os.path.isdir(model_dir))

                
        model_path = os.path.join(model_dir, self.config.model_name)
        
        print("Model path:", model_path)
        # Ensure the model path is a Path object
        model_path = Path(model_path)
        
        
        

        
        print("Model will be saved to:", model_path)
        
                # Safety check: if a directory exists where the model file should go, delete it
        if os.path.isdir(model_path):
            import shutil
            logger.warning(f"A folder exists at model path '{model_path}', deleting it.")
            shutil.rmtree(model_path)


   
        # Save the model using joblib
        joblib.dump(self.model, model_path, compress=4)
      
        
        logger.info("Model training completed successfully")

In [67]:
try:
    config = ConfigurationManager()
    model_training_config = config.get_model_training_config()
    model_trainer = ModelTraining(config=model_training_config)
    model_trainer.train()
except Exception as e:
    logger.exception(f"An error occurred during model training: {e}")
    

[2025-05-24 22:43:19,053: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-05-24 22:43:19,062: INFO: common: yaml file: params.yaml loaded successfully]
[2025-05-24 22:43:19,073: INFO: common: yaml file: schema.yaml loaded successfully]
[2025-05-24 22:43:19,078: INFO: common: created directory at: artifacts]
[2025-05-24 22:43:19,082: INFO: common: created directory at: artifacts]
[2025-05-24 22:43:19,086: INFO: 2501898837: Loading training data]
[2025-05-24 22:43:19,196: INFO: 2501898837: Encoding target variable]
[2025-05-24 22:43:19,203: INFO: 2501898837: Loading validation data]
[2025-05-24 22:43:19,206: INFO: 2501898837: Training the model]


[2025-05-24 22:43:22,236: INFO: 2501898837: Saving the trained model]
Model directory: artifacts\model_training
Directory exists? True
Is a directory? True
Model path: artifacts\model_training\model.joblib
Model will be saved to: artifacts\model_training\model.joblib
[2025-05-24 22:43:22,244: ERROR: 4071221513: An error occurred during model training: [Errno 13] Permission denied: 'artifacts\\model_training\\model.joblib']
Traceback (most recent call last):
  File "C:\Users\RICH-FILES\AppData\Local\Temp\ipykernel_7968\4071221513.py", line 5, in <module>
    model_trainer.train()
  File "C:\Users\RICH-FILES\AppData\Local\Temp\ipykernel_7968\2501898837.py", line 85, in train
    joblib.dump(self.model, model_path, compress=4)
  File "c:\Users\RICH-FILES\anacoda4\envs\bankprod\lib\site-packages\joblib\numpy_pickle.py", line 594, in dump
    with _write_fileobject(
  File "c:\Users\RICH-FILES\anacoda4\envs\bankprod\lib\site-packages\joblib\numpy_pickle_utils.py", line 222, in _write_fileob