In [1]:
import os

In [2]:
%pwd

'c:\\Users\\asus\\OneDrive\\Desktop\\Risk_Score_Predication_With_ML_FLOW\\research'

In [3]:
# Step up from the notebook folder to the project root so the relative
# config/artifact paths used by later cells resolve.
# Guarded on the folder name so that re-running this cell does not climb one
# more directory level on every execution.
if os.path.basename(os.getcwd()) == "research":
    os.chdir("../")

In [4]:
%pwd

'c:\\Users\\asus\\OneDrive\\Desktop\\Risk_Score_Predication_With_ML_FLOW'

In [5]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings bundle consumed by ``ModelTrainer``.

    Built by ``ConfigurationManager.get_model_trainer_config`` from the
    ``model_trainer`` section of the config file and the ``catboost`` section
    of the params file. ``frozen=True`` makes instances read-only after
    construction.
    """
    # Output directory of the training stage; it is created before training
    # and the fitted model is written inside it.
    root_dir: Path
    # Location passed to ``pd.read_csv`` for the training split (a CSV file,
    # despite the ``_dir`` suffix).
    train_data_dir: Path
    # Location passed to ``pd.read_csv`` for the test split (also a CSV file).
    test_data_dir: Path
    # File name joined onto ``root_dir`` when the model is saved with joblib.
    model_name: str
    # Forwarded to ``CatBoostRegressor(iterations=...)``.
    iterations: int
    # Forwarded to ``CatBoostRegressor(learning_rate=...)``.
    learning_rate: float 
    # Forwarded to ``CatBoostRegressor(depth=...)``.
    depth: int               
    # Forwarded to ``CatBoostRegressor(loss_function=...)``.
    loss_function: str   
    # Forwarded to ``CatBoostRegressor(early_stopping_rounds=...)``.
    early_stopping_rounds: int


In [6]:
from RiskScorePrediction.constants import *
from RiskScorePrediction.utils.common import read_yaml,create_directories

In [7]:
class ConfigurationManager:
    """Reads the project's YAML settings and builds per-stage config objects.

    On construction the config, params and schema files are loaded with
    ``read_yaml`` and the top-level artifacts directory is created.
    """

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH,
        schema_filepath = SCHEMA_FILE_PATH):
        """Load the three YAML files and create the artifacts root.

        Args:
            config_filepath: Path of the main config YAML.
            params_filepath: Path of the hyper-parameter YAML.
            schema_filepath: Path of the schema YAML.
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)
        self.schema = read_yaml(schema_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Assemble the ``ModelTrainerConfig`` for the training stage.

        Reads the ``model_trainer`` section of the config and the ``catboost``
        section of the params, creates the stage's output directory, and
        returns the combined settings.

        Returns:
            ModelTrainerConfig: Settings for ``ModelTrainer``.
        """
        stage_cfg = self.config.model_trainer
        catboost_params = self.params.catboost

        create_directories([stage_cfg.root_dir])

        # The dataclass declares its location fields as ``Path``; convert the
        # raw YAML strings here so the declared and actual types agree.
        return ModelTrainerConfig(
            root_dir=Path(stage_cfg.root_dir),
            train_data_dir=Path(stage_cfg.train_data_dir),
            test_data_dir=Path(stage_cfg.test_data_dir),
            model_name=stage_cfg.model_name,
            iterations=catboost_params.iterations,
            learning_rate=catboost_params.learning_rate,
            depth=catboost_params.depth,
            loss_function=catboost_params.loss_function,
            early_stopping_rounds=catboost_params.early_stopping_rounds,
        )

In [8]:
import pandas as pd
import os
from RiskScorePrediction import logger
from catboost import CatBoostRegressor
import joblib

In [9]:
class ModelTrainer:
    """Trains a CatBoost regressor on the prepared CSV splits and saves it."""

    def __init__(self, config: "ModelTrainerConfig"):
        """Store the training settings.

        Args:
            config: Settings object providing the data locations, output
                location and CatBoost hyper-parameters read by ``train``.
        """
        self.config = config

    @staticmethod
    def _fill_missing_target(target):
        """Return ``target`` with NaN entries replaced by the column mean."""
        missing = target.isna().sum()
        if missing > 0:
            print(f"Warning: There are {missing} NaN values in the target column. Filling with mean.")
            # Reassign instead of ``inplace=True``: the Series comes from a
            # DataFrame column, where in-place edits are unreliable.
            target = target.fillna(target.mean())
        return target

    def train(self):
        """Fit the regressor and write it to ``root_dir / model_name``.

        Steps: read both CSVs, strip whitespace from column names, split off
        the ``RiskScore`` target, mean-fill missing target values, fit
        ``CatBoostRegressor`` with the test split as ``eval_set``, and dump
        the fitted model with joblib.

        Raises:
            KeyError: If the target column is missing from the train or the
                test data.
            AssertionError: If the training features or target are empty.
        """
        train_data = pd.read_csv(self.config.train_data_dir)
        test_data = pd.read_csv(self.config.test_data_dir)

        # Header cells may carry stray whitespace; normalise before lookup.
        train_data.columns = train_data.columns.astype(str).str.strip()
        test_data.columns = test_data.columns.astype(str).str.strip()

        target_col = 'RiskScore'

        print(f"Target Column: {target_col}")
        print(f"Available Columns: {train_data.columns.tolist()}")

        if target_col not in train_data.columns:
            raise KeyError(f"Target column '{target_col}' not found! Available columns: {train_data.columns.tolist()}")
        # Validate the test split too, so a malformed test file fails with the
        # same explicit message instead of an opaque error from ``drop``.
        if target_col not in test_data.columns:
            raise KeyError(f"Target column '{target_col}' not found in test data! Available columns: {test_data.columns.tolist()}")

        train_x = train_data.drop(columns=[target_col])
        test_x = test_data.drop(columns=[target_col])
        # NOTE(review): each split's target is imputed with its own mean;
        # confirm that imputing labels (rather than dropping rows) is intended.
        train_y = self._fill_missing_target(train_data[target_col])
        test_y = self._fill_missing_target(test_data[target_col])

        assert not train_x.empty, "train_x is empty after dropping the target column!"
        assert not train_y.empty, "train_y is empty!"

        cat = CatBoostRegressor(
            iterations=self.config.iterations,
            learning_rate=self.config.learning_rate,
            depth=self.config.depth,
            loss_function=self.config.loss_function,
            early_stopping_rounds=self.config.early_stopping_rounds
        )

        # The test split doubles as the evaluation set during fitting.
        cat.fit(train_x, train_y, eval_set=(test_x, test_y))

        joblib.dump(cat, os.path.join(self.config.root_dir, self.config.model_name))


In [10]:
# Build the training-stage settings, then fit and persist the model.
# The settings object and the trainer get separate names so neither is
# silently rebound to a different kind of object; exceptions propagate
# unchanged.
config = ConfigurationManager()
model_trainer_config = config.get_model_trainer_config()
model_trainer = ModelTrainer(config=model_trainer_config)
model_trainer.train()

[2025-01-30 23:08:23,287: INFO: common: yaml file : config\config.yaml loaded  sucessfully]
[2025-01-30 23:08:23,287: INFO: common: yaml file : params.yaml loaded  sucessfully]
[2025-01-30 23:08:23,300: INFO: common: yaml file : schema.yaml loaded  sucessfully]
[2025-01-30 23:08:23,302: INFO: common: created directory at :artifacts]
[2025-01-30 23:08:23,302: INFO: common: created directory at :artifacts/model_trainer]
Target Column: RiskScore
Available Columns: ['Age', 'AnnualIncome', 'CreditScore', 'Experience', 'LoanAmount', 'LoanDuration', 'NumberOfDependents', 'MonthlyDebtPayments', 'CreditCardUtilizationRate', 'NumberOfOpenCreditLines', 'NumberOfCreditInquiries', 'DebtToIncomeRatio', 'BankruptcyHistory', 'PreviousLoanDefaults', 'PaymentHistory', 'LengthOfCreditHistory', 'SavingsAccountBalance', 'CheckingAccountBalance', 'TotalAssets', 'TotalLiabilities', 'MonthlyIncome', 'UtilityBillsPaymentHistory', 'JobTenure', 'NetWorth', 'BaseInterestRate', 'InterestRate', 'MonthlyLoanPayment'