In [1]:

import os




In [2]:

%pwd

'd:\\Projects\\Fraud-Detection\\research'

In [3]:

os.chdir("../")

In [4]:
from dataclasses import dataclass
from pathlib import Path

@dataclass(frozen=True)
class TrainingConfig:
    root_dir: Path
    trained_model_path: Path
    updated_base_model_path: Path
    training_data: Path
    
    # XGBoost parameters
    objective: str
    eval_metric: str
    max_depth: int
    learning_rate: float
    n_estimators: int
    subsample: float
    colsample_bytree: float
    gamma: float
    min_child_weight: int
    scale_pos_weight: int
    reg_alpha: float
    reg_lambda: float
    random_state: int
    booster: str


In [5]:

from fraud_detection.constants import *
from fraud_detection.utils.common import read_yaml, create_directories

In [13]:
class ConfigurationManager:
    def __init__(
        self,
        config_filepath=CONFIG_FILE_PATH,
        params_filepath=PARAMS_FILE_PATH
    ):
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_training_config(self) -> TrainingConfig:
        training = self.config.training
        prepare_base_model = self.config.prepare_base_model
        params = self.params["xgboost_params"]   # load xgboost params dict

        create_directories([
            Path(training.root_dir)
        ])

        training_config = TrainingConfig(
            root_dir=Path(training.root_dir),
            trained_model_path=Path(training.trained_model_path),
            updated_base_model_path=Path(prepare_base_model.updated_base_model_path),
            training_data=Path(self.config.data_ingestion.local_data_file),

            # XGBoost params
            objective=params["objective"],
            eval_metric=params["eval_metric"],
            max_depth=params["max_depth"],
            learning_rate=params["learning_rate"],
            n_estimators=params["n_estimators"],
            subsample=params["subsample"],
            colsample_bytree=params["colsample_bytree"],
            gamma=params["gamma"],
            min_child_weight=params["min_child_weight"],
            scale_pos_weight=params["scale_pos_weight"],
            reg_alpha=params["reg_alpha"],
            reg_lambda=params["reg_lambda"],
            random_state=params["random_state"],
            booster=params["booster"]
        )

        return training_config


In [20]:
class Training:
    def __init__(self, config: TrainingConfig):
        self.config = config
        self.model = None
        logger.info("Training class initialized with given configuration.")

    def load_data(self):
        logger.info(f"Loading dataset from {self.config.training_data}")
        df = pd.read_csv(self.config.training_data)

        logger.info("Splitting features and target.")
        X = df.iloc[:, :-1]
        y = df.iloc[:, -1]

        logger.info("Applying encoding on categorical columns if any.")
        X = pd.get_dummies(X, drop_first=True)

        logger.info("Performing train-test split.")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=0.2, random_state=self.config.random_state, stratify=y
        )

        logger.info("Applying hybrid resampling (SMOTE + ENN) on training data.")
        sampler = SMOTEENN(random_state=self.config.random_state)
        self.X_train, self.y_train = sampler.fit_resample(self.X_train, self.y_train)

        logger.info(f"Training data shape after resampling: {self.X_train.shape}, {self.y_train.shape}")

    def get_model(self):
        logger.info("Initializing XGBoost model with provided parameters.")
        self.model = xgb.XGBClassifier(
            objective=self.config.objective,
            eval_metric=self.config.eval_metric,
            max_depth=self.config.max_depth,
            learning_rate=self.config.learning_rate,
            n_estimators=self.config.n_estimators,
            subsample=self.config.subsample,
            colsample_bytree=self.config.colsample_bytree,
            gamma=self.config.gamma,
            min_child_weight=self.config.min_child_weight,
            scale_pos_weight=self.config.scale_pos_weight,
            reg_alpha=self.config.reg_alpha,
            reg_lambda=self.config.reg_lambda,
            random_state=self.config.random_state,
            booster=self.config.booster,
            use_label_encoder=False
        )
        logger.info("XGBoost model initialized successfully.")

    def train(self):
        logger.info("Starting model training.")

        # Train the model
        self.model.fit(
            self.X_train,
            self.y_train
        )

        logger.info("Model training completed.")

        # Save trained model
        self.save_model(self.config.trained_model_path, self.model)
        logger.info(f"Model saved at: {self.config.trained_model_path}")

    @staticmethod
    def save_model(path: Path, model):
        joblib.dump(model, path)
        logger.info(f"Model saved at {path}")


In [21]:
try:
    config = ConfigurationManager()
    training_config = config.get_training_config()
    training = Training(config=training_config)
    training.load_data()
    training.get_model()
    training.train()
    
except Exception as e:
    raise e

[2025-08-30 15:14:50,364: INFO: common: yaml file: config\config.yaml loaded successfully]
[2025-08-30 15:14:50,384: INFO: common: yaml file: params.yaml loaded successfully]
[2025-08-30 15:14:50,384: INFO: common: created directory at: artifacts]
[2025-08-30 15:14:50,384: INFO: common: created directory at: artifacts\training]
[2025-08-30 15:14:50,384: INFO: 4053563582: Training class initialized with given configuration.]
[2025-08-30 15:14:50,393: INFO: 4053563582: Loading dataset from artifacts\data_ingestion\fraud.csv]
[2025-08-30 15:14:50,414: INFO: 4053563582: Splitting features and target.]
[2025-08-30 15:14:50,417: INFO: 4053563582: Applying encoding on categorical columns if any.]
[2025-08-30 15:14:50,443: INFO: 4053563582: Performing train-test split.]
[2025-08-30 15:14:50,451: INFO: 4053563582: Applying hybrid resampling (SMOTE + ENN) on training data.]
[2025-08-30 15:14:50,500: INFO: 4053563582: Training data shape after resampling: (666, 8), (666,)]
[2025-08-30 15:14:50,50

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
