# import

In [6]:
# --- iPython Config --- #
from IPython import get_ipython
if 'IPython.extensions.autoreload' not in get_ipython().extension_manager.loaded:
    get_ipython().run_line_magic('load_ext', 'autoreload')
else:
    get_ipython().run_line_magic('reload_ext', 'autoreload')
%autoreload 2

# --- System and Path --- #
import os
import sys
# The repository root is taken to be the parent of the current working
# directory (assumes the notebook is run from a direct sub-folder of the repo).
repo_path = os.path.split(os.getcwd())[0]
# Register the root once so that `src.*` imports resolve; skip if already present.
if sys.path.count(repo_path) == 0:
    sys.path.append(repo_path)

# --- Standard Libraries --- #
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np
from typing import Tuple, Optional, Dict
import time
import joblib
# scikit-learn
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import recall_score, precision_score, f1_score, accuracy_score, roc_auc_score
# models
from sklearn.linear_model import LogisticRegression
import xgboost as xgb
import optuna

# --- Custom Modules --- #
from src.config import Config
from src.data import dataloader

In [7]:
# Raw dataset location: <repo_path>/data/raw/dataset.parquet
file_path = os.path.join(os.path.join(repo_path, "data", "raw"), "dataset.parquet")
# Load through the project's dataloader helper.
df_dataset = dataloader.load_data(file_path)

Memory usage: Before=134.49MB -> After=65.62MB, Decreased by 51.2%
Data loaded successfully.


In [8]:
class DataProcessor:
    """
    Splits a dataset into train/test partitions and standardizes its numeric features.

    Attributes:
        df_dataset: The dataset most recently passed to ``process``.
        target: Name of the target column; must be set before splitting.
        random_state: Seed forwarded to ``train_test_split`` (read from ``Config.SEED``).
    """

    def __init__(self):
        """
        Initializes the DataProcessor class with default attributes.
        """
        self.df_dataset: Optional[pd.DataFrame] = None
        self.target: Optional[str] = None
        self.random_state: int = Config.SEED  # Assumes Config.SEED is defined globally

    def initial_train_test_split(self, df_dataset: pd.DataFrame, test_size: float = 0.10) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Splits the dataset into training and test sets, stratified on the target column.

        Args:
            df_dataset (pd.DataFrame): The input dataset, including the target column.
            test_size (float, optional): Proportion of the dataset to be used as the test set. Defaults to 0.10.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: Training and test datasets, each with the
            target column appended after the feature columns.

        Raises:
            ValueError: If ``self.target`` has not been set.
        """
        if not self.target:
            raise ValueError("Target column must be specified before splitting.")

        X = df_dataset.drop(columns=[self.target])
        y = df_dataset[self.target]

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=self.random_state, stratify=y
        )
        # Re-attach the target so each partition is a single self-contained frame.
        df_train = pd.concat([X_train, y_train], axis=1)
        df_test = pd.concat([X_test, y_test], axis=1)

        return df_train, df_test

    def normalize(self, df_train: pd.DataFrame, df_test: pd.DataFrame, num_cols: Optional[list] = None) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Normalizes numerical features in the dataset using StandardScaler.

        The scaler is fitted on the training data only and then applied to the test
        data. The target column is never scaled. Neither the input frames nor the
        ``num_cols`` list passed by the caller are modified.

        Args:
            df_train (pd.DataFrame): Training dataset.
            df_test (pd.DataFrame): Test dataset.
            num_cols (list, optional): Columns to scale. Defaults to every numeric
                column of ``df_train``.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: Normalized training and test datasets.
            When there is no column to scale, the inputs are returned unchanged.
        """
        if num_cols is None:
            num_cols = df_train.select_dtypes(include=["number"]).columns.tolist()
        # Build a new list instead of calling .remove() so the caller's list is untouched.
        num_cols = [col for col in num_cols if col != self.target]

        # Nothing to scale: skip fitting a scaler on an empty column selection.
        if not num_cols:
            return df_train, df_test

        # Work on copies so the frames owned by the caller are not overwritten.
        df_train = df_train.copy()
        df_test = df_test.copy()

        scaler = StandardScaler()
        df_train[num_cols] = scaler.fit_transform(df_train[num_cols])
        df_test[num_cols] = scaler.transform(df_test[num_cols])

        return df_train, df_test

    def process(self, df_dataset: pd.DataFrame, target: str, test_size: float = 0.10) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Processes the dataset: stratified train/test split followed by normalization.

        Args:
            df_dataset (pd.DataFrame): The dataset to be processed.
            target (str): The target column for prediction.
            test_size (float, optional): Proportion held out as the test set. Defaults to 0.10.

        Returns:
            Tuple[pd.DataFrame, pd.DataFrame]: Processed train and test datasets.

        Raises:
            ValueError: If ``target`` is empty.
        """
        if not target:
            raise ValueError("Target column must be specified.")

        self.df_dataset = df_dataset
        self.target = target

        # Initial split: dataset → train (1 - test_size) | test (test_size)
        df_train, df_test = self.initial_train_test_split(self.df_dataset, test_size=test_size)

        # Normalize (scaler statistics come from the training partition only)
        df_train, df_test = self.normalize(df_train, df_test)

        return df_train, df_test

# Split the loaded dataset and standardize its features, predicting the "Class" column.
data_processor = DataProcessor()
df_train, df_test = data_processor.process(df_dataset=df_dataset, target="Class")

In [None]:
class BaseModel:
    """Common wrapper around an estimator plus the metrics used to score it.

    Subclasses are expected to assign an estimator to ``self.model``; the base
    class itself leaves it as ``None``.
    """

    def __init__(self):
        self.model = None
        self.params = {}
        # {metric_name: metric_function}
        self.metrics = dict(
            precision=precision_score,
            recall=recall_score,
            f1=f1_score,
            accuracy=accuracy_score,
            roc_auc=roc_auc_score,
        )

    def fit(self, X, y):
        """Fit the wrapped estimator on ``X`` / ``y``."""
        self.model.fit(X, y)

    def predict(self, X):
        """Return the wrapped estimator's predictions for ``X``."""
        return self.model.predict(X)

    def evaluate(self, X, y) -> dict:
        """Score the model on ``X`` / ``y`` with every registered metric.

        ``roc_auc`` is computed from the second column of ``predict_proba``;
        all other metrics use the output of ``predict``. A metric that raises
        ``ValueError`` is reported as ``None``.

        Returns:
            dict: ``{metric_name: score_or_None}`` in registration order.
        """
        hard_labels = self.predict(X)
        scores = {}
        for name in self.metrics:
            scorer = self.metrics[name]
            try:
                if name != "roc_auc":
                    scores[name] = scorer(y, hard_labels)
                    continue
                # Probability of the positive class (column index 1).
                positive_proba = self.model.predict_proba(X)[:, 1]
                scores[name] = scorer(y, positive_proba)
            except ValueError:
                scores[name] = None
        return scores

class LogisticRegressionModel(BaseModel):
    """Logistic-regression model wrapper with a tunable ``C`` hyperparameter."""

    def __init__(self, random_state=Config.SEED):
        super().__init__()
        self.model = LogisticRegression()
        # Fixed settings, applied to the estimator on every fit.
        self.base_params = dict(random_state=random_state)
        # Search space consumed by the hyperparameter tuner.
        self.learnable_params = dict(
            C=dict(type="loguniform", low=0.01, high=10),  # Inverse of regularization strength
        )
        self.params = dict(self.base_params)

    def fit(self, X, y):
        """Apply ``self.params`` to the estimator, then fit it on ``X`` / ``y``."""
        estimator = self.model
        estimator.set_params(**self.params)
        estimator.fit(X, y)


class XGBoostModel(BaseModel):
    """XGBoost classifier wrapper with a tunable depth, learning rate and tree count."""

    def __init__(self, random_state=Config.SEED):
        super().__init__()
        self.model = xgb.XGBClassifier()
        # Fixed settings, applied to the estimator on every fit.
        self.base_params = dict(random_state=random_state, objective="binary:logistic")
        # Search space consumed by the hyperparameter tuner.
        self.learnable_params = dict(
            max_depth=dict(type="int", low=3, high=10),
            learning_rate=dict(type="loguniform", low=0.01, high=0.3),
            n_estimators=dict(type="int", low=50, high=300),
        )
        self.params = dict(self.base_params)

    def fit(self, X, y):
        """Apply ``self.params`` to the estimator, then fit it on array copies of ``X`` / ``y``."""
        estimator = self.model
        estimator.set_params(**self.params)
        features = np.array(X)
        labels = np.array(y)
        estimator.fit(features, labels)

class SingleTrainer:
    """
    Fits a single wrapped model, optionally tuning its hyperparameters with Optuna first.

    Args:
        model: Custom model instance (e.g., LogisticRegressionModel). It must expose
            ``model`` (the underlying estimator), ``learnable_params``, ``fit`` and
            ``evaluate``.
        main_metric: Key of the ``evaluate`` result that hyperparameter tuning
            maximizes. Defaults to "recall".
    """

    def __init__(self, model: "BaseModel", main_metric: str = "recall"):
        self.model = model  # Custom model instance (e.g., LogisticRegressionModel)
        self.target = ""
        # Read by _cross_validate; previously never set, which made tuning fail.
        self.main_metric = main_metric

    def train(self, df_train, target, tune_params=False):
        """
        Fit the model on ``df_train``.

        Args:
            df_train: Training frame containing the feature columns and ``target``.
            target: Name of the target column.
            tune_params: When True, run hyperparameter tuning first and apply the
                best parameters to the underlying estimator before fitting.
        """
        self.target = target  # update target
        X = df_train.drop(columns=[self.target]).values
        y = df_train[self.target].values

        if tune_params:
            best_params = self.tune_hyperparameters(df_train)
            self.model.model.set_params(**best_params)
        self.model.fit(X, y)

    def _retrieve_search_space(self, model, trial):
        """
        Retrieve and suggest hyperparameter search space for Optuna tuning.

        Returns:
            dict: ``{param_name: suggested_value}`` for every entry of
            ``model.learnable_params``.

        Raises:
            ValueError: If an entry declares a type other than "int" or "loguniform".
        """
        trial_params = {}
        for param, search_space in model.learnable_params.items():
            kind = search_space["type"]
            if kind == "int":
                trial_params[param] = trial.suggest_int(param, search_space["low"], search_space["high"])
            elif kind == "loguniform":
                # suggest_float(log=True) replaces the deprecated suggest_loguniform.
                trial_params[param] = trial.suggest_float(
                    param, search_space["low"], search_space["high"], log=True
                )
            else:
                raise ValueError(
                    f"Unsupported search space type {kind!r} for parameter {param!r}."
                )
        return trial_params

    def _cross_validate(self, X, y, params):
        """
        Perform cross-validation and return fold scores.

        A fresh model of the same class is built for every fold so that no state
        leaks between folds or into ``self.model``.

        Raises:
            ValueError: If ``main_metric`` could not be computed on a fold.
        """
        n_splits = 5
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=Config.SEED)
        fold_scores = []
        for train_index, val_index in cv.split(X, y):
            x_tr, x_val = X[train_index], X[val_index]
            y_tr, y_val = y[train_index], y[val_index]

            model = type(self.model)()  # Re-instantiate model
            model.model.set_params(**params)
            model.fit(x_tr, y_tr)
            score = model.evaluate(x_val, y_val)[self.main_metric]
            if score is None:
                # evaluate() reports a metric it could not compute as None;
                # averaging that would fail with an unhelpful TypeError.
                raise ValueError(
                    f"Metric {self.main_metric!r} could not be computed on a validation fold."
                )
            fold_scores.append(score)
        return fold_scores

    def objective(self, trial, df_train):
        """Optuna objective function for hyperparameter tuning: mean CV score of ``main_metric``."""
        # Search space
        trial_params = self._retrieve_search_space(self.model, trial)
        # Cross-validation
        X = df_train.drop(columns=[self.target]).values
        y = df_train[self.target].values
        fold_scores = self._cross_validate(X, y, trial_params)
        return np.mean(fold_scores)

    def tune_hyperparameters(self, df_train, n_trials=3) -> dict:
        """Optimize hyperparameters using Optuna and return the best parameter set."""
        study = optuna.create_study(direction="maximize")
        study.optimize(lambda trial: self.objective(trial, df_train), n_trials=n_trials)
        return study.best_params

class Trainer:
    """
    Trains, optionally saves, and evaluates a collection of models on one train/test split.

    Args:
        df_train: Training frame containing the feature columns and ``target``.
        df_test: Test frame with the same columns.
        target: Name of the target column.
        models: ``{model_name: model_instance}`` to train.
        main_metric: Metric key reported by ``evaluate_all_models`` and used as the
            tuning objective.
        verbose: When True, print progress, timings and scores.
        output_dir: Directory in which trained models are saved; nothing is saved
            when it is None or empty.
    """

    def __init__(self, df_train, df_test, target: str, models: Dict[str, "BaseModel"], main_metric: str, verbose=True, output_dir: Optional[str] = None):
        self.df_train = df_train
        self.df_test = df_test
        self.target = target
        self.models: Dict[str, "BaseModel"] = models  # {model_name: model_instance}
        self.trained_models: Dict[str, "BaseModel"] = {}
        self.main_metric = main_metric
        self.verbose = verbose
        self.output_dir = output_dir

    @staticmethod
    def _save_model(model: "BaseModel", output_dir, file_format='pkl', verbose=True):
        """
        Dump ``model`` with joblib to ``<output_dir>/<timestamp>-<ClassName>.<file_format>``.

        The timestamp has minute resolution, so saving two models of the same class
        within the same minute writes to the same path.
        """
        time_now = time.strftime("%Y-%m-%d-%H%M")
        model_name = model.__class__.__name__
        file_name = f"{time_now}-{model_name}"
        os.makedirs(output_dir, exist_ok=True)
        file_path = os.path.join(output_dir, f"{file_name}.{file_format}")
        joblib.dump(model, file_path)
        if verbose:
            print(f"Model saved to {file_path}")

    def train_all_models(self, tune_params: bool = False):
        """Train (and, when ``tune_params`` is True, tune) all models."""
        for model_name, model in self.models.items():
            if self.verbose:
                print(f"Training {model_name}...")
            # Always taken so the timing report never depends on an unset variable.
            start_time = time.time()

            single_trainer = SingleTrainer(model)
            # SingleTrainer reads `main_metric` as its tuning objective; hand it over
            # explicitly so tune_params=True optimizes the metric this Trainer reports.
            single_trainer.main_metric = self.main_metric
            single_trainer.train(self.df_train, self.target, tune_params)
            self.trained_models[model_name] = single_trainer.model
            # Save model
            if self.output_dir:
                self._save_model(single_trainer.model, self.output_dir, verbose=self.verbose)

            if self.verbose:
                print(f"Training time {model_name}: {time.time()-start_time:.2f} seconds.")

    def evaluate_all_models(self) -> Dict[str, Dict]:
        """
        Evaluate all trained models on both train and test sets.

        Returns:
            Dict[str, Dict]: ``{"train": {model_name: metrics}, "test": {model_name: metrics}}``
            where ``metrics`` is whatever each model's ``evaluate`` returns.
        """
        results = {"train": {}, "test": {}}

        for dataset_name, df in [("train", self.df_train), ("test", self.df_test)]:
            X = df.drop(columns=[self.target]).values
            y = df[self.target].values

            for model_name, model in self.trained_models.items():
                res = model.evaluate(X, y)
                score = res.get(self.main_metric, None)  # Avoid KeyError
                results[dataset_name][model_name] = res
                if not self.verbose:
                    continue
                if score is not None:
                    print(f"{dataset_name.upper()} | {model_name} {self.main_metric}: {score:.4f}")
                else:
                    print(f"{dataset_name.upper()} | {model_name}: Metric not available")

        return results

In [10]:
# Training
# Candidate models, keyed by the name used in logs and in the results dict.
models = dict(
    LogisticRegression=LogisticRegressionModel(),
    XGBoost=XGBoostModel(),
)
# Initialize trainer
multi_trainer = Trainer(
    df_train=df_train,
    df_test=df_test,
    target="Class",
    models=models,
    main_metric="recall",
    output_dir=os.path.join(repo_path, "models"),
)
# Fit with default hyperparameters (no tuning), then score on train and test.
multi_trainer.train_all_models(tune_params=False)
multi_trainer.evaluate_all_models()

Training LogisticRegression...
Model saved to /Users/pupipatsingkhorn/Developer/repositories/fraud-detection-european-credit-card-transactions-2023/models/2025-02-16-2158-LogisticRegressionModel.pkl
Training time LogisticRegression: 0.87 seconds.
Training XGBoost...
Model saved to /Users/pupipatsingkhorn/Developer/repositories/fraud-detection-european-credit-card-transactions-2023/models/2025-02-16-2158-XGBoostModel.pkl
Training time XGBoost: 1.51 seconds.
TRAIN | LogisticRegression recall: 0.9977
TRAIN | XGBoost recall: 1.0000
TEST | LogisticRegression recall: 0.9976
TEST | XGBoost recall: 0.9997


{'train': {'LogisticRegression': {'precision': 0.9991272391403893,
   'recall': 0.9976708195901268,
   'f1': 0.9983984982254421,
   'accuracy': 0.9983996623463413,
   'roc_auc': 0.9998447527758879},
  'XGBoost': {'precision': 1.0,
   'recall': 1.0,
   'f1': 1.0,
   'accuracy': 1.0,
   'roc_auc': 1.0}},
 'test': {'LogisticRegression': {'precision': 0.999154572354516,
   'recall': 0.9976434173965039,
   'f1': 0.9983984230627079,
   'accuracy': 0.9983996623463413,
   'roc_auc': 0.9998275054532307},
  'XGBoost': {'precision': 0.9998592837543094,
   'recall': 0.9996834441278886,
   'f1': 0.9997713562094377,
   'accuracy': 0.9997713803351916,
   'roc_auc': 0.9999653776293036}}}