In [68]:
import duckdb
import numpy as np
import orjson
import datetime as dt
from pprint import pprint
from typing import Any, Dict, Hashable, Optional, List
import tqdm
import os
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
)
from sklearn.model_selection import (
    train_test_split,
    StratifiedKFold
)
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
)
from imblearn.metrics import geometric_mean_score
from catboost import CatBoostClassifier

In [2]:
MINIO_HOST = "localhost"
MINIO_PORT = "9000"
MINIO_ENDPOINT = f"{MINIO_HOST}:{MINIO_PORT}"
MINIO_ACCESS_KEY = "minioadmin"
MINIO_SECRET_KEY = "minioadmin123"
PROJECT_NAME = "Transaction Fraud Detection"

In [3]:
DELTA_PATHS = {
    "Transaction Fraud Detection": "s3://lakehouse/delta/transaction_fraud_detection",
    "Estimated Time of Arrival": "s3://lakehouse/delta/estimated_time_of_arrival",
    "E-Commerce Customer Interactions": "s3://lakehouse/delta/e_commerce_customer_interactions",
    "Sales Forecasting": "s3://lakehouse/delta/sales_forecasting",
}

delta_path = DELTA_PATHS.get(PROJECT_NAME)

In [5]:
# Disable AWS EC2 metadata service lookup (prevents 169.254.169.254 errors)
os.environ["AWS_EC2_METADATA_DISABLED"] = "true"
# Create connection (in-memory database)
conn = duckdb.connect()
# Install and load required extensions
conn.execute("INSTALL delta; LOAD delta;")
conn.execute("INSTALL httpfs; LOAD httpfs;")
# Create a secret for S3/MinIO credentials
# This is the recommended way to configure S3 access in DuckDB
# and avoids the EC2 metadata service lookup issue
conn.execute(f"""
    CREATE SECRET minio_secret (
        TYPE S3,
        KEY_ID '{MINIO_ACCESS_KEY}',
        SECRET '{MINIO_SECRET_KEY}',
        REGION 'us-east-1',
        ENDPOINT '{MINIO_ENDPOINT}',
        URL_STYLE 'path',
        USE_SSL false
    );
""")
print("DuckDB extensions loaded and S3 secret configured")

DuckDB extensions loaded and S3 secret configured


In [6]:
query = f"SELECT * FROM delta_scan('{delta_path}') LIMIT 1000"
result = conn.execute(query).df()

print(f"Loaded {len(result)} rows from Delta Lake")
print(f"Columns: {list(result.columns)}")
result.head()

Loaded 1000 rows from Delta Lake
Columns: ['transaction_id', 'user_id', 'timestamp', 'amount', 'currency', 'merchant_id', 'product_category', 'transaction_type', 'payment_method', 'location', 'ip_address', 'device_info', 'user_agent', 'account_age_days', 'cvv_provided', 'billing_address_match', 'is_fraud']


Unnamed: 0,transaction_id,user_id,timestamp,amount,currency,merchant_id,product_category,transaction_type,payment_method,location,ip_address,device_info,user_agent,account_age_days,cvv_provided,billing_address_match,is_fraud
0,1e9620e4-b50d-4558-8965-b26b81d0a8db,55cc0ccb-cf3c-4868-9bad-2cf45bc16626,2026-01-16T15:12:16.333727+00:00,444.97,AUD,merchant_5,travel,transfer,paypal,"{""lat"":22.2902085,""lon"":-8.422191}",129.178.121.192,"{""os"":""iOS"",""browser"":""Edge""}",Mozilla/5.0 (iPod; U; CPU iPhone OS 3_2 like M...,1164,True,True,0
1,2d35465c-bec6-48eb-ad51-3f4e70af7bb4,6e760407-8f00-4588-aea4-fcf812771d59,2026-01-16T15:12:16.600199+00:00,434.07,EUR,merchant_82,luxury_items,withdrawal,crypto,"{""lat"":9.09235,""lon"":22.00906}",93.150.126.81,"{""os"":""macOS"",""browser"":""Firefox""}",Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/53...,700,True,True,0
2,03e7d5be-55f3-469e-a63a-ee0bfbeacd22,431927ac-2ab9-4e05-bb08-ad4bf009f5e7,2026-01-16T15:12:16.722857+00:00,337.21,USD,merchant_14,luxury_items,purchase,debit_card,"{""lat"":-39.740644,""lon"":-141.070575}",166.224.57.203,"{""os"":""Android"",""browser"":""Edge""}",Mozilla/5.0 (Macintosh; Intel Mac OS X 10_6_9 ...,347,True,True,0
3,29555dc9-b5bf-4cbd-a0da-475d085e8746,b14c6658-2dfd-4eec-87e2-1473ba6a330c,2026-01-16T15:12:17.077599+00:00,166.02,JPY,merchant_102,digital_goods,transfer,paypal,"{""lat"":52.344216,""lon"":-79.000464}",21.228.0.100,"{""os"":""Linux"",""browser"":""Firefox""}",Opera/9.93.(Windows NT 10.0; raj-IN) Presto/2....,284,True,True,0
4,ce8689f0-cf7f-48da-8ce6-df72c071459b,2d915bae-12bd-4c53-9a50-785a4f47a555,2026-01-16T15:12:17.539951+00:00,275.85,JPY,merchant_150,services,purchase,credit_card,"{""lat"":-5.5804715,""lon"":109.105593}",66.173.27.36,"{""os"":""Other"",""browser"":""Edge""}",Mozilla/5.0 (compatible; MSIE 6.0; Windows 98;...,173,True,True,0


In [None]:
def load_data(project_name: str):
    """Load data from Delta Lake on MinIO via Polars or fallback to Kafka."""
    DELTA_PATH = DELTA_PATHS.get(project_name, "")
    try:
        print(f"Attempting to load data from Delta Lake via DuckDB: {DELTA_PATH}")
        query = f"SELECT * FROM delta_scan('{DELTA_PATH}')"
        result = conn.execute(query).df()
        print(f"Data loaded from Delta Lake for {project_name}: {len(result)} rows")
        return result
    except Exception as e:
        #print(f"Delta Lake not available for {project_name}: {e}")
        raise e


In [24]:
test = load_data("Transaction Fraud Detection")

Attempting to load data from Delta Lake via DuckDB: s3://lakehouse/delta/transaction_fraud_detection
Data loaded from Delta Lake for Transaction Fraud Detection: 930507 rows


In [None]:
import json

def extract_device_info_sklearn(data):
    """
    Extract device info from JSON string column to separate columns.
    
    The device_info column from Delta Lake contains JSON strings like:
        '{"os":"iOS","browser":"Edge"}'
    
    pd.json_normalize() expects dicts, not strings, so we must parse first.
    """
    data = data.copy()
    # Parse JSON strings to dictionaries
    # Using orjson for speed (already imported), fallback to json
    #try:
    device_dicts = data["device_info"].apply(
        lambda x: orjson.loads(x) if isinstance(x, str) else x
    )
    data_to_join = pd.json_normalize(device_dicts)
    data = data.drop("device_info", axis = 1)
    data = data.join(data_to_join)
    return data


def extract_timestamp_info_sklearn(data):
    """Extract timestamp components to separate columns."""
    data = data.copy()
    data["timestamp"] = pd.to_datetime(data["timestamp"], format='ISO8601')
    data["year"] = data["timestamp"].dt.year
    data["month"] = data["timestamp"].dt.month
    data["day"] = data["timestamp"].dt.day
    data["hour"] = data["timestamp"].dt.hour
    data["minute"] = data["timestamp"].dt.minute
    data["second"] = data["timestamp"].dt.second
    data = data.drop("timestamp", axis=1)
    return data


def extract_coordinates_sklearn(data):
    """
    Extract location coordinates from JSON string column to separate columns.
    
    The location column from Delta Lake contains JSON strings like:
        '{"lat":22.2902085,"lon":-8.422191}'
    """
    data = data.copy()
    location_dicts = data["location"].apply(
        lambda x: orjson.loads(x) if isinstance(x, str) else x
    )
    data_to_join = pd.json_normalize(location_dicts)
    data = data.drop("location", axis=1)
    data = data.join(data_to_join)
    return data

## In the future, consider replace these functions for DuckDB SQL queries

In [55]:
def process_batch_data(data: pd.DataFrame, project_name: str):
    """Process batch data and fit/save sklearn encoders."""
    data = data.copy()
    #os.makedirs("encoders/sklearn", exist_ok=True)
    #filename = "encoders/sklearn/" + project_name.lower().replace(' ', '_').replace("-", "_") + ".pkl"
    if project_name == "Transaction Fraud Detection":
        data = extract_device_info_sklearn(data)
        data = extract_timestamp_info_sklearn(data)
        numerical_features = [
            "amount",
            "account_age_days",
            "cvv_provided",
            "billing_address_match"
        ]
        binary_features = [
            "cvv_provided",
            "billing_address_match"
        ]
        categorical_features = [
            "currency",
            "merchant_id",
            "payment_method",
            "product_category",
            "transaction_type",
            "browser",
            "os",
            "year",
            "month",
            "day",
            "hour",
            "minute",
            "second",
        ]
        numerical_transformer = StandardScaler()
        categorical_transformer = OneHotEncoder(
            handle_unknown = 'ignore',
            sparse_output = False)
        preprocessor = ColumnTransformer(
            transformers = [
                ("numerical", numerical_transformer, numerical_features),
                ("binary", "passthrough", binary_features),
                ("categorical", categorical_transformer, categorical_features),
            ]
        )
        preprocessor.set_output(transform = "pandas")
        X = data.drop('is_fraud', axis = 1)
        y = data['is_fraud']
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size = 0.2,
            stratify = y,
            random_state = 42
        )
        preprocessor.fit(X_train)
        preprocessor_dict = {"preprocessor": preprocessor}
        #with open(filename, 'wb') as f:
        #    pickle.dump(preprocessor_dict, f)
        X_train = preprocessor.transform(X_train)
        X_test = preprocessor.transform(X_test)
        return X_train, X_test, y_train, y_test, preprocessor_dict
    else:
        raise ValueError(f"Unsupported project for batch processing: {project_name}")


In [60]:
# =============================================================================
# CATBOOST CLASSIFIER FOR FRAUD DETECTION
# =============================================================================
# CatBoost is optimal for fraud detection because:
# 1. Native handling of imbalanced data (auto_class_weights)
# 2. Built-in categorical feature support (no one-hot encoding needed)
# 3. Symmetric tree structure prevents overfitting
# 4. 30-60x faster prediction than XGBoost/LightGBM
# 5. Best default parameters among boosting libraries
#
# Research sources:
# - CatBoost docs: https://catboost.ai/docs/en/concepts/parameter-tuning
# - Fraud detection study (2025): F1=0.9161, Precision=0.9319, Recall=0.9114
# - https://catboost.ai/docs/en/references/training-parameters/common
# =============================================================================

def create_batch_model(project_name: str, **kwargs):
    """
    Create CatBoost classifier optimized for fraud detection.
    
    CatBoost advantages over XGBoost for fraud detection:
    - Better handling of categorical features (native support)
    - auto_class_weights for imbalanced data
    - Symmetric trees reduce overfitting
    - Faster inference (30-60x)
    - Better default parameters
    
    Args:
        project_name: Name of the project
        **kwargs: Additional arguments (y_train for class weight calculation)
    
    Returns:
        CatBoostClassifier configured for fraud detection
    
    Usage:
        model = create_batch_model("Transaction Fraud Detection", y_train=y_train)
        
        # IMPORTANT: Pass eval_set for early stopping and best model selection
        model.fit(X_train, y_train, eval_set=(X_test, y_test))
    """
    if project_name == "Transaction Fraud Detection":
        # Calculate class imbalance ratio for reference
        y_train = kwargs.get("y_train")
        if y_train is not None:
            neg_samples = sum(y_train == 0)
            pos_samples = sum(y_train == 1)
            imbalance_ratio = neg_samples / pos_samples if pos_samples > 0 else 1
            print(f"Class imbalance ratio: {imbalance_ratio:.2f}:1 (negative:positive)")
            print(f"Fraud rate: {pos_samples / len(y_train) * 100:.2f}%")
        
        model = CatBoostClassifier(
            # =================================================================
            # CORE PARAMETERS
            # =================================================================
            iterations=1000,              # Number of boosting rounds
                                          # High value + early stopping finds optimal
            
            learning_rate=0.03,           # Lower = better generalization
                                          # Range: 0.01-0.3, lower for more data
            
            depth=6,                      # Tree depth (recommended: 6-10)
                                          # CatBoost default=6, good for most cases
            
            # =================================================================
            # IMBALANCED DATA HANDLING (KEY FOR FRAUD DETECTION)
            # =================================================================
            auto_class_weights='Balanced', # Automatically balance class weights
                                           # Options: None, 'Balanced', 'SqrtBalanced'
                                           # 'Balanced' = weight inversely proportional to frequency
            
            # =================================================================
            # LOSS FUNCTION & EVALUATION
            # =================================================================
            loss_function='Logloss',      # Binary cross-entropy for classification
                                          # Options: 'Logloss', 'CrossEntropy'
            
            eval_metric='AUC',            # Area Under ROC Curve
                                          # Best metric for imbalanced binary classification
                                          # Other options: 'F1', 'Precision', 'Recall', 'PRAUC'
            
            # =================================================================
            # REGULARIZATION (PREVENT OVERFITTING)
            # =================================================================
            l2_leaf_reg=3.0,              # L2 regularization coefficient
                                          # Higher = more regularization
                                          # Range: 1-10, default=3
            
            random_strength=1.0,          # Randomness for scoring splits
                                          # Higher = more regularization
                                          # Range: 0-10, default=1
            
            bagging_temperature=1.0,      # Bayesian bootstrap intensity
                                          # Higher = more randomness
                                          # Range: 0-10, default=1
            
            # =================================================================
            # EARLY STOPPING (requires eval_set in fit())
            # =================================================================
            # NOTE: early_stopping_rounds and use_best_model require eval_set
            # Pass eval_set=(X_test, y_test) when calling model.fit()
            # =================================================================
            
            # =================================================================
            # PERFORMANCE & REPRODUCIBILITY
            # =================================================================
            task_type='CPU',              # 'CPU' or 'GPU'
                                          # GPU requires CUDA, much faster for large data
            
            thread_count=-1,              # Use all CPU cores
            
            random_seed=42,               # Reproducibility
            
            # =================================================================
            # OUTPUT
            # =================================================================
            verbose=100,                  # Print every 100 iterations
                                          # Set to False for silent training
            
            allow_writing_files=False,    # Don't write temp files
        )
        return model
    
    raise ValueError(f"Unknown project: {project_name}")

In [32]:
data_df = load_data(PROJECT_NAME)

Attempting to load data from Delta Lake via DuckDB: s3://lakehouse/delta/transaction_fraud_detection
Data loaded from Delta Lake for Transaction Fraud Detection: 938608 rows


In [56]:
X_train, X_test, y_train, y_test, preprocessor_dict = process_batch_data(data_df, PROJECT_NAME)

In [61]:
model = create_batch_model(PROJECT_NAME, y_train = y_train)

Class imbalance ratio: 98.87:1 (negative:positive)
Fraud rate: 1.00%


In [63]:
model.fit(
    X_train, y_train,
    eval_set = (X_test, y_test),
    early_stopping_rounds = 50,
    use_best_model = True,
    verbose = True
)

0:	test: 0.9909961	best: 0.9909961 (0)	total: 457ms	remaining: 7m 36s
1:	test: 0.9921351	best: 0.9921351 (1)	total: 1.11s	remaining: 9m 12s
2:	test: 0.9914281	best: 0.9921351 (1)	total: 1.55s	remaining: 8m 34s
3:	test: 0.9921252	best: 0.9921351 (1)	total: 2.17s	remaining: 9m
4:	test: 0.9937338	best: 0.9937338 (4)	total: 2.97s	remaining: 9m 51s
5:	test: 0.9938285	best: 0.9938285 (5)	total: 3.54s	remaining: 9m 45s
6:	test: 0.9939114	best: 0.9939114 (6)	total: 4.32s	remaining: 10m 13s
7:	test: 0.9938146	best: 0.9939114 (6)	total: 4.72s	remaining: 9m 45s
8:	test: 0.9937911	best: 0.9939114 (6)	total: 5.27s	remaining: 9m 40s
9:	test: 0.9938813	best: 0.9939114 (6)	total: 6.07s	remaining: 10m
10:	test: 0.9939301	best: 0.9939301 (10)	total: 6.47s	remaining: 9m 41s
11:	test: 0.9939454	best: 0.9939454 (11)	total: 6.84s	remaining: 9m 23s
12:	test: 0.9939707	best: 0.9939707 (12)	total: 7.28s	remaining: 9m 13s
13:	test: 0.9940053	best: 0.9940053 (13)	total: 7.77s	remaining: 9m 7s
14:	test: 0.9940205

<catboost.core.CatBoostClassifier at 0x7fab71b3d950>

In [64]:
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

In [69]:
metrics = {
    "Accuracy": float(accuracy_score(y_test, y_pred)),
    "Precision": float(precision_score(y_test, y_pred, zero_division=0)),
    "Recall": float(recall_score(y_test, y_pred, zero_division=0)),
    "F1": float(f1_score(y_test, y_pred, zero_division=0)),
    "ROCAUC": float(roc_auc_score(y_test, y_pred_proba)),
    "GeometricMean": float(geometric_mean_score(y_test, y_pred)),
}

metrics

{'Accuracy': 0.9921053472688337,
 'Precision': 0.5644847699287103,
 'Recall': 0.926595744680851,
 'F1': 0.7015706806282722,
 'ROCAUC': 0.9945911128579368,
 'GeometricMean': 0.9591113860464155}

In [70]:
model.__class__.__name__

'CatBoostClassifier'