In [1]:
import polars as pl
import orjson
import datetime as dt
from pprint import pprint
from typing import Any, Dict, Hashable, Optional, List
import tqdm
from river import (
    base,
    compose,
    metrics,
    drift,
    forest,
    cluster,
    preprocessing,
    time_series,
    linear_model,
)

In [2]:
MINIO_HOST = "localhost"
MINIO_ENDPOINT = f"http://{MINIO_HOST}:9000"
MINIO_ACCESS_KEY = "minioadmin"
MINIO_SECRET_KEY = "minioadmin123"
PROJECT_NAME = "Transaction Fraud Detection"

In [3]:
DELTA_STORAGE_OPTIONS = {
    "AWS_ENDPOINT_URL": MINIO_ENDPOINT,
    "AWS_ACCESS_KEY_ID": MINIO_ACCESS_KEY,
    "AWS_SECRET_ACCESS_KEY": MINIO_SECRET_KEY,
    "AWS_REGION": "us-east-1",
    "AWS_S3_ALLOW_UNSAFE_RENAME": "true",
    "AWS_ALLOW_HTTP": "true",
}

In [4]:
DELTA_PATHS = {
    "Transaction Fraud Detection": "s3://lakehouse/delta/transaction_fraud_detection",
    "Estimated Time of Arrival": "s3://lakehouse/delta/estimated_time_of_arrival",
    "E-Commerce Customer Interactions": "s3://lakehouse/delta/e_commerce_customer_interactions",
    "Sales Forecasting": "s3://lakehouse/delta/sales_forecasting",
}

In [5]:
delta_path = DELTA_PATHS.get("Transaction Fraud Detection")

In [6]:
lf = pl.scan_delta(
    delta_path, 
    storage_options = DELTA_STORAGE_OPTIONS)

In [7]:
sql = pl.SQLContext()
sql.register("data", lf)

<SQLContext [tables:1] at 0x7f7176e1cad0>

In [8]:
result = sql.execute("SELECT * FROM data LIMIT 1000").collect()
result

transaction_id,user_id,timestamp,amount,currency,merchant_id,product_category,transaction_type,payment_method,location,ip_address,device_info,user_agent,account_age_days,cvv_provided,billing_address_match,is_fraud
str,str,str,f64,str,str,str,str,str,str,str,str,str,i32,bool,bool,i32
"""ee583be0-c39d-4c17-ab99-9e941b…","""1b02783e-492a-4820-9d44-46a343…","""2026-01-13T22:08:26.555237+00:…",35.62,"""GBP""","""merchant_53""","""digital_goods""","""deposit""","""debit_card""","""{""lat"":-3.685515,""lon"":61.1841…","""157.23.223.51""","""{""os"":""Windows"",""browser"":""Oth…","""Mozilla/5.0 (compatible; MSIE …",657,true,true,0
"""6ed1848c-5f04-40bb-a506-2faf06…","""92294c5a-26ed-429d-9ddd-e1f2ad…","""2026-01-13T22:08:26.927459+00:…",316.91,"""EUR""","""merchant_68""","""services""","""payment""","""crypto""","""{""lat"":-89.3953295,""lon"":-132.…","""132.39.71.49""","""{""os"":""macOS"",""browser"":""Firef…","""Mozilla/5.0 (Windows 95; gez-E…",56,true,true,0
"""a196f742-76e7-4db0-a56a-70f53e…","""c015b552-6839-4283-a90f-a72c1a…","""2026-01-13T22:08:27.188999+00:…",411.43,"""JPY""","""merchant_82""","""travel""","""transfer""","""credit_card""","""{""lat"":-7.958436,""lon"":-79.050…","""130.181.12.16""","""{""os"":""Windows"",""browser"":""Edg…","""Mozilla/5.0 (Windows NT 4.0) A…",1253,true,true,0
"""eecc2d9c-eaed-47e5-8da5-948cbf…","""511f44f9-a097-4ba4-8559-0ce40a…","""2026-01-13T22:08:27.666879+00:…",273.37,"""AUD""","""merchant_177""","""gambling""","""withdrawal""","""debit_card""","""{""lat"":52.5498685,""lon"":-132.0…","""91.149.136.147""","""{""os"":""Windows"",""browser"":""Chr…","""Mozilla/5.0 (compatible; MSIE …",46,true,true,0
"""341820da-331e-4e04-939b-8de4b1…","""61ced3b8-2b69-4816-9e5c-a15af8…","""2026-01-13T22:08:28.087394+00:…",416.58,"""JPY""","""merchant_122""","""other""","""purchase""","""paypal""","""{""lat"":-43.5955175,""lon"":95.16…","""157.170.210.33""","""{""os"":""iOS"",""browser"":""Chrome""…","""Mozilla/5.0 (compatible; MSIE …",648,true,true,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""b2f90039-18aa-413d-a875-147e95…","""e8541af0-495b-4683-a0f8-fad11d…","""2026-01-13T22:07:03.008008+00:…",442.4,"""GBP""","""merchant_163""","""other""","""purchase""","""bank_transfer""","""{""lat"":-60.220377,""lon"":149.42…","""17.92.179.160""","""{""os"":""Android"",""browser"":""Ope…","""Mozilla/5.0 (iPhone; CPU iPhon…",166,true,true,0
"""f0c15566-2d13-404d-a488-ba4e75…","""d56e3b7b-8e83-49d8-ae52-5ad2fd…","""2026-01-13T22:07:03.295945+00:…",806.9,"""BRL""","""merchant_144""","""digital_goods""","""deposit""","""credit_card""","""{""lat"":-67.9371995,""lon"":178.1…","""143.215.230.242""","""{""os"":""Other"",""browser"":""Other…","""Opera/8.99.(Windows NT 11.0; d…",1464,true,true,1
"""39dce626-694f-40e0-86c5-faf9f6…","""38d5fd98-7d9b-4810-9fbf-e071e8…","""2026-01-13T22:07:03.428810+00:…",91.3,"""GBP""","""merchant_34""","""groceries""","""deposit""","""bank_transfer""","""{""lat"":-8.958949,""lon"":153.096…","""4.199.222.149""","""{""os"":""iOS"",""browser"":""Edge""}""","""Mozilla/5.0 (X11; Linux x86_64…",1769,true,true,0
"""258a812c-537f-44d3-8d88-74f1c5…","""6b02a5fc-1789-4e3f-a3af-6c63f3…","""2026-01-13T22:07:03.585432+00:…",368.3,"""AUD""","""merchant_119""","""services""","""purchase""","""crypto""","""{""lat"":65.935948,""lon"":-17.505…","""205.20.247.117""","""{""os"":""macOS"",""browser"":""Edge""…","""Mozilla/5.0 (Windows NT 6.2) A…",1059,true,true,0


In [9]:
samples = result.to_dicts()
samples

[{'transaction_id': 'ee583be0-c39d-4c17-ab99-9e941b0a7606',
  'user_id': '1b02783e-492a-4820-9d44-46a343e6420a',
  'timestamp': '2026-01-13T22:08:26.555237+00:00',
  'amount': 35.62,
  'currency': 'GBP',
  'merchant_id': 'merchant_53',
  'product_category': 'digital_goods',
  'transaction_type': 'deposit',
  'payment_method': 'debit_card',
  'location': '{"lat":-3.685515,"lon":61.184191}',
  'ip_address': '157.23.223.51',
  'device_info': '{"os":"Windows","browser":"Other"}',
  'user_agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows 98; Trident/3.0)',
  'account_age_days': 657,
  'cvv_provided': True,
  'billing_address_match': True,
  'is_fraud': 0},
 {'transaction_id': '6ed1848c-5f04-40bb-a506-2faf06edd832',
  'user_id': '92294c5a-26ed-429d-9ddd-e1f2ada5d08f',
  'timestamp': '2026-01-13T22:08:26.927459+00:00',
  'amount': 316.91,
  'currency': 'EUR',
  'merchant_id': 'merchant_68',
  'product_category': 'services',
  'transaction_type': 'payment',
  'payment_method': 'crypto',
  'l

In [10]:
for sample in samples:
    pprint(sample)
    break

{'account_age_days': 657,
 'amount': 35.62,
 'billing_address_match': True,
 'currency': 'GBP',
 'cvv_provided': True,
 'device_info': '{"os":"Windows","browser":"Other"}',
 'ip_address': '157.23.223.51',
 'is_fraud': 0,
 'location': '{"lat":-3.685515,"lon":61.184191}',
 'merchant_id': 'merchant_53',
 'payment_method': 'debit_card',
 'product_category': 'digital_goods',
 'timestamp': '2026-01-13T22:08:26.555237+00:00',
 'transaction_id': 'ee583be0-c39d-4c17-ab99-9e941b0a7606',
 'transaction_type': 'deposit',
 'user_agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows 98; Trident/3.0)',
 'user_id': '1b02783e-492a-4820-9d44-46a343e6420a'}


In [11]:
class CustomOrdinalEncoder:
    """
    An incremental ordinal encoder that is picklable and processes dictionaries.
    Assigns a unique integer ID to each unique category encountered for each feature.
    """
    def __init__(self):
        # Dictionary to store mappings for each feature.
        # Keys are feature names (from input dictionary), values are dictionaries
        # mapping category value to integer ID for that feature.
        self._feature_mappings: Dict[Hashable, Dict[Any, int]] = {}
        # Dictionary to store the next available integer ID for each feature.
        # Keys are feature names, values are integers.
        self._feature_next_ids: Dict[Hashable, int] = {}
    def learn_one(self, x: Dict[Hashable, Any]):
        """
        Learns categories from a single sample dictionary.
        Iterates through the dictionary's items and learns each category value
        for its corresponding feature.
        Args:
            x: A dictionary representing a single sample.
               Keys are feature names, values are feature values.
               Assumes categorical features are present in this dictionary.
        """
        for feature_name, category_value in x.items():
            # Ensure the category value is hashable (dictionaries/lists are not)
            # You might need more sophisticated type checking or handling
            # if your input dictionaries contain complex unhashable types
            if not isinstance(category_value, Hashable):
                 print(f"Warning: Skipping unhashable value for feature '{feature_name}': {category_value}")
                 continue # Skip this feature for learning
            # If this is the first time we see this feature, initialize its mapping and counter
            if feature_name not in self._feature_mappings:
                self._feature_mappings[feature_name] = {}
                self._feature_next_ids[feature_name] = 0
            # Get the mapping and counter for this specific feature
            feature_map = self._feature_mappings[feature_name]
            feature_next_id = self._feature_next_ids[feature_name]
            # Check if the category value is already in the mapping for this feature
            if category_value not in feature_map:
                # If it's a new category for this feature, assign the next available ID
                feature_map[category_value] = feature_next_id
                # Increment the counter for the next new category for this feature
                self._feature_next_ids[feature_name] += 1
    def transform_one(self, x: Dict[Hashable, Any]) -> Dict[Hashable, int]:
        """
        Transforms categorical features in a single sample dictionary into integer IDs.
        Args:
            x: A dictionary representing a single sample.
               Keys are feature names, values are feature values.
        Returns:
            A new dictionary containing the transformed integer IDs for the
            categorical features that the encoder has seen. Features not
            seen by the encoder are excluded from the output dictionary.
        Raises:
            KeyError: If a feature is seen but a specific category value
                      within that feature has not been seen during learning.
                      You might want to add logic here to handle unseen categories
                      (e.g., return a default value like -1 or NaN for that feature).
        """
        transformed_sample: Dict[Hashable, int] = {}
        for feature_name, category_value in x.items():
            # Only attempt to transform features that the encoder has seen
            if feature_name in self._feature_mappings:
                feature_map = self._feature_mappings[feature_name]
                # Check if the category value for this feature has been seen
                if category_value in feature_map:
                    # Transform the category value using the feature's mapping
                    transformed_sample[feature_name] = feature_map[category_value]
                else:
                    # Handle unseen category values for a known feature
                    # By default, this will raise a KeyError as per the docstring.
                    # Example: return a placeholder value instead of raising error:
                    # transformed_sample[feature_name] = -1 # Or some other indicator
                    # print(f"Warning: Unseen category '{category_value}' for feature '{feature_name}' during transform.")
                    # Or raise the error explicitly:
                    raise KeyError(f"Unseen category '{category_value}' for feature '{feature_name}' during transform.")
            # Features not in self._feature_mappings are ignored in the output.
            # If you need to include them (e.g., original numerical features),
            # you would copy them over here. This encoder only outputs encoded features.
        return transformed_sample
    def get_feature_mappings(self) -> Dict[Hashable, Dict[Any, int]]:
        """Returns the current mappings for all features."""
        return self._feature_mappings
    def get_feature_next_ids(self) -> Dict[Hashable, int]:
        """Returns the next available IDs for all features."""
        return self._feature_next_ids
    def __repr__(self) -> str:
        """String representation of the encoder."""
        num_features = len(self._feature_mappings)
        feature_details = ", ".join([f"{name}: {len(mapping)} categories" for name, mapping in self._feature_mappings.items()])
        return f"CustomPicklableOrdinalEncoder(features={num_features} [{feature_details}])"
    


In [12]:
class DictImputer(base.Transformer):
    """
    Imputes missing values (None or missing keys) for specified features in a dictionary.

    Parameters
    ----------
    on
        List of feature names to impute.
    fill_value
        The value to use for imputation.
    """
    def __init__(self, on: list, fill_value):
        self.on = on
        self.fill_value = fill_value
    def transform_one(self, x: dict):
        x_transformed = x.copy()
        for feature in self.on:
            if x_transformed.get(feature) is None:
                x_transformed[feature] = self.fill_value
        return x_transformed

In [13]:
def extract_device_info(x):
    x_ = x['device_info']
    # Parse JSON string if coming from Delta Lake
    if isinstance(x_, str):
        x_ = orjson.loads(x_)
    return {
        'os': x_['os'],
        'browser': x_['browser'],
    }

def extract_timestamp_info(x):
    x_ = dt.datetime.strptime(
        x['timestamp'],
        "%Y-%m-%dT%H:%M:%S.%f%z")
    return {
        'year': x_.year,
        'month': x_.month,
        'day': x_.day,
        'hour': x_.hour,
        'minute': x_.minute,
        'second': x_.second
    }

def extract_coordinates(x):
    x_ = x['location']
    # Parse JSON string if coming from Delta Lake
    if isinstance(x_, str):
        x_ = orjson.loads(x_)
    return {
        'lat': x_['lat'],
        'lon': x_['lon'],
    }

In [14]:
def process_sample(x, encoders, project_name):
    """Process a single sample for River incremental learning."""
    if project_name == "Transaction Fraud Detection":
        pipe1 = compose.Select(
            "amount",
            "account_age_days",
            "cvv_provided",
            "billing_address_match"
        )
        pipe1.learn_one(x)
        x1 = pipe1.transform_one(x)
        pipe2 = compose.Select(
            "currency",
            "merchant_id",
            "payment_method",
            "product_category",
            "transaction_type",
        )
        pipe2.learn_one(x)
        x_pipe_2 = pipe2.transform_one(x)
        pipe3a = compose.Select("device_info")
        pipe3a.learn_one(x)
        x_pipe_3 = pipe3a.transform_one(x)
        pipe3b = compose.FuncTransformer(extract_device_info)
        pipe3b.learn_one(x_pipe_3)
        x_pipe_3 = pipe3b.transform_one(x_pipe_3)
        pipe4a = compose.Select("timestamp")
        pipe4a.learn_one(x)
        x_pipe_4 = pipe4a.transform_one(x)
        pipe4b = compose.FuncTransformer(extract_timestamp_info)
        pipe4b.learn_one(x_pipe_4)
        x_pipe_4 = pipe4b.transform_one(x_pipe_4)
        x_to_encode = x_pipe_2 | x_pipe_3 | x_pipe_4
        encoders["ordinal_encoder"].learn_one(x_to_encode)
        x2 = encoders["ordinal_encoder"].transform_one(x_to_encode)
        return x1 | x2, {"ordinal_encoder": encoders["ordinal_encoder"]}
    elif project_name == "Estimated Time of Arrival":
        pipe1 = compose.Select(
            'estimated_distance_km',
            'temperature_celsius',
            'hour_of_day',
            'driver_rating',
            'initial_estimated_travel_time_seconds',
            'debug_traffic_factor',
            'debug_weather_factor',
            'debug_incident_delay_seconds',
            'debug_driver_factor'
        )
        pipe1.learn_one(x)
        x1 = pipe1.transform_one(x)
        pipe2 = compose.Select(
            'driver_id',
            'vehicle_id',
            'weather',
            'vehicle_type'
        )
        pipe2.learn_one(x)
        x_pipe_2 = pipe2.transform_one(x)
        pipe3a = compose.Select(
            "timestamp",
        )
        pipe3a.learn_one(x)
        x_pipe_3 = pipe3a.transform_one(x)
        pipe3b = compose.FuncTransformer(
            extract_timestamp_info,
        )
        pipe3b.learn_one(x_pipe_3)
        x_pipe_3 = pipe3b.transform_one(x_pipe_3)
        x_to_encode = x_pipe_2 | x_pipe_3
        encoders["ordinal_encoder"].learn_one(x_to_encode)
        x2 = encoders["ordinal_encoder"].transform_one(x_to_encode)
        return x1 | x2, {
            "ordinal_encoder": encoders["ordinal_encoder"]
        }
    elif project_name == "E-Commerce Customer Interactions":
        pipe1 = compose.Select(
            'price',
            'quantity',
            'session_event_sequence',
            'time_on_page_seconds'
        )
        pipe1.learn_one(x)
        x1 = pipe1.transform_one(x)
        pipe2 = compose.Select(
            'event_type',
            'product_category',
            'product_id',
            'referrer_url',
        )
        pipe2.learn_one(x)
        x_pipe_2 = pipe2.transform_one(x)
        pipe3a = compose.Select(
            "device_info"
        )
        pipe3a.learn_one(x)
        x_pipe_3 = pipe3a.transform_one(x)
        pipe3b = compose.FuncTransformer(
            extract_device_info,
        )
        pipe3b.learn_one(x_pipe_3)
        x_pipe_3 = pipe3b.transform_one(x_pipe_3)
        pipe4a = compose.Select(
            "timestamp",
        )
        pipe4a.learn_one(x)
        x_pipe_4 = pipe4a.transform_one(x)
        pipe4b = compose.FuncTransformer(
            extract_timestamp_info,
        )
        pipe4b.learn_one(x_pipe_4)
        x_pipe_4 = pipe4b.transform_one(x_pipe_4)
        pipe5a = compose.Select(
            "location",
        )
        pipe5a.learn_one(x)
        x_pipe_5 = pipe5a.transform_one(x)
        pipe5b = compose.FuncTransformer(
            extract_coordinates,
        )
        pipe5b.learn_one(x_pipe_5)
        x_pipe_5 = pipe5b.transform_one(x_pipe_5)
        x_to_prep = x1 | x_pipe_2 | x_pipe_3 | x_pipe_4 | x_pipe_5
        x_to_prep = DictImputer(
            fill_value = False, 
            on = list(x_to_prep.keys())).transform_one(
                x_to_prep)
        numerical_features = [
            'price',
            'session_event_sequence',
            'time_on_page_seconds',
            'quantity'
        ]
        categorical_features = [
            'event_type',
            'product_category',
            'product_id',
            'referrer_url',
            'os',
            'browser',
            'year',
            'month',
            'day',
            'hour',
            'minute',
            'second'
        ]
        num_pipe = compose.Select(*numerical_features)
        num_pipe.learn_one(x_to_prep)
        x_num = num_pipe.transform_one(x_to_prep)
        cat_pipe = compose.Select(*categorical_features)
        cat_pipe.learn_one(x_to_prep)
        x_cat = cat_pipe.transform_one(x_to_prep)
        encoders["standard_scaler"].learn_one(x_num)
        x_scaled = encoders["standard_scaler"].transform_one(x_num)
        encoders["feature_hasher"].learn_one(x_cat)
        x_hashed = encoders["feature_hasher"].transform_one(x_cat)
        return x_scaled | x_hashed, {
            "standard_scaler": encoders["standard_scaler"], 
            "feature_hasher": encoders["feature_hasher"]
        }
    elif project_name == "Sales Forecasting":
        pipe1 = compose.Select(
            'concept_drift_stage',
            'day_of_week',
            'is_holiday',
            'is_promotion_active',
            'month',
            #'total_sales_amount',
            'unit_price'
        )
        pipe1.learn_one(x)
        x1 = pipe1.transform_one(x)
        pipe2a = compose.Select(
            "timestamp",
        )
        pipe2a.learn_one(x)
        x_pipe_2 = pipe2a.transform_one(x)
        pipe2b = compose.FuncTransformer(
            extract_timestamp_info,
        )
        pipe2b.learn_one(x_pipe_2)
        x2 = pipe2b.transform_one(x_pipe_2)
        pipe3a = compose.Select(
            'product_id',
            'promotion_id',
            'store_id'
        )
        pipe3a.learn_one(x)
        x3 = pipe3a.transform_one(x)
        x_to_process = x1 | x2 | x3
        numerical_features = [
            'unit_price',
            #'total_sales_amount',
        ]
        categorical_features = [
            'is_promotion_active',
            'is_holiday',
            'day_of_week',
            'concept_drift_stage',
            'year',
            'month',
            'day',
            #'hour',
            #'minute',
            #'second',
            'product_id',
            'promotion_id',
            'store_id',
        ]
        pipe_num = compose.Select(*numerical_features)
        pipe_num.learn_one(x_to_process)
        x_num = pipe_num.transform_one(x_to_process)
        pipe_cat = compose.Select(*categorical_features)
        pipe_cat.learn_one(x_to_process)
        x_cat = pipe_cat.transform_one(x_to_process)
        encoders["standard_scaler"].learn_one(x_num)
        x_num = encoders["standard_scaler"].transform_one(x_num)
        encoders["one_hot_encoder"].learn_one(x_cat)
        x_cat = encoders["one_hot_encoder"].transform_one(x_cat)
        return x_num | x_cat, {
            "one_hot_encoder": encoders["one_hot_encoder"],
            "standard_scaler": encoders["standard_scaler"],
        }

In [15]:
def _create_default_model(project_name):
    """Create default model based on project type.

    Models are configured based on River ML documentation and best practices.
    All parameters are documented with their River ML defaults and rationale.

    See: https://riverml.xyz/latest/
    """
    if project_name == "Transaction Fraud Detection":
        # =================================================================
        # ARFClassifier - Adaptive Random Forest Classifier
        # For fraud detection with concept drift handling
        # =================================================================
        # OLD CONFIGURATION:
        # return forest.ARFClassifier(
        #     n_models = 10,
        #     drift_detector = drift.ADWIN(),
        #     warning_detector = drift.ADWIN(),
        #     metric = metrics.ROCAUC(),
        #     max_features = "sqrt",
        #     lambda_value = 6,
        #     seed = 42
        # )

        # CONFIGURATION based on River ML documentation:
        # Reference: https://riverml.xyz/latest/api/forest/ARFClassifier/
        # Reference: https://riverml.xyz/latest/examples/imbalanced-learning/
        #
        # - n_models=10: Default number of trees in ensemble
        # - max_features="sqrt": Default, sqrt of features per split
        # - lambda_value=6: Default Leveraging Bagging parameter
        # - metric=ROCAUC(): RECOMMENDED by River for imbalanced fraud detection
        #   (River's imbalanced-learning guide uses ROCAUC for fraud detection)
        # - disable_weighted_vote=False: Enable weighted voting for better accuracy
        # - drift_detector ADWIN(delta=0.002): Default sensitivity (0.002)
        # - warning_detector ADWIN(delta=0.01): Default warning sensitivity
        # - grace_period=50: Default observations between split attempts
        # - max_depth=None: Default, unlimited tree depth
        # - split_criterion="info_gain": Default, information gain criterion
        # - delta=0.01: Default allowed error in split decision
        # - tau=0.05: Default tie-breaking threshold
        # - leaf_prediction="nba": Default, Naive Bayes Adaptive
        # - nb_threshold=0: Default, enable NB immediately
        # - binary_split=False: Default, allow multi-way splits
        # - min_branch_fraction=0.01: Default minimum data per branch
        # - max_share_to_split=0.99: Default majority class proportion
        # - max_size=100.0: Default max memory in MiB
        # - memory_estimate_period=2000000: Default instances between memory checks
        # - merit_preprune=True: Default merit-based pre-pruning
        return forest.ARFClassifier(
            n_models = 10,
            max_features = "sqrt",
            lambda_value = 6,
            metric = metrics.ROCAUC(),
            disable_weighted_vote = False,
            drift_detector = drift.ADWIN(delta = 0.002),
            warning_detector = drift.ADWIN(delta = 0.01),
            grace_period = 50,
            max_depth = None,
            split_criterion = "info_gain",
            delta = 0.01,
            tau = 0.05,
            leaf_prediction = "nba",
            nb_threshold = 0,
            nominal_attributes = None,
            binary_split = False,
            min_branch_fraction = 0.01,
            max_share_to_split = 0.99,
            max_size = 100.0,
            memory_estimate_period = 2000000,
            stop_mem_management = False,
            remove_poor_attrs = False,
            merit_preprune = True,
            seed = 42,
        )
    elif project_name == "Estimated Time of Arrival":
        # =================================================================
        # ARFRegressor - Adaptive Random Forest Regressor
        # For ETA prediction with continuous drift handling
        # =================================================================
        # OLD CONFIGURATION:
        # return forest.ARFRegressor(
        #     n_models = 10,
        #     drift_detector = drift.ADWIN(),
        #     warning_detector = drift.ADWIN(),
        #     metric = metrics.RMSE(),
        #     max_features = "sqrt",
        #     lambda_value = 6,
        #     seed = 42
        # )

        # CONFIGURATION based on River ML documentation:
        # Reference: https://riverml.xyz/latest/api/forest/ARFRegressor/
        #
        # - n_models=10: Default number of trees
        # - max_features="sqrt": Default feature selection
        # - aggregation_method="median": Default, robust to outliers
        # - lambda_value=6: Default Leveraging Bagging parameter
        # - metric=MAE(): Using MAE as it's common for ETA prediction
        # - disable_weighted_vote=True: Default for regressor
        # - drift_detector ADWIN(delta=0.002): Default sensitivity
        # - warning_detector ADWIN(delta=0.01): Default warning sensitivity
        # - grace_period=50: Default observations between split attempts
        # - max_depth=None: Default unlimited depth
        # - delta=0.01: Default allowed error
        # - tau=0.05: Default tie-breaking threshold
        # - leaf_prediction="adaptive": Default, dynamically chooses mean/model
        # - model_selector_decay=0.95: Default decay for leaf model selection
        # - min_samples_split=5: Default minimum samples for split
        # - binary_split=False: Default multi-way splits
        # - max_size=500.0: Default max memory in MiB
        return forest.ARFRegressor(
            n_models=10,
            max_features="sqrt",
            aggregation_method="median",
            lambda_value=6,
            metric=metrics.MAE(),
            disable_weighted_vote=True,
            drift_detector=drift.ADWIN(delta=0.002),
            warning_detector=drift.ADWIN(delta=0.01),
            grace_period=50,
            max_depth=None,
            delta=0.01,
            tau=0.05,
            leaf_prediction="adaptive",
            leaf_model=None,
            model_selector_decay=0.95,
            min_samples_split=5,
            binary_split=False,
            max_size=500.0,
            memory_estimate_period=2000000,
            nominal_attributes=None,
            seed=42,
        )
    elif project_name == "E-Commerce Customer Interactions":
        # =================================================================
        # DBSTREAM - Density-Based Stream Clustering
        # For customer behavior clustering with arbitrary shapes
        # =================================================================
        # OLD CONFIGURATION:
        # return cluster.DBSTREAM(
        #     clustering_threshold = 1.0,
        #     fading_factor = 0.01,
        #     cleanup_interval = 2,
        # )

        # CONFIGURATION based on River ML documentation example:
        # Reference: https://riverml.xyz/latest/api/cluster/DBSTREAM/
        #
        # The River documentation provides this exact example configuration:
        # - clustering_threshold=1.5: Micro-cluster radius
        # - fading_factor=0.05: Historical data importance (must be > 0)
        # - cleanup_interval=4: Time between cleanup processes
        # - intersection_factor=0.5: Cluster overlap ratio for connectivity
        # - minimum_weight=1.0: Threshold for non-noisy cluster classification
        return cluster.DBSTREAM(
            clustering_threshold=1.5,
            fading_factor=0.05,
            cleanup_interval=4,
            intersection_factor=0.5,
            minimum_weight=1.0,
        )
    elif project_name == "Sales Forecasting":
        # =================================================================
        # SNARIMAX - Seasonal Non-linear Auto-Regressive Integrated
        # Moving Average with eXogenous inputs
        # For sales forecasting with weekly seasonality
        # =================================================================
        # OLD CONFIGURATION:
        # regressor_snarimax = linear_model.PARegressor(
        #     C = 0.01,
        #     mode = 1)
        # return time_series.SNARIMAX(
        #     p = 2,
        #     d = 1,
        #     q = 1,
        #     m = 7,
        #     sp = 1,
        #     sd = 0,
        #     sq = 1,
        #     regressor = regressor_snarimax
        # )

        # CONFIGURATION based on River ML documentation:
        # Reference: https://riverml.xyz/latest/api/time-series/SNARIMAX/
        # Reference: https://riverml.xyz/latest/api/linear-model/PARegressor/
        #
        # SNARIMAX parameters for weekly sales data:
        # - p=7: Past 7 days of target values (full week)
        # - d=1: First-order differencing for trend removal
        # - q=2: Past error terms for noise handling
        # - m=7: Weekly seasonality period
        # - sp=1: Seasonal autoregressive order
        # - sd=1: Seasonal differencing (recommended for seasonal data)
        # - sq=1: Seasonal moving average order
        #
        # PARegressor parameters (defaults from River docs):
        # - C=1.0: Default regularization strength
        # - mode=1: Default algorithm mode
        # - eps=0.1: Default tolerance parameter
        # - learn_intercept=True: Default bias learning
        regressor_snarimax = linear_model.PARegressor(
            C=1.0,
            mode=1,
            eps=0.1,
            learn_intercept=True,
        )
        return time_series.SNARIMAX(
            p=7,
            d=1,
            q=2,
            m=7,
            sp=1,
            sd=1,
            sq=1,
            regressor=regressor_snarimax,
        )
    else:
        raise ValueError(f"Unknown project: {project_name}")

In [16]:
def _create_default_encoders(project_name):
    """Create default encoders based on project type."""
    if project_name in ["Transaction Fraud Detection", "Estimated Time of Arrival"]:
        return {"ordinal_encoder": CustomOrdinalEncoder()}
    elif project_name == "E-Commerce Customer Interactions":
        return {
            "standard_scaler": preprocessing.StandardScaler(),
            "feature_hasher": preprocessing.FeatureHasher()
        }
    elif project_name == "Sales Forecasting":
        return {
            "one_hot_encoder": preprocessing.OneHotEncoder(),
            "standard_scaler": preprocessing.StandardScaler(),
        }
    else:
        raise ValueError(f"Unknown project: {project_name}")

In [20]:
dir(metrics)

['Accuracy',
 'AdjustedMutualInfo',
 'AdjustedRand',
 'BalancedAccuracy',
 'ClassificationReport',
 'CohenKappa',
 'Completeness',
 'ConfusionMatrix',
 'CrossEntropy',
 'F1',
 'FBeta',
 'FowlkesMallows',
 'GeometricMean',
 'Homogeneity',
 'Jaccard',
 'LogLoss',
 'MAE',
 'MAPE',
 'MCC',
 'MSE',
 'MacroF1',
 'MacroFBeta',
 'MacroJaccard',
 'MacroPrecision',
 'MacroRecall',
 'MicroF1',
 'MicroFBeta',
 'MicroJaccard',
 'MicroPrecision',
 'MicroRecall',
 'MultiFBeta',
 'MutualInfo',
 'NormalizedMutualInfo',
 'Precision',
 'R2',
 'RMSE',
 'RMSLE',
 'ROCAUC',
 'Rand',
 'Recall',
 'RollingROCAUC',
 'SMAPE',
 'Silhouette',
 'VBeta',
 'WeightedF1',
 'WeightedFBeta',
 'WeightedJaccard',
 'WeightedPrecision',
 'WeightedRecall',
 '__all__',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 'accuracy',
 'annotations',
 'balanced_accuracy',
 'base',
 'confusion',
 'cross_entropy',
 'efficient_rollingrocauc',
 'expected_mutu

In [37]:
model = _create_default_model(PROJECT_NAME)
encoders = _create_default_encoders(PROJECT_NAME)

# =============================================================================
# RIVER ML METRICS CONFIGURATION FOR TRANSACTION FRAUD DETECTION
# =============================================================================
# Research-based optimal configuration for real-time TFD training.
# Sources:
#   - River ML Documentation: https://riverml.xyz/dev/api/metrics/
#   - Fraud Detection Best Practices: https://www.cesarsotovalero.net/blog/
#     evaluation-metrics-for-real-time-financial-fraud-detection-ml-models.html
#   - F-Beta Score Guide: https://machinelearningmastery.com/fbeta-measure-for-ml
# =============================================================================

# Shared confusion matrix for efficiency (multiple metrics share same CM)
# BENEFIT: Reduces computation, metrics share TP/TN/FP/FN counts
shared_cm = metrics.ConfusionMatrix()

# -----------------------------------------------------------------------------
# CLASS-BASED METRICS (use predict_one - class labels)
# -----------------------------------------------------------------------------
# These metrics compare predicted class (0/1) vs actual class (0/1)
# All use shared_cm for efficiency
# pos_val=1 means fraud (is_fraud=1) is the positive class
# -----------------------------------------------------------------------------
class_metric_classes = {
    # PRIMARY METRICS (most important for fraud detection)
    "Recall": metrics.Recall,       # TP / (TP + FN) - catch rate of fraud
    "Precision": metrics.Precision, # TP / (TP + FP) - accuracy of fraud alerts
    "F1": metrics.F1,               # Harmonic mean (beta=1, balanced)
    "FBeta": metrics.FBeta,         # Weighted harmonic mean (configurable beta)
    # SECONDARY METRICS (additional insights)
    "Accuracy": metrics.Accuracy,           # Overall correct predictions
    "BalancedAccuracy": metrics.BalancedAccuracy,  # Mean recall per class
    "MCC": metrics.MCC,             # Matthews Correlation Coefficient
    "GeometricMean": metrics.GeometricMean, # sqrt(TPR * TNR) - imbalance robust
    "CohenKappa": metrics.CohenKappa,       # Agreement beyond chance
    "Jaccard": metrics.Jaccard,     # TP / (TP + FP + FN) - IoU for positive class
}
class_metric_args = {
    # PRIMARY METRICS
    "Recall": {"cm": shared_cm, "pos_val": 1},
    "Precision": {"cm": shared_cm, "pos_val": 1},
    "F1": {"cm": shared_cm, "pos_val": 1},
    # FBeta with beta=2.0: Industry standard for fraud detection
    # beta=2.0 weights Recall 2x more than Precision (prioritize catching fraud)
    # Alternative: beta=0.5 if customer experience (fewer false positives) is priority
    # Reference: https://www.analyticsvidhya.com/blog/2024/12/f-beta-score/
    "FBeta": {"beta": 2.0, "cm": shared_cm, "pos_val": 1},
    # SECONDARY METRICS
    "Accuracy": {"cm": shared_cm},
    "BalancedAccuracy": {"cm": shared_cm},
    # MCC: One of best metrics for imbalanced data per research
    # Reference: https://link.springer.com/article/10.1186/s12864-019-6413-7
    "MCC": {"cm": shared_cm, "pos_val": 1},
    "GeometricMean": {"cm": shared_cm},
    "CohenKappa": {"cm": shared_cm},
    "Jaccard": {"cm": shared_cm, "pos_val": 1},
}

# -----------------------------------------------------------------------------
# PROBABILITY-BASED METRICS (use predict_proba_one - probabilities)
# -----------------------------------------------------------------------------
# These metrics use predicted probability of fraud (0.0 to 1.0)
# Better for threshold optimization and model calibration assessment
# -----------------------------------------------------------------------------
proba_metric_classes = {
    "ROCAUC": metrics.ROCAUC,
    "RollingROCAUC": metrics.RollingROCAUC,
    "LogLoss": metrics.LogLoss,
}
proba_metric_args = {
    # ROCAUC: Approximation of true ROC AUC for streaming data
    # n_thresholds: Higher = more accurate but more memory/CPU
    # Default=10, River example uses 20, production recommended=50
    # Reference: https://riverml.xyz/dev/api/metrics/ROCAUC/
    "ROCAUC": {"n_thresholds": 50, "pos_val": 1},
    
    # RollingROCAUC: Windowed ROCAUC for concept drift detection
    # window_size: Number of recent samples to consider
    # Default=1000, but for rare fraud (~1-5%), need larger window
    # With window=5000 and 2% fraud rate, expect ~100 fraud samples
    # This provides stable AUC while detecting recent drift
    "RollingROCAUC": {"window_size": 5000, "pos_val": 1},
    
    # LogLoss: Measures probability calibration (lower = better)
    # 0.693 is random baseline, target < 0.1 for good calibration
    # No configurable parameters
    "LogLoss": {},
}

# -----------------------------------------------------------------------------
# MATRIX/REPORT METRICS (no .get(), display separately)
# -----------------------------------------------------------------------------
# These metrics don't return a single scalar value
# ConfusionMatrix: Shows TP, TN, FP, FN distribution
# ClassificationReport: Comprehensive per-class statistics
# -----------------------------------------------------------------------------
report_metric_classes = {
    "ConfusionMatrix": metrics.ConfusionMatrix,
    "ClassificationReport": metrics.ClassificationReport,
}
report_metric_args = {
    # Separate ConfusionMatrix for report (not shared_cm)
    "ConfusionMatrix": {},
    # decimals=4: More precision for monitoring subtle changes
    # Uses shared_cm for efficiency with class_metrics
    "ClassificationReport": {"decimals": 4, "cm": shared_cm},
}

# =============================================================================
# INSTANTIATE ALL METRICS
# =============================================================================
class_metrics = {
    name: class_metric_classes[name](**class_metric_args[name])
    for name in class_metric_classes
}
proba_metrics = {
    name: proba_metric_classes[name](**proba_metric_args[name])
    for name in proba_metric_classes
}
report_metrics = {
    name: report_metric_classes[name](**report_metric_args[name])
    for name in report_metric_classes
}

# =============================================================================
# TRAINING LOOP
# =============================================================================
for sample in tqdm.tqdm(samples):
    x, encoders = process_sample(sample, encoders, PROJECT_NAME)
    y = sample['is_fraud']
    # Update the model (learn from this sample)
    model.learn_one(x, y)
    # Get predictions AFTER learning (prequential evaluation)
    prediction = model.predict_one(x)
    prediction_proba = model.predict_proba_one(x)
    # Extract probability of positive class (fraud=1)
    # Handle case where model returns None or empty dict
    proba_positive = prediction_proba.get(1, 0.0) if prediction_proba else 0.0
    # Update class-based metrics (use class prediction: 0 or 1)
    for metric in class_metrics.values():
        metric.update(y, prediction)
    # Update probability-based metrics (use probability: 0.0 to 1.0)
    for metric in proba_metrics.values():
        metric.update(y, proba_positive)
    # Update report metrics (use class prediction)
    for metric in report_metrics.values():
        metric.update(y, prediction)

# =============================================================================
# COLLECT SCALAR METRICS FOR LOGGING
# =============================================================================
# Combine class + proba metrics (all have .get() method)
metrics_to_log = {}
for name, metric in class_metrics.items():
    metrics_to_log[name] = metric.get()
for name, metric in proba_metrics.items():
    metrics_to_log[name] = metric.get()

metrics_to_log

100%|██████████| 1000/1000 [00:19<00:00, 52.13it/s]


{'Recall': 0.4166666666666667,
 'Precision': 1.0,
 'F1': 0.5882352941176471,
 'FBeta': 0.4716981132075472,
 'Accuracy': 0.993,
 'BalancedAccuracy': 0.7083333333333334,
 'MCC': 0.6432226235010586,
 'GeometricMean': np.float64(0.6454972243679028),
 'CohenKappa': 0.5853080568720405,
 'Jaccard': 0.4166666666666667,
 'ROCAUC': np.float64(0.9017375168690958),
 'RollingROCAUC': 0.9224021592442645,
 'LogLoss': 0.03484638887493995}

In [38]:
# Display report metrics (ConfusionMatrix, ClassificationReport)
print("Confusion Matrix:")
print(report_metrics["ConfusionMatrix"])
print("\nClassification Report:")
print(report_metrics["ClassificationReport"])

Confusion Matrix:
    0     1  
0   988   0  
1     7   5  

Classification Report:
           Precision   Recall      F1         Support  
                                                       
   False    99.2965%   100.0000%   99.6470%     10868  
    True   100.0000%    41.6667%   58.8235%       132  
                                                       
   Macro    99.6482%    70.8333%   79.2353%            
   Micro    99.3000%    99.3000%   99.3000%            
Weighted    99.3049%    99.3000%   99.1571%            

                   99.3000% accuracy                   
