In [18]:
import datetime as dt
import os
from pprint import pprint
from typing import Any, Dict, Hashable, Optional, List

import orjson
import polars as pl
import tqdm
from river import (
    base,
    compose,
    metrics,
    drift,
    forest,
    cluster,
    preprocessing,
    time_series,
    linear_model,
)

In [19]:
# Connection settings for the MinIO object store.
# Each value can be overridden with an environment variable of the same name;
# the fallbacks are the previous hard-coded local-development values, so the
# notebook behaves exactly as before when no variable is set.
MINIO_HOST = os.environ.get("MINIO_HOST", "localhost")
MINIO_ENDPOINT = f"http://{MINIO_HOST}:9000"
# NOTE: the default credentials below are for local development only.
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minioadmin")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minioadmin123")
# NOTE(review): the data-loading cell below reads the "Estimated Time of Arrival"
# table, not this project — confirm which project this notebook targets.
PROJECT_NAME = "Transaction Fraud Detection"

In [20]:
# Storage options handed to `pl.scan_delta` so the Delta reader reaches MinIO
# through its S3-compatible API (plain-HTTP endpoint, static credentials).
DELTA_STORAGE_OPTIONS = dict(
    AWS_ENDPOINT_URL=MINIO_ENDPOINT,
    AWS_ACCESS_KEY_ID=MINIO_ACCESS_KEY,
    AWS_SECRET_ACCESS_KEY=MINIO_SECRET_KEY,
    AWS_REGION="us-east-1",
    AWS_S3_ALLOW_UNSAFE_RENAME="true",
    AWS_ALLOW_HTTP="true",
)

In [21]:
# Delta table location for each project, keyed by the project's display name.
# Every table lives under the same s3://lakehouse/delta/ prefix.
DELTA_PATHS = {
    project: f"s3://lakehouse/delta/{table}"
    for project, table in (
        ("Transaction Fraud Detection", "transaction_fraud_detection"),
        ("Estimated Time of Arrival", "estimated_time_of_arrival"),
        ("E-Commerce Customer Interactions", "e_commerce_customer_interactions"),
        ("Sales Forecasting", "sales_forecasting"),
    )
}

In [22]:
# Index directly instead of using .get(): a misspelled project name then fails
# here with a KeyError rather than passing None on to the Delta reader.
delta_path = DELTA_PATHS["Estimated Time of Arrival"]

In [23]:
# Build a lazy scan of the Delta table at `delta_path`, using the MinIO storage options.
lf = pl.scan_delta(delta_path, storage_options=DELTA_STORAGE_OPTIONS)

In [24]:
# Create a Polars SQL context and expose the lazy Delta scan as table "data".
sql = pl.SQLContext()
# register() is the cell's last expression, so its return value (the SQLContext) is displayed.
sql.register("data", lf)

<SQLContext [tables:1] at 0x7f7bae29fd90>

In [25]:
# Fetch at most 1000 rows from the registered "data" table and collect the query result.
result = sql.execute("SELECT * FROM data LIMIT 1000").collect()
# Bare expression: renders the collected table as the cell output.
result

trip_id,driver_id,vehicle_id,timestamp,origin,destination,estimated_distance_km,weather,temperature_celsius,day_of_week,hour_of_day,driver_rating,vehicle_type,initial_estimated_travel_time_seconds,simulated_actual_travel_time_seconds,debug_traffic_factor,debug_weather_factor,debug_incident_delay_seconds,debug_driver_factor
str,str,str,str,str,str,f64,str,f64,i32,i32,f64,str,i32,i32,f64,f64,i32,f64
"""6c6429de-5d72-4a6e-9843-5e71d6…","""driver_2502""","""vehicle_977""","""2025-12-29T21:45:01.446634+00:…","""{""lat"":29.828526,""lon"":-95.245…","""{""lat"":29.975473,""lon"":-95.045…",25.29,"""Clouds""",19.7,0,21,3.8,"""Motorcycle""",2501,2605,1.1,1.0,0,1.03
"""1e3da605-e251-43a9-b8ba-f1bbb4…","""driver_1647""","""vehicle_182""","""2025-12-29T21:45:01.569208+00:…","""{""lat"":30.0287,""lon"":-95.73198…","""{""lat"":29.764542,""lon"":-95.390…",44.12,"""Clear""",23.3,0,21,4.1,"""Sedan""",4103,4530,1.1,1.0,0,1.02
"""86cb0ede-7f6b-4a79-997b-c52019…","""driver_4068""","""vehicle_122""","""2025-12-29T21:45:02.049161+00:…","""{""lat"":29.788907,""lon"":-95.570…","""{""lat"":29.552983,""lon"":-95.485…",27.49,"""Thunderstorm""",24.9,0,21,4.8,"""Hatchback""",2652,2385,0.96,1.0,0,0.98
"""72def807-01a9-48b9-9db5-b752bd…","""driver_3653""","""vehicle_897""","""2025-12-29T21:45:03.019405+00:…","""{""lat"":29.812463,""lon"":-95.705…","""{""lat"":29.922032,""lon"":-95.149…",55.0,"""Fog""",22.9,0,21,4.8,"""Motorcycle""",4867,10144,1.39,1.49,0,0.98
"""a32c3a1d-13ad-4674-a29b-429a6e…","""driver_2955""","""vehicle_221""","""2025-12-29T21:45:03.755616+00:…","""{""lat"":29.97569,""lon"":-95.6535…","""{""lat"":30.091343,""lon"":-95.688…",13.3,"""Clouds""",24.3,0,21,4.7,"""Hatchback""",1307,1094,0.93,1.0,0,0.99
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""9b47160c-021e-4fa2-b5ba-edef91…","""driver_3349""","""vehicle_468""","""2026-01-14T13:17:09.863182+00:…","""{""lat"":29.718243,""lon"":-95.244…","""{""lat"":29.964426,""lon"":-95.479…",35.58,"""Clear""",21.7,2,13,4.0,"""Van""",2928,3775,1.16,1.0,0,1.02
"""9fa5bff4-c591-4f12-916c-a9448a…","""driver_1870""","""vehicle_622""","""2026-01-14T13:17:10.441598+00:…","""{""lat"":29.794235,""lon"":-95.516…","""{""lat"":29.591784,""lon"":-95.512…",22.52,"""Clear""",20.3,2,13,4.3,"""Motorcycle""",1917,1857,0.92,1.0,0,1.01
"""a14664b7-3e58-4e50-a7ee-b68ac1…","""driver_4622""","""vehicle_464""","""2026-01-14T13:17:10.663561+00:…","""{""lat"":29.711525,""lon"":-95.744…","""{""lat"":29.5236,""lon"":-95.48543…",32.64,"""Fog""",20.5,2,13,4.1,"""Hatchback""",2845,4133,1.38,1.0,0,1.02
"""11d37598-3fe6-492b-9422-98273c…","""driver_4606""","""vehicle_758""","""2026-01-14T13:17:10.882418+00:…","""{""lat"":29.654642,""lon"":-95.552…","""{""lat"":30.051371,""lon"":-95.686…",45.95,"""Clear""",24.9,2,13,3.9,"""Hatchback""",3942,5672,1.33,1.0,0,1.03


In [26]:
# Convert the collected rows into a list of dictionaries (one dict per row),
# the per-sample shape consumed by the dictionary-based helpers below.
samples = result.to_dicts()
# Preview only the first ten samples to keep the output short.
samples[:10]

[{'trip_id': '6c6429de-5d72-4a6e-9843-5e71d6349cbe',
  'driver_id': 'driver_2502',
  'vehicle_id': 'vehicle_977',
  'timestamp': '2025-12-29T21:45:01.446634+00:00',
  'origin': '{"lat":29.828526,"lon":-95.245478}',
  'destination': '{"lat":29.975473,"lon":-95.045179}',
  'estimated_distance_km': 25.29,
  'weather': 'Clouds',
  'temperature_celsius': 19.7,
  'day_of_week': 0,
  'hour_of_day': 21,
  'driver_rating': 3.8,
  'vehicle_type': 'Motorcycle',
  'initial_estimated_travel_time_seconds': 2501,
  'simulated_actual_travel_time_seconds': 2605,
  'debug_traffic_factor': 1.1,
  'debug_weather_factor': 1.0,
  'debug_incident_delay_seconds': 0,
  'debug_driver_factor': 1.03},
 {'trip_id': '1e3da605-e251-43a9-b8ba-f1bbb4a2fcdf',
  'driver_id': 'driver_1647',
  'vehicle_id': 'vehicle_182',
  'timestamp': '2025-12-29T21:45:01.569208+00:00',
  'origin': '{"lat":30.0287,"lon":-95.731989}',
  'destination': '{"lat":29.764542,"lon":-95.390439}',
  'estimated_distance_km': 44.12,
  'weather': 'C

In [27]:
# Pretty-print only the first sample (nothing is printed when `samples` is empty).
# The loop form keeps `sample` bound afterwards for interactive use.
for sample in samples[:1]:
    pprint(sample)

{'day_of_week': 0,
 'debug_driver_factor': 1.03,
 'debug_incident_delay_seconds': 0,
 'debug_traffic_factor': 1.1,
 'debug_weather_factor': 1.0,
 'destination': '{"lat":29.975473,"lon":-95.045179}',
 'driver_id': 'driver_2502',
 'driver_rating': 3.8,
 'estimated_distance_km': 25.29,
 'hour_of_day': 21,
 'initial_estimated_travel_time_seconds': 2501,
 'origin': '{"lat":29.828526,"lon":-95.245478}',
 'simulated_actual_travel_time_seconds': 2605,
 'temperature_celsius': 19.7,
 'timestamp': '2025-12-29T21:45:01.446634+00:00',
 'trip_id': '6c6429de-5d72-4a6e-9843-5e71d6349cbe',
 'vehicle_id': 'vehicle_977',
 'vehicle_type': 'Motorcycle',
 'weather': 'Clouds'}


In [28]:
class CustomOrdinalEncoder:
    """
    Incremental, picklable ordinal encoder that operates on feature dictionaries.

    Each feature (dictionary key) gets its own mapping from category value to an
    integer ID. IDs are assigned in order of first appearance, starting at 0.
    State is held in two plain dictionaries, so instances pickle without any
    custom hooks.
    """

    def __init__(self):
        # feature name -> {category value -> integer ID}
        self._feature_mappings: Dict[Hashable, Dict[Any, int]] = {}
        # feature name -> next unused integer ID for that feature
        self._feature_next_ids: Dict[Hashable, int] = {}

    @staticmethod
    def _is_hashable(value: Any) -> bool:
        """Return True if ``value`` can actually be hashed (and so used as a dict key)."""
        # isinstance(value, Hashable) is not enough: a tuple holding a list passes
        # that check but still raises TypeError when hashed.
        try:
            hash(value)
        except TypeError:
            return False
        return True

    def learn_one(self, x: Dict[Hashable, Any]):
        """
        Learn the categories present in a single sample.

        Args:
            x: A dictionary representing a single sample. Keys are feature
               names, values are the category values to register.

        Unhashable values cannot be stored as mapping keys; they are skipped
        with a printed warning and leave the encoder state untouched.
        """
        for feature_name, category_value in x.items():
            if not self._is_hashable(category_value):
                print(f"Warning: Skipping unhashable value for feature '{feature_name}': {category_value}")
                continue
            # First sighting of a feature creates its (empty) mapping and ID counter.
            feature_map = self._feature_mappings.setdefault(feature_name, {})
            next_id = self._feature_next_ids.setdefault(feature_name, 0)
            if category_value not in feature_map:
                feature_map[category_value] = next_id
                self._feature_next_ids[feature_name] = next_id + 1

    def transform_one(self, x: Dict[Hashable, Any]) -> Dict[Hashable, int]:
        """
        Replace the category values of a single sample with their integer IDs.

        Args:
            x: A dictionary representing a single sample.

        Returns:
            A new dictionary holding the integer ID for every feature of ``x``
            that the encoder has learned. Features the encoder has never seen
            are omitted from the result (they are not copied through).

        Raises:
            KeyError: If a known feature carries a category value that was not
                      seen during learning.
            TypeError: If a known feature carries an unhashable value (the
                       mapping lookup itself fails).
        """
        transformed_sample: Dict[Hashable, int] = {}
        for feature_name, category_value in x.items():
            feature_map = self._feature_mappings.get(feature_name)
            if feature_map is None:
                # Unknown feature: this encoder only outputs encoded features.
                continue
            if category_value not in feature_map:
                raise KeyError(f"Unseen category '{category_value}' for feature '{feature_name}' during transform.")
            transformed_sample[feature_name] = feature_map[category_value]
        return transformed_sample

    def get_feature_mappings(self) -> Dict[Hashable, Dict[Any, int]]:
        """Return the live category-to-ID mappings for all features (not a copy)."""
        return self._feature_mappings

    def get_feature_next_ids(self) -> Dict[Hashable, int]:
        """Return the next unused ID per feature (not a copy)."""
        return self._feature_next_ids

    def __repr__(self) -> str:
        """Summarise the encoder as its class name plus per-feature category counts."""
        num_features = len(self._feature_mappings)
        feature_details = ", ".join(
            f"{name}: {len(mapping)} categories"
            for name, mapping in self._feature_mappings.items()
        )
        # Use the real class name so the repr cannot drift from the class definition.
        return f"{type(self).__name__}(features={num_features} [{feature_details}])"
    


In [29]:
class DictImputer(base.Transformer):
    """
    Fill in selected features of a sample dictionary when they are absent or None.

    Parameters
    ----------
    on
        Names of the features to check and, if needed, fill.
    fill_value
        Replacement written for every checked feature that is missing or None.
    """

    def __init__(self, on: list, fill_value):
        self.on = on
        self.fill_value = fill_value

    def transform_one(self, x: dict):
        """Return a copy of ``x`` with the configured features imputed; ``x`` is not modified."""
        # A key that is absent and a key whose value is None are treated alike.
        replacements = {
            name: self.fill_value
            for name in self.on
            if x.get(name) is None
        }
        imputed = x.copy()
        imputed.update(replacements)
        return imputed

In [30]:
def extract_device_info(x):
    """Return the ``os`` and ``browser`` entries of ``x['device_info']``.

    ``device_info`` may already be a mapping, or a JSON string (as when the
    row comes from Delta Lake), in which case it is decoded with ``orjson`` first.
    A missing ``os`` or ``browser`` key raises ``KeyError``.
    """
    device = x['device_info']
    device = orjson.loads(device) if isinstance(device, str) else device
    return {field: device[field] for field in ('os', 'browser')}

def extract_timestamp_info(x):
    """Split the ISO-8601 string in ``x['timestamp']`` into calendar/clock parts.

    Accepts timestamps with a UTC offset, with or without fractional seconds
    (``2025-12-29T21:45:01.446634+00:00`` and ``2025-12-29T21:45:01+00:00``).

    Returns:
        Dict with integer ``year``, ``month``, ``day``, ``hour``, ``minute``
        and ``second`` taken from the parsed value as written (no timezone
        conversion is applied).

    Raises:
        ValueError: If the string matches neither supported layout.
    """
    raw = x['timestamp']
    try:
        parsed = dt.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S.%f%z")
    except ValueError:
        # ISO serialisers such as datetime.isoformat() drop the fractional part
        # when the microsecond field is 0, so retry without "%f".
        parsed = dt.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S%z")
    return {
        'year': parsed.year,
        'month': parsed.month,
        'day': parsed.day,
        'hour': parsed.hour,
        'minute': parsed.minute,
        'second': parsed.second
    }

def extract_coordinates(x):
    """Return the ``lat`` and ``lon`` entries of ``x['location']``.

    ``location`` may already be a mapping, or a JSON string (as when the row
    comes from Delta Lake), in which case it is decoded with ``orjson`` first.
    A missing ``lat`` or ``lon`` key raises ``KeyError``.
    """
    place = x['location']
    place = orjson.loads(place) if isinstance(place, str) else place
    return {axis: place[axis] for axis in ('lat', 'lon')}

In [31]:
def process_sample(x, encoders, project_name):
    """Turn one raw sample into model-ready features for River incremental learning.

    The stateful encoders in ``encoders`` are updated in place with this sample
    (``learn_one``) and then used to transform it (``transform_one``).

    Args:
        x: Raw sample dictionary (one row of the project's table).
        encoders: Dict of stateful encoders keyed by name. Required keys:
            ``"ordinal_encoder"`` for "Transaction Fraud Detection" and
            "Estimated Time of Arrival"; ``"standard_scaler"`` and
            ``"feature_hasher"`` for "E-Commerce Customer Interactions";
            ``"standard_scaler"`` and ``"one_hot_encoder"`` for "Sales Forecasting".
        project_name: One of the four project display names listed above.

    Returns:
        Tuple ``(features, used_encoders)``: the processed feature dict and a
        dict containing the (now updated) encoders that were used.

    Raises:
        ValueError: If ``project_name`` is not one of the supported projects
            (previously the function silently returned ``None``).
    """
    if project_name == "Transaction Fraud Detection":
        # Numeric / boolean features are passed through unchanged.
        pipe1 = compose.Select(
            "amount",
            "account_age_days",
            "cvv_provided",
            "billing_address_match"
        )
        pipe1.learn_one(x)
        x1 = pipe1.transform_one(x)
        # Plain categorical columns, encoded below.
        pipe2 = compose.Select(
            "currency",
            "merchant_id",
            "payment_method",
            "product_category",
            "transaction_type",
        )
        pipe2.learn_one(x)
        x_pipe_2 = pipe2.transform_one(x)
        # device_info -> {'os', 'browser'}
        pipe3a = compose.Select("device_info")
        pipe3a.learn_one(x)
        x_pipe_3 = pipe3a.transform_one(x)
        pipe3b = compose.FuncTransformer(extract_device_info)
        pipe3b.learn_one(x_pipe_3)
        x_pipe_3 = pipe3b.transform_one(x_pipe_3)
        # timestamp -> {'year', 'month', 'day', 'hour', 'minute', 'second'}
        pipe4a = compose.Select("timestamp")
        pipe4a.learn_one(x)
        x_pipe_4 = pipe4a.transform_one(x)
        pipe4b = compose.FuncTransformer(extract_timestamp_info)
        pipe4b.learn_one(x_pipe_4)
        x_pipe_4 = pipe4b.transform_one(x_pipe_4)
        # Learn first, then transform, so the current sample's categories are always known.
        x_to_encode = x_pipe_2 | x_pipe_3 | x_pipe_4
        encoders["ordinal_encoder"].learn_one(x_to_encode)
        x2 = encoders["ordinal_encoder"].transform_one(x_to_encode)
        return x1 | x2, {"ordinal_encoder": encoders["ordinal_encoder"]}
    elif project_name == "Estimated Time of Arrival":
        # Numeric features are passed through unchanged.
        pipe1 = compose.Select(
            'estimated_distance_km',
            'temperature_celsius',
            'hour_of_day',
            'driver_rating',
            'initial_estimated_travel_time_seconds',
            'debug_traffic_factor',
            'debug_weather_factor',
            'debug_incident_delay_seconds',
            'debug_driver_factor'
        )
        pipe1.learn_one(x)
        x1 = pipe1.transform_one(x)
        # Plain categorical columns, encoded below.
        pipe2 = compose.Select(
            'driver_id',
            'vehicle_id',
            'weather',
            'vehicle_type'
        )
        pipe2.learn_one(x)
        x_pipe_2 = pipe2.transform_one(x)
        # timestamp -> {'year', 'month', 'day', 'hour', 'minute', 'second'}
        pipe3a = compose.Select(
            "timestamp",
        )
        pipe3a.learn_one(x)
        x_pipe_3 = pipe3a.transform_one(x)
        pipe3b = compose.FuncTransformer(
            extract_timestamp_info,
        )
        pipe3b.learn_one(x_pipe_3)
        x_pipe_3 = pipe3b.transform_one(x_pipe_3)
        # Learn first, then transform, so the current sample's categories are always known.
        x_to_encode = x_pipe_2 | x_pipe_3
        encoders["ordinal_encoder"].learn_one(x_to_encode)
        x2 = encoders["ordinal_encoder"].transform_one(x_to_encode)
        return x1 | x2, {
            "ordinal_encoder": encoders["ordinal_encoder"]
        }
    elif project_name == "E-Commerce Customer Interactions":
        pipe1 = compose.Select(
            'price',
            'quantity',
            'session_event_sequence',
            'time_on_page_seconds'
        )
        pipe1.learn_one(x)
        x1 = pipe1.transform_one(x)
        pipe2 = compose.Select(
            'event_type',
            'product_category',
            'product_id',
            'referrer_url',
        )
        pipe2.learn_one(x)
        x_pipe_2 = pipe2.transform_one(x)
        # device_info -> {'os', 'browser'}
        pipe3a = compose.Select(
            "device_info"
        )
        pipe3a.learn_one(x)
        x_pipe_3 = pipe3a.transform_one(x)
        pipe3b = compose.FuncTransformer(
            extract_device_info,
        )
        pipe3b.learn_one(x_pipe_3)
        x_pipe_3 = pipe3b.transform_one(x_pipe_3)
        # timestamp -> {'year', 'month', 'day', 'hour', 'minute', 'second'}
        pipe4a = compose.Select(
            "timestamp",
        )
        pipe4a.learn_one(x)
        x_pipe_4 = pipe4a.transform_one(x)
        pipe4b = compose.FuncTransformer(
            extract_timestamp_info,
        )
        pipe4b.learn_one(x_pipe_4)
        x_pipe_4 = pipe4b.transform_one(x_pipe_4)
        # location -> {'lat', 'lon'}
        pipe5a = compose.Select(
            "location",
        )
        pipe5a.learn_one(x)
        x_pipe_5 = pipe5a.transform_one(x)
        pipe5b = compose.FuncTransformer(
            extract_coordinates,
        )
        pipe5b.learn_one(x_pipe_5)
        x_pipe_5 = pipe5b.transform_one(x_pipe_5)
        x_to_prep = x1 | x_pipe_2 | x_pipe_3 | x_pipe_4 | x_pipe_5
        # Replace every None value with False before scaling / hashing.
        x_to_prep = DictImputer(
            fill_value = False, 
            on = list(x_to_prep.keys())).transform_one(
                x_to_prep)
        # NOTE(review): 'lat' and 'lon' are extracted above but appear in neither
        # feature list below, so they do not reach the output — confirm intended.
        numerical_features = [
            'price',
            'session_event_sequence',
            'time_on_page_seconds',
            'quantity'
        ]
        categorical_features = [
            'event_type',
            'product_category',
            'product_id',
            'referrer_url',
            'os',
            'browser',
            'year',
            'month',
            'day',
            'hour',
            'minute',
            'second'
        ]
        num_pipe = compose.Select(*numerical_features)
        num_pipe.learn_one(x_to_prep)
        x_num = num_pipe.transform_one(x_to_prep)
        cat_pipe = compose.Select(*categorical_features)
        cat_pipe.learn_one(x_to_prep)
        x_cat = cat_pipe.transform_one(x_to_prep)
        # Update each encoder with this sample, then transform the same sample.
        encoders["standard_scaler"].learn_one(x_num)
        x_scaled = encoders["standard_scaler"].transform_one(x_num)
        encoders["feature_hasher"].learn_one(x_cat)
        x_hashed = encoders["feature_hasher"].transform_one(x_cat)
        return x_scaled | x_hashed, {
            "standard_scaler": encoders["standard_scaler"], 
            "feature_hasher": encoders["feature_hasher"]
        }
    elif project_name == "Sales Forecasting":
        pipe1 = compose.Select(
            'concept_drift_stage',
            'day_of_week',
            'is_holiday',
            'is_promotion_active',
            'month',
            #'total_sales_amount',
            'unit_price'
        )
        pipe1.learn_one(x)
        x1 = pipe1.transform_one(x)
        # timestamp -> {'year', 'month', 'day', 'hour', 'minute', 'second'}
        pipe2a = compose.Select(
            "timestamp",
        )
        pipe2a.learn_one(x)
        x_pipe_2 = pipe2a.transform_one(x)
        pipe2b = compose.FuncTransformer(
            extract_timestamp_info,
        )
        pipe2b.learn_one(x_pipe_2)
        x2 = pipe2b.transform_one(x_pipe_2)
        pipe3a = compose.Select(
            'product_id',
            'promotion_id',
            'store_id'
        )
        pipe3a.learn_one(x)
        x3 = pipe3a.transform_one(x)
        # 'month' exists in both x1 and x2; with dict union the timestamp-derived
        # value from x2 is the one that is kept.
        x_to_process = x1 | x2 | x3
        numerical_features = [
            'unit_price',
            #'total_sales_amount',
        ]
        categorical_features = [
            'is_promotion_active',
            'is_holiday',
            'day_of_week',
            'concept_drift_stage',
            'year',
            'month',
            'day',
            #'hour',
            #'minute',
            #'second',
            'product_id',
            'promotion_id',
            'store_id',
        ]
        pipe_num = compose.Select(*numerical_features)
        pipe_num.learn_one(x_to_process)
        x_num = pipe_num.transform_one(x_to_process)
        pipe_cat = compose.Select(*categorical_features)
        pipe_cat.learn_one(x_to_process)
        x_cat = pipe_cat.transform_one(x_to_process)
        # Update each encoder with this sample, then transform the same sample.
        encoders["standard_scaler"].learn_one(x_num)
        x_num = encoders["standard_scaler"].transform_one(x_num)
        encoders["one_hot_encoder"].learn_one(x_cat)
        x_cat = encoders["one_hot_encoder"].transform_one(x_cat)
        return x_num | x_cat, {
            "one_hot_encoder": encoders["one_hot_encoder"],
            "standard_scaler": encoders["standard_scaler"],
        }
    else:
        # Same failure mode as _create_default_model / _create_default_encoders,
        # instead of returning None and failing later when the caller unpacks it.
        raise ValueError(f"Unknown project: {project_name}")

In [32]:
def _create_default_model(project_name):
    """Build a fresh, untrained River model for the given project.

    Project -> model:
      * "Transaction Fraud Detection"      -> forest.ARFClassifier
      * "Estimated Time of Arrival"        -> forest.ARFRegressor
      * "E-Commerce Customer Interactions" -> cluster.DBSTREAM
      * "Sales Forecasting"                -> time_series.SNARIMAX (PARegressor inside)

    Every hyper-parameter is spelled out explicitly. According to the notebook
    author's notes most values restate the defaults listed in the River
    documentation (https://riverml.xyz/latest/) — TODO confirm against the
    installed River version.

    Raises:
        ValueError: If ``project_name`` is not one of the four projects above.
    """
    if project_name == "Transaction Fraud Detection":
        # Adaptive Random Forest classifier with ADWIN drift / warning detectors.
        # ROCAUC is used as the internal metric because fraud data is imbalanced
        # (see https://riverml.xyz/latest/examples/imbalanced-learning/).
        # Reference: https://riverml.xyz/latest/api/forest/ARFClassifier/
        classifier_params = dict(
            # ensemble
            n_models=10,
            max_features="sqrt",
            lambda_value=6,
            metric=metrics.ROCAUC(),
            disable_weighted_vote=False,  # keep weighted voting enabled
            # drift handling
            drift_detector=drift.ADWIN(delta=0.002),
            warning_detector=drift.ADWIN(delta=0.01),
            # tree growth
            grace_period=50,
            max_depth=None,
            split_criterion="info_gain",
            delta=0.01,
            tau=0.05,
            leaf_prediction="nba",
            nb_threshold=0,
            nominal_attributes=None,
            binary_split=False,
            min_branch_fraction=0.01,
            max_share_to_split=0.99,
            # memory management
            max_size=100.0,
            memory_estimate_period=2000000,
            stop_mem_management=False,
            remove_poor_attrs=False,
            merit_preprune=True,
            # reproducibility
            seed=42,
        )
        return forest.ARFClassifier(**classifier_params)

    if project_name == "Estimated Time of Arrival":
        # Adaptive Random Forest regressor with ADWIN drift / warning detectors.
        # MAE is used as the internal metric (the author's choice for ETA).
        # Reference: https://riverml.xyz/latest/api/forest/ARFRegressor/
        regressor_params = dict(
            # ensemble
            n_models=10,
            max_features="sqrt",
            aggregation_method="median",
            lambda_value=6,
            metric=metrics.MAE(),
            disable_weighted_vote=True,
            # drift handling
            drift_detector=drift.ADWIN(delta=0.002),
            warning_detector=drift.ADWIN(delta=0.01),
            # tree growth
            grace_period=50,
            max_depth=None,
            delta=0.01,
            tau=0.05,
            leaf_prediction="adaptive",
            leaf_model=None,
            model_selector_decay=0.95,
            min_samples_split=5,
            binary_split=False,
            # memory management
            max_size=500.0,
            memory_estimate_period=2000000,
            nominal_attributes=None,
            # reproducibility
            seed=42,
        )
        return forest.ARFRegressor(**regressor_params)

    if project_name == "E-Commerce Customer Interactions":
        # Density-based stream clustering. The values follow the example
        # configuration shown on the River DBSTREAM page (per the author's notes).
        # Reference: https://riverml.xyz/latest/api/cluster/DBSTREAM/
        return cluster.DBSTREAM(
            clustering_threshold=1.5,   # micro-cluster radius
            fading_factor=0.05,         # weight given to historical data
            cleanup_interval=4,         # time between cleanup passes
            intersection_factor=0.5,    # overlap ratio for cluster connectivity
            minimum_weight=1.0,         # threshold for a non-noisy cluster
        )

    if project_name == "Sales Forecasting":
        # SNARIMAX with a weekly season (m=7), driven by a passive-aggressive regressor.
        # Reference: https://riverml.xyz/latest/api/time-series/SNARIMAX/
        # Reference: https://riverml.xyz/latest/api/linear-model/PARegressor/
        inner_regressor = linear_model.PARegressor(
            C=1.0,
            mode=1,
            eps=0.1,
            learn_intercept=True,
        )
        return time_series.SNARIMAX(
            p=7,    # autoregressive order: past 7 target values
            d=1,    # first-order differencing
            q=2,    # moving-average order: past 2 error terms
            m=7,    # season length
            sp=1,   # seasonal autoregressive order
            sd=1,   # seasonal differencing order
            sq=1,   # seasonal moving-average order
            regressor=inner_regressor,
        )

    raise ValueError(f"Unknown project: {project_name}")

In [33]:
def _create_default_encoders(project_name):
    """Return a dict of newly constructed encoders for the given project.

    The returned keys are the encoder names that ``process_sample`` looks up
    for the same project.

    Raises:
        ValueError: If ``project_name`` is not a known project.
    """
    if project_name == "E-Commerce Customer Interactions":
        return {
            "standard_scaler": preprocessing.StandardScaler(),
            "feature_hasher": preprocessing.FeatureHasher(),
        }
    if project_name == "Sales Forecasting":
        return {
            "one_hot_encoder": preprocessing.OneHotEncoder(),
            "standard_scaler": preprocessing.StandardScaler(),
        }
    # Both remaining projects share the same single ordinal encoder setup.
    ordinal_projects = ("Transaction Fraud Detection", "Estimated Time of Arrival")
    if project_name in ordinal_projects:
        return {"ordinal_encoder": CustomOrdinalEncoder()}
    raise ValueError(f"Unknown project: {project_name}")

In [34]:
# =============================================================================
# RIVER ML METRICS CONFIGURATION FOR ESTIMATED TIME OF ARRIVAL
# =============================================================================
# Research-based optimal configuration for real-time ETA training.
# Sources:
#   - River ML Documentation: https://riverml.xyz/dev/api/metrics/
#   - ETA Prediction Best Practices: https://peerj.com/articles/cs-3259/
#   - Travel Time Prediction Review: https://pmc.ncbi.nlm.nih.gov/articles/PMC8444094/
#
# Key insights from literature:
#   - MAE + RMSE + MAPE is the most common metric combination (15-23% of studies)
#   - MAE is the primary metric (used in 33% of ETA studies)
#   - Rolling metrics help detect concept drift (traffic pattern changes)
#
# IMPORTANT: Unlike classification metrics, River regression metrics have
#            NO configurable parameters (no cm, pos_val, beta, n_thresholds, etc.)
# =============================================================================

from river import metrics, utils

# -----------------------------------------------------------------------------
# REGRESSION METRICS (use predict_one - continuous values)
# -----------------------------------------------------------------------------
# These metrics compare predicted travel time vs actual travel time (seconds)
# All metrics have NO configurable parameters - use default constructors
# -----------------------------------------------------------------------------
regression_metric_classes = {
    # --- Primary metrics ------------------------------------------------------
    # Mean Absolute Error: (1/n) * Σ|y_true - y_pred|
    # Same units as the target (seconds), which makes it the easiest to read.
    "MAE": metrics.MAE,

    # Root Mean Squared Error: sqrt((1/n) * Σ(y_true - y_pred)²)
    # Squaring weights large errors more heavily, in BOTH directions
    # (over- and under-estimates alike).
    "RMSE": metrics.RMSE,

    # Mean Absolute Percentage Error: (100/n) * Σ|y_true - y_pred| / |y_true|
    # Scale-independent, so trips of different lengths can be compared.
    # Warning: mathematically undefined when y_true = 0.
    "MAPE": metrics.MAPE,

    # --- Secondary metrics ----------------------------------------------------
    # Coefficient of determination: 1 - (SS_res / SS_tot)
    # Proportion of variance explained; can be negative (worse than baseline).
    "R2": metrics.R2,

    # Symmetric MAPE: (100/n) * Σ|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)
    # Bounded to 0-200%, which makes it more robust than MAPE.
    "SMAPE": metrics.SMAPE,

    # Mean Squared Error: (1/n) * Σ(y_true - y_pred)²
    # Squared units; mostly useful as an optimization target.
    "MSE": metrics.MSE,

    # Root Mean Squared Logarithmic Error:
    #   sqrt((1/n) * Σ(log(y_pred+1) - log(y_true+1))²)
    # Log scale, so less sensitive to large errors than RMSE.
    "RMSLE": metrics.RMSLE,
}

# Every regression metric above is built with its default constructor, so each
# name maps to an empty kwargs dict. Deriving the dict from the class mapping
# keeps the two from drifting apart when a metric is added or removed.
# (All are "lower is better" except R2, as the printout below confirms.)
regression_metric_args = {
    metric_name: {} for metric_name in regression_metric_classes
}

# -----------------------------------------------------------------------------
# ROLLING METRICS (for concept drift detection)
# -----------------------------------------------------------------------------
# utils.Rolling wraps a metric so it is computed over a sliding window of the
# most recent samples. Use case: spotting shifts in traffic patterns
# (rush hour, weather, incidents).
# -----------------------------------------------------------------------------
rolling_metric_classes = {
    "RollingMAE": utils.Rolling,
}

rolling_metric_args = {
    # RollingMAE: windowed MAE for drift detection.
    # window_size=1000 is the number of most recent samples kept in the window:
    #   - smaller windows (e.g. 500)  -> more sensitive to drift
    #   - larger windows (e.g. 2000)  -> more stable but slower to react
    # NOTE(review): the original note assumed ~10 samples/second (so ~100 s of
    # data per window) - confirm against the real ingestion rate.
    "RollingMAE": {"obj": metrics.MAE(), "window_size": 1000},
}

# =============================================================================
# INSTANTIATE ALL METRICS
# =============================================================================
regression_metrics = {
    name: metric_cls(**regression_metric_args[name])
    for name, metric_cls in regression_metric_classes.items()
}

rolling_metrics = {
    name: wrapper_cls(**rolling_metric_args[name])
    for name, wrapper_cls in rolling_metric_classes.items()
}

# =============================================================================
# BEST MODEL SELECTION CRITERION
# =============================================================================
# For ETA: Minimize MAE (lower is better)
# Rationale: MAE is most interpretable and used in 33% of ETA studies
# Alternative: RMSE if large errors should be penalized more
# This should be added to BEST_METRIC_CRITERIA in functions.py:
# "Estimated Time of Arrival": ("MAE", "minimize")
# =============================================================================

print("=" * 70)
print("REGRESSION METRICS FOR ESTIMATED TIME OF ARRIVAL")
print("=" * 70)
print("\nMetric Classes:")
for name, metric in regression_metrics.items():
    # bigger_is_better comes from the metric instance itself, so the direction
    # printed here is never hand-maintained.
    optimal = "higher" if metric.bigger_is_better else "lower"
    print(f"  {name:<10}: {type(metric).__name__}() - {optimal} is better")

print("\nMetric Arguments (all empty - no configurable params):")
for name, args in regression_metric_args.items():
    print(f"  {name:<10}: {args}")

print("\nRolling Metrics:")
for name, args in rolling_metric_args.items():
    # Derive both labels from the configuration instead of hardcoding
    # "Rolling(MAE(), ...)", so other wrapped metrics are reported correctly.
    wrapper_label = rolling_metric_classes[name].__name__
    wrapped_label = type(args["obj"]).__name__
    print(
        f"  {name:<10}: {wrapper_label}({wrapped_label}(), "
        f"window_size={args['window_size']})"
    )

print("\n" + "=" * 70)
print("BEST MODEL SELECTION: Minimize MAE")
print("=" * 70)

REGRESSION METRICS FOR ESTIMATED TIME OF ARRIVAL

Metric Classes:
  MAE       : MAE() - lower is better
  RMSE      : RMSE() - lower is better
  MAPE      : MAPE() - lower is better
  R2        : R2() - higher is better
  SMAPE     : SMAPE() - lower is better
  MSE       : MSE() - lower is better
  RMSLE     : RMSLE() - lower is better

Metric Arguments (all empty - no configurable params):
  MAE       : {}
  RMSE      : {}
  MAPE      : {}
  R2        : {}
  SMAPE     : {}
  MSE       : {}
  RMSLE     : {}

Rolling Metrics:
  RollingMAE: Rolling(MAE(), window_size=1000)

BEST MODEL SELECTION: Minimize MAE


In [36]:
from river import metrics, utils
import datetime as dt

# =============================================================================
# INVESTIGATION FINDINGS: dir(metrics)
# =============================================================================
# Total items: 89 (49 classes, 27 submodules, 0 functions, 0 constants)
#
# BASE CLASSES (metrics.base):
#   RegressionMetric()           - NO parameters
#   MeanMetric()                 - NO parameters  
#   ClassificationMetric(cm)     - has cm parameter
#   BinaryMetric(cm, pos_val)    - has cm, pos_val parameters
#   MultiClassMetric(cm)         - has cm parameter
#   ClusteringMetric()           - NO parameters
#   WrapperMetric()              - abstract base
#   Metrics(metrics, str_sep)    - container for multiple metrics
#
# REGRESSION METRIC HIERARCHY:
#   MAE   -> MeanMetric, RegressionMetric
#   MAPE  -> MeanMetric, RegressionMetric
#   MSE   -> MeanMetric, RegressionMetric
#   RMSE  -> MSE
#   RMSLE -> RMSE
#   SMAPE -> MeanMetric, RegressionMetric
#   R2    -> RegressionMetric (only one NOT using MeanMetric)
#
# ROLLING WRAPPERS (river.utils, NOT river.metrics):
#   utils.Rolling(obj, window_size)      - sample-based window
#   utils.TimeRolling(obj, period)       - time-based window (requires t=timestamp)
#
# MULTIOUTPUT METRICS (metrics.multioutput) - for multi-target regression:
#   MacroAverage(metric), MicroAverage(metric), PerOutput(metric), SampleAverage(metric)
# =============================================================================

# =============================================================================
# PROGRESSIVE EVALUATION: ESTIMATED TIME OF ARRIVAL
# =============================================================================
# Each sample is scored by the model BEFORE the model learns from it
# (predict_one, then learn_one); the prediction is then fed to every metric.
# Depends on `samples`, `process_sample` and `_create_default_model`, which are
# defined outside this cell.
# =============================================================================

PROJECT_NAME = "Estimated Time of Arrival"

model = _create_default_model(PROJECT_NAME)
encoders = _create_default_encoders(PROJECT_NAME)

# Cumulative regression metrics: name -> class, with matching kwargs below.
regression_metric_classes = {
    "MAE": metrics.MAE,
    "RMSE": metrics.RMSE,
    "MAPE": metrics.MAPE,
    "R2": metrics.R2,
    "SMAPE": metrics.SMAPE,
    "MSE": metrics.MSE,
    "RMSLE": metrics.RMSLE,
}

# All of the metrics above use their default constructor (no kwargs).
regression_metric_args = {
    metric_name: {} for metric_name in regression_metric_classes
}

# Sample-count windows: each metric only reflects the last `window_size` updates.
rolling_metric_classes = {
    "RollingMAE": utils.Rolling,
    "RollingRMSE": utils.Rolling,
}

rolling_metric_args = {
    "RollingMAE": {"obj": metrics.MAE(), "window_size": 1000},
    "RollingRMSE": {"obj": metrics.RMSE(), "window_size": 1000},
}

# Time-based window: update() must be given the sample time via `t=`.
time_rolling_metric_classes = {
    "TimeRollingMAE": utils.TimeRolling,
}

time_rolling_metric_args = {
    "TimeRollingMAE": {"obj": metrics.MAE(), "period": dt.timedelta(minutes=5)},
}

regression_metrics = {
    name: metric_cls(**regression_metric_args[name])
    for name, metric_cls in regression_metric_classes.items()
}

rolling_metrics = {
    name: wrapper_cls(**rolling_metric_args[name])
    for name, wrapper_cls in rolling_metric_classes.items()
}

time_rolling_metrics = {
    name: wrapper_cls(**time_rolling_metric_args[name])
    for name, wrapper_cls in time_rolling_metric_classes.items()
}

# metric name -> {"count": failed updates, "first_error": repr of first exception}
metric_update_failures = {}


def _parse_timestamp(raw):
    """Parse an ISO-8601 timestamp string into a ``datetime``.

    ``fromisoformat`` is tried first because it also accepts values that have
    no fractional-seconds part, which the fixed ``.%f`` pattern rejects. The
    original ``strptime`` pattern is kept as a fallback so every value that
    parsed before still parses.

    Raises:
        ValueError: If neither parser accepts ``raw``.
    """
    try:
        return dt.datetime.fromisoformat(raw)
    except ValueError:
        return dt.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S.%f%z")


def _update_metric_group(group, y_true, y_pred, **update_kwargs):
    """Update every metric in ``group`` with one (truth, prediction) pair.

    A metric that raises is skipped for this sample (the run stays
    best-effort), but the failure is counted in ``metric_update_failures``
    instead of being discarded. Only ``Exception`` is caught, so
    KeyboardInterrupt / SystemExit still stop the loop.

    Args:
        group: Dict of metric name -> metric object exposing ``update``.
        y_true: Observed target value.
        y_pred: Model prediction for the same sample.
        **update_kwargs: Extra keyword arguments forwarded to ``update``
            (e.g. ``t=`` for time-based windows).
    """
    for metric_name, metric in group.items():
        try:
            metric.update(y_true, y_pred, **update_kwargs)
        except Exception as exc:
            failure = metric_update_failures.setdefault(
                metric_name, {"count": 0, "first_error": repr(exc)}
            )
            failure["count"] += 1


for sample in tqdm.tqdm(samples):
    x, encoders = process_sample(sample, encoders, PROJECT_NAME)
    y = sample["simulated_actual_travel_time_seconds"]
    timestamp = _parse_timestamp(sample["timestamp"])

    # Predict first, then learn, so the model is never scored on a sample it
    # has already seen.
    prediction = model.predict_one(x)
    model.learn_one(x, y)

    if prediction is None:
        continue

    _update_metric_group(regression_metrics, y, prediction)
    _update_metric_group(rolling_metrics, y, prediction)
    _update_metric_group(time_rolling_metrics, y, prediction, t=timestamp)

# Surface anything that used to be swallowed silently.
if metric_update_failures:
    print("Skipped metric updates (metric: count, first error):")
    for failed_name, failure in metric_update_failures.items():
        print(f"  {failed_name}: {failure['count']} ({failure['first_error']})")

# Final value of every metric, in the order: cumulative, rolling, time-rolling.
metrics_to_log = {
    name: metric.get()
    for group in (regression_metrics, rolling_metrics, time_rolling_metrics)
    for name, metric in group.items()
}

metrics_to_log

100%|██████████| 1000/1000 [00:03<00:00, 264.90it/s]


{'MAE': 3209314.4296047348,
 'RMSE': 101458798.23315397,
 'MAPE': 70854.1680197065,
 'R2': -2002101934.0681758,
 'SMAPE': 23.803354530158078,
 'MSE': 1.0293887738915848e+16,
 'RMSLE': 0.5928288151108694,
 'RollingMAE': 3209314.4296047348,
 'RollingRMSE': 101458798.23315397,
 'TimeRollingMAE': 3209314.4296047348}