In [1]:
import pickle
import os
import sys
from typing import Any, Dict, Hashable
from river import (
    compose, 
    metrics, 
    drift,
    forest
)
from kafka import KafkaConsumer
import json
import pandas as pd
import datetime as dt

In [7]:
class CustomOrdinalEncoder:
    """
    An incremental ordinal encoder that is picklable and processes dictionaries.

    Every feature (dictionary key) owns a separate mapping from category value
    to integer ID. IDs start at 0 for each feature and are handed out in the
    order in which new category values are first passed to ``learn_one``.
    """

    def __init__(self):
        # feature name -> {category value -> integer ID} for that feature.
        self._feature_mappings: Dict[Hashable, Dict[Any, int]] = {}
        # feature name -> next unused integer ID for that feature.
        self._feature_next_ids: Dict[Hashable, int] = {}

    @staticmethod
    def _is_hashable(value: Any) -> bool:
        """Return True when ``value`` can really be used as a dictionary key."""
        # isinstance(value, Hashable) only checks that a __hash__ attribute
        # exists, so it accepts e.g. a tuple that contains a list. Calling
        # hash() is the only reliable test.
        try:
            hash(value)
        except TypeError:
            return False
        return True

    def learn_one(self, x: Dict[Hashable, Any]):
        """
        Learns categories from a single sample dictionary.

        Each value that has not been seen before for its feature receives the
        next free integer ID of that feature. Values that cannot be hashed are
        reported on stdout and skipped; they never create a mapping entry.

        Args:
            x: A dictionary representing a single sample.
               Keys are feature names, values are category values.
        """
        for feature_name, category_value in x.items():
            if not self._is_hashable(category_value):
                print(f"Warning: Skipping unhashable value for feature '{feature_name}': {category_value}")
                continue  # Nothing is learned for this feature from this sample

            # First sighting of a feature creates its (empty) mapping and counter.
            feature_map = self._feature_mappings.setdefault(feature_name, {})
            self._feature_next_ids.setdefault(feature_name, 0)

            if category_value not in feature_map:
                feature_map[category_value] = self._feature_next_ids[feature_name]
                self._feature_next_ids[feature_name] += 1

    def transform_one(self, x: Dict[Hashable, Any]) -> Dict[Hashable, int]:
        """
        Transforms categorical features in a single sample dictionary into integer IDs.

        Args:
            x: A dictionary representing a single sample.
               Keys are feature names, values are category values.
        Returns:
            A new dictionary holding the integer ID of every feature the
            encoder has learned. Features the encoder has never learned are
            left out of the result.
        Raises:
            KeyError: If a feature is known but the given category value was
                      never learned for it. Unhashable values fall in this
                      case too, because ``learn_one`` never stores them.
        """
        transformed_sample: Dict[Hashable, int] = {}
        for feature_name, category_value in x.items():
            feature_map = self._feature_mappings.get(feature_name)
            if feature_map is None:
                # Unknown feature: this encoder only outputs encoded features.
                continue

            # The hashability test must come first: a membership test on an
            # unhashable value would raise TypeError instead of KeyError.
            if not self._is_hashable(category_value) or category_value not in feature_map:
                raise KeyError(f"Unseen category '{category_value}' for feature '{feature_name}' during transform.")

            transformed_sample[feature_name] = feature_map[category_value]
        return transformed_sample

    def get_feature_mappings(self) -> Dict[Hashable, Dict[Any, int]]:
        """Returns the current mappings for all features (the live internal dict, not a copy)."""
        return self._feature_mappings

    def get_feature_next_ids(self) -> Dict[Hashable, int]:
        """Returns the next available IDs for all features (the live internal dict, not a copy)."""
        return self._feature_next_ids

    def __repr__(self) -> str:
        """String representation: class name plus the category count per feature."""
        num_features = len(self._feature_mappings)
        feature_details = ", ".join(
            f"{name}: {len(mapping)} categories"
            for name, mapping in self._feature_mappings.items()
        )
        # Use the real class name so the repr cannot drift from the class definition.
        return f"{type(self).__name__}(features={num_features} [{feature_details}])"

def extract_timestamp_info(x):
    """
    Split the ISO-8601 ``timestamp`` of a sample into calendar components.

    The timestamp must carry a UTC offset. Fractional seconds are optional:
    ``datetime.isoformat()`` leaves them out when the microsecond part is zero,
    so both ``2025-05-03T01:08:00.000000+00:00`` and
    ``2025-05-03T01:08:00+00:00`` are accepted.

    Args:
        x: A dictionary with a ``'timestamp'`` string entry.
    Returns:
        A dictionary with the integer keys ``year``, ``month``, ``day``,
        ``hour``, ``minute``, ``second`` and ``day_of_week`` (0 = Monday).
        The values are read in the timestamp's own offset; no conversion
        to another time zone takes place.
    Raises:
        KeyError: If ``x`` has no ``'timestamp'`` entry.
        ValueError: If the string matches neither accepted layout.
    """
    raw_timestamp = x['timestamp']
    try:
        parsed = dt.datetime.strptime(raw_timestamp, "%Y-%m-%dT%H:%M:%S.%f%z")
    except ValueError:
        # Same layout without the ".ffffff" part.
        parsed = dt.datetime.strptime(raw_timestamp, "%Y-%m-%dT%H:%M:%S%z")
    return {
        'year': parsed.year,
        'month': parsed.month,
        'day': parsed.day,
        'hour': parsed.hour,
        'minute': parsed.minute,
        'second': parsed.second,
        'day_of_week': parsed.weekday()
    }

In [15]:
# Hand-written example of one raw trip record, used to try out the feature extraction.
x = {
    # --- Identifiers ---
    "trip_id": "a1b2c3d4-e5f6-7890-1234-567890abcdef",  # example UUID
    "driver_id": "driver_3456",
    "vehicle_id": "vehicle_789",
    # --- Trip context ---
    "timestamp": "2025-05-03T01:08:00.000000+00:00",  # ISO format, UTC
    "origin": {"lat": -23.551234, "lon": -46.633456},
    "destination": {"lat": -23.587654, "lon": -46.698765},
    "estimated_distance_km": 8.52,  # calculated Haversine distance
    "weather": "Clouds",  # simulated weather condition
    "temperature_celsius": 22.5,  # simulated temperature
    "day_of_week": 5,  # in UTC, 0 = Monday, 5 = Saturday
    "hour_of_day": 1,  # in UTC, 0-23
    "driver_rating": 4.7,  # simulated driver rating
    "vehicle_type": "Car",  # randomly selected vehicle type
    # --- Feature available for prediction ---
    # Initial estimate based on distance / average speed plus noise.
    "initial_estimated_travel_time_seconds": 805,
    # --- Ground truth (simulated) ---
    "simulated_actual_travel_time_seconds": 1595,
    # --- Debugging info (optional) ---
    "debug_traffic_factor": 1.08,  # simulated traffic multiplier (weekend example)
    "debug_weather_factor": 1.00,  # simulated weather multiplier (no bad weather)
    "debug_incident_delay_seconds": 750,  # simulated delay from a random incident
    "debug_driver_factor": 0.99,  # simulated factor based on driver rating
}

In [10]:
def process_sample(x):
    """
    Build one flat feature dictionary from a raw trip record.

    Three groups are extracted one after the other and then merged:
    numeric fields, categorical fields (copied as raw values, no encoding is
    applied here) and calendar components derived from ``timestamp`` by
    ``extract_timestamp_info``.

    The record's own ``day_of_week`` entry is not selected; the
    ``day_of_week`` in the result is the one computed from the timestamp.

    Args:
        x: A dictionary representing a single raw sample.
    Returns:
        The merged feature dictionary. On a key clash the later group wins
        (numeric, then categorical, then timestamp-derived).
    """
    # NOTE(review): the debug_* fields look like simulator internals --
    # confirm they are really available at prediction time.
    numeric_keys = (
        'estimated_distance_km',
        'temperature_celsius',
        'hour_of_day',
        'driver_rating',
        'initial_estimated_travel_time_seconds',
        'debug_traffic_factor',
        'debug_weather_factor',
        'debug_incident_delay_seconds',
        'debug_driver_factor',
    )
    categorical_keys = (
        'driver_id',
        'vehicle_id',
        'weather',
        'vehicle_type',
    )

    # New transformer objects are created on every call; each one is given
    # the sample through learn_one before it is asked to transform it.
    numeric_selector = compose.Select(*numeric_keys)
    numeric_selector.learn_one(x)
    numeric_features = numeric_selector.transform_one(x)

    categorical_selector = compose.Select(*categorical_keys)
    categorical_selector.learn_one(x)
    categorical_features = categorical_selector.transform_one(x)

    # Timestamp branch: isolate the raw field first, then expand it.
    timestamp_selector = compose.Select("timestamp")
    timestamp_selector.learn_one(x)
    timestamp_only = timestamp_selector.transform_one(x)

    timestamp_expander = compose.FuncTransformer(extract_timestamp_info)
    timestamp_expander.learn_one(timestamp_only)
    time_features = timestamp_expander.transform_one(timestamp_only)

    return numeric_features | categorical_features | time_features


In [16]:
# Run the feature extraction on the sample record; the merged feature dict is the cell output.
process_sample(x)

{'debug_traffic_factor': 1.08,
 'driver_rating': 4.7,
 'debug_driver_factor': 0.99,
 'initial_estimated_travel_time_seconds': 805,
 'debug_incident_delay_seconds': 750,
 'temperature_celsius': 22.5,
 'hour_of_day': 1,
 'debug_weather_factor': 1.0,
 'estimated_distance_km': 8.52,
 'weather': 'Clouds',
 'vehicle_type': 'Car',
 'vehicle_id': 'vehicle_789',
 'driver_id': 'driver_3456',
 'year': 2025,
 'month': 5,
 'day': 3,
 'hour': 1,
 'minute': 8,
 'second': 0,
 'day_of_week': 5}