# Rewriting the processing pipeline

In [1]:
from pprint import pprint

# Raw transaction record used as the running example in the cells below.
# `device_info` and `location` are nested dicts; `timestamp` is an ISO-8601
# string carrying microseconds and a UTC offset.
x = {
    'transaction_id': 'ffd3d366-06e4-4ddb-894e-03876e893079', 
    'user_id': '61fa227e-d309-4ed0-b513-3cffa5526463', 
    'timestamp': '2025-04-17T19:52:06.994066+00:00', 
    'amount': 302.69, 
    'currency': 'BRL', 
    'merchant_id': 'merchant_65', 
    'product_category': 'luxury_items', 
    'transaction_type': 'deposit', 
    'payment_method': 'debit_card', 
    'location': {'lat': -68.4965105, 'lon': -153.515477}, 
    'ip_address': '169.235.63.28', 
    'device_info': {'os': 'Windows', 'browser': 'Opera'}, 
    'user_agent': 'Mozilla/5.0 (Windows; U; Windows NT 11.0) AppleWebKit/532.16.3 (KHTML, like Gecko) Version/4.1 Safari/532.16.3', 
    'account_age_days': 370, 
    'cvv_provided': True, 
    'billing_address_match': True, 
    'is_fraud': 0}

# pprint sorts keys alphabetically, so the display order differs from the literal above.
pprint(x)

{'account_age_days': 370,
 'amount': 302.69,
 'billing_address_match': True,
 'currency': 'BRL',
 'cvv_provided': True,
 'device_info': {'browser': 'Opera', 'os': 'Windows'},
 'ip_address': '169.235.63.28',
 'is_fraud': 0,
 'location': {'lat': -68.4965105, 'lon': -153.515477},
 'merchant_id': 'merchant_65',
 'payment_method': 'debit_card',
 'product_category': 'luxury_items',
 'timestamp': '2025-04-17T19:52:06.994066+00:00',
 'transaction_id': 'ffd3d366-06e4-4ddb-894e-03876e893079',
 'transaction_type': 'deposit',
 'user_agent': 'Mozilla/5.0 (Windows; U; Windows NT 11.0) AppleWebKit/532.16.3 '
               '(KHTML, like Gecko) Version/4.1 Safari/532.16.3',
 'user_id': '61fa227e-d309-4ed0-b513-3cffa5526463'}


In [2]:
import pickle
import os
from typing import Any, Dict, Hashable

# Define the path for saving/loading the encoder state
ENCODER_FILE = "custom_ordinal_encoder.pkl"

class CustomPicklableOrdinalEncoder:
    """
    An incremental ordinal encoder that is picklable and processes dictionaries.

    Every feature name gets its own mapping from category value to integer ID.
    IDs are assigned in first-seen order, starting at 0 for each feature.
    All state lives in two plain dictionaries, so instances pickle without
    any custom hooks.

    Note: categories are used as dictionary keys, so values that compare equal
    and hash equal (for example ``True`` and ``1``) share a single ID.
    """

    def __init__(self):
        # feature name -> {category value -> integer ID}
        self._feature_mappings: Dict[Hashable, Dict[Any, int]] = {}
        # feature name -> next unused integer ID for that feature
        self._feature_next_ids: Dict[Hashable, int] = {}

    @staticmethod
    def _is_hashable(value: Any) -> bool:
        """Return True only if ``hash(value)`` actually succeeds.

        ``isinstance(value, Hashable)`` is not enough: a tuple that contains a
        list passes the isinstance check but still raises on ``hash``.
        """
        try:
            hash(value)
        except TypeError:
            return False
        return True

    def learn_one(self, x: Dict[Hashable, Any]):
        """
        Learns categories from a single sample dictionary.

        Args:
            x: A dictionary representing a single sample.
               Keys are feature names, values are feature values.
               Values that cannot be hashed (dicts, lists, tuples holding
               them, ...) are reported with a printed warning and skipped.
        """
        for feature_name, category_value in x.items():
            if not self._is_hashable(category_value):
                print(f"Warning: Skipping unhashable value for feature '{feature_name}': {category_value}")
                continue  # Skip this feature for learning

            # First sighting of a feature creates its mapping and its counter.
            feature_map = self._feature_mappings.setdefault(feature_name, {})
            next_id = self._feature_next_ids.setdefault(feature_name, 0)

            if category_value not in feature_map:
                # New category for this feature: hand out the next free ID.
                feature_map[category_value] = next_id
                self._feature_next_ids[feature_name] = next_id + 1

    def transform_one(self, x: Dict[Hashable, Any]) -> Dict[Hashable, int]:
        """
        Transforms categorical features in a single sample dictionary into integer IDs.

        Args:
            x: A dictionary representing a single sample.
               Keys are feature names, values are feature values.
        Returns:
            A new dictionary containing the integer IDs of the features the
            encoder has seen. Features the encoder has never learned are
            left out of the output.
        Raises:
            KeyError: If a feature is known but the given value was never
                      learned for it. This includes unhashable values, which
                      can never have been learned.
        """
        transformed_sample: Dict[Hashable, int] = {}
        for feature_name, category_value in x.items():
            feature_map = self._feature_mappings.get(feature_name)
            if feature_map is None:
                # Unknown feature: this encoder only outputs encoded features.
                continue

            # Check hashability first so an unhashable value yields the
            # documented KeyError rather than a TypeError from the lookup.
            if not self._is_hashable(category_value) or category_value not in feature_map:
                raise KeyError(f"Unseen category '{category_value}' for feature '{feature_name}' during transform.")

            transformed_sample[feature_name] = feature_map[category_value]
        return transformed_sample

    def get_feature_mappings(self) -> Dict[Hashable, Dict[Any, int]]:
        """Returns the current mappings for all features (the live internal dict, not a copy)."""
        return self._feature_mappings

    def get_feature_next_ids(self) -> Dict[Hashable, int]:
        """Returns the next available IDs for all features (the live internal dict, not a copy)."""
        return self._feature_next_ids

    def __repr__(self) -> str:
        """String representation of the encoder."""
        num_features = len(self._feature_mappings)
        feature_details = ", ".join(
            f"{name}: {len(mapping)} categories"
            for name, mapping in self._feature_mappings.items()
        )
        return f"CustomPicklableOrdinalEncoder(features={num_features} [{feature_details}])"

# --- Demo: learn -> transform -> pickle round-trip ---

# Step 1: start with an empty encoder
custom_encoder = CustomPicklableOrdinalEncoder()
print(f"Initial encoder state: {custom_encoder}")

# Step 2: toy records to learn from
sample_data = [
    dict(color='red', size='M', city='NY'),
    dict(color='blue', size='L', city='London'),
    dict(color='red', size='S', city='NY'),
    dict(color='green', size='M', city='Paris'),
    dict(color='blue', size='L', city='NY'),
]

print("\nLearning from sample dictionaries:")
for sample in sample_data:
    print(f"Learning sample: {sample}")
    custom_encoder.learn_one(sample)

print(f"\nEncoder state after learning: {custom_encoder}")
print(f"Learned feature mappings: {custom_encoder.get_feature_mappings()}")
print(f"Next IDs per feature: {custom_encoder.get_feature_next_ids()}")


# Step 3: encode the same records with the fitted encoder
print("\nTransforming sample dictionaries:")
transformed_samples = [custom_encoder.transform_one(sample) for sample in sample_data]
for sample, encoded in zip(sample_data, transformed_samples):
    print(f"Original: {sample}")
    print(f"Transformed: {encoded}")

# Example of transforming a sample with an unseen category for a known feature (will raise KeyError)
# unseen_sample = {'color': 'yellow', 'size': 'S', 'city': 'NY'}
# try:
#     print("\nAttempting to transform sample with unseen category 'yellow':")
#     custom_encoder.transform_one(unseen_sample)
# except KeyError as e:
#     print(f"Caught expected error: {e}")

# Example of transforming a sample with a new feature (the new feature is ignored)
# new_feature_sample = {'color': 'red', 'size': 'M', 'material': 'wood'}
# print("\nAttempting to transform sample with new feature 'material':")
# transformed_new_feature = custom_encoder.transform_one(new_feature_sample)
# print(f"Original: {new_feature_sample}")
# print(f"Transformed: {transformed_new_feature}") # 'material' is not in the output


# Step 4: persist the fitted encoder (failures are reported, not raised)
print(f"\nSaving encoder to '{ENCODER_FILE}'...")
try:
    with open(ENCODER_FILE, "wb") as f:
        pickle.dump(custom_encoder, f)
    print("Encoder saved successfully.")
except Exception as e:
    print(f"Error saving encoder: {e}")

# Step 5: read it back and confirm the restored copy encodes the same way.
# The pickle file was written by this very cell; do not point this at untrusted files.
loaded_encoder = None
if os.path.exists(ENCODER_FILE):
    print(f"\nLoading encoder from '{ENCODER_FILE}'...")
    try:
        with open(ENCODER_FILE, "rb") as f:
            loaded_encoder = pickle.load(f)
        print("Encoder loaded successfully.")
        print(f"Loaded encoder state: {loaded_encoder}")
        print(f"Loaded feature mappings: {loaded_encoder.get_feature_mappings()}")

        # Sanity check: the restored encoder must still transform the toy records
        print("\nTransforming data points using loaded encoder:")
        transformed_data_loaded = [loaded_encoder.transform_one(sample) for sample in sample_data]
        for sample, encoded in zip(sample_data, transformed_data_loaded):
            print(f"Original: {sample}")
            print(f"Transformed (loaded encoder): {encoded}")

    except Exception as e:
        print(f"Error loading encoder: {e}")
    # finally:
        # Clean up the saved file
        # os.remove(ENCODER_FILE)
        # print(f"\nCleaned up '{ENCODER_FILE}'.")
        # pass # Keep the file for inspection if needed



Initial encoder state: CustomPicklableOrdinalEncoder(features=0 [])

Learning from sample dictionaries:
Learning sample: {'color': 'red', 'size': 'M', 'city': 'NY'}
Learning sample: {'color': 'blue', 'size': 'L', 'city': 'London'}
Learning sample: {'color': 'red', 'size': 'S', 'city': 'NY'}
Learning sample: {'color': 'green', 'size': 'M', 'city': 'Paris'}
Learning sample: {'color': 'blue', 'size': 'L', 'city': 'NY'}

Encoder state after learning: CustomPicklableOrdinalEncoder(features=3 [color: 3 categories, size: 3 categories, city: 3 categories])
Learned feature mappings: {'color': {'red': 0, 'blue': 1, 'green': 2}, 'size': {'M': 0, 'L': 1, 'S': 2}, 'city': {'NY': 0, 'London': 1, 'Paris': 2}}
Next IDs per feature: {'color': 3, 'size': 3, 'city': 3}

Transforming sample dictionaries:
Original: {'color': 'red', 'size': 'M', 'city': 'NY'}
Transformed: {'color': 0, 'size': 0, 'city': 0}
Original: {'color': 'blue', 'size': 'L', 'city': 'London'}
Transformed: {'color': 1, 'size': 1, 'city'

In [3]:
from river import compose

# Feature group 1: fields kept as-is (no encoding). Select keeps only the listed keys.
pipe1 = compose.Select(
    "amount",
    "account_age_days",
    "cvv_provided",
    "billing_address_match"
)

# `x_pipe_1` is merged with the other feature groups in a later cell.
x_pipe_1 = pipe1.transform_one(x)
pprint(x_pipe_1)

{'account_age_days': 370,
 'amount': 302.69,
 'billing_address_match': True,
 'cvv_provided': True}


In [4]:
from river import preprocessing

# Feature group 2: string-valued categorical fields to be ordinal-encoded.
pipe2a = compose.Select(
    "currency",
    "merchant_id",
    "payment_method",
    "product_category",
    "transaction_type",
    "user_agent"
)

pipe2a.learn_one(x)
x_pipe_2 = pipe2a.transform_one(x)

# The custom encoder defined earlier in this notebook is used in place of river's OrdinalEncoder.
#pipe2b = preprocessing.OrdinalEncoder()
pipe2b = CustomPicklableOrdinalEncoder()

# Learn first, then transform: with a single sample every category receives ID 0.
pipe2b.learn_one(x_pipe_2)
x_pipe_2 = pipe2b.transform_one(x_pipe_2)

pprint(x_pipe_2)

{'currency': 0,
 'merchant_id': 0,
 'payment_method': 0,
 'product_category': 0,
 'transaction_type': 0,
 'user_agent': 0}


In [5]:
pipe2b.get_feature_mappings()

{'product_category': {'luxury_items': 0},
 'user_agent': {'Mozilla/5.0 (Windows; U; Windows NT 11.0) AppleWebKit/532.16.3 (KHTML, like Gecko) Version/4.1 Safari/532.16.3': 0},
 'merchant_id': {'merchant_65': 0},
 'transaction_type': {'deposit': 0},
 'currency': {'BRL': 0},
 'payment_method': {'debit_card': 0}}

In [11]:
import datetime as dt
# Scratch check: parse the record's timestamp string and look at one component.
# The format requires fractional seconds (%f) and a UTC offset (%z).
test = dt.datetime.strptime(
        x['timestamp'],
        "%Y-%m-%dT%H:%M:%S.%f%z")#.strftime("%Y-%m-%dT%H:%M:%S")
test.second

6

In [16]:
def extract_device_info(x):
    """Return the ``os`` and ``browser`` entries of ``x['device_info']`` as a flat dict.

    Any other entries of ``device_info`` are dropped. A ``KeyError`` is raised
    if ``device_info`` or one of the two entries is absent.
    """
    device = x['device_info']
    return {field: device[field] for field in ('os', 'browser')}

# Feature group 3: device information.
# Step a: isolate the nested `device_info` dict.
pipe3a = compose.Select(
    "device_info",
)

pipe3a.learn_one(x)
x_pipe_3 = pipe3a.transform_one(x)

# Step b: flatten it to {'os': ..., 'browser': ...}.
pipe3b = compose.FuncTransformer(
    extract_device_info,
)

pipe3b.learn_one(x_pipe_3)
x_pipe_3 = pipe3b.transform_one(x_pipe_3)

# Step c: ordinal-encode the two strings with a separate encoder instance.
#pipe3c = preprocessing.OrdinalEncoder()
pipe3c = CustomPicklableOrdinalEncoder()

pipe3c.learn_one(x_pipe_3)
x_pipe_3 = pipe3c.transform_one(x_pipe_3)

In [15]:
pipe3c.get_feature_mappings()

{'os': {'Windows': 0}, 'browser': {'Opera': 0}}

In [18]:
import datetime as dt

def extract_timestamp_info(x):
    """Split the ISO-8601 ``timestamp`` string of a record into date and time parts.

    Accepts timestamps with or without fractional seconds, e.g.
    ``2025-04-17T19:52:06.994066+00:00`` and ``2025-04-17T19:52:06+00:00``.
    A UTC offset is required in both forms.

    Args:
        x: Record containing a ``timestamp`` string.
    Returns:
        Dict with integer ``year``, ``month``, ``day``, ``hour``, ``minute``
        and ``second``, taken from the timestamp as written (no timezone
        conversion is applied).
    Raises:
        KeyError: If ``x`` has no ``timestamp`` key.
        ValueError: If the string matches neither accepted format.
    """
    raw = x['timestamp']
    try:
        x_ = dt.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S.%f%z")
    except ValueError:
        # datetime.isoformat() omits the fractional part when microsecond == 0,
        # so a whole-second timestamp must be parsed without "%f".
        x_ = dt.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S%z")
    return {
        'year': x_.year,
        'month': x_.month,
        'day': x_.day,
        'hour': x_.hour,
        'minute': x_.minute,
        'second': x_.second
    }

# Feature group 4: timestamp parts.
# Step a: isolate the `timestamp` string.
pipe4a = compose.Select(
    "timestamp",
)

pipe4a.learn_one(x)
x_pipe_4 = pipe4a.transform_one(x)

# Step b: expand it into year/month/day/hour/minute/second integers.
# In this cell the parts are left as numbers (no ordinal encoding).
pipe4b = compose.FuncTransformer(
    extract_timestamp_info,
)

pipe4b.learn_one(x_pipe_4)
x_pipe_4 = pipe4b.transform_one(x_pipe_4)

pprint(x_pipe_4)

{'day': 17, 'hour': 19, 'minute': 52, 'month': 4, 'second': 6, 'year': 2025}


In [19]:
# Merge the four feature groups into one flat feature dict (dict union, Python 3.9+).
# NOTE(review): this rebinds `x` to the processed features, so the cells above that
# read x['device_info'] / x['timestamp'] cannot be re-run without first re-running
# the cell that defines the raw record.
x = x_pipe_1 | x_pipe_2 | x_pipe_3 | x_pipe_4

pprint(x)

{'account_age_days': 370,
 'amount': 302.69,
 'billing_address_match': True,
 'browser': 0,
 'currency': 0,
 'cvv_provided': True,
 'day': 17,
 'hour': 19,
 'merchant_id': 0,
 'minute': 52,
 'month': 4,
 'os': 0,
 'payment_method': 0,
 'product_category': 0,
 'second': 6,
 'transaction_type': 0,
 'user_agent': 0,
 'year': 2025}


## Trying to serialize (pickle) some parts

In [66]:
import pickle

# Persist the encoder fitted on the categorical string fields (pipe2b).
with open("ordinal_encoder_1.pkl", "wb") as f:
    pickle.dump(pipe2b, f)

In [67]:
# Persist the encoder fitted on the device os/browser fields (pipe3c).
with open("ordinal_encoder_2.pkl", "wb") as f:
    pickle.dump(pipe3c, f)

## Now, let's try to retrieve the saved encoder and check if internal data was saved

In [68]:
# Reload the first encoder, overwriting the in-memory `pipe2b` with the unpickled copy.
# The file was written by this notebook; pickle.load must not be used on untrusted files.
with open("ordinal_encoder_1.pkl", "rb") as f:
    pipe2b = pickle.load(f)

In [69]:
pipe2b.get_feature_mappings()

{'user_agent': {'Mozilla/5.0 (Windows; U; Windows NT 11.0) AppleWebKit/532.16.3 (KHTML, like Gecko) Version/4.1 Safari/532.16.3': 0},
 'payment_method': {'debit_card': 0},
 'merchant_id': {'merchant_65': 0},
 'product_category': {'luxury_items': 0},
 'transaction_type': {'deposit': 0},
 'currency': {'BRL': 0}}

## Create a function to process each sample

In [None]:
from river import compose
import datetime as dt

def extract_device_info(x):
    """Flatten the nested ``device_info`` dict of a record to its ``os`` and ``browser`` values.

    Raises ``KeyError`` when ``device_info``, ``os`` or ``browser`` is missing.
    """
    info = x['device_info']
    os_name, browser_name = info['os'], info['browser']
    return dict(os=os_name, browser=browser_name)

def extract_timestamp_info(x):
    """Split the ISO-8601 ``timestamp`` string of a record into date and time parts.

    Timestamps are accepted with or without fractional seconds; a UTC offset
    is required in both forms.

    Args:
        x: Record containing a ``timestamp`` string.
    Returns:
        Dict with integer ``year``, ``month``, ``day``, ``hour``, ``minute``
        and ``second``, taken from the timestamp as written.
    Raises:
        KeyError: If ``x`` has no ``timestamp`` key.
        ValueError: If the string matches neither accepted format.
    """
    raw = x['timestamp']
    try:
        x_ = dt.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S.%f%z")
    except ValueError:
        # Whole-second timestamps carry no ".ffffff" part (datetime.isoformat()
        # omits it when microsecond == 0); retry without "%f".
        x_ = dt.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S%z")
    return {
        'year': x_.year,
        'month': x_.month,
        'day': x_.day,
        'hour': x_.hour,
        'minute': x_.minute,
        'second': x_.second
    }

def process_sample(x, ordinal_encoder=None):
    """
    Turn one raw transaction record into a flat dict of model features.

    ``amount``, ``account_age_days``, ``cvv_provided`` and
    ``billing_address_match`` are passed through unchanged. The categorical
    string fields, the ``os``/``browser`` entries of ``device_info`` and the
    parts of ``timestamp`` are ordinal-encoded.

    Args:
        x: Raw record holding the fields named above.
        ordinal_encoder: Encoder to update and then use for this sample. Pass
            the same instance on every call so category IDs persist across
            samples. When omitted, a fresh ``CustomPicklableOrdinalEncoder``
            is built for this call only, and every encoded feature therefore
            comes out as 0.
    Returns:
        The pass-through features merged with the encoded features.
    Raises:
        KeyError: If ``x`` lacks one of the selected fields.
    """
    if ordinal_encoder is None:
        ordinal_encoder = CustomPicklableOrdinalEncoder()

    def _learn_then_transform(step, sample):
        # Mirrors the learn_one/transform_one pair used for every pipeline step.
        step.learn_one(sample)
        return step.transform_one(sample)

    # Pass-through features.
    x1 = _learn_then_transform(
        compose.Select(
            "amount",
            "account_age_days",
            "cvv_provided",
            "billing_address_match"
        ),
        x,
    )

    # Categorical string fields.
    x_pipe_2 = _learn_then_transform(
        compose.Select(
            "currency",
            "merchant_id",
            "payment_method",
            "product_category",
            "transaction_type",
            "user_agent"
        ),
        x,
    )

    # Device info: isolate the nested dict, then flatten it.
    x_pipe_3 = _learn_then_transform(compose.Select("device_info"), x)
    x_pipe_3 = _learn_then_transform(compose.FuncTransformer(extract_device_info), x_pipe_3)

    # Timestamp: isolate the string, then expand it into its parts.
    x_pipe_4 = _learn_then_transform(compose.Select("timestamp"), x)
    x_pipe_4 = _learn_then_transform(compose.FuncTransformer(extract_timestamp_info), x_pipe_4)

    x_to_encode = x_pipe_2 | x_pipe_3 | x_pipe_4
    ordinal_encoder.learn_one(x_to_encode)
    x2 = ordinal_encoder.transform_one(x_to_encode)
    return x1 | x2

In [15]:
from pprint import pprint

# Same raw transaction record as at the top of the notebook, re-created here
# because `x` was rebound to the processed features in an earlier cell.
x = {
    'transaction_id': 'ffd3d366-06e4-4ddb-894e-03876e893079', 
    'user_id': '61fa227e-d309-4ed0-b513-3cffa5526463', 
    'timestamp': '2025-04-17T19:52:06.994066+00:00', 
    'amount': 302.69, 
    'currency': 'BRL', 
    'merchant_id': 'merchant_65', 
    'product_category': 'luxury_items', 
    'transaction_type': 'deposit', 
    'payment_method': 'debit_card', 
    'location': {'lat': -68.4965105, 'lon': -153.515477}, 
    'ip_address': '169.235.63.28', 
    'device_info': {'os': 'Windows', 'browser': 'Opera'}, 
    'user_agent': 'Mozilla/5.0 (Windows; U; Windows NT 11.0) AppleWebKit/532.16.3 (KHTML, like Gecko) Version/4.1 Safari/532.16.3', 
    'account_age_days': 370, 
    'cvv_provided': True, 
    'billing_address_match': True, 
    'is_fraud': 0}

pprint(x)

{'account_age_days': 370,
 'amount': 302.69,
 'billing_address_match': True,
 'currency': 'BRL',
 'cvv_provided': True,
 'device_info': {'browser': 'Opera', 'os': 'Windows'},
 'ip_address': '169.235.63.28',
 'is_fraud': 0,
 'location': {'lat': -68.4965105, 'lon': -153.515477},
 'merchant_id': 'merchant_65',
 'payment_method': 'debit_card',
 'product_category': 'luxury_items',
 'timestamp': '2025-04-17T19:52:06.994066+00:00',
 'transaction_id': 'ffd3d366-06e4-4ddb-894e-03876e893079',
 'transaction_type': 'deposit',
 'user_agent': 'Mozilla/5.0 (Windows; U; Windows NT 11.0) AppleWebKit/532.16.3 '
               '(KHTML, like Gecko) Version/4.1 Safari/532.16.3',
 'user_id': '61fa227e-d309-4ed0-b513-3cffa5526463'}


In [16]:
processed_x = process_sample(x)
processed_x

{'account_age_days': 370,
 'billing_address_match': True,
 'amount': 302.69,
 'cvv_provided': True,
 'currency': 0,
 'merchant_id': 0,
 'transaction_type': 0,
 'user_agent': 0,
 'product_category': 0,
 'payment_method': 0,
 'os': 0,
 'browser': 0,
 'year': 0,
 'month': 0,
 'day': 0,
 'hour': 0,
 'minute': 0,
 'second': 0}

In [85]:
# Reload both pickled encoders written earlier in this notebook.
# pickle.load must only be used on files this notebook produced itself.
with open("ordinal_encoder_1.pkl", "rb") as f:
    pipe2b = pickle.load(f)
with open("ordinal_encoder_2.pkl", "rb") as f:
    pipe3c = pickle.load(f)

In [83]:
pipe2b.get_feature_mappings()

{'user_agent': {'Mozilla/5.0 (Windows; U; Windows NT 11.0) AppleWebKit/532.16.3 (KHTML, like Gecko) Version/4.1 Safari/532.16.3': 0},
 'payment_method': {'debit_card': 0},
 'merchant_id': {'merchant_65': 0},
 'product_category': {'luxury_items': 0},
 'transaction_type': {'deposit': 0},
 'currency': {'BRL': 0}}

In [81]:
pipe3c.get_feature_mappings()

{'os': {'Windows': 0}, 'browser': {'Opera': 0}}

## CLUSTERING

In [11]:
from river import base

# --- Custom Transformer for Imputing missing values ---
class DictImputer(base.Transformer):
    """
    Fill absent or ``None`` entries of a feature dictionary with a constant.

    Parameters
    ----------
    on
        Names of the features to check.
    fill_value
        Constant written for every checked feature whose key is missing or
        whose value is ``None``.
    """

    def __init__(self, on: list, fill_value):
        self.on = on
        self.fill_value = fill_value

    def transform_one(self, x: dict):
        # Work on a shallow copy so the caller's dict is left untouched.
        filled = x.copy()
        # `.get` returns None for both "key missing" and "value is None".
        needs_fill = [name for name in self.on if filled.get(name) is None]
        for name in needs_fill:
            filled[name] = self.fill_value
        return filled

In [8]:
from river import compose
import datetime as dt

def extract_device_info(x):
    """Pick ``os`` and ``browser`` out of the record's nested ``device_info`` dict.

    Other entries of ``device_info`` are ignored. Raises ``KeyError`` if
    ``device_info`` or either entry is missing.
    """
    nested = x['device_info']
    wanted = ('os', 'browser')
    return {key: nested[key] for key in wanted}

def extract_timestamp_info(x):
    """Split the ISO-8601 ``timestamp`` string of a record into date and time parts.

    Timestamps are accepted with or without fractional seconds; a UTC offset
    is required in both forms.

    Args:
        x: Record containing a ``timestamp`` string.
    Returns:
        Dict with integer ``year``, ``month``, ``day``, ``hour``, ``minute``,
        ``second`` and ``weekday`` (Monday is 0, Sunday is 6), taken from the
        timestamp as written.
    Raises:
        KeyError: If ``x`` has no ``timestamp`` key.
        ValueError: If the string matches neither accepted format.
    """
    raw = x['timestamp']
    try:
        x_ = dt.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S.%f%z")
    except ValueError:
        # Whole-second timestamps carry no ".ffffff" part (datetime.isoformat()
        # omits it when microsecond == 0); retry without "%f".
        x_ = dt.datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S%z")
    return {
        'year': x_.year,
        'month': x_.month,
        'day': x_.day,
        'hour': x_.hour,
        'minute': x_.minute,
        'second': x_.second,
        'weekday': x_.weekday(),
    }

def extract_coordinates(x):
    """Return the ``lat`` and ``lon`` entries of ``x['location']`` as a flat dict.

    Raises ``KeyError`` if ``location`` or either coordinate is missing.
    """
    place = x['location']
    return {axis: place[axis] for axis in ('lat', 'lon')}

In [None]:
#FROM CLASSIFICATION, ERASE LATER
def process_sample(x):
    """
    Build the classification feature dict for one raw transaction record.

    Four fields are passed through unchanged; the categorical strings, the
    device ``os``/``browser`` and the timestamp parts are ordinal-encoded.

    NOTE(review): the encoder is read from a module-level name
    ``ordinal_encoder`` that this cell does not define, and a later cell in
    this notebook redefines ``process_sample``.
    """
    def _learn_then_transform(step, sample):
        # Every pipeline step is updated with the sample before being applied to it.
        step.learn_one(sample)
        return step.transform_one(sample)

    # Pass-through features.
    x1 = _learn_then_transform(
        compose.Select(
            "amount",
            "account_age_days",
            "cvv_provided",
            "billing_address_match"
        ),
        x,
    )

    # Categorical string fields ("user_agent" is deliberately left out here).
    x_pipe_2 = _learn_then_transform(
        compose.Select(
            "currency",
            "merchant_id",
            "payment_method",
            "product_category",
            "transaction_type",
            #"user_agent"
        ),
        x,
    )

    # Device info: isolate the nested dict, then flatten it.
    x_pipe_3 = _learn_then_transform(compose.Select("device_info"), x)
    x_pipe_3 = _learn_then_transform(compose.FuncTransformer(extract_device_info), x_pipe_3)

    # Timestamp: isolate the string, then expand it into its parts.
    x_pipe_4 = _learn_then_transform(compose.Select("timestamp"), x)
    x_pipe_4 = _learn_then_transform(compose.FuncTransformer(extract_timestamp_info), x_pipe_4)

    x_to_encode = x_pipe_2 | x_pipe_3 | x_pipe_4
    x2 = _learn_then_transform(ordinal_encoder, x_to_encode)
    return x1 | x2

In [93]:
from pprint import pprint

# Raw clickstream event used as the running example for the clustering section.
# `quantity`, `referrer_url` and `search_query` are None in this sample.
x = {'customer_id': '494ad3bb-7170-4286-bbbc-1486adec7a67',
     'device_info': {'browser': 'Safari', 'device_type': 'Mobile', 'os': 'Windows'},
     'event_id': '8ae9d883-dae1-4446-befc-db3ae2d4d599',
     'event_type': 'page_view',
     'location': {'lat': 30.010473, 'lon': -95.78408},
     'page_url': 'https://example.com/grocery-gourmet-food/prod_69785',
     'price': 1058.04,
     'product_category': 'Grocery & Gourmet Food',
     'product_id': 'prod_69785',
     'quantity': None,
     'referrer_url': None,
     'search_query': None,
     'session_event_sequence': 3,
     'session_id': 'c89c8d69-d39a-4366-95a5-9b83f0ea9db5',
     'time_on_page_seconds': 53,
     'timestamp': '2025-05-04T15:18:32.284468+00:00'}

pprint(x)

{'customer_id': '494ad3bb-7170-4286-bbbc-1486adec7a67',
 'device_info': {'browser': 'Safari', 'device_type': 'Mobile', 'os': 'Windows'},
 'event_id': '8ae9d883-dae1-4446-befc-db3ae2d4d599',
 'event_type': 'page_view',
 'location': {'lat': 30.010473, 'lon': -95.78408},
 'page_url': 'https://example.com/grocery-gourmet-food/prod_69785',
 'price': 1058.04,
 'product_category': 'Grocery & Gourmet Food',
 'product_id': 'prod_69785',
 'quantity': None,
 'referrer_url': None,
 'search_query': None,
 'session_event_sequence': 3,
 'session_id': 'c89c8d69-d39a-4366-95a5-9b83f0ea9db5',
 'time_on_page_seconds': 53,
 'timestamp': '2025-05-04T15:18:32.284468+00:00'}


In [94]:
from river import compose, preprocessing

# Stateful preprocessors kept at module level so they accumulate statistics
# across calls to process_sample (which reads them as globals).
scaler = preprocessing.StandardScaler()
feature_hasher = preprocessing.FeatureHasher(n_features = 50, seed = 42)

def process_sample(x):
    """
    Build the clustering feature dict for one raw clickstream event.

    Numeric fields are standardised with the module-level ``scaler`` and the
    remaining fields are hashed with the module-level ``feature_hasher``;
    both objects are updated with this sample before being applied.
    Missing/``None`` values are replaced by ``False`` beforehand.

    Returns
    -------
    tuple
        ``(features, scaler, feature_hasher)``: the merged scaled + hashed
        feature dict and the two (updated) module-level preprocessors.

    NOTE(review): ``lat``/``lon`` are extracted from ``location`` but appear in
    neither feature list below, so they do not reach the returned features.
    """
    def _learn_then_transform(step, sample):
        # Every step is updated with the sample before being applied to it.
        step.learn_one(sample)
        return step.transform_one(sample)

    # Raw numeric fields.
    x1 = _learn_then_transform(
        compose.Select(
            'price',
            'quantity',
            'session_event_sequence',
            'time_on_page_seconds'
        ),
        x,
    )

    # Raw categorical fields.
    x_pipe_2 = _learn_then_transform(
        compose.Select(
            'event_type',
            'product_category',
            'product_id',
            'referrer_url',
        ),
        x,
    )

    # Device info: isolate the nested dict, then flatten it.
    x_pipe_3 = _learn_then_transform(compose.Select("device_info"), x)
    x_pipe_3 = _learn_then_transform(compose.FuncTransformer(extract_device_info), x_pipe_3)

    # Timestamp: isolate the string, then expand it into its parts.
    x_pipe_4 = _learn_then_transform(compose.Select("timestamp"), x)
    x_pipe_4 = _learn_then_transform(compose.FuncTransformer(extract_timestamp_info), x_pipe_4)

    # Location: isolate the nested dict, then flatten it to lat/lon.
    x_pipe_5 = _learn_then_transform(compose.Select("location"), x)
    x_pipe_5 = _learn_then_transform(compose.FuncTransformer(extract_coordinates), x_pipe_5)

    # Merge everything and replace None / missing values by False.
    x_to_prep = x1 | x_pipe_2 | x_pipe_3 | x_pipe_4 | x_pipe_5
    imputer = DictImputer(
        fill_value = False,
        on = list(x_to_prep.keys()))
    x_to_prep = imputer.transform_one(x_to_prep)

    numerical_features = [
        'price',
        'session_event_sequence',
        'time_on_page_seconds',
        'quantity'
    ]
    categorical_features = [
        'event_type',
        'product_category',
        'product_id',
        'referrer_url',
        'os',
        'browser',
        'year',
        'month',
        'day',
        'hour',
        'minute',
        'second',
        'weekday'
    ]

    # Split by feature kind, then scale the numeric part and hash the rest.
    x_num = _learn_then_transform(compose.Select(*numerical_features), x_to_prep)
    x_cat = _learn_then_transform(compose.Select(*categorical_features), x_to_prep)
    x_scaled = _learn_then_transform(scaler, x_num)
    x_hashed = _learn_then_transform(feature_hasher, x_cat)

    return x_scaled | x_hashed, scaler, feature_hasher
    



In [95]:
# Process the example event. NOTE(review): this rebinds `x` from the raw event to the
# processed features, so re-running this cell without re-creating the raw `x` first
# passes already-processed features back into process_sample.
x, scaler, feature_hasher = process_sample(x)
pprint(x)

{1: 1,
 2: 18,
 14: 5,
 20: 6,
 27: 0,
 29: 19,
 31: 32,
 34: 1,
 37: 1,
 42: 2025,
 46: 1,
 48: 1,
 'price': 0.0,
 'quantity': 0.0,
 'session_event_sequence': 0.0,
 'time_on_page_seconds': 0.0}


In [47]:
import pickle
import os

# Create the output directory if needed (no error when it already exists).
os.makedirs('encoders', exist_ok=True)

# Persist the fitted scaler and feature hasher as separate pickle files.
with open('encoders/scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('encoders/feature_hasher.pkl', 'wb') as f:
    pickle.dump(feature_hasher, f)

In [50]:
scaler.counts

Counter({'price': 1,
         'session_event_sequence': 1,
         'time_on_page_seconds': 1,
         'quantity': 1})

In [115]:
from river import cluster

# Online clustering model under test. The hyperparameter values are exploratory;
# see the river DBSTREAM reference for their meaning.
model = cluster.DBSTREAM(
                clustering_threshold = 1.0,
                fading_factor = 0.01,
                cleanup_interval = 2,
            )
# Alternative model kept for comparison:
#model = cluster.KMeans(n_clusters=3, halflife=0.4, sigma=3, seed=0)

In [116]:
# Update the model with the processed sample, then ask which cluster it falls in.
model.learn_one(x)
y_pred = model.predict_one(x)
y_pred

0

In [117]:
model.centers

{0: {'price': 0.0,
  'session_event_sequence': 0.0,
  'time_on_page_seconds': 0.0,
  'quantity': 0.0,
  48: 1,
  2: 18,
  46: 1,
  34: 1,
  31: 32,
  1: 1,
  20: 6,
  29: 19,
  42: 2025,
  37: 1,
  14: 5,
  27: 0}}

In [118]:
print(x)
print(y_pred)
print(model.centers)

{'price': 0.0, 'session_event_sequence': 0.0, 'time_on_page_seconds': 0.0, 'quantity': 0.0, 48: 1, 2: 18, 46: 1, 34: 1, 31: 32, 1: 1, 20: 6, 29: 19, 42: 2025, 37: 1, 14: 5, 27: 0}
0
{0: {'price': 0.0, 'session_event_sequence': 0.0, 'time_on_page_seconds': 0.0, 'quantity': 0.0, 48: 1, 2: 18, 46: 1, 34: 1, 31: 32, 1: 1, 20: 6, 29: 19, 42: 2025, 37: 1, 14: 5, 27: 0}}


In [120]:
model.clustering_threshold

1.0

In [119]:
dir(model) #DBSTREAM

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__or__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__ror__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_centers',
 '_cleanup',
 '_clusters',
 '_distance',
 '_find_fixed_radius_nn',
 '_gaussian_neighborhood',
 '_generate_clusters_from_labels',
 '_generate_labels',
 '_generate_weighted_adjacency_matrix',
 '_get_params',
 '_is_stochastic',
 '_memory_usage',
 '_micro_clusters',
 '_more_tags',
 '_mutable_attributes',
 '_n_clusters',
 '_raw_memory_usage',
 '_recluster',
 '_repr_html_',
 '_supervised',
 '_tags',
 '_time_stamp',
 '_unit_test_params',
 '_unit_test_skips',
 '_update',
 'centers',
 'cleanup_interval',
 'clone',
 'clustering_i

In [92]:
from river import cluster
from river import stream
from river import metrics

# Toy 2-D dataset: three columns of points at first coordinate 1, 4 and -2.
X = [
    [1, 2],
    [1, 4],
    [1, 0],
    [4, 2],
    [4, 4],
    [4, 0],
    [-2, 2],
    [-2, 4],
    [-2, 0]
]

# Seeded so the run is repeatable.
k_means = cluster.KMeans(n_clusters=3, halflife=0.4, sigma=3, seed=0)
metric = metrics.Silhouette()

# Learn from each point, predict its cluster, and update the silhouette
# metric with the current centers.
# NOTE(review): the loop rebinds the notebook-level name `x`.
for x, _ in stream.iter_array(X):
    k_means.learn_one(x)
    y_pred = k_means.predict_one(x)
    metric.update(x, y_pred, k_means.centers)

# State after the last point of the stream.
print(x)
print(y_pred)
print(k_means.centers)

metric

{0: -2, 1: 0}
2
{0: defaultdict(..., {0: 2.825146214041993, 1: -4.1897343141034495}), 1: defaultdict(..., {0: 3.2102057281313594, 1: 1.5694266433303334}), 2: defaultdict(..., {0: -2.22659408343387, 1: 1.201266225235274})}


Silhouette: 0.32145