In [2]:
import polars as pl
import orjson
import datetime as dt
from pprint import pprint
from typing import Any, Dict, Hashable, Optional, List
import tqdm
from river import (
    base,
    compose,
    metrics,
    drift,
    forest,
    cluster,
    preprocessing,
    time_series,
    linear_model,
    utils,
)

In [3]:
MINIO_HOST = "localhost"
MINIO_ENDPOINT = f"http://{MINIO_HOST}:9000"
MINIO_ACCESS_KEY = "minioadmin"
MINIO_SECRET_KEY = "minioadmin123"
PROJECT_NAME = "E-Commerce Customer Interactions"

In [4]:
DELTA_STORAGE_OPTIONS = {
    "AWS_ENDPOINT_URL": MINIO_ENDPOINT,
    "AWS_ACCESS_KEY_ID": MINIO_ACCESS_KEY,
    "AWS_SECRET_ACCESS_KEY": MINIO_SECRET_KEY,
    "AWS_REGION": "us-east-1",
    "AWS_S3_ALLOW_UNSAFE_RENAME": "true",
    "AWS_ALLOW_HTTP": "true",
}

In [5]:
DELTA_PATHS = {
    "Transaction Fraud Detection": "s3://lakehouse/delta/transaction_fraud_detection",
    "Estimated Time of Arrival": "s3://lakehouse/delta/estimated_time_of_arrival",
    "E-Commerce Customer Interactions": "s3://lakehouse/delta/e_commerce_customer_interactions",
    "Sales Forecasting": "s3://lakehouse/delta/sales_forecasting",
}

In [6]:
delta_path = DELTA_PATHS.get(PROJECT_NAME)

In [7]:
lf = pl.scan_delta(
    delta_path, 
    storage_options = DELTA_STORAGE_OPTIONS)

In [8]:
sql = pl.SQLContext()
sql.register("data", lf)

<SQLContext [tables:1] at 0x7fbf0f6bfe00>

In [9]:
result = sql.execute("SELECT * FROM data LIMIT 1000").collect()
result

event_id,customer_id,session_id,timestamp,event_type,product_id,product_category,price,quantity,page_url,referrer_url,device_info,location,session_event_sequence,time_on_page_seconds,search_query
str,str,str,str,str,str,str,f64,i32,str,str,str,str,i32,i32,str
"""d2aa112c-0d3d-4eb1-8597-a6dc4b…","""c8085743-2d9b-4030-ab8d-662626…","""6bc80f2f-1aed-4ed6-b193-4199cd…","""2026-01-14T19:49:42.812284+00:…","""add_to_cart""","""prod_1029""","""Pet Supplies""",1596.25,1,"""https://example.com/pet-suppli…",,"""{""device_type"":""Desktop"",""brow…","""{""lat"":29.691,""lon"":-95.774}""",112,72,
"""608c8202-e358-4e53-bbf5-eca2f0…","""d26c26bd-a2d2-4340-9bda-3d2f39…","""6e7a8e16-f3df-4c4d-9999-5fc865…","""2026-01-14T19:49:42.918144+00:…","""page_view""","""prod_1021""","""Electronics""",2456.9,,"""https://example.com/electronic…",,"""{""device_type"":""Tablet"",""brows…","""{""lat"":30.093,""lon"":-95.478}""",109,82,
"""0b7c927c-19a1-4c7f-b056-62cff1…","""c553af98-cd6e-4cad-a602-04847f…","""c9fea4a0-edec-4c35-8c54-63899a…","""2026-01-14T19:49:43.024483+00:…","""page_view""","""prod_1083""","""Beauty & Personal Care""",1849.36,,"""https://example.com/beauty-per…",,"""{""device_type"":""Desktop"",""brow…","""{""lat"":30.089,""lon"":-95.194}""",119,183,
"""70b41dc4-445c-4c62-a740-5f1bf9…","""e57cc5b4-b47f-41b5-a8b6-0cd692…","""21de46a2-fe38-4afa-bfa5-f9cc7a…","""2026-01-14T19:49:43.058619+00:…","""purchase""","""prod_1076""","""Pet Supplies""",1541.54,2,"""https://example.com/pet-suppli…",,"""{""device_type"":""Tablet"",""brows…","""{""lat"":29.582,""lon"":-95.02}""",120,79,
"""0b212d3f-b143-4271-851a-8c9ba6…","""13ef1d2e-6de5-4646-8b93-0f11a4…","""2a95b204-a56b-4609-8ee6-faa2dd…","""2026-01-14T19:49:43.132349+00:…","""add_to_cart""","""prod_1005""","""Beauty & Personal Care""",2018.21,4,"""https://example.com/beauty-per…",,"""{""device_type"":""Tablet"",""brows…","""{""lat"":29.627,""lon"":-95.509}""",108,168,
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""e0a02f9e-8a16-4a8a-af1f-df2841…","""adf5a822-0946-4556-afcc-6e1424…","""b3fc4c91-39a8-435d-9c58-72394e…","""2026-01-14T19:48:19.004959+00:…","""add_to_cart""","""prod_1061""","""Computers""",180.11,1,"""https://example.com/computers/…",,"""{""device_type"":""Desktop"",""brow…","""{""lat"":29.974,""lon"":-95.147}""",95,205,
"""fba1409c-d7f2-4b41-abcc-9b0e71…","""316f45c3-a553-4908-b96d-8efc7c…","""b27644d6-3003-4268-9f5d-5e3828…","""2026-01-14T19:48:19.275875+00:…","""page_view""","""prod_1009""","""Electronics""",454.54,,"""https://example.com/electronic…",,"""{""device_type"":""Mobile"",""brows…","""{""lat"":29.716,""lon"":-95.397}""",113,60,
"""fe07e754-9390-4012-9495-94f7eb…","""a26e4d50-f82e-4ca2-8f17-1febea…","""6b84e786-0a15-4ea4-9c05-2d6e5b…","""2026-01-14T19:48:19.612982+00:…","""page_view""","""prod_1053""","""Automotive""",628.71,,"""https://example.com/automotive…",,"""{""device_type"":""Tablet"",""brows…","""{""lat"":29.733,""lon"":-95.72}""",122,63,
"""d1634fca-4a9f-4b95-8041-f8a2ad…","""187e429b-a41f-4e09-9a80-9b3b2b…","""3ce88fda-4d6a-4765-98fe-cc05a2…","""2026-01-14T19:48:19.713327+00:…","""page_view""","""prod_1071""","""Pet Supplies""",435.44,,"""https://example.com/pet-suppli…",,"""{""device_type"":""Desktop"",""brow…","""{""lat"":29.8,""lon"":-95.259}""",115,265,


In [10]:
samples = result.to_dicts()
samples[:3]

[{'event_id': 'd2aa112c-0d3d-4eb1-8597-a6dc4b33b7d2',
  'customer_id': 'c8085743-2d9b-4030-ab8d-6626263831dc',
  'session_id': '6bc80f2f-1aed-4ed6-b193-4199cd9731a5',
  'timestamp': '2026-01-14T19:49:42.812284+00:00',
  'event_type': 'add_to_cart',
  'product_id': 'prod_1029',
  'product_category': 'Pet Supplies',
  'price': 1596.25,
  'quantity': 1,
  'page_url': 'https://example.com/pet-supplies/prod_1029',
  'referrer_url': None,
  'device_info': '{"device_type":"Desktop","browser":"Opera","os":"iOS"}',
  'location': '{"lat":29.691,"lon":-95.774}',
  'session_event_sequence': 112,
  'time_on_page_seconds': 72,
  'search_query': None},
 {'event_id': '608c8202-e358-4e53-bbf5-eca2f0e12c69',
  'customer_id': 'd26c26bd-a2d2-4340-9bda-3d2f39cd6e6e',
  'session_id': '6e7a8e16-f3df-4c4d-9999-5fc865055cf8',
  'timestamp': '2026-01-14T19:49:42.918144+00:00',
  'event_type': 'page_view',
  'product_id': 'prod_1021',
  'product_category': 'Electronics',
  'price': 2456.9,
  'quantity': None,
  

In [11]:
for sample in samples:
    pprint(sample)
    break

{'customer_id': 'c8085743-2d9b-4030-ab8d-6626263831dc',
 'device_info': '{"device_type":"Desktop","browser":"Opera","os":"iOS"}',
 'event_id': 'd2aa112c-0d3d-4eb1-8597-a6dc4b33b7d2',
 'event_type': 'add_to_cart',
 'location': '{"lat":29.691,"lon":-95.774}',
 'page_url': 'https://example.com/pet-supplies/prod_1029',
 'price': 1596.25,
 'product_category': 'Pet Supplies',
 'product_id': 'prod_1029',
 'quantity': 1,
 'referrer_url': None,
 'search_query': None,
 'session_event_sequence': 112,
 'session_id': '6bc80f2f-1aed-4ed6-b193-4199cd9731a5',
 'time_on_page_seconds': 72,
 'timestamp': '2026-01-14T19:49:42.812284+00:00'}


# River ML Clustering Metrics Investigation

## dir(metrics) Analysis for Clustering

### Summary of Findings:
- **Total items in metrics**: 89 (49 classes, 27 submodules)
- **Clustering-specific metrics**: Only **1** - `Silhouette`

### Clustering Metric Classes:
| Metric | Base Class | Parameters | bigger_is_better |
|--------|------------|------------|------------------|
| Silhouette | ClusteringMetric | None | False* |

*Note: `bigger_is_better=False` appears to be a River implementation quirk. Standard interpretation is higher Silhouette = better clustering.

### Silhouette.update() Signature:
```python
update(x, y_pred, centers, w=1.0)
```
- `x`: Input data point (dict)
- `y_pred`: Predicted cluster ID
- `centers`: Cluster centers from algorithm (dict)
- `w`: Weight (optional, default=1.0)

**Requirement**: At least 2 clusters with centers for Silhouette to compute.

### NOT Available in River (would need external implementation):
- Davies-Bouldin Index
- Calinski-Harabasz Index
- Dunn Index
- WCSS (Within-Cluster Sum of Squares)
- BCSS (Between-Cluster Sum of Squares)

# River ML Clustering Algorithms Investigation

## dir(cluster) Analysis

### Available Clustering Algorithms (7):

| Algorithm | Description | Has Centers | Silhouette Compatible |
|-----------|-------------|-------------|----------------------|
| **DBSTREAM** | Density-based stream clustering | Yes | Yes |
| **DenStream** | Density-based stream clustering | Yes | Yes (after init) |
| **KMeans** | Incremental k-means | Yes | Yes |
| **CluStream** | Micro-cluster based | Yes | Yes (after init) |
| **STREAMKMeans** | Stream k-means variant | Yes | Yes |
| **ODAC** | Online Divisive-Agglomerative | No | No |
| **TextClust** | Text stream clustering | No | No |

## DBSTREAM - Used in ECCI Project

### Parameters:
| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| clustering_threshold | float | 1.0 | Micro-cluster radius (r) |
| fading_factor | float | 0.01 | Historical data importance (must be > 0) |
| cleanup_interval | float | 2 | Time between cleanup processes |
| intersection_factor | float | 0.3 | Cluster overlap ratio for connectivity |
| minimum_weight | float | 1.0 | Threshold for non-noisy cluster |

### Attributes (after training):
- `n_clusters`: Number of macro clusters
- `centers`: Dict of cluster centers (for Silhouette)
- `micro_clusters`: Dict of micro-cluster objects
- `clusters`: Final cluster assignments

### Micro-cluster Attributes:
- `center`: Centroid coordinates
- `weight`: Accumulated weight
- `last_update`: Last update timestamp

In [12]:
class CustomOrdinalEncoder:
    """
    An incremental ordinal encoder that is picklable and processes dictionaries.
    Assigns a unique integer ID to each unique category encountered for each feature.
    """
    def __init__(self):
        self._feature_mappings: Dict[Hashable, Dict[Any, int]] = {}
        self._feature_next_ids: Dict[Hashable, int] = {}

    def learn_one(self, x: Dict[Hashable, Any]):
        for feature_name, category_value in x.items():
            if not isinstance(category_value, Hashable):
                 continue
            if feature_name not in self._feature_mappings:
                self._feature_mappings[feature_name] = {}
                self._feature_next_ids[feature_name] = 0
            feature_map = self._feature_mappings[feature_name]
            feature_next_id = self._feature_next_ids[feature_name]
            if category_value not in feature_map:
                feature_map[category_value] = feature_next_id
                self._feature_next_ids[feature_name] += 1

    def transform_one(self, x: Dict[Hashable, Any]) -> Dict[Hashable, int]:
        transformed_sample: Dict[Hashable, int] = {}
        for feature_name, category_value in x.items():
            if feature_name in self._feature_mappings:
                feature_map = self._feature_mappings[feature_name]
                if category_value in feature_map:
                    transformed_sample[feature_name] = feature_map[category_value]
                else:
                    raise KeyError(f"Unseen category '{category_value}' for feature '{feature_name}'")
        return transformed_sample

    def get_feature_mappings(self) -> Dict[Hashable, Dict[Any, int]]:
        return self._feature_mappings

    def __repr__(self) -> str:
        num_features = len(self._feature_mappings)
        return f"CustomOrdinalEncoder(features={num_features})"

In [13]:
class DictImputer(base.Transformer):
    """
    Imputes missing values (None or missing keys) for specified features in a dictionary.
    """
    def __init__(self, on: list, fill_value):
        self.on = on
        self.fill_value = fill_value

    def transform_one(self, x: dict):
        x_transformed = x.copy()
        for feature in self.on:
            if x_transformed.get(feature) is None:
                x_transformed[feature] = self.fill_value
        return x_transformed

In [14]:
def extract_device_info(x):
    x_ = x['device_info']
    if isinstance(x_, str):
        x_ = orjson.loads(x_)
    return {
        'os': x_['os'],
        'browser': x_['browser'],
    }

def extract_timestamp_info(x):
    x_ = dt.datetime.strptime(
        x['timestamp'],
        "%Y-%m-%dT%H:%M:%S.%f%z")
    return {
        'year': x_.year,
        'month': x_.month,
        'day': x_.day,
        'hour': x_.hour,
        'minute': x_.minute,
        'second': x_.second
    }

def extract_coordinates(x):
    x_ = x['location']
    if isinstance(x_, str):
        x_ = orjson.loads(x_)
    return {
        'lat': x_['lat'],
        'lon': x_['lon'],
    }

In [15]:
def process_sample_ecci(x, encoders):
    """Process a single ECCI sample for River incremental learning."""
    pipe1 = compose.Select(
        'price',
        'quantity',
        'session_event_sequence',
        'time_on_page_seconds'
    )
    pipe1.learn_one(x)
    x1 = pipe1.transform_one(x)
    
    pipe2 = compose.Select(
        'event_type',
        'product_category',
        'product_id',
        'referrer_url',
    )
    pipe2.learn_one(x)
    x_pipe_2 = pipe2.transform_one(x)
    
    pipe3a = compose.Select("device_info")
    pipe3a.learn_one(x)
    x_pipe_3 = pipe3a.transform_one(x)
    pipe3b = compose.FuncTransformer(extract_device_info)
    pipe3b.learn_one(x_pipe_3)
    x_pipe_3 = pipe3b.transform_one(x_pipe_3)
    
    pipe4a = compose.Select("timestamp")
    pipe4a.learn_one(x)
    x_pipe_4 = pipe4a.transform_one(x)
    pipe4b = compose.FuncTransformer(extract_timestamp_info)
    pipe4b.learn_one(x_pipe_4)
    x_pipe_4 = pipe4b.transform_one(x_pipe_4)
    
    pipe5a = compose.Select("location")
    pipe5a.learn_one(x)
    x_pipe_5 = pipe5a.transform_one(x)
    pipe5b = compose.FuncTransformer(extract_coordinates)
    pipe5b.learn_one(x_pipe_5)
    x_pipe_5 = pipe5b.transform_one(x_pipe_5)
    
    x_to_prep = x1 | x_pipe_2 | x_pipe_3 | x_pipe_4 | x_pipe_5
    x_to_prep = DictImputer(
        fill_value=False, 
        on=list(x_to_prep.keys())
    ).transform_one(x_to_prep)
    
    numerical_features = [
        'price',
        'session_event_sequence',
        'time_on_page_seconds',
        'quantity'
    ]
    categorical_features = [
        'event_type',
        'product_category',
        'product_id',
        'referrer_url',
        'os',
        'browser',
        'year',
        'month',
        'day',
        'hour',
        'minute',
        'second'
    ]
    
    num_pipe = compose.Select(*numerical_features)
    num_pipe.learn_one(x_to_prep)
    x_num = num_pipe.transform_one(x_to_prep)
    
    cat_pipe = compose.Select(*categorical_features)
    cat_pipe.learn_one(x_to_prep)
    x_cat = cat_pipe.transform_one(x_to_prep)
    
    encoders["standard_scaler"].learn_one(x_num)
    x_scaled = encoders["standard_scaler"].transform_one(x_num)
    
    encoders["feature_hasher"].learn_one(x_cat)
    x_hashed = encoders["feature_hasher"].transform_one(x_cat)
    
    return x_scaled | x_hashed, encoders

In [16]:
def _create_default_model_ecci():
    """
    Create DBSTREAM model for E-Commerce Customer Interactions.
    
    Reference: https://riverml.xyz/latest/api/cluster/DBSTREAM/
    
    DBSTREAM Parameters:
    - clustering_threshold=1.5: Micro-cluster radius
    - fading_factor=0.05: Historical data importance (must be > 0)
    - cleanup_interval=4: Time between cleanup processes
    - intersection_factor=0.5: Cluster overlap ratio for connectivity
    - minimum_weight=1.0: Threshold for non-noisy cluster classification
    """
    return cluster.DBSTREAM(
        clustering_threshold=1.5,
        fading_factor=0.05,
        cleanup_interval=4,
        intersection_factor=0.5,
        minimum_weight=1.0,
    )

def _create_default_encoders_ecci():
    """Create default encoders for ECCI."""
    return {
        "standard_scaler": preprocessing.StandardScaler(),
        "feature_hasher": preprocessing.FeatureHasher()
    }

In [17]:
# =============================================================================
# RIVER ML METRICS CONFIGURATION FOR E-COMMERCE CUSTOMER INTERACTIONS
# =============================================================================
# Research-based configuration for real-time customer behavior clustering.
#
# KEY FINDINGS FROM dir(metrics) INVESTIGATION:
#
# CLUSTERING METRICS IN RIVER:
#   - Only ONE clustering metric: metrics.Silhouette
#   - No configurable parameters
#   - Requires: at least 2 clusters with centers
#
# SILHOUETTE METRIC:
#   - Range: [-1, 1]
#   - Higher values = better clustering (cohesion + separation)
#   - bigger_is_better=False (River implementation quirk)
#   - update(x, y_pred, centers, w=1.0)
#
# ROLLING WRAPPERS (river.utils):
#   - utils.Rolling(obj, window_size) - sample-based window
#   - utils.TimeRolling(obj, period) - time-based window
#
# ALTERNATIVE METRICS (computed from model attributes):
#   - n_clusters: Number of macro clusters formed
#   - n_micro_clusters: Number of micro clusters (DBSTREAM specific)
#   - Custom cohesion/separation from centers
# =============================================================================

from river import metrics, utils

# -----------------------------------------------------------------------------
# CLUSTERING METRICS
# -----------------------------------------------------------------------------
clustering_metric_classes = {
    "Silhouette": metrics.Silhouette,
}
clustering_metric_args = {
    "Silhouette": {},  # No configurable parameters
}

# -----------------------------------------------------------------------------
# ROLLING METRICS (for concept drift detection)
# -----------------------------------------------------------------------------
rolling_metric_classes = {
    "RollingSilhouette": utils.Rolling,
}
rolling_metric_args = {
    "RollingSilhouette": {"obj": metrics.Silhouette(), "window_size": 1000},
}

# -----------------------------------------------------------------------------
# TIME-BASED ROLLING METRICS
# -----------------------------------------------------------------------------
time_rolling_metric_classes = {
    "TimeRollingSilhouette": utils.TimeRolling,
}
time_rolling_metric_args = {
    "TimeRollingSilhouette": {"obj": metrics.Silhouette(), "period": dt.timedelta(minutes=5)},
}

print("=" * 70)
print("CLUSTERING METRICS FOR E-COMMERCE CUSTOMER INTERACTIONS")
print("=" * 70)
print("\nMetric Classes:")
for name in clustering_metric_classes:
    print(f"  {name}: metrics.Silhouette() - higher is better (range: -1 to 1)")

print("\nMetric Arguments (all empty - no configurable params):")
for name, args in clustering_metric_args.items():
    print(f"  {name}: {args}")

print("\nRolling Metrics:")
for name, args in rolling_metric_args.items():
    print(f"  {name}: Rolling(Silhouette(), window_size={args['window_size']})")

print("\nTime Rolling Metrics:")
for name, args in time_rolling_metric_args.items():
    print(f"  {name}: TimeRolling(Silhouette(), period={args['period']})")

print("\n" + "=" * 70)
print("BEST MODEL SELECTION: Maximize Silhouette")
print("NOTE: Despite bigger_is_better=False, higher Silhouette = better clustering")
print("=" * 70)

CLUSTERING METRICS FOR E-COMMERCE CUSTOMER INTERACTIONS

Metric Classes:
  Silhouette: metrics.Silhouette() - higher is better (range: -1 to 1)

Metric Arguments (all empty - no configurable params):
  Silhouette: {}

Rolling Metrics:
  RollingSilhouette: Rolling(Silhouette(), window_size=1000)

Time Rolling Metrics:
  TimeRollingSilhouette: TimeRolling(Silhouette(), period=0:05:00)

BEST MODEL SELECTION: Maximize Silhouette
NOTE: Despite bigger_is_better=False, higher Silhouette = better clustering


In [18]:
# =============================================================================
# INVESTIGATION FINDINGS: dir(metrics) - CLUSTERING
# =============================================================================
# Total items: 89 (49 classes, 27 submodules)
#
# CLUSTERING METRIC HIERARCHY:
#   metrics.base.ClusteringMetric
#       -> metrics.Silhouette (ONLY subclass)
#
# SILHOUETTE DETAILS:
#   - __init__(): No parameters
#   - update(x, y_pred, centers, w=1.0)
#   - get() -> float
#   - bigger_is_better: False (implementation quirk)
#   - Range: [-1, 1]
#   - Requires: len(centers) >= 2
#
# ROLLING WRAPPERS (river.utils, NOT river.metrics):
#   utils.Rolling(obj, window_size)      - sample-based window
#   utils.TimeRolling(obj, period)       - time-based window (requires t=timestamp)
#
# CLUSTERING ALGORITHMS WITH CENTERS (Silhouette compatible):
#   - cluster.DBSTREAM    -> centers: dict
#   - cluster.DenStream   -> centers: dict
#   - cluster.KMeans      -> centers: dict
#   - cluster.CluStream   -> centers: dict
#   - cluster.STREAMKMeans -> centers: dict
#
# ALGORITHMS WITHOUT CENTERS (Silhouette NOT compatible):
#   - cluster.ODAC
#   - cluster.TextClust
# =============================================================================

from river import metrics, cluster, utils
import datetime as dt

model = _create_default_model_ecci()
encoders = _create_default_encoders_ecci()

clustering_metric_classes = {
    "Silhouette": metrics.Silhouette,
}
clustering_metric_args = {
    "Silhouette": {},
}

rolling_metric_classes = {
    "RollingSilhouette": utils.Rolling,
}
rolling_metric_args = {
    "RollingSilhouette": {"obj": metrics.Silhouette(), "window_size": 1000},
}

time_rolling_metric_classes = {
    "TimeRollingSilhouette": utils.TimeRolling,
}
time_rolling_metric_args = {
    "TimeRollingSilhouette": {"obj": metrics.Silhouette(), "period": dt.timedelta(minutes=5)},
}


clustering_metrics = {
    name: clustering_metric_classes[name](**clustering_metric_args[name])
    for name in clustering_metric_classes
}

rolling_metrics = {
    name: rolling_metric_classes[name](**rolling_metric_args[name])
    for name in rolling_metric_classes
}
time_rolling_metrics = {
    name: time_rolling_metric_classes[name](**time_rolling_metric_args[name])
    for name in time_rolling_metric_classes
}

n_clusters_history = []
n_micro_clusters_history = []

for i, sample in enumerate(tqdm.tqdm(samples)):
    x, encoders = process_sample_ecci(sample, encoders)
    timestamp = dt.datetime.strptime(sample['timestamp'], "%Y-%m-%dT%H:%M:%S.%f%z")
    # Learn the point (unsupervised - no y)
    model.learn_one(x)
    # Predict cluster
    y_pred = model.predict_one(x)
    # Track cluster statistics
    n_clusters_history.append(model.n_clusters)
    n_micro_clusters_history.append(len(model.micro_clusters))
    # Update metrics (only if at least 2 clusters exist)
    if len(model.centers) >= 2:
        for metric in clustering_metrics.values():
            try:
                metric.update(x, y_pred, model.centers)
            except:
                pass
        for metric in rolling_metrics.values():
            try:
                metric.update(x, y_pred, model.centers)
            except:
                pass
        for metric in time_rolling_metrics.values():
            try:
                metric.update(x, y_pred, model.centers, t=timestamp)
            except:
                pass

metrics_to_log = {}
for name, metric in clustering_metrics.items():
    metrics_to_log[name] = metric.get()
for name, metric in rolling_metrics.items():
    metrics_to_log[name] = metric.get()
for name, metric in time_rolling_metrics.items():
    metrics_to_log[name] = metric.get()

# Add cluster statistics
metrics_to_log["n_clusters"] = model.n_clusters
metrics_to_log["n_micro_clusters"] = len(model.micro_clusters)

print("\n" + "=" * 70)
print("CLUSTERING METRICS RESULTS")
print("=" * 70)
for name, value in metrics_to_log.items():
    print(f"  {name}: {value}")

100%|██████████| 1000/1000 [00:05<00:00, 170.23it/s]


CLUSTERING METRICS RESULTS
  Silhouette: 0.0009734391724861395
  RollingSilhouette: 0.0009734391724861395
  TimeRollingSilhouette: 0.0009734391724861395
  n_clusters: 4
  n_micro_clusters: 4





In [19]:
# =============================================================================
# CLUSTER ANALYSIS DETAILS
# =============================================================================

print("=" * 70)
print("DBSTREAM MODEL DETAILS")
print("=" * 70)

print(f"\nFinal number of macro clusters: {model.n_clusters}")
print(f"Final number of micro clusters: {len(model.micro_clusters)}")

print("\nCluster Centers:")
for cluster_id, center in model.centers.items():
    print(f"  Cluster {cluster_id}: {len(center)} dimensions")

print("\nMicro-cluster Details:")
for mc_id, mc in list(model.micro_clusters.items())[:5]:  # Show first 5
    print(f"  MC {mc_id}: weight={mc.weight:.4f}, last_update={mc.last_update}")

print("\nCluster Count History (first 20 samples):")
print(f"  n_clusters: {n_clusters_history[:20]}")
print(f"  n_micro_clusters: {n_micro_clusters_history[:20]}")

DBSTREAM MODEL DETAILS

Final number of macro clusters: 4
Final number of micro clusters: 4

Cluster Centers:
  Cluster 0: 16 dimensions
  Cluster 1: 16 dimensions
  Cluster 2: 16 dimensions
  Cluster 3: 16 dimensions

Micro-cluster Details:
  MC 991: weight=1.0000, last_update=996
  MC 992: weight=1.0000, last_update=997
  MC 993: weight=1.0000, last_update=998
  MC 994: weight=1.0000, last_update=999

Cluster Count History (first 20 samples):
  n_clusters: [1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4]
  n_micro_clusters: [1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7, 4]


# Summary: ECCI Clustering Metrics Configuration

## Available Metrics for MLflow Logging:

### Primary Clustering Metric:
| Metric | Class | Parameters | Range | Optimal |
|--------|-------|------------|-------|--------|
| Silhouette | metrics.Silhouette | None | [-1, 1] | Higher |

### Rolling Metrics (Drift Detection):
| Metric | Wrapper | Inner Metric | Window |
|--------|---------|--------------|--------|
| RollingSilhouette | utils.Rolling | Silhouette() | 1000 samples |
| TimeRollingSilhouette | utils.TimeRolling | Silhouette() | 5 minutes |

### Model Statistics:
| Metric | Source | Description |
|--------|--------|-------------|
| n_clusters | model.n_clusters | Number of macro clusters |
| n_micro_clusters | len(model.micro_clusters) | Number of micro clusters |

## Best Model Selection Criterion:
```python
BEST_METRIC_CRITERIA = {
    "E-Commerce Customer Interactions": ("Silhouette", "maximize")
}
```

## Important Notes:
1. Silhouette requires at least 2 clusters with centers
2. DBSTREAM may take time to form multiple clusters
3. Early samples may not have Silhouette computed (insufficient clusters)
4. `bigger_is_better=False` in River is misleading - higher Silhouette IS better