From d5e797b909c8f40e7e245cf29aabc3b632101997 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Fri, 29 May 2026 13:04:50 +0100 Subject: [PATCH 01/20] DOC-6661 draft Python feature store --- content/develop/use-cases/_index.md | 1 + .../develop/use-cases/feature-store/_index.md | 158 ++++ .../feature-store/redis-py/_index.md | 644 ++++++++++++++ .../feature-store/redis-py/build_features.py | 89 ++ .../feature-store/redis-py/demo_server.py | 802 ++++++++++++++++++ .../feature-store/redis-py/feature_store.py | 337 ++++++++ .../redis-py/streaming_worker.py | 146 ++++ 7 files changed, 2177 insertions(+) create mode 100644 content/develop/use-cases/feature-store/_index.md create mode 100644 content/develop/use-cases/feature-store/redis-py/_index.md create mode 100644 content/develop/use-cases/feature-store/redis-py/build_features.py create mode 100644 content/develop/use-cases/feature-store/redis-py/demo_server.py create mode 100644 content/develop/use-cases/feature-store/redis-py/feature_store.py create mode 100644 content/develop/use-cases/feature-store/redis-py/streaming_worker.py diff --git a/content/develop/use-cases/_index.md b/content/develop/use-cases/_index.md index fabd8bfeca..40f7cb9043 100644 --- a/content/develop/use-cases/_index.md +++ b/content/develop/use-cases/_index.md @@ -27,3 +27,4 @@ This section provides practical examples and reference implementations for commo * [Pub/sub messaging]({{< relref "/develop/use-cases/pub-sub" >}}) - Broadcast real-time events to many consumers with channel and pattern subscriptions * [Streaming]({{< relref "/develop/use-cases/streaming" >}}) - Process ordered event streams with consumer groups, replay, and configurable retention * [Recommendation engine]({{< relref "/develop/use-cases/recommendation-engine" >}}) - Serve personalized recommendations under tight latency budgets by combining vector similarity with structured filters in a single Redis call +* [Feature store]({{< relref "/develop/use-cases/feature-store" 
>}}) - Serve pre-computed ML features on the request path with mixed batch-and-streaming freshness using per-field TTL diff --git a/content/develop/use-cases/feature-store/_index.md b/content/develop/use-cases/feature-store/_index.md new file mode 100644 index 0000000000..e793433eec --- /dev/null +++ b/content/develop/use-cases/feature-store/_index.md @@ -0,0 +1,158 @@ +--- +categories: +- docs +- develop +- stack +- oss +- rs +- rc +description: Serve pre-computed ML features on the request path under tight latency budgets, with batch and streaming features kept fresh in the same store. +hideListLinks: true +linkTitle: Feature store +title: Redis feature store +weight: 7 +--- + +## When to use Redis as a feature store + +Use Redis as the online layer of a feature store when production models — fraud +scoring, recommendations, dynamic pricing — need dozens of pre-computed features +per prediction on every request, with sub-millisecond reads, mixed batch-and-streaming +freshness, and high write throughput from concurrent ingestion pipelines. + +## Why the problem is hard + +An online feature store has to serve dozens of features per inference call inside +a request budget measured in milliseconds, while batch jobs and streaming +pipelines update those same features at very different cadences. Some of the +obvious workarounds have real drawbacks: + +- **Querying the offline warehouse directly** adds hundreds of milliseconds per + inference call, which makes real-time serving impossible. +- **A bespoke cache in front of the warehouse** solves latency but introduces + *training-serving skew*: the features served at inference drift from what the + model trained on, silently degrading accuracy whenever a transform changes + on one side and not the other. 
+- **Disk-backed online stores** hit a throughput wall when every user action + has to update a dozen features simultaneously across millions of entities — + the IO mix of small concurrent writes is exactly what they are slowest at. +- **Single-TTL stores** can't handle mixed staleness: batch features refreshed + nightly coexist with streaming features updated every few seconds, and a + single per-key expiry can't express both. Worse, a failed ingestion + pipeline must *expire* its features rather than serve stale values silently. + +A workable online feature store needs sub-millisecond reads at request rate, +high concurrent write throughput from mixed batch and streaming ingestion, +independent freshness controls per feature, and self-cleaning behavior when an +upstream pipeline fails — without standing up a dedicated piece of +infrastructure beside the rest of the model-serving stack. + +## What you can expect from a Redis solution + +You can: + +- Serve feature vectors to inference endpoints under 1 ms P99 + (99% of requests have a latency of 1 ms or less) at millions of + reads per second from a single shard, and scale horizontally beyond that + with Redis Cluster. +- Run batch and streaming ingestion concurrently against the same entities + without locking or version columns — Redis is single-threaded per shard, so + individual field writes are atomic by construction. +- Apply *different* freshness guarantees to individual features within the same + entity hash: seconds for real-time signals, hours for batch aggregates, with + per-field TTL via [`HEXPIRE`]({{< relref "/commands/hexpire" >}}). +- Let stale streaming features self-expire when their ingestion pipeline + fails, so models receive missing features rather than silently outdated ones. +- Retrieve features for hundreds of entities in a single round trip for batch + scoring, using pipelined [`HMGET`]({{< relref "/commands/hmget" >}}). 
+- Plug into [Redis Feature Form]({{< relref "/develop/ai/featureform" >}}) — + Redis's own materialize / serve layer — or + [Feast](https://docs.feast.dev/) with a connection-string change, so no + bespoke serving code is required. +- Co-locate the online feature store on the same Redis instance already + handling cache, sessions, or rate limiting in the stack — no additional + infrastructure. + +## How Redis supports the solution + +In practice, each entity (a user, an account, an item) is a single +[Hash]({{< relref "/develop/data-types/hashes" >}}) at a deterministic key like +`fs:user:{id}`. The hash holds every feature for that entity as one field per +feature — batch-materialized aggregates alongside streaming-updated signals — +so one [`HMGET`]({{< relref "/commands/hmget" >}}) call returns whatever subset +the model needs in one round trip. A key-level +[`EXPIRE`]({{< relref "/commands/expire" >}}) aligns with the batch +materialization cycle so a whole entity self-cleans when its pipeline stops +refreshing it, and per-field [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) +lets each streaming feature carry its own shorter expiry independent of the +rest of the hash. + +Redis provides the following features that make it a good fit for an online +feature store: + +- [Hashes]({{< relref "/develop/data-types/hashes" >}}) group every feature + for an entity under one key, so retrieval reads everything the model needs + in a single network round trip with [`HMGET`]({{< relref "/commands/hmget" >}}), + and small hashes use *listpack* encoding for compact in-memory representation. +- [`HSET`]({{< relref "/commands/hset" >}}) writes any subset of fields + atomically, so batch and streaming pipelines can update overlapping or + disjoint features on the same entity concurrently without locks or version + columns. 
+- [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) (Redis 7.4+) give per-field TTLs, + so streaming features (5-minute freshness) and batch features (24-hour + freshness) can live in the same hash with independent expiry — the + *mixed-staleness* problem becomes a one-line server-side guarantee. +- [`EXPIRE`]({{< relref "/commands/expire" >}}) at the key level lets an + entity disappear entirely if its batch refresher fails, so inference sees + a missing entity (which the model handler can detect and fall back on) + rather than silently outdated values. +- [Pipelining]({{< relref "/develop/using-commands/pipelining" >}}) bundles + [`HMGET`]({{< relref "/commands/hmget" >}}) calls for many entities into + one round trip, which is the right primitive for batch scoring where the + model needs features for hundreds of entities at once. +- Sub-millisecond reads and writes from memory keep the feature store off the + critical path of inference, so the model-server's request budget is spent + on the model rather than on feature retrieval. + +## Ecosystem + +The following libraries and platforms use Redis as their online feature store: + +- **[Redis Feature Form]({{< relref "/develop/ai/featureform" >}})** is + Redis's own feature-engineering platform. It defines features, labels, and + feature views in a Python definitions file, materializes them through a + [registered provider]({{< relref "/develop/ai/featureform/providers" >}}), + and [serves]({{< relref "/develop/ai/featureform/features-and-labels" >}}) + them from Redis as the low-latency online store. See the + [quickstart]({{< relref "/develop/ai/featureform/quickstart" >}}) for an + end-to-end walkthrough. 
+- **Python**: [Feast](https://docs.feast.dev/reference/online-stores/redis) + ships Redis as a first-class online store provider — point a Feast + `online_store` block at a Redis connection string and the + `RedisOnlineStore` backend handles materialization and serving. +- **Compute**: [Apache Spark](https://spark.apache.org/) batch jobs run the + nightly materialization, writing into Redis via the Redis Feature Form / + Feast materialize commands or directly with the + [`spark-redis`](https://github.com/RedisLabs/spark-redis) connector. +- **Streaming**: [Apache Flink](https://flink.apache.org/) or + [Kafka Streams](https://kafka.apache.org/documentation/streams/) compute the + real-time features and `HSET` them into Redis with per-field + [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) so each streaming signal + carries its own freshness window. +- **Infrastructure**: [Kubernetes](https://kubernetes.io/) co-locates Redis + pods alongside the model-serving containers, with horizontal-pod autoscaling + on the read replicas to track inference load; + [Active-Active geo-distribution]({{< relref "/operate/rs/databases/active-active" >}}) + on Redis Enterprise / Redis Cloud replicates the online store across + regions for low-latency reads close to each inference cluster. + +## Code examples to build your own Redis feature store + +The following guides show how to build a small Redis-backed online feature +store for a fraud-scoring model. Each guide includes a runnable interactive +demo that lets you bulk-load batch features, run a streaming worker that +updates real-time features with per-field TTL, retrieve any subset of features +for a single user under 1 ms, and pipeline batch reads across a hundred users. 
+ +* [redis-py (Python)]({{< relref "/develop/use-cases/feature-store/redis-py" >}}) diff --git a/content/develop/use-cases/feature-store/redis-py/_index.md b/content/develop/use-cases/feature-store/redis-py/_index.md new file mode 100644 index 0000000000..9ca06fb221 --- /dev/null +++ b/content/develop/use-cases/feature-store/redis-py/_index.md @@ -0,0 +1,644 @@ +--- +categories: +- docs +- develop +- stack +- oss +- rs +- rc +description: Build a Redis-backed online feature store in Python with redis-py +linkTitle: redis-py example (Python) +title: Redis feature store with redis-py +weight: 1 +--- + +This guide shows you how to build a small Redis-backed online feature store in +Python with [`redis-py`]({{< relref "/develop/clients/redis-py" >}}). It +includes a local web server built with the Python standard library so you can +bulk-load a batch of users with a key-level TTL, run a streaming worker that +overwrites real-time features with per-field TTL, retrieve any subset of +features for one user under 1 ms, and pipeline `HMGET` across a hundred users +for batch scoring. + +## Overview + +Each entity (here, a user) is one Redis [Hash]({{< relref "/develop/data-types/hashes" >}}) +at a deterministic key — `fs:user:{id}`. The hash holds every feature for that +entity as one field per feature: batch-materialized aggregates (refreshed once +a day) alongside streaming-updated signals (refreshed every few seconds). One +[`HMGET`]({{< relref "/commands/hmget" >}}) returns whichever subset the model +needs in one network round trip. + +Two TTL layers solve the *mixed staleness* problem without an application-side +cleaner: + +* A **key-level** [`EXPIRE`]({{< relref "/commands/expire" >}}) aligned with the + batch materialization cycle (24 hours in the demo). If the batch refresher + fails, the whole entity disappears at the next cycle and inference sees a + missing entity — which the model handler can detect and fall back on — + rather than silently outdated values. 
+* A **per-field** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) (Redis 7.4+) on + each streaming feature gives that field its own shorter expiry, independent + of the rest of the hash. If the streaming pipeline stops updating a feature, + the field self-cleans while the batch fields stay populated. + +In this example, the batch features describe a user's longer-term shape +(`country_iso`, `risk_segment`, `account_age_days`, `tx_count_7d`, +`avg_amount_30d`, `chargeback_count_180d`) and are bulk-loaded by +`build_features.py` — the demo's stand-in for a nightly Spark / Feast +materialization job. The streaming features describe what the user is doing +right now (`last_login_ts`, `last_device_id`, `tx_count_5m`, +`failed_logins_15m`, `session_country`) and are written by +`streaming_worker.py` — the demo's stand-in for a Flink / Kafka Streams job. +The inference panel of the demo server reads any subset of those features +through `feature_store.py`'s helper class. + +That gives you: + +* A single round trip for retrieval — any subset of features for one entity + in one [`HMGET`]({{< relref "/commands/hmget" >}}). +* Sub-millisecond hot path. The Redis-side work is microseconds; in practice + the bottleneck is the network round trip plus the model's own feature-prep. +* Pipelined batch scoring — one round trip for `N` users at once. +* Independent freshness per feature, expressed as a server-side TTL rather + than as application logic. +* Self-cleanup on pipeline failure: a stalled batch refresher lets entities + expire on schedule, and a stalled streaming worker lets each affected field + expire on its own timer. + +## How it works + +There are three paths: a **batch path** that bulk-loads features once per +materialization cycle, a **streaming path** that updates real-time features as +events arrive, and an **inference path** that reads features on the request +side. + +### Batch path (per materialization cycle) + +1. 
The batch job calls `synthesize_users(N)` (in production, the equivalent + computation lives in an offline pipeline against the warehouse). The result + is `{user_id: {field: value, ...}}` for every user in this cycle. +2. `store.bulk_load(rows, ttl_seconds=86400)` pipelines one + [`HSET`]({{< relref "/commands/hset" >}}) plus one + [`EXPIRE`]({{< relref "/commands/expire" >}}) per user into a single + round trip. The `HSET` writes every batch field; the `EXPIRE` is what makes + the entity disappear if the next batch run fails, so inference reads a + missing entity rather than silently outdated values. + +### Streaming path (per event) + +When a user does something (login, transaction, page view) the streaming layer +computes whatever real-time signals fall out of that event and calls +`store.update_streaming(user_id, fields, ttl_seconds=300)`. That pipelines: + +1. An [`HSET`]({{< relref "/commands/hset" >}}) writing the new field values. + Redis is single-threaded per shard, so this is atomic against any + concurrent batch write on the same hash — no version columns, no locks. +2. An [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) over exactly the fields + that were written, with the streaming TTL. Each streaming field carries + its own per-field expiry independent of the rest of the hash. Stop the + worker and these fields drop out one by one as their TTLs elapse, while + the batch fields remain populated under the longer key-level TTL. + +### Inference path (per request) + +1. The model server picks the feature subset it needs (the schema is owned by + the model, not the store). +2. It calls `store.get_features(user_id, names)`, which is one + [`HMGET`]({{< relref "/commands/hmget" >}}). Redis returns the values in + the same order as the requested fields, with `None` for any field that + doesn't exist (or has expired). +3. 
For batch inference, the model server calls + `store.batch_get_features(user_ids, names)`, which pipelines one + [`HMGET`]({{< relref "/commands/hmget" >}}) per user across all `N` users + in a single network round trip. + +## The feature-store helper + +The `RedisFeatureStore` class wraps the read/write paths +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/redis-py/feature_store.py)): + +```python +import redis +from feature_store import RedisFeatureStore + +r = redis.Redis(host="localhost", port=6379, decode_responses=True) +store = RedisFeatureStore( + redis_client=r, + key_prefix="fs:user:", + batch_ttl_seconds=24 * 60 * 60, # whole-entity TTL aligned with the daily batch cycle + streaming_ttl_seconds=5 * 60, # per-field TTL on each streaming feature +) + +# Batch materialization: one HSET + EXPIRE per user, all pipelined. +store.bulk_load({ + "u0001": {"country_iso": "US", "risk_segment": "low", + "tx_count_7d": 14, "avg_amount_30d": 92.40, + "account_age_days": 612, "chargeback_count_180d": 0}, + "u0002": {"country_iso": "GB", "risk_segment": "medium", + "tx_count_7d": 47, "avg_amount_30d": 220.10, + "account_age_days": 1840, "chargeback_count_180d": 1}, +}) + +# Streaming write: HSET + HEXPIRE on just the fields that changed. +store.update_streaming("u0001", { + "last_login_ts": 1716998413541, + "last_device_id": "ios-9f02", + "tx_count_5m": 3, + "failed_logins_15m": 0, + "session_country": "US", +}) + +# Inference read: HMGET of whatever the model needs. +features = store.get_features("u0001", [ + "risk_segment", "tx_count_7d", "avg_amount_30d", + "tx_count_5m", "failed_logins_15m", +]) + +# Batch scoring: pipelined HMGET across many users. +batch = store.batch_get_features( + user_ids=["u0001", "u0002", "u0003"], + field_names=["risk_segment", "tx_count_5m", "failed_logins_15m"], +) +``` + +### Data model + +Each user is one Redis Hash. 
Every value is stored as a string — Redis hash +fields are bytes-on-the-wire, so the helper encodes booleans as `"true"` / +`"false"` and numbers as their `str(...)` form. The model server is responsible +for parsing back to the right type, the same way it would when reading any +serialized feature store. + +```text +fs:user:u0001 TTL = 86400 s (key-level) + country_iso=US + risk_segment=low + account_age_days=612 + tx_count_7d=14 + avg_amount_30d=92.40 + chargeback_count_180d=0 + last_login_ts=1716998413541 TTL = 300 s (per field, HEXPIRE) + last_device_id=ios-9f02 TTL = 300 s (per field, HEXPIRE) + tx_count_5m=3 TTL = 300 s (per field, HEXPIRE) + failed_logins_15m=0 TTL = 300 s (per field, HEXPIRE) + session_country=US TTL = 300 s (per field, HEXPIRE) +``` + +The batch fields sit under the key-level `EXPIRE`. The streaming fields each +carry their own [`HEXPIRE`]({{< relref "/commands/hexpire" >}}). If the +streaming pipeline stops, the streaming fields drop one by one as their +per-field TTLs elapse; the batch fields stay until the daily key-level +`EXPIRE` fires (or the next batch cycle re-pins them). + +### Bulk-loading batch features + +`bulk_load` pipelines one `HSET` and one `EXPIRE` per user into a single round +trip. With 500 users that's 1000 commands in one network call — Redis processes +them sequentially on the server side but the client only pays one RTT. + +```python +def bulk_load( + self, + rows: Mapping[str, FeatureMap], + ttl_seconds: Optional[int] = None, +) -> int: + ttl = self.batch_ttl_seconds if ttl_seconds is None else ttl_seconds + pipe = self.redis.pipeline(transaction=False) + for entity_id, fields in rows.items(): + key = self.key_for(entity_id) + pipe.hset(key, mapping={k: _encode(v) for k, v in fields.items()}) + pipe.expire(key, ttl) + pipe.execute() + ... +``` + +`transaction=False` switches the pipeline from `MULTI/EXEC` to a plain command +batch: there's no all-or-nothing semantic, just a network optimization. 
That +is the right choice here — each user's `HSET` + `EXPIRE` is independent, and +wrapping the whole thing in a transaction would block the server for the +duration of the batch. + +In production, the equivalent of this script runs as an offline pipeline (a +Spark or Feast `materialize` job) that reads from the warehouse and writes +into Redis. The +[Feast `RedisOnlineStore`](https://docs.feast.dev/reference/online-stores/redis) +provider does exactly this under the hood. + +### Streaming writes with per-field TTL + +`update_streaming` is the linchpin of the mixed-staleness story: + +```python +def update_streaming( + self, + entity_id: str, + fields: FeatureMap, + ttl_seconds: Optional[int] = None, +) -> None: + if not fields: + return + ttl = self.streaming_ttl_seconds if ttl_seconds is None else ttl_seconds + key = self.key_for(entity_id) + encoded = {name: _encode(value) for name, value in fields.items()} + + pipe = self.redis.pipeline(transaction=False) + pipe.hset(key, mapping=encoded) + pipe.hexpire(key, ttl, *encoded.keys()) + pipe.execute() +``` + +[`HEXPIRE`]({{< relref "/commands/hexpire" >}}) sets the TTL on *individual* +hash fields, not on the whole key. The two commands here are sent in one round +trip and run in pipeline order on the server, so the field is written first and +then its TTL is applied. The order matters: `HSET` overwrites the field and +clears any TTL it carried, so `HEXPIRE` has to follow it to re-arm the expiry. + +If a streaming pipeline stops, the streaming fields drop out one by one as +their per-field TTLs elapse — there is no application-side cleaner involved. +[`HTTL`]({{< relref "/commands/httl" >}}) lets the model side inspect the +remaining TTL on any field, which is useful both for debugging ("why is this +feature missing?" → "it expired three seconds ago") and as a freshness signal +in the model itself. 
+ +> **HEXPIRE requires Redis 7.4 or later.** `HEXPIRE` and the field-level TTL +> commands (`HTTL`, `HPERSIST`, `HEXPIREAT`, `HPEXPIRE`, `HPEXPIREAT`, +> `HPTTL`, `HEXPIRETIME`, `HPEXPIRETIME`) were added in Redis 7.4. On older +> Redis builds you would have to put streaming features on their own keys +> (one key per feature, or one key per feature group) and set a key-level +> `EXPIRE` instead — at the cost of giving up the single-`HMGET` retrieval. + +### Inference reads with HMGET + +`get_features` is one `HMGET`: + +```python +def get_features( + self, + entity_id: str, + field_names: Optional[Iterable[str]] = None, +) -> dict[str, str]: + key = self.key_for(entity_id) + if field_names is None: + return self.redis.hgetall(key) + names = list(field_names) + if not names: + return {} + values = self.redis.hmget(key, names) + return {n: v for n, v in zip(names, values) if v is not None} +``` + +The model knows exactly which features it consumes, so the request path always +takes the `HMGET` branch with an explicit field list — that's the +sub-millisecond path. `HGETALL` is the right call for debugging (which is what +the demo's "Inspect" panel does) but not for serving: it forces Redis to +serialize every field, including ones the model doesn't need. + +Fields that don't exist (because they were never written, or because they +expired) come back as `None`. The helper drops them from the result dict so +the caller sees only the features that are actually available. A real model +server would either treat missing values as a feature ("this user has no +streaming signal yet") or fall back to a default from the model's training +data. 
+ +### Batch scoring with pipelined HMGET + +For batch inference, the same `HMGET` shape pipelines across users: + +```python +def batch_get_features( + self, + entity_ids: Iterable[str], + field_names: Iterable[str], +) -> dict[str, dict[str, str]]: + ids = list(entity_ids) + names = list(field_names) + if not ids or not names: + return {} + + pipe = self.redis.pipeline(transaction=False) + for entity_id in ids: + pipe.hmget(self.key_for(entity_id), names) + rows = pipe.execute() + + out: dict[str, dict[str, str]] = {} + for entity_id, values in zip(ids, rows): + out[entity_id] = {n: v for n, v in zip(names, values) if v is not None} + return out +``` + +One round trip for the whole batch — the demo regularly returns 100 users in +2-3 ms against a local Redis. On a real network the round trip dominates; +pipelining is what keeps batch scoring practical. + +For very large batches on a clustered deployment, the same shape generalises +to one pipeline per shard: bucket the entity IDs by their hash slot +(`cluster.keyslot(key)`), then issue one pipeline against each shard in +parallel. `redis-py`'s +[`RedisCluster` pipeline](https://redis-py.readthedocs.io/en/stable/clustering.html#redis-cluster-pipeline) +handles that automatically — the per-user `HMGET` calls are dispatched to the +right shard transparently. + +## The streaming worker + +`streaming_worker.py` is the demo's stand-in for whatever Flink, Kafka Streams, +or bespoke service computes the real-time features +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/redis-py/streaming_worker.py)). +It runs as a daemon thread next to the demo server so the UI can start, pause, +and resume it; in production this code would live in the streaming layer. + +Every tick the worker picks a few random users, generates a new value for each +streaming feature, and calls `store.update_streaming(user_id, fields)`. 
The +demo defaults to 5 users per tick at 1-second intervals — enough that within a +minute most users in a 200-user store have been touched at least once. + +```python +def _tick(self) -> None: + ids = self.store.list_entity_ids(limit=500) + if not ids: + return + chosen = self._rng.sample(ids, k=min(self.users_per_tick, len(ids))) + now_ms = int(time.time() * 1000) + for entity_id in chosen: + fields = { + "last_login_ts": now_ms, + "last_device_id": self._rng.choice(DEVICE_IDS), + "tx_count_5m": self._rng.randint(0, 12), + "failed_logins_15m": self._rng.choices( + (0, 1, 2, 5), weights=(70, 20, 8, 2), k=1, + )[0], + "session_country": self._rng.choice(SESSION_COUNTRIES), + } + self.store.update_streaming(entity_id, fields) +``` + +Pausing the worker is what shows off the mixed-staleness behavior: leave it +paused for longer than `streaming_ttl_seconds` and the streaming fields +disappear from every user's hash one by one, while the batch fields remain +under the longer key-level `EXPIRE`. The demo's `Pause / resume` button lets +you see this happen in real time. + +## The batch builder + +`build_features.py` is the demo's nightly materializer +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/redis-py/build_features.py)). +It generates synthetic feature rows and calls `store.bulk_load` once. The +synthesis itself is not the point — in a real deployment the equivalent code +reads from the offline store (Snowflake, BigQuery, Iceberg) and writes the +resulting hashes into Redis. 
+ +```python +def synthesize_users(count: int, seed: int = 42) -> dict[str, dict]: + rng = random.Random(seed) + users: dict[str, dict] = {} + for i in range(1, count + 1): + uid = f"u{i:04d}" + users[uid] = { + "country_iso": rng.choice(COUNTRY_CHOICES), + "risk_segment": rng.choices( + RISK_SEGMENTS, weights=(70, 25, 5), k=1, + )[0], + "account_age_days": rng.randint(7, 2400), + "tx_count_7d": rng.randint(0, 80), + "avg_amount_30d": round(rng.uniform(5, 350), 2), + "chargeback_count_180d": rng.choices( + (0, 1, 2, 3), weights=(85, 10, 4, 1), k=1, + )[0], + } + return users +``` + +You can run the builder on its own (independently of the demo server) to +populate Redis from the command line: + +```bash +python3 build_features.py --count 500 --ttl-seconds 3600 +``` + +That writes 500 users at `fs:user:*` with a one-hour key-level TTL, which is +how a typical operator would pre-seed a feature store from the command line +when debugging. + +## The interactive demo + +`demo_server.py` runs a `ThreadingHTTPServer` on port 8085. The HTML page lets +you: + +* **Bulk-load** any number of users (default 200) with a configurable + key-level TTL. Drop the TTL to 30 s and watch the entire store expire on + schedule — the same thing that happens if a daily refresher fails. +* See the **store state** at a glance: user count, batch / streaming TTLs, + cumulative read/write counters. +* See the **streaming worker** status (running / paused, ticks completed, + writes performed) and **pause or resume** it. Leave it paused for longer + than the streaming TTL to watch streaming fields drop out. +* Run an **inference read** for any user with a chosen feature subset, and + see the value, the per-field TTL, and the read latency. +* Run **batch scoring** with a pipelined `HMGET` across `N` users and see + the total elapsed time plus the per-user breakdown. 
+* **Inspect** any user's full hash with field-level TTLs and the key-level + TTL — the right view for debugging "why is this feature missing?" at + read time. + +The server holds one `RedisFeatureStore` instance and one `StreamingWorker` +for the lifetime of the process. Endpoints: + +| Endpoint | What it does | +|---------------------------|-------------------------------------------------------------------------------------| +| `GET /state` | User count, TTL config, stats counters, worker status. | +| `POST /bulk-load` | Pipelined `HSET` + `EXPIRE` over N synthetic users with a chosen TTL. | +| `POST /worker/toggle` | Pause / resume the streaming worker. | +| `POST /read` | `HMGET` a chosen feature subset for one user; report latency and per-field TTLs. | +| `POST /batch-read` | Pipeline `HMGET` across N users; report total latency and per-entity field counts. | +| `GET /inspect` | `HGETALL` + `HTTL` for one user; full hash view with per-field TTLs. | +| `POST /reset` | Drop every user under the key prefix (used by the demo's reset button). | + +## Prerequisites + +* **Redis 7.4 or later.** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) were added in Redis 7.4; the demo + relies on per-field TTL for the mixed-staleness story. +* **Python 3.9 or later.** +* The `redis-py` client. Install it with: + + ```bash + pip install "redis>=5.1" + ``` + + Field-level TTL commands (`hexpire`, `httl`) were added to `redis-py` in 5.1. + +If your Redis server is running elsewhere, start the demo with `--redis-host` +and `--redis-port`. + +## Running the demo + +### Get the source files + +The demo consists of four Python files. 
Download them from the +[`redis-py` source folder](https://github.com/redis/docs/tree/main/content/develop/use-cases/feature-store/redis-py) +on GitHub, or grab them with `curl`: + +```bash +mkdir feature-store-demo && cd feature-store-demo +BASE=https://raw.githubusercontent.com/redis/docs/main/content/develop/use-cases/feature-store/redis-py +curl -O $BASE/feature_store.py +curl -O $BASE/build_features.py +curl -O $BASE/streaming_worker.py +curl -O $BASE/demo_server.py +``` + +### Start the demo server + +From that directory: + +```bash +python3 demo_server.py +``` + +You should see: + +```text +Dropping any existing users under 'fs:user:*' for a clean demo run (pass --no-reset to keep them). +Redis feature-store demo server listening on http://127.0.0.1:8085 +Using Redis at localhost:6379 with key prefix 'fs:user:' (batch TTL 86400s, streaming TTL 300s) +Materialized 200 user(s); streaming worker running. +``` + +By default the demo wipes the configured key prefix on startup so each run +starts from a clean state. Pass `--no-reset` to keep any existing data, or +`--key-prefix <prefix>` to point the demo at a different prefix entirely. + +Open [http://127.0.0.1:8085](http://127.0.0.1:8085) in a browser. Useful things +to try: + +* Pick a user and click **Read features** with a mixed batch/streaming subset + — you'll see batch fields with no per-field TTL (covered by the key-level + TTL) and streaming fields with a positive per-field TTL. +* Click **Pipeline HMGET** with `count=100` to see the latency of a 100-user + batch read. +* Click **Pause / resume** on the streaming worker and leave it paused for + ~5 minutes (or restart the server with `--streaming-ttl-seconds 30` to + make it visible in seconds). Re-run **Read features** on any user and + watch the streaming fields disappear while the batch fields stay. +* Click **Inspect** on a user to see the full hash with field-level TTLs. 
+* Click **Bulk-load** with a short TTL (say 30 seconds) and watch the user + count fall to zero within the next minute — the same thing that happens if a + daily batch run fails to land. +* Click **Reset** to drop every user and start over. + +The server is read/write against your local Redis. The default key prefix is +`fs:user:`. Pass `--no-reset` to keep existing data across restarts, or +`--redis-host` / `--redis-port` to point at a different Redis. + +## Production usage + +### Pick the batch TTL to outlast a failed refresher + +The whole-entity `EXPIRE` is your safety net against silent staleness from a +broken batch pipeline. Set it longer than your worst-case batch outage so a +single missed run doesn't take the feature store offline, but short enough +that a sustained outage causes loud failures (missing entities) rather than +quiet ones (yesterday's features being scored as today's). The standard +choice is twice the expected refresh interval — for a daily batch, +48 hours; for a 6-hour batch, 12 hours. + +The same logic applies to the per-field streaming TTL: a few times the +expected update interval so a slow-but-alive streaming worker doesn't churn +features needlessly, but short enough that a stalled worker causes visible +freshness failures. + +### Co-locate the online store with serving, not with training + +The online store's hash representation does *not* have to match the schema in +your offline store. The batch materialization step is your chance to flatten +joins, encode categoricals, and project to whatever shape the model server +wants — so the request path is exactly one `HMGET` and zero transforms. + +The training pipeline reads from the offline store with its own schema; the +serving pipeline reads from Redis with the flattened serving schema. Keeping +the transforms that feed both pipelines in one shared code path is what prevents +training-serving skew. 
+ +### Pipeline batch reads across shards + +On a single Redis instance, pipelining `HMGET` across `N` users is one round +trip. On a Redis Cluster, the keys land on different shards — `redis-py`'s +[`RedisCluster` client](https://redis-py.readthedocs.io/en/stable/clustering.html) +dispatches each `HMGET` to the right shard transparently, but you still pay +one round trip per shard rather than one for the whole batch. For very +latency-sensitive batch inference, group users by shard slot +(`cluster.keyslot(key)`) and issue one pipeline per shard in parallel. + +For a small number of frequently-queried users (a top-N customer list, for +example), a hash tag like `fs:user:{vip}:u0001` forces the keys onto the same +shard and lets one pipeline serve them all in one round trip. + +### Make HEXPIRE part of every streaming write + +The single biggest correctness lever in this design is that the streaming +write applies `HEXPIRE` *every time*. If a streaming worker writes a field +without renewing its TTL, the field carries whatever expiry was there before +— possibly none, possibly stale — and the mixed-staleness invariant breaks. +Keep the `HSET` and `HEXPIRE` in the same pipeline (or, even safer, in the +same Lua script if you don't trust the call site). + +### Avoid HGETALL on the request path + +`HGETALL` reads every field on the hash, including ones the model doesn't +need. With dozens of features per entity, that is wasted serialization work +on the server and wasted bandwidth on the wire. Always specify the field list +explicitly with `HMGET` in the model server. + +The exception is debugging and feature-set discovery, where you genuinely +want the full hash. The demo's "Inspect" button uses `HGETALL` for exactly +this reason. 
+ +### Inspect the store directly with redis-cli + +When testing or troubleshooting, the cli tells you everything: + +```bash +# How many users currently in the store +redis-cli --scan --pattern 'fs:user:*' | wc -l + +# One user's full hash and key-level TTL +redis-cli HGETALL fs:user:u0001 +redis-cli TTL fs:user:u0001 + +# Per-field TTL on the streaming fields +redis-cli HTTL fs:user:u0001 FIELDS 5 \ + last_login_ts last_device_id tx_count_5m failed_logins_15m session_country + +# Sample HMGET as the model would issue it +redis-cli HMGET fs:user:u0001 risk_segment tx_count_7d avg_amount_30d tx_count_5m +``` + +A streaming field that returns `-2` from `HTTL` doesn't exist on the hash +(either it was never written, or it expired); `-1` means the field has no +TTL set (and is therefore covered only by the key-level `EXPIRE`); any +positive value is the remaining TTL in seconds. + +## Learn more + +This example uses the following Redis commands: + +* [`HSET`]({{< relref "/commands/hset" >}}) to write a feature or a whole + feature row in one call. +* [`HMGET`]({{< relref "/commands/hmget" >}}) to retrieve any subset of + features for one entity in one round trip. +* [`HGETALL`]({{< relref "/commands/hgetall" >}}) for debugging and + feature-set discovery. +* [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) for per-field TTL on streaming + features (Redis 7.4+). +* [`EXPIRE`]({{< relref "/commands/expire" >}}) and + [`TTL`]({{< relref "/commands/ttl" >}}) for the whole-entity TTL aligned + with the batch materialization cycle. +* Pipelined `HMGET` across many entities for batch scoring with one network + round trip — see [Pipelining]({{< relref "/develop/using-commands/pipelining" >}}). 
+ +See the [`redis-py` documentation]({{< relref "/develop/clients/redis-py" >}}) +for the full client reference, and the +[Hashes overview]({{< relref "/develop/data-types/hashes" >}}) for the deeper +conceptual model — including the listpack encoding that makes small hashes +particularly compact in memory, which matters at feature-store scale. diff --git a/content/develop/use-cases/feature-store/redis-py/build_features.py b/content/develop/use-cases/feature-store/redis-py/build_features.py new file mode 100644 index 0000000000..a71b6a770c --- /dev/null +++ b/content/develop/use-cases/feature-store/redis-py/build_features.py @@ -0,0 +1,89 @@ +""" +Synthesize a small batch of users with realistic-looking features and +bulk-load them into Redis with a 24-hour key-level TTL. + +Stands in for the nightly Spark / Feast materialization job in a real +deployment. In production the equivalent of this script lives in an +offline pipeline that reads from the offline store and writes the +serving-time hashes into Redis via ``HSET`` + ``EXPIRE``. +""" + +from __future__ import annotations + +import argparse +import random +from typing import Iterable + +from feature_store import RedisFeatureStore + + +COUNTRY_CHOICES = ("US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL") +RISK_SEGMENTS = ("low", "medium", "high") + + +def synthesize_users(count: int, seed: int = 42) -> dict[str, dict]: + """Generate ``count`` synthetic user feature rows. + + The shape mirrors a small fraud-scoring feature set: country and + risk segment as TAG-like categorical features, plus a few numeric + aggregates over recent windows. 
+ """ + rng = random.Random(seed) + users: dict[str, dict] = {} + for i in range(1, count + 1): + uid = f"u{i:04d}" + users[uid] = { + "country_iso": rng.choice(COUNTRY_CHOICES), + "risk_segment": rng.choices( + RISK_SEGMENTS, weights=(70, 25, 5), k=1, + )[0], + "account_age_days": rng.randint(7, 2400), + "tx_count_7d": rng.randint(0, 80), + "avg_amount_30d": round(rng.uniform(5, 350), 2), + "chargeback_count_180d": rng.choices( + (0, 1, 2, 3), weights=(85, 10, 4, 1), k=1, + )[0], + } + return users + + +def main(argv: Iterable[str] | None = None) -> int: + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--redis-host", default="localhost") + parser.add_argument("--redis-port", type=int, default=6379) + parser.add_argument( + "--count", type=int, default=200, + help="Number of synthetic users to materialize.", + ) + parser.add_argument( + "--ttl-seconds", type=int, default=24 * 60 * 60, + help="Key-level TTL for each user hash (default 24h).", + ) + parser.add_argument( + "--key-prefix", default="fs:user:", + help="Hash key prefix for each user.", + ) + parser.add_argument("--seed", type=int, default=42) + args = parser.parse_args(list(argv) if argv is not None else None) + + import redis + client = redis.Redis( + host=args.redis_host, port=args.redis_port, decode_responses=True, + ) + store = RedisFeatureStore( + redis_client=client, + key_prefix=args.key_prefix, + batch_ttl_seconds=args.ttl_seconds, + ) + + rows = synthesize_users(args.count, seed=args.seed) + store.bulk_load(rows) + print( + f"Materialized {len(rows)} users at {args.key_prefix}* " + f"with a {args.ttl_seconds}s key-level TTL." 
+ ) + return 0 + + +if __name__ == "__main__": + raise SystemExit(main()) diff --git a/content/develop/use-cases/feature-store/redis-py/demo_server.py b/content/develop/use-cases/feature-store/redis-py/demo_server.py new file mode 100644 index 0000000000..5a5b1ddd2c --- /dev/null +++ b/content/develop/use-cases/feature-store/redis-py/demo_server.py @@ -0,0 +1,802 @@ +#!/usr/bin/env python3 +""" +Redis feature-store demo server. + +Run this file and visit http://localhost:8085 to watch an online feature +store at work: a batch materialization loads N users with a 24-hour +key-level TTL, a streaming worker overwrites a handful of users' real-time +features every second with a per-field ``HEXPIRE``, and the inference +panel reads any subset of features for any user with ``HMGET`` in a +single round trip. + +Use the UI to: + +* Bulk-load (re-materialize) the batch features, optionally with a short + TTL so you can watch a whole entity expire on schedule. +* Pause the streaming worker and watch the streaming fields drop out + via ``HEXPIRE`` while the batch fields remain populated under the + longer key-level TTL — the *mixed staleness* story made visible. +* Pull features for one user (``HMGET``) and see the value, per-field + TTL, and read latency. +* Batch-score N users in one pipelined round trip and see the + per-entity / per-round-trip latency split. +* Inspect a single user's hash in detail with field-level TTLs. 
+""" + +from __future__ import annotations + +import argparse +import json +import sys +import time +from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from pathlib import Path +from urllib.parse import parse_qs, urlparse + +sys.path.insert(0, str(Path(__file__).resolve().parent)) + +try: + import redis + + from build_features import synthesize_users + from feature_store import ( + DEFAULT_BATCH_FIELDS, + DEFAULT_STREAMING_FIELDS, + RedisFeatureStore, + ) + from streaming_worker import StreamingWorker +except ImportError as exc: + print(f"Error: {exc}") + print("Make sure the 'redis' package is installed: pip install redis") + sys.exit(1) + + +HTML_TEMPLATE = """ + + + + + Redis Feature Store Demo + + + +
+
redis-py + Python standard library HTTP server
+

Redis Feature Store Demo

+

+ A small fraud-scoring feature store. Each user is one Redis hash + at __KEY_PREFIX__{id} with a batch-materialized + batch half (daily aggregates, + 24-hour key-level EXPIRE) and a streaming + streaming half (real-time + signals, __STREAM_TTL__s per-field HEXPIRE). + Inference reads any subset with one HMGET; batch + scoring pipelines HMGET across N users. +

+ +
+
+

Store state

+
Loading...
+
+ +
+

Materialize batch features

+

Calls HSET + EXPIRE for each user. + Pipelined so a 500-user load is one round trip per batch.

+ + + + +

+ Drop the TTL to e.g. 30 s and watch entities disappear on + schedule — the same thing that happens if a daily refresher + fails. +

+ + +
+ +
+

Streaming worker

+

Picks __USERS_PER_TICK__ users per tick, writes the + streaming features, applies HEXPIRE + __STREAM_TTL__s per field. Pause it and the + streaming fields drop out via per-field TTL while the batch + fields stay populated.

+
+ +
+ +
+

Inference read (HMGET)

+

Pick a user and a feature subset. One HMGET + round trip returns whatever the model needs.

+
+
+ + +
+
+ + +
+
+

Feature subset

+

+ Tick to include in the HMGET. Per-field TTL is + shown next to each field in the result table. +

+
+
+

Pick a user and click Read features.

+
+
+ +
+

Batch scoring

+

Pipelined HMGET across N random users. One + network round trip for the whole batch.

+ + + +
+

(no batch read yet)

+
+
+ +
+

Inspect one user

+

HGETALL plus per-field HTTL and + key-level TTL. Useful for spotting which + streaming fields have already expired.

+ + + +
+

(pick a user and click Inspect)

+
+
+
+ +
+
+ + + + +""" + + +class FeatureStoreDemo: + """Demo orchestrator: feature store + streaming worker + housekeeping.""" + + def __init__( + self, + store: RedisFeatureStore, + worker: StreamingWorker, + default_user_count: int, + seed: int, + ) -> None: + self.store = store + self.worker = worker + self.default_user_count = default_user_count + self.seed = seed + + def materialize(self, count: int, ttl_seconds: int) -> dict: + rows = synthesize_users(count, seed=self.seed) + start = time.perf_counter() + loaded = self.store.bulk_load(rows, ttl_seconds=ttl_seconds) + elapsed_ms = (time.perf_counter() - start) * 1000.0 + return {"loaded": loaded, "ttl_seconds": ttl_seconds, "elapsed_ms": elapsed_ms} + + def reset(self) -> dict: + deleted = self.store.reset() + self.store.reset_stats() + self.worker.reset_stats() + return {"deleted": deleted} + + def toggle_worker(self) -> dict: + if not self.worker.is_running: + self.worker.start() + if self.worker.is_paused: + self.worker.resume() + else: + self.worker.pause() + return {"paused": self.worker.is_paused, "running": self.worker.is_running} + + +class FeatureStoreDemoHandler(BaseHTTPRequestHandler): + """HTTP handler. 
State is hung off class attributes.""" + + store: RedisFeatureStore | None = None + worker: StreamingWorker | None = None + demo: FeatureStoreDemo | None = None + + def do_GET(self) -> None: + parsed = urlparse(self.path) + if parsed.path in {"/", "/index.html"}: + self._send_html(self._html_page()) + return + if parsed.path == "/state": + self._send_json(self._build_state(), 200) + return + if parsed.path == "/inspect": + self._handle_inspect(parse_qs(parsed.query)) + return + self.send_error(404) + + def do_POST(self) -> None: + parsed = urlparse(self.path) + if parsed.path == "/bulk-load": + self._handle_bulk_load() + return + if parsed.path == "/reset": + self._send_json(self.demo.reset(), 200) + return + if parsed.path == "/worker/toggle": + self._send_json(self.demo.toggle_worker(), 200) + return + if parsed.path == "/read": + self._handle_read() + return + if parsed.path == "/batch-read": + self._handle_batch_read() + return + self.send_error(404) + + # ---- POST handlers -------------------------------------------------- + + def _handle_bulk_load(self) -> None: + params = self._read_form_data() + count = max(1, min(2000, int(params.get("count", ["200"])[0] or "200"))) + ttl = max(5, min(172_800, int(params.get("ttl", ["86400"])[0] or "86400"))) + self._send_json(self.demo.materialize(count, ttl), 200) + + def _handle_read(self) -> None: + params = self._read_form_data() + user = (params.get("user", [""])[0] or "").strip() + if not user: + self._send_json({"error": "user is required"}, 400) + return + fields = [f for f in params.get("field", []) if f] + start = time.perf_counter() + values = self.store.get_features(user, fields) if fields else {} + elapsed_ms = (time.perf_counter() - start) * 1000.0 + ttls = self.store.field_ttls_seconds(user, fields) if fields else {} + key_ttl = self.store.key_ttl_seconds(user) + self._send_json( + { + "requested": fields, + "values": values, + "ttls": ttls, + "key_ttl_seconds": key_ttl, + "returned_count": len(values), + 
"elapsed_ms": elapsed_ms, + }, + 200, + ) + + def _handle_batch_read(self) -> None: + params = self._read_form_data() + count = max(1, min(500, int(params.get("count", ["100"])[0] or "100"))) + fields = [f for f in params.get("field", []) if f] + if not fields: + fields = list(DEFAULT_STREAMING_FIELDS) + ["risk_segment"] + ids = self.store.list_entity_ids(limit=2000) + if len(ids) > count: + ids = ids[:count] + start = time.perf_counter() + rows = self.store.batch_get_features(ids, fields) + elapsed_ms = (time.perf_counter() - start) * 1000.0 + sample = [ + {"id": uid, "field_count": len(rows.get(uid, {}))} + for uid in ids[:10] + ] + self._send_json( + { + "entity_count": len(ids), + "field_count": len(fields), + "elapsed_ms": elapsed_ms, + "sample": sample, + }, + 200, + ) + + def _handle_inspect(self, query: dict[str, list[str]]) -> None: + user = (query.get("user", [""])[0] or "").strip() + if not user: + self._send_json({"error": "user is required"}, 400) + return + full = self.store.get_features(user, field_names=None) + if not full: + key_ttl = self.store.key_ttl_seconds(user) + self._send_json( + {"exists": False, "key_ttl_seconds": key_ttl}, + 200, + ) + return + ttls = self.store.field_ttls_seconds(user, full.keys()) + key_ttl = self.store.key_ttl_seconds(user) + fields = sorted( + [ + {"name": n, "value": v, "ttl_seconds": ttls.get(n, -1)} + for n, v in full.items() + ], + key=lambda r: r["name"], + ) + self._send_json( + { + "exists": True, + "key_ttl_seconds": key_ttl, + "fields": fields, + }, + 200, + ) + + # ---- State assembly ------------------------------------------------- + + def _build_state(self) -> dict: + ids = self.store.list_entity_ids(limit=500) + return { + "key_prefix": self.store.key_prefix, + "batch_ttl_seconds": self.store.batch_ttl_seconds, + "streaming_ttl_seconds": self.store.streaming_ttl_seconds, + "entity_count": len(ids), + "entity_ids": ids, + "stats": self.store.stats(), + "worker": self.worker.stats(), + } + + # ---- HTTP 
plumbing -------------------------------------------------- + + def _read_form_data(self) -> dict[str, list[str]]: + content_length = int(self.headers.get("Content-Length", "0")) + raw_body = self.rfile.read(content_length).decode("utf-8") + return parse_qs(raw_body) + + def _send_html(self, html: str, status: int = 200) -> None: + self.send_response(status) + self.send_header("Content-Type", "text/html; charset=utf-8") + self.end_headers() + self.wfile.write(html.encode("utf-8")) + + def _send_json(self, payload: dict, status: int) -> None: + self.send_response(status) + self.send_header("Content-Type", "application/json") + self.end_headers() + self.wfile.write(json.dumps(payload).encode("utf-8")) + + def log_message(self, format: str, *args) -> None: # noqa: A002 + sys.stderr.write(f"[demo] {format % args}\n") + + def _html_page(self) -> str: + return ( + HTML_TEMPLATE + .replace("__KEY_PREFIX__", self.store.key_prefix) + .replace("__STREAM_TTL__", str(self.store.streaming_ttl_seconds)) + .replace("__USERS_PER_TICK__", str(self.worker.users_per_tick)) + .replace("__BATCH_FIELDS_JSON__", + json.dumps(list(DEFAULT_BATCH_FIELDS))) + .replace("__STREAM_FIELDS_JSON__", + json.dumps(list(DEFAULT_STREAMING_FIELDS))) + ) + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser(description="Run the Redis feature-store demo server.") + parser.add_argument("--host", default="127.0.0.1", help="HTTP bind host") + parser.add_argument("--port", type=int, default=8085, help="HTTP bind port") + parser.add_argument("--redis-host", default="localhost", help="Redis host") + parser.add_argument("--redis-port", type=int, default=6379, help="Redis port") + parser.add_argument( + "--key-prefix", default="fs:user:", + help="Hash key prefix for each user.", + ) + parser.add_argument( + "--batch-ttl-seconds", type=int, default=24 * 60 * 60, + help="Default key-level TTL applied by bulk-load (default 24h).", + ) + parser.add_argument( + "--streaming-ttl-seconds", 
type=int, default=5 * 60, + help="Per-field TTL applied to streaming features (default 5m).", + ) + parser.add_argument( + "--users-per-tick", type=int, default=5, + help="How many users the streaming worker touches per tick.", + ) + parser.add_argument( + "--seed-users", type=int, default=200, + help="Number of users to materialize on startup.", + ) + parser.add_argument( + "--no-reset", + dest="reset_on_start", + action="store_false", + help=( + "Keep any existing data under --key-prefix instead of dropping" + " it on startup. By default the demo wipes the prefix so each" + " run starts from a clean state." + ), + ) + return parser.parse_args() + + +def main() -> None: + args = parse_args() + + redis_client = redis.Redis( + host=args.redis_host, + port=args.redis_port, + decode_responses=True, + ) + store = RedisFeatureStore( + redis_client=redis_client, + key_prefix=args.key_prefix, + batch_ttl_seconds=args.batch_ttl_seconds, + streaming_ttl_seconds=args.streaming_ttl_seconds, + ) + worker = StreamingWorker( + store=store, users_per_tick=args.users_per_tick, + ) + demo = FeatureStoreDemo( + store=store, worker=worker, + default_user_count=args.seed_users, seed=42, + ) + + if args.reset_on_start: + print( + f"Dropping any existing users under '{args.key_prefix}*'" + " for a clean demo run (pass --no-reset to keep them)." 
+ ) + store.reset() + store.reset_stats() + seeded = demo.materialize(args.seed_users, args.batch_ttl_seconds)["loaded"] + + worker.start() + + FeatureStoreDemoHandler.store = store + FeatureStoreDemoHandler.worker = worker + FeatureStoreDemoHandler.demo = demo + + print(f"Redis feature-store demo server listening on http://{args.host}:{args.port}") + print( + f"Using Redis at {args.redis_host}:{args.redis_port}" + f" with key prefix '{args.key_prefix}'" + f" (batch TTL {args.batch_ttl_seconds}s," + f" streaming TTL {args.streaming_ttl_seconds}s)" + ) + print(f"Materialized {seeded} user(s); streaming worker running.") + + server = ThreadingHTTPServer((args.host, args.port), FeatureStoreDemoHandler) + try: + server.serve_forever() + except KeyboardInterrupt: + pass + finally: + worker.stop() + + +if __name__ == "__main__": + main() diff --git a/content/develop/use-cases/feature-store/redis-py/feature_store.py b/content/develop/use-cases/feature-store/redis-py/feature_store.py new file mode 100644 index 0000000000..57f377e994 --- /dev/null +++ b/content/develop/use-cases/feature-store/redis-py/feature_store.py @@ -0,0 +1,337 @@ +""" +Redis online feature store backed by per-entity Hashes. + +Each entity (here, a user) lives at a deterministic key such as +``fs:user:{id}``. The hash holds every feature for that entity as one +field per feature — batch-materialized aggregates (refreshed on a daily +cycle) alongside streaming-updated signals (refreshed every few +seconds). One ``HMGET`` returns whichever subset the model needs in +one network round trip. + +Two TTL layers solve the *mixed staleness* problem: + +* A key-level ``EXPIRE`` aligned with the batch materialization cycle + causes the whole entity to disappear if its batch refresher fails, + so inference sees a missing entity (which the model handler can + detect and fall back on) rather than silently outdated values. 
+* A per-field ``HEXPIRE`` on each streaming field gives that field its + own shorter expiry, independent of the rest of the hash. When the + streaming pipeline stops updating a field, the field self-cleans + while the rest of the entity stays populated. + +``HEXPIRE`` and ``HTTL`` require Redis 7.4 or later. ``redis-py`` +exposes them as ``hexpire`` / ``httl`` from version 5.1. + +Concurrency is by construction: Redis is single-threaded per shard, so +overlapping ``HSET`` calls from a batch job and a streaming worker on +the same entity hash are applied atomically without locks or version +columns. +""" + +from __future__ import annotations + +from threading import Lock +from typing import Iterable, Mapping, Optional, Union + +import redis + + +FeatureValue = Union[str, int, float, bool] +FeatureMap = Mapping[str, FeatureValue] + + +# Default batch feature schema. Daily aggregates computed offline and +# bulk-loaded once per materialization cycle. +DEFAULT_BATCH_FIELDS: tuple[str, ...] = ( + "country_iso", + "risk_segment", + "account_age_days", + "tx_count_7d", + "avg_amount_30d", + "chargeback_count_180d", +) + +# Default streaming feature schema. Updated by the streaming worker as +# new events arrive, with a per-field TTL so each field self-expires +# when its upstream pipeline stops. +DEFAULT_STREAMING_FIELDS: tuple[str, ...] 
= ( + "last_login_ts", + "last_device_id", + "tx_count_5m", + "failed_logins_15m", + "session_country", +) + + +class RedisFeatureStore: + """Online feature store helper for one entity type (default: user).""" + + def __init__( + self, + redis_client: Optional[redis.Redis] = None, + key_prefix: str = "fs:user:", + batch_ttl_seconds: int = 24 * 60 * 60, + streaming_ttl_seconds: int = 5 * 60, + ) -> None: + self.redis = redis_client or redis.Redis( + host="localhost", + port=6379, + decode_responses=True, + ) + self.key_prefix = key_prefix + self.batch_ttl_seconds = batch_ttl_seconds + self.streaming_ttl_seconds = streaming_ttl_seconds + + self._stats_lock = Lock() + self._batch_writes_total = 0 + self._streaming_writes_total = 0 + self._reads_total = 0 + self._read_fields_total = 0 + + # ------------------------------------------------------------------ + # Key helpers + # ------------------------------------------------------------------ + + def key_for(self, entity_id: str) -> str: + return f"{self.key_prefix}{entity_id}" + + # ------------------------------------------------------------------ + # Batch ingestion (materialization) + # ------------------------------------------------------------------ + + def bulk_load( + self, + rows: Mapping[str, FeatureMap], + ttl_seconds: Optional[int] = None, + ) -> int: + """Materialize a batch of entities into Redis. + + ``rows`` is ``{entity_id: {field: value, ...}}``. One ``HSET`` + plus one ``EXPIRE`` per entity, all pipelined into a single + round trip. The key-level ``EXPIRE`` is what makes the whole + entity disappear if a future batch run fails — inference reads + the missing entity rather than silently outdated values. 
+ """ + ttl = self.batch_ttl_seconds if ttl_seconds is None else ttl_seconds + pipe = self.redis.pipeline(transaction=False) + for entity_id, fields in rows.items(): + key = self.key_for(entity_id) + pipe.hset(key, mapping={k: _encode(v) for k, v in fields.items()}) + pipe.expire(key, ttl) + pipe.execute() + with self._stats_lock: + self._batch_writes_total += len(rows) + return len(rows) + + def update_batch_feature( + self, + entity_id: str, + field: str, + value: FeatureValue, + ) -> None: + """Update a single batch feature without touching the key TTL. + + Used by the demo's "manually refresh one user" lever; in a real + pipeline batch updates always flow through ``bulk_load``. + """ + self.redis.hset(self.key_for(entity_id), field, _encode(value)) + with self._stats_lock: + self._batch_writes_total += 1 + + # ------------------------------------------------------------------ + # Streaming ingestion + # ------------------------------------------------------------------ + + def update_streaming( + self, + entity_id: str, + fields: FeatureMap, + ttl_seconds: Optional[int] = None, + ) -> None: + """Write streaming features with a per-field TTL. + + Each field carries its own ``HEXPIRE`` so it self-expires + independently of the rest of the hash. If the streaming + pipeline stops, the streaming fields drop out while the + batch-materialized fields remain populated under their longer + key-level ``EXPIRE``. 
+ """ + if not fields: + return + ttl = self.streaming_ttl_seconds if ttl_seconds is None else ttl_seconds + key = self.key_for(entity_id) + encoded = {name: _encode(value) for name, value in fields.items()} + + pipe = self.redis.pipeline(transaction=False) + pipe.hset(key, mapping=encoded) + pipe.hexpire(key, ttl, *encoded.keys()) + pipe.execute() + with self._stats_lock: + self._streaming_writes_total += len(encoded) + + # ------------------------------------------------------------------ + # Inference reads + # ------------------------------------------------------------------ + + def get_features( + self, + entity_id: str, + field_names: Optional[Iterable[str]] = None, + ) -> dict[str, str]: + """Retrieve a subset of features for one entity. + + ``HMGET`` returns the requested fields in one round trip. Pass + ``field_names=None`` to fetch the entire hash with ``HGETALL`` + — useful for debugging but rarely the right call on the + request path, where the model knows exactly which features it + consumes. + """ + key = self.key_for(entity_id) + if field_names is None: + data = self.redis.hgetall(key) + with self._stats_lock: + self._reads_total += 1 + self._read_fields_total += len(data) + return data + + names = list(field_names) + if not names: + return {} + values = self.redis.hmget(key, names) + with self._stats_lock: + self._reads_total += 1 + self._read_fields_total += sum(1 for v in values if v is not None) + return {n: v for n, v in zip(names, values) if v is not None} + + def batch_get_features( + self, + entity_ids: Iterable[str], + field_names: Iterable[str], + ) -> dict[str, dict[str, str]]: + """Pipeline ``HMGET`` across many entities for batch scoring. + + Hundreds of entities in one round trip. The model can then + score them all without further network calls. 
+ """ + ids = list(entity_ids) + names = list(field_names) + if not ids or not names: + return {} + + pipe = self.redis.pipeline(transaction=False) + for entity_id in ids: + pipe.hmget(self.key_for(entity_id), names) + rows = pipe.execute() + + out: dict[str, dict[str, str]] = {} + seen_fields = 0 + for entity_id, values in zip(ids, rows): + row = {n: v for n, v in zip(names, values) if v is not None} + out[entity_id] = row + seen_fields += len(row) + with self._stats_lock: + self._reads_total += len(ids) + self._read_fields_total += seen_fields + return out + + # ------------------------------------------------------------------ + # TTL inspection (used by the demo UI) + # ------------------------------------------------------------------ + + def key_ttl_seconds(self, entity_id: str) -> int: + """Seconds until the entity key expires. + + Returns ``-1`` if no key-level TTL is set, ``-2`` if the key + doesn't exist. + """ + return int(self.redis.ttl(self.key_for(entity_id))) + + def field_ttls_seconds( + self, + entity_id: str, + field_names: Iterable[str], + ) -> dict[str, int]: + """Per-field TTL via ``HTTL`` (Redis 7.4+). + + Each value mirrors the ``TTL`` convention: positive means + seconds remaining, ``-1`` means no TTL on the field, ``-2`` + means the field doesn't exist on this hash. + """ + names = list(field_names) + if not names: + return {} + ttls = self.redis.httl(self.key_for(entity_id), *names) + return {n: int(t) for n, t in zip(names, ttls)} + + # ------------------------------------------------------------------ + # Demo housekeeping + # ------------------------------------------------------------------ + + def list_entity_ids(self, limit: int = 200) -> list[str]: + """Enumerate entity IDs by scanning ``key_prefix*``. + + ``SCAN`` is non-blocking; the demo uses it to populate UI + dropdowns, not as a serving primitive. 
+ """ + ids: list[str] = [] + prefix_len = len(self.key_prefix) + for key in self.redis.scan_iter(match=f"{self.key_prefix}*", count=200): + ids.append(key[prefix_len:]) + if len(ids) >= limit: + break + return sorted(ids) + + def count_entities(self) -> int: + """Count entities currently in the store (via ``SCAN``).""" + count = 0 + for _ in self.redis.scan_iter(match=f"{self.key_prefix}*", count=500): + count += 1 + return count + + def delete_entity(self, entity_id: str) -> int: + return int(self.redis.delete(self.key_for(entity_id))) + + def reset(self) -> int: + """Drop every entity under ``key_prefix``. Used by the demo reset path. + + Scans in batches and ``DEL``s them in one pipeline per batch, + so a large demo dataset doesn't load the server with one big + synchronous delete. + """ + deleted = 0 + batch: list[str] = [] + for key in self.redis.scan_iter(match=f"{self.key_prefix}*", count=500): + batch.append(key) + if len(batch) >= 500: + deleted += int(self.redis.delete(*batch)) + batch.clear() + if batch: + deleted += int(self.redis.delete(*batch)) + return deleted + + def stats(self) -> dict[str, int]: + with self._stats_lock: + return { + "batch_writes_total": self._batch_writes_total, + "streaming_writes_total": self._streaming_writes_total, + "reads_total": self._reads_total, + "read_fields_total": self._read_fields_total, + } + + def reset_stats(self) -> None: + with self._stats_lock: + self._batch_writes_total = 0 + self._streaming_writes_total = 0 + self._reads_total = 0 + self._read_fields_total = 0 + + +def _encode(value: FeatureValue) -> str: + """Encode a feature value as a string for Hash storage. + + Booleans become ``"true"`` / ``"false"`` (not ``"True"`` / ``"False"``) + so they round-trip cleanly through other clients and ``redis-cli``. 
+ """ + if isinstance(value, bool): + return "true" if value else "false" + return str(value) diff --git a/content/develop/use-cases/feature-store/redis-py/streaming_worker.py b/content/develop/use-cases/feature-store/redis-py/streaming_worker.py new file mode 100644 index 0000000000..410a180690 --- /dev/null +++ b/content/develop/use-cases/feature-store/redis-py/streaming_worker.py @@ -0,0 +1,146 @@ +""" +Streaming feature updater for the demo. + +Stands in for whatever Flink, Kafka Streams, or bespoke service computes +the real-time features in a real deployment. In production this code +lives in the streaming layer; here it runs as a daemon thread next to +the demo server so the page can start, pause, and resume it from the UI. + +Every tick it picks a few random users and writes a new value for each +streaming feature, with a per-field ``HEXPIRE`` so the field self-expires +if the worker is paused. Pause the worker for longer than +``streaming_ttl_seconds`` and the streaming fields drop out of the hash +while the batch fields remain populated under the longer key-level TTL — +the *mixed staleness* story made visible. 
+""" + +from __future__ import annotations + +import random +import threading +import time +from typing import Optional + +from feature_store import RedisFeatureStore + + +DEVICE_IDS = ( + "ios-1a4c", "ios-9f02", "and-7b21", "and-2d18", + "web-chr-1", "web-saf-1", "web-ff-2", +) +SESSION_COUNTRIES = ("US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL") + + +class StreamingWorker: + """Background thread that updates streaming features on a tick.""" + + def __init__( + self, + store: RedisFeatureStore, + tick_seconds: float = 1.0, + users_per_tick: int = 5, + seed: int = 1337, + ) -> None: + self.store = store + self.tick_seconds = tick_seconds + self.users_per_tick = users_per_tick + self._rng = random.Random(seed) + + self._thread: Optional[threading.Thread] = None + self._stop_event = threading.Event() + self._paused = threading.Event() + + self._lock = threading.Lock() + self._tick_count = 0 + self._writes_count = 0 + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + + def start(self) -> None: + if self._thread is not None and self._thread.is_alive(): + return + self._stop_event.clear() + self._paused.clear() + self._thread = threading.Thread( + target=self._run, name="streaming-worker", daemon=True, + ) + self._thread.start() + + def stop(self) -> None: + self._stop_event.set() + thread = self._thread + if thread is not None: + thread.join(timeout=2.0) + self._thread = None + + def pause(self) -> None: + self._paused.set() + + def resume(self) -> None: + self._paused.clear() + + @property + def is_paused(self) -> bool: + return self._paused.is_set() + + @property + def is_running(self) -> bool: + return self._thread is not None and self._thread.is_alive() + + # ------------------------------------------------------------------ + # Tick + # ------------------------------------------------------------------ + + def _run(self) -> None: + while not 
self._stop_event.is_set(): + if self._paused.is_set(): + time.sleep(0.05) + continue + try: + self._tick() + except Exception as exc: + print(f"[streaming-worker] tick failed: {exc}") + self._stop_event.wait(timeout=self.tick_seconds) + + def _tick(self) -> None: + ids = self.store.list_entity_ids(limit=500) + if not ids: + return + chosen = self._rng.sample(ids, k=min(self.users_per_tick, len(ids))) + now_ms = int(time.time() * 1000) + writes = 0 + for entity_id in chosen: + fields = { + "last_login_ts": now_ms, + "last_device_id": self._rng.choice(DEVICE_IDS), + "tx_count_5m": self._rng.randint(0, 12), + "failed_logins_15m": self._rng.choices( + (0, 1, 2, 5), weights=(70, 20, 8, 2), k=1, + )[0], + "session_country": self._rng.choice(SESSION_COUNTRIES), + } + self.store.update_streaming(entity_id, fields) + writes += len(fields) + with self._lock: + self._tick_count += 1 + self._writes_count += writes + + # ------------------------------------------------------------------ + # Stats + # ------------------------------------------------------------------ + + def stats(self) -> dict: + with self._lock: + return { + "running": self.is_running, + "paused": self.is_paused, + "tick_count": self._tick_count, + "writes_count": self._writes_count, + } + + def reset_stats(self) -> None: + with self._lock: + self._tick_count = 0 + self._writes_count = 0 From 3b455e5015eb47b01f90a7fd25ff1efe0f6c61c4 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Fri, 29 May 2026 13:25:26 +0100 Subject: [PATCH 02/20] DOC-6661 Codex review issues --- .../feature-store/redis-py/_index.md | 51 ++++++++++++++----- .../feature-store/redis-py/demo_server.py | 22 ++++++-- .../feature-store/redis-py/feature_store.py | 24 +++++++-- 3 files changed, 77 insertions(+), 20 deletions(-) diff --git a/content/develop/use-cases/feature-store/redis-py/_index.md b/content/develop/use-cases/feature-store/redis-py/_index.md index 9ca06fb221..82afa60094 100644 --- 
a/content/develop/use-cases/feature-store/redis-py/_index.md +++ b/content/develop/use-cases/feature-store/redis-py/_index.md @@ -213,11 +213,19 @@ def bulk_load( ... ``` -`transaction=False` switches the pipeline from `MULTI/EXEC` to a plain command -batch: there's no all-or-nothing semantic, just a network optimization. That -is the right choice here — each user's `HSET` + `EXPIRE` is independent, and -wrapping the whole thing in a transaction would block the server for the -duration of the batch. +`transaction=False` skips the `MULTI/EXEC` wrapper that +[`pipeline`]({{< relref "/develop/clients/redis-py/transpipe" >}}) defaults to +— commands still queue and ship in one round trip, but they execute as +independent commands rather than as one atomic block. That is the right +choice here: each user's `HSET` + `EXPIRE` pair is independent of every +other user's, and an all-or-nothing transaction would block the server for +the duration of the batch. It does *not* make the `HSET` + `EXPIRE` pair +atomic — in the extremely unlikely event the server crashes between the two, +the entity exists without a key-level TTL until the next batch run re-pins +it. For an ingestion script that runs end-to-end every cycle this is fine; +if you need the pair to be inseparable, wrap each user in its own tiny +`MULTI/EXEC` or a Lua script (see [`EVAL`]({{< relref "/commands/eval" >}}) / +[Eval scripting]({{< relref "/develop/programmability/eval-intro" >}})). In production, the equivalent of this script runs as an offline pipeline (a Spark or Feast `materialize` job) that reads from the warehouse and writes @@ -249,10 +257,14 @@ def update_streaming( ``` [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) sets the TTL on *individual* -hash fields, not on the whole key. 
The two commands here are sent in one round -trip but they could in principle run any order — the `HSET` always wins because -the field name is the same in both calls; in practice they run in pipeline -order on the server, so the field is written, then its TTL is applied. +hash fields, not on the whole key. The two commands are sent in one round trip +and Redis executes them in pipeline order: the `HSET` runs first and creates +or overwrites the fields, then `HEXPIRE` attaches a TTL to each of those same +fields. `HEXPIRE` returns one status code per field — `1` if the TTL was set, +`-2` if the field doesn't exist — so the helper raises if any code is anything +other than `1`. That makes the "every streaming write renews its TTL" +invariant fail loudly rather than silently leaving a streaming field with no +expiry attached. If a streaming pipeline stops, the streaming fields drop out one by one as their per-field TTLs elapse — there is no application-side cleaner involved. @@ -331,7 +343,7 @@ One round trip for the whole batch — the demo regularly returns 100 users in 2-3 ms against a local Redis. On a real network the round trip dominates; pipelining is what keeps batch scoring practical. -For very large batches on a clustered deployment, the same shape generalises +For very large batches on a clustered deployment, the same shape generalizes to one pipeline per shard: bucket the entity IDs by their hash slot (`cluster.keyslot(key)`), then issue one pipeline against each shard in parallel. `redis-py`'s @@ -349,8 +361,10 @@ and resume it; in production this code would live in the streaming layer. Every tick the worker picks a few random users, generates a new value for each streaming feature, and calls `store.update_streaming(user_id, fields)`. The -demo defaults to 5 users per tick at 1-second intervals — enough that within a -minute every user in a 200-user store has been touched at least once. 
+demo defaults to 5 users per tick at 1-second intervals — so a 200-user store +sees roughly three-quarters of its users refreshed in the first minute, and nearly all after a +few minutes. Drop `--seed-users` or raise `--users-per-tick` if you'd rather +have every user touched quickly. ```python def _tick(self) -> None: @@ -532,6 +546,16 @@ The server is read/write against your local Redis. The default key prefix is ## Production usage +The guidance below focuses on the production concerns that are specific to +running a feature store on Redis. For the generic redis-py production checklist +— connection-pool sizing, +[TLS and AUTH]({{< relref "/develop/clients/redis-py/connect#connect-to-your-production-redis-with-tls" >}}), +[exception handling]({{< relref "/develop/clients/redis-py/produsage#exception-handling" >}}), +and the rest — see the +[redis-py production usage guide]({{< relref "/develop/clients/redis-py/produsage" >}}). +The feature-store demo runs against `localhost` with the defaults; a real +deployment should harden the client first. + ### Pick the batch TTL to outlast a failed refresher The whole-entity `EXPIRE` is your safety net against silent staleness from a @@ -580,7 +604,8 @@ write applies `HEXPIRE` *every time*. If a streaming worker writes a field without renewing its TTL, the field carries whatever expiry was there before — possibly none, possibly stale — and the mixed-staleness invariant breaks. Keep the `HSET` and `HEXPIRE` in the same pipeline (or, even safer, in the -same Lua script if you don't trust the call site). +same [Lua script]({{< relref "/develop/programmability/eval-intro" >}}) if +you don't trust the call site).
### Avoid HGETALL on the request path diff --git a/content/develop/use-cases/feature-store/redis-py/demo_server.py b/content/develop/use-cases/feature-store/redis-py/demo_server.py index 5a5b1ddd2c..268c0e9db1 100644 --- a/content/develop/use-cases/feature-store/redis-py/demo_server.py +++ b/content/develop/use-cases/feature-store/redis-py/demo_server.py @@ -512,9 +512,20 @@ def materialize(self, count: int, ttl_seconds: int) -> dict: return {"loaded": loaded, "ttl_seconds": ttl_seconds, "elapsed_ms": elapsed_ms} def reset(self) -> dict: - deleted = self.store.reset() - self.store.reset_stats() - self.worker.reset_stats() + # Pause the streaming worker around the DEL sweep so a concurrent + # tick can't recreate a user that was just enumerated for deletion + # (streaming HSET creates the key if it's missing, and that would + # leave behind a streaming-only hash with no key-level TTL). + was_paused = self.worker.is_paused + if self.worker.is_running and not was_paused: + self.worker.pause() + try: + deleted = self.store.reset() + self.store.reset_stats() + self.worker.reset_stats() + finally: + if self.worker.is_running and not was_paused: + self.worker.resume() return {"deleted": deleted} def toggle_worker(self) -> dict: @@ -658,12 +669,15 @@ def _handle_inspect(self, query: dict[str, list[str]]) -> None: # ---- State assembly ------------------------------------------------- def _build_state(self) -> dict: + # The dropdown only needs a manageable list — cap at 500 — but the + # displayed user count should be the real total, not the cap, or the + # UI silently understates how many users are in the store. 
ids = self.store.list_entity_ids(limit=500) return { "key_prefix": self.store.key_prefix, "batch_ttl_seconds": self.store.batch_ttl_seconds, "streaming_ttl_seconds": self.store.streaming_ttl_seconds, - "entity_count": len(ids), + "entity_count": self.store.count_entities(), "entity_ids": ids, "stats": self.store.stats(), "worker": self.worker.stats(), diff --git a/content/develop/use-cases/feature-store/redis-py/feature_store.py b/content/develop/use-cases/feature-store/redis-py/feature_store.py index 57f377e994..b97280a879 100644 --- a/content/develop/use-cases/feature-store/redis-py/feature_store.py +++ b/content/develop/use-cases/feature-store/redis-py/feature_store.py @@ -165,7 +165,17 @@ def update_streaming( pipe = self.redis.pipeline(transaction=False) pipe.hset(key, mapping=encoded) pipe.hexpire(key, ttl, *encoded.keys()) - pipe.execute() + _, expire_result = pipe.execute() + # HEXPIRE returns one status code per field: 1 = TTL set, 0 = skipped + # because an NX/XX/GT/LT condition wasn't met, 2 = field deleted (TTL + # of zero), -2 = no such field or key. We just HSET every field on the same call, so any code + # other than 1 means the per-field TTL invariant didn't hold — the + # mixed-staleness story relies on every streaming field carrying a + # fresh TTL after the write. + if expire_result is None or any(int(code) != 1 for code in expire_result): + raise RuntimeError( + f"HEXPIRE did not set every field TTL for {key}: {expire_result}" + ) with self._stats_lock: self._streaming_writes_total += len(encoded) @@ -261,6 +271,14 @@ def field_ttls_seconds( if not names: return {} ttls = self.redis.httl(self.key_for(entity_id), *names) + # redis-py 7.x returns a flat list of integers — including `-2`s for + # every field when the key itself is missing. Older / future versions + # have reported `None` for a missing key or a singleton list-of-list + # in pipeline contexts, so normalize both shapes before the zip below.
+ if ttls is None: + ttls = [-2] * len(names) + elif len(ttls) == 1 and isinstance(ttls[0], list): + ttls = ttls[0] return {n: int(t) for n, t in zip(names, ttls)} # ------------------------------------------------------------------ @@ -294,8 +312,8 @@ def delete_entity(self, entity_id: str) -> int: def reset(self) -> int: """Drop every entity under ``key_prefix``. Used by the demo reset path. - Scans in batches and ``DEL``s them in one pipeline per batch, - so a large demo dataset doesn't load the server with one big + Scans in batches and issues one variadic ``DEL`` per batch, so a + large demo dataset doesn't land on the server as one giant synchronous delete. """ deleted = 0 From 6a9815c5310f884e50ec118a4f26488c2512b5d5 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Fri, 29 May 2026 14:04:58 +0100 Subject: [PATCH 03/20] DOC-6661 draft node-redis example --- .../develop/use-cases/feature-store/_index.md | 1 + .../use-cases/feature-store/nodejs/_index.md | 704 +++++++++++++++ .../feature-store/nodejs/buildFeatures.js | 157 ++++ .../feature-store/nodejs/demoServer.js | 804 ++++++++++++++++++ .../feature-store/nodejs/featureStore.js | 451 ++++++++++ .../feature-store/nodejs/package.json | 17 + .../feature-store/nodejs/streamingWorker.js | 178 ++++ 7 files changed, 2312 insertions(+) create mode 100644 content/develop/use-cases/feature-store/nodejs/_index.md create mode 100644 content/develop/use-cases/feature-store/nodejs/buildFeatures.js create mode 100644 content/develop/use-cases/feature-store/nodejs/demoServer.js create mode 100644 content/develop/use-cases/feature-store/nodejs/featureStore.js create mode 100644 content/develop/use-cases/feature-store/nodejs/package.json create mode 100644 content/develop/use-cases/feature-store/nodejs/streamingWorker.js diff --git a/content/develop/use-cases/feature-store/_index.md b/content/develop/use-cases/feature-store/_index.md index e793433eec..dcfb2cda24 100644 --- a/content/develop/use-cases/feature-store/_index.md 
+++ b/content/develop/use-cases/feature-store/_index.md @@ -156,3 +156,4 @@ updates real-time features with per-field TTL, retrieve any subset of features for a single user under 1 ms, and pipeline batch reads across a hundred users. * [redis-py (Python)]({{< relref "/develop/use-cases/feature-store/redis-py" >}}) +* [node-redis (Node.js)]({{< relref "/develop/use-cases/feature-store/nodejs" >}}) diff --git a/content/develop/use-cases/feature-store/nodejs/_index.md b/content/develop/use-cases/feature-store/nodejs/_index.md new file mode 100644 index 0000000000..06c38e9d48 --- /dev/null +++ b/content/develop/use-cases/feature-store/nodejs/_index.md @@ -0,0 +1,704 @@ +--- +categories: +- docs +- develop +- stack +- oss +- rs +- rc +description: Build a Redis-backed online feature store in Node.js with node-redis +linkTitle: node-redis example (Node.js) +title: Redis feature store with node-redis +weight: 2 +--- + +This guide shows you how to build a small Redis-backed online feature store in +Node.js with [`node-redis`]({{< relref "/develop/clients/nodejs" >}}). It +includes a local web server built with the Node.js standard `http` module so +you can bulk-load a batch of users with a key-level TTL, run a streaming +worker that overwrites real-time features with per-field TTL, retrieve any +subset of features for one user under 1 ms, and pipeline `HMGET` across a +hundred users for batch scoring. + +## Overview + +Each entity (here, a user) is one Redis [Hash]({{< relref "/develop/data-types/hashes" >}}) +at a deterministic key — `fs:user:{id}`. The hash holds every feature for that +entity as one field per feature: batch-materialized aggregates (refreshed once +a day) alongside streaming-updated signals (refreshed every few seconds). One +[`HMGET`]({{< relref "/commands/hmget" >}}) returns whichever subset the model +needs in one network round trip. 
+ +Two TTL layers solve the *mixed staleness* problem without an application-side +cleaner: + +* A **key-level** [`EXPIRE`]({{< relref "/commands/expire" >}}) aligned with the + batch materialization cycle (24 hours in the demo). If the batch refresher + fails, the whole entity disappears at the next cycle and inference sees a + missing entity — which the model handler can detect and fall back on — + rather than silently outdated values. +* A **per-field** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) (Redis 7.4+) on + each streaming feature gives that field its own shorter expiry, independent + of the rest of the hash. If the streaming pipeline stops updating a feature, + the field self-cleans while the batch fields stay populated. + +In this example, the batch features describe a user's longer-term shape +(`country_iso`, `risk_segment`, `account_age_days`, `tx_count_7d`, +`avg_amount_30d`, `chargeback_count_180d`) and are bulk-loaded by +`buildFeatures.js` — the demo's stand-in for a nightly Spark / Feast +materialization job. The streaming features describe what the user is doing +right now (`last_login_ts`, `last_device_id`, `tx_count_5m`, +`failed_logins_15m`, `session_country`) and are written by +`streamingWorker.js` — the demo's stand-in for a Flink / Kafka Streams job. +The inference panel of the demo server reads any subset of those features +through `featureStore.js`'s helper class. + +That gives you: + +* A single round trip for retrieval — any subset of features for one entity + in one [`HMGET`]({{< relref "/commands/hmget" >}}). +* Sub-millisecond hot path. The Redis-side work is microseconds; in practice + the bottleneck is the network round trip plus the model's own feature-prep. +* Pipelined batch scoring — one round trip for `N` users at once. +* Independent freshness per feature, expressed as a server-side TTL rather + than as application logic. 
+* Self-cleanup on pipeline failure: a stalled batch refresher lets entities + expire on schedule, and a stalled streaming worker lets each affected field + expire on its own timer. + +## How it works + +There are three paths: a **batch path** that bulk-loads features once per +materialization cycle, a **streaming path** that updates real-time features as +events arrive, and an **inference path** that reads features on the request +side. + +### Batch path (per materialization cycle) + +1. The batch job calls `synthesizeUsers(N)` (in production, the equivalent + computation lives in an offline pipeline against the warehouse). The result + is `{userId: {field: value, ...}}` for every user in this cycle. +2. `store.bulkLoad(rows, ttlSeconds)` batches one + [`HSET`]({{< relref "/commands/hset" >}}) plus one + [`EXPIRE`]({{< relref "/commands/expire" >}}) per user through + [`multi().exec()`]({{< relref "/develop/clients/nodejs/transpipe" >}}), so + the whole batch ships in a single round trip. The `HSET` writes every batch + field; the `EXPIRE` is what makes the entity disappear if the next batch + run fails, so inference reads a missing entity rather than silently + outdated values. + +### Streaming path (per event) + +When a user does something (login, transaction, page view) the streaming layer +computes whatever real-time signals fall out of that event and calls +`store.updateStreaming(userId, fields, ttlSeconds)`. That batches: + +1. An [`HSET`]({{< relref "/commands/hset" >}}) writing the new field values. + Redis is single-threaded per shard, so this is atomic against any + concurrent batch write on the same hash — no version columns, no locks. +2. An [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) over exactly the fields + that were written, with the streaming TTL. Each streaming field carries + its own per-field expiry independent of the rest of the hash. 
Stop the + worker and these fields drop out one by one as their TTLs elapse, while + the batch fields remain populated under the longer key-level TTL. + +### Inference path (per request) + +1. The model server picks the feature subset it needs (the schema is owned by + the model, not the store). +2. It calls `store.getFeatures(userId, names)`, which is one + [`HMGET`]({{< relref "/commands/hmget" >}}). Redis returns the values in + the same order as the requested fields, with `null` for any field that + doesn't exist (or has expired). +3. For batch inference, the model server calls + `store.batchGetFeatures(userIds, names)`, which pipelines one + [`HMGET`]({{< relref "/commands/hmget" >}}) per user across all `N` users + in a single network round trip via + [`multi().exec()`]({{< relref "/develop/clients/nodejs/transpipe" >}}). + +## The feature-store helper + +The `FeatureStore` class wraps the read/write paths +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/nodejs/featureStore.js)): + +```javascript +const { createClient } = require("redis"); +const { FeatureStore } = require("./featureStore"); + +const client = createClient({ socket: { host: "localhost", port: 6379 } }); +client.on("error", (err) => console.error("Redis client error:", err)); +await client.connect(); + +const store = new FeatureStore({ + redisClient: client, + keyPrefix: "fs:user:", + batchTtlSeconds: 24 * 60 * 60, // whole-entity TTL aligned with the daily batch cycle + streamingTtlSeconds: 5 * 60, // per-field TTL on each streaming feature +}); + +// Batch materialization: one HSET + EXPIRE per user, all batched. 
+await store.bulkLoad({ + u0001: { country_iso: "US", risk_segment: "low", + tx_count_7d: 14, avg_amount_30d: 92.40, + account_age_days: 612, chargeback_count_180d: 0 }, + u0002: { country_iso: "GB", risk_segment: "medium", + tx_count_7d: 47, avg_amount_30d: 220.10, + account_age_days: 1840, chargeback_count_180d: 1 }, +}); + +// Streaming write: HSET + HEXPIRE on just the fields that changed. +await store.updateStreaming("u0001", { + last_login_ts: 1716998413541, + last_device_id: "ios-9f02", + tx_count_5m: 3, + failed_logins_15m: 0, + session_country: "US", +}); + +// Inference read: HMGET of whatever the model needs. +const features = await store.getFeatures("u0001", [ + "risk_segment", "tx_count_7d", "avg_amount_30d", + "tx_count_5m", "failed_logins_15m", +]); + +// Batch scoring: pipelined HMGET across many users. +const batch = await store.batchGetFeatures( + ["u0001", "u0002", "u0003"], + ["risk_segment", "tx_count_5m", "failed_logins_15m"], +); +``` + +### Data model + +Each user is one Redis Hash. Every value is stored as a string — Redis hash +fields are bytes-on-the-wire, so the helper encodes booleans as `"true"` / +`"false"` and everything else with `String(value)`. The model server is +responsible for parsing back to the right type, the same way it would when +reading any serialized feature store. + +```text +fs:user:u0001 TTL = 86400 s (key-level) + country_iso=US + risk_segment=low + account_age_days=612 + tx_count_7d=14 + avg_amount_30d=92.40 + chargeback_count_180d=0 + last_login_ts=1716998413541 TTL = 300 s (per field, HEXPIRE) + last_device_id=ios-9f02 TTL = 300 s (per field, HEXPIRE) + tx_count_5m=3 TTL = 300 s (per field, HEXPIRE) + failed_logins_15m=0 TTL = 300 s (per field, HEXPIRE) + session_country=US TTL = 300 s (per field, HEXPIRE) +``` + +The batch fields sit under the key-level `EXPIRE`. The streaming fields each +carry their own [`HEXPIRE`]({{< relref "/commands/hexpire" >}}). 
If the +streaming pipeline stops, the streaming fields drop one by one as their +per-field TTLs elapse; the batch fields stay until the daily key-level +`EXPIRE` fires (or the next batch cycle re-pins them). + +### Bulk-loading batch features + +`bulkLoad` batches one `HSET` and one `EXPIRE` per user into a single round +trip through node-redis's `multi()`. With 500 users that's 1000 commands in +one network call — Redis processes them sequentially on the server side but +the client only pays one RTT. + +```javascript +async bulkLoad(rows, ttlSeconds) { + const ttl = ttlSeconds ?? this.batchTtlSeconds; + const ids = Object.keys(rows); + if (ids.length === 0) return 0; + + const pipe = this.redis.multi(); + for (const entityId of ids) { + const key = this.keyFor(entityId); + const encoded = {}; + for (const [name, value] of Object.entries(rows[entityId])) { + encoded[name] = encode(value); + } + pipe.hSet(key, encoded); + pipe.expire(key, ttl); + } + await pipe.exec(); + ... +} +``` + +`multi()` in node-redis 5 wraps the batched commands in `MULTI/EXEC`, so the +whole batch runs as one transaction on the server. That gives all-or-nothing +semantics inside the batch but does block the server for its duration, which +is what you want for an ingestion script that runs end-to-end — not for a +hot-path serving call. (See +[transactions and pipelining]({{< relref "/develop/clients/nodejs/transpipe" >}}) +for the full mental model.) + +In production, the equivalent of this script runs as an offline pipeline (a +Spark or Feast `materialize` job) that reads from the warehouse and writes +into Redis. The +[Feast `RedisOnlineStore`](https://docs.feast.dev/reference/online-stores/redis) +provider does exactly this under the hood; the in-house +[Redis Feature Form]({{< relref "/develop/ai/featureform" >}}) integration +covers the materialize + serve path end-to-end. 
+ +### Streaming writes with per-field TTL + +`updateStreaming` is the linchpin of the mixed-staleness story: + +```javascript +async updateStreaming(entityId, fields, ttlSeconds) { + const names = Object.keys(fields); + if (names.length === 0) return; + const ttl = ttlSeconds ?? this.streamingTtlSeconds; + const key = this.keyFor(entityId); + const encoded = {}; + for (const [name, value] of Object.entries(fields)) { + encoded[name] = encode(value); + } + + const [, expireResult] = await this.redis + .multi() + .hSet(key, encoded) + .hExpire(key, names, ttl) + .exec(); + if (!Array.isArray(expireResult) || + expireResult.some((code) => Number(code) !== 1)) { + throw new Error( + `HEXPIRE did not set every field TTL for ${key}: ` + + JSON.stringify(expireResult), + ); + } + ... +} +``` + +[`HEXPIRE`]({{< relref "/commands/hexpire" >}}) sets the TTL on *individual* +hash fields, not on the whole key. Note node-redis's argument order: +`hExpire(key, fields, seconds)` — the fields come *before* the TTL, the +opposite of `EXPIRE(key, seconds)`. The two commands are sent in one round +trip and Redis executes them in pipeline order: the `HSET` runs first and +creates or overwrites the fields, then `HEXPIRE` attaches a TTL to each of +those same fields. `HEXPIRE` returns one status code per field — `1` if the +TTL was set, `-2` if the field doesn't exist — so the helper throws if any +code is anything other than `1`. That makes the "every streaming write +renews its TTL" invariant fail loudly rather than silently leaving a +streaming field with no expiry attached. + +If a streaming pipeline stops, the streaming fields drop out one by one as +their per-field TTLs elapse — there is no application-side cleaner involved. +[`HTTL`]({{< relref "/commands/httl" >}}) lets the model side inspect the +remaining TTL on any field, which is useful both for debugging ("why is this +feature missing?" → "it expired three seconds ago") and as a freshness signal +in the model itself. 
+ +> **HEXPIRE requires Redis 7.4 or later.** `HEXPIRE` and the field-level TTL +> commands (`HTTL`, `HPERSIST`, `HEXPIREAT`, `HPEXPIRE`, `HPEXPIREAT`, +> `HPTTL`, `HEXPIRETIME`, `HPEXPIRETIME`) were added in Redis 7.4. On older +> Redis builds you would have to put streaming features on their own keys +> (one key per feature, or one key per feature group) and set a key-level +> `EXPIRE` instead — at the cost of giving up the single-`HMGET` retrieval. + +### Inference reads with HMGET + +`getFeatures` is one `HMGET`: + +```javascript +async getFeatures(entityId, fieldNames = null) { + const key = this.keyFor(entityId); + if (fieldNames === null || fieldNames === undefined) { + return await this.redis.hGetAll(key); + } + const names = [...fieldNames]; + if (names.length === 0) return {}; + const values = await this.redis.hmGet(key, names); + const out = {}; + for (let i = 0; i < names.length; i += 1) { + const v = values[i]; + if (v !== null && v !== undefined) out[names[i]] = v; + } + return out; +} +``` + +The model knows exactly which features it consumes, so the request path +always takes the `HMGET` branch with an explicit field list — that's the +sub-millisecond path. `HGETALL` is the right call for debugging (which is +what the demo's "Inspect" panel does) but not for serving: it forces Redis +to serialize every field, including ones the model doesn't need. + +Fields that don't exist (because they were never written, or because they +expired) come back as `null`. The helper drops them from the result object +so the caller sees only the features that are actually available. A real +model server would either treat missing values as a feature ("this user has +no streaming signal yet") or fall back to a default from the model's +training data. 
+ +### Batch scoring with pipelined HMGET + +For batch inference, the same `HMGET` shape pipelines across users: + +```javascript +async batchGetFeatures(entityIds, fieldNames) { + const ids = [...entityIds]; + const names = [...fieldNames]; + if (ids.length === 0 || names.length === 0) return {}; + + const pipe = this.redis.multi(); + for (const entityId of ids) { + pipe.hmGet(this.keyFor(entityId), names); + } + const rows = await pipe.exec(); + + const out = {}; + for (let i = 0; i < ids.length; i += 1) { + const values = rows[i] || []; + const row = {}; + for (let j = 0; j < names.length; j += 1) { + const v = values[j]; + if (v !== null && v !== undefined) row[names[j]] = v; + } + out[ids[i]] = row; + } + return out; +} +``` + +One round trip for the whole batch — the demo regularly returns 100 users in +2-3 ms against a local Redis. On a real network the round trip dominates; +pipelining is what keeps batch scoring practical. + +For very large batches on a clustered deployment, the same shape generalizes +to one pipeline per shard. node-redis's +[cluster client](https://github.com/redis/node-redis/blob/master/docs/clustering.md) +dispatches the per-user `hmGet` calls to the right shard transparently — you +still pay one round trip per shard rather than one for the whole batch. For +very latency-sensitive batch inference, group users by hash slot +(`cluster.calculateSlot(key)`) and issue one `multi().exec()` per shard in +parallel. + +## The streaming worker + +`streamingWorker.js` is the demo's stand-in for whatever Flink, Kafka Streams, +or bespoke service computes the real-time features +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/nodejs/streamingWorker.js)). +It runs as a `setTimeout` loop next to the demo server so the UI can start, +pause, and resume it; in production this code would live in the streaming +layer. 
+ +Every tick the worker picks a few random users, generates a new value for each +streaming feature, and calls `store.updateStreaming(userId, fields)`. The +demo defaults to 5 users per tick at 1-second intervals — so a 200-user store +sees roughly half its users refreshed in the first minute, and most after a +few minutes. Lower `--seed-users` or raise `--users-per-tick` if you'd rather +have every user touched quickly. + +```javascript +async _tick() { + const ids = await this.store.listEntityIds(500); + if (ids.length === 0) return; + const chosen = this.rng.sample(ids, this.usersPerTick); + const nowMs = Date.now(); + for (const entityId of chosen) { + const fields = { + last_login_ts: nowMs, + last_device_id: this.rng.choice(DEVICE_IDS), + tx_count_5m: this.rng.int(0, 12), + failed_logins_15m: this.rng.weightedChoice( + FAILED_LOGIN_BUCKETS, FAILED_LOGIN_WEIGHTS, + ), + session_country: this.rng.choice(SESSION_COUNTRIES), + }; + await this.store.updateStreaming(entityId, fields); + } +} +``` + +Pausing the worker is what shows off the mixed-staleness behavior: leave it +paused for longer than `streamingTtlSeconds` and the streaming fields +disappear from every user's hash one by one, while the batch fields remain +under the longer key-level `EXPIRE`. The demo's `Pause / resume` button lets +you see this happen in real time. + +The worker uses a `setTimeout` chain rather than `setInterval`, so a slow +tick (or a paused state) never queues up overlapping work — each tick fully +finishes before the next one schedules. That mirrors how a real Flink +operator backpressures: don't accept the next input until the current one +has been committed. + +## The batch builder + +`buildFeatures.js` is the demo's nightly materializer +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/nodejs/buildFeatures.js)). +It generates synthetic feature rows and calls `store.bulkLoad` once. 
The +synthesis itself is not the point — in a real deployment the equivalent code +reads from the offline store (Snowflake, BigQuery, Iceberg) and writes the +resulting hashes into Redis. + +```javascript +function synthesizeUsers(count, seed = 42) { + const rng = makeRng(seed); + const users = {}; + for (let i = 1; i <= count; i += 1) { + const uid = `u${String(i).padStart(4, "0")}`; + users[uid] = { + country_iso: rng.choice(COUNTRY_CHOICES), + risk_segment: rng.weightedChoice(RISK_SEGMENTS, RISK_WEIGHTS), + account_age_days: rng.int(7, 2400), + tx_count_7d: rng.int(0, 80), + avg_amount_30d: Number(rng.float(5, 350).toFixed(2)), + chargeback_count_180d: rng.weightedChoice( + CHARGEBACK_BUCKETS, CHARGEBACK_WEIGHTS, + ), + }; + } + return users; +} +``` + +You can run the builder on its own (independently of the demo server) to +populate Redis from the command line: + +```bash +node buildFeatures.js --count 500 --ttl-seconds 3600 +``` + +That writes 500 users at `fs:user:*` with a one-hour key-level TTL, which is +how a typical operator would pre-seed a feature store from the command line +when debugging. + +## The interactive demo + +`demoServer.js` runs a Node.js `http` server on port 8086. The HTML page lets +you: + +* **Bulk-load** any number of users (default 200) with a configurable + key-level TTL. Drop the TTL to 30 s and watch the entire store expire on + schedule — the same thing that happens if a daily refresher fails. +* See the **store state** at a glance: user count, batch / streaming TTLs, + cumulative read/write counters. +* See the **streaming worker** status (running / paused, ticks completed, + writes performed) and **pause or resume** it. Leave it paused for longer + than the streaming TTL to watch streaming fields drop out. +* Run an **inference read** for any user with a chosen feature subset, and + see the value, the per-field TTL, and the read latency. 
+* Run **batch scoring** with a pipelined `HMGET` across `N` users and see + the total elapsed time plus the per-user breakdown. +* **Inspect** any user's full hash with field-level TTLs and the key-level + TTL — the right view for debugging "why is this feature missing?" at + read time. + +The server holds one `FeatureStore` instance and one `StreamingWorker` for +the lifetime of the process. Endpoints: + +| Endpoint | What it does | +|---------------------------|-------------------------------------------------------------------------------------| +| `GET /state` | User count, TTL config, stats counters, worker status. | +| `POST /bulk-load` | Batched `HSET` + `EXPIRE` over N synthetic users with a chosen TTL. | +| `POST /worker/toggle` | Pause / resume the streaming worker. | +| `POST /read` | `HMGET` a chosen feature subset for one user; report latency and per-field TTLs. | +| `POST /batch-read` | Pipeline `HMGET` across N users; report total latency and per-entity field counts. | +| `GET /inspect` | `HGETALL` + `HTTL` for one user; full hash view with per-field TTLs. | +| `POST /reset` | Drop every user under the key prefix (used by the demo's reset button). | + +## Prerequisites + +* **Redis 7.4 or later.** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) were added in Redis 7.4; the demo + relies on per-field TTL for the mixed-staleness story. +* **Node.js 18 or later.** +* The `node-redis` client. Install it with: + + ```bash + npm install "redis@^5" + ``` + + Field-level TTL bindings (`hExpire`, `hTTL`) ship in node-redis 5. + +If your Redis server is running elsewhere, start the demo with `--redis-host` +and `--redis-port`. + +## Running the demo + +### Get the source files + +The demo consists of five files. 
Download them from the +[`nodejs` source folder](https://github.com/redis/docs/tree/main/content/develop/use-cases/feature-store/nodejs) +on GitHub, or grab them with `curl`: + +```bash +mkdir feature-store-nodejs-demo && cd feature-store-nodejs-demo +BASE=https://raw.githubusercontent.com/redis/docs/main/content/develop/use-cases/feature-store/nodejs +curl -O $BASE/package.json +curl -O $BASE/featureStore.js +curl -O $BASE/buildFeatures.js +curl -O $BASE/streamingWorker.js +curl -O $BASE/demoServer.js +npm install +``` + +### Start the demo server + +From that directory: + +```bash +node demoServer.js +``` + +You should see: + +```text +Dropping any existing users under 'fs:user:*' for a clean demo run (pass --no-reset to keep them). +Redis feature-store demo server listening on http://127.0.0.1:8086 +Using Redis at localhost:6379 with key prefix 'fs:user:' (batch TTL 86400s, streaming TTL 300s) +Materialized 200 user(s); streaming worker running. +``` + +By default the demo wipes the configured key prefix on startup so each run +starts from a clean state. Pass `--no-reset` to keep any existing data, or +`--key-prefix <prefix>` to point the demo at a different prefix entirely. + +Open [http://127.0.0.1:8086](http://127.0.0.1:8086) in a browser. Useful +things to try: + +* Pick a user and click **Read features** with a mixed batch/streaming subset + — you'll see batch fields with no per-field TTL (covered by the key-level + TTL) and streaming fields with a positive per-field TTL. +* Click **Pipeline HMGET** with `count=100` to see the latency of a 100-user + batch read. +* Click **Pause / resume** on the streaming worker and leave it paused for + ~5 minutes (or restart the server with `--streaming-ttl-seconds 30` to + make it visible in seconds). Re-run **Read features** on any user and + watch the streaming fields disappear while the batch fields stay. +* Click **Inspect** on a user to see the full hash with field-level TTLs. 
+* Click **Bulk-load** with a short TTL (say 30 seconds) and watch the user + count fall to zero within the next minute — the same thing that happens if a + daily batch run fails to land. +* Click **Reset** to drop every user and start over. + +The server is read/write against your local Redis. The default key prefix is +`fs:user:`. Pass `--no-reset` to keep existing data across restarts, or +`--redis-host` / `--redis-port` to point at a different Redis. + +## Production usage + +The guidance below focuses on the production concerns that are specific to +running a feature store on Redis. For the generic node-redis production +checklist — connection pooling, TLS and AUTH, error handling, retry policy — +see the [node-redis client guide]({{< relref "/develop/clients/nodejs" >}}) +and the +[connect-with-TLS recipe]({{< relref "/develop/clients/nodejs/connect#connect-to-your-production-redis-with-tls" >}}). +The feature-store demo runs against `localhost` with the defaults; a real +deployment should harden the client first. + +### Pick the batch TTL to outlast a failed refresher + +The whole-entity `EXPIRE` is your safety net against silent staleness from a +broken batch pipeline. Set it longer than your worst-case batch outage so a +single missed run doesn't take the feature store offline, but short enough +that a sustained outage causes loud failures (missing entities) rather than +quiet ones (yesterday's features being scored as today's). The standard +choice is twice the expected refresh interval — for a daily batch, +48 hours; for a 6-hour batch, 12 hours. + +The same logic applies to the per-field streaming TTL: a few times the +expected update interval so a slow-but-alive streaming worker doesn't churn +features needlessly, but short enough that a stalled worker causes visible +freshness failures. 
+ +### Co-locate the online store with serving, not with training + +The online store's hash representation does *not* have to match the schema in +your offline store. The batch materialization step is your chance to flatten +joins, encode categoricals, and project to whatever shape the model server +wants — so the request path is exactly one `HMGET` and zero transforms. + +The training pipeline reads from the offline store with its own schema; the +serving pipeline reads from Redis with the flattened serving schema. Deriving +both from the same feature-transformation code is what prevents training-serving +skew. + +### Pipeline batch reads across shards + +On a single Redis instance, pipelining `HMGET` across `N` users through +`multi().exec()` is one round trip. On a Redis Cluster, the keys land on +different shards — node-redis's cluster client dispatches each `hmGet` to +the right shard transparently, but you still pay one round trip per shard +rather than one for the whole batch. For very latency-sensitive batch +inference, group users by hash slot and issue one `multi().exec()` per +shard in parallel. + +For a small number of frequently-queried users (a top-N customer list, for +example), a hash tag like `fs:user:{vip}:u0001` forces the keys onto the +same shard and lets one pipeline serve them all in one round trip. + +### Make HEXPIRE part of every streaming write + +The single biggest correctness lever in this design is that the streaming +write applies `HEXPIRE` *every time*. If a streaming worker writes a field +without renewing its TTL, `HSET` clears the expiry the field had before, so +the field lives on until the key-level `EXPIRE` and the mixed-staleness invariant breaks. +Keep the `HSET` and `HEXPIRE` in the same `multi()` (or, even safer, in the +same [Lua script]({{< relref "/develop/programmability/eval-intro" >}}) if +you don't trust the call site). 
+ +### Avoid HGETALL on the request path + +`HGETALL` reads every field on the hash, including ones the model doesn't +need. With dozens of features per entity, that is wasted serialization work +on the server and wasted bandwidth on the wire. Always specify the field list +explicitly with `hmGet` in the model server. + +The exception is debugging and feature-set discovery, where you genuinely +want the full hash. The demo's "Inspect" button uses `hGetAll` for exactly +this reason. + +### Inspect the store directly with redis-cli + +When testing or troubleshooting, the cli tells you everything: + +```bash +# How many users currently in the store +redis-cli --scan --pattern 'fs:user:*' | wc -l + +# One user's full hash and key-level TTL +redis-cli HGETALL fs:user:u0001 +redis-cli TTL fs:user:u0001 + +# Per-field TTL on the streaming fields +redis-cli HTTL fs:user:u0001 FIELDS 5 \ + last_login_ts last_device_id tx_count_5m failed_logins_15m session_country + +# Sample HMGET as the model would issue it +redis-cli HMGET fs:user:u0001 risk_segment tx_count_7d avg_amount_30d tx_count_5m +``` + +A streaming field that returns `-2` from `HTTL` doesn't exist on the hash +(either it was never written, or it expired); `-1` means the field has no +TTL set (and is therefore covered only by the key-level `EXPIRE`); any +positive value is the remaining TTL in seconds. + +## Learn more + +This example uses the following Redis commands: + +* [`HSET`]({{< relref "/commands/hset" >}}) to write a feature or a whole + feature row in one call. +* [`HMGET`]({{< relref "/commands/hmget" >}}) to retrieve any subset of + features for one entity in one round trip. +* [`HGETALL`]({{< relref "/commands/hgetall" >}}) for debugging and + feature-set discovery. +* [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) for per-field TTL on streaming + features (Redis 7.4+). 
+* [`EXPIRE`]({{< relref "/commands/expire" >}}) and + [`TTL`]({{< relref "/commands/ttl" >}}) for the whole-entity TTL aligned + with the batch materialization cycle. +* Pipelined `HMGET` across many entities for batch scoring with one network + round trip — see + [transactions and pipelining]({{< relref "/develop/clients/nodejs/transpipe" >}}). + +See the [node-redis documentation]({{< relref "/develop/clients/nodejs" >}}) +for the full client reference, and the +[Hashes overview]({{< relref "/develop/data-types/hashes" >}}) for the deeper +conceptual model — including the listpack encoding that makes small hashes +particularly compact in memory, which matters at feature-store scale. diff --git a/content/develop/use-cases/feature-store/nodejs/buildFeatures.js b/content/develop/use-cases/feature-store/nodejs/buildFeatures.js new file mode 100644 index 0000000000..0b49fa97cf --- /dev/null +++ b/content/develop/use-cases/feature-store/nodejs/buildFeatures.js @@ -0,0 +1,157 @@ +#!/usr/bin/env node +"use strict"; + +/** + * Synthesize a small batch of users with realistic-looking features and + * bulk-load them into Redis with a 24-hour key-level TTL. + * + * Stands in for the nightly Spark / Feast materialization job in a real + * deployment. In production the equivalent of this script lives in an + * offline pipeline that reads from the offline store and writes the + * serving-time hashes into Redis via `HSET` + `EXPIRE`. + */ + +const { createClient } = require("redis"); +const { FeatureStore } = require("./featureStore"); + +const COUNTRY_CHOICES = [ + "US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL", +]; +const RISK_SEGMENTS = ["low", "medium", "high"]; +const RISK_WEIGHTS = [70, 25, 5]; +const CHARGEBACK_BUCKETS = [0, 1, 2, 3]; +const CHARGEBACK_WEIGHTS = [85, 10, 4, 1]; + +/** + * Deterministic LCG so the synthetic data is reproducible across runs + * without pulling in a third-party PRNG. Not for any other purpose. 
+ */ +function makeRng(seed) { + let state = (seed >>> 0) || 1; + return { + next() { + // Numerical Recipes LCG constants. + state = (Math.imul(state, 1664525) + 1013904223) >>> 0; + return state / 0x1_0000_0000; + }, + int(min, max) { + return Math.floor(this.next() * (max - min + 1)) + min; + }, + float(min, max) { + return this.next() * (max - min) + min; + }, + choice(items) { + return items[this.int(0, items.length - 1)]; + }, + weightedChoice(items, weights) { + const total = weights.reduce((a, b) => a + b, 0); + let r = this.next() * total; + for (let i = 0; i < items.length; i += 1) { + r -= weights[i]; + if (r < 0) return items[i]; + } + return items[items.length - 1]; + }, + }; +} + +/** + * Generate `count` synthetic user feature rows. + * + * The shape mirrors a small fraud-scoring feature set: country and + * risk segment as TAG-like categorical features, plus a few numeric + * aggregates over recent windows. + * + * @param {number} count + * @param {number} [seed=42] + */ +function synthesizeUsers(count, seed = 42) { + const rng = makeRng(seed); + const users = {}; + for (let i = 1; i <= count; i += 1) { + const uid = `u${String(i).padStart(4, "0")}`; + users[uid] = { + country_iso: rng.choice(COUNTRY_CHOICES), + risk_segment: rng.weightedChoice(RISK_SEGMENTS, RISK_WEIGHTS), + account_age_days: rng.int(7, 2400), + tx_count_7d: rng.int(0, 80), + avg_amount_30d: Number(rng.float(5, 350).toFixed(2)), + chargeback_count_180d: rng.weightedChoice( + CHARGEBACK_BUCKETS, + CHARGEBACK_WEIGHTS, + ), + }; + } + return users; +} + +function parseArgs(argv) { + const opts = { + redisHost: "localhost", + redisPort: 6379, + count: 200, + ttlSeconds: 24 * 60 * 60, + keyPrefix: "fs:user:", + seed: 42, + }; + for (let i = 0; i < argv.length; i += 1) { + const arg = argv[i]; + const next = () => argv[i + 1]; + switch (arg) { + case "--redis-host": opts.redisHost = next(); i += 1; break; + case "--redis-port": opts.redisPort = Number(next()); i += 1; break; + case 
"--count": opts.count = Number(next()); i += 1; break; + case "--ttl-seconds": opts.ttlSeconds = Number(next()); i += 1; break; + case "--key-prefix": opts.keyPrefix = next(); i += 1; break; + case "--seed": opts.seed = Number(next()); i += 1; break; + case "-h": + case "--help": + console.log( + "Usage: node buildFeatures.js " + + "[--redis-host H] [--redis-port P] [--count N] " + + "[--ttl-seconds S] [--key-prefix PREFIX] [--seed N]", + ); + process.exit(0); + break; + default: + console.error(`Unknown argument: ${arg}`); + process.exit(2); + } + } + return opts; +} + +async function main() { + const opts = parseArgs(process.argv.slice(2)); + + const client = createClient({ + socket: { host: opts.redisHost, port: opts.redisPort }, + }); + client.on("error", (err) => console.error("Redis client error:", err)); + await client.connect(); + + const store = new FeatureStore({ + redisClient: client, + keyPrefix: opts.keyPrefix, + batchTtlSeconds: opts.ttlSeconds, + }); + + const rows = synthesizeUsers(opts.count, opts.seed); + await store.bulkLoad(rows); + + console.log( + `Materialized ${Object.keys(rows).length} users at ${opts.keyPrefix}* ` + + `with a ${opts.ttlSeconds}s key-level TTL.`, + ); + + await client.quit(); +} + +if (require.main === module) { + main().catch((err) => { + console.error(err); + process.exit(1); + }); +} + +module.exports = { synthesizeUsers }; diff --git a/content/develop/use-cases/feature-store/nodejs/demoServer.js b/content/develop/use-cases/feature-store/nodejs/demoServer.js new file mode 100644 index 0000000000..0898b5a802 --- /dev/null +++ b/content/develop/use-cases/feature-store/nodejs/demoServer.js @@ -0,0 +1,804 @@ +#!/usr/bin/env node +"use strict"; + +/** + * Redis feature-store demo server (Node.js). 
+ * + * Run this file and visit http://localhost:8086 to watch an online + * feature store at work: a batch materialization loads N users with a + * 24-hour key-level TTL, a streaming worker overwrites a handful of + * users' real-time features every second with a per-field `HEXPIRE`, + * and the inference panel reads any subset of features for any user + * with `HMGET` in a single round trip. + * + * Use the UI to: + * + * - Bulk-load (re-materialize) the batch features, optionally with a + * short TTL so you can watch a whole entity expire on schedule. + * - Pause the streaming worker and watch the streaming fields drop + * out via `HEXPIRE` while the batch fields remain populated under + * the longer key-level TTL — the *mixed staleness* story made + * visible. + * - Pull features for one user (`HMGET`) and see the value, per-field + * TTL, and read latency. + * - Batch-score N users in one round trip and see the per-entity / + * per-round-trip latency split. + * - Inspect a single user's hash in detail with field-level TTLs. + */ + +const http = require("http"); +const { URL, URLSearchParams } = require("url"); +const { performance } = require("perf_hooks"); +const { createClient } = require("redis"); + +const { + FeatureStore, + DEFAULT_BATCH_FIELDS, + DEFAULT_STREAMING_FIELDS, +} = require("./featureStore"); +const { StreamingWorker } = require("./streamingWorker"); +const { synthesizeUsers } = require("./buildFeatures"); + + +const HTML_TEMPLATE = ` + + + + + Redis Feature Store Demo (Node.js) + + + +
+
node-redis + Node.js standard http module
+

Redis Feature Store Demo

+

+ A small fraud-scoring feature store. Each user is one Redis hash + at __KEY_PREFIX__{id} with a batch-materialized + batch half (daily aggregates, + 24-hour key-level EXPIRE) and a streaming + streaming half (real-time + signals, __STREAM_TTL__s per-field HEXPIRE). + Inference reads any subset with one HMGET; batch + scoring pipelines HMGET across N users. +

+ +
+
+

Store state

+
Loading...
+
+ +
+

Materialize batch features

+

Calls HSET + EXPIRE for each user + inside a multi(), so the whole batch ships in + one round trip.

+ + + + +

+ Drop the TTL to e.g. 30 s and watch entities disappear on + schedule — the same thing that happens if a daily refresher + fails. +

+ + +
+ +
+

Streaming worker

+

Picks __USERS_PER_TICK__ users per tick, writes the + streaming features, applies HEXPIRE + __STREAM_TTL__s per field. Pause it and the + streaming fields drop out via per-field TTL while the batch + fields stay populated.

+
+ +
+ +
+

Inference read (HMGET)

+

Pick a user and a feature subset. One HMGET + round trip returns whatever the model needs.

+
+
+ + +
+
+ + +
+
+

Feature subset

+

+ Tick to include in the HMGET. Per-field TTL is + shown next to each field in the result table. +

+
+
+

Pick a user and click Read features.

+
+
+ +
+

Batch scoring

+

Pipelined HMGET across N random users via + multi(). One network round trip for the whole + batch.

+ + + +
+

(no batch read yet)

+
+
+ +
+

Inspect one user

+

HGETALL plus per-field HTTL and + key-level TTL. Useful for spotting which + streaming fields have already expired.

+ + + +
+

(pick a user and click Inspect)

+
+
+
+ +
+
+ + + + +`; + + +class FeatureStoreDemo { + /** + * @param {object} options + * @param {FeatureStore} options.store + * @param {StreamingWorker} options.worker + * @param {number} options.seed + */ + constructor({ store, worker, seed }) { + this.store = store; + this.worker = worker; + this.seed = seed; + } + + async materialize(count, ttlSeconds) { + const rows = synthesizeUsers(count, this.seed); + const start = performance.now(); + const loaded = await this.store.bulkLoad(rows, ttlSeconds); + const elapsedMs = performance.now() - start; + return { loaded, ttl_seconds: ttlSeconds, elapsed_ms: elapsedMs }; + } + + async reset() { + // Pause the streaming worker around the DEL sweep so a concurrent + // tick can't recreate a user that was just enumerated for deletion + // (streaming HSET creates the key if it's missing, and that would + // leave behind a streaming-only hash with no key-level TTL). + const wasPaused = this.worker.paused; + if (this.worker.running && !wasPaused) this.worker.pause(); + try { + const deleted = await this.store.reset(); + this.store.resetStats(); + this.worker.resetStats(); + return { deleted }; + } finally { + if (this.worker.running && !wasPaused) this.worker.resume(); + } + } + + toggleWorker() { + if (!this.worker.running) this.worker.start(); + if (this.worker.paused) this.worker.resume(); + else this.worker.pause(); + return { paused: this.worker.paused, running: this.worker.running }; + } +} + + +function readBody(req) { + return new Promise((resolve, reject) => { + const chunks = []; + req.on("data", (c) => chunks.push(c)); + req.on("end", () => resolve(Buffer.concat(chunks).toString("utf8"))); + req.on("error", reject); + }); +} + + +function sendJson(res, payload, status = 200) { + res.writeHead(status, { "Content-Type": "application/json" }); + res.end(JSON.stringify(payload)); +} + + +function sendHtml(res, html, status = 200) { + res.writeHead(status, { "Content-Type": "text/html; charset=utf-8" }); + res.end(html); +} + + 
+function renderHtmlPage(store, worker) { + return HTML_TEMPLATE + .replaceAll("__KEY_PREFIX__", store.keyPrefix) + .replaceAll("__STREAM_TTL__", String(store.streamingTtlSeconds)) + .replaceAll("__USERS_PER_TICK__", String(worker.usersPerTick)) + .replaceAll("__BATCH_FIELDS_JSON__", JSON.stringify([...DEFAULT_BATCH_FIELDS])) + .replaceAll("__STREAM_FIELDS_JSON__", JSON.stringify([...DEFAULT_STREAMING_FIELDS])); +} + + +async function handleRequest(req, res, ctx) { + const url = new URL(req.url, `http://${req.headers.host || "localhost"}`); + const { store, worker, demo } = ctx; + + try { + if (req.method === "GET" && (url.pathname === "/" || url.pathname === "/index.html")) { + sendHtml(res, renderHtmlPage(store, worker)); + return; + } + if (req.method === "GET" && url.pathname === "/state") { + const ids = await store.listEntityIds(500); + // listEntityIds caps at 500 for the dropdown; report the true total + // separately so the UI's "users in store" doesn't silently truncate. + const entityCount = await store.countEntities(); + sendJson(res, { + key_prefix: store.keyPrefix, + batch_ttl_seconds: store.batchTtlSeconds, + streaming_ttl_seconds: store.streamingTtlSeconds, + entity_count: entityCount, + entity_ids: ids, + stats: store.stats(), + worker: worker.statsSnapshot(), + }); + return; + } + if (req.method === "GET" && url.pathname === "/inspect") { + const user = (url.searchParams.get("user") || "").trim(); + if (!user) { sendJson(res, { error: "user is required" }, 400); return; } + const full = await store.getFeatures(user, null); + if (Object.keys(full).length === 0) { + const keyTtl = await store.keyTtlSeconds(user); + sendJson(res, { exists: false, key_ttl_seconds: keyTtl }); + return; + } + const fieldNames = Object.keys(full); + const ttls = await store.fieldTtlsSeconds(user, fieldNames); + const keyTtl = await store.keyTtlSeconds(user); + const fields = fieldNames + .map((name) => ({ name, value: full[name], ttl_seconds: ttls[name] ?? 
-1 })) + .sort((a, b) => a.name.localeCompare(b.name)); + sendJson(res, { + exists: true, + key_ttl_seconds: keyTtl, + fields, + }); + return; + } + + if (req.method !== "POST") { + res.writeHead(404).end(); + return; + } + + const body = await readBody(req); + const params = new URLSearchParams(body); + + if (url.pathname === "/bulk-load") { + const count = Math.max(1, Math.min(2000, Number(params.get("count") || "200"))); + const ttl = Math.max(5, Math.min(172_800, Number(params.get("ttl") || "86400"))); + sendJson(res, await demo.materialize(count, ttl)); + return; + } + if (url.pathname === "/reset") { + sendJson(res, await demo.reset()); + return; + } + if (url.pathname === "/worker/toggle") { + sendJson(res, demo.toggleWorker()); + return; + } + if (url.pathname === "/read") { + const user = (params.get("user") || "").trim(); + if (!user) { sendJson(res, { error: "user is required" }, 400); return; } + const fields = params.getAll("field").filter((f) => f); + const start = performance.now(); + const values = fields.length ? await store.getFeatures(user, fields) : {}; + const elapsedMs = performance.now() - start; + const ttls = fields.length ? 
await store.fieldTtlsSeconds(user, fields) : {}; + const keyTtl = await store.keyTtlSeconds(user); + sendJson(res, { + requested: fields, + values, + ttls, + key_ttl_seconds: keyTtl, + returned_count: Object.keys(values).length, + elapsed_ms: elapsedMs, + }); + return; + } + if (url.pathname === "/batch-read") { + const count = Math.max(1, Math.min(500, Number(params.get("count") || "100"))); + let fields = params.getAll("field").filter((f) => f); + if (fields.length === 0) { + fields = [...DEFAULT_STREAMING_FIELDS, "risk_segment"]; + } + let ids = await store.listEntityIds(2000); + if (ids.length > count) ids = ids.slice(0, count); + const start = performance.now(); + const rows = await store.batchGetFeatures(ids, fields); + const elapsedMs = performance.now() - start; + const sample = ids.slice(0, 10).map((id) => ({ + id, + field_count: Object.keys(rows[id] || {}).length, + })); + sendJson(res, { + entity_count: ids.length, + field_count: fields.length, + elapsed_ms: elapsedMs, + sample, + }); + return; + } + + res.writeHead(404).end(); + } catch (err) { + console.error(`[demo] ${req.method} ${url.pathname} failed:`, err); + sendJson(res, { error: err.message || "internal error" }, 500); + } +} + + +function parseArgs(argv) { + const opts = { + host: "127.0.0.1", + port: 8086, + redisHost: "localhost", + redisPort: 6379, + keyPrefix: "fs:user:", + batchTtlSeconds: 24 * 60 * 60, + streamingTtlSeconds: 5 * 60, + usersPerTick: 5, + seedUsers: 200, + resetOnStart: true, + }; + for (let i = 0; i < argv.length; i += 1) { + const arg = argv[i]; + const next = () => argv[i + 1]; + switch (arg) { + case "--host": opts.host = next(); i += 1; break; + case "--port": opts.port = Number(next()); i += 1; break; + case "--redis-host": opts.redisHost = next(); i += 1; break; + case "--redis-port": opts.redisPort = Number(next()); i += 1; break; + case "--key-prefix": opts.keyPrefix = next(); i += 1; break; + case "--batch-ttl-seconds": + opts.batchTtlSeconds = Number(next()); i 
+= 1; break; + case "--streaming-ttl-seconds": + opts.streamingTtlSeconds = Number(next()); i += 1; break; + case "--users-per-tick": + opts.usersPerTick = Number(next()); i += 1; break; + case "--seed-users": opts.seedUsers = Number(next()); i += 1; break; + case "--no-reset": opts.resetOnStart = false; break; + case "-h": + case "--help": + console.log( + "Usage: node demoServer.js [--host H] [--port P] " + + "[--redis-host H] [--redis-port P] [--key-prefix PFX] " + + "[--batch-ttl-seconds S] [--streaming-ttl-seconds S] " + + "[--users-per-tick N] [--seed-users N] [--no-reset]", + ); + process.exit(0); + break; + default: + console.error(`Unknown argument: ${arg}`); + process.exit(2); + } + } + return opts; +} + + +async function main() { + const opts = parseArgs(process.argv.slice(2)); + + const client = createClient({ + socket: { host: opts.redisHost, port: opts.redisPort }, + }); + client.on("error", (err) => console.error("Redis client error:", err)); + await client.connect(); + + const store = new FeatureStore({ + redisClient: client, + keyPrefix: opts.keyPrefix, + batchTtlSeconds: opts.batchTtlSeconds, + streamingTtlSeconds: opts.streamingTtlSeconds, + }); + const worker = new StreamingWorker({ + store, + usersPerTick: opts.usersPerTick, + }); + const demo = new FeatureStoreDemo({ store, worker, seed: 42 }); + + if (opts.resetOnStart) { + console.log( + `Dropping any existing users under '${opts.keyPrefix}*' for a` + + " clean demo run (pass --no-reset to keep them).", + ); + await store.reset(); + store.resetStats(); + } + const { loaded: seeded } = await demo.materialize( + opts.seedUsers, + opts.batchTtlSeconds, + ); + + worker.start(); + + const server = http.createServer((req, res) => { + handleRequest(req, res, { store, worker, demo }).catch((err) => { + console.error("[demo] handler crashed:", err); + try { res.writeHead(500).end(); } catch (_) { /* socket already closed */ } + }); + }); + + await new Promise((resolve) => server.listen(opts.port, 
opts.host, resolve)); + console.log( + `Redis feature-store demo server listening on http://${opts.host}:${opts.port}`, + ); + console.log( + `Using Redis at ${opts.redisHost}:${opts.redisPort}` + + ` with key prefix '${opts.keyPrefix}'` + + ` (batch TTL ${opts.batchTtlSeconds}s,` + + ` streaming TTL ${opts.streamingTtlSeconds}s)`, + ); + console.log(`Materialized ${seeded} user(s); streaming worker running.`); + + const shutdown = async (signal) => { + console.log(`\nReceived ${signal}, shutting down...`); + await worker.stop(); + server.close(); + await client.quit(); + process.exit(0); + }; + process.on("SIGINT", () => shutdown("SIGINT")); + process.on("SIGTERM", () => shutdown("SIGTERM")); +} + + +if (require.main === module) { + main().catch((err) => { + console.error(err); + process.exit(1); + }); +} diff --git a/content/develop/use-cases/feature-store/nodejs/featureStore.js b/content/develop/use-cases/feature-store/nodejs/featureStore.js new file mode 100644 index 0000000000..5ec1467593 --- /dev/null +++ b/content/develop/use-cases/feature-store/nodejs/featureStore.js @@ -0,0 +1,451 @@ +"use strict"; + +/** + * Redis online feature store backed by per-entity Hashes. + * + * Each entity (here, a user) lives at a deterministic key such as + * `fs:user:{id}`. The hash holds every feature for that entity as one + * field per feature — batch-materialized aggregates (refreshed on a + * daily cycle) alongside streaming-updated signals (refreshed every + * few seconds). One `HMGET` returns whichever subset the model needs + * in one network round trip. + * + * Two TTL layers solve the *mixed staleness* problem: + * + * - A key-level `EXPIRE` aligned with the batch materialization + * cycle causes the whole entity to disappear if its batch + * refresher fails, so inference sees a missing entity (which the + * model handler can detect and fall back on) rather than silently + * outdated values. 
+ * - A per-field `HEXPIRE` on each streaming field gives that field + * its own shorter expiry, independent of the rest of the hash. + * When the streaming pipeline stops updating a field, the field + * self-cleans while the rest of the entity stays populated. + * + * `HEXPIRE` and `HTTL` require Redis 7.4 or later. node-redis 5 + * exposes them as `hExpire` and `hTTL`. + * + * Concurrency is by construction: Redis is single-threaded per shard, + * so overlapping `HSET` calls from a batch job and a streaming worker + * on the same entity hash are applied atomically without locks or + * version columns. + */ + +/** + * @typedef {string|number|boolean} FeatureValue + * @typedef {Record<string, FeatureValue>} FeatureMap + */ + +/** + * Default batch feature schema. Daily aggregates computed offline and + * bulk-loaded once per materialization cycle. + */ +const DEFAULT_BATCH_FIELDS = Object.freeze([ + "country_iso", + "risk_segment", + "account_age_days", + "tx_count_7d", + "avg_amount_30d", + "chargeback_count_180d", +]); + +/** + * Default streaming feature schema. Updated by the streaming worker as + * new events arrive, with a per-field TTL so each field self-expires + * when its upstream pipeline stops. + */ +const DEFAULT_STREAMING_FIELDS = Object.freeze([ + "last_login_ts", + "last_device_id", + "tx_count_5m", + "failed_logins_15m", + "session_country", +]); + +/** + * Encode a feature value as a string for hash storage. + * + * Booleans become `"true"` / `"false"` (not `"True"` / `"False"`) so + * they round-trip cleanly through other clients and `redis-cli`. + * + * @param {FeatureValue} value + * @returns {string} + */ +function encode(value) { + if (typeof value === "boolean") return value ?
"true" : "false"; + return String(value); +} + +class FeatureStore { + /** + * @param {object} options + * @param {import("redis").RedisClientType} options.redisClient + * @param {string} [options.keyPrefix="fs:user:"] + * @param {number} [options.batchTtlSeconds=86400] + * @param {number} [options.streamingTtlSeconds=300] + */ + constructor({ + redisClient, + keyPrefix = "fs:user:", + batchTtlSeconds = 24 * 60 * 60, + streamingTtlSeconds = 5 * 60, + } = {}) { + if (!redisClient) { + throw new Error("A connected redisClient is required."); + } + this.redis = redisClient; + this.keyPrefix = keyPrefix; + this.batchTtlSeconds = batchTtlSeconds; + this.streamingTtlSeconds = streamingTtlSeconds; + + // Node.js is single-threaded for JS execution, so plain numbers + // are safe for counters. No lock needed. + this.batchWritesTotal = 0; + this.streamingWritesTotal = 0; + this.readsTotal = 0; + this.readFieldsTotal = 0; + } + + // --- Key helpers --------------------------------------------------- + + /** @param {string} entityId */ + keyFor(entityId) { + return `${this.keyPrefix}${entityId}`; + } + + // --- Batch ingestion (materialization) ----------------------------- + + /** + * Materialize a batch of entities into Redis. + * + * `rows` is `{entityId: {field: value, ...}}`. One `HSET` plus one + * `EXPIRE` per entity, all batched into a single round trip through + * `multi().exec()`. The key-level `EXPIRE` is what makes the whole + * entity disappear if a future batch run fails — inference reads + * the missing entity rather than silently outdated values. + * + * @param {Record<string, FeatureMap>} rows + * @param {number} [ttlSeconds] + * @returns {Promise<number>} + */ + async bulkLoad(rows, ttlSeconds) { + const ttl = ttlSeconds ??
this.batchTtlSeconds; + const ids = Object.keys(rows); + if (ids.length === 0) return 0; + + const pipe = this.redis.multi(); + for (const entityId of ids) { + const key = this.keyFor(entityId); + const fields = rows[entityId]; + const encoded = {}; + for (const [name, value] of Object.entries(fields)) { + encoded[name] = encode(value); + } + pipe.hSet(key, encoded); + pipe.expire(key, ttl); + } + await pipe.exec(); + this.batchWritesTotal += ids.length; + return ids.length; + } + + /** + * Update a single batch feature without touching the key TTL. + * + * Used by the demo's "manually refresh one user" lever; in a real + * pipeline batch updates always flow through `bulkLoad`. + * + * @param {string} entityId + * @param {string} field + * @param {FeatureValue} value + * @returns {Promise<void>} + */ + async updateBatchFeature(entityId, field, value) { + await this.redis.hSet(this.keyFor(entityId), field, encode(value)); + this.batchWritesTotal += 1; + } + + // --- Streaming ingestion ------------------------------------------- + + /** + * Write streaming features with a per-field TTL. + * + * Each field carries its own `HEXPIRE` so it self-expires + * independently of the rest of the hash. If the streaming pipeline + * stops, the streaming fields drop out while the batch-materialized + * fields remain populated under their longer key-level `EXPIRE`. + * + * `HEXPIRE` returns one status code per field: 1 = TTL set, + * 2 = skipped under a conditional flag, 0 = no such field, + * -2 = no such key. We just `HSET` every field on the same call, + * so any code other than 1 means the per-field TTL invariant did + * not hold — the mixed-staleness story relies on every streaming + * field carrying a fresh TTL after the write, so failure is loud.
+ * + * @param {string} entityId + * @param {FeatureMap} fields + * @param {number} [ttlSeconds] + * @returns {Promise<void>} + */ + async updateStreaming(entityId, fields, ttlSeconds) { + const names = Object.keys(fields); + if (names.length === 0) return; + const ttl = ttlSeconds ?? this.streamingTtlSeconds; + const key = this.keyFor(entityId); + const encoded = {}; + for (const [name, value] of Object.entries(fields)) { + encoded[name] = encode(value); + } + + const [, expireResult] = await this.redis + .multi() + .hSet(key, encoded) + .hExpire(key, names, ttl) + .exec(); + if (!Array.isArray(expireResult) || + expireResult.some((code) => Number(code) !== 1)) { + throw new Error( + `HEXPIRE did not set every field TTL for ${key}: ` + + JSON.stringify(expireResult), + ); + } + this.streamingWritesTotal += names.length; + } + + // --- Inference reads ----------------------------------------------- + + /** + * Retrieve a subset of features for one entity. + * + * `HMGET` returns the requested fields in one round trip. Pass + * `fieldNames=null` (the default) to fetch the entire hash with + * `HGETALL` — useful for debugging but rarely the right call on the + * request path, where the model knows exactly which features it + * consumes.
+ * + * @param {string} entityId + * @param {string[] | null} [fieldNames] + * @returns {Promise<Record<string, string>>} + */ + async getFeatures(entityId, fieldNames = null) { + const key = this.keyFor(entityId); + if (fieldNames === null || fieldNames === undefined) { + const data = await this.redis.hGetAll(key); + this.readsTotal += 1; + this.readFieldsTotal += Object.keys(data).length; + return data; + } + const names = [...fieldNames]; + if (names.length === 0) return {}; + const values = await this.redis.hmGet(key, names); + const out = {}; + let returned = 0; + for (let i = 0; i < names.length; i += 1) { + const v = values[i]; + if (v !== null && v !== undefined) { + out[names[i]] = v; + returned += 1; + } + } + this.readsTotal += 1; + this.readFieldsTotal += returned; + return out; + } + + /** + * Pipeline `HMGET` across many entities for batch scoring. + * + * Hundreds of entities in one round trip. The model can then score + * them all without further network calls. + * + * @param {Iterable<string>} entityIds + * @param {Iterable<string>} fieldNames + * @returns {Promise<Record<string, Record<string, string>>>} + */ + async batchGetFeatures(entityIds, fieldNames) { + const ids = [...entityIds]; + const names = [...fieldNames]; + if (ids.length === 0 || names.length === 0) return {}; + + const pipe = this.redis.multi(); + for (const entityId of ids) { + pipe.hmGet(this.keyFor(entityId), names); + } + const rows = await pipe.exec(); + + const out = {}; + let seenFields = 0; + for (let i = 0; i < ids.length; i += 1) { + const values = rows[i] || []; + const row = {}; + for (let j = 0; j < names.length; j += 1) { + const v = values[j]; + if (v !== null && v !== undefined) { + row[names[j]] = v; + seenFields += 1; + } + } + out[ids[i]] = row; + } + this.readsTotal += ids.length; + this.readFieldsTotal += seenFields; + return out; + } + + // --- TTL inspection (used by the demo UI) -------------------------- + + /** + * Seconds until the entity key expires.
+ * + * Returns `-1` if no key-level TTL is set, `-2` if the key doesn't + * exist. + * + * @param {string} entityId + * @returns {Promise<number>} + */ + async keyTtlSeconds(entityId) { + return Number(await this.redis.ttl(this.keyFor(entityId))); + } + + /** + * Per-field TTL via `HTTL` (Redis 7.4+). + * + * Each value mirrors the `TTL` convention: positive means seconds + * remaining, `-1` means no TTL on the field, `-2` means the field + * doesn't exist on this hash (or the key itself is missing). + * + * Normalized for forward-compat: some client versions can report + * `null` for a missing key or a singleton list-of-list in pipeline + * contexts. Both shapes collapse back to the flat list shape that + * matches the field order passed in. + * + * @param {string} entityId + * @param {Iterable<string>} fieldNames + * @returns {Promise<Record<string, number>>} + */ + async fieldTtlsSeconds(entityId, fieldNames) { + const names = [...fieldNames]; + if (names.length === 0) return {}; + let ttls = await this.redis.hTTL(this.keyFor(entityId), names); + if (ttls === null || ttls === undefined) { + ttls = names.map(() => -2); + } else if (Array.isArray(ttls) && ttls.length === 1 && Array.isArray(ttls[0])) { + ttls = ttls[0]; + } + const out = {}; + for (let i = 0; i < names.length; i += 1) { + out[names[i]] = Number(ttls[i]); + } + return out; + } + + // --- Demo housekeeping --------------------------------------------- + + /** + * Enumerate entity IDs by scanning `keyPrefix*`. + * + * `SCAN` is non-blocking; the demo uses it to populate UI dropdowns, + * not as a serving primitive. + * + * @param {number} [limit=200] + * @returns {Promise<string[]>} + */ + async listEntityIds(limit = 200) { + const ids = []; + const prefixLen = this.keyPrefix.length; + for await (const key of this.redis.scanIterator({ + MATCH: `${this.keyPrefix}*`, + COUNT: 200, + })) { + // node-redis 5 yields each batch as an array; older majors yield one key.
+ if (Array.isArray(key)) { + for (const k of key) { + ids.push(k.slice(prefixLen)); + if (ids.length >= limit) break; + } + } else { + ids.push(key.slice(prefixLen)); + } + if (ids.length >= limit) break; + } + ids.sort(); + return ids.slice(0, limit); + } + + /** + * Count entities currently in the store (via `SCAN`). + * @returns {Promise<number>} + */ + async countEntities() { + let count = 0; + for await (const key of this.redis.scanIterator({ + MATCH: `${this.keyPrefix}*`, + COUNT: 500, + })) { + count += Array.isArray(key) ? key.length : 1; + } + return count; + } + + /** + * @param {string} entityId + * @returns {Promise<number>} + */ + async deleteEntity(entityId) { + return Number(await this.redis.del(this.keyFor(entityId))); + } + + /** + * Drop every entity under `keyPrefix`. Used by the demo reset path. + * + * Scans in batches and issues one variadic `DEL` per batch, so a + * large demo dataset doesn't land on the server as one giant + * synchronous delete. + * + * @returns {Promise<number>} + */ + async reset() { + let deleted = 0; + let batch = []; + const flush = async () => { + if (batch.length === 0) return; + deleted += Number(await this.redis.del(batch)); + batch = []; + }; + for await (const key of this.redis.scanIterator({ + MATCH: `${this.keyPrefix}*`, + COUNT: 500, + })) { + if (Array.isArray(key)) { + batch.push(...key); + } else { + batch.push(key); + } + if (batch.length >= 500) await flush(); + } + await flush(); + return deleted; + } + + stats() { + return { + batch_writes_total: this.batchWritesTotal, + streaming_writes_total: this.streamingWritesTotal, + reads_total: this.readsTotal, + read_fields_total: this.readFieldsTotal, + }; + } + + resetStats() { + this.batchWritesTotal = 0; + this.streamingWritesTotal = 0; + this.readsTotal = 0; + this.readFieldsTotal = 0; + } +} + +module.exports = { + FeatureStore, + DEFAULT_BATCH_FIELDS, + DEFAULT_STREAMING_FIELDS, +}; diff --git a/content/develop/use-cases/feature-store/nodejs/package.json
b/content/develop/use-cases/feature-store/nodejs/package.json new file mode 100644 index 0000000000..0cc11c8a7b --- /dev/null +++ b/content/develop/use-cases/feature-store/nodejs/package.json @@ -0,0 +1,17 @@ +{ + "name": "redis-feature-store-nodejs-demo", + "version": "1.0.0", + "private": true, + "description": "Redis online feature store demo with node-redis and the Node.js standard http module.", + "main": "demoServer.js", + "scripts": { + "start": "node demoServer.js", + "build": "node buildFeatures.js" + }, + "dependencies": { + "redis": "^5.0.0" + }, + "engines": { + "node": ">=18" + } +} diff --git a/content/develop/use-cases/feature-store/nodejs/streamingWorker.js b/content/develop/use-cases/feature-store/nodejs/streamingWorker.js new file mode 100644 index 0000000000..4b381965e0 --- /dev/null +++ b/content/develop/use-cases/feature-store/nodejs/streamingWorker.js @@ -0,0 +1,178 @@ +"use strict"; + +/** + * Streaming feature updater for the demo. + * + * Stands in for whatever Flink, Kafka Streams, or bespoke service + * computes the real-time features in a real deployment. In production + * this code lives in the streaming layer; here it runs as an async + * timer next to the demo server so the page can start, pause, and + * resume it from the UI. + * + * Every tick it picks a few random users and writes a new value for + * each streaming feature, with a per-field `HEXPIRE` so the field + * self-expires if the worker is paused. Pause the worker for longer + * than `streamingTtlSeconds` and the streaming fields drop out of the + * hash while the batch fields remain populated under the longer + * key-level TTL — the *mixed staleness* story made visible. 
+ */ + +const DEVICE_IDS = [ + "ios-1a4c", "ios-9f02", "and-7b21", "and-2d18", + "web-chr-1", "web-saf-1", "web-ff-2", +]; +const SESSION_COUNTRIES = [ + "US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL", +]; +const FAILED_LOGIN_BUCKETS = [0, 1, 2, 5]; +const FAILED_LOGIN_WEIGHTS = [70, 20, 8, 2]; + +function makeRng(seed) { + let state = (seed >>> 0) || 1; + return { + next() { + state = (Math.imul(state, 1664525) + 1013904223) >>> 0; + return state / 0x1_0000_0000; + }, + int(min, max) { + return Math.floor(this.next() * (max - min + 1)) + min; + }, + choice(items) { + return items[this.int(0, items.length - 1)]; + }, + weightedChoice(items, weights) { + const total = weights.reduce((a, b) => a + b, 0); + let r = this.next() * total; + for (let i = 0; i < items.length; i += 1) { + r -= weights[i]; + if (r < 0) return items[i]; + } + return items[items.length - 1]; + }, + sample(items, k) { + const pool = [...items]; + const out = []; + const n = Math.min(k, pool.length); + for (let i = 0; i < n; i += 1) { + const idx = this.int(0, pool.length - 1); + out.push(pool.splice(idx, 1)[0]); + } + return out; + }, + }; +} + +class StreamingWorker { + /** + * @param {object} options + * @param {import("./featureStore").FeatureStore} options.store + * @param {number} [options.tickSeconds=1.0] + * @param {number} [options.usersPerTick=5] + * @param {number} [options.seed=1337] + */ + constructor({ store, tickSeconds = 1.0, usersPerTick = 5, seed = 1337 } = {}) { + if (!store) throw new Error("store is required"); + this.store = store; + this.tickSeconds = tickSeconds; + this.usersPerTick = usersPerTick; + this.rng = makeRng(seed); + + this.running = false; + this.paused = false; + this.tickCount = 0; + this.writesCount = 0; + this._timer = null; + this._tickInFlight = false; + } + + // --- Lifecycle ----------------------------------------------------- + + start() { + if (this.running) return; + this.running = true; + this.paused = false; + this._schedule(); + } + 
+ async stop() { + this.running = false; + if (this._timer) { + clearTimeout(this._timer); + this._timer = null; + } + // Wait for any in-flight tick to settle so we don't leak a write + // that completes after the caller has moved on. + while (this._tickInFlight) await new Promise((r) => setTimeout(r, 20)); + } + + pause() { this.paused = true; } + resume() { this.paused = false; } + + // --- Tick ---------------------------------------------------------- + + _schedule() { + if (!this.running) return; + this._timer = setTimeout( + () => this._run().catch((err) => + console.error("[streaming-worker] tick failed:", err), + ), + this.tickSeconds * 1000, + ); + } + + async _run() { + if (!this.running) return; + if (this.paused) { + this._schedule(); + return; + } + this._tickInFlight = true; + try { + await this._tick(); + } finally { + this._tickInFlight = false; + this._schedule(); + } + } + + async _tick() { + const ids = await this.store.listEntityIds(500); + if (ids.length === 0) return; + const chosen = this.rng.sample(ids, this.usersPerTick); + const nowMs = Date.now(); + let writes = 0; + for (const entityId of chosen) { + const fields = { + last_login_ts: nowMs, + last_device_id: this.rng.choice(DEVICE_IDS), + tx_count_5m: this.rng.int(0, 12), + failed_logins_15m: this.rng.weightedChoice( + FAILED_LOGIN_BUCKETS, FAILED_LOGIN_WEIGHTS, + ), + session_country: this.rng.choice(SESSION_COUNTRIES), + }; + await this.store.updateStreaming(entityId, fields); + writes += Object.keys(fields).length; + } + this.tickCount += 1; + this.writesCount += writes; + } + + // --- Stats --------------------------------------------------------- + + statsSnapshot() { + return { + running: this.running, + paused: this.paused, + tick_count: this.tickCount, + writes_count: this.writesCount, + }; + } + + resetStats() { + this.tickCount = 0; + this.writesCount = 0; + } +} + +module.exports = { StreamingWorker }; From cf2472caec7ed719a859334a1f6796f4eee45632 Mon Sep 17 00:00:00 2001 
From: Andy Stark Date: Fri, 29 May 2026 14:18:56 +0100 Subject: [PATCH 04/20] DOC-6661 Codex review issues --- .../use-cases/feature-store/nodejs/_index.md | 49 ++++++++++++------- .../feature-store/nodejs/demoServer.js | 9 +++- .../feature-store/nodejs/featureStore.js | 17 ++++--- .../feature-store/nodejs/streamingWorker.js | 14 ++++++ .../feature-store/redis-py/demo_server.py | 1 + 5 files changed, 64 insertions(+), 26 deletions(-) diff --git a/content/develop/use-cases/feature-store/nodejs/_index.md b/content/develop/use-cases/feature-store/nodejs/_index.md index 06c38e9d48..150c06967a 100644 --- a/content/develop/use-cases/feature-store/nodejs/_index.md +++ b/content/develop/use-cases/feature-store/nodejs/_index.md @@ -225,11 +225,15 @@ async bulkLoad(rows, ttlSeconds) { } ``` -`multi()` in node-redis 5 wraps the batched commands in `MULTI/EXEC`, so the -whole batch runs as one transaction on the server. That gives all-or-nothing -semantics inside the batch but does block the server for its duration, which -is what you want for an ingestion script that runs end-to-end — not for a -hot-path serving call. (See +`multi().exec()` in node-redis 5 wraps the batched commands in `MULTI/EXEC`, +so Redis runs the queued commands contiguously and returns the replies in +order. Note that Redis transactions do *not* roll back commands that already +succeeded if a later command returns an error — node-redis surfaces those +errors by rejecting `exec()` with a `MultiErrorReply` whose `replies` array +still contains the successful results. For independent bulk-ingestion +commands that don't need the `MULTI/EXEC` wrapper at all, +`multi().execAsPipeline()` ships the same batch in one round trip with +slightly lower server-side overhead. (See [transactions and pipelining]({{< relref "/develop/clients/nodejs/transpipe" >}}) for the full mental model.) @@ -367,14 +371,19 @@ One round trip for the whole batch — the demo regularly returns 100 users in 2-3 ms against a local Redis. 
On a real network the round trip dominates; pipelining is what keeps batch scoring practical. -For very large batches on a clustered deployment, the same shape generalizes -to one pipeline per shard. node-redis's +For very large batches on a clustered deployment, the shape changes: a single +`multi().exec()` is bound to one shard, because `MULTI/EXEC` cannot span hash +slots, so the same `batchGetFeatures` call can only serve keys that hash to +the same shard. node-redis's [cluster client](https://github.com/redis/node-redis/blob/master/docs/clustering.md) -dispatches the per-user `hmGet` calls to the right shard transparently — you -still pay one round trip per shard rather than one for the whole batch. For -very latency-sensitive batch inference, group users by hash slot -(`cluster.calculateSlot(key)`) and issue one `multi().exec()` per shard in -parallel. +routes non-pipelined `hmGet` calls to the right shard transparently — so on a +cluster, fan out `await Promise.all(ids.map((id) => client.hmGet(...)))` and +the client pipelines per-shard for you. For very latency-sensitive batch +inference where the request-side cost of that fan-out matters, group the IDs +by hash slot ahead of time and issue one `multi().exec()` per shard in +parallel: each shard's batch then runs as one round trip. A hash tag like +`fs:user:{vip}:u0001` forces a known set of keys onto the same shard so one +`multi()` can cover all of them in a single round trip. ## The streaming worker @@ -620,16 +629,18 @@ skew. ### Pipeline batch reads across shards On a single Redis instance, pipelining `HMGET` across `N` users through -`multi().exec()` is one round trip. On a Redis Cluster, the keys land on -different shards — node-redis's cluster client dispatches each `hmGet` to -the right shard transparently, but you still pay one round trip per shard -rather than one for the whole batch. 
For very latency-sensitive batch -inference, group users by hash slot and issue one `multi().exec()` per -shard in parallel. +`multi().exec()` is one round trip. A Redis Cluster is different in two ways: +`MULTI/EXEC` is bound to one shard, so a single `multi()` cannot span keys +that hash to different shards; and the keys for a typical user batch will +land on multiple shards. For batch reads on a cluster, fan out parallel +`hmGet` calls with `Promise.all` — node-redis's cluster client pipelines +the calls per-shard automatically — or, for tighter control, group the IDs +by hash slot ahead of time and issue one `multi().exec()` per shard in +parallel. For a small number of frequently-queried users (a top-N customer list, for example), a hash tag like `fs:user:{vip}:u0001` forces the keys onto the -same shard and lets one pipeline serve them all in one round trip. +same shard and lets one `multi().exec()` serve them all in one round trip. ### Make HEXPIRE part of every streaming write diff --git a/content/develop/use-cases/feature-store/nodejs/demoServer.js b/content/develop/use-cases/feature-store/nodejs/demoServer.js index 0898b5a802..ff69a35e9e 100644 --- a/content/develop/use-cases/feature-store/nodejs/demoServer.js +++ b/content/develop/use-cases/feature-store/nodejs/demoServer.js @@ -373,6 +373,7 @@ const HTML_TEMPLATE = ` if (!confirm("Drop every user from the store?")) return; const r = await fetch("/reset", { method: "POST" }); const d = await r.json(); + if (!r.ok) { setStatus(d.error || "Reset failed.", "error"); return; } setStatus(\`Reset. Dropped \${d.deleted} user(s).\`, "ok"); await refresh(); }); @@ -506,8 +507,14 @@ class FeatureStoreDemo { // tick can't recreate a user that was just enumerated for deletion // (streaming HSET creates the key if it's missing, and that would // leave behind a streaming-only hash with no key-level TTL). 
+ // pause() only blocks *future* ticks — we also have to await + // waitForIdle() so an already-running tick finishes its + // updateStreaming loop before we start enumerating keys. const wasPaused = this.worker.paused; - if (this.worker.running && !wasPaused) this.worker.pause(); + if (this.worker.running) { + if (!wasPaused) this.worker.pause(); + await this.worker.waitForIdle(); + } try { const deleted = await this.store.reset(); this.store.resetStats(); diff --git a/content/develop/use-cases/feature-store/nodejs/featureStore.js b/content/develop/use-cases/feature-store/nodejs/featureStore.js index 5ec1467593..b044aba7d1 100644 --- a/content/develop/use-cases/feature-store/nodejs/featureStore.js +++ b/content/develop/use-cases/feature-store/nodejs/featureStore.js @@ -175,12 +175,17 @@ class FeatureStore { * stops, the streaming fields drop out while the batch-materialized * fields remain populated under their longer key-level `EXPIRE`. * - * `HEXPIRE` returns one status code per field: 1 = TTL set, - * 2 = skipped under a conditional flag, 0 = no such field, - * -2 = no such key. We just `HSET` every field on the same call, - * so any code other than 1 means the per-field TTL invariant did - * not hold — the mixed-staleness story relies on every streaming - * field carrying a fresh TTL after the write, so failure is loud. + * `HEXPIRE` returns one status code per field: + * 1 = TTL set / updated, + * 2 = the expiry was 0 or in the past, so Redis deleted the field + * instead of applying a TTL, + * 0 = an `NX | XX | GT | LT` conditional flag was specified and not + * met (we never use one here), + * -2 = no such field, or no such key. + * We just `HSET` every field on the same call, so any code other + * than 1 means the per-field TTL invariant did not hold — the + * mixed-staleness story relies on every streaming field carrying a + * fresh TTL after the write, so failure is loud. 
* * @param {string} entityId * @param {FeatureMap} fields diff --git a/content/develop/use-cases/feature-store/nodejs/streamingWorker.js b/content/develop/use-cases/feature-store/nodejs/streamingWorker.js index 4b381965e0..d3b52f0786 100644 --- a/content/develop/use-cases/feature-store/nodejs/streamingWorker.js +++ b/content/develop/use-cases/feature-store/nodejs/streamingWorker.js @@ -102,6 +102,20 @@ class StreamingWorker { } // Wait for any in-flight tick to settle so we don't leak a write // that completes after the caller has moved on. + await this.waitForIdle(); + } + + /** + * Wait until any in-flight tick has finished its current `await` + * sequence. `pause()` only stops *future* ticks from running — it + * does not interrupt one that is already mid-flight. Callers that + * need a quiesced worker (a reset that's about to DEL every entity, + * for example) must pause AND await this before they touch state + * the tick might still be writing to. + * + * @returns {Promise} + */ + async waitForIdle() { while (this._tickInFlight) await new Promise((r) => setTimeout(r, 20)); } diff --git a/content/develop/use-cases/feature-store/redis-py/demo_server.py b/content/develop/use-cases/feature-store/redis-py/demo_server.py index 268c0e9db1..d09f48ad00 100644 --- a/content/develop/use-cases/feature-store/redis-py/demo_server.py +++ b/content/develop/use-cases/feature-store/redis-py/demo_server.py @@ -382,6 +382,7 @@ if (!confirm("Drop every user from the store?")) return; const r = await fetch("/reset", { method: "POST" }); const d = await r.json(); + if (!r.ok) { setStatus(d.error || "Reset failed.", "error"); return; } setStatus(`Reset. 
Dropped ${d.deleted} user(s).`, "ok"); await refresh(); }); From f78e88cdb50a4d041e80ab706c546f70ed958111 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Fri, 29 May 2026 15:51:55 +0100 Subject: [PATCH 05/20] DOC-6661 Go and Jedis after Codex review --- .../develop/use-cases/feature-store/_index.md | 2 + .../use-cases/feature-store/go/_index.md | 751 ++++++++++++ .../feature-store/go/build_features.go | 116 ++ .../go/cmd/build_features/main.go | 18 + .../feature-store/go/cmd/demo_server/main.go | 22 + .../use-cases/feature-store/go/demo_server.go | 954 ++++++++++++++++ .../feature-store/go/feature_store.go | 495 ++++++++ .../develop/use-cases/feature-store/go/go.mod | 11 + .../develop/use-cases/feature-store/go/go.sum | 22 + .../feature-store/go/streaming_worker.go | 231 ++++ .../java-jedis/BuildFeatures.java | 113 ++ .../feature-store/java-jedis/DemoServer.java | 1014 +++++++++++++++++ .../java-jedis/FeatureStore.java | 450 ++++++++ .../java-jedis/StreamingWorker.java | 220 ++++ .../feature-store/java-jedis/_index.md | 735 ++++++++++++ .../feature-store/java-jedis/pom.xml | 88 ++ 16 files changed, 5242 insertions(+) create mode 100644 content/develop/use-cases/feature-store/go/_index.md create mode 100644 content/develop/use-cases/feature-store/go/build_features.go create mode 100644 content/develop/use-cases/feature-store/go/cmd/build_features/main.go create mode 100644 content/develop/use-cases/feature-store/go/cmd/demo_server/main.go create mode 100644 content/develop/use-cases/feature-store/go/demo_server.go create mode 100644 content/develop/use-cases/feature-store/go/feature_store.go create mode 100644 content/develop/use-cases/feature-store/go/go.mod create mode 100644 content/develop/use-cases/feature-store/go/go.sum create mode 100644 content/develop/use-cases/feature-store/go/streaming_worker.go create mode 100644 content/develop/use-cases/feature-store/java-jedis/BuildFeatures.java create mode 100644 
content/develop/use-cases/feature-store/java-jedis/DemoServer.java create mode 100644 content/develop/use-cases/feature-store/java-jedis/FeatureStore.java create mode 100644 content/develop/use-cases/feature-store/java-jedis/StreamingWorker.java create mode 100644 content/develop/use-cases/feature-store/java-jedis/_index.md create mode 100644 content/develop/use-cases/feature-store/java-jedis/pom.xml diff --git a/content/develop/use-cases/feature-store/_index.md b/content/develop/use-cases/feature-store/_index.md index dcfb2cda24..65d8ca2d9c 100644 --- a/content/develop/use-cases/feature-store/_index.md +++ b/content/develop/use-cases/feature-store/_index.md @@ -157,3 +157,5 @@ for a single user under 1 ms, and pipeline batch reads across a hundred users. * [redis-py (Python)]({{< relref "/develop/use-cases/feature-store/redis-py" >}}) * [node-redis (Node.js)]({{< relref "/develop/use-cases/feature-store/nodejs" >}}) +* [go-redis (Go)]({{< relref "/develop/use-cases/feature-store/go" >}}) +* [Jedis (Java)]({{< relref "/develop/use-cases/feature-store/java-jedis" >}}) diff --git a/content/develop/use-cases/feature-store/go/_index.md b/content/develop/use-cases/feature-store/go/_index.md new file mode 100644 index 0000000000..b8be523e93 --- /dev/null +++ b/content/develop/use-cases/feature-store/go/_index.md @@ -0,0 +1,751 @@ +--- +categories: +- docs +- develop +- stack +- oss +- rs +- rc +description: Build a Redis-backed online feature store in Go with go-redis +linkTitle: go-redis example (Go) +title: Redis feature store with go-redis +weight: 3 +--- + +This guide shows you how to build a small Redis-backed online feature store in +Go with [`go-redis`]({{< relref "/develop/clients/go" >}}). 
It includes a +local web server built with Go's standard `net/http` package so you can +bulk-load a batch of users with a key-level TTL, run a streaming worker that +overwrites real-time features with per-field TTL, retrieve any subset of +features for one user under 1 ms, and pipeline `HMGET` across a hundred users +for batch scoring. + +## Overview + +Each entity (here, a user) is one Redis +[Hash]({{< relref "/develop/data-types/hashes" >}}) at a deterministic key — +`fs:user:{id}`. The hash holds every feature for that entity as one field per +feature: batch-materialized aggregates (refreshed once a day) alongside +streaming-updated signals (refreshed every few seconds). One +[`HMGET`]({{< relref "/commands/hmget" >}}) returns whichever subset the model +needs in one network round trip. + +Two TTL layers solve the *mixed staleness* problem without an application-side +cleaner: + +* A **key-level** [`EXPIRE`]({{< relref "/commands/expire" >}}) aligned with the + batch materialization cycle (24 hours in the demo). If the batch refresher + fails, the whole entity disappears at the next cycle and inference sees a + missing entity — which the model handler can detect and fall back on — + rather than silently outdated values. +* A **per-field** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) (Redis 7.4+) on + each streaming feature gives that field its own shorter expiry, independent + of the rest of the hash. If the streaming pipeline stops updating a feature, + the field self-cleans while the batch fields stay populated. + +In this example, the batch features describe a user's longer-term shape +(`country_iso`, `risk_segment`, `account_age_days`, `tx_count_7d`, +`avg_amount_30d`, `chargeback_count_180d`) and are bulk-loaded by +`build_features.go` — the demo's stand-in for a nightly Spark / Feast +materialization job. 
The streaming features describe what the user is doing +right now (`last_login_ts`, `last_device_id`, `tx_count_5m`, +`failed_logins_15m`, `session_country`) and are written by +`streaming_worker.go` — the demo's stand-in for a Flink / Kafka Streams job. +The inference handlers of the demo server read any subset of those features +through `feature_store.go`'s helper type. + +That gives you: + +* A single round trip for retrieval — any subset of features for one entity in + one [`HMGET`]({{< relref "/commands/hmget" >}}). +* Sub-millisecond hot path. The Redis-side work is microseconds; in practice + the bottleneck is the network round trip plus the model's own feature-prep. +* Pipelined batch scoring — one round trip for `N` users at once. +* Independent freshness per feature, expressed as a server-side TTL rather + than as application logic. +* Self-cleanup on pipeline failure: a stalled batch refresher lets entities + expire on schedule, and a stalled streaming worker lets each affected field + expire on its own timer. + +## How it works + +There are three paths: a **batch path** that bulk-loads features once per +materialization cycle, a **streaming path** that updates real-time features +as events arrive, and an **inference path** that reads features on the +request side. + +### Batch path (per materialization cycle) + +1. The batch job calls `SynthesizeUsers(N, seed)` (in production, the + equivalent computation lives in an offline pipeline against the warehouse). + The result is `map[string]FeatureMap` for every user in this cycle. +2. `store.BulkLoad(ctx, rows, ttl)` batches one + [`HSET`]({{< relref "/commands/hset" >}}) plus one + [`EXPIRE`]({{< relref "/commands/expire" >}}) per user through go-redis's + [`Pipeline`]({{< relref "/develop/clients/go/transpipe" >}}), so the whole + batch ships in a single round trip. 
The `HSET` writes every batch field; + the `EXPIRE` is what makes the entity disappear if the next batch run + fails, so inference reads a missing entity rather than silently outdated + values. + +### Streaming path (per event) + +When a user does something (login, transaction, page view) the streaming +layer computes whatever real-time signals fall out of that event and calls +`store.UpdateStreaming(ctx, userID, fields, ttl)`. That batches: + +1. An [`HSET`]({{< relref "/commands/hset" >}}) writing the new field values. + Redis is single-threaded per shard, so this is atomic against any + concurrent batch write on the same hash — no version columns, no locks. +2. An [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) over exactly the fields + that were written, with the streaming TTL. Each streaming field carries + its own per-field expiry independent of the rest of the hash. Stop the + worker and these fields drop out one by one as their TTLs elapse, while + the batch fields remain populated under the longer key-level TTL. + +### Inference path (per request) + +1. The model server picks the feature subset it needs (the schema is owned by + the model, not the store). +2. It calls `store.GetFeatures(ctx, userID, names)`, which is one + [`HMGET`]({{< relref "/commands/hmget" >}}). Redis returns the values in + the same order as the requested fields, with `nil` for any field that + doesn't exist (or has expired). +3. For batch inference, the model server calls + `store.BatchGetFeatures(ctx, userIDs, names)`, which pipelines one + [`HMGET`]({{< relref "/commands/hmget" >}}) per user across all `N` users + in a single network round trip. 
+ +## The feature-store helper + +The `FeatureStore` type wraps the read/write paths +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/go/feature_store.go)): + +```go +package main + +import ( + "context" + "fmt" + "time" + + "github.com/redis/go-redis/v9" + fs "featurestore" +) + +func main() { + ctx := context.Background() + rdb := redis.NewClient(&redis.Options{Addr: "localhost:6379"}) + defer rdb.Close() + + store := fs.NewFeatureStore(rdb, + "fs:user:", + 24*time.Hour, // whole-entity TTL aligned with the daily batch cycle + 5*time.Minute, // per-field TTL on each streaming feature + ) + + // Batch materialization: one HSET + EXPIRE per user, all pipelined. + rows := map[string]fs.FeatureMap{ + "u0001": {"country_iso": "US", "risk_segment": "low", + "tx_count_7d": 14, "avg_amount_30d": 92.40, + "account_age_days": 612, "chargeback_count_180d": 0}, + "u0002": {"country_iso": "GB", "risk_segment": "medium", + "tx_count_7d": 47, "avg_amount_30d": 220.10, + "account_age_days": 1840, "chargeback_count_180d": 1}, + } + store.BulkLoad(ctx, rows, store.BatchTTL) + + // Streaming write: HSET + HEXPIRE on just the fields that changed. + store.UpdateStreaming(ctx, "u0001", fs.FeatureMap{ + "last_login_ts": time.Now().UnixMilli(), + "last_device_id": "ios-9f02", + "tx_count_5m": 3, + "failed_logins_15m": 0, + "session_country": "US", + }, store.StreamingTTL) + + // Inference read: HMGET of whatever the model needs. + features, _ := store.GetFeatures(ctx, "u0001", []string{ + "risk_segment", "tx_count_7d", "avg_amount_30d", + "tx_count_5m", "failed_logins_15m", + }) + fmt.Println(features) + + // Batch scoring: pipelined HMGET across many users. 
+ batch, _ := store.BatchGetFeatures(ctx, + []string{"u0001", "u0002", "u0003"}, + []string{"risk_segment", "tx_count_5m", "failed_logins_15m"}, + ) + fmt.Println(batch) +} +``` + +### Package layout + +Go won't let `package main` live in the same directory as another package, so +the runnable entry points live in `cmd/`: + +```text +feature-store/go/ +├── go.mod +├── feature_store.go (package featurestore) +├── build_features.go (package featurestore; SynthesizeUsers + CLI) +├── streaming_worker.go (package featurestore) +├── demo_server.go (package featurestore; RunDemoServer) +└── cmd/ + ├── build_features/main.go (package main, shim → fs.BuildFeaturesCLI) + └── demo_server/main.go (package main, shim → fs.RunDemoServer) +``` + +Build and run with `go run ./cmd/demo_server`. The shim is the only `main` +package; everything else is library code. + +### Data model + +Each user is one Redis Hash. Every value is stored as a string — Redis hash +fields are bytes on the wire, so the helper encodes booleans as `"true"` / +`"false"` and renders numbers with `strconv`. The model server is responsible +for parsing back to the right type, the same way it would when reading any +serialized feature store. + +```text +fs:user:u0001 TTL = 86400 s (key-level) + country_iso=US + risk_segment=low + account_age_days=612 + tx_count_7d=14 + avg_amount_30d=92.40 + chargeback_count_180d=0 + last_login_ts=1716998413541 TTL = 300 s (per field, HEXPIRE) + last_device_id=ios-9f02 TTL = 300 s (per field, HEXPIRE) + tx_count_5m=3 TTL = 300 s (per field, HEXPIRE) + failed_logins_15m=0 TTL = 300 s (per field, HEXPIRE) + session_country=US TTL = 300 s (per field, HEXPIRE) +``` + +The batch fields sit under the key-level `EXPIRE`. The streaming fields each +carry their own [`HEXPIRE`]({{< relref "/commands/hexpire" >}}). 
If the +streaming pipeline stops, the streaming fields drop one by one as their +per-field TTLs elapse; the batch fields stay until the daily key-level +`EXPIRE` fires (or the next batch cycle re-pins them). + +### Bulk-loading batch features + +`BulkLoad` pipelines one `HSET` and one `EXPIRE` per user. With 500 users +that's 1000 commands in one network call — Redis processes them sequentially +on the server side but the client only pays one RTT. + +```go +func (fs *FeatureStore) BulkLoad(ctx context.Context, rows map[string]FeatureMap, ttl time.Duration) (int, error) { + if ttl == 0 { + ttl = fs.BatchTTL + } + if len(rows) == 0 { + return 0, nil + } + pipe := fs.rdb.Pipeline() + for entityID, fields := range rows { + key := fs.KeyFor(entityID) + encoded := make(map[string]any, len(fields)) + for name, value := range fields { + encoded[name] = encode(value) + } + pipe.HSet(ctx, key, encoded) + pipe.Expire(ctx, key, ttl) + } + if _, err := pipe.Exec(ctx); err != nil { + return 0, fmt.Errorf("bulk load: %w", err) + } + ... +} +``` + +go-redis's `Pipeline` is a *non-transactional* batch: commands queue up and +ship in one round trip, but they don't run inside a `MULTI/EXEC` block. +That's the right choice here because each user's `HSET` + `EXPIRE` pair is +independent of every other user's, and an all-or-nothing transaction would +block the server for the duration of the batch. For the rare case where the +pair has to be inseparable (a server crash between the two would leave the +entity without a key-level TTL) you would wrap each user in `rdb.TxPipeline()` +or a Lua script (see [`EVAL`]({{< relref "/commands/eval" >}}) / +[Eval scripting]({{< relref "/develop/programmability/eval-intro" >}})). For +a daily ingestion job that runs end-to-end every cycle, the next run re-pins +the TTL — no extra machinery needed. 
+ +In production, the equivalent of this script runs as an offline pipeline (a +Spark or Feast `materialize` job) that reads from the warehouse and writes +into Redis. The +[Feast `RedisOnlineStore`](https://docs.feast.dev/reference/online-stores/redis) +provider does exactly this under the hood; the in-house +[Redis Feature Form]({{< relref "/develop/ai/featureform" >}}) integration +covers the materialize + serve path end-to-end. + +### Streaming writes with per-field TTL + +`UpdateStreaming` is the linchpin of the mixed-staleness story: + +```go +func (fs *FeatureStore) UpdateStreaming(ctx context.Context, entityID string, fields FeatureMap, ttl time.Duration) error { + if len(fields) == 0 { + return nil + } + if ttl == 0 { + ttl = fs.StreamingTTL + } + key := fs.KeyFor(entityID) + encoded := make(map[string]any, len(fields)) + names := make([]string, 0, len(fields)) + for name, value := range fields { + encoded[name] = encode(value) + names = append(names, name) + } + pipe := fs.rdb.Pipeline() + pipe.HSet(ctx, key, encoded) + hexpireCmd := pipe.HExpire(ctx, key, ttl, names...) + if _, err := pipe.Exec(ctx); err != nil { + return fmt.Errorf("update streaming: %w", err) + } + codes, _ := hexpireCmd.Result() + for _, code := range codes { + if code != 1 { + return fmt.Errorf("HEXPIRE did not set every field TTL for %s: %v", key, codes) + } + } + ... +} +``` + +[`HEXPIRE`]({{< relref "/commands/hexpire" >}}) sets the TTL on *individual* +hash fields, not on the whole key. The two commands are sent in one round +trip and Redis executes them in pipeline order: the `HSET` runs first and +creates or overwrites the fields, then `HEXPIRE` attaches a TTL to each of +those same fields. `HEXPIRE` returns one status code per field — `1` if the +TTL was set, `2` if the expiry was 0 or in the past (so Redis deleted the +field instead), `0` if an `NX | XX | GT | LT` conditional flag was set and +not met (we never use one here), `-2` if the field doesn't exist on the key. 
+The helper returns an error if any code is anything other than `1`, so the +"every streaming write renews its TTL" invariant fails loudly rather than +silently leaving a streaming field with no expiry attached. + +If a streaming pipeline stops, the streaming fields drop out one by one as +their per-field TTLs elapse — there is no application-side cleaner involved. +[`HTTL`]({{< relref "/commands/httl" >}}) lets the model side inspect the +remaining TTL on any field, which is useful both for debugging ("why is this +feature missing?" → "it expired three seconds ago") and as a freshness signal +in the model itself. + +> **HEXPIRE requires Redis 7.4 or later.** `HEXPIRE` and the field-level TTL +> commands (`HTTL`, `HPERSIST`, `HEXPIREAT`, `HPEXPIRE`, `HPEXPIREAT`, +> `HPTTL`, `HEXPIRETIME`, `HPEXPIRETIME`) were added in Redis 7.4. On older +> Redis builds you would have to put streaming features on their own keys +> (one key per feature, or one key per feature group) and set a key-level +> `EXPIRE` instead — at the cost of giving up the single-`HMGET` retrieval. + +### Inference reads with HMGET + +`GetFeatures` is one `HMGET`: + +```go +func (fs *FeatureStore) GetFeatures(ctx context.Context, entityID string, fieldNames []string) (map[string]string, error) { + key := fs.KeyFor(entityID) + if fieldNames == nil { + return fs.rdb.HGetAll(ctx, key).Result() + } + if len(fieldNames) == 0 { + return map[string]string{}, nil + } + values, err := fs.rdb.HMGet(ctx, key, fieldNames...).Result() + if err != nil { + return nil, err + } + out := make(map[string]string, len(fieldNames)) + for i, name := range fieldNames { + if s, ok := values[i].(string); ok { + out[name] = s + } + } + return out, nil +} +``` + +The model knows exactly which features it consumes, so the request path +always takes the `HMGET` branch with an explicit field list — that's the +sub-millisecond path. 
`HGETALL` is the right call for debugging (which is +what the demo's "Inspect" panel does) but not for serving: it forces Redis +to serialize every field, including ones the model doesn't need. + +Fields that don't exist (because they were never written, or because they +expired) come back as `nil` (a typed `nil`, not a `string` empty). The helper +drops them from the result map so the caller sees only the features that +are actually available. A real model server would either treat missing +values as a feature ("this user has no streaming signal yet") or fall back +to a default from the model's training data. + +### Batch scoring with pipelined HMGET + +For batch inference, the same `HMGET` shape pipelines across users: + +```go +func (fs *FeatureStore) BatchGetFeatures(ctx context.Context, entityIDs, fieldNames []string) (map[string]map[string]string, error) { + if len(entityIDs) == 0 || len(fieldNames) == 0 { + return map[string]map[string]string{}, nil + } + pipe := fs.rdb.Pipeline() + cmds := make([]*redis.SliceCmd, len(entityIDs)) + for i, id := range entityIDs { + cmds[i] = pipe.HMGet(ctx, fs.KeyFor(id), fieldNames...) + } + if _, err := pipe.Exec(ctx); err != nil && !errors.Is(err, redis.Nil) { + return nil, err + } + out := make(map[string]map[string]string, len(entityIDs)) + for i, id := range entityIDs { + values, _ := cmds[i].Result() + row := make(map[string]string, len(fieldNames)) + for j, name := range fieldNames { + if s, ok := values[j].(string); ok { + row[name] = s + } + } + out[id] = row + } + return out, nil +} +``` + +One round trip for the whole batch — the demo regularly returns 100 users in +1-2 ms against a local Redis. On a real network the round trip dominates; +pipelining is what keeps batch scoring practical. + +A Redis Cluster is different in two ways: a single `Pipeline.Exec` is bound +to one shard, because non-cross-slot pipelines can only target one node; and +the keys for a typical user batch will land on multiple shards. 
For batch +reads on a cluster, use the +[`ClusterClient`]({{< relref "/develop/clients/go/connect" >}}) — its +`Pipeline` knows how to dispatch per-shard, so you pay one round trip per +shard rather than one for the whole batch. A hash tag like +`fs:user:{vip}:u0001` forces a known set of keys onto the same shard so one +pipeline can cover all of them in a single round trip. + +## The streaming worker + +`streaming_worker.go` is the demo's stand-in for whatever Flink, Kafka +Streams, or bespoke service computes the real-time features +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/go/streaming_worker.go)). +It runs as a goroutine next to the demo server so the UI can start, pause, +and resume it; in production this code would live in the streaming layer. + +Every tick the worker picks a few random users, generates a new value for +each streaming feature, and calls `store.UpdateStreaming(ctx, userID, fields, 0)`. +The demo defaults to 5 users per tick at 1-second intervals — so a 200-user +store sees roughly three-quarters of its users refreshed in the first minute, +and nearly all after a few minutes. Raise `--users-per-tick` or drop `--seed-users` if +you'd rather touch every user quickly.
+ +```go +func (w *StreamingWorker) doTick(ctx context.Context) error { + ids, err := w.store.ListEntityIDs(ctx, 500) + if err != nil { + return err + } + if len(ids) == 0 { + return nil + } + chosen := w.rng.Perm(len(ids))[:w.usersPerTick] + nowMs := time.Now().UnixMilli() + for _, idx := range chosen { + fields := FeatureMap{ + "last_login_ts": nowMs, + "last_device_id": w.choice(deviceIDs), + "tx_count_5m": w.intn(13), + "failed_logins_15m": w.weightedInt(failedLoginBuckets, failedLoginWeights), + "session_country": w.choice(sessionCountries), + } + if err := w.store.UpdateStreaming(ctx, ids[idx], fields, 0); err != nil { + return err + } + } + return nil +} +``` + +Pausing the worker is what shows off the mixed-staleness behavior: leave it +paused for longer than `streamingTTL` and the streaming fields disappear +from every user's hash one by one, while the batch fields remain under the +longer key-level `EXPIRE`. The demo's `Pause / resume` button lets you see +this happen in real time. + +`Pause()` only blocks *future* ticks from running — the goroutine simply +skips its turn on the next ticker fire. A reset that's about to `DEL` every +key needs to wait out an already-running tick too, which is what +`WaitForIdle()` is for: the demo's `Reset` handler calls `worker.Pause()` +*and* `worker.WaitForIdle()` before it issues the `DEL` sweep, so a +mid-flight tick can't recreate a user under a streaming-only hash with no +key-level TTL. + +## The batch builder + +`build_features.go` is the demo's nightly materializer +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/go/build_features.go)). +It generates synthetic feature rows and calls `store.BulkLoad` once. The +synthesis itself is not the point — in a real deployment the equivalent +code reads from the offline store (Snowflake, BigQuery, Iceberg) and writes +the resulting hashes into Redis. 
+ +```go +func SynthesizeUsers(count int, seed int64) map[string]FeatureMap { + rng := rand.New(rand.NewSource(seed)) + users := make(map[string]FeatureMap, count) + for i := 1; i <= count; i++ { + uid := fmt.Sprintf("u%04d", i) + users[uid] = FeatureMap{ + "country_iso": countryChoices[rng.Intn(len(countryChoices))], + "risk_segment": weightedChoiceString(rng, riskSegments, riskWeights), + "account_age_days": rng.Intn(2400-7+1) + 7, + "tx_count_7d": rng.Intn(81), + "avg_amount_30d": roundTo2(rng.Float64()*345.0 + 5.0), + "chargeback_count_180d": weightedChoiceInt(rng, chargebackBuckets, chargebackWeights), + } + } + return users +} +``` + +You can run the builder on its own (independently of the demo server) to +populate Redis from the command line: + +```bash +go run ./cmd/build_features --count 500 --ttl-seconds 3600 +``` + +That writes 500 users at `fs:user:*` with a one-hour key-level TTL, which is +how a typical operator would pre-seed a feature store from the command line +when debugging. + +## The interactive demo + +`demo_server.go` runs a `net/http` server on port 8087. The HTML page lets +you: + +* **Bulk-load** any number of users (default 200) with a configurable + key-level TTL. Drop the TTL to 30 s and watch the entire store expire on + schedule — the same thing that happens if a daily refresher fails. +* See the **store state** at a glance: user count, batch / streaming TTLs, + cumulative read/write counters. +* See the **streaming worker** status (running / paused, ticks completed, + writes performed) and **pause or resume** it. Leave it paused for longer + than the streaming TTL to watch streaming fields drop out. +* Run an **inference read** for any user with a chosen feature subset, and + see the value, the per-field TTL, and the read latency. +* Run **batch scoring** with a pipelined `HMGET` across `N` users and see + the total elapsed time plus the per-user breakdown. 
+* **Inspect** any user's full hash with field-level TTLs and the key-level + TTL — the right view for debugging "why is this feature missing?" at + read time. + +The server holds one `FeatureStore` and one `StreamingWorker` for the +lifetime of the process. Endpoints: + +| Endpoint | What it does | +|---------------------------|-------------------------------------------------------------------------------------| +| `GET /state` | User count, TTL config, stats counters, worker status. | +| `POST /bulk-load` | Pipelined `HSET` + `EXPIRE` over N synthetic users with a chosen TTL. | +| `POST /worker/toggle` | Pause / resume the streaming worker. | +| `POST /read` | `HMGET` a chosen feature subset for one user; report latency and per-field TTLs. | +| `POST /batch-read` | Pipeline `HMGET` across N users; report total latency and per-entity field counts. | +| `GET /inspect` | `HGETALL` + `HTTL` for one user; full hash view with per-field TTLs. | +| `POST /reset` | Drop every user under the key prefix (used by the demo's reset button). | + +## Prerequisites + +* **Redis 7.4 or later.** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) were added in Redis 7.4; the + demo relies on per-field TTL for the mixed-staleness story. +* **Go 1.21 or later.** +* The `go-redis` v9 client. The demo's `go.mod` pins + `github.com/redis/go-redis/v9 v9.18.0` or later. + +If your Redis server is running elsewhere, start the demo with `--redis-addr`. + +## Running the demo + +### Get the source files + +The demo lives in a small Go module under +[`feature-store/go`](https://github.com/redis/docs/tree/main/content/develop/use-cases/feature-store/go). 
+Clone the repo or copy the directory: + +```bash +git clone https://github.com/redis/docs.git +cd docs/content/develop/use-cases/feature-store/go +go mod tidy +``` + +### Start the demo server + +From the module directory: + +```bash +go run ./cmd/demo_server +``` + +You should see: + +```text +Dropping any existing users under 'fs:user:*' for a clean demo run (pass --no-reset to keep them). +Redis feature-store demo server listening on http://127.0.0.1:8087 +Using Redis at localhost:6379 with key prefix 'fs:user:' (batch TTL 86400s, streaming TTL 300s) +Materialized 200 user(s); streaming worker running. +``` + +By default the demo wipes the configured key prefix on startup so each run +starts from a clean state. Pass `--no-reset` to keep any existing data, or +`--key-prefix <prefix>` to point the demo at a different prefix entirely. + +Open [http://127.0.0.1:8087](http://127.0.0.1:8087) in a browser. Useful +things to try: + +* Pick a user and click **Read features** with a mixed batch/streaming + subset — you'll see batch fields with no per-field TTL (covered by the + key-level TTL) and streaming fields with a positive per-field TTL. +* Click **Pipeline HMGET** with `count=100` to see the latency of a + 100-user batch read. +* Click **Pause / resume** on the streaming worker and leave it paused for + ~5 minutes (or restart the server with `--streaming-ttl-seconds 30` to + make it visible in seconds). Re-run **Read features** on any user and + watch the streaming fields disappear while the batch fields stay. +* Click **Inspect** on a user to see the full hash with field-level TTLs. +* Click **Bulk-load** with a short TTL (say 30 seconds) and watch the user + count fall to zero within the next minute — the same thing that happens if a + daily batch run fails to land. +* Click **Reset** to drop every user and start over. + +The server is read/write against your local Redis. The default key prefix +is `fs:user:`.
Pass `--no-reset` to keep existing data across restarts, or +`--redis-addr` to point at a different Redis. + +## Production usage + +The guidance below focuses on the production concerns that are specific to +running a feature store on Redis. For the generic go-redis production +checklist — connection-pool sizing, TLS, ACL, context cancellation, and +retry policy — see the +[go-redis production usage guide]({{< relref "/develop/clients/go/produsage" >}}) +and the +[connect-with-TLS recipe]({{< relref "/develop/clients/go/connect#connect-to-your-production-redis-with-tls" >}}). +The feature-store demo runs against `localhost` with the defaults; a real +deployment should harden the client first. + +### Pick the batch TTL to outlast a failed refresher + +The whole-entity `EXPIRE` is your safety net against silent staleness from a +broken batch pipeline. Set it longer than your worst-case batch outage so a +single missed run doesn't take the feature store offline, but short enough +that a sustained outage causes loud failures (missing entities) rather than +quiet ones (yesterday's features being scored as today's). The standard +choice is one cycle of "expected refresh interval × 2" — for a daily batch, +48 hours; for a 6-hour batch, 12 hours. + +The same logic applies to the per-field streaming TTL: a few times the +expected update interval so a slow-but-alive streaming worker doesn't +churn features needlessly, but short enough that a stalled worker causes +visible freshness failures. + +### Co-locate the online store with serving, not with training + +The online store's hash representation does *not* have to match the schema +in your offline store. The batch materialization step is your chance to +flatten joins, encode categoricals, and project to whatever shape the model +server wants — so the request path is exactly one `HMGET` and zero +transforms. 
+ +The training pipeline reads from the offline store with its own schema; the +serving pipeline reads from Redis with the flattened serving schema. +Deriving both schemas from the same feature definitions and transform code +is what prevents training-serving skew. + +### Pipeline batch reads across shards + +On a single Redis instance, pipelining `HMGET` across `N` users through +`Pipeline.Exec` is one round trip. A Redis Cluster is different: a pipeline +sent through a single-node `Client` reaches only one shard, while the keys +for a typical user batch will land on multiple shards. For batch reads on a cluster, use the +[`ClusterClient`]({{< relref "/develop/clients/go/connect" >}}) — its +`Pipeline` knows how to bucket commands per-shard and ship one batch per +shard in parallel. For a small number of frequently-queried users (a +top-N customer list, for example), a hash tag like `fs:user:{vip}:u0001` +forces a known set of keys onto the same shard so one pipeline can cover +all of them in a single round trip. + +### Make HEXPIRE part of every streaming write + +The single biggest correctness lever in this design is that the streaming +write applies `HEXPIRE` *every time*. If a streaming worker writes a field +without renewing its TTL, the `HSET` clears whatever per-field expiry was +there before, so the field lingers until the key-level `EXPIRE` fires and +the mixed-staleness invariant breaks. Keep the `HSET` and `HEXPIRE` in the same pipeline (or, even safer, +in the same [Lua script]({{< relref "/develop/programmability/eval-intro" >}}) +if you don't trust the call site). + +### Avoid HGETALL on the request path + +`HGETALL` reads every field on the hash, including ones the model doesn't +need. With dozens of features per entity, that is wasted serialization work +on the server and wasted bandwidth on the wire. Always specify the field +list explicitly with `HMGet` in the model server.
+ +The exception is debugging and feature-set discovery, where you genuinely +want the full hash. The demo's "Inspect" button uses `HGetAll` for exactly +this reason. + +### Inspect the store directly with redis-cli + +When testing or troubleshooting, the cli tells you everything: + +```bash +# How many users currently in the store +redis-cli --scan --pattern 'fs:user:*' | wc -l + +# One user's full hash and key-level TTL +redis-cli HGETALL fs:user:u0001 +redis-cli TTL fs:user:u0001 + +# Per-field TTL on the streaming fields +redis-cli HTTL fs:user:u0001 FIELDS 5 \ + last_login_ts last_device_id tx_count_5m failed_logins_15m session_country + +# Sample HMGET as the model would issue it +redis-cli HMGET fs:user:u0001 risk_segment tx_count_7d avg_amount_30d tx_count_5m +``` + +A streaming field that returns `-2` from `HTTL` doesn't exist on the hash +(either it was never written, or it expired); `-1` means the field has no +TTL set (and is therefore covered only by the key-level `EXPIRE`); any +positive value is the remaining TTL in seconds. + +## Learn more + +This example uses the following Redis commands: + +* [`HSET`]({{< relref "/commands/hset" >}}) to write a feature or a whole + feature row in one call. +* [`HMGET`]({{< relref "/commands/hmget" >}}) to retrieve any subset of + features for one entity in one round trip. +* [`HGETALL`]({{< relref "/commands/hgetall" >}}) for debugging and + feature-set discovery. +* [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) for per-field TTL on streaming + features (Redis 7.4+). +* [`EXPIRE`]({{< relref "/commands/expire" >}}) and + [`TTL`]({{< relref "/commands/ttl" >}}) for the whole-entity TTL aligned + with the batch materialization cycle. +* Pipelined `HMGET` across many entities for batch scoring with one network + round trip — see + [transactions and pipelining]({{< relref "/develop/clients/go/transpipe" >}}). 
+ +See the [go-redis documentation]({{< relref "/develop/clients/go" >}}) for +the full client reference, and the +[Hashes overview]({{< relref "/develop/data-types/hashes" >}}) for the deeper +conceptual model — including the listpack encoding that makes small hashes +particularly compact in memory, which matters at feature-store scale. diff --git a/content/develop/use-cases/feature-store/go/build_features.go b/content/develop/use-cases/feature-store/go/build_features.go new file mode 100644 index 0000000000..4609cf7787 --- /dev/null +++ b/content/develop/use-cases/feature-store/go/build_features.go @@ -0,0 +1,116 @@ +package featurestore + +import ( + "context" + "flag" + "fmt" + "math/rand" + "os" + "time" + + "github.com/redis/go-redis/v9" +) + +// Country choices and risk segments used by the synthetic batch +// generator. These are not the point of the demo — in production the +// equivalent code reads from the offline store (Snowflake, BigQuery, +// Iceberg) and writes the resulting hashes into Redis. +var ( + countryChoices = []string{"US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL"} + riskSegments = []string{"low", "medium", "high"} + riskWeights = []int{70, 25, 5} + chargebackBuckets = []int{0, 1, 2, 3} + chargebackWeights = []int{85, 10, 4, 1} +) + +// SynthesizeUsers generates count synthetic user feature rows. +// +// The shape mirrors a small fraud-scoring feature set: country and +// risk segment as TAG-like categorical features, plus a few numeric +// aggregates over recent windows. 
+func SynthesizeUsers(count int, seed int64) map[string]FeatureMap { + rng := rand.New(rand.NewSource(seed)) + users := make(map[string]FeatureMap, count) + for i := 1; i <= count; i++ { + uid := fmt.Sprintf("u%04d", i) + users[uid] = FeatureMap{ + "country_iso": countryChoices[rng.Intn(len(countryChoices))], + "risk_segment": weightedChoiceString(rng, riskSegments, riskWeights), + "account_age_days": rng.Intn(2400-7+1) + 7, + "tx_count_7d": rng.Intn(81), + "avg_amount_30d": roundTo2(rng.Float64()*345.0 + 5.0), + "chargeback_count_180d": weightedChoiceInt(rng, chargebackBuckets, chargebackWeights), + } + } + return users +} + +// BuildFeaturesCLI is the entry point for cmd/build_features/main.go. +// It parses CLI flags, opens a Redis client, and bulk-loads the +// synthetic batch into Redis with a configurable key-level TTL. +func BuildFeaturesCLI(args []string) error { + fs := flag.NewFlagSet("build_features", flag.ExitOnError) + redisAddr := fs.String("redis-addr", "localhost:6379", "Redis host:port") + count := fs.Int("count", 200, "Number of synthetic users to materialize") + ttlSeconds := fs.Int("ttl-seconds", int(24*time.Hour/time.Second), "Key-level TTL in seconds (default 24h)") + keyPrefix := fs.String("key-prefix", "fs:user:", "Hash key prefix for each user") + seed := fs.Int64("seed", 42, "PRNG seed") + if err := fs.Parse(args); err != nil { + return err + } + + ctx := context.Background() + rdb := redis.NewClient(&redis.Options{Addr: *redisAddr}) + defer rdb.Close() + + store := NewFeatureStore(rdb, *keyPrefix, + time.Duration(*ttlSeconds)*time.Second, 0) + + rows := SynthesizeUsers(*count, *seed) + loaded, err := store.BulkLoad(ctx, rows, store.BatchTTL) + if err != nil { + return err + } + fmt.Fprintf(os.Stdout, + "Materialized %d users at %s* with a %ds key-level TTL.\n", + loaded, *keyPrefix, *ttlSeconds) + return nil +} + +// --------------------------------------------------------------- +// Helpers +// 
---------------------------------------------------------------
+
+// weightedChoiceString returns one element of items, chosen with
+// probability proportional to the weight at the same index. It
+// returns "" when items is empty.
+func weightedChoiceString(rng *rand.Rand, items []string, weights []int) string {
+	idx := weightedIndex(rng, len(items), weights)
+	if idx < 0 {
+		return ""
+	}
+	return items[idx]
+}
+
+// weightedChoiceInt is the int counterpart of weightedChoiceString.
+// It returns 0 when items is empty.
+func weightedChoiceInt(rng *rand.Rand, items []int, weights []int) int {
+	idx := weightedIndex(rng, len(items), weights)
+	if idx < 0 {
+		return 0
+	}
+	return items[idx]
+}
+
+// weightedIndex draws an index in [0, n) with probability
+// proportional to weights[i]. It returns -1 when n is 0.
+//
+// Weights beyond the first n entries are ignored so the result is
+// always a valid index. When the usable weights sum to zero or less
+// the draw falls back to a uniform pick, because rng.Intn panics on
+// a non-positive bound.
+func weightedIndex(rng *rand.Rand, n int, weights []int) int {
+	if n == 0 {
+		return -1
+	}
+	if len(weights) > n {
+		weights = weights[:n]
+	}
+	total := 0
+	for _, w := range weights {
+		total += w
+	}
+	if total <= 0 {
+		return rng.Intn(n)
+	}
+	// Walk the cumulative weights until the draw falls inside a bucket.
+	r := rng.Intn(total)
+	for i, w := range weights {
+		r -= w
+		if r < 0 {
+			return i
+		}
+	}
+	return n - 1
+}
+
+// roundTo2 rounds v to two decimal places, halves away from zero.
+func roundTo2(v float64) float64 {
+	// int64() truncates toward zero, so the half-unit nudge has to
+	// point away from zero on both sides of the axis.
+	if v < 0 {
+		return float64(int64(v*100-0.5)) / 100.0
+	}
+	return float64(int64(v*100+0.5)) / 100.0
+}
diff --git a/content/develop/use-cases/feature-store/go/cmd/build_features/main.go b/content/develop/use-cases/feature-store/go/cmd/build_features/main.go
new file mode 100644
index 0000000000..198e8ccb23
--- /dev/null
+++ b/content/develop/use-cases/feature-store/go/cmd/build_features/main.go
@@ -0,0 +1,18 @@
+// Tiny shim that drives the batch materialization flow from the
+// parent ``featurestore`` package. Run with:
+//
+//	go run ./cmd/build_features --count 500 --ttl-seconds 3600
+package main
+
+import (
+	"log"
+	"os"
+
+	fs "featurestore"
+)
+
+func main() {
+	if err := fs.BuildFeaturesCLI(os.Args[1:]); err != nil {
+		log.Fatal(err)
+	}
+}
diff --git a/content/develop/use-cases/feature-store/go/cmd/demo_server/main.go b/content/develop/use-cases/feature-store/go/cmd/demo_server/main.go
new file mode 100644
index 0000000000..da253e83d0
--- /dev/null
+++ b/content/develop/use-cases/feature-store/go/cmd/demo_server/main.go
@@ -0,0 +1,22 @@
+// Tiny shim that runs the demo server defined in the parent
+// ``featurestore`` package. 
Build with: +// +// go build -o demo_server ./cmd/demo_server +// +// Or run directly: +// +// go run ./cmd/demo_server --port 8087 +package main + +import ( + "log" + "os" + + fs "featurestore" +) + +func main() { + if err := fs.RunDemoServer(os.Args[1:]); err != nil { + log.Fatal(err) + } +} diff --git a/content/develop/use-cases/feature-store/go/demo_server.go b/content/develop/use-cases/feature-store/go/demo_server.go new file mode 100644 index 0000000000..2777da8a3c --- /dev/null +++ b/content/develop/use-cases/feature-store/go/demo_server.go @@ -0,0 +1,954 @@ +// Redis feature-store demo server (Go). +// +// Create a tiny main.go shim in cmd/demo_server (Go's package main +// cannot live in the same directory as package featurestore): +// +// package main +// +// import ( +// "log" +// "os" +// +// fs "featurestore" +// ) +// +// func main() { +// if err := fs.RunDemoServer(os.Args[1:]); err != nil { +// log.Fatal(err) +// } +// } +// +// Build and run with: +// +// go run ./cmd/demo_server +// +// Then visit http://localhost:8087. +// +// Use the UI to: +// +// - Bulk-load (re-materialize) the batch features, optionally with a +// short TTL so you can watch a whole entity expire on schedule. +// - Pause the streaming worker and watch the streaming fields drop +// out via HEXPIRE while the batch fields remain populated under +// the longer key-level TTL — the *mixed staleness* story made +// visible. +// - Pull features for one user (HMGET) and see the value, per-field +// TTL, and read latency. +// - Batch-score N users in one round trip and see the per-entity / +// per-round-trip latency split. +// - Inspect a single user's hash in detail with field-level TTLs. 
+package featurestore + +import ( + "context" + "encoding/json" + "flag" + "fmt" + "log" + "net/http" + "sort" + "strconv" + "strings" + "sync" + "time" + + "github.com/redis/go-redis/v9" +) + +// FeatureStoreDemo wires the FeatureStore and StreamingWorker +// together with reset / materialize / toggle helpers used by the +// HTTP handlers. +type FeatureStoreDemo struct { + store *FeatureStore + worker *StreamingWorker + seed int64 + + mu sync.Mutex +} + +// NewFeatureStoreDemo bundles the store and worker for the HTTP +// server. seed is the PRNG seed used by the batch synthesizer. +func NewFeatureStoreDemo(store *FeatureStore, worker *StreamingWorker, seed int64) *FeatureStoreDemo { + return &FeatureStoreDemo{store: store, worker: worker, seed: seed} +} + +// Materialize bulk-loads `count` synthetic users with the supplied +// key-level TTL. +func (d *FeatureStoreDemo) Materialize(ctx context.Context, count int, ttl time.Duration) (loaded int, elapsed time.Duration, err error) { + d.mu.Lock() + defer d.mu.Unlock() + rows := SynthesizeUsers(count, d.seed) + start := time.Now() + loaded, err = d.store.BulkLoad(ctx, rows, ttl) + elapsed = time.Since(start) + return +} + +// Reset drops every entity under the key prefix. Pauses the +// streaming worker around the DEL sweep so a concurrent tick can't +// recreate a user that was just enumerated for deletion (streaming +// HSET creates the key if it's missing, and that would leave behind +// a streaming-only hash with no key-level TTL). Pause() only blocks +// *future* ticks — WaitForIdle() flushes an already-running tick +// before the DEL sweep starts. 
+func (d *FeatureStoreDemo) Reset(ctx context.Context) (int64, error) { + d.mu.Lock() + defer d.mu.Unlock() + wasPaused := d.worker.IsPaused() + if d.worker.IsRunning() { + if !wasPaused { + d.worker.Pause() + } + d.worker.WaitForIdle() + } + defer func() { + if d.worker.IsRunning() && !wasPaused { + d.worker.Resume() + } + }() + deleted, err := d.store.Reset(ctx) + if err != nil { + return deleted, err + } + d.store.ResetStats() + d.worker.ResetStats() + return deleted, nil +} + +// ToggleWorker pauses or resumes the streaming worker. Starts the +// goroutine if it wasn't running. +func (d *FeatureStoreDemo) ToggleWorker(ctx context.Context) (paused, running bool) { + d.mu.Lock() + defer d.mu.Unlock() + if !d.worker.IsRunning() { + d.worker.Start(ctx) + } + if d.worker.IsPaused() { + d.worker.Resume() + } else { + d.worker.Pause() + } + return d.worker.IsPaused(), d.worker.IsRunning() +} + +// ------------------------------------------------------------------- +// HTTP handlers +// ------------------------------------------------------------------- + +type httpServer struct { + store *FeatureStore + worker *StreamingWorker + demo *FeatureStoreDemo +} + +func (s *httpServer) handler() http.Handler { + mux := http.NewServeMux() + mux.HandleFunc("/", s.handleIndex) + mux.HandleFunc("/state", s.handleState) + mux.HandleFunc("/inspect", s.handleInspect) + mux.HandleFunc("/bulk-load", s.handleBulkLoad) + mux.HandleFunc("/reset", s.handleReset) + mux.HandleFunc("/worker/toggle", s.handleToggleWorker) + mux.HandleFunc("/read", s.handleRead) + mux.HandleFunc("/batch-read", s.handleBatchRead) + return mux +} + +func (s *httpServer) handleIndex(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/" && r.URL.Path != "/index.html" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "text/html; charset=utf-8") + w.Write([]byte(s.htmlPage())) +} + +func (s *httpServer) handleState(w http.ResponseWriter, r *http.Request) { + if r.Method != 
http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + ctx := r.Context() + ids, err := s.store.ListEntityIDs(ctx, 500) + if err != nil { + jsonError(w, err, http.StatusInternalServerError) + return + } + // Cap the dropdown list at 500 but report the true count + // separately so the UI doesn't silently understate the store. + count, err := s.store.CountEntities(ctx) + if err != nil { + jsonError(w, err, http.StatusInternalServerError) + return + } + jsonResponse(w, http.StatusOK, map[string]any{ + "key_prefix": s.store.KeyPrefix, + "batch_ttl_seconds": int(s.store.BatchTTL.Seconds()), + "streaming_ttl_seconds": int(s.store.StreamingTTL.Seconds()), + "entity_count": count, + "entity_ids": ids, + "stats": s.store.Stats(), + "worker": s.worker.Stats(), + }) +} + +func (s *httpServer) handleInspect(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodGet { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + user := strings.TrimSpace(r.URL.Query().Get("user")) + if user == "" { + jsonError(w, fmt.Errorf("user is required"), http.StatusBadRequest) + return + } + ctx := r.Context() + full, err := s.store.GetFeatures(ctx, user, nil) + if err != nil { + jsonError(w, err, http.StatusInternalServerError) + return + } + keyTTL, err := s.store.KeyTTLSeconds(ctx, user) + if err != nil { + jsonError(w, err, http.StatusInternalServerError) + return + } + if len(full) == 0 { + jsonResponse(w, http.StatusOK, map[string]any{ + "exists": false, + "key_ttl_seconds": keyTTL, + }) + return + } + names := make([]string, 0, len(full)) + for n := range full { + names = append(names, n) + } + ttls, err := s.store.FieldTTLsSeconds(ctx, user, names) + if err != nil { + jsonError(w, err, http.StatusInternalServerError) + return + } + rows := make([]map[string]any, 0, len(names)) + for _, n := range names { + rows = append(rows, map[string]any{ + "name": n, + "value": full[n], + "ttl_seconds": ttls[n], + 
}) + } + sort.Slice(rows, func(i, j int) bool { + return rows[i]["name"].(string) < rows[j]["name"].(string) + }) + jsonResponse(w, http.StatusOK, map[string]any{ + "exists": true, + "key_ttl_seconds": keyTTL, + "fields": rows, + }) +} + +func (s *httpServer) handleBulkLoad(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + if err := r.ParseForm(); err != nil { + jsonError(w, err, http.StatusBadRequest) + return + } + count := clampInt(parseInt(r.FormValue("count"), 200), 1, 2000) + ttlSeconds := clampInt(parseInt(r.FormValue("ttl"), 86400), 5, 172800) + loaded, elapsed, err := s.demo.Materialize(r.Context(), count, time.Duration(ttlSeconds)*time.Second) + if err != nil { + jsonError(w, err, http.StatusInternalServerError) + return + } + jsonResponse(w, http.StatusOK, map[string]any{ + "loaded": loaded, + "ttl_seconds": ttlSeconds, + "elapsed_ms": float64(elapsed.Microseconds()) / 1000.0, + }) +} + +func (s *httpServer) handleReset(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + deleted, err := s.demo.Reset(r.Context()) + if err != nil { + jsonError(w, err, http.StatusInternalServerError) + return + } + jsonResponse(w, http.StatusOK, map[string]any{"deleted": deleted}) +} + +func (s *httpServer) handleToggleWorker(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + paused, running := s.demo.ToggleWorker(r.Context()) + jsonResponse(w, http.StatusOK, map[string]any{ + "paused": paused, + "running": running, + }) +} + +func (s *httpServer) handleRead(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + if err := r.ParseForm(); err != nil { + 
jsonError(w, err, http.StatusBadRequest) + return + } + user := strings.TrimSpace(r.FormValue("user")) + if user == "" { + jsonError(w, fmt.Errorf("user is required"), http.StatusBadRequest) + return + } + fields := nonEmpty(r.Form["field"]) + ctx := r.Context() + start := time.Now() + var values map[string]string + if len(fields) > 0 { + var err error + values, err = s.store.GetFeatures(ctx, user, fields) + if err != nil { + jsonError(w, err, http.StatusInternalServerError) + return + } + } else { + values = map[string]string{} + } + elapsed := time.Since(start) + ttls := map[string]int64{} + if len(fields) > 0 { + var err error + ttls, err = s.store.FieldTTLsSeconds(ctx, user, fields) + if err != nil { + jsonError(w, err, http.StatusInternalServerError) + return + } + } + keyTTL, err := s.store.KeyTTLSeconds(ctx, user) + if err != nil { + jsonError(w, err, http.StatusInternalServerError) + return + } + jsonResponse(w, http.StatusOK, map[string]any{ + "requested": fields, + "values": values, + "ttls": ttls, + "key_ttl_seconds": keyTTL, + "returned_count": len(values), + "elapsed_ms": float64(elapsed.Microseconds()) / 1000.0, + }) +} + +func (s *httpServer) handleBatchRead(w http.ResponseWriter, r *http.Request) { + if r.Method != http.MethodPost { + http.Error(w, "method not allowed", http.StatusMethodNotAllowed) + return + } + if err := r.ParseForm(); err != nil { + jsonError(w, err, http.StatusBadRequest) + return + } + count := clampInt(parseInt(r.FormValue("count"), 100), 1, 500) + fields := nonEmpty(r.Form["field"]) + if len(fields) == 0 { + fields = append([]string{}, DefaultStreamingFields...) 
+ fields = append(fields, "risk_segment") + } + ctx := r.Context() + ids, err := s.store.ListEntityIDs(ctx, int64(count*2)) + if err != nil { + jsonError(w, err, http.StatusInternalServerError) + return + } + if len(ids) > count { + ids = ids[:count] + } + start := time.Now() + rows, err := s.store.BatchGetFeatures(ctx, ids, fields) + if err != nil { + jsonError(w, err, http.StatusInternalServerError) + return + } + elapsed := time.Since(start) + sampleN := 10 + if sampleN > len(ids) { + sampleN = len(ids) + } + sample := make([]map[string]any, sampleN) + for i := 0; i < sampleN; i++ { + sample[i] = map[string]any{ + "id": ids[i], + "field_count": len(rows[ids[i]]), + } + } + jsonResponse(w, http.StatusOK, map[string]any{ + "entity_count": len(ids), + "field_count": len(fields), + "elapsed_ms": float64(elapsed.Microseconds()) / 1000.0, + "sample": sample, + }) +} + +// ------------------------------------------------------------------- +// HTTP plumbing +// ------------------------------------------------------------------- + +func jsonResponse(w http.ResponseWriter, status int, payload any) { + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(status) + _ = json.NewEncoder(w).Encode(payload) +} + +func jsonError(w http.ResponseWriter, err error, status int) { + jsonResponse(w, status, map[string]any{"error": err.Error()}) +} + +func parseInt(s string, def int) int { + if s == "" { + return def + } + n, err := strconv.Atoi(s) + if err != nil { + return def + } + return n +} + +func clampInt(n, low, high int) int { + if n < low { + return low + } + if n > high { + return high + } + return n +} + +func nonEmpty(in []string) []string { + out := make([]string, 0, len(in)) + for _, v := range in { + if v != "" { + out = append(out, v) + } + } + return out +} + +func (s *httpServer) htmlPage() string { + batchFieldsJSON, _ := json.Marshal(DefaultBatchFields) + streamFieldsJSON, _ := json.Marshal(DefaultStreamingFields) + return strings.NewReplacer( + 
"__KEY_PREFIX__", s.store.KeyPrefix, + "__STREAM_TTL__", strconv.Itoa(int(s.store.StreamingTTL.Seconds())), + "__USERS_PER_TICK__", strconv.Itoa(s.worker.usersPerTick), + "__BATCH_FIELDS_JSON__", string(batchFieldsJSON), + "__STREAM_FIELDS_JSON__", string(streamFieldsJSON), + ).Replace(htmlTemplate) +} + +// RunDemoServer parses CLI flags, opens a Redis client, seeds the +// store, starts the streaming worker, and serves HTTP. Intended to be +// called from cmd/demo_server/main.go. +func RunDemoServer(args []string) error { + fs := flag.NewFlagSet("demo_server", flag.ExitOnError) + host := fs.String("host", "127.0.0.1", "HTTP bind host") + port := fs.Int("port", 8087, "HTTP bind port") + redisAddr := fs.String("redis-addr", "localhost:6379", "Redis host:port") + keyPrefix := fs.String("key-prefix", "fs:user:", "Hash key prefix") + batchTTLSeconds := fs.Int("batch-ttl-seconds", 24*60*60, "Key-level TTL on bulk-loaded users") + streamingTTLSeconds := fs.Int("streaming-ttl-seconds", 5*60, "Per-field TTL on streaming features") + usersPerTick := fs.Int("users-per-tick", 5, "Streaming users per tick") + seedUsers := fs.Int("seed-users", 200, "Users to materialize on startup") + noReset := fs.Bool("no-reset", false, "Keep any existing data under --key-prefix on startup") + if err := fs.Parse(args); err != nil { + return err + } + + ctx := context.Background() + rdb := redis.NewClient(&redis.Options{Addr: *redisAddr}) + defer rdb.Close() + + store := NewFeatureStore(rdb, *keyPrefix, + time.Duration(*batchTTLSeconds)*time.Second, + time.Duration(*streamingTTLSeconds)*time.Second) + worker := NewStreamingWorker(store, time.Second, *usersPerTick, 1337) + demo := NewFeatureStoreDemo(store, worker, 42) + + if !*noReset { + fmt.Printf("Dropping any existing users under '%s*' for a clean demo run (pass --no-reset to keep them).\n", *keyPrefix) + if _, err := store.Reset(ctx); err != nil { + return fmt.Errorf("reset on start: %w", err) + } + store.ResetStats() + } + seeded, _, 
err := demo.Materialize(ctx, *seedUsers, store.BatchTTL) + if err != nil { + return fmt.Errorf("seed materialize: %w", err) + } + + worker.Start(ctx) + defer worker.Stop() + + srv := &httpServer{store: store, worker: worker, demo: demo} + addr := fmt.Sprintf("%s:%d", *host, *port) + hs := &http.Server{Addr: addr, Handler: srv.handler()} + + fmt.Printf("Redis feature-store demo server listening on http://%s\n", addr) + fmt.Printf("Using Redis at %s with key prefix '%s' (batch TTL %ds, streaming TTL %ds)\n", + *redisAddr, *keyPrefix, *batchTTLSeconds, *streamingTTLSeconds) + fmt.Printf("Materialized %d user(s); streaming worker running.\n", seeded) + + if err := hs.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.Fatalf("listen: %v", err) + } + return nil +} + +const htmlTemplate = ` + + + + + Redis Feature Store Demo (Go) + + + +
+
go-redis + Go standard net/http
+

Redis Feature Store Demo

+

+ A small fraud-scoring feature store. Each user is one Redis hash + at __KEY_PREFIX__{id} with a batch-materialized + batch half (daily aggregates, + 24-hour key-level EXPIRE) and a streaming + streaming half (real-time + signals, __STREAM_TTL__s per-field HEXPIRE). + Inference reads any subset with one HMGET; batch + scoring pipelines HMGET across N users. +

+ +
+
+

Store state

+
Loading...
+
+ +
+

Materialize batch features

+

Calls HSET + EXPIRE for each user + through one go-redis Pipeline, so the whole + batch ships in one round trip.

+ + + + +

+ Drop the TTL to e.g. 30 s and watch entities disappear on + schedule — the same thing that happens if a daily refresher + fails. +

+ + +
+ +
+

Streaming worker

+

Picks __USERS_PER_TICK__ users per tick, writes the + streaming features, applies HEXPIRE + __STREAM_TTL__s per field. Pause it and the + streaming fields drop out via per-field TTL while the batch + fields stay populated.

+
+ +
+ +
+

Inference read (HMGET)

+

Pick a user and a feature subset. One HMGET + round trip returns whatever the model needs.

+
+
+ + +
+
+ + +
+
+

Feature subset

+

+ Tick to include in the HMGET. Per-field TTL is + shown next to each field in the result table. +

+
+
+

Pick a user and click Read features.

+
+
+ +
+

Batch scoring

+

Pipelined HMGET across N random users via go-redis + Pipeline.Exec. One network round trip for the + whole batch.

+ + + +
+

(no batch read yet)

+
+
+ +
+

Inspect one user

+

HGETALL plus per-field HTTL and + key-level TTL. Useful for spotting which + streaming fields have already expired.

+ + + +
+

(pick a user and click Inspect)

+
+
+
+ +
+
+ + + + +` diff --git a/content/develop/use-cases/feature-store/go/feature_store.go b/content/develop/use-cases/feature-store/go/feature_store.go new file mode 100644 index 0000000000..437148f296 --- /dev/null +++ b/content/develop/use-cases/feature-store/go/feature_store.go @@ -0,0 +1,495 @@ +// Package featurestore is a Redis online feature store backed by per-entity +// Hashes. +// +// Each entity (here, a user) lives at a deterministic key such as +// "fs:user:{id}". The hash holds every feature for that entity as one +// field per feature — batch-materialized aggregates (refreshed on a +// daily cycle) alongside streaming-updated signals (refreshed every +// few seconds). One HMGET returns whichever subset the model needs in +// one network round trip. +// +// Two TTL layers solve the *mixed staleness* problem: +// +// - A key-level EXPIRE aligned with the batch materialization cycle +// causes the whole entity to disappear if its batch refresher +// fails, so inference sees a missing entity (which the model +// handler can detect and fall back on) rather than silently +// outdated values. +// - A per-field HEXPIRE on each streaming field gives that field +// its own shorter expiry, independent of the rest of the hash. +// When the streaming pipeline stops updating a field, the field +// self-cleans while the rest of the entity stays populated. +// +// HEXPIRE and HTTL require Redis 7.4 or later. The go-redis v9 client +// exposes them as HExpire and HTTL on *redis.Client. +// +// Concurrency is by construction: Redis is single-threaded per shard, +// so overlapping HSET calls from a batch job and a streaming worker +// on the same entity hash are applied atomically without locks or +// version columns. 
+package featurestore + +import ( + "context" + "errors" + "fmt" + "sort" + "strconv" + "sync/atomic" + "time" + + "github.com/redis/go-redis/v9" +) + +// FeatureValue is the concrete type a single feature may take before +// it gets serialized as a Redis hash field. Hash field values are +// strings on the wire; the helper renders these types into strings +// via encode() so booleans round-trip cleanly through redis-cli. +type FeatureValue any + +// FeatureMap is the set of fields written for one entity. +type FeatureMap map[string]FeatureValue + +// DefaultBatchFields is the schema bulk-loaded once per batch cycle. +var DefaultBatchFields = []string{ + "country_iso", + "risk_segment", + "account_age_days", + "tx_count_7d", + "avg_amount_30d", + "chargeback_count_180d", +} + +// DefaultStreamingFields is the schema updated by the streaming worker +// with a per-field HEXPIRE so each field self-expires when its +// upstream pipeline stops. +var DefaultStreamingFields = []string{ + "last_login_ts", + "last_device_id", + "tx_count_5m", + "failed_logins_15m", + "session_country", +} + +// Stats holds the helper's in-process counters. Read with FeatureStore.Stats. +type Stats struct { + BatchWritesTotal int64 `json:"batch_writes_total"` + StreamingWritesTotal int64 `json:"streaming_writes_total"` + ReadsTotal int64 `json:"reads_total"` + ReadFieldsTotal int64 `json:"read_fields_total"` +} + +// FeatureStore wraps a *redis.Client and exposes the four feature-store +// paths: batch ingest (BulkLoad), streaming ingest (UpdateStreaming), +// inference read (GetFeatures), and batch scoring (BatchGetFeatures). +type FeatureStore struct { + rdb *redis.Client + KeyPrefix string + BatchTTL time.Duration + StreamingTTL time.Duration + + batchWritesTotal atomic.Int64 + streamingWritesTotal atomic.Int64 + readsTotal atomic.Int64 + readFieldsTotal atomic.Int64 +} + +// NewFeatureStore returns a FeatureStore backed by rdb. 
Defaults match +// the Python and Node.js demos: a 24-hour key-level TTL and a 5-minute +// per-field streaming TTL. +func NewFeatureStore(rdb *redis.Client, keyPrefix string, batchTTL, streamingTTL time.Duration) *FeatureStore { + if keyPrefix == "" { + keyPrefix = "fs:user:" + } + if batchTTL == 0 { + batchTTL = 24 * time.Hour + } + if streamingTTL == 0 { + streamingTTL = 5 * time.Minute + } + return &FeatureStore{ + rdb: rdb, + KeyPrefix: keyPrefix, + BatchTTL: batchTTL, + StreamingTTL: streamingTTL, + } +} + +// KeyFor returns the Redis key for an entity ID. +func (fs *FeatureStore) KeyFor(entityID string) string { + return fs.KeyPrefix + entityID +} + +// ------------------------------------------------------------------- +// Batch ingestion (materialization) +// ------------------------------------------------------------------- + +// BulkLoad materializes a batch of entities into Redis. rows is +// keyed by entity ID. One HSET plus one EXPIRE per entity, batched +// through go-redis's Pipeline so the whole batch ships in a single +// round trip. The key-level EXPIRE is what makes the entity +// disappear if a future batch run fails — inference reads the +// missing entity rather than silently outdated values. +func (fs *FeatureStore) BulkLoad(ctx context.Context, rows map[string]FeatureMap, ttl time.Duration) (int, error) { + if ttl == 0 { + ttl = fs.BatchTTL + } + if len(rows) == 0 { + return 0, nil + } + + pipe := fs.rdb.Pipeline() + for entityID, fields := range rows { + key := fs.KeyFor(entityID) + encoded := make(map[string]any, len(fields)) + for name, value := range fields { + encoded[name] = encode(value) + } + pipe.HSet(ctx, key, encoded) + pipe.Expire(ctx, key, ttl) + } + if _, err := pipe.Exec(ctx); err != nil { + return 0, fmt.Errorf("bulk load: %w", err) + } + fs.batchWritesTotal.Add(int64(len(rows))) + return len(rows), nil +} + +// UpdateBatchFeature overwrites one batch feature without touching +// the key TTL. 
Used by the demo's "manually refresh one user" lever; +// real pipelines flow through BulkLoad. +func (fs *FeatureStore) UpdateBatchFeature(ctx context.Context, entityID, field string, value FeatureValue) error { + if err := fs.rdb.HSet(ctx, fs.KeyFor(entityID), field, encode(value)).Err(); err != nil { + return err + } + fs.batchWritesTotal.Add(1) + return nil +} + +// ------------------------------------------------------------------- +// Streaming ingestion +// ------------------------------------------------------------------- + +// UpdateStreaming writes streaming features with a per-field TTL. +// +// Each field carries its own HEXPIRE so it self-expires +// independently of the rest of the hash. If the streaming pipeline +// stops, the streaming fields drop out while the batch-materialized +// fields remain populated under their longer key-level EXPIRE. +// +// HEXPIRE returns one status code per field: +// +// - 1: TTL set / updated +// - 2: the expiry was 0 or in the past, so Redis deleted the field +// instead of applying a TTL +// - 0: an NX | XX | GT | LT conditional flag was specified and not +// met (we never use one here) +// - -2: no such field, or no such key +// +// Since we just HSET every field on the same call, any code other +// than 1 means the per-field TTL invariant did not hold — the +// mixed-staleness story relies on every streaming field carrying a +// fresh TTL after the write, so failure is loud. +func (fs *FeatureStore) UpdateStreaming(ctx context.Context, entityID string, fields FeatureMap, ttl time.Duration) error { + if len(fields) == 0 { + return nil + } + if ttl == 0 { + ttl = fs.StreamingTTL + } + key := fs.KeyFor(entityID) + encoded := make(map[string]any, len(fields)) + names := make([]string, 0, len(fields)) + for name, value := range fields { + encoded[name] = encode(value) + names = append(names, name) + } + + pipe := fs.rdb.Pipeline() + pipe.HSet(ctx, key, encoded) + hexpireCmd := pipe.HExpire(ctx, key, ttl, names...) 
+ if _, err := pipe.Exec(ctx); err != nil { + return fmt.Errorf("update streaming: %w", err) + } + codes, err := hexpireCmd.Result() + if err != nil { + return fmt.Errorf("update streaming: HEXPIRE: %w", err) + } + for _, code := range codes { + if code != 1 { + return fmt.Errorf("HEXPIRE did not set every field TTL for %s: %v", key, codes) + } + } + fs.streamingWritesTotal.Add(int64(len(fields))) + return nil +} + +// ------------------------------------------------------------------- +// Inference reads +// ------------------------------------------------------------------- + +// GetFeatures returns a subset of features for one entity. Pass +// fieldNames=nil to fetch the full hash with HGETALL — useful for +// debugging but rarely the right call on the request path, where the +// model knows exactly which features it consumes. +func (fs *FeatureStore) GetFeatures(ctx context.Context, entityID string, fieldNames []string) (map[string]string, error) { + key := fs.KeyFor(entityID) + if fieldNames == nil { + out, err := fs.rdb.HGetAll(ctx, key).Result() + if err != nil { + return nil, err + } + fs.readsTotal.Add(1) + fs.readFieldsTotal.Add(int64(len(out))) + return out, nil + } + if len(fieldNames) == 0 { + return map[string]string{}, nil + } + values, err := fs.rdb.HMGet(ctx, key, fieldNames...).Result() + if err != nil { + return nil, err + } + out := make(map[string]string, len(fieldNames)) + for i, name := range fieldNames { + if values[i] == nil { + continue + } + s, ok := values[i].(string) + if !ok { + continue + } + out[name] = s + } + fs.readsTotal.Add(1) + fs.readFieldsTotal.Add(int64(len(out))) + return out, nil +} + +// BatchGetFeatures pipelines HMGET across many entities for batch +// scoring. Returns one map per entity ID, in input order. 
+func (fs *FeatureStore) BatchGetFeatures(ctx context.Context, entityIDs, fieldNames []string) (map[string]map[string]string, error) {
+	if len(entityIDs) == 0 || len(fieldNames) == 0 {
+		return map[string]map[string]string{}, nil
+	}
+
+	// Queue one HMGET per entity, then ship the whole batch with Exec.
+	batch := fs.rdb.Pipeline()
+	pending := make([]*redis.SliceCmd, 0, len(entityIDs))
+	for _, entityID := range entityIDs {
+		pending = append(pending, batch.HMGet(ctx, fs.KeyFor(entityID), fieldNames...))
+	}
+	_, execErr := batch.Exec(ctx)
+	if execErr != nil && !errors.Is(execErr, redis.Nil) {
+		return nil, fmt.Errorf("batch get features: %w", execErr)
+	}
+
+	result := make(map[string]map[string]string, len(entityIDs))
+	fieldsSeen := int64(0)
+	for idx, cmd := range pending {
+		entityID := entityIDs[idx]
+		raw, err := cmd.Result()
+		if err != nil {
+			return nil, fmt.Errorf("batch get features: %s: %w", entityID, err)
+		}
+		features := make(map[string]string, len(fieldNames))
+		for pos, field := range fieldNames {
+			// An absent field comes back as nil, which fails the
+			// string assertion and is left out of the row.
+			text, isString := raw[pos].(string)
+			if !isString {
+				continue
+			}
+			features[field] = text
+			fieldsSeen++
+		}
+		result[entityID] = features
+	}
+	fs.readsTotal.Add(int64(len(entityIDs)))
+	fs.readFieldsTotal.Add(fieldsSeen)
+	return result, nil
+}
+
+// -------------------------------------------------------------------
+// TTL inspection (used by the demo UI)
+// -------------------------------------------------------------------
+
+// KeyTTLSeconds reports how long the entity key has left to live, in
+// whole seconds. A positive value is the remaining TTL, -1 means the
+// key has no key-level TTL, and -2 means the key does not exist.
+func (fs *FeatureStore) KeyTTLSeconds(ctx context.Context, entityID string) (int64, error) {
+	remaining, err := fs.rdb.TTL(ctx, fs.KeyFor(entityID)).Result()
+	switch {
+	case err != nil:
+		return 0, err
+	case remaining < 0:
+		// go-redis hands back the -1 / -2 sentinels as raw
+		// time.Duration values (nanoseconds, not seconds), so pass
+		// them through without unit conversion.
+		return int64(remaining), nil
+	default:
+		return int64(remaining.Seconds()), nil
+	}
+}
+
+// FieldTTLsSeconds returns the per-field TTL for each named field
+// via HTTL. 
Each value mirrors the TTL convention: positive means +// seconds remaining, -1 means the field has no TTL set, -2 means +// the field doesn't exist on this hash (or the key itself is +// missing). +func (fs *FeatureStore) FieldTTLsSeconds(ctx context.Context, entityID string, fieldNames []string) (map[string]int64, error) { + if len(fieldNames) == 0 { + return map[string]int64{}, nil + } + codes, err := fs.rdb.HTTL(ctx, fs.KeyFor(entityID), fieldNames...).Result() + if err != nil { + return nil, err + } + // HTTL on a missing key returns an array of -2s, one per field, so + // the loop below produces the same shape as a present-but-empty + // hash would. No defensive shim needed for this client. + out := make(map[string]int64, len(fieldNames)) + for i, name := range fieldNames { + if i < len(codes) { + out[name] = codes[i] + } else { + out[name] = -2 + } + } + return out, nil +} + +// ------------------------------------------------------------------- +// Demo housekeeping +// ------------------------------------------------------------------- + +// ListEntityIDs returns up to limit entity IDs by scanning +// keyPrefix*. SCAN is non-blocking and is used to populate UI +// dropdowns, not as a serving primitive. The result is sorted. +func (fs *FeatureStore) ListEntityIDs(ctx context.Context, limit int64) ([]string, error) { + if limit <= 0 { + limit = 200 + } + pattern := fs.KeyPrefix + "*" + prefixLen := len(fs.KeyPrefix) + ids := make([]string, 0, limit) + iter := fs.rdb.Scan(ctx, 0, pattern, 200).Iterator() + for iter.Next(ctx) { + k := iter.Val() + if len(k) <= prefixLen { + continue + } + ids = append(ids, k[prefixLen:]) + if int64(len(ids)) >= limit { + break + } + } + if err := iter.Err(); err != nil { + return nil, err + } + sort.Strings(ids) + return ids, nil +} + +// CountEntities returns the true count of entities under the key +// prefix. 
Iterates SCAN without an in-memory cap so the UI can report +// the real total even when more keys exist than the dropdown lists. +func (fs *FeatureStore) CountEntities(ctx context.Context) (int64, error) { + var n int64 + pattern := fs.KeyPrefix + "*" + iter := fs.rdb.Scan(ctx, 0, pattern, 500).Iterator() + for iter.Next(ctx) { + n++ + } + if err := iter.Err(); err != nil { + return 0, err + } + return n, nil +} + +// DeleteEntity drops one entity by ID. Returns 1 if a key was +// deleted, 0 otherwise. +func (fs *FeatureStore) DeleteEntity(ctx context.Context, entityID string) (int64, error) { + return fs.rdb.Del(ctx, fs.KeyFor(entityID)).Result() +} + +// Reset drops every entity under the key prefix. Used by the demo +// reset path. Scans in batches and issues one variadic DEL per batch, +// so a large demo dataset doesn't land on the server as one giant +// synchronous delete. +func (fs *FeatureStore) Reset(ctx context.Context) (int64, error) { + var deleted int64 + pattern := fs.KeyPrefix + "*" + batch := make([]string, 0, 500) + flush := func() error { + if len(batch) == 0 { + return nil + } + n, err := fs.rdb.Del(ctx, batch...).Result() + if err != nil { + return err + } + deleted += n + batch = batch[:0] + return nil + } + iter := fs.rdb.Scan(ctx, 0, pattern, 500).Iterator() + for iter.Next(ctx) { + batch = append(batch, iter.Val()) + if len(batch) >= 500 { + if err := flush(); err != nil { + return deleted, err + } + } + } + if err := iter.Err(); err != nil { + return deleted, err + } + if err := flush(); err != nil { + return deleted, err + } + return deleted, nil +} + +// Stats returns a snapshot of the in-process counters. +func (fs *FeatureStore) Stats() Stats { + return Stats{ + BatchWritesTotal: fs.batchWritesTotal.Load(), + StreamingWritesTotal: fs.streamingWritesTotal.Load(), + ReadsTotal: fs.readsTotal.Load(), + ReadFieldsTotal: fs.readFieldsTotal.Load(), + } +} + +// ResetStats zeroes every counter. 
+func (fs *FeatureStore) ResetStats() { + fs.batchWritesTotal.Store(0) + fs.streamingWritesTotal.Store(0) + fs.readsTotal.Store(0) + fs.readFieldsTotal.Store(0) +} + +// encode renders a feature value as a string for hash storage. +// Booleans become "true" / "false" so they round-trip cleanly through +// other clients and redis-cli. +func encode(value FeatureValue) string { + switch v := value.(type) { + case nil: + return "" + case string: + return v + case bool: + if v { + return "true" + } + return "false" + case int: + return strconv.FormatInt(int64(v), 10) + case int32: + return strconv.FormatInt(int64(v), 10) + case int64: + return strconv.FormatInt(v, 10) + case float32: + return strconv.FormatFloat(float64(v), 'f', -1, 32) + case float64: + return strconv.FormatFloat(v, 'f', -1, 64) + default: + return fmt.Sprintf("%v", v) + } +} + diff --git a/content/develop/use-cases/feature-store/go/go.mod b/content/develop/use-cases/feature-store/go/go.mod new file mode 100644 index 0000000000..ee884c7a20 --- /dev/null +++ b/content/develop/use-cases/feature-store/go/go.mod @@ -0,0 +1,11 @@ +module featurestore + +go 1.21 + +require github.com/redis/go-redis/v9 v9.18.0 + +require ( + github.com/cespare/xxhash/v2 v2.3.0 // indirect + github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect + go.uber.org/atomic v1.11.0 // indirect +) diff --git a/content/develop/use-cases/feature-store/go/go.sum b/content/develop/use-cases/feature-store/go/go.sum new file mode 100644 index 0000000000..e25b1f4d0a --- /dev/null +++ b/content/develop/use-cases/feature-store/go/go.sum @@ -0,0 +1,22 @@ +github.com/bsm/ginkgo/v2 v2.12.0 h1:Ny8MWAHyOepLGlLKYmXG4IEkioBysk6GpaRTLC8zwWs= +github.com/bsm/ginkgo/v2 v2.12.0/go.mod h1:SwYbGRRDovPVboqFv0tPTcG1sN61LM1Z4ARdbAV9g4c= +github.com/bsm/gomega v1.27.10 h1:yeMWxP2pV2fG3FgAODIY8EiRE3dy0aeFYt4l7wh6yKA= +github.com/bsm/gomega v1.27.10/go.mod h1:JyEr/xRbxbtgWNi8tIEVPUYZ5Dzef52k01W3YH0H+O0= +github.com/cespare/xxhash/v2 
v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= +github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= +github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f h1:lO4WD4F/rVNCu3HqELle0jiPLLBs70cWOduZpkS1E78= +github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f/go.mod h1:cuUVRXasLTGF7a8hSLbxyZXjz+1KgoB3wDUb6vlszIc= +github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= +github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= +github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/redis/go-redis/v9 v9.18.0 h1:pMkxYPkEbMPwRdenAzUNyFNrDgHx9U+DrBabWNfSRQs= +github.com/redis/go-redis/v9 v9.18.0/go.mod h1:k3ufPphLU5YXwNTUcCRXGxUoF1fqxnhFQmscfkCoDA0= +github.com/stretchr/testify v1.3.0 h1:TivCn/peBQ7UY8ooIcPgZFpTNSz0Q2U6UrFlUfqbe0Q= +github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= +github.com/zeebo/xxh3 v1.0.2 h1:xZmwmqxHZA8AI603jOQ0tMqmBr9lPeFwGg6d+xy9DC0= +github.com/zeebo/xxh3 v1.0.2/go.mod h1:5NWz9Sef7zIDm2JHfFlcQvNekmcEl9ekUZQQKCYaDcA= +go.uber.org/atomic v1.11.0 h1:ZvwS0R+56ePWxUNi+Atn9dWONBPp/AUETXlHW0DxSjE= +go.uber.org/atomic v1.11.0/go.mod h1:LUxbIzbOniOlMKjJjyPfpl4v+PKK2cNJn91OQbhoJI0= diff --git a/content/develop/use-cases/feature-store/go/streaming_worker.go b/content/develop/use-cases/feature-store/go/streaming_worker.go new file mode 100644 index 0000000000..5e350899e3 --- /dev/null +++ b/content/develop/use-cases/feature-store/go/streaming_worker.go @@ -0,0 +1,231 @@ +package featurestore + +import ( + "context" + "fmt" + "log" + "math/rand" + "sync" + "sync/atomic" + "time" +) 
+ +// Streaming feature updater for the demo. +// +// Stands in for whatever Flink, Kafka Streams, or bespoke service +// computes the real-time features in a real deployment. In production +// this code lives in the streaming layer; here it runs as a goroutine +// next to the demo server so the page can start, pause, and resume it +// from the UI. +// +// Every tick the worker picks a few random users and writes a new +// value for each streaming feature, with a per-field HEXPIRE so the +// field self-expires if the worker is paused. Pause the worker for +// longer than StreamingTTL and the streaming fields drop out of the +// hash while the batch fields remain populated under the longer +// key-level TTL — the *mixed staleness* story made visible. + +var ( + deviceIDs = []string{"ios-1a4c", "ios-9f02", "and-7b21", "and-2d18", "web-chr-1", "web-saf-1", "web-ff-2"} + sessionCountries = []string{"US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL"} + failedLoginBuckets = []int{0, 1, 2, 5} + failedLoginWeights = []int{70, 20, 8, 2} +) + +// WorkerStats is the JSON-friendly view of a StreamingWorker's +// state. The demo UI polls this every refresh. +type WorkerStats struct { + Running bool `json:"running"` + Paused bool `json:"paused"` + TickCount int64 `json:"tick_count"` + WritesCount int64 `json:"writes_count"` +} + +// StreamingWorker writes random streaming features on a tick. +type StreamingWorker struct { + store *FeatureStore + tick time.Duration + usersPerTick int + rng *rand.Rand + rngMu sync.Mutex + + running atomic.Bool + paused atomic.Bool + tickInFlight atomic.Bool + tickCount atomic.Int64 + writesCount atomic.Int64 + stopCh chan struct{} + doneCh chan struct{} +} + +// NewStreamingWorker constructs a worker that touches usersPerTick +// users every tick. 
+func NewStreamingWorker(store *FeatureStore, tick time.Duration, usersPerTick int, seed int64) *StreamingWorker { + if tick == 0 { + tick = time.Second + } + if usersPerTick == 0 { + usersPerTick = 5 + } + return &StreamingWorker{ + store: store, + tick: tick, + usersPerTick: usersPerTick, + rng: rand.New(rand.NewSource(seed)), + } +} + +// Start launches the goroutine that ticks. Safe to call when the +// worker is already running (no-op in that case). +func (w *StreamingWorker) Start(ctx context.Context) { + if !w.running.CompareAndSwap(false, true) { + return + } + w.paused.Store(false) + w.stopCh = make(chan struct{}) + w.doneCh = make(chan struct{}) + go w.run(ctx) +} + +// Stop signals the worker to exit and waits for any in-flight tick +// to settle. Safe to call multiple times. +func (w *StreamingWorker) Stop() { + if !w.running.CompareAndSwap(true, false) { + return + } + close(w.stopCh) + <-w.doneCh +} + +// Pause prevents new ticks from running. An already-running tick is +// not interrupted; use WaitForIdle to wait for it. +func (w *StreamingWorker) Pause() { w.paused.Store(true) } + +// Resume re-enables ticks. +func (w *StreamingWorker) Resume() { w.paused.Store(false) } + +// IsPaused returns whether the worker is paused. +func (w *StreamingWorker) IsPaused() bool { return w.paused.Load() } + +// IsRunning returns whether the goroutine is active. +func (w *StreamingWorker) IsRunning() bool { return w.running.Load() } + +// WaitForIdle blocks until any in-flight tick has finished its +// current updateStreaming loop. Pause() only stops *future* ticks +// from running — it does not interrupt one that is already +// mid-flight. Callers that need a quiesced worker (a reset that's +// about to DEL every entity, for example) must Pause() AND +// WaitForIdle() before they touch state the tick might still be +// writing to. 
+func (w *StreamingWorker) WaitForIdle() { + for w.tickInFlight.Load() { + time.Sleep(20 * time.Millisecond) + } +} + +// Stats returns a snapshot of the worker's counters and state. +func (w *StreamingWorker) Stats() WorkerStats { + return WorkerStats{ + Running: w.IsRunning(), + Paused: w.IsPaused(), + TickCount: w.tickCount.Load(), + WritesCount: w.writesCount.Load(), + } +} + +// ResetStats zeroes the tick and writes counters. +func (w *StreamingWorker) ResetStats() { + w.tickCount.Store(0) + w.writesCount.Store(0) +} + +func (w *StreamingWorker) run(ctx context.Context) { + defer close(w.doneCh) + t := time.NewTicker(w.tick) + defer t.Stop() + for { + select { + case <-w.stopCh: + return + case <-ctx.Done(): + return + case <-t.C: + if w.paused.Load() { + continue + } + w.tickInFlight.Store(true) + if err := w.doTick(ctx); err != nil { + log.Printf("[streaming-worker] tick failed: %v", err) + } + w.tickInFlight.Store(false) + } + } +} + +func (w *StreamingWorker) doTick(ctx context.Context) error { + ids, err := w.store.ListEntityIDs(ctx, 500) + if err != nil { + return fmt.Errorf("list entity ids: %w", err) + } + if len(ids) == 0 { + return nil + } + + w.rngMu.Lock() + n := w.usersPerTick + if n > len(ids) { + n = len(ids) + } + chosen := w.rng.Perm(len(ids))[:n] + picks := make([]string, n) + for i, idx := range chosen { + picks[i] = ids[idx] + } + w.rngMu.Unlock() + + nowMs := time.Now().UnixMilli() + for _, id := range picks { + fields := FeatureMap{ + "last_login_ts": nowMs, + "last_device_id": w.choice(deviceIDs), + "tx_count_5m": w.intn(13), + "failed_logins_15m": w.weightedInt(failedLoginBuckets, failedLoginWeights), + "session_country": w.choice(sessionCountries), + } + if err := w.store.UpdateStreaming(ctx, id, fields, 0); err != nil { + return fmt.Errorf("update streaming for %s: %w", id, err) + } + w.writesCount.Add(int64(len(fields))) + } + w.tickCount.Add(1) + return nil +} + +func (w *StreamingWorker) choice(items []string) string { + 
w.rngMu.Lock() + defer w.rngMu.Unlock() + return items[w.rng.Intn(len(items))] +} + +func (w *StreamingWorker) intn(n int) int { + w.rngMu.Lock() + defer w.rngMu.Unlock() + return w.rng.Intn(n) +} + +func (w *StreamingWorker) weightedInt(items []int, weights []int) int { + w.rngMu.Lock() + defer w.rngMu.Unlock() + total := 0 + for _, x := range weights { + total += x + } + r := w.rng.Intn(total) + for i, x := range weights { + r -= x + if r < 0 { + return items[i] + } + } + return items[len(items)-1] +} diff --git a/content/develop/use-cases/feature-store/java-jedis/BuildFeatures.java b/content/develop/use-cases/feature-store/java-jedis/BuildFeatures.java new file mode 100644 index 0000000000..670a07cfae --- /dev/null +++ b/content/develop/use-cases/feature-store/java-jedis/BuildFeatures.java @@ -0,0 +1,113 @@ +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +import redis.clients.jedis.JedisPool; + +/** + * Synthesize a small batch of users with realistic-looking features + * and bulk-load them into Redis with a 24-hour key-level TTL. + * + *

Stands in for the nightly Spark / Feast materialization job in a + * real deployment. In production the equivalent of this script lives + * in an offline pipeline that reads from the offline store and writes + * the serving-time hashes into Redis via {@code HSET} + {@code EXPIRE}.

+ * + *

Run with: {@code mvn exec:java -Dexec.mainClass=BuildFeatures -Dexec.args="--count 500"}

+ */ +public class BuildFeatures { + + private static final List COUNTRY_CHOICES = List.of( + "US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL"); + private static final List RISK_SEGMENTS = List.of("low", "medium", "high"); + private static final int[] RISK_WEIGHTS = {70, 25, 5}; + private static final int[] CHARGEBACK_BUCKETS = {0, 1, 2, 3}; + private static final int[] CHARGEBACK_WEIGHTS = {85, 10, 4, 1}; + + /** + * Generate {@code count} synthetic user feature rows. The shape + * mirrors a small fraud-scoring feature set: country and risk + * segment as TAG-like categorical features, plus a few numeric + * aggregates over recent windows. + */ + public static Map> synthesizeUsers(int count, long seed) { + Random rng = new Random(seed); + Map> users = new LinkedHashMap<>(count); + for (int i = 1; i <= count; i++) { + String uid = String.format("u%04d", i); + Map row = new LinkedHashMap<>(); + row.put("country_iso", COUNTRY_CHOICES.get(rng.nextInt(COUNTRY_CHOICES.size()))); + row.put("risk_segment", weightedChoice(rng, RISK_SEGMENTS, RISK_WEIGHTS)); + row.put("account_age_days", 7 + rng.nextInt(2394)); + row.put("tx_count_7d", rng.nextInt(81)); + row.put("avg_amount_30d", Math.round((5.0 + rng.nextDouble() * 345.0) * 100.0) / 100.0); + row.put("chargeback_count_180d", weightedChoiceInt(rng, CHARGEBACK_BUCKETS, CHARGEBACK_WEIGHTS)); + users.put(uid, row); + } + return users; + } + + public static void main(String[] args) { + String redisHost = "localhost"; + int redisPort = 6379; + int count = 200; + long ttlSeconds = 24L * 60L * 60L; + String keyPrefix = "fs:user:"; + long seed = 42L; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--redis-host" -> redisHost = args[++i]; + case "--redis-port" -> redisPort = Integer.parseInt(args[++i]); + case "--count" -> count = Integer.parseInt(args[++i]); + case "--ttl-seconds" -> ttlSeconds = Long.parseLong(args[++i]); + case "--key-prefix" -> keyPrefix = args[++i]; + case "--seed" -> seed = 
Long.parseLong(args[++i]); + case "-h", "--help" -> { + System.out.println( + "Usage: mvn exec:java -Dexec.mainClass=BuildFeatures " + + "-Dexec.args=\"[--redis-host H] [--redis-port P] " + + "[--count N] [--ttl-seconds S] [--key-prefix PREFIX] [--seed N]\""); + return; + } + default -> { + System.err.println("Unknown argument: " + args[i]); + System.exit(2); + } + } + } + + try (JedisPool pool = new JedisPool(redisHost, redisPort)) { + FeatureStore store = new FeatureStore(pool, keyPrefix, ttlSeconds, + FeatureStore.DEFAULT_STREAMING_TTL_SECONDS); + Map> rows = synthesizeUsers(count, seed); + int loaded = store.bulkLoad(rows, ttlSeconds); + System.out.printf( + "Materialized %d users at %s* with a %ds key-level TTL.%n", + loaded, keyPrefix, ttlSeconds); + } + } + + private static String weightedChoice(Random rng, List items, int[] weights) { + int total = 0; + for (int w : weights) total += w; + int r = rng.nextInt(total); + for (int i = 0; i < items.size(); i++) { + r -= weights[i]; + if (r < 0) return items.get(i); + } + return items.get(items.size() - 1); + } + + private static int weightedChoiceInt(Random rng, int[] items, int[] weights) { + int total = 0; + for (int w : weights) total += w; + int r = rng.nextInt(total); + for (int i = 0; i < items.length; i++) { + r -= weights[i]; + if (r < 0) return items[i]; + } + return items[items.length - 1]; + } +} diff --git a/content/develop/use-cases/feature-store/java-jedis/DemoServer.java b/content/develop/use-cases/feature-store/java-jedis/DemoServer.java new file mode 100644 index 0000000000..e56429e927 --- /dev/null +++ b/content/develop/use-cases/feature-store/java-jedis/DemoServer.java @@ -0,0 +1,1014 @@ +import com.sun.net.httpserver.HttpExchange; +import com.sun.net.httpserver.HttpHandler; +import com.sun.net.httpserver.HttpServer; + +import java.io.IOException; +import java.io.OutputStream; +import java.net.InetSocketAddress; +import java.net.URI; +import java.nio.charset.StandardCharsets; +import 
java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Executors; +import java.util.concurrent.locks.ReentrantLock; + +import redis.clients.jedis.JedisPool; +import redis.clients.jedis.JedisPoolConfig; + +/** + * Redis feature-store demo server (Jedis + JDK HttpServer). + * + *

Run with {@code mvn exec:java -Dexec.mainClass=DemoServer} and + * visit {@code http://localhost:8088} to watch an online feature + * store at work: a batch materialization loads N users with a 24-hour + * key-level TTL, a streaming worker overwrites a handful of users' + * real-time features every second with a per-field {@code HEXPIRE}, + * and the inference panel reads any subset of features for any user + * with {@code HMGET} in a single round trip.

+ */ +public class DemoServer { + + private static FeatureStore store; + private static StreamingWorker worker; + private static FeatureStoreDemo demo; + private static JedisPool jedisPool; + + public static void main(String[] args) throws Exception { + String host = "127.0.0.1"; + int port = 8088; + String redisHost = "localhost"; + int redisPort = 6379; + String keyPrefix = "fs:user:"; + long batchTtlSeconds = 24L * 60L * 60L; + long streamingTtlSeconds = 5L * 60L; + int usersPerTick = 5; + int seedUsers = 200; + boolean resetOnStart = true; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--host" -> host = args[++i]; + case "--port" -> port = Integer.parseInt(args[++i]); + case "--redis-host" -> redisHost = args[++i]; + case "--redis-port" -> redisPort = Integer.parseInt(args[++i]); + case "--key-prefix" -> keyPrefix = args[++i]; + case "--batch-ttl-seconds" -> batchTtlSeconds = Long.parseLong(args[++i]); + case "--streaming-ttl-seconds" -> streamingTtlSeconds = Long.parseLong(args[++i]); + case "--users-per-tick" -> usersPerTick = Integer.parseInt(args[++i]); + case "--seed-users" -> seedUsers = Integer.parseInt(args[++i]); + case "--no-reset" -> resetOnStart = false; + case "-h", "--help" -> { + System.out.println( + "Usage: mvn exec:java -Dexec.mainClass=DemoServer " + + "-Dexec.args=\"[--host H] [--port P] [--redis-host H] " + + "[--redis-port P] [--key-prefix PFX] " + + "[--batch-ttl-seconds S] [--streaming-ttl-seconds S] " + + "[--users-per-tick N] [--seed-users N] [--no-reset]\""); + return; + } + default -> { + System.err.println("Unknown argument: " + args[i]); + System.exit(2); + } + } + } + + JedisPoolConfig poolCfg = new JedisPoolConfig(); + poolCfg.setMaxTotal(64); + poolCfg.setMaxIdle(32); + poolCfg.setMinIdle(4); + jedisPool = new JedisPool(poolCfg, redisHost, redisPort); + + store = new FeatureStore(jedisPool, keyPrefix, + batchTtlSeconds, streamingTtlSeconds); + worker = new StreamingWorker(store, 1000L, usersPerTick, 
1337L); + demo = new FeatureStoreDemo(store, worker, 42L); + + if (resetOnStart) { + System.out.printf( + "Dropping any existing users under '%s*' for a clean demo run (pass --no-reset to keep them).%n", + keyPrefix); + store.reset(); + store.resetStats(); + } + int seeded = demo.materialize(seedUsers, batchTtlSeconds).loaded(); + worker.start(); + + HttpServer server = HttpServer.create(new InetSocketAddress(host, port), 0); + server.createContext("/", new RootHandler()); + server.createContext("/state", new StateHandler()); + server.createContext("/inspect", new InspectHandler()); + server.createContext("/bulk-load", new BulkLoadHandler()); + server.createContext("/reset", new ResetHandler()); + server.createContext("/worker/toggle", new ToggleWorkerHandler()); + server.createContext("/read", new ReadHandler()); + server.createContext("/batch-read", new BatchReadHandler()); + server.setExecutor(Executors.newFixedThreadPool(16)); + server.start(); + + System.out.printf("Redis feature-store demo server listening on http://%s:%d%n", host, port); + System.out.printf( + "Using Redis at %s:%d with key prefix '%s' (batch TTL %ds, streaming TTL %ds)%n", + redisHost, redisPort, keyPrefix, batchTtlSeconds, streamingTtlSeconds); + System.out.printf("Materialized %d user(s); streaming worker running.%n", seeded); + + Runtime.getRuntime().addShutdownHook(new Thread(() -> { + System.out.println("\nShutting down..."); + worker.stop(); + server.stop(0); + jedisPool.close(); + })); + + Thread.currentThread().join(); + } + + // --------------------------------------------------------------- + // FeatureStoreDemo wires the store and worker with the lifecycle + // operations the HTTP handlers call into. 
+ // --------------------------------------------------------------- + + static class FeatureStoreDemo { + private final FeatureStore store; + private final StreamingWorker worker; + private final long seed; + private final ReentrantLock lock = new ReentrantLock(); + + FeatureStoreDemo(FeatureStore store, StreamingWorker worker, long seed) { + this.store = store; + this.worker = worker; + this.seed = seed; + } + + public record MaterializeResult(int loaded, long ttlSeconds, double elapsedMs) {} + + public MaterializeResult materialize(int count, long ttlSeconds) { + lock.lock(); + try { + Map> rows = BuildFeatures.synthesizeUsers(count, seed); + long t0 = System.nanoTime(); + int loaded = store.bulkLoad(rows, ttlSeconds); + double elapsedMs = (System.nanoTime() - t0) / 1_000_000.0; + return new MaterializeResult(loaded, ttlSeconds, elapsedMs); + } finally { + lock.unlock(); + } + } + + public long reset() { + lock.lock(); + try { + // Pause the streaming worker around the DEL sweep so a + // concurrent tick can't recreate a user that was just + // enumerated for deletion (streaming HSET creates the + // key if it's missing, and that would leave behind a + // streaming-only hash with no key-level TTL). + // pause() only blocks *future* ticks — waitForIdle() + // flushes an already-running tick before the DEL sweep. 
+ boolean wasPaused = worker.isPaused(); + if (worker.isRunning()) { + if (!wasPaused) worker.pause(); + worker.waitForIdle(); + } + try { + long deleted = store.reset(); + store.resetStats(); + worker.resetStats(); + return deleted; + } finally { + if (worker.isRunning() && !wasPaused) worker.resume(); + } + } finally { + lock.unlock(); + } + } + + public Map toggleWorker() { + lock.lock(); + try { + if (!worker.isRunning()) worker.start(); + if (worker.isPaused()) worker.resume(); + else worker.pause(); + return Map.of( + "paused", worker.isPaused(), + "running", worker.isRunning() + ); + } finally { + lock.unlock(); + } + } + } + + // --------------------------------------------------------------- + // Handlers + // --------------------------------------------------------------- + + static class RootHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!ex.getRequestURI().getPath().equals("/") && + !ex.getRequestURI().getPath().equals("/index.html")) { + send(ex, 404, "text/plain", "Not Found"); + return; + } + send(ex, 200, "text/html; charset=utf-8", htmlPage()); + } + } + + static class StateHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"GET".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + try { + List ids = store.listEntityIds(500); + long count = store.countEntities(); + Map out = new LinkedHashMap<>(); + out.put("key_prefix", store.getKeyPrefix()); + out.put("batch_ttl_seconds", store.getBatchTtlSeconds()); + out.put("streaming_ttl_seconds", store.getStreamingTtlSeconds()); + out.put("entity_count", count); + out.put("entity_ids", ids); + out.put("stats", statsToMap(store.stats())); + out.put("worker", workerStatsToMap(worker.statsSnapshot())); + sendJson(ex, 200, out); + } catch (Exception e) { + sendJson(ex, 500, Map.of("error", e.getMessage())); + } + } + } + + static 
class InspectHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"GET".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + Map q = parseQuery(ex.getRequestURI()); + String user = q.getOrDefault("user", "").trim(); + if (user.isEmpty()) { + sendJson(ex, 400, Map.of("error", "user is required")); return; + } + try { + Map full = store.getAllFeatures(user); + long keyTTL = store.keyTtlSeconds(user); + if (full.isEmpty()) { + sendJson(ex, 200, Map.of( + "exists", false, + "key_ttl_seconds", keyTTL)); + return; + } + List names = new ArrayList<>(full.keySet()); + Map ttls = store.fieldTtlsSeconds(user, names); + Collections.sort(names); + List> fields = new ArrayList<>(names.size()); + for (String n : names) { + Map row = new LinkedHashMap<>(); + row.put("name", n); + row.put("value", full.get(n)); + row.put("ttl_seconds", ttls.getOrDefault(n, -1L)); + fields.add(row); + } + sendJson(ex, 200, Map.of( + "exists", true, + "key_ttl_seconds", keyTTL, + "fields", fields)); + } catch (Exception e) { + sendJson(ex, 500, Map.of("error", e.getMessage())); + } + } + } + + static class BulkLoadHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"POST".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + Map form = parseForm(ex); + int count = clamp(parseIntOr(form.get("count"), 200), 1, 2000); + long ttl = (long) clamp(parseIntOr(form.get("ttl"), 86400), 5, 172_800); + try { + FeatureStoreDemo.MaterializeResult r = demo.materialize(count, ttl); + sendJson(ex, 200, Map.of( + "loaded", r.loaded(), + "ttl_seconds", r.ttlSeconds(), + "elapsed_ms", r.elapsedMs())); + } catch (Exception e) { + sendJson(ex, 500, Map.of("error", e.getMessage())); + } + } + } + + static class ResetHandler implements HttpHandler { + @Override public void 
handle(HttpExchange ex) throws IOException { + if (!"POST".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + try { + long deleted = demo.reset(); + sendJson(ex, 200, Map.of("deleted", deleted)); + } catch (Exception e) { + sendJson(ex, 500, Map.of("error", e.getMessage())); + } + } + } + + static class ToggleWorkerHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"POST".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + sendJson(ex, 200, demo.toggleWorker()); + } + } + + static class ReadHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"POST".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + Map> form = parseFormMulti(ex); + String user = first(form.get("user"), "").trim(); + if (user.isEmpty()) { + sendJson(ex, 400, Map.of("error", "user is required")); return; + } + List fields = nonEmpty(form.getOrDefault("field", List.of())); + try { + long t0 = System.nanoTime(); + Map values = fields.isEmpty() + ? Collections.emptyMap() + : store.getFeatures(user, fields); + double elapsedMs = (System.nanoTime() - t0) / 1_000_000.0; + Map ttls = fields.isEmpty() + ? 
Collections.emptyMap() + : store.fieldTtlsSeconds(user, fields); + long keyTTL = store.keyTtlSeconds(user); + Map out = new LinkedHashMap<>(); + out.put("requested", fields); + out.put("values", values); + out.put("ttls", ttls); + out.put("key_ttl_seconds", keyTTL); + out.put("returned_count", values.size()); + out.put("elapsed_ms", elapsedMs); + sendJson(ex, 200, out); + } catch (Exception e) { + sendJson(ex, 500, Map.of("error", e.getMessage())); + } + } + } + + static class BatchReadHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"POST".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + Map> form = parseFormMulti(ex); + int count = clamp(parseIntOr(first(form.get("count"), "100"), 100), 1, 500); + List fields = nonEmpty(form.getOrDefault("field", List.of())); + if (fields.isEmpty()) { + fields = new ArrayList<>(FeatureStore.DEFAULT_STREAMING_FIELDS); + fields.add("risk_segment"); + } + try { + List ids = store.listEntityIds(Math.max(count * 2, 2000)); + if (ids.size() > count) ids = ids.subList(0, count); + long t0 = System.nanoTime(); + Map> rows = store.batchGetFeatures(ids, fields); + double elapsedMs = (System.nanoTime() - t0) / 1_000_000.0; + int sampleN = Math.min(10, ids.size()); + List> sample = new ArrayList<>(sampleN); + for (int i = 0; i < sampleN; i++) { + String id = ids.get(i); + Map r = new LinkedHashMap<>(); + r.put("id", id); + r.put("field_count", rows.getOrDefault(id, Collections.emptyMap()).size()); + sample.add(r); + } + Map out = new LinkedHashMap<>(); + out.put("entity_count", ids.size()); + out.put("field_count", fields.size()); + out.put("elapsed_ms", elapsedMs); + out.put("sample", sample); + sendJson(ex, 200, out); + } catch (Exception e) { + sendJson(ex, 500, Map.of("error", e.getMessage())); + } + } + } + + // --------------------------------------------------------------- + // HTTP plumbing + // 
---------------------------------------------------------------
+
+    /**
+     * Send a complete response: status, Content-Type header, and the body
+     * encoded as UTF-8. The response stream is closed before returning.
+     */
+    private static void send(HttpExchange ex, int status, String contentType, String body) throws IOException {
+        byte[] bytes = body.getBytes(StandardCharsets.UTF_8);
+        ex.getResponseHeaders().set("Content-Type", contentType);
+        // HttpExchange reads a length of 0 as "chunked, length unknown";
+        // -1 is the documented value for "no response body".
+        ex.sendResponseHeaders(status, bytes.length == 0 ? -1 : bytes.length);
+        try (OutputStream os = ex.getResponseBody()) {
+            if (bytes.length > 0) os.write(bytes);
+        }
+    }
+
+    /** Serialize {@code payload} with {@link #toJson} and send it as {@code application/json}. */
+    private static void sendJson(HttpExchange ex, int status, Object payload) throws IOException {
+        send(ex, status, "application/json", toJson(payload));
+    }
+
+    /** Render a tree of maps, lists, arrays, numbers, booleans, and strings as JSON text. */
+    private static String toJson(Object o) {
+        StringBuilder sb = new StringBuilder();
+        appendJson(sb, o);
+        return sb.toString();
+    }
+
+    /**
+     * Append the JSON form of {@code o} to {@code sb}. Maps become objects
+     * (keys rendered with {@code String.valueOf}), lists and arrays become
+     * arrays, and any other type falls back to a quoted
+     * {@code String.valueOf(o)}.
+     */
+    @SuppressWarnings("unchecked")
+    private static void appendJson(StringBuilder sb, Object o) {
+        if (o == null) { sb.append("null"); return; }
+        if (o instanceof Boolean b) { sb.append(b ? "true" : "false"); return; }
+        if (o instanceof Number n) {
+            // JSON has no literal for NaN or +/-Infinity; emitting them
+            // verbatim would make the whole document unparseable.
+            boolean nonFinite = (n instanceof Double d && !Double.isFinite(d))
+                || (n instanceof Float f && !Float.isFinite(f));
+            sb.append(nonFinite ? "null" : n.toString());
+            return;
+        }
+        if (o instanceof Map<?, ?> m) {
+            sb.append('{');
+            boolean first = true;
+            for (Map.Entry<?, ?> e : m.entrySet()) {
+                if (!first) sb.append(',');
+                first = false;
+                appendJsonString(sb, String.valueOf(e.getKey()));
+                sb.append(':');
+                appendJson(sb, e.getValue());
+            }
+            sb.append('}');
+            return;
+        }
+        if (o instanceof List<?> l) {
+            sb.append('[');
+            boolean first = true;
+            for (Object v : l) {
+                if (!first) sb.append(',');
+                first = false;
+                appendJson(sb, v);
+            }
+            sb.append(']');
+            return;
+        }
+        if (o.getClass().isArray()) {
+            // Reflection handles primitive arrays as well as Object[].
+            sb.append('[');
+            int len = java.lang.reflect.Array.getLength(o);
+            for (int i = 0; i < len; i++) {
+                if (i > 0) sb.append(',');
+                appendJson(sb, java.lang.reflect.Array.get(o, i));
+            }
+            sb.append(']');
+            return;
+        }
+        appendJsonString(sb, String.valueOf(o));
+    }
+
+    /**
+     * Append {@code s} as a quoted JSON string, escaping quotes,
+     * backslashes, and every control character below U+0020.
+     */
+    private static void appendJsonString(StringBuilder sb, String s) {
+        sb.append('"');
+        for (int i = 0; i < s.length(); i++) {
+            char c = s.charAt(i);
+            switch (c) {
+                case '"' -> sb.append("\\\"");
+                case '\\' -> sb.append("\\\\");
+                case '\n' -> sb.append("\\n");
+                case '\r' -> sb.append("\\r");
+                case '\t' -> sb.append("\\t");
+                default -> {
+                    if (c < 0x20) sb.append(String.format("\\u%04x", (int) c));
+                    else sb.append(c);
+                }
+            }
+        }
+        sb.append('"');
+    }
+
+    /**
+     * Split a URL-encoded {@code k=v&k=v} string into decoded
+     * {@code [key, value]} pairs, in input order. Pairs without an
+     * {@code =} are ignored. Pairs with a malformed percent escape are
+     * ignored too, so one bad pair from a client cannot fail the whole
+     * request with an unchecked exception.
+     */
+    private static List<String[]> decodePairs(String raw) {
+        List<String[]> pairs = new ArrayList<>();
+        if (raw == null || raw.isEmpty()) return pairs;
+        for (String pair : raw.split("&")) {
+            int eq = pair.indexOf('=');
+            if (eq < 0) continue;
+            try {
+                String k = java.net.URLDecoder.decode(pair.substring(0, eq), StandardCharsets.UTF_8);
+                String v = java.net.URLDecoder.decode(pair.substring(eq + 1), StandardCharsets.UTF_8);
+                pairs.add(new String[] {k, v});
+            } catch (IllegalArgumentException malformed) {
+                // URLDecoder rejects incomplete or non-hex % escapes; skip the pair.
+            }
+        }
+        return pairs;
+    }
+
+    /** Decode the query string of {@code uri}; the last value wins for a repeated key. */
+    private static Map<String, String> parseQuery(URI uri) {
+        Map<String, String> out = new HashMap<>();
+        for (String[] kv : decodePairs(uri.getRawQuery())) out.put(kv[0], kv[1]);
+        return out;
+    }
+
+    /** Read the request body as a URL-encoded form; the last value wins for a repeated key. */
+    private static Map<String, String> parseForm(HttpExchange ex) throws IOException {
+        byte[] body = ex.getRequestBody().readAllBytes();
+        Map<String, String> out = new HashMap<>();
+        for (String[] kv : decodePairs(new String(body, StandardCharsets.UTF_8))) out.put(kv[0], kv[1]);
+        return out;
+    }
+
+    /** Read the request body as a URL-encoded form, keeping every value of a repeated key in order. */
+    private static Map<String, List<String>> parseFormMulti(HttpExchange ex) throws IOException {
+        byte[] body = ex.getRequestBody().readAllBytes();
+        Map<String, List<String>> out = new HashMap<>();
+        for (String[] kv : decodePairs(new String(body, StandardCharsets.UTF_8))) {
+            out.computeIfAbsent(kv[0], x -> new ArrayList<>()).add(kv[1]);
+        }
+        return out;
+    }
+
+    /** First element of {@code values}, or {@code def} when the list is null or empty. */
+    private static String first(List<String> values, String def) {
+        return values == null || values.isEmpty() ? def : values.get(0);
+    }
+
+    /** Copy of {@code in} without null or empty strings, order preserved. */
+    private static List<String> nonEmpty(List<String> in) {
+        List<String> out = new ArrayList<>(in.size());
+        for (String v : in) if (v != null && !v.isEmpty()) out.add(v);
+        return out;
+    }
+
+    /** Parse {@code s} as a decimal int; return {@code def} when it is null, empty, or not a number. */
+    private static int parseIntOr(String s, int def) {
+        if (s == null || s.isEmpty()) return def;
+        try { return Integer.parseInt(s); } catch (NumberFormatException e) { return def; }
+    }
+
+    /** Restrict {@code n} to the inclusive range {@code [low, high]}. */
+    private static int clamp(int n, int low, int high) {
+        return n < low ? low : (n > high ? high : n);
+    }
+
+    /** Flatten the store counters into an insertion-ordered map for JSON output. */
+    private static Map<String, Object> statsToMap(FeatureStore.Stats s) {
+        Map<String, Object> out = new LinkedHashMap<>();
+        out.put("batch_writes_total", s.batchWritesTotal());
+        out.put("streaming_writes_total", s.streamingWritesTotal());
+        out.put("reads_total", s.readsTotal());
+        out.put("read_fields_total", s.readFieldsTotal());
+        return out;
+    }
+
+    /** Flatten the streaming-worker state into an insertion-ordered map for JSON output. */
+    private static Map<String, Object> workerStatsToMap(StreamingWorker.Stats s) {
+        Map<String, Object> out = new LinkedHashMap<>();
+        out.put("running", s.running());
+        out.put("paused", s.paused());
+        out.put("tick_count", s.tickCount());
+        out.put("writes_count", s.writesCount());
+        return out;
+    }
+
+    /** Fill the {@code __PLACEHOLDER__} tokens of the HTML template with live configuration. */
+    private static String htmlPage() {
+        return HTML_TEMPLATE
+            .replace("__KEY_PREFIX__", store.getKeyPrefix())
+            .replace("__STREAM_TTL__", Long.toString(store.getStreamingTtlSeconds()))
+            .replace("__USERS_PER_TICK__", Integer.toString(worker.getUsersPerTick()))
+            .replace("__BATCH_FIELDS_JSON__", toJson(FeatureStore.DEFAULT_BATCH_FIELDS))
+            .replace("__STREAM_FIELDS_JSON__", toJson(FeatureStore.DEFAULT_STREAMING_FIELDS));
+    }
+
+    // ---------------------------------------------------------------
+    // HTML template
+    // ---------------------------------------------------------------
+
+    private static final String HTML_TEMPLATE = """ + + + + + + Redis Feature Store Demo (Jedis) + + + +
+
Jedis + JDK com.sun.net.httpserver
+

Redis Feature Store Demo

+

+ A small fraud-scoring feature store. Each user is one Redis hash + at __KEY_PREFIX__{id} with a batch-materialized + batch half (daily aggregates, + 24-hour key-level EXPIRE) and a streaming + streaming half (real-time + signals, __STREAM_TTL__s per-field HEXPIRE). + Inference reads any subset with one HMGET; batch + scoring pipelines HMGET across N users. +

+ +
+
+

Store state

+
Loading...
+
+ +
+

Materialize batch features

+

Calls HSET + EXPIRE for each user + through one Jedis Pipeline, so the whole batch + ships in one round trip.

+ + + + +

+ Drop the TTL to e.g. 30 s and watch entities disappear on + schedule — the same thing that happens if a daily refresher + fails. +

+ + +
+ +
+

Streaming worker

+

Picks __USERS_PER_TICK__ users per tick, writes the + streaming features, applies HEXPIRE + __STREAM_TTL__s per field. Pause it and the + streaming fields drop out via per-field TTL while the batch + fields stay populated.

+
+ +
+ +
+

Inference read (HMGET)

+

Pick a user and a feature subset. One HMGET + round trip returns whatever the model needs.

+
+
+ + +
+
+ + +
+
+

Feature subset

+

+ Tick to include in the HMGET. Per-field TTL is + shown next to each field in the result table. +

+
+
+

Pick a user and click Read features.

+
+
+ +
+

Batch scoring

+

Pipelined HMGET across N random users via + Pipeline.sync(). One network round trip for the + whole batch.

+ + + +
+

(no batch read yet)

+
+
+ +
+

Inspect one user

+

HGETALL plus per-field HTTL and + key-level TTL. Useful for spotting which + streaming fields have already expired.

+ + + +
+

(pick a user and click Inspect)

+
+
+
+ +
+
+ + + + +"""; +} diff --git a/content/develop/use-cases/feature-store/java-jedis/FeatureStore.java b/content/develop/use-cases/feature-store/java-jedis/FeatureStore.java new file mode 100644 index 0000000000..019a1e1c5e --- /dev/null +++ b/content/develop/use-cases/feature-store/java-jedis/FeatureStore.java @@ -0,0 +1,450 @@ +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicLong; + +import redis.clients.jedis.Jedis; +import redis.clients.jedis.JedisPool; +import redis.clients.jedis.Pipeline; +import redis.clients.jedis.Response; + +/** + * Redis online feature store backed by per-entity Hashes. + * + *

Each entity (here, a user) lives at a deterministic key such as + * {@code fs:user:{id}}. The hash holds every feature for that entity + * as one field per feature — batch-materialized aggregates (refreshed + * on a daily cycle) alongside streaming-updated signals (refreshed + * every few seconds). One {@code HMGET} returns whichever subset the + * model needs in one network round trip.

+ * + *

Two TTL layers solve the mixed staleness problem: + *

    + *
  • A key-level {@code EXPIRE} aligned with the batch + * materialization cycle causes the whole entity to disappear + * if its batch refresher fails, so inference sees a missing + * entity (which the model handler can detect and fall back on) + * rather than silently outdated values.
  • + *
  • A per-field {@code HEXPIRE} on each streaming field gives that + * field its own shorter expiry, independent of the rest of the + * hash. When the streaming pipeline stops updating a field, the + * field self-cleans while the rest of the entity stays + * populated.
  • + *

+ * + *

{@code HEXPIRE} and {@code HTTL} require Redis 7.4 or later. + * Jedis exposes them as {@code hexpire} / {@code httl} from 5.2.

+ * + *

Concurrency is by construction: Redis is single-threaded per + * shard, so overlapping {@code HSET} calls from a batch job and a + * streaming worker on the same entity hash are applied atomically + * without locks or version columns.

+ */ +public class FeatureStore { + + /** Default batch feature schema. */ + public static final List DEFAULT_BATCH_FIELDS = List.of( + "country_iso", + "risk_segment", + "account_age_days", + "tx_count_7d", + "avg_amount_30d", + "chargeback_count_180d" + ); + + /** Default streaming feature schema. */ + public static final List DEFAULT_STREAMING_FIELDS = List.of( + "last_login_ts", + "last_device_id", + "tx_count_5m", + "failed_logins_15m", + "session_country" + ); + + public static final long DEFAULT_BATCH_TTL_SECONDS = 24L * 60L * 60L; + public static final long DEFAULT_STREAMING_TTL_SECONDS = 5L * 60L; + public static final String DEFAULT_KEY_PREFIX = "fs:user:"; + + private final JedisPool pool; + private final String keyPrefix; + private final long batchTtlSeconds; + private final long streamingTtlSeconds; + + private final AtomicLong batchWritesTotal = new AtomicLong(); + private final AtomicLong streamingWritesTotal = new AtomicLong(); + private final AtomicLong readsTotal = new AtomicLong(); + private final AtomicLong readFieldsTotal = new AtomicLong(); + + public FeatureStore(JedisPool pool) { + this(pool, DEFAULT_KEY_PREFIX, + DEFAULT_BATCH_TTL_SECONDS, + DEFAULT_STREAMING_TTL_SECONDS); + } + + public FeatureStore(JedisPool pool, String keyPrefix, + long batchTtlSeconds, long streamingTtlSeconds) { + this.pool = pool; + this.keyPrefix = keyPrefix; + this.batchTtlSeconds = batchTtlSeconds; + this.streamingTtlSeconds = streamingTtlSeconds; + } + + public String getKeyPrefix() { return keyPrefix; } + public long getBatchTtlSeconds() { return batchTtlSeconds; } + public long getStreamingTtlSeconds() { return streamingTtlSeconds; } + + public String keyFor(String entityId) { + return keyPrefix + entityId; + } + + // --------------------------------------------------------------- + // Batch ingestion (materialization) + // --------------------------------------------------------------- + + /** + * Materialize a batch of entities into Redis. + * + *

{@code rows} is keyed by entity ID. One {@code HSET} plus one + * {@code EXPIRE} per entity, all queued through a single + * {@link Pipeline} so the whole batch ships in one round trip. + * The key-level {@code EXPIRE} is what makes the entity disappear + * if a future batch run fails — inference then sees a missing + * entity rather than silently outdated values.

+ */ + public int bulkLoad(Map> rows, long ttlSeconds) { + if (rows.isEmpty()) return 0; + try (Jedis jedis = pool.getResource()) { + Pipeline pipe = jedis.pipelined(); + for (Map.Entry> e : rows.entrySet()) { + String key = keyFor(e.getKey()); + Map encoded = encode(e.getValue()); + pipe.hset(key, encoded); + pipe.expire(key, ttlSeconds); + } + pipe.sync(); + } + batchWritesTotal.addAndGet(rows.size()); + return rows.size(); + } + + public int bulkLoad(Map> rows) { + return bulkLoad(rows, batchTtlSeconds); + } + + /** + * Update a single batch feature without touching the key TTL. + * Used by the demo's "manually refresh one user" lever; real + * pipelines flow through {@link #bulkLoad}. + */ + public void updateBatchFeature(String entityId, String field, Object value) { + try (Jedis jedis = pool.getResource()) { + jedis.hset(keyFor(entityId), field, encodeValue(value)); + } + batchWritesTotal.incrementAndGet(); + } + + // --------------------------------------------------------------- + // Streaming ingestion + // --------------------------------------------------------------- + + /** + * Write streaming features with a per-field TTL. + * + *

Each field carries its own {@code HEXPIRE} so it self-expires + * independently of the rest of the hash. If the streaming + * pipeline stops, the streaming fields drop out while the + * batch-materialized fields remain populated under their longer + * key-level {@code EXPIRE}.

+ * + *

{@code HEXPIRE} returns one status code per field: + *

    + *
  • {@code 1}: TTL set / updated
  • + *
  • {@code 2}: the expiry was 0 or in the past, so Redis + * deleted the field instead of applying a TTL
  • + *
  • {@code 0}: an {@code NX | XX | GT | LT} conditional flag + * was specified and not met (we never use one here)
  • + *
  • {@code -2}: no such field, or no such key
  • + *
+ * We {@code HSET} every field in the same pipeline, immediately + * before the {@code HEXPIRE}, so with a positive TTL any code + * other than {@code 1} means the per-field TTL invariant did not + * hold — the mixed-staleness story relies on every streaming + * field carrying a fresh TTL after the write, so failure is + * loud.

+ */ + public void updateStreaming(String entityId, Map fields, long ttlSeconds) { + if (fields.isEmpty()) return; + String key = keyFor(entityId); + Map encoded = encode(fields); + String[] names = encoded.keySet().toArray(new String[0]); + + List expireCodes; + try (Jedis jedis = pool.getResource()) { + Pipeline pipe = jedis.pipelined(); + pipe.hset(key, encoded); + Response> expireResp = pipe.hexpire(key, ttlSeconds, names); + pipe.sync(); + expireCodes = expireResp.get(); + } + for (Long code : expireCodes) { + if (code == null || code != 1L) { + throw new IllegalStateException( + "HEXPIRE did not set every field TTL for " + key + ": " + expireCodes); + } + } + streamingWritesTotal.addAndGet(fields.size()); + } + + public void updateStreaming(String entityId, Map fields) { + updateStreaming(entityId, fields, streamingTtlSeconds); + } + + // --------------------------------------------------------------- + // Inference reads + // --------------------------------------------------------------- + + /** + * Retrieve a subset of features for one entity. Pass + * {@code fieldNames=null} (or call {@link #getAllFeatures}) to + * fetch the full hash with {@code HGETALL} — useful for debugging + * but rarely the right call on the request path, where the model + * knows exactly which features it consumes. 
+ */ + public Map getFeatures(String entityId, List fieldNames) { + String key = keyFor(entityId); + Map out = new LinkedHashMap<>(); + if (fieldNames == null) { + try (Jedis jedis = pool.getResource()) { + Map all = jedis.hgetAll(key); + if (all != null) out.putAll(all); + } + readsTotal.incrementAndGet(); + readFieldsTotal.addAndGet(out.size()); + return out; + } + if (fieldNames.isEmpty()) return out; + List values; + try (Jedis jedis = pool.getResource()) { + values = jedis.hmget(key, fieldNames.toArray(new String[0])); + } + for (int i = 0; i < fieldNames.size(); i++) { + String v = values.get(i); + if (v != null) out.put(fieldNames.get(i), v); + } + readsTotal.incrementAndGet(); + readFieldsTotal.addAndGet(out.size()); + return out; + } + + public Map getAllFeatures(String entityId) { + return getFeatures(entityId, null); + } + + /** + * Pipeline {@code HMGET} across many entities for batch scoring. + * One round trip for the whole batch. + */ + public Map> batchGetFeatures( + List entityIds, List fieldNames) { + if (entityIds.isEmpty() || fieldNames.isEmpty()) { + return Collections.emptyMap(); + } + String[] names = fieldNames.toArray(new String[0]); + Map> out = new LinkedHashMap<>(); + List>> responses = new ArrayList<>(entityIds.size()); + try (Jedis jedis = pool.getResource()) { + Pipeline pipe = jedis.pipelined(); + for (String id : entityIds) { + responses.add(pipe.hmget(keyFor(id), names)); + } + pipe.sync(); + } + long seenFields = 0; + for (int i = 0; i < entityIds.size(); i++) { + List values = responses.get(i).get(); + Map row = new LinkedHashMap<>(); + for (int j = 0; j < fieldNames.size(); j++) { + String v = values.get(j); + if (v != null) { + row.put(fieldNames.get(j), v); + seenFields++; + } + } + out.put(entityIds.get(i), row); + } + readsTotal.addAndGet(entityIds.size()); + readFieldsTotal.addAndGet(seenFields); + return out; + } + + // --------------------------------------------------------------- + // TTL inspection (used by the demo 
UI) + // --------------------------------------------------------------- + + /** + * Seconds until the entity key expires. Returns {@code -1} if no + * key-level TTL is set, {@code -2} if the key doesn't exist. + */ + public long keyTtlSeconds(String entityId) { + try (Jedis jedis = pool.getResource()) { + return jedis.ttl(keyFor(entityId)); + } + } + + /** + * Per-field TTL via {@code HTTL} (Redis 7.4+). Each value mirrors + * the {@code TTL} convention: positive means seconds remaining, + * {@code -1} means the field has no TTL set, {@code -2} means + * the field doesn't exist on this hash (or the key itself is + * missing). + */ + public Map fieldTtlsSeconds(String entityId, List fieldNames) { + if (fieldNames.isEmpty()) return Collections.emptyMap(); + List codes; + try (Jedis jedis = pool.getResource()) { + codes = jedis.httl(keyFor(entityId), fieldNames.toArray(new String[0])); + } + Map out = new LinkedHashMap<>(); + for (int i = 0; i < fieldNames.size(); i++) { + // HTTL on a missing key returns a flat list of -2s; jedis + // surfaces null per element if the reply shape ever changes + // upstream, so coerce to -2 defensively. + Long c = i < codes.size() ? codes.get(i) : null; + out.put(fieldNames.get(i), c == null ? -2L : c); + } + return out; + } + + // --------------------------------------------------------------- + // Demo housekeeping + // --------------------------------------------------------------- + + /** + * Enumerate entity IDs by scanning {@code keyPrefix*}. {@code SCAN} + * is non-blocking; the demo uses it to populate UI dropdowns, not + * as a serving primitive. 
+ */ + public List listEntityIds(int limit) { + List ids = new ArrayList<>(); + String pattern = keyPrefix + "*"; + String cursor = "0"; + try (Jedis jedis = pool.getResource()) { + do { + redis.clients.jedis.params.ScanParams params = new redis.clients.jedis.params.ScanParams() + .match(pattern) + .count(200); + redis.clients.jedis.resps.ScanResult sr = jedis.scan(cursor, params); + for (String k : sr.getResult()) { + if (k.length() > keyPrefix.length()) { + ids.add(k.substring(keyPrefix.length())); + if (ids.size() >= limit) { + Collections.sort(ids); + return ids; + } + } + } + cursor = sr.getCursor(); + } while (!"0".equals(cursor)); + } + Collections.sort(ids); + return ids; + } + + /** + * Count entities under the key prefix without an in-memory cap so + * the UI can report the real total even when more keys exist than + * the dropdown lists. + */ + public long countEntities() { + long count = 0; + String pattern = keyPrefix + "*"; + String cursor = "0"; + try (Jedis jedis = pool.getResource()) { + do { + redis.clients.jedis.params.ScanParams params = new redis.clients.jedis.params.ScanParams() + .match(pattern) + .count(500); + redis.clients.jedis.resps.ScanResult sr = jedis.scan(cursor, params); + count += sr.getResult().size(); + cursor = sr.getCursor(); + } while (!"0".equals(cursor)); + } + return count; + } + + public long deleteEntity(String entityId) { + try (Jedis jedis = pool.getResource()) { + return jedis.del(keyFor(entityId)); + } + } + + /** + * Drop every entity under the key prefix. Used by the demo reset + * path. Scans in batches and issues one variadic {@code DEL} per + * batch, so a large demo dataset doesn't land on the server as + * one giant synchronous delete. 
+ */ + public long reset() { + long deleted = 0; + String pattern = keyPrefix + "*"; + String cursor = "0"; + try (Jedis jedis = pool.getResource()) { + do { + redis.clients.jedis.params.ScanParams params = new redis.clients.jedis.params.ScanParams() + .match(pattern) + .count(500); + redis.clients.jedis.resps.ScanResult sr = jedis.scan(cursor, params); + List batch = sr.getResult(); + if (!batch.isEmpty()) { + deleted += jedis.del(batch.toArray(new String[0])); + } + cursor = sr.getCursor(); + } while (!"0".equals(cursor)); + } + return deleted; + } + + public Stats stats() { + return new Stats( + batchWritesTotal.get(), + streamingWritesTotal.get(), + readsTotal.get(), + readFieldsTotal.get() + ); + } + + public void resetStats() { + batchWritesTotal.set(0); + streamingWritesTotal.set(0); + readsTotal.set(0); + readFieldsTotal.set(0); + } + + // --------------------------------------------------------------- + // Encoding helpers + // --------------------------------------------------------------- + + private static Map encode(Map fields) { + Map out = new LinkedHashMap<>(fields.size()); + for (Map.Entry e : fields.entrySet()) { + out.put(e.getKey(), encodeValue(e.getValue())); + } + return out; + } + + /** Render a feature value as a string for hash storage. */ + public static String encodeValue(Object value) { + if (value == null) return ""; + if (value instanceof Boolean b) return b ? "true" : "false"; + return value.toString(); + } + + /** Immutable snapshot of the helper's in-process counters. 
*/ + public static record Stats( + long batchWritesTotal, + long streamingWritesTotal, + long readsTotal, + long readFieldsTotal + ) {} +} diff --git a/content/develop/use-cases/feature-store/java-jedis/StreamingWorker.java b/content/develop/use-cases/feature-store/java-jedis/StreamingWorker.java new file mode 100644 index 0000000000..e0bdb336ac --- /dev/null +++ b/content/develop/use-cases/feature-store/java-jedis/StreamingWorker.java @@ -0,0 +1,220 @@ +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Streaming feature updater for the demo. + * + *

Stands in for whatever Flink, Kafka Streams, or bespoke service + * computes the real-time features in a real deployment. In production + * this code lives in the streaming layer; here it runs as a daemon + * Thread next to the demo server so the page can start, pause, and + * resume it from the UI.

+ * + *

Every tick it picks a few random users and writes a new value + * for each streaming feature, with a per-field {@code HEXPIRE} so the + * field self-expires if the worker is paused. Pause the worker for + * longer than {@code streamingTtlSeconds} and the streaming fields + * drop out of the hash while the batch fields remain populated under + * the longer key-level TTL — the mixed staleness story made + * visible.

+ */ +public class StreamingWorker { + + private static final List DEVICE_IDS = List.of( + "ios-1a4c", "ios-9f02", "and-7b21", "and-2d18", + "web-chr-1", "web-saf-1", "web-ff-2"); + private static final List SESSION_COUNTRIES = List.of( + "US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL"); + private static final int[] FAILED_LOGIN_BUCKETS = {0, 1, 2, 5}; + private static final int[] FAILED_LOGIN_WEIGHTS = {70, 20, 8, 2}; + + private final FeatureStore store; + private final long tickMillis; + private final int usersPerTick; + private final Random rng; + + private final Object rngLock = new Object(); + private final AtomicBoolean running = new AtomicBoolean(false); + private final AtomicBoolean paused = new AtomicBoolean(false); + private final AtomicBoolean tickInFlight = new AtomicBoolean(false); + private final AtomicLong tickCount = new AtomicLong(); + private final AtomicLong writesCount = new AtomicLong(); + + private Thread worker; + + public StreamingWorker(FeatureStore store, long tickMillis, int usersPerTick, long seed) { + this.store = store; + this.tickMillis = tickMillis > 0 ? tickMillis : 1000L; + this.usersPerTick = usersPerTick > 0 ? usersPerTick : 5; + this.rng = new Random(seed); + } + + public int getUsersPerTick() { return usersPerTick; } + + // --------------------------------------------------------------- + // Lifecycle + // --------------------------------------------------------------- + + /** Start the worker thread. Safe to call when already running. */ + public synchronized void start() { + if (running.get()) return; + running.set(true); + paused.set(false); + worker = new Thread(this::run, "streaming-worker"); + worker.setDaemon(true); + worker.start(); + } + + /** Stop the worker and wait for any in-flight tick to finish. 
*/ + public synchronized void stop() { + if (!running.getAndSet(false)) return; + if (worker != null) { + try { + worker.join(2000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + worker = null; + } + waitForIdle(); + } + + public void pause() { paused.set(true); } + public void resume() { paused.set(false); } + + public boolean isRunning() { return running.get(); } + public boolean isPaused() { return paused.get(); } + + /** + * Block until any in-flight tick has finished its current + * updateStreaming loop. {@link #pause()} only stops future + * ticks from running — it does not interrupt one that is already + * mid-flight. Callers that need a quiesced worker (a reset that's + * about to DEL every entity, for example) must call {@code pause()} + * AND {@code waitForIdle()} before they touch state the tick + * might still be writing to. + */ + public void waitForIdle() { + while (tickInFlight.get()) { + try { + Thread.sleep(20); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + return; + } + } + } + + // --------------------------------------------------------------- + // Tick + // --------------------------------------------------------------- + + private void run() { + while (running.get()) { + try { + Thread.sleep(tickMillis); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } + if (!running.get()) break; + if (paused.get()) continue; + + tickInFlight.set(true); + try { + doTick(); + } catch (Exception e) { + System.err.printf("[streaming-worker] tick failed: %s%n", e.getMessage()); + } finally { + tickInFlight.set(false); + } + } + } + + private void doTick() { + List ids = store.listEntityIds(500); + if (ids.isEmpty()) return; + List picks = sample(ids, usersPerTick); + long nowMs = System.currentTimeMillis(); + int writes = 0; + for (String id : picks) { + Map fields = new LinkedHashMap<>(); + fields.put("last_login_ts", nowMs); + fields.put("last_device_id", 
choice(DEVICE_IDS)); + fields.put("tx_count_5m", intn(13)); + fields.put("failed_logins_15m", weightedInt(FAILED_LOGIN_BUCKETS, FAILED_LOGIN_WEIGHTS)); + fields.put("session_country", choice(SESSION_COUNTRIES)); + store.updateStreaming(id, fields); + writes += fields.size(); + } + tickCount.incrementAndGet(); + writesCount.addAndGet(writes); + } + + // --------------------------------------------------------------- + // Stats + // --------------------------------------------------------------- + + public Stats statsSnapshot() { + return new Stats(isRunning(), isPaused(), tickCount.get(), writesCount.get()); + } + + public void resetStats() { + tickCount.set(0); + writesCount.set(0); + } + + public static record Stats( + boolean running, + boolean paused, + long tickCount, + long writesCount + ) {} + + // --------------------------------------------------------------- + // RNG helpers (all synchronized on rngLock so the worker stays + // deterministic across concurrent toggles from the demo UI). 
+ // --------------------------------------------------------------- + + private List sample(List items, int k) { + synchronized (rngLock) { + int n = Math.min(k, items.size()); + List pool = new java.util.ArrayList<>(items); + List out = new java.util.ArrayList<>(n); + for (int i = 0; i < n; i++) { + int idx = rng.nextInt(pool.size()); + out.add(pool.remove(idx)); + } + return out; + } + } + + private String choice(List items) { + synchronized (rngLock) { + return items.get(rng.nextInt(items.size())); + } + } + + private int intn(int n) { + synchronized (rngLock) { + return rng.nextInt(n); + } + } + + private int weightedInt(int[] items, int[] weights) { + synchronized (rngLock) { + int total = 0; + for (int w : weights) total += w; + int r = rng.nextInt(total); + for (int i = 0; i < items.length; i++) { + r -= weights[i]; + if (r < 0) return items[i]; + } + return items[items.length - 1]; + } + } +} diff --git a/content/develop/use-cases/feature-store/java-jedis/_index.md b/content/develop/use-cases/feature-store/java-jedis/_index.md new file mode 100644 index 0000000000..71b331a00a --- /dev/null +++ b/content/develop/use-cases/feature-store/java-jedis/_index.md @@ -0,0 +1,735 @@ +--- +categories: +- docs +- develop +- stack +- oss +- rs +- rc +description: Build a Redis-backed online feature store in Java with Jedis +linkTitle: Jedis example (Java) +title: Redis feature store with Jedis +weight: 4 +--- + +This guide shows you how to build a small Redis-backed online feature store in +Java with [Jedis]({{< relref "/develop/clients/jedis" >}}). It includes a +local web server built with the JDK's `com.sun.net.httpserver.HttpServer` so +you can bulk-load a batch of users with a key-level TTL, run a streaming +worker that overwrites real-time features with per-field TTL, retrieve any +subset of features for one user under 2 ms, and pipeline `HMGET` across a +hundred users for batch scoring. 
+ +## Overview + +Each entity (here, a user) is one Redis +[Hash]({{< relref "/develop/data-types/hashes" >}}) at a deterministic key — +`fs:user:{id}`. The hash holds every feature for that entity as one field per +feature: batch-materialized aggregates (refreshed once a day) alongside +streaming-updated signals (refreshed every few seconds). One +[`HMGET`]({{< relref "/commands/hmget" >}}) returns whichever subset the model +needs in one network round trip. + +Two TTL layers solve the *mixed staleness* problem without an application-side +cleaner: + +* A **key-level** [`EXPIRE`]({{< relref "/commands/expire" >}}) aligned with the + batch materialization cycle (24 hours in the demo). If the batch refresher + fails, the whole entity disappears at the next cycle and inference sees a + missing entity — which the model handler can detect and fall back on — + rather than silently outdated values. +* A **per-field** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) (Redis 7.4+) on + each streaming feature gives that field its own shorter expiry, independent + of the rest of the hash. If the streaming pipeline stops updating a feature, + the field self-cleans while the batch fields stay populated. + +In this example, the batch features describe a user's longer-term shape +(`country_iso`, `risk_segment`, `account_age_days`, `tx_count_7d`, +`avg_amount_30d`, `chargeback_count_180d`) and are bulk-loaded by +`BuildFeatures.java` — the demo's stand-in for a nightly Spark / Feast +materialization job. The streaming features describe what the user is doing +right now (`last_login_ts`, `last_device_id`, `tx_count_5m`, +`failed_logins_15m`, `session_country`) and are written by +`StreamingWorker.java` — the demo's stand-in for a Flink / Kafka Streams job. +The inference handlers of the demo server read any subset of those features +through `FeatureStore.java`'s helper class. 
+ +That gives you: + +* A single round trip for retrieval — any subset of features for one entity + in one [`HMGET`]({{< relref "/commands/hmget" >}}). +* Sub-millisecond hot path. The Redis-side work is microseconds; in practice + the bottleneck is the network round trip plus the model's own feature-prep. +* Pipelined batch scoring — one round trip for `N` users at once. +* Independent freshness per feature, expressed as a server-side TTL rather + than as application logic. +* Self-cleanup on pipeline failure: a stalled batch refresher lets entities + expire on schedule, and a stalled streaming worker lets each affected field + expire on its own timer. + +## How it works + +There are three paths: a **batch path** that bulk-loads features once per +materialization cycle, a **streaming path** that updates real-time features +as events arrive, and an **inference path** that reads features on the +request side. + +### Batch path (per materialization cycle) + +1. The batch job calls `BuildFeatures.synthesizeUsers(N, seed)` (in + production, the equivalent computation lives in an offline pipeline against + the warehouse). The result is `Map>` keyed by + user ID. +2. `store.bulkLoad(rows, ttlSeconds)` queues one + [`HSET`]({{< relref "/commands/hset" >}}) plus one + [`EXPIRE`]({{< relref "/commands/expire" >}}) per user through Jedis's + [`Pipeline`]({{< relref "/develop/clients/jedis/transpipe" >}}), then + `pipe.sync()` ships the whole batch in a single round trip. The `HSET` + writes every batch field; the `EXPIRE` is what makes the entity disappear + if the next batch run fails, so inference reads a missing entity rather + than silently outdated values. + +### Streaming path (per event) + +When a user does something (login, transaction, page view) the streaming +layer computes whatever real-time signals fall out of that event and calls +`store.updateStreaming(userId, fields, ttlSeconds)`. That batches: + +1. 
An [`HSET`]({{< relref "/commands/hset" >}}) writing the new field values. + Redis is single-threaded per shard, so this is atomic against any + concurrent batch write on the same hash — no version columns, no locks. +2. An [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) over exactly the fields + that were written, with the streaming TTL. Each streaming field carries + its own per-field expiry independent of the rest of the hash. Stop the + worker and these fields drop out one by one as their TTLs elapse, while + the batch fields remain populated under the longer key-level TTL. + +### Inference path (per request) + +1. The model server picks the feature subset it needs (the schema is owned by + the model, not the store). +2. It calls `store.getFeatures(userId, names)`, which is one + [`HMGET`]({{< relref "/commands/hmget" >}}). Redis returns the values in + the same order as the requested fields, with `null` for any field that + doesn't exist (or has expired). +3. For batch inference, the model server calls + `store.batchGetFeatures(userIds, names)`, which pipelines one + [`HMGET`]({{< relref "/commands/hmget" >}}) per user across all `N` users + in a single network round trip via Jedis's `Pipeline.sync()`. + +## The feature-store helper + +The `FeatureStore` class wraps the read/write paths +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/java-jedis/FeatureStore.java)): + +```java +import redis.clients.jedis.JedisPool; +import redis.clients.jedis.JedisPoolConfig; + +JedisPoolConfig cfg = new JedisPoolConfig(); +cfg.setMaxTotal(64); +JedisPool pool = new JedisPool(cfg, "localhost", 6379); +FeatureStore store = new FeatureStore(pool, + "fs:user:", + 24L * 60L * 60L, // whole-entity TTL aligned with the daily batch cycle + 5L * 60L // per-field TTL on each streaming feature +); + +// Batch materialization: one HSET + EXPIRE per user, all pipelined. 
+Map<String, Map<String, Object>> rows = Map.of( + "u0001", Map.of( + "country_iso", "US", "risk_segment", "low", + "tx_count_7d", 14, "avg_amount_30d", 92.40, + "account_age_days", 612, "chargeback_count_180d", 0), + "u0002", Map.of( + "country_iso", "GB", "risk_segment", "medium", + "tx_count_7d", 47, "avg_amount_30d", 220.10, + "account_age_days", 1840, "chargeback_count_180d", 1)); +store.bulkLoad(rows); + +// Streaming write: HSET + HEXPIRE on just the fields that changed. +store.updateStreaming("u0001", Map.of( + "last_login_ts", System.currentTimeMillis(), + "last_device_id", "ios-9f02", + "tx_count_5m", 3, + "failed_logins_15m", 0, + "session_country", "US")); + +// Inference read: HMGET of whatever the model needs. +Map<String, String> features = store.getFeatures("u0001", List.of( + "risk_segment", "tx_count_7d", "avg_amount_30d", + "tx_count_5m", "failed_logins_15m")); + +// Batch scoring: pipelined HMGET across many users. +Map<String, Map<String, String>> batch = store.batchGetFeatures( + List.of("u0001", "u0002", "u0003"), + List.of("risk_segment", "tx_count_5m", "failed_logins_15m")); +``` + +### Project layout + +The four `.java` files and the `pom.xml` live in the same directory — the +`build-helper-maven-plugin` adds the project root as the source directory so +the Java sources sit alongside the build descriptor. Run with: + +```bash +mvn package +mvn exec:java -Dexec.mainClass=DemoServer +``` + +### Data model + +Each user is one Redis Hash. Every value is stored as a string — Redis hash +fields are bytes on the wire, so the helper encodes booleans as `"true"` / +`"false"` (`encodeValue(Object)` in `FeatureStore.java`) and renders +everything else with `Object.toString()`. The model server is responsible for +parsing back to the right type, the same way it would when reading any +serialized feature store.
+ +```text +fs:user:u0001 TTL = 86400 s (key-level) + country_iso=US + risk_segment=low + account_age_days=612 + tx_count_7d=14 + avg_amount_30d=92.40 + chargeback_count_180d=0 + last_login_ts=1716998413541 TTL = 300 s (per field, HEXPIRE) + last_device_id=ios-9f02 TTL = 300 s (per field, HEXPIRE) + tx_count_5m=3 TTL = 300 s (per field, HEXPIRE) + failed_logins_15m=0 TTL = 300 s (per field, HEXPIRE) + session_country=US TTL = 300 s (per field, HEXPIRE) +``` + +The batch fields sit under the key-level `EXPIRE`. The streaming fields each +carry their own [`HEXPIRE`]({{< relref "/commands/hexpire" >}}). If the +streaming pipeline stops, the streaming fields drop one by one as their +per-field TTLs elapse; the batch fields stay until the daily key-level +`EXPIRE` fires (or the next batch cycle re-pins them). + +### Bulk-loading batch features + +`bulkLoad` queues one `HSET` and one `EXPIRE` per user into a single +`Pipeline` and calls `sync()` to ship the lot. With 500 users that's 1000 +commands in one network call — Redis processes them sequentially on the +server side but the client only pays one RTT. + +```java +public int bulkLoad(Map<String, Map<String, Object>> rows, long ttlSeconds) { + if (rows.isEmpty()) return 0; + try (Jedis jedis = pool.getResource()) { + Pipeline pipe = jedis.pipelined(); + for (Map.Entry<String, Map<String, Object>> e : rows.entrySet()) { + String key = keyFor(e.getKey()); + Map<String, String> encoded = encode(e.getValue()); + pipe.hset(key, encoded); + pipe.expire(key, ttlSeconds); + } + pipe.sync(); + } + ... +} +``` + +Jedis's `pipelined()` is a non-transactional batch: commands queue up and +ship in one round trip, but they don't run inside a `MULTI/EXEC` block. +That's the right choice here because each user's `HSET` + `EXPIRE` pair is +independent of every other user's, and an all-or-nothing transaction would +block the server for the duration of the batch.
For the rare case where the +pair has to be inseparable (a server crash between the two would leave the +entity without a key-level TTL) you'd wrap each user in a `Transaction` or a +[Lua script]({{< relref "/develop/programmability/eval-intro" >}}); for a +daily ingestion job that runs end-to-end every cycle, the next run re-pins +the TTL — no extra machinery needed. + +In production, the equivalent of this script runs as an offline pipeline (a +Spark or Feast `materialize` job) that reads from the warehouse and writes +into Redis. The +[Feast `RedisOnlineStore`](https://docs.feast.dev/reference/online-stores/redis) +provider does exactly this under the hood; the in-house +[Redis Feature Form]({{< relref "/develop/ai/featureform" >}}) integration +covers the materialize + serve path end-to-end. + +### Streaming writes with per-field TTL + +`updateStreaming` is the linchpin of the mixed-staleness story: + +```java +public void updateStreaming(String entityId, Map<String, Object> fields, long ttlSeconds) { + if (fields.isEmpty()) return; + String key = keyFor(entityId); + Map<String, String> encoded = encode(fields); + String[] names = encoded.keySet().toArray(new String[0]); + + List<Long> expireCodes; + try (Jedis jedis = pool.getResource()) { + Pipeline pipe = jedis.pipelined(); + pipe.hset(key, encoded); + Response<List<Long>> expireResp = pipe.hexpire(key, ttlSeconds, names); + pipe.sync(); + expireCodes = expireResp.get(); + } + for (Long code : expireCodes) { + if (code == null || code != 1L) { + throw new IllegalStateException( + "HEXPIRE did not set every field TTL for " + key + ": " + expireCodes); + } + } + ... +} +``` + +[`HEXPIRE`]({{< relref "/commands/hexpire" >}}) sets the TTL on *individual* +hash fields, not on the whole key. The two commands are queued in one +`Pipeline` and Redis runs them in order: the `HSET` first creates or +overwrites the fields, then `HEXPIRE` attaches a TTL to each of those same +fields.
`HEXPIRE` returns one status code per field — `1` if the TTL was +set, `2` if the expiry was 0 or in the past (so Redis deleted the field +instead), `0` if an `NX | XX | GT | LT` conditional flag was set and not met +(we never use one here), `-2` if the field doesn't exist on the key. The +helper throws if any code is anything other than `1`, so the "every +streaming write renews its TTL" invariant fails loudly rather than silently +leaving a streaming field with no expiry attached. + +`Response<List<Long>>` is Jedis's deferred-result wrapper for pipelined +commands: queue the command, call `pipe.sync()` to ship the batch, then read +each result via `.get()`. The `Response` for `hexpire` returns the per-field +codes; that list is what the helper validates above. + +If a streaming pipeline stops, the streaming fields drop out one by one as +their per-field TTLs elapse — there is no application-side cleaner involved. +[`HTTL`]({{< relref "/commands/httl" >}}) lets the model side inspect the +remaining TTL on any field, which is useful both for debugging ("why is this +feature missing?" → "it expired three seconds ago") and as a freshness signal +in the model itself. + +> **HEXPIRE requires Redis 7.4 or later.** `HEXPIRE` and the field-level TTL +> commands (`HTTL`, `HPERSIST`, `HEXPIREAT`, `HPEXPIRE`, `HPEXPIREAT`, +> `HPTTL`, `HEXPIRETIME`, `HPEXPIRETIME`) were added in Redis 7.4. Jedis 5.2 +> was the first release with the bindings; the demo's `pom.xml` pins 6.2. +> On older Redis builds you would have to put streaming features on their +> own keys (one key per feature, or one key per feature group) and set a +> key-level `EXPIRE` instead — at the cost of giving up the single-`HMGET` +> retrieval.
+ +### Inference reads with HMGET + +`getFeatures` is one `HMGET`: + +```java +public Map<String, String> getFeatures(String entityId, List<String> fieldNames) { + String key = keyFor(entityId); + Map<String, String> out = new LinkedHashMap<>(); + if (fieldNames == null) { + try (Jedis jedis = pool.getResource()) { + Map<String, String> all = jedis.hgetAll(key); + if (all != null) out.putAll(all); + } + return out; + } + if (fieldNames.isEmpty()) return out; + List<String> values; + try (Jedis jedis = pool.getResource()) { + values = jedis.hmget(key, fieldNames.toArray(new String[0])); + } + for (int i = 0; i < fieldNames.size(); i++) { + String v = values.get(i); + if (v != null) out.put(fieldNames.get(i), v); + } + return out; +} +``` + +The model knows exactly which features it consumes, so the request path +always takes the `HMGET` branch with an explicit field list — that's the +sub-millisecond path. `HGETALL` is the right call for debugging (which is +what the demo's "Inspect" panel does) but not for serving: it forces Redis +to serialize every field, including ones the model doesn't need. + +Fields that don't exist (because they were never written, or because they +expired) come back as `null` in the `List` Jedis returns. The helper +drops them from the result `Map` so the caller sees only the features that +are actually available. A real model server would either treat missing +values as a feature ("this user has no streaming signal yet") or fall back +to a default from the model's training data.
+ +### Batch scoring with pipelined HMGET + +For batch inference, the same `HMGET` shape pipelines across users: + +```java +public Map<String, Map<String, String>> batchGetFeatures( + List<String> entityIds, List<String> fieldNames) { + if (entityIds.isEmpty() || fieldNames.isEmpty()) { + return Collections.emptyMap(); + } + String[] names = fieldNames.toArray(new String[0]); + List<Response<List<String>>> responses = new ArrayList<>(entityIds.size()); + try (Jedis jedis = pool.getResource()) { + Pipeline pipe = jedis.pipelined(); + for (String id : entityIds) { + responses.add(pipe.hmget(keyFor(id), names)); + } + pipe.sync(); + } + Map<String, Map<String, String>> out = new LinkedHashMap<>(); + for (int i = 0; i < entityIds.size(); i++) { + List<String> values = responses.get(i).get(); + Map<String, String> row = new LinkedHashMap<>(); + for (int j = 0; j < fieldNames.size(); j++) { + String v = values.get(j); + if (v != null) row.put(fieldNames.get(j), v); + } + out.put(entityIds.get(i), row); + } + return out; +} +``` + +One round trip for the whole batch — the demo regularly returns 30 users in +~1 ms against a local Redis. On a real network the round trip dominates; +pipelining is what keeps batch scoring practical. + +A Redis Cluster is different: a single `Pipeline.sync()` is bound to one +shard, because cross-slot pipelines on a cluster connection don't make sense. +For batch reads on a cluster, use +[`JedisCluster`]({{< relref "/develop/clients/jedis" >}}) and either fan out +parallel `hmget` calls (the cluster client routes per-shard for you) or, for +tighter control, group the IDs by hash slot ahead of time and issue one +`Pipeline.sync()` against each shard's connection in parallel. A hash tag +like `fs:user:{vip}:u0001` forces a known set of keys onto the same shard so +one pipeline can cover all of them in a single round trip.
+ +## The streaming worker + +`StreamingWorker.java` is the demo's stand-in for whatever Flink, Kafka +Streams, or bespoke service computes the real-time features +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/java-jedis/StreamingWorker.java)). +It runs as a daemon `Thread` next to the demo server so the UI can start, +pause, and resume it; in production this code would live in the streaming +layer. + +Every tick the worker picks a few random users, generates a new value for +each streaming feature, and calls `store.updateStreaming(userId, fields)`. +The demo defaults to 5 users per tick at 1-second intervals — so a 200-user +store sees roughly half its users refreshed in the first minute, and most +after a few minutes. Raise `--users-per-tick` or drop `--seed-users` if +you'd rather touch every user quickly. + +```java +private void doTick() { + List<String> ids = store.listEntityIds(500); + if (ids.isEmpty()) return; + List<String> picks = sample(ids, usersPerTick); + long nowMs = System.currentTimeMillis(); + for (String id : picks) { + Map<String, Object> fields = new LinkedHashMap<>(); + fields.put("last_login_ts", nowMs); + fields.put("last_device_id", choice(DEVICE_IDS)); + fields.put("tx_count_5m", intn(13)); + fields.put("failed_logins_15m", weightedInt(FAILED_LOGIN_BUCKETS, FAILED_LOGIN_WEIGHTS)); + fields.put("session_country", choice(SESSION_COUNTRIES)); + store.updateStreaming(id, fields); + } + ... +} +``` + +Pausing the worker is what shows off the mixed-staleness behavior: leave it +paused for longer than `streamingTtlSeconds` and the streaming fields +disappear from every user's hash one by one, while the batch fields remain +under the longer key-level `EXPIRE`. The demo's `Pause / resume` button lets +you see this happen in real time. + +`pause()` only blocks *future* ticks from running — the thread checks the +flag at the top of the loop and skips its turn.
A reset that's about to +`DEL` every key needs to wait out an already-running tick too, which is +what `waitForIdle()` is for: the demo's `Reset` handler calls +`worker.pause()` *and* `worker.waitForIdle()` before it issues the `DEL` +sweep, so a mid-flight tick can't recreate a user under a streaming-only +hash with no key-level TTL. + +## The batch builder + +`BuildFeatures.java` is the demo's nightly materializer +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/java-jedis/BuildFeatures.java)). +It generates synthetic feature rows and calls `store.bulkLoad` once. The +synthesis itself is not the point — in a real deployment the equivalent code +reads from the offline store (Snowflake, BigQuery, Iceberg) and writes the +resulting hashes into Redis. + +```java +public static Map<String, Map<String, Object>> synthesizeUsers(int count, long seed) { + Random rng = new Random(seed); + Map<String, Map<String, Object>> users = new LinkedHashMap<>(count); + for (int i = 1; i <= count; i++) { + String uid = String.format("u%04d", i); + Map<String, Object> row = new LinkedHashMap<>(); + row.put("country_iso", COUNTRY_CHOICES.get(rng.nextInt(COUNTRY_CHOICES.size()))); + row.put("risk_segment", weightedChoice(rng, RISK_SEGMENTS, RISK_WEIGHTS)); + row.put("account_age_days", 7 + rng.nextInt(2394)); + row.put("tx_count_7d", rng.nextInt(81)); + row.put("avg_amount_30d", Math.round((5.0 + rng.nextDouble() * 345.0) * 100.0) / 100.0); + row.put("chargeback_count_180d", weightedChoiceInt(rng, CHARGEBACK_BUCKETS, CHARGEBACK_WEIGHTS)); + users.put(uid, row); + } + return users; +} +``` + +You can run the builder on its own (independently of the demo server) to +populate Redis from the command line: + +```bash +mvn exec:java -Dexec.mainClass=BuildFeatures -Dexec.args="--count 500 --ttl-seconds 3600" +``` + +That writes 500 users at `fs:user:*` with a one-hour key-level TTL, which is +how a typical operator would pre-seed a feature store from the command line +when debugging.
+ +## The interactive demo + +`DemoServer.java` runs the JDK `HttpServer` on port 8088 with a fixed thread +pool. The HTML page lets you: + +* **Bulk-load** any number of users (default 200) with a configurable + key-level TTL. Drop the TTL to 30 s and watch the entire store expire on + schedule — the same thing that happens if a daily refresher fails. +* See the **store state** at a glance: user count, batch / streaming TTLs, + cumulative read/write counters. +* See the **streaming worker** status (running / paused, ticks completed, + writes performed) and **pause or resume** it. Leave it paused for longer + than the streaming TTL to watch streaming fields drop out. +* Run an **inference read** for any user with a chosen feature subset, and + see the value, the per-field TTL, and the read latency. +* Run **batch scoring** with a pipelined `HMGET` across `N` users and see + the total elapsed time plus the per-user breakdown. +* **Inspect** any user's full hash with field-level TTLs and the key-level + TTL — the right view for debugging "why is this feature missing?" at + read time. + +The server holds one `FeatureStore` and one `StreamingWorker` for the +lifetime of the process, plus a `JedisPool` that all handlers borrow +connections from. Endpoints: + +| Endpoint | What it does | +|---------------------------|-------------------------------------------------------------------------------------| +| `GET /state` | User count, TTL config, stats counters, worker status. | +| `POST /bulk-load` | Pipelined `HSET` + `EXPIRE` over N synthetic users with a chosen TTL. | +| `POST /worker/toggle` | Pause / resume the streaming worker. | +| `POST /read` | `HMGET` a chosen feature subset for one user; report latency and per-field TTLs. | +| `POST /batch-read` | Pipeline `HMGET` across N users; report total latency and per-entity field counts. | +| `GET /inspect` | `HGETALL` + `HTTL` for one user; full hash view with per-field TTLs. 
| +| `POST /reset` | Drop every user under the key prefix (used by the demo's reset button). | + +## Prerequisites + +* **Redis 7.4 or later.** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) were added in Redis 7.4; the + demo relies on per-field TTL for the mixed-staleness story. +* **Java 17 or later.** The demo uses pattern-matching `switch`, records, + and text blocks. +* **Jedis 5.2 or later.** The demo's `pom.xml` pins + `redis.clients:jedis:6.2.0`. Field-level TTL bindings (`hexpire`, `httl`, + `hpersist`) ship from Jedis 5.2. + +If your Redis server is running elsewhere, start the demo with `--redis-host` +and `--redis-port`. + +## Running the demo + +### Get the source files + +The demo lives in a small Maven project under +[`feature-store/java-jedis`](https://github.com/redis/docs/tree/main/content/develop/use-cases/feature-store/java-jedis). +Clone the repo or copy the directory: + +```bash +git clone https://github.com/redis/docs.git +cd docs/content/develop/use-cases/feature-store/java-jedis +mvn package +``` + +### Start the demo server + +From the project directory: + +```bash +mvn exec:java -Dexec.mainClass=DemoServer +``` + +You should see: + +```text +Dropping any existing users under 'fs:user:*' for a clean demo run (pass --no-reset to keep them). +Redis feature-store demo server listening on http://127.0.0.1:8088 +Using Redis at localhost:6379 with key prefix 'fs:user:' (batch TTL 86400s, streaming TTL 300s) +Materialized 200 user(s); streaming worker running. +``` + +By default the demo wipes the configured key prefix on startup so each run +starts from a clean state. Pass `--no-reset` to keep any existing data, or +`--key-prefix <prefix>` to point the demo at a different prefix entirely.
+Maven exec passes CLI args via `-Dexec.args`: + +```bash +mvn exec:java -Dexec.mainClass=DemoServer \ + -Dexec.args="--port 9000 --streaming-ttl-seconds 30" +``` + +Open [http://127.0.0.1:8088](http://127.0.0.1:8088) in a browser. Useful +things to try: + +* Pick a user and click **Read features** with a mixed batch/streaming + subset — you'll see batch fields with no per-field TTL (covered by the + key-level TTL) and streaming fields with a positive per-field TTL. +* Click **Pipeline HMGET** with `count=100` to see the latency of a + 100-user batch read. +* Click **Pause / resume** on the streaming worker and leave it paused for + ~5 minutes (or restart the server with `--streaming-ttl-seconds 30` to + make it visible in seconds). Re-run **Read features** on any user and + watch the streaming fields disappear while the batch fields stay. +* Click **Inspect** on a user to see the full hash with field-level TTLs. +* Click **Bulk-load** with a short TTL (say 30 seconds) and watch the user + count fall to zero on the next minute — the same thing that happens if a + daily batch run fails to land. +* Click **Reset** to drop every user and start over. + +The server is read/write against your local Redis. The default key prefix +is `fs:user:`. Pass `--no-reset` to keep existing data across restarts, or +`--redis-host` / `--redis-port` to point at a different Redis. + +## Production usage + +The guidance below focuses on the production concerns that are specific to +running a feature store on Redis. For the generic Jedis production checklist +— `JedisPool` sizing, TLS, AUTH/ACL, retry policy, sentinel/cluster +failover — see the +[Jedis production usage guide]({{< relref "/develop/clients/jedis/produsage" >}}). +The feature-store demo runs against `localhost` with the defaults; a real +deployment should harden the client first. 
+ +### Pick the batch TTL to outlast a failed refresher + +The whole-entity `EXPIRE` is your safety net against silent staleness from a +broken batch pipeline. Set it longer than your worst-case batch outage so a +single missed run doesn't take the feature store offline, but short enough +that a sustained outage causes loud failures (missing entities) rather than +quiet ones (yesterday's features being scored as today's). The standard +choice is one cycle of "expected refresh interval × 2" — for a daily batch, +48 hours; for a 6-hour batch, 12 hours. + +The same logic applies to the per-field streaming TTL: a few times the +expected update interval so a slow-but-alive streaming worker doesn't churn +features needlessly, but short enough that a stalled worker causes visible +freshness failures. + +### Co-locate the online store with serving, not with training + +The online store's hash representation does *not* have to match the schema +in your offline store. The batch materialization step is your chance to +flatten joins, encode categoricals, and project to whatever shape the model +server wants — so the request path is exactly one `HMGET` and zero +transforms. + +The training pipeline reads from the offline store with its own schema; the +serving pipeline reads from Redis with the flattened serving schema. +Keeping those two pipelines as the same code path is what prevents +training-serving skew. + +### Pipeline batch reads across shards + +On a single Redis instance, `Pipeline.sync()` across `N` `hmget` calls is +one round trip. A Redis Cluster is different: a single `Pipeline.sync()` is +bound to one shard, because cross-slot pipelines on a cluster connection +don't make sense, and the keys for a typical user batch will land on +multiple shards. For batch reads on a cluster, use +[`JedisCluster`]({{< relref "/develop/clients/jedis" >}}) — its +implementation routes per-shard for you. 
For tighter control, group the IDs +by hash slot ahead of time and issue one `Pipeline.sync()` per shard's +connection in parallel. For a small number of frequently-queried users (a +top-N customer list, for example), a hash tag like `fs:user:{vip}:u0001` +forces a known set of keys onto the same shard so one pipeline can cover +all of them in a single round trip. + +### Make HEXPIRE part of every streaming write + +The single biggest correctness lever in this design is that the streaming +write applies `HEXPIRE` *every time*. If a streaming worker writes a field +without renewing its TTL, the field carries whatever expiry was there before +— possibly none, possibly stale — and the mixed-staleness invariant breaks. +Keep the `HSET` and `HEXPIRE` in the same pipeline (or, even safer, in the +same [Lua script]({{< relref "/develop/programmability/eval-intro" >}}) if +you don't trust the call site). + +### Avoid HGETALL on the request path + +`HGETALL` reads every field on the hash, including ones the model doesn't +need. With dozens of features per entity, that is wasted serialization work +on the server and wasted bandwidth on the wire. Always specify the field +list explicitly with `hmget` in the model server. + +The exception is debugging and feature-set discovery, where you genuinely +want the full hash. The demo's "Inspect" button uses `hgetAll` for exactly +this reason. + +### Size the JedisPool for the request shape + +The demo creates a `JedisPool` with `maxTotal=64` because each HTTP request +borrows one connection for the duration of the handler. In production, size +`maxTotal` to at least your expected concurrent request count plus the +worker pool's borrow rate. Setting it too low forces requests to block +waiting for a connection — a slow read-side cliff that doesn't show up +under load tests with very few clients. 
+ +### Inspect the store directly with redis-cli + +When testing or troubleshooting, the cli tells you everything: + +```bash +# How many users currently in the store +redis-cli --scan --pattern 'fs:user:*' | wc -l + +# One user's full hash and key-level TTL +redis-cli HGETALL fs:user:u0001 +redis-cli TTL fs:user:u0001 + +# Per-field TTL on the streaming fields +redis-cli HTTL fs:user:u0001 FIELDS 5 \ + last_login_ts last_device_id tx_count_5m failed_logins_15m session_country + +# Sample HMGET as the model would issue it +redis-cli HMGET fs:user:u0001 risk_segment tx_count_7d avg_amount_30d tx_count_5m +``` + +A streaming field that returns `-2` from `HTTL` doesn't exist on the hash +(either it was never written, or it expired); `-1` means the field has no +TTL set (and is therefore covered only by the key-level `EXPIRE`); any +positive value is the remaining TTL in seconds. + +## Learn more + +This example uses the following Redis commands: + +* [`HSET`]({{< relref "/commands/hset" >}}) to write a feature or a whole + feature row in one call. +* [`HMGET`]({{< relref "/commands/hmget" >}}) to retrieve any subset of + features for one entity in one round trip. +* [`HGETALL`]({{< relref "/commands/hgetall" >}}) for debugging and + feature-set discovery. +* [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) for per-field TTL on streaming + features (Redis 7.4+). +* [`EXPIRE`]({{< relref "/commands/expire" >}}) and + [`TTL`]({{< relref "/commands/ttl" >}}) for the whole-entity TTL aligned + with the batch materialization cycle. +* Pipelined `HMGET` across many entities for batch scoring with one network + round trip — see + [transactions and pipelining]({{< relref "/develop/clients/jedis/transpipe" >}}). 
+ +See the [Jedis documentation]({{< relref "/develop/clients/jedis" >}}) for +the full client reference, and the +[Hashes overview]({{< relref "/develop/data-types/hashes" >}}) for the deeper +conceptual model — including the listpack encoding that makes small hashes +particularly compact in memory, which matters at feature-store scale. diff --git a/content/develop/use-cases/feature-store/java-jedis/pom.xml b/content/develop/use-cases/feature-store/java-jedis/pom.xml new file mode 100644 index 0000000000..a7d8cd0f5a --- /dev/null +++ b/content/develop/use-cases/feature-store/java-jedis/pom.xml @@ -0,0 +1,88 @@ + + + + 4.0.0 + + com.redis.docs + feature-store-jedis + 0.1.0 + jar + + + 17 + UTF-8 + + + + + + redis.clients + jedis + 6.2.0 + + + + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.6.0 + + + add-source + generate-sources + add-source + + + ${project.basedir} + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + 17 + + *.java + + + + + + org.codehaus.mojo + exec-maven-plugin + 3.5.0 + + + + + ${project.basedir} + + From 18105f75de04e88f71b09eae9519b410f7f0efa6 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Fri, 29 May 2026 16:09:22 +0100 Subject: [PATCH 06/20] DOC-6661 Go and Jedis after Codex changes --- .../use-cases/feature-store/go/_index.md | 22 ++++++- .../use-cases/feature-store/go/demo_server.go | 16 ++--- .../feature-store/go/streaming_worker.go | 34 +++++++--- .../java-jedis/StreamingWorker.java | 62 +++++++++++++------ .../feature-store/java-jedis/_index.md | 29 ++++++--- .../feature-store/java-jedis/pom.xml | 3 +- 6 files changed, 119 insertions(+), 47 deletions(-) diff --git a/content/develop/use-cases/feature-store/go/_index.md b/content/develop/use-cases/feature-store/go/_index.md index b8be523e93..fcb44622dd 100644 --- a/content/develop/use-cases/feature-store/go/_index.md +++ b/content/develop/use-cases/feature-store/go/_index.md @@ -451,7 +451,11 @@ func (w *StreamingWorker) doTick(ctx context.Context) 
error { if len(ids) == 0 { return nil } - chosen := w.rng.Perm(len(ids))[:w.usersPerTick] + n := w.usersPerTick + if n > len(ids) { + n = len(ids) + } + chosen := w.rng.Perm(len(ids))[:n] nowMs := time.Now().UnixMilli() for _, idx := range chosen { fields := FeatureMap{ @@ -636,6 +640,22 @@ and the The feature-store demo runs against `localhost` with the defaults; a real deployment should harden the client first. +### Plumb the right context to each call site + +go-redis takes a `context.Context` on every command, and the right context +depends on the call site: + +* **Inference handlers**: pass `r.Context()` (the request context) into the + store calls. If the client hangs up, the in-flight `HMGET` is cancelled + promptly and the connection is returned to the pool — important under + sustained load. +* **Background workers**: pass a server-lifetime context (a + `context.Background()`-derived one stored on the worker struct, as + `StreamingWorker` does). A worker driven off `r.Context()` would die on + the very next tick after its triggering request completes. +* **Batch jobs**: a `context.WithTimeout` is the usual choice so a stuck + Redis can't hold the materialization pipeline open indefinitely. + ### Pick the batch TTL to outlast a failed refresher The whole-entity `EXPIRE` is your safety net against silent staleness from a diff --git a/content/develop/use-cases/feature-store/go/demo_server.go b/content/develop/use-cases/feature-store/go/demo_server.go index 2777da8a3c..7bb6ce8102 100644 --- a/content/develop/use-cases/feature-store/go/demo_server.go +++ b/content/develop/use-cases/feature-store/go/demo_server.go @@ -44,7 +44,6 @@ import ( "encoding/json" "flag" "fmt" - "log" "net/http" "sort" "strconv" @@ -116,12 +115,15 @@ func (d *FeatureStoreDemo) Reset(ctx context.Context) (int64, error) { } // ToggleWorker pauses or resumes the streaming worker. Starts the -// goroutine if it wasn't running. 
-func (d *FeatureStoreDemo) ToggleWorker(ctx context.Context) (paused, running bool) { +// goroutine if it wasn't running. The worker owns its own +// background-context lifecycle, so we don't plumb the request +// context in here (it would cancel as soon as the HTTP response +// completes and the worker would die on the next tick). +func (d *FeatureStoreDemo) ToggleWorker() (paused, running bool) { d.mu.Lock() defer d.mu.Unlock() if !d.worker.IsRunning() { - d.worker.Start(ctx) + d.worker.Start() } if d.worker.IsPaused() { d.worker.Resume() @@ -288,7 +290,7 @@ func (s *httpServer) handleToggleWorker(w http.ResponseWriter, r *http.Request) http.Error(w, "method not allowed", http.StatusMethodNotAllowed) return } - paused, running := s.demo.ToggleWorker(r.Context()) + paused, running := s.demo.ToggleWorker() jsonResponse(w, http.StatusOK, map[string]any{ "paused": paused, "running": running, @@ -495,7 +497,7 @@ func RunDemoServer(args []string) error { return fmt.Errorf("seed materialize: %w", err) } - worker.Start(ctx) + worker.Start() defer worker.Stop() srv := &httpServer{store: store, worker: worker, demo: demo} @@ -508,7 +510,7 @@ func RunDemoServer(args []string) error { fmt.Printf("Materialized %d user(s); streaming worker running.\n", seeded) if err := hs.ListenAndServe(); err != nil && err != http.ErrServerClosed { - log.Fatalf("listen: %v", err) + return fmt.Errorf("listen: %w", err) } return nil } diff --git a/content/develop/use-cases/feature-store/go/streaming_worker.go b/content/develop/use-cases/feature-store/go/streaming_worker.go index 5e350899e3..f061382149 100644 --- a/content/develop/use-cases/feature-store/go/streaming_worker.go +++ b/content/develop/use-cases/feature-store/go/streaming_worker.go @@ -77,14 +77,20 @@ func NewStreamingWorker(store *FeatureStore, tick time.Duration, usersPerTick in // Start launches the goroutine that ticks. Safe to call when the // worker is already running (no-op in that case). 
-func (w *StreamingWorker) Start(ctx context.Context) { +// +// The worker uses an internal `context.Background()`-derived context +// rather than one passed in by the caller: the HTTP toggle handler +// runs on a request-scoped context that cancels as soon as the +// response completes, which would kill the worker on the very next +// tick. Lifecycle is owned by ``Stop`` (and the internal ``stopCh``). +func (w *StreamingWorker) Start() { if !w.running.CompareAndSwap(false, true) { return } w.paused.Store(false) w.stopCh = make(chan struct{}) w.doneCh = make(chan struct{}) - go w.run(ctx) + go w.run(context.Background()) } // Stop signals the worker to exit and waits for any in-flight tick @@ -140,7 +146,16 @@ func (w *StreamingWorker) ResetStats() { } func (w *StreamingWorker) run(ctx context.Context) { - defer close(w.doneCh) + // Whatever exits this goroutine — stopCh, ctx.Done(), or a future + // panic-recovery path — must clear `running` so a later Start() + // can spin a fresh goroutine. Without this, a one-shot ctx cancel + // (or any unexpected exit) leaves IsRunning() returning true + // forever, and ToggleWorker's CompareAndSwap refuses to restart. + defer func() { + w.running.Store(false) + w.tickInFlight.Store(false) + close(w.doneCh) + }() t := time.NewTicker(w.tick) defer t.Stop() for { @@ -150,12 +165,15 @@ func (w *StreamingWorker) run(ctx context.Context) { case <-ctx.Done(): return case <-t.C: - if w.paused.Load() { - continue - } + // Set tickInFlight *before* the pause check so a + // concurrent Pause()+WaitForIdle() can never see + // tickInFlight=false in the window between the pause + // check and the actual doTick call. 
w.tickInFlight.Store(true) - if err := w.doTick(ctx); err != nil { - log.Printf("[streaming-worker] tick failed: %v", err) + if !w.paused.Load() { + if err := w.doTick(ctx); err != nil { + log.Printf("[streaming-worker] tick failed: %v", err) + } } w.tickInFlight.Store(false) } diff --git a/content/develop/use-cases/feature-store/java-jedis/StreamingWorker.java b/content/develop/use-cases/feature-store/java-jedis/StreamingWorker.java index e0bdb336ac..e8c495ab0f 100644 --- a/content/develop/use-cases/feature-store/java-jedis/StreamingWorker.java +++ b/content/develop/use-cases/feature-store/java-jedis/StreamingWorker.java @@ -99,14 +99,22 @@ public synchronized void stop() { * might still be writing to. */ public void waitForIdle() { + // Reset cannot safely proceed while a tick is mid-write, so an + // interrupt during the wait must NOT short-circuit out with + // tickInFlight still true. Save the interrupt status, keep + // looping until the tick clears, then restore the flag so the + // caller can act on it if they care. 
+ boolean interrupted = false; while (tickInFlight.get()) { try { Thread.sleep(20); } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - return; + interrupted = true; } } + if (interrupted) { + Thread.currentThread().interrupt(); + } } // --------------------------------------------------------------- @@ -114,24 +122,40 @@ public void waitForIdle() { // --------------------------------------------------------------- private void run() { - while (running.get()) { - try { - Thread.sleep(tickMillis); - } catch (InterruptedException e) { - Thread.currentThread().interrupt(); - break; - } - if (!running.get()) break; - if (paused.get()) continue; - - tickInFlight.set(true); - try { - doTick(); - } catch (Exception e) { - System.err.printf("[streaming-worker] tick failed: %s%n", e.getMessage()); - } finally { - tickInFlight.set(false); + try { + while (running.get()) { + try { + Thread.sleep(tickMillis); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } + if (!running.get()) break; + + // Set tickInFlight *before* the pause check so a + // concurrent pause()+waitForIdle() can never see + // tickInFlight=false in the window between the pause + // check and the actual doTick call. The finally + // block clears the flag whether we paused, succeeded, + // or threw. + tickInFlight.set(true); + try { + if (!paused.get()) { + doTick(); + } + } catch (Exception e) { + System.err.printf("[streaming-worker] tick failed: %s%n", e.getMessage()); + } finally { + tickInFlight.set(false); + } } + } finally { + // Whatever exits this thread — running flipping false, + // an interrupt, or any unexpected throw — must clear + // both the running and in-flight flags so a later start() + // can spin a fresh thread. 
+ running.set(false); + tickInFlight.set(false); } } diff --git a/content/develop/use-cases/feature-store/java-jedis/_index.md b/content/develop/use-cases/feature-store/java-jedis/_index.md index 71b331a00a..a2f61eec3c 100644 --- a/content/develop/use-cases/feature-store/java-jedis/_index.md +++ b/content/develop/use-cases/feature-store/java-jedis/_index.md @@ -528,8 +528,8 @@ connections from. Endpoints: * **Redis 7.4 or later.** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and [`HTTL`]({{< relref "/commands/httl" >}}) were added in Redis 7.4; the demo relies on per-field TTL for the mixed-staleness story. -* **Java 17 or later.** The demo uses pattern-matching `switch`, records, - and text blocks. +* **Java 17 or later.** The demo uses switch expressions with arrow + labels (`case "..." -> ...`), records, and text blocks. * **Jedis 5.2 or later.** The demo's `pom.xml` pins `redis.clients:jedis:6.2.0`. Field-level TTL bindings (`hexpire`, `httl`, `hpersist`) ship from Jedis 5.2. @@ -604,9 +604,11 @@ is `fs:user:`. Pass `--no-reset` to keep existing data across restarts, or The guidance below focuses on the production concerns that are specific to running a feature store on Redis. For the generic Jedis production checklist -— `JedisPool` sizing, TLS, AUTH/ACL, retry policy, sentinel/cluster -failover — see the +— `JedisPool` sizing, AUTH/ACL, retry policy, sentinel/cluster failover — +see the [Jedis production usage guide]({{< relref "/develop/clients/jedis/produsage" >}}). +For TLS specifically, follow the +[connect-with-TLS recipe]({{< relref "/develop/clients/jedis/connect#connect-to-your-production-redis-with-tls" >}}). The feature-store demo runs against `localhost` with the defaults; a real deployment should harden the client first. @@ -676,12 +678,19 @@ this reason. ### Size the JedisPool for the request shape -The demo creates a `JedisPool` with `maxTotal=64` because each HTTP request -borrows one connection for the duration of the handler. 
In production, size -`maxTotal` to at least your expected concurrent request count plus the -worker pool's borrow rate. Setting it too low forces requests to block -waiting for a connection — a slow read-side cliff that doesn't show up -under load tests with very few clients. +Every `FeatureStore` helper method borrows a connection from the +`JedisPool` for the duration of one Redis call (or one `Pipeline.sync()`) +and returns it via the try-with-resources block. One HTTP handler can +therefore borrow several connections sequentially — `/read`, for example, +makes one `hmget` call, one `httl` call, and one `ttl` call, each of +which is its own borrow. + +The demo uses `maxTotal=64`. In production, size `maxTotal` to comfortably +exceed your peak concurrent borrow count: that's roughly +`(concurrent HTTP handlers × Redis calls per handler in flight at once) + +(background worker borrow rate)`. Setting it too low forces some borrows +to block waiting for a returned connection — a slow read-side cliff that +doesn't show up under load tests with very few clients. ### Inspect the store directly with redis-cli diff --git a/content/develop/use-cases/feature-store/java-jedis/pom.xml b/content/develop/use-cases/feature-store/java-jedis/pom.xml index a7d8cd0f5a..fbed81d6dd 100644 --- a/content/develop/use-cases/feature-store/java-jedis/pom.xml +++ b/content/develop/use-cases/feature-store/java-jedis/pom.xml @@ -30,8 +30,7 @@ + (hexpire / httl / hpersist); the demo pins 6.2.0. 
--> redis.clients jedis From f5daa75c82958623308a58d755fecb7c83c4795b Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 09:34:05 +0100 Subject: [PATCH 07/20] DOC-6661 Lettuce example plus some fixes --- .../develop/use-cases/feature-store/_index.md | 1 + .../use-cases/feature-store/go/demo_server.go | 25 +- .../feature-store/java-jedis/DemoServer.java | 16 +- .../java-lettuce/BuildFeatures.java | 115 ++ .../java-lettuce/DemoServer.java | 1036 +++++++++++++++++ .../java-lettuce/FeatureStore.java | 529 +++++++++ .../java-lettuce/StreamingWorker.java | 235 ++++ .../feature-store/java-lettuce/_index.md | 657 +++++++++++ .../feature-store/java-lettuce/pom.xml | 82 ++ .../feature-store/nodejs/demoServer.js | 20 +- .../feature-store/redis-py/demo_server.py | 16 +- 11 files changed, 2719 insertions(+), 13 deletions(-) create mode 100644 content/develop/use-cases/feature-store/java-lettuce/BuildFeatures.java create mode 100644 content/develop/use-cases/feature-store/java-lettuce/DemoServer.java create mode 100644 content/develop/use-cases/feature-store/java-lettuce/FeatureStore.java create mode 100644 content/develop/use-cases/feature-store/java-lettuce/StreamingWorker.java create mode 100644 content/develop/use-cases/feature-store/java-lettuce/_index.md create mode 100644 content/develop/use-cases/feature-store/java-lettuce/pom.xml diff --git a/content/develop/use-cases/feature-store/_index.md b/content/develop/use-cases/feature-store/_index.md index 65d8ca2d9c..6e99909fd4 100644 --- a/content/develop/use-cases/feature-store/_index.md +++ b/content/develop/use-cases/feature-store/_index.md @@ -159,3 +159,4 @@ for a single user under 1 ms, and pipeline batch reads across a hundred users. 
* [node-redis (Node.js)]({{< relref "/develop/use-cases/feature-store/nodejs" >}}) * [go-redis (Go)]({{< relref "/develop/use-cases/feature-store/go" >}}) * [Jedis (Java)]({{< relref "/develop/use-cases/feature-store/java-jedis" >}}) +* [Lettuce (Java)]({{< relref "/develop/use-cases/feature-store/java-lettuce" >}}) diff --git a/content/develop/use-cases/feature-store/go/demo_server.go b/content/develop/use-cases/feature-store/go/demo_server.go index 7bb6ce8102..8ce1b63e23 100644 --- a/content/develop/use-cases/feature-store/go/demo_server.go +++ b/content/develop/use-cases/feature-store/go/demo_server.go @@ -222,9 +222,24 @@ func (s *httpServer) handleInspect(w http.ResponseWriter, r *http.Request) { }) return } - names := make([]string, 0, len(full)) - for n := range full { + // Iterate the known schema (batch + streaming) plus any extras the + // hash carries. Expired streaming fields surface as ttl_seconds=-2 + // in the Inspect view instead of silently disappearing, which is + // exactly the debugging view someone hits "Inspect" for. 
+ seen := make(map[string]struct{}, len(DefaultBatchFields)+len(DefaultStreamingFields)) + names := make([]string, 0, len(DefaultBatchFields)+len(DefaultStreamingFields)+len(full)) + for _, n := range DefaultBatchFields { + names = append(names, n) + seen[n] = struct{}{} + } + for _, n := range DefaultStreamingFields { names = append(names, n) + seen[n] = struct{}{} + } + for n := range full { + if _, ok := seen[n]; !ok { + names = append(names, n) + } } ttls, err := s.store.FieldTTLsSeconds(ctx, user, names) if err != nil { @@ -233,10 +248,14 @@ func (s *httpServer) handleInspect(w http.ResponseWriter, r *http.Request) { } rows := make([]map[string]any, 0, len(names)) for _, n := range names { + ttl, ok := ttls[n] + if !ok { + ttl = -2 + } rows = append(rows, map[string]any{ "name": n, "value": full[n], - "ttl_seconds": ttls[n], + "ttl_seconds": ttl, }) } sort.Slice(rows, func(i, j int) bool { diff --git a/content/develop/use-cases/feature-store/java-jedis/DemoServer.java b/content/develop/use-cases/feature-store/java-jedis/DemoServer.java index e56429e927..8822a3e186 100644 --- a/content/develop/use-cases/feature-store/java-jedis/DemoServer.java +++ b/content/develop/use-cases/feature-store/java-jedis/DemoServer.java @@ -259,15 +259,25 @@ static class InspectHandler implements HttpHandler { "key_ttl_seconds", keyTTL)); return; } - List names = new ArrayList<>(full.keySet()); + // Iterate the known schema (batch + streaming) plus + // any extras the hash carries. Expired streaming + // fields surface as ttl_seconds=-2 in the Inspect + // view instead of silently disappearing, which is + // exactly the debugging view someone hits "Inspect" + // for. 
+ List names = new ArrayList<>(FeatureStore.DEFAULT_BATCH_FIELDS); + names.addAll(FeatureStore.DEFAULT_STREAMING_FIELDS); + for (String n : full.keySet()) { + if (!names.contains(n)) names.add(n); + } Map ttls = store.fieldTtlsSeconds(user, names); Collections.sort(names); List> fields = new ArrayList<>(names.size()); for (String n : names) { Map row = new LinkedHashMap<>(); row.put("name", n); - row.put("value", full.get(n)); - row.put("ttl_seconds", ttls.getOrDefault(n, -1L)); + row.put("value", full.getOrDefault(n, "")); + row.put("ttl_seconds", ttls.getOrDefault(n, -2L)); fields.add(row); } sendJson(ex, 200, Map.of( diff --git a/content/develop/use-cases/feature-store/java-lettuce/BuildFeatures.java b/content/develop/use-cases/feature-store/java-lettuce/BuildFeatures.java new file mode 100644 index 0000000000..ae89261231 --- /dev/null +++ b/content/develop/use-cases/feature-store/java-lettuce/BuildFeatures.java @@ -0,0 +1,115 @@ +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; + +import io.lettuce.core.RedisClient; +import io.lettuce.core.api.StatefulRedisConnection; + +/** + * Synthesize a small batch of users with realistic-looking features + * and bulk-load them into Redis with a 24-hour key-level TTL. + * + *

Stands in for the nightly Spark / Feast materialization job in a + * real deployment. In production the equivalent of this script lives + * in an offline pipeline that reads from the offline store and writes + * the serving-time hashes into Redis via {@code HSET} + {@code EXPIRE}.

+ * + *

Run with: {@code mvn exec:java -Dexec.mainClass=BuildFeatures -Dexec.args="--count 500"}

+ */ +public class BuildFeatures { + + private static final List COUNTRY_CHOICES = List.of( + "US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL"); + private static final List RISK_SEGMENTS = List.of("low", "medium", "high"); + private static final int[] RISK_WEIGHTS = {70, 25, 5}; + private static final int[] CHARGEBACK_BUCKETS = {0, 1, 2, 3}; + private static final int[] CHARGEBACK_WEIGHTS = {85, 10, 4, 1}; + + public static Map> synthesizeUsers(int count, long seed) { + Random rng = new Random(seed); + Map> users = new LinkedHashMap<>(count); + for (int i = 1; i <= count; i++) { + String uid = String.format("u%04d", i); + Map row = new LinkedHashMap<>(); + row.put("country_iso", COUNTRY_CHOICES.get(rng.nextInt(COUNTRY_CHOICES.size()))); + row.put("risk_segment", weightedChoice(rng, RISK_SEGMENTS, RISK_WEIGHTS)); + row.put("account_age_days", 7 + rng.nextInt(2394)); + row.put("tx_count_7d", rng.nextInt(81)); + row.put("avg_amount_30d", Math.round((5.0 + rng.nextDouble() * 345.0) * 100.0) / 100.0); + row.put("chargeback_count_180d", weightedChoiceInt(rng, CHARGEBACK_BUCKETS, CHARGEBACK_WEIGHTS)); + users.put(uid, row); + } + return users; + } + + public static void main(String[] args) { + String redisUri = "redis://localhost:6379"; + int count = 200; + long ttlSeconds = 24L * 60L * 60L; + String keyPrefix = "fs:user:"; + long seed = 42L; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--redis-uri" -> redisUri = args[++i]; + case "--count" -> count = Integer.parseInt(args[++i]); + case "--ttl-seconds" -> ttlSeconds = Long.parseLong(args[++i]); + case "--key-prefix" -> keyPrefix = args[++i]; + case "--seed" -> seed = Long.parseLong(args[++i]); + case "-h", "--help" -> { + System.out.println( + "Usage: mvn exec:java -Dexec.mainClass=BuildFeatures " + + "-Dexec.args=\"[--redis-uri URI] [--count N] " + + "[--ttl-seconds S] [--key-prefix PREFIX] [--seed N]\""); + return; + } + default -> { + System.err.println("Unknown argument: " + 
args[i]); + System.exit(2); + } + } + } + + RedisClient client = RedisClient.create(redisUri); + // A one-shot CLI doesn't have concurrent callers, but the + // FeatureStore helper still expects a dedicated pipeline + // connection for its batched paths — open one and let + // try-with-resources close both at the end. + try (StatefulRedisConnection conn = client.connect(); + StatefulRedisConnection pipelineConn = client.connect()) { + FeatureStore store = new FeatureStore(conn, pipelineConn, + keyPrefix, ttlSeconds, + FeatureStore.DEFAULT_STREAMING_TTL_SECONDS); + Map> rows = synthesizeUsers(count, seed); + int loaded = store.bulkLoad(rows, ttlSeconds); + System.out.printf( + "Materialized %d users at %s* with a %ds key-level TTL.%n", + loaded, keyPrefix, ttlSeconds); + } finally { + client.shutdown(); + } + } + + private static String weightedChoice(Random rng, List items, int[] weights) { + int total = 0; + for (int w : weights) total += w; + int r = rng.nextInt(total); + for (int i = 0; i < items.size(); i++) { + r -= weights[i]; + if (r < 0) return items.get(i); + } + return items.get(items.size() - 1); + } + + private static int weightedChoiceInt(Random rng, int[] items, int[] weights) { + int total = 0; + for (int w : weights) total += w; + int r = rng.nextInt(total); + for (int i = 0; i < items.length; i++) { + r -= weights[i]; + if (r < 0) return items[i]; + } + return items[items.length - 1]; + } +} diff --git a/content/develop/use-cases/feature-store/java-lettuce/DemoServer.java b/content/develop/use-cases/feature-store/java-lettuce/DemoServer.java new file mode 100644 index 0000000000..cee9427c88 --- /dev/null +++ b/content/develop/use-cases/feature-store/java-lettuce/DemoServer.java @@ -0,0 +1,1036 @@ +import com.sun.net.httpserver.HttpExchange; +import com.sun.net.httpserver.HttpHandler; +import com.sun.net.httpserver.HttpServer; + +import java.io.IOException; +import java.io.OutputStream; +import java.net.InetSocketAddress; +import java.net.URI; 
+import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Executors; +import java.util.concurrent.locks.ReentrantLock; + +import io.lettuce.core.RedisClient; +import io.lettuce.core.api.StatefulRedisConnection; + +/** + * Redis feature-store demo server (Lettuce + JDK HttpServer). + * + *

Run with {@code mvn exec:java -Dexec.mainClass=DemoServer} and + * visit {@code http://localhost:8089} to watch an online feature + * store at work: a batch materialization loads N users with a 24-hour + * key-level TTL, a streaming worker overwrites a handful of users' + * real-time features every second with a per-field {@code HEXPIRE}, + * and the inference panel reads any subset of features for any user + * with {@code HMGET} in a single round trip.

+ * + *

The Lettuce demo shares one multiplexed {@code StatefulRedisConnection} + * across the HTTP thread pool and the streaming worker for ordinary commands, + * plus a second dedicated connection for the pipelined batched paths — Lettuce + * connections are thread-safe, so no pool is required for this workload. + * See the walkthrough for when to add one anyway (blocking commands, very high contention).

+ */ +public class DemoServer { + + private static FeatureStore store; + private static StreamingWorker worker; + private static FeatureStoreDemo demo; + private static RedisClient redisClient; + /** Shared connection for non-pipelined reads (multiplexed across the HTTP pool). */ + private static StatefulRedisConnection redisConn; + /** Dedicated connection for the pipelined batched paths (auto-flush toggled). */ + private static StatefulRedisConnection redisPipelineConn; + + public static void main(String[] args) throws Exception { + String host = "127.0.0.1"; + int port = 8089; + String redisUri = "redis://localhost:6379"; + String keyPrefix = "fs:user:"; + long batchTtlSeconds = 24L * 60L * 60L; + long streamingTtlSeconds = 5L * 60L; + int usersPerTick = 5; + int seedUsers = 200; + boolean resetOnStart = true; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--host" -> host = args[++i]; + case "--port" -> port = Integer.parseInt(args[++i]); + case "--redis-uri" -> redisUri = args[++i]; + case "--key-prefix" -> keyPrefix = args[++i]; + case "--batch-ttl-seconds" -> batchTtlSeconds = Long.parseLong(args[++i]); + case "--streaming-ttl-seconds" -> streamingTtlSeconds = Long.parseLong(args[++i]); + case "--users-per-tick" -> usersPerTick = Integer.parseInt(args[++i]); + case "--seed-users" -> seedUsers = Integer.parseInt(args[++i]); + case "--no-reset" -> resetOnStart = false; + case "-h", "--help" -> { + System.out.println( + "Usage: mvn exec:java -Dexec.mainClass=DemoServer " + + "-Dexec.args=\"[--host H] [--port P] [--redis-uri URI] " + + "[--key-prefix PFX] " + + "[--batch-ttl-seconds S] [--streaming-ttl-seconds S] " + + "[--users-per-tick N] [--seed-users N] [--no-reset]\""); + return; + } + default -> { + System.err.println("Unknown argument: " + args[i]); + System.exit(2); + } + } + } + + redisClient = RedisClient.create(redisUri); + // Two connections: the first is multiplexed across the HTTP + // thread pool and the streaming worker 
for ordinary + // (auto-flushed) commands; the second is reserved for the + // pipelined batched paths in FeatureStore so the auto-flush + // toggle never races with another caller's reads. + redisConn = redisClient.connect(); + redisPipelineConn = redisClient.connect(); + + store = new FeatureStore(redisConn, redisPipelineConn, keyPrefix, + batchTtlSeconds, streamingTtlSeconds); + worker = new StreamingWorker(store, 1000L, usersPerTick, 1337L); + demo = new FeatureStoreDemo(store, worker, 42L); + + if (resetOnStart) { + System.out.printf( + "Dropping any existing users under '%s*' for a clean demo run (pass --no-reset to keep them).%n", + keyPrefix); + store.reset(); + store.resetStats(); + } + int seeded = demo.materialize(seedUsers, batchTtlSeconds).loaded(); + worker.start(); + + HttpServer server = HttpServer.create(new InetSocketAddress(host, port), 0); + server.createContext("/", new RootHandler()); + server.createContext("/state", new StateHandler()); + server.createContext("/inspect", new InspectHandler()); + server.createContext("/bulk-load", new BulkLoadHandler()); + server.createContext("/reset", new ResetHandler()); + server.createContext("/worker/toggle", new ToggleWorkerHandler()); + server.createContext("/read", new ReadHandler()); + server.createContext("/batch-read", new BatchReadHandler()); + server.setExecutor(Executors.newFixedThreadPool(16)); + server.start(); + + System.out.printf("Redis feature-store demo server listening on http://%s:%d%n", host, port); + System.out.printf( + "Using Redis at %s with key prefix '%s' (batch TTL %ds, streaming TTL %ds)%n", + redisUri, keyPrefix, batchTtlSeconds, streamingTtlSeconds); + System.out.printf("Materialized %d user(s); streaming worker running.%n", seeded); + + Runtime.getRuntime().addShutdownHook(new Thread(() -> { + System.out.println("\nShutting down..."); + worker.stop(); + server.stop(0); + redisConn.close(); + redisPipelineConn.close(); + redisClient.shutdown(); + })); + + 
Thread.currentThread().join(); + } + + // --------------------------------------------------------------- + // FeatureStoreDemo wires the store and worker with the lifecycle + // operations the HTTP handlers call into. + // --------------------------------------------------------------- + + static class FeatureStoreDemo { + private final FeatureStore store; + private final StreamingWorker worker; + private final long seed; + private final ReentrantLock lock = new ReentrantLock(); + + FeatureStoreDemo(FeatureStore store, StreamingWorker worker, long seed) { + this.store = store; + this.worker = worker; + this.seed = seed; + } + + public record MaterializeResult(int loaded, long ttlSeconds, double elapsedMs) {} + + public MaterializeResult materialize(int count, long ttlSeconds) { + lock.lock(); + try { + Map> rows = BuildFeatures.synthesizeUsers(count, seed); + long t0 = System.nanoTime(); + int loaded = store.bulkLoad(rows, ttlSeconds); + double elapsedMs = (System.nanoTime() - t0) / 1_000_000.0; + return new MaterializeResult(loaded, ttlSeconds, elapsedMs); + } finally { + lock.unlock(); + } + } + + public long reset() { + lock.lock(); + try { + // Pause the streaming worker around the DEL sweep so a + // concurrent tick can't recreate a user that was just + // enumerated for deletion. pause() only blocks + // *future* ticks — waitForIdle() flushes an + // already-running tick before the DEL sweep starts. 
+ boolean wasPaused = worker.isPaused(); + if (worker.isRunning()) { + if (!wasPaused) worker.pause(); + worker.waitForIdle(); + } + try { + long deleted = store.reset(); + store.resetStats(); + worker.resetStats(); + return deleted; + } finally { + if (worker.isRunning() && !wasPaused) worker.resume(); + } + } finally { + lock.unlock(); + } + } + + public Map toggleWorker() { + lock.lock(); + try { + if (!worker.isRunning()) worker.start(); + if (worker.isPaused()) worker.resume(); + else worker.pause(); + return Map.of( + "paused", worker.isPaused(), + "running", worker.isRunning() + ); + } finally { + lock.unlock(); + } + } + } + + // --------------------------------------------------------------- + // Handlers (identical to the Jedis demo apart from the request URI) + // --------------------------------------------------------------- + + static class RootHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!ex.getRequestURI().getPath().equals("/") && + !ex.getRequestURI().getPath().equals("/index.html")) { + send(ex, 404, "text/plain", "Not Found"); + return; + } + send(ex, 200, "text/html; charset=utf-8", htmlPage()); + } + } + + static class StateHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"GET".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + try { + List ids = store.listEntityIds(500); + long count = store.countEntities(); + Map out = new LinkedHashMap<>(); + out.put("key_prefix", store.getKeyPrefix()); + out.put("batch_ttl_seconds", store.getBatchTtlSeconds()); + out.put("streaming_ttl_seconds", store.getStreamingTtlSeconds()); + out.put("entity_count", count); + out.put("entity_ids", ids); + out.put("stats", statsToMap(store.stats())); + out.put("worker", workerStatsToMap(worker.statsSnapshot())); + sendJson(ex, 200, out); + } catch (Exception e) { + sendJson(ex, 500, 
Map.of("error", e.getMessage())); + } + } + } + + static class InspectHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"GET".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + Map q = parseQuery(ex.getRequestURI()); + String user = q.getOrDefault("user", "").trim(); + if (user.isEmpty()) { + sendJson(ex, 400, Map.of("error", "user is required")); return; + } + try { + Map full = store.getAllFeatures(user); + long keyTTL = store.keyTtlSeconds(user); + if (full.isEmpty()) { + sendJson(ex, 200, Map.of( + "exists", false, + "key_ttl_seconds", keyTTL)); + return; + } + // Iterate the known schema (batch + streaming) plus + // any extras the hash carries. Expired streaming + // fields surface as ttl_seconds=-2 in the Inspect + // view instead of silently disappearing, which is + // exactly the debugging view someone hits "Inspect" + // for. + List names = new ArrayList<>(FeatureStore.DEFAULT_BATCH_FIELDS); + names.addAll(FeatureStore.DEFAULT_STREAMING_FIELDS); + for (String n : full.keySet()) { + if (!names.contains(n)) names.add(n); + } + Map ttls = store.fieldTtlsSeconds(user, names); + Collections.sort(names); + List> fields = new ArrayList<>(names.size()); + for (String n : names) { + Map row = new LinkedHashMap<>(); + row.put("name", n); + row.put("value", full.getOrDefault(n, "")); + row.put("ttl_seconds", ttls.getOrDefault(n, -2L)); + fields.add(row); + } + sendJson(ex, 200, Map.of( + "exists", true, + "key_ttl_seconds", keyTTL, + "fields", fields)); + } catch (Exception e) { + sendJson(ex, 500, Map.of("error", e.getMessage())); + } + } + } + + static class BulkLoadHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"POST".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + Map form = parseForm(ex); + int count = 
clamp(parseIntOr(form.get("count"), 200), 1, 2000); + long ttl = (long) clamp(parseIntOr(form.get("ttl"), 86400), 5, 172_800); + try { + FeatureStoreDemo.MaterializeResult r = demo.materialize(count, ttl); + sendJson(ex, 200, Map.of( + "loaded", r.loaded(), + "ttl_seconds", r.ttlSeconds(), + "elapsed_ms", r.elapsedMs())); + } catch (Exception e) { + sendJson(ex, 500, Map.of("error", e.getMessage())); + } + } + } + + static class ResetHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"POST".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + try { + long deleted = demo.reset(); + sendJson(ex, 200, Map.of("deleted", deleted)); + } catch (Exception e) { + sendJson(ex, 500, Map.of("error", e.getMessage())); + } + } + } + + static class ToggleWorkerHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"POST".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + sendJson(ex, 200, demo.toggleWorker()); + } + } + + static class ReadHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"POST".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + Map> form = parseFormMulti(ex); + String user = first(form.get("user"), "").trim(); + if (user.isEmpty()) { + sendJson(ex, 400, Map.of("error", "user is required")); return; + } + List fields = nonEmpty(form.getOrDefault("field", List.of())); + try { + long t0 = System.nanoTime(); + Map values = fields.isEmpty() + ? Collections.emptyMap() + : store.getFeatures(user, fields); + double elapsedMs = (System.nanoTime() - t0) / 1_000_000.0; + Map ttls = fields.isEmpty() + ? 
Collections.emptyMap() + : store.fieldTtlsSeconds(user, fields); + long keyTTL = store.keyTtlSeconds(user); + Map out = new LinkedHashMap<>(); + out.put("requested", fields); + out.put("values", values); + out.put("ttls", ttls); + out.put("key_ttl_seconds", keyTTL); + out.put("returned_count", values.size()); + out.put("elapsed_ms", elapsedMs); + sendJson(ex, 200, out); + } catch (Exception e) { + sendJson(ex, 500, Map.of("error", e.getMessage())); + } + } + } + + static class BatchReadHandler implements HttpHandler { + @Override public void handle(HttpExchange ex) throws IOException { + if (!"POST".equalsIgnoreCase(ex.getRequestMethod())) { + sendJson(ex, 405, Map.of("error", "method not allowed")); return; + } + Map> form = parseFormMulti(ex); + int count = clamp(parseIntOr(first(form.get("count"), "100"), 100), 1, 500); + List fields = nonEmpty(form.getOrDefault("field", List.of())); + if (fields.isEmpty()) { + fields = new ArrayList<>(FeatureStore.DEFAULT_STREAMING_FIELDS); + fields.add("risk_segment"); + } + try { + List ids = store.listEntityIds(Math.max(count * 2, 2000)); + if (ids.size() > count) ids = ids.subList(0, count); + long t0 = System.nanoTime(); + Map> rows = store.batchGetFeatures(ids, fields); + double elapsedMs = (System.nanoTime() - t0) / 1_000_000.0; + int sampleN = Math.min(10, ids.size()); + List> sample = new ArrayList<>(sampleN); + for (int i = 0; i < sampleN; i++) { + String id = ids.get(i); + Map r = new LinkedHashMap<>(); + r.put("id", id); + r.put("field_count", rows.getOrDefault(id, Collections.emptyMap()).size()); + sample.add(r); + } + Map out = new LinkedHashMap<>(); + out.put("entity_count", ids.size()); + out.put("field_count", fields.size()); + out.put("elapsed_ms", elapsedMs); + out.put("sample", sample); + sendJson(ex, 200, out); + } catch (Exception e) { + sendJson(ex, 500, Map.of("error", e.getMessage())); + } + } + } + + // --------------------------------------------------------------- + // HTTP plumbing (mirrors the 
Jedis demo verbatim) + // --------------------------------------------------------------- + + private static void send(HttpExchange ex, int status, String contentType, String body) throws IOException { + byte[] bytes = body.getBytes(StandardCharsets.UTF_8); + ex.getResponseHeaders().set("Content-Type", contentType); + ex.sendResponseHeaders(status, bytes.length); + try (OutputStream os = ex.getResponseBody()) { os.write(bytes); } + } + + private static void sendJson(HttpExchange ex, int status, Object payload) throws IOException { + send(ex, status, "application/json", toJson(payload)); + } + + private static String toJson(Object o) { + StringBuilder sb = new StringBuilder(); + appendJson(sb, o); + return sb.toString(); + } + + @SuppressWarnings("unchecked") + private static void appendJson(StringBuilder sb, Object o) { + if (o == null) { sb.append("null"); return; } + if (o instanceof Boolean b) { sb.append(b ? "true" : "false"); return; } + if (o instanceof Number n) { sb.append(n.toString()); return; } + if (o instanceof Map m) { + sb.append('{'); + boolean first = true; + for (Map.Entry e : ((Map) m).entrySet()) { + if (!first) sb.append(','); + first = false; + appendJsonString(sb, String.valueOf(e.getKey())); + sb.append(':'); + appendJson(sb, e.getValue()); + } + sb.append('}'); + return; + } + if (o instanceof List l) { + sb.append('['); + boolean first = true; + for (Object v : l) { + if (!first) sb.append(','); + first = false; + appendJson(sb, v); + } + sb.append(']'); + return; + } + if (o.getClass().isArray()) { + sb.append('['); + int len = java.lang.reflect.Array.getLength(o); + for (int i = 0; i < len; i++) { + if (i > 0) sb.append(','); + appendJson(sb, java.lang.reflect.Array.get(o, i)); + } + sb.append(']'); + return; + } + appendJsonString(sb, String.valueOf(o)); + } + + private static void appendJsonString(StringBuilder sb, String s) { + sb.append('"'); + for (int i = 0; i < s.length(); i++) { + char c = s.charAt(i); + switch (c) { + case '"' 
-> sb.append("\\\""); + case '\\' -> sb.append("\\\\"); + case '\n' -> sb.append("\\n"); + case '\r' -> sb.append("\\r"); + case '\t' -> sb.append("\\t"); + default -> { + if (c < 0x20) sb.append(String.format("\\u%04x", (int) c)); + else sb.append(c); + } + } + } + sb.append('"'); + } + + private static Map parseQuery(URI uri) { + Map out = new HashMap<>(); + String q = uri.getRawQuery(); + if (q == null) return out; + for (String pair : q.split("&")) { + int eq = pair.indexOf('='); + if (eq < 0) continue; + String k = java.net.URLDecoder.decode(pair.substring(0, eq), StandardCharsets.UTF_8); + String v = java.net.URLDecoder.decode(pair.substring(eq + 1), StandardCharsets.UTF_8); + out.put(k, v); + } + return out; + } + + private static Map parseForm(HttpExchange ex) throws IOException { + byte[] body = ex.getRequestBody().readAllBytes(); + Map out = new HashMap<>(); + if (body.length == 0) return out; + for (String pair : new String(body, StandardCharsets.UTF_8).split("&")) { + int eq = pair.indexOf('='); + if (eq < 0) continue; + String k = java.net.URLDecoder.decode(pair.substring(0, eq), StandardCharsets.UTF_8); + String v = java.net.URLDecoder.decode(pair.substring(eq + 1), StandardCharsets.UTF_8); + out.put(k, v); + } + return out; + } + + private static Map> parseFormMulti(HttpExchange ex) throws IOException { + byte[] body = ex.getRequestBody().readAllBytes(); + Map> out = new HashMap<>(); + if (body.length == 0) return out; + for (String pair : new String(body, StandardCharsets.UTF_8).split("&")) { + int eq = pair.indexOf('='); + if (eq < 0) continue; + String k = java.net.URLDecoder.decode(pair.substring(0, eq), StandardCharsets.UTF_8); + String v = java.net.URLDecoder.decode(pair.substring(eq + 1), StandardCharsets.UTF_8); + out.computeIfAbsent(k, x -> new ArrayList<>()).add(v); + } + return out; + } + + private static String first(List values, String def) { + return values == null || values.isEmpty() ? 
def : values.get(0); + } + + private static List nonEmpty(List in) { + List out = new ArrayList<>(in.size()); + for (String v : in) if (v != null && !v.isEmpty()) out.add(v); + return out; + } + + private static int parseIntOr(String s, int def) { + if (s == null || s.isEmpty()) return def; + try { return Integer.parseInt(s); } catch (NumberFormatException e) { return def; } + } + + private static int clamp(int n, int low, int high) { + return n < low ? low : (n > high ? high : n); + } + + private static Map statsToMap(FeatureStore.Stats s) { + Map out = new LinkedHashMap<>(); + out.put("batch_writes_total", s.batchWritesTotal()); + out.put("streaming_writes_total", s.streamingWritesTotal()); + out.put("reads_total", s.readsTotal()); + out.put("read_fields_total", s.readFieldsTotal()); + return out; + } + + private static Map workerStatsToMap(StreamingWorker.Stats s) { + Map out = new LinkedHashMap<>(); + out.put("running", s.running()); + out.put("paused", s.paused()); + out.put("tick_count", s.tickCount()); + out.put("writes_count", s.writesCount()); + return out; + } + + private static String htmlPage() { + return HTML_TEMPLATE + .replace("__KEY_PREFIX__", store.getKeyPrefix()) + .replace("__STREAM_TTL__", Long.toString(store.getStreamingTtlSeconds())) + .replace("__USERS_PER_TICK__", Integer.toString(worker.getUsersPerTick())) + .replace("__BATCH_FIELDS_JSON__", toJson(FeatureStore.DEFAULT_BATCH_FIELDS)) + .replace("__STREAM_FIELDS_JSON__", toJson(FeatureStore.DEFAULT_STREAMING_FIELDS)); + } + + // --------------------------------------------------------------- + // HTML template (identical to the Jedis demo apart from the pill text) + // --------------------------------------------------------------- + + private static final String HTML_TEMPLATE = """ + + + + + + Redis Feature Store Demo (Lettuce) + + + +
+
Lettuce + JDK com.sun.net.httpserver
+

Redis Feature Store Demo

+

+ A small fraud-scoring feature store. Each user is one Redis hash + at __KEY_PREFIX__{id} with a batch-materialized + batch half (daily aggregates, + 24-hour key-level EXPIRE) and a streaming + streaming half (real-time + signals, __STREAM_TTL__s per-field HEXPIRE). + Inference reads any subset with one HMGET; batch + scoring pipelines HMGET across N users through one + connection-level flush. +

+ +
+
+

Store state

+
Loading...
+
+ +
+

Materialize batch features

+

Calls HSET + EXPIRE for each user + with auto-flush disabled, then one flush ships the whole + batch.

+ + + + +

+ Drop the TTL to e.g. 30 s and watch entities disappear on + schedule — the same thing that happens if a daily refresher + fails. +

+ + +
+ +
+

Streaming worker

+

Picks __USERS_PER_TICK__ users per tick, writes the + streaming features, applies HEXPIRE + __STREAM_TTL__s per field. Pause it and the + streaming fields drop out via per-field TTL while the batch + fields stay populated.

+
+ +
+ +
+

Inference read (HMGET)

+

Pick a user and a feature subset. One HMGET + round trip returns whatever the model needs.

+
+
+ + +
+
+ + +
+
+

Feature subset

+

+ Tick to include in the HMGET. Per-field TTL is + shown next to each field in the result table. +

+
+
+

Pick a user and click Read features.

+
+
+ +
+

Batch scoring

+

Pipelined HMGET across N random users via + Lettuce's connection-level flush. One network round trip for + the whole batch.

+ + + +
+

(no batch read yet)

+
+
+ +
+

Inspect one user

+

HGETALL plus per-field HTTL and + key-level TTL. Useful for spotting which + streaming fields have already expired.

+ + + +
+

(pick a user and click Inspect)

+
+
+
+ +
+
+ + + + +"""; +} diff --git a/content/develop/use-cases/feature-store/java-lettuce/FeatureStore.java b/content/develop/use-cases/feature-store/java-lettuce/FeatureStore.java new file mode 100644 index 0000000000..1d0731ae8c --- /dev/null +++ b/content/develop/use-cases/feature-store/java-lettuce/FeatureStore.java @@ -0,0 +1,529 @@ +import java.time.Duration; +import java.util.ArrayList; +import java.util.Collections; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; + +import io.lettuce.core.KeyScanCursor; +import io.lettuce.core.KeyValue; +import io.lettuce.core.LettuceFutures; +import io.lettuce.core.RedisException; +import io.lettuce.core.RedisFuture; +import io.lettuce.core.ScanArgs; +import io.lettuce.core.ScanCursor; +import io.lettuce.core.api.StatefulRedisConnection; +import io.lettuce.core.api.async.RedisAsyncCommands; + +/** + * Redis online feature store backed by per-entity Hashes (Lettuce). + * + *

Each entity (here, a user) lives at a deterministic key such as + * {@code fs:user:{id}}. The hash holds every feature for that entity + * as one field per feature — batch-materialized aggregates (refreshed + * on a daily cycle) alongside streaming-updated signals (refreshed + * every few seconds). One {@code HMGET} returns whichever subset the + * model needs in one network round trip.

+ * + *

Two TTL layers solve the mixed staleness problem: + *

    + *
  • A key-level {@code EXPIRE} aligned with the batch + * materialization cycle.
  • + *
  • A per-field {@code HEXPIRE} on each streaming field gives + * that field its own shorter expiry, independent of the rest of + * the hash.
  • + *

+ * + *

{@code HEXPIRE} and {@code HTTL} require Redis 7.4 or later. + * Lettuce exposes them as {@code hexpire} / {@code httl} on + * {@code RedisAsyncCommands} from 6.4 onwards; the demo pins + * 7.5.2.RELEASE.

+ * + *

Lettuce vs. Jedis

+ * + *

Unlike Jedis, Lettuce is async-by-default and the + * {@code StatefulRedisConnection} is thread-safe and multiplexed: + * one connection serves every non-pipelined caller in the process + * and there is no per-call pool checkout. Every async call returns + * a {@code RedisFuture} (which is also a {@code CompletionStage}) — + * the helper blocks on it with {@code LettuceFutures.awaitOrCancel} + * where the calling context is synchronous, but the underlying + * writes are pipelined onto the same connection automatically.

+ * + *

For batched writes (bulk-load, streaming-update, + * batch-get-features) the helper switches the connection's + * auto-flush off, queues every command, then flushes once and awaits + * the resulting {@code RedisFuture}s with + * {@link LettuceFutures#awaitAll}. That is Lettuce's canonical + * pipelining idiom.

+ */ +public class FeatureStore { + + public static final List DEFAULT_BATCH_FIELDS = List.of( + "country_iso", + "risk_segment", + "account_age_days", + "tx_count_7d", + "avg_amount_30d", + "chargeback_count_180d" + ); + + public static final List DEFAULT_STREAMING_FIELDS = List.of( + "last_login_ts", + "last_device_id", + "tx_count_5m", + "failed_logins_15m", + "session_country" + ); + + public static final long DEFAULT_BATCH_TTL_SECONDS = 24L * 60L * 60L; + public static final long DEFAULT_STREAMING_TTL_SECONDS = 5L * 60L; + public static final String DEFAULT_KEY_PREFIX = "fs:user:"; + + /** Hard cap on how long we block waiting for any single batched flush. */ + private static final Duration BATCH_TIMEOUT = Duration.ofSeconds(10); + + /** + * Shared connection used for non-pipelined reads (HMGET, HGETALL, + * HTTL, TTL, SCAN, DEL). Safe to use concurrently from many + * threads because Lettuce multiplexes auto-flushed commands. + */ + private final StatefulRedisConnection conn; + private final RedisAsyncCommands async; + + /** + * Dedicated connection used for the pipelined batched paths + * ({@link #bulkLoad}, {@link #updateStreaming}, + * {@link #batchGetFeatures}). These flip + * {@code setAutoFlushCommands(false)} on the connection while + * they queue commands — a *connection-level* state change — so + * they cannot share the read connection without interfering with + * other threads. A single lock serializes pipelined batches on + * this connection; concurrent batches block each other rather + * than corrupting the auto-flush flag. + * + *

If you need true batch concurrency, scale this design to a + * small {@code BoundedAsyncPool<StatefulRedisConnection<String, String>>} + * of pipeline connections and lease one per batch.

+ */ + private final StatefulRedisConnection pipelineConn; + private final RedisAsyncCommands pipelineAsync; + private final Object pipelineLock = new Object(); + + private final String keyPrefix; + private final long batchTtlSeconds; + private final long streamingTtlSeconds; + + private final AtomicLong batchWritesTotal = new AtomicLong(); + private final AtomicLong streamingWritesTotal = new AtomicLong(); + private final AtomicLong readsTotal = new AtomicLong(); + private final AtomicLong readFieldsTotal = new AtomicLong(); + + public FeatureStore(StatefulRedisConnection conn, + StatefulRedisConnection pipelineConn) { + this(conn, pipelineConn, DEFAULT_KEY_PREFIX, + DEFAULT_BATCH_TTL_SECONDS, + DEFAULT_STREAMING_TTL_SECONDS); + } + + public FeatureStore(StatefulRedisConnection conn, + StatefulRedisConnection pipelineConn, + String keyPrefix, + long batchTtlSeconds, + long streamingTtlSeconds) { + this.conn = conn; + this.async = conn.async(); + this.pipelineConn = pipelineConn; + this.pipelineAsync = pipelineConn.async(); + this.keyPrefix = keyPrefix; + this.batchTtlSeconds = batchTtlSeconds; + this.streamingTtlSeconds = streamingTtlSeconds; + } + + public String getKeyPrefix() { return keyPrefix; } + public long getBatchTtlSeconds() { return batchTtlSeconds; } + public long getStreamingTtlSeconds() { return streamingTtlSeconds; } + + public String keyFor(String entityId) { + return keyPrefix + entityId; + } + + // --------------------------------------------------------------- + // Batch ingestion (materialization) + // --------------------------------------------------------------- + + /** + * Materialize a batch of entities into Redis. + * + *

One {@code HSET} plus one {@code EXPIRE} per entity. The + * connection's auto-flush is disabled around the queue so all + * commands ship as a single network frame; the helper then + * blocks on {@link LettuceFutures#awaitAll} so the caller sees + * the batch as a synchronous operation.

+ */ + public int bulkLoad(Map> rows, long ttlSeconds) { + if (rows.isEmpty()) return 0; + + synchronized (pipelineLock) { + List> futures = new ArrayList<>(rows.size() * 2); + pipelineConn.setAutoFlushCommands(false); + try { + for (Map.Entry> e : rows.entrySet()) { + String key = keyFor(e.getKey()); + Map encoded = encode(e.getValue()); + futures.add(pipelineAsync.hset(key, encoded)); + futures.add(pipelineAsync.expire(key, ttlSeconds)); + } + pipelineConn.flushCommands(); + // Await *inside* the auto-flush=false scope so the + // futures resolve before any other code path can flip + // the flag. With the dedicated pipelineConn + lock, + // this is defense in depth; without it (a shared + // connection design) the order matters for correctness. + if (!LettuceFutures.awaitAll(BATCH_TIMEOUT, + futures.toArray(new RedisFuture[0]))) { + throw new IllegalStateException( + "bulkLoad: timed out after " + BATCH_TIMEOUT); + } + } finally { + pipelineConn.setAutoFlushCommands(true); + } + } + batchWritesTotal.addAndGet(rows.size()); + return rows.size(); + } + + public int bulkLoad(Map> rows) { + return bulkLoad(rows, batchTtlSeconds); + } + + /** + * Update a single batch feature without touching the key TTL. + * Used by the demo's "manually refresh one user" lever; real + * pipelines flow through {@link #bulkLoad}. + */ + public void updateBatchFeature(String entityId, String field, Object value) { + awaitOne(async.hset(keyFor(entityId), Map.of(field, encodeValue(value)))); + batchWritesTotal.incrementAndGet(); + } + + // --------------------------------------------------------------- + // Streaming ingestion + // --------------------------------------------------------------- + + /** + * Write streaming features with a per-field TTL. + * + *

{@code HSET} and {@code HEXPIRE} are queued on the same + * connection-level flush so they hit Redis in pipeline order: + * the {@code HSET} runs first, then {@code HEXPIRE} attaches a + * TTL to each field that was just written.

+ * + *

{@code HEXPIRE} returns one status code per field: + *

    + *
  • {@code 1}: TTL set / updated
  • + *
  • {@code 2}: the expiry was 0 or in the past, so Redis + * deleted the field instead of applying a TTL
  • + *
  • {@code 0}: an {@code NX | XX | GT | LT} conditional flag + * was specified and not met (we never use one here)
  • + *
  • {@code -2}: no such field, or no such key
  • + *
+ * We just {@code HSET} every field on the same call, so any code + * other than {@code 1} means the per-field TTL invariant did + * not hold — fail loudly rather than silently leave a streaming + * field with no expiry.

+ */ + public void updateStreaming(String entityId, Map fields, long ttlSeconds) { + if (fields.isEmpty()) return; + String key = keyFor(entityId); + Map encoded = encode(fields); + String[] names = encoded.keySet().toArray(new String[0]); + + List codes; + synchronized (pipelineLock) { + pipelineConn.setAutoFlushCommands(false); + try { + RedisFuture hsetFut = pipelineAsync.hset(key, encoded); + RedisFuture> hexpireFut = + pipelineAsync.hexpire(key, ttlSeconds, names); + pipelineConn.flushCommands(); + // Resolve both futures while auto-flush is still off, + // so nothing else on this connection can run between + // the queue and the wait. + awaitOne(hsetFut); + codes = awaitOne(hexpireFut); + } finally { + pipelineConn.setAutoFlushCommands(true); + } + } + for (Long code : codes) { + if (code == null || code != 1L) { + throw new IllegalStateException( + "HEXPIRE did not set every field TTL for " + key + ": " + codes); + } + } + streamingWritesTotal.addAndGet(fields.size()); + } + + public void updateStreaming(String entityId, Map fields) { + updateStreaming(entityId, fields, streamingTtlSeconds); + } + + // --------------------------------------------------------------- + // Inference reads + // --------------------------------------------------------------- + + /** + * Retrieve a subset of features for one entity. Pass + * {@code fieldNames=null} (or call {@link #getAllFeatures}) to + * fetch the full hash with {@code HGETALL} — useful for debugging + * but rarely the right call on the request path, where the model + * knows exactly which features it consumes. 
+ */ + public Map getFeatures(String entityId, List fieldNames) { + String key = keyFor(entityId); + Map out = new LinkedHashMap<>(); + if (fieldNames == null) { + Map all = awaitOne(async.hgetall(key)); + if (all != null) out.putAll(all); + readsTotal.incrementAndGet(); + readFieldsTotal.addAndGet(out.size()); + return out; + } + if (fieldNames.isEmpty()) return out; + List> values = awaitOne( + async.hmget(key, fieldNames.toArray(new String[0]))); + for (KeyValue kv : values) { + if (kv != null && kv.hasValue()) { + out.put(kv.getKey(), kv.getValue()); + } + } + readsTotal.incrementAndGet(); + readFieldsTotal.addAndGet(out.size()); + return out; + } + + public Map getAllFeatures(String entityId) { + return getFeatures(entityId, null); + } + + /** + * Pipeline {@code HMGET} across many entities for batch scoring. + * One round trip for the whole batch via the connection-level + * flush. + */ + public Map> batchGetFeatures( + List entityIds, List fieldNames) { + if (entityIds.isEmpty() || fieldNames.isEmpty()) { + return Collections.emptyMap(); + } + String[] names = fieldNames.toArray(new String[0]); + + Map> out = new LinkedHashMap<>(); + long seenFields = 0; + synchronized (pipelineLock) { + List>>> futures = + new ArrayList<>(entityIds.size()); + pipelineConn.setAutoFlushCommands(false); + try { + for (String id : entityIds) { + futures.add(pipelineAsync.hmget(keyFor(id), names)); + } + pipelineConn.flushCommands(); + // Resolve every future inside the auto-flush=false + // scope; restoring auto-flush before the awaits would + // be merely cosmetic on a dedicated connection, but + // is genuinely unsafe if someone reuses this method's + // pattern against a shared connection later. 
+ for (int i = 0; i < entityIds.size(); i++) { + List> values = awaitOne(futures.get(i)); + Map row = new LinkedHashMap<>(); + for (KeyValue kv : values) { + if (kv != null && kv.hasValue()) { + row.put(kv.getKey(), kv.getValue()); + seenFields++; + } + } + out.put(entityIds.get(i), row); + } + } finally { + pipelineConn.setAutoFlushCommands(true); + } + } + readsTotal.addAndGet(entityIds.size()); + readFieldsTotal.addAndGet(seenFields); + return out; + } + + // --------------------------------------------------------------- + // TTL inspection (used by the demo UI) + // --------------------------------------------------------------- + + /** + * Seconds until the entity key expires. Returns {@code -1} if no + * key-level TTL is set, {@code -2} if the key doesn't exist. + */ + public long keyTtlSeconds(String entityId) { + return awaitOne(async.ttl(keyFor(entityId))); + } + + /** + * Per-field TTL via {@code HTTL} (Redis 7.4+). Each value mirrors + * the {@code TTL} convention: positive means seconds remaining, + * {@code -1} means the field has no TTL set, {@code -2} means + * the field doesn't exist on this hash (or the key itself is + * missing). + */ + public Map fieldTtlsSeconds(String entityId, List fieldNames) { + if (fieldNames.isEmpty()) return Collections.emptyMap(); + List codes = awaitOne( + async.httl(keyFor(entityId), fieldNames.toArray(new String[0]))); + Map out = new LinkedHashMap<>(); + for (int i = 0; i < fieldNames.size(); i++) { + // HTTL on a missing key returns a flat list of -2s. Coerce + // any unexpected nulls defensively in case a future + // Lettuce release tweaks the reply shape. + Long c = i < codes.size() ? codes.get(i) : null; + out.put(fieldNames.get(i), c == null ? -2L : c); + } + return out; + } + + // --------------------------------------------------------------- + // Demo housekeeping + // --------------------------------------------------------------- + + /** + * Enumerate entity IDs by scanning {@code keyPrefix*}. 
{@code SCAN} + * is non-blocking; the demo uses it to populate UI dropdowns, not + * as a serving primitive. + */ + public List listEntityIds(int limit) { + List ids = new ArrayList<>(); + ScanCursor cursor = ScanCursor.INITIAL; + ScanArgs args = ScanArgs.Builder.matches(keyPrefix + "*").limit(200); + while (true) { + KeyScanCursor sr = awaitOne(async.scan(cursor, args)); + for (String k : sr.getKeys()) { + if (k.length() > keyPrefix.length()) { + ids.add(k.substring(keyPrefix.length())); + if (ids.size() >= limit) { + Collections.sort(ids); + return ids; + } + } + } + if (sr.isFinished()) break; + cursor = ScanCursor.of(sr.getCursor()); + } + Collections.sort(ids); + return ids; + } + + /** + * Count entities under the key prefix without an in-memory cap so + * the UI can report the real total even when more keys exist than + * the dropdown lists. + */ + public long countEntities() { + long count = 0; + ScanCursor cursor = ScanCursor.INITIAL; + ScanArgs args = ScanArgs.Builder.matches(keyPrefix + "*").limit(500); + while (true) { + KeyScanCursor sr = awaitOne(async.scan(cursor, args)); + count += sr.getKeys().size(); + if (sr.isFinished()) break; + cursor = ScanCursor.of(sr.getCursor()); + } + return count; + } + + public long deleteEntity(String entityId) { + return awaitOne(async.del(keyFor(entityId))); + } + + /** + * Drop every entity under the key prefix. Used by the demo reset + * path. Scans in batches and issues one variadic {@code DEL} per + * batch. 
+ */ + public long reset() { + long deleted = 0; + ScanCursor cursor = ScanCursor.INITIAL; + ScanArgs args = ScanArgs.Builder.matches(keyPrefix + "*").limit(500); + while (true) { + KeyScanCursor sr = awaitOne(async.scan(cursor, args)); + List batch = sr.getKeys(); + if (!batch.isEmpty()) { + deleted += awaitOne(async.del(batch.toArray(new String[0]))); + } + if (sr.isFinished()) break; + cursor = ScanCursor.of(sr.getCursor()); + } + return deleted; + } + + public Stats stats() { + return new Stats( + batchWritesTotal.get(), + streamingWritesTotal.get(), + readsTotal.get(), + readFieldsTotal.get() + ); + } + + public void resetStats() { + batchWritesTotal.set(0); + streamingWritesTotal.set(0); + readsTotal.set(0); + readFieldsTotal.set(0); + } + + // --------------------------------------------------------------- + // Helpers + // --------------------------------------------------------------- + + /** + * Block until a single {@link RedisFuture} resolves, propagating + * the underlying Redis exception as an unchecked one. Lettuce's + * {@code LettuceFutures.awaitOrCancel} is the idiomatic single- + * future blocking helper: it already unwraps execution exceptions + * into Lettuce's own {@code RedisException} hierarchy, cancels + * the future on timeout, and restores the thread interrupt flag. + */ + private static T awaitOne(RedisFuture future) { + try { + return LettuceFutures.awaitOrCancel( + future, BATCH_TIMEOUT.toSeconds(), TimeUnit.SECONDS); + } catch (RedisException e) { + // Already unchecked; surface as-is so call sites can + // catch on the canonical Lettuce hierarchy. + throw e; + } + } + + private static Map encode(Map fields) { + Map out = new LinkedHashMap<>(fields.size()); + for (Map.Entry e : fields.entrySet()) { + out.put(e.getKey(), encodeValue(e.getValue())); + } + return out; + } + + /** Render a feature value as a string for hash storage. 
*/ + public static String encodeValue(Object value) { + if (value == null) return ""; + if (value instanceof Boolean b) return b ? "true" : "false"; + return value.toString(); + } + + /** Immutable snapshot of the helper's in-process counters. */ + public static record Stats( + long batchWritesTotal, + long streamingWritesTotal, + long readsTotal, + long readFieldsTotal + ) {} +} diff --git a/content/develop/use-cases/feature-store/java-lettuce/StreamingWorker.java b/content/develop/use-cases/feature-store/java-lettuce/StreamingWorker.java new file mode 100644 index 0000000000..fff3ed5a0c --- /dev/null +++ b/content/develop/use-cases/feature-store/java-lettuce/StreamingWorker.java @@ -0,0 +1,235 @@ +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicLong; + +/** + * Streaming feature updater for the demo. + * + *

Stands in for whatever Flink, Kafka Streams, or bespoke service + * computes the real-time features in a real deployment. In production + * this code lives in the streaming layer; here it runs as a daemon + * Thread next to the demo server so the page can start, pause, and + * resume it from the UI.

+ * + *

Identical control surface to the Jedis demo (start / stop / pause / + * resume / waitForIdle). The only difference is that + * {@code FeatureStore.updateStreaming} pipelines its {@code HSET} + + * {@code HEXPIRE} through Lettuce's connection-level auto-flush + * mechanism rather than through a Jedis {@code Pipeline}.

+ */ +public class StreamingWorker { + + private static final List DEVICE_IDS = List.of( + "ios-1a4c", "ios-9f02", "and-7b21", "and-2d18", + "web-chr-1", "web-saf-1", "web-ff-2"); + private static final List SESSION_COUNTRIES = List.of( + "US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL"); + private static final int[] FAILED_LOGIN_BUCKETS = {0, 1, 2, 5}; + private static final int[] FAILED_LOGIN_WEIGHTS = {70, 20, 8, 2}; + + private final FeatureStore store; + private final long tickMillis; + private final int usersPerTick; + private final Random rng; + + private final Object rngLock = new Object(); + private final AtomicBoolean running = new AtomicBoolean(false); + private final AtomicBoolean paused = new AtomicBoolean(false); + private final AtomicBoolean tickInFlight = new AtomicBoolean(false); + private final AtomicLong tickCount = new AtomicLong(); + private final AtomicLong writesCount = new AtomicLong(); + + private Thread worker; + + public StreamingWorker(FeatureStore store, long tickMillis, int usersPerTick, long seed) { + this.store = store; + this.tickMillis = tickMillis > 0 ? tickMillis : 1000L; + this.usersPerTick = usersPerTick > 0 ? 
usersPerTick : 5; + this.rng = new Random(seed); + } + + public int getUsersPerTick() { return usersPerTick; } + + // --------------------------------------------------------------- + // Lifecycle + // --------------------------------------------------------------- + + public synchronized void start() { + if (running.get()) return; + running.set(true); + paused.set(false); + worker = new Thread(this::run, "streaming-worker"); + worker.setDaemon(true); + worker.start(); + } + + public synchronized void stop() { + if (!running.getAndSet(false)) return; + if (worker != null) { + try { + worker.join(2000); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + } + worker = null; + } + waitForIdle(); + } + + public void pause() { paused.set(true); } + public void resume() { paused.set(false); } + + public boolean isRunning() { return running.get(); } + public boolean isPaused() { return paused.get(); } + + /** + * Block until any in-flight tick has finished its current + * updateStreaming loop. {@link #pause()} only stops future + * ticks; this is what callers (a reset that's about to DEL every + * entity, for example) use to flush a mid-flight tick before they + * touch state the tick might still be writing to. + * + *

Reset cannot safely proceed while a tick is mid-write, so + * an interrupt during the wait must NOT short-circuit out with + * tickInFlight still true. The method captures the interrupt + * status, keeps polling, and restores it before returning.

+ */ + public void waitForIdle() { + boolean interrupted = false; + while (tickInFlight.get()) { + try { + Thread.sleep(20); + } catch (InterruptedException e) { + interrupted = true; + } + } + if (interrupted) { + Thread.currentThread().interrupt(); + } + } + + // --------------------------------------------------------------- + // Tick + // --------------------------------------------------------------- + + private void run() { + try { + while (running.get()) { + try { + Thread.sleep(tickMillis); + } catch (InterruptedException e) { + Thread.currentThread().interrupt(); + break; + } + if (!running.get()) break; + + // Set tickInFlight *before* the pause check so a + // concurrent pause()+waitForIdle() can never observe + // tickInFlight=false in the window between the pause + // check and the actual doTick call. The finally + // block clears the flag whether we paused, succeeded, + // or threw. + tickInFlight.set(true); + try { + if (!paused.get()) { + doTick(); + } + } catch (Exception e) { + System.err.printf("[streaming-worker] tick failed: %s%n", e.getMessage()); + } finally { + tickInFlight.set(false); + } + } + } finally { + // Clear running and tickInFlight no matter how the thread + // exits so a later start() can spin a fresh thread. 
+ running.set(false); + tickInFlight.set(false); + } + } + + private void doTick() { + List ids = store.listEntityIds(500); + if (ids.isEmpty()) return; + List picks = sample(ids, usersPerTick); + long nowMs = System.currentTimeMillis(); + int writes = 0; + for (String id : picks) { + Map fields = new LinkedHashMap<>(); + fields.put("last_login_ts", nowMs); + fields.put("last_device_id", choice(DEVICE_IDS)); + fields.put("tx_count_5m", intn(13)); + fields.put("failed_logins_15m", weightedInt(FAILED_LOGIN_BUCKETS, FAILED_LOGIN_WEIGHTS)); + fields.put("session_country", choice(SESSION_COUNTRIES)); + store.updateStreaming(id, fields); + writes += fields.size(); + } + tickCount.incrementAndGet(); + writesCount.addAndGet(writes); + } + + // --------------------------------------------------------------- + // Stats + // --------------------------------------------------------------- + + public Stats statsSnapshot() { + return new Stats(isRunning(), isPaused(), tickCount.get(), writesCount.get()); + } + + public void resetStats() { + tickCount.set(0); + writesCount.set(0); + } + + public static record Stats( + boolean running, + boolean paused, + long tickCount, + long writesCount + ) {} + + // --------------------------------------------------------------- + // RNG helpers + // --------------------------------------------------------------- + + private List sample(List items, int k) { + synchronized (rngLock) { + int n = Math.min(k, items.size()); + List pool = new java.util.ArrayList<>(items); + List out = new java.util.ArrayList<>(n); + for (int i = 0; i < n; i++) { + int idx = rng.nextInt(pool.size()); + out.add(pool.remove(idx)); + } + return out; + } + } + + private String choice(List items) { + synchronized (rngLock) { + return items.get(rng.nextInt(items.size())); + } + } + + private int intn(int n) { + synchronized (rngLock) { + return rng.nextInt(n); + } + } + + private int weightedInt(int[] items, int[] weights) { + synchronized (rngLock) { + int total = 0; + 
for (int w : weights) total += w; + int r = rng.nextInt(total); + for (int i = 0; i < items.length; i++) { + r -= weights[i]; + if (r < 0) return items[i]; + } + return items[items.length - 1]; + } + } +} diff --git a/content/develop/use-cases/feature-store/java-lettuce/_index.md b/content/develop/use-cases/feature-store/java-lettuce/_index.md new file mode 100644 index 0000000000..e5ac38fc26 --- /dev/null +++ b/content/develop/use-cases/feature-store/java-lettuce/_index.md @@ -0,0 +1,657 @@ +--- +categories: +- docs +- develop +- stack +- oss +- rs +- rc +description: Build a Redis-backed online feature store in Java with Lettuce +linkTitle: Lettuce example (Java) +title: Redis feature store with Lettuce +weight: 5 +--- + +This guide shows you how to build a small Redis-backed online feature store in +Java with [Lettuce]({{< relref "/develop/clients/lettuce" >}}), the +async-by-default Netty-based Redis client. The demo runs on top of the JDK's +`com.sun.net.httpserver.HttpServer` so you can bulk-load a batch of users +with a key-level TTL, run a streaming worker that overwrites real-time +features with per-field TTL, retrieve any subset of features for one user +under 2 ms, and pipeline `HMGET` across a hundred users for batch scoring. + +The [Jedis walkthrough]({{< relref "/develop/use-cases/feature-store/java-jedis" >}}) +covers the same flow with a synchronous, pool-borrowing client. This page +focuses on what's different in Lettuce — the multiplexed connection, the +`RedisAsyncCommands` surface, and the auto-flush pipelining model — rather +than re-explaining the shared concepts. + +## Overview + +Each entity (here, a user) is one Redis +[Hash]({{< relref "/develop/data-types/hashes" >}}) at a deterministic key — +`fs:user:{id}`. The hash holds every feature for that entity as one field per +feature: batch-materialized aggregates (refreshed once a day) alongside +streaming-updated signals (refreshed every few seconds). 
One +[`HMGET`]({{< relref "/commands/hmget" >}}) returns whichever subset the model +needs in one network round trip. + +Two TTL layers solve the *mixed staleness* problem without an application-side +cleaner: + +* A **key-level** [`EXPIRE`]({{< relref "/commands/expire" >}}) aligned with the + batch materialization cycle (24 hours in the demo). If the batch refresher + fails, the whole entity disappears at the next cycle and inference sees a + missing entity — which the model handler can detect and fall back on — + rather than silently outdated values. +* A **per-field** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) (Redis 7.4+) on + each streaming feature gives that field its own shorter expiry, independent + of the rest of the hash. If the streaming pipeline stops updating a feature, + the field self-cleans while the batch fields stay populated. + +That gives you: + +* A single round trip for retrieval — any subset of features for one entity + in one [`HMGET`]({{< relref "/commands/hmget" >}}). +* Sub-millisecond hot path. The Redis-side work is microseconds; in practice + the bottleneck is the network round trip plus the model's own feature-prep. +* Pipelined batch scoring — one round trip for `N` users at once. +* Independent freshness per feature, expressed as a server-side TTL rather + than as application logic. +* Self-cleanup on pipeline failure: a stalled batch refresher lets entities + expire on schedule, and a stalled streaming worker lets each affected field + expire on its own timer. + +## How Lettuce differs from Jedis + +The big mental-model difference for someone arriving from Jedis: + +* **One shared, multiplexed connection.** A `StatefulRedisConnection` + is thread-safe and serves the whole process. There's no `JedisPool`-style + per-call borrow — every handler in the HTTP thread pool *and* the + streaming worker share the same connection, and Netty handles the + serialization onto the underlying socket. 
+* **Async-by-default API.** Every method on `RedisAsyncCommands` + returns a `RedisFuture` (which is a `CompletionStage` and a + `Future`). For synchronous code paths the helper blocks with `.get()`; + for reactive pipelines you'd compose with `.thenApply()` / + `.thenCompose()` or use the `.reactive()` API directly. +* **Pipelining via connection-level auto-flush.** Lettuce doesn't have a + `pipelined()`-style builder. Instead, you toggle + `conn.setAutoFlushCommands(false)` on the connection, queue commands as + normal async calls (each returns its own `RedisFuture`), call + `conn.flushCommands()` to ship the batch, and toggle auto-flush back on. + `LettuceFutures.awaitAll(...)` waits for all the futures to resolve. + +In short: reach for **Lettuce** when you need async/reactive composition +or you're already in a reactive stack (Spring WebFlux, Project Reactor); +reach for **Jedis** when blocking commands are common or you want a +simple sync API with explicit per-call connection lifetime. The +[Lettuce]({{< relref "/develop/clients/lettuce" >}}) and +[Jedis]({{< relref "/develop/clients/jedis" >}}) client guides cover the +deeper selection criteria. + +In this example, the batch features describe a user's longer-term shape and +are bulk-loaded by `BuildFeatures.java`. The streaming features describe +what the user is doing right now and are written by `StreamingWorker.java` +on a daemon thread. The inference handlers of the demo server read any +subset of those features through `FeatureStore.java`'s helper class. All +four sources share one `StatefulRedisConnection` opened in `DemoServer.java`. 
+ +## The feature-store helper + +The `FeatureStore` class wraps the read/write paths +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/java-lettuce/FeatureStore.java)): + +```java +import io.lettuce.core.RedisClient; +import io.lettuce.core.api.StatefulRedisConnection; + +RedisClient client = RedisClient.create("redis://localhost:6379"); +try (StatefulRedisConnection conn = client.connect()) { + FeatureStore store = new FeatureStore(conn, + "fs:user:", + 24L * 60L * 60L, // whole-entity TTL aligned with the daily batch cycle + 5L * 60L // per-field TTL on each streaming feature + ); + + // Batch materialization: one HSET + EXPIRE per user, all pipelined + // through a single connection-level flush. + Map> rows = Map.of( + "u0001", Map.of( + "country_iso", "US", "risk_segment", "low", + "tx_count_7d", 14, "avg_amount_30d", 92.40, + "account_age_days", 612, "chargeback_count_180d", 0)); + store.bulkLoad(rows); + + // Streaming write: HSET + HEXPIRE on just the fields that changed. + store.updateStreaming("u0001", Map.of( + "last_login_ts", System.currentTimeMillis(), + "last_device_id", "ios-9f02", + "tx_count_5m", 3, + "failed_logins_15m", 0, + "session_country", "US")); + + // Inference read: HMGET of whatever the model needs. + Map features = store.getFeatures("u0001", List.of( + "risk_segment", "tx_count_7d", "avg_amount_30d", + "tx_count_5m", "failed_logins_15m")); + + // Batch scoring: pipelined HMGET across many users. + Map> batch = store.batchGetFeatures( + List.of("u0001", "u0002", "u0003"), + List.of("risk_segment", "tx_count_5m", "failed_logins_15m")); +} finally { + client.shutdown(); +} +``` + +### Data model + +Each user is one Redis Hash. Every value is stored as a string — Redis hash +fields are bytes on the wire, so the helper encodes booleans as `"true"` / +`"false"` (`encodeValue(Object)` in `FeatureStore.java`) and renders +everything else with `Object.toString()`. 
The model server is responsible +for parsing back to the right type, the same way it would when reading any +serialized feature store. + +```text +fs:user:u0001 TTL = 86400 s (key-level) + country_iso=US + risk_segment=low + account_age_days=612 + tx_count_7d=14 + avg_amount_30d=92.40 + chargeback_count_180d=0 + last_login_ts=1716998413541 TTL = 300 s (per field, HEXPIRE) + last_device_id=ios-9f02 TTL = 300 s (per field, HEXPIRE) + tx_count_5m=3 TTL = 300 s (per field, HEXPIRE) + failed_logins_15m=0 TTL = 300 s (per field, HEXPIRE) + session_country=US TTL = 300 s (per field, HEXPIRE) +``` + +### Bulk-loading batch features + +`bulkLoad` queues one `HSET` and one `EXPIRE` per user with auto-flush +disabled, flushes once, and waits for every `RedisFuture` to resolve. + +```java +public int bulkLoad(Map> rows, long ttlSeconds) { + if (rows.isEmpty()) return 0; + + List> futures = new ArrayList<>(rows.size() * 2); + conn.setAutoFlushCommands(false); + try { + for (Map.Entry> e : rows.entrySet()) { + String key = keyFor(e.getKey()); + Map encoded = encode(e.getValue()); + futures.add(async.hset(key, encoded)); + futures.add(async.expire(key, ttlSeconds)); + } + conn.flushCommands(); + } finally { + conn.setAutoFlushCommands(true); + } + if (!LettuceFutures.awaitAll(BATCH_TIMEOUT, futures.toArray(new RedisFuture[0]))) { + throw new IllegalStateException("bulkLoad: timed out after " + BATCH_TIMEOUT); + } + ... +} +``` + +The two important things to notice: + +1. **`setAutoFlushCommands(false)` is on the connection, not the async + commands.** It affects *every* call going through that + `StatefulRedisConnection` until it's flipped back. The `finally` block + restores auto-flush even if a queue step throws — failing to do so would + silently break every subsequent command in the JVM. +2. **`LettuceFutures.awaitAll` blocks with a timeout.** With auto-flush off, + queued commands can sit in the local pipeline buffer indefinitely if + something below the flush goes wrong. 
The timeout gives `bulkLoad` a + clean failure mode rather than hanging forever. + +In production, the equivalent of this script runs as an offline pipeline (a +Spark or Feast `materialize` job) that reads from the warehouse and writes +into Redis. The +[Feast `RedisOnlineStore`](https://docs.feast.dev/reference/online-stores/redis) +provider does exactly this under the hood; the in-house +[Redis Feature Form]({{< relref "/develop/ai/featureform" >}}) integration +covers the materialize + serve path end-to-end. + +### Streaming writes with per-field TTL + +`updateStreaming` is the linchpin of the mixed-staleness story: + +```java +public void updateStreaming(String entityId, Map fields, long ttlSeconds) { + if (fields.isEmpty()) return; + String key = keyFor(entityId); + Map encoded = encode(fields); + String[] names = encoded.keySet().toArray(new String[0]); + + RedisFuture hsetFut; + RedisFuture> hexpireFut; + conn.setAutoFlushCommands(false); + try { + hsetFut = async.hset(key, encoded); + hexpireFut = async.hexpire(key, ttlSeconds, names); + conn.flushCommands(); + } finally { + conn.setAutoFlushCommands(true); + } + awaitOne(hsetFut); + List codes = awaitOne(hexpireFut); + for (Long code : codes) { + if (code == null || code != 1L) { + throw new IllegalStateException( + "HEXPIRE did not set every field TTL for " + key + ": " + codes); + } + } + ... +} +``` + +[`HEXPIRE`]({{< relref "/commands/hexpire" >}}) sets the TTL on *individual* +hash fields, not on the whole key. The two commands are queued under one +flush so Redis runs them in pipeline order: the `HSET` first creates or +overwrites the fields, then `HEXPIRE` attaches a TTL to each of those same +fields. `HEXPIRE` returns one status code per field — `1` if the TTL was +set, `2` if the expiry was 0 or in the past (so Redis deleted the field +instead), `0` if an `NX | XX | GT | LT` conditional flag was set and not +met (we never use one here), `-2` if the field doesn't exist on the key. 
+The helper throws if any code is anything other than `1`, so the "every +streaming write renews its TTL" invariant fails loudly rather than silently +leaving a streaming field with no expiry attached. + +If a streaming pipeline stops, the streaming fields drop out one by one as +their per-field TTLs elapse. [`HTTL`]({{< relref "/commands/httl" >}}) lets +the model side inspect the remaining TTL on any field, which is useful both +for debugging and as a freshness signal in the model itself. + +> **HEXPIRE requires Redis 7.4 or later.** `HEXPIRE` and the field-level TTL +> commands were added in Redis 7.4. Lettuce 6.4 was the first release with +> the bindings; the demo's `pom.xml` pins 7.5.2.RELEASE. + +### Inference reads with HMGET + +`getFeatures` is one `HMGET`: + +```java +public Map<String, String> getFeatures(String entityId, List<String> fieldNames) { + String key = keyFor(entityId); + Map<String, String> out = new LinkedHashMap<>(); + if (fieldNames == null) { + Map<String, String> all = awaitOne(async.hgetall(key)); + if (all != null) out.putAll(all); + return out; + } + if (fieldNames.isEmpty()) return out; + List<KeyValue<String, String>> values = awaitOne( + async.hmget(key, fieldNames.toArray(new String[0]))); + for (KeyValue<String, String> kv : values) { + if (kv != null && kv.hasValue()) { + out.put(kv.getKey(), kv.getValue()); + } + } + return out; +} +``` + +Lettuce's `hmget` returns `List<KeyValue<String, String>>` rather than a parallel +`List<String>` like Jedis. `KeyValue` is Lettuce's `Optional`-like wrapper: +`kv.hasValue()` tells you whether Redis returned a value or a nil for that +field, and `kv.getValue()` unwraps it. The helper drops `hasValue()==false` +entries so the caller's `Map<String, String>` only contains fields that +actually exist on the hash.
+ +### Batch scoring with pipelined HMGET + +The same connection-level flush pattern carries over to batch reads: + +```java +public Map<String, Map<String, String>> batchGetFeatures( + List<String> entityIds, List<String> fieldNames) { + if (entityIds.isEmpty() || fieldNames.isEmpty()) { + return Collections.emptyMap(); + } + String[] names = fieldNames.toArray(new String[0]); + + List<RedisFuture<List<KeyValue<String, String>>>> futures = + new ArrayList<>(entityIds.size()); + conn.setAutoFlushCommands(false); + try { + for (String id : entityIds) { + futures.add(async.hmget(keyFor(id), names)); + } + conn.flushCommands(); + } finally { + conn.setAutoFlushCommands(true); + } + + Map<String, Map<String, String>> out = new LinkedHashMap<>(); + for (int i = 0; i < entityIds.size(); i++) { + List<KeyValue<String, String>> values = awaitOne(futures.get(i)); + Map<String, String> row = new LinkedHashMap<>(); + for (KeyValue<String, String> kv : values) { + if (kv != null && kv.hasValue()) row.put(kv.getKey(), kv.getValue()); + } + out.put(entityIds.get(i), row); + } + return out; +} +``` + +One round trip for the whole batch. The first call after server startup +includes a few milliseconds of Netty event-loop and connection warm-up; +steady-state, the demo returns a 100-user batch in 2-5 ms against a local +Redis. + +A Redis Cluster is different: a single auto-flush batch is bound to one +shard, because all the queued commands ship through one connection to one +node. For batch reads on a cluster, use +[`RedisClusterClient`]({{< relref "/develop/clients/lettuce" >}}) — its +`StatefulRedisClusterConnection` exposes `getConnection(nodeId)` for +per-shard auto-flush batching, and the high-level `RedisAdvancedClusterAsyncCommands` +fans out non-pipelined calls per shard automatically. + +A hash tag like `fs:user:{vip}:u0001` forces a known set of keys onto the +same shard so one auto-flush batch can cover all of them in a single round +trip.
+ +## The streaming worker + +`StreamingWorker.java` is the demo's stand-in for whatever Flink, Kafka +Streams, or bespoke service computes the real-time features +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/java-lettuce/StreamingWorker.java)). +It runs as a daemon `Thread` next to the demo server so the UI can start, +pause, and resume it; in production this code would live in the streaming +layer. + +The lifecycle (start / stop / pause / resume / waitForIdle) is identical to +the Jedis demo — the worker thread itself doesn't care which client it's +talking to, only that `FeatureStore.updateStreaming` pipelines the +`HSET` + `HEXPIRE` in order within one flush. The Lettuce helper achieves that through +the connection-level flush described above. + +Pausing the worker is what shows off the mixed-staleness behavior: leave +it paused for longer than `streamingTtlSeconds` and the streaming fields +disappear from every user's hash one by one, while the batch fields remain +under the longer key-level `EXPIRE`. The demo's `Pause / resume` button +lets you see this happen in real time. + +`pause()` only blocks *future* ticks from running. A reset that's about to +`DEL` every key also needs to wait out an already-running tick, which is +what `waitForIdle()` is for. The demo's `Reset` handler calls +`worker.pause()` *and* `worker.waitForIdle()` before it issues the `DEL` +sweep, so a mid-flight tick can't recreate a user under a streaming-only +hash with no key-level TTL. + +## The batch builder + +`BuildFeatures.java` is the demo's nightly materializer +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/java-lettuce/BuildFeatures.java)). +It generates synthetic feature rows and calls `store.bulkLoad` once. 
The +synthesis itself is not the point — in a real deployment the equivalent +code reads from the offline store (Snowflake, BigQuery, Iceberg) and writes +the resulting hashes into Redis. + +Run the builder on its own (independently of the demo server) to populate +Redis from the command line: + +```bash +mvn exec:java -Dexec.mainClass=BuildFeatures -Dexec.args="--count 500 --ttl-seconds 3600" +``` + +That writes 500 users at `fs:user:*` with a one-hour key-level TTL, which +is how a typical operator would pre-seed a feature store from the command +line when debugging. + +## The interactive demo + +`DemoServer.java` runs the JDK `HttpServer` on port 8089 with a fixed +thread pool. The HTML page lets you: + +* **Bulk-load** any number of users (default 200) with a configurable + key-level TTL. +* See the **store state**: user count, batch / streaming TTLs, cumulative + read/write counters. +* See the **streaming worker** status and **pause or resume** it. +* Run an **inference read** for any user with a chosen feature subset, and + see the value, the per-field TTL, and the read latency. +* Run **batch scoring** with a pipelined `HMGET` across `N` users. +* **Inspect** any user's full hash with field-level TTLs and the key-level + TTL. + +The server holds one `FeatureStore`, one `StreamingWorker`, one +`RedisClient`, and one `StatefulRedisConnection` for the lifetime of the +process. Every HTTP handler and the streaming worker share that single +connection — Lettuce multiplexes the commands across them automatically. +Endpoints: + +| Endpoint | What it does | +|---------------------------|-------------------------------------------------------------------------------------| +| `GET /state` | User count, TTL config, stats counters, worker status. | +| `POST /bulk-load` | Auto-flush batched `HSET` + `EXPIRE` over N synthetic users with a chosen TTL. | +| `POST /worker/toggle` | Pause / resume the streaming worker. 
| +| `POST /read` | `HMGET` a chosen feature subset for one user; report latency and per-field TTLs. | +| `POST /batch-read` | Pipeline `HMGET` across N users; report total latency and per-entity field counts. | +| `GET /inspect` | `HGETALL` + `HTTL` for one user; full hash view with per-field TTLs. | +| `POST /reset` | Drop every user under the key prefix (used by the demo's reset button). | + +## Prerequisites + +* **Redis 7.4 or later.** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) were added in Redis 7.4; the + demo relies on per-field TTL for the mixed-staleness story. +* **Java 17 or later.** The demo uses switch expressions with arrow labels + (`case "..." -> ...`), records, and text blocks. +* **Lettuce 6.4 or later.** The demo's `pom.xml` pins 7.5.2.RELEASE. + Field-level TTL bindings (`hexpire`, `httl`, `hpersist`) ship from + Lettuce 6.4. + +If your Redis server is running elsewhere, start the demo with +`--redis-uri redis://host:port`. + +## Running the demo + +### Get the source files + +The demo lives in a small Maven project under +[`feature-store/java-lettuce`](https://github.com/redis/docs/tree/main/content/develop/use-cases/feature-store/java-lettuce). +Clone the repo or copy the directory: + +```bash +git clone https://github.com/redis/docs.git +cd docs/content/develop/use-cases/feature-store/java-lettuce +mvn package +``` + +### Start the demo server + +From the project directory: + +```bash +mvn exec:java -Dexec.mainClass=DemoServer +``` + +You should see: + +```text +Dropping any existing users under 'fs:user:*' for a clean demo run (pass --no-reset to keep them). +Redis feature-store demo server listening on http://127.0.0.1:8089 +Using Redis at redis://localhost:6379 with key prefix 'fs:user:' (batch TTL 86400s, streaming TTL 300s) +Materialized 200 user(s); streaming worker running. +``` + +Open [http://127.0.0.1:8089](http://127.0.0.1:8089). 
The first inference +read after startup is a few milliseconds slower than the rest because +Lettuce / Netty are warming up the event loop and the underlying socket; +subsequent reads settle into 1-2 ms on a local Redis. + +Useful things to try: + +* Pick a user and click **Read features** with a mixed batch/streaming + subset — you'll see batch fields with no per-field TTL (covered by the + key-level TTL) and streaming fields with a positive per-field TTL. +* Click **Pipeline HMGET** with `count=100` to see the latency of a + 100-user batch read. +* Click **Pause / resume** on the streaming worker and leave it paused for + ~5 minutes (or restart the server with `--streaming-ttl-seconds 30` to + make it visible in seconds). Re-run **Read features** on any user and + watch the streaming fields disappear while the batch fields stay. +* Click **Inspect** on a user to see the full hash with field-level TTLs. +* Click **Reset** to drop every user and start over. + +The server is read/write against your local Redis. The default key prefix +is `fs:user:`. Pass `--no-reset` to keep existing data across restarts, or +`--redis-uri` to point at a different Redis. + +## Production usage + +The guidance below focuses on the production concerns that are specific to +running a feature store on Redis. For the generic Lettuce production +checklist — `ClientResources` tuning, AUTH/ACL, retry policy, +sentinel/cluster failover — see the +[Lettuce client guide]({{< relref "/develop/clients/lettuce" >}}). For TLS +specifically, follow the +[connect-with-TLS recipe]({{< relref "/develop/clients/lettuce/connect#tls-connection" >}}). +The feature-store demo runs against `localhost` with the defaults; a real +deployment should harden the client first. + +### Pick the batch TTL to outlast a failed refresher + +The whole-entity `EXPIRE` is your safety net against silent staleness from +a broken batch pipeline. 
Set it longer than your worst-case batch outage +so a single missed run doesn't take the feature store offline, but short +enough that a sustained outage causes loud failures (missing entities) +rather than quiet ones (yesterday's features being scored as today's). The +standard choice is one cycle of "expected refresh interval × 2" — for a +daily batch, 48 hours; for a 6-hour batch, 12 hours. + +The same logic applies to the per-field streaming TTL: a few times the +expected update interval so a slow-but-alive streaming worker doesn't +churn features needlessly, but short enough that a stalled worker causes +visible freshness failures. + +### Don't share auto-flush state across unrelated code paths + +`conn.setAutoFlushCommands(false)` flips a *connection-level* toggle that +affects every call going through that connection until it's flipped +back. If two threads run pipelined writes concurrently against the same +connection, they will fight over the flag — one thread's `flushCommands()` +will ship the other thread's still-being-queued commands, or its +restore-to-true will flush the other thread's queue prematurely. Worse, +a single non-pipelined read on that same connection will be silently +queued (and never flushed) while the flag is off. + +The demo handles this by opening **two** connections from the same +`RedisClient`: + +* **The shared read connection** stays in default auto-flush=true mode. + Every HTTP handler and the streaming worker use it for the + non-pipelined commands (`HMGET`, `HTTL`, `TTL`, `SCAN`, `DEL`, + `HGETALL`). +* **The dedicated pipeline connection** is reserved for `bulkLoad`, + `updateStreaming`, and `batchGetFeatures`. These all acquire a single + `pipelineLock` inside the `FeatureStore` instance before they touch + the auto-flush flag, so concurrent batches block each other instead + of corrupting the state. 
With one lock and one connection, you get at + most one in-flight batch at a time on the pipeline side; the read + connection is unaffected. + +For batch concurrency beyond what one connection sustains, scale this +pattern to a small +[`BoundedAsyncPool<StatefulRedisConnection<String, String>>`]({{< relref "/develop/clients/lettuce" >}}) +of pipeline connections and lease one per batch. + +### Pipeline batch reads across shards + +On a single Redis instance, an auto-flush batched `HMGET` across `N` users +is one round trip. A Redis Cluster is different: a single auto-flush batch +is bound to one shard, because all queued commands ship to one node. For +batch reads on a cluster, use +[`RedisClusterClient`]({{< relref "/develop/clients/lettuce" >}}) and one +of: + +* Fan-out via `RedisAdvancedClusterAsyncCommands` — the cluster client + routes each `hmget` to the right shard transparently. Easier to write, + slightly more overhead per call. +* Bucket keys by slot with `SlotHash.getSlot(key)` and open one connection + per affected shard; auto-flush-batch each bucket separately. More code, + but one round trip per shard. + +For a small number of frequently-queried users (a top-N customer list, for +example), a hash tag like `fs:user:{vip}:u0001` forces a known set of keys +onto the same shard so one batch can cover them all. + +### Make HEXPIRE part of every streaming write + +The single biggest correctness lever in this design is that the streaming +write applies `HEXPIRE` *every time*. If a streaming worker writes a field +without renewing its TTL, the `HSET` overwrite clears the field's expiry, +so the value lingers until the key-level `EXPIRE` and the mixed-staleness invariant +breaks. Keep the `HSET` and `HEXPIRE` under the same flush boundary (or, +even safer, in the same +[Lua script]({{< relref "/develop/programmability/eval-intro" >}}) if you +don't trust the call site). + +### Avoid HGETALL on the request path + +`HGETALL` reads every field on the hash, including ones the model doesn't +need.
With dozens of features per entity, that is wasted serialization +work on the server and wasted bandwidth on the wire. Always specify the +field list explicitly with `hmget` in the model server. + +The exception is debugging and feature-set discovery, where you genuinely +want the full hash. The demo's "Inspect" button uses `hgetall` for exactly +this reason. + +### Inspect the store directly with redis-cli + +When testing or troubleshooting, the cli tells you everything: + +```bash +# How many users currently in the store +redis-cli --scan --pattern 'fs:user:*' | wc -l + +# One user's full hash and key-level TTL +redis-cli HGETALL fs:user:u0001 +redis-cli TTL fs:user:u0001 + +# Per-field TTL on the streaming fields +redis-cli HTTL fs:user:u0001 FIELDS 5 \ + last_login_ts last_device_id tx_count_5m failed_logins_15m session_country + +# Sample HMGET as the model would issue it +redis-cli HMGET fs:user:u0001 risk_segment tx_count_7d avg_amount_30d tx_count_5m +``` + +A streaming field that returns `-2` from `HTTL` doesn't exist on the hash +(either it was never written, or it expired); `-1` means the field has no +TTL set (and is therefore covered only by the key-level `EXPIRE`); any +positive value is the remaining TTL in seconds. + +## Learn more + +This example uses the following Redis commands: + +* [`HSET`]({{< relref "/commands/hset" >}}) to write a feature or a whole + feature row in one call. +* [`HMGET`]({{< relref "/commands/hmget" >}}) to retrieve any subset of + features for one entity in one round trip. +* [`HGETALL`]({{< relref "/commands/hgetall" >}}) for debugging and + feature-set discovery. +* [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) for per-field TTL on streaming + features (Redis 7.4+). +* [`EXPIRE`]({{< relref "/commands/expire" >}}) and + [`TTL`]({{< relref "/commands/ttl" >}}) for the whole-entity TTL aligned + with the batch materialization cycle. 
+* Pipelined `HMGET` across many entities for batch scoring with one + network round trip via Lettuce's connection-level auto-flush. + +See the [Lettuce documentation]({{< relref "/develop/clients/lettuce" >}}) +for the full client reference, and the +[Hashes overview]({{< relref "/develop/data-types/hashes" >}}) for the +deeper conceptual model. diff --git a/content/develop/use-cases/feature-store/java-lettuce/pom.xml b/content/develop/use-cases/feature-store/java-lettuce/pom.xml new file mode 100644 index 0000000000..7b926a0096 --- /dev/null +++ b/content/develop/use-cases/feature-store/java-lettuce/pom.xml @@ -0,0 +1,82 @@ + + + + 4.0.0 + + com.redis.docs + feature-store-lettuce + 0.1.0 + jar + + + 17 + UTF-8 + + + + + + io.lettuce + lettuce-core + 7.5.2.RELEASE + + + + + + + org.codehaus.mojo + build-helper-maven-plugin + 3.6.0 + + + add-source + generate-sources + add-source + + + ${project.basedir} + + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.13.0 + + 17 + + *.java + + + + + + org.codehaus.mojo + exec-maven-plugin + 3.5.0 + + + + ${project.basedir} + + diff --git a/content/develop/use-cases/feature-store/nodejs/demoServer.js b/content/develop/use-cases/feature-store/nodejs/demoServer.js index ff69a35e9e..be79f6efaa 100644 --- a/content/develop/use-cases/feature-store/nodejs/demoServer.js +++ b/content/develop/use-cases/feature-store/nodejs/demoServer.js @@ -600,11 +600,23 @@ async function handleRequest(req, res, ctx) { sendJson(res, { exists: false, key_ttl_seconds: keyTtl }); return; } - const fieldNames = Object.keys(full); - const ttls = await store.fieldTtlsSeconds(user, fieldNames); + // Iterate the known schema (batch + streaming) plus any + // extras the hash carries. Expired streaming fields surface + // as ttl_seconds=-2 instead of silently disappearing from the + // Inspect view, which is exactly the debugging view someone + // hits "Inspect" for. 
+ const allNames = [...DEFAULT_BATCH_FIELDS, ...DEFAULT_STREAMING_FIELDS]; + for (const n of Object.keys(full)) { + if (!allNames.includes(n)) allNames.push(n); + } + const ttls = await store.fieldTtlsSeconds(user, allNames); const keyTtl = await store.keyTtlSeconds(user); - const fields = fieldNames - .map((name) => ({ name, value: full[name], ttl_seconds: ttls[name] ?? -1 })) + const fields = allNames + .map((name) => ({ + name, + value: full[name] ?? "", + ttl_seconds: ttls[name] ?? -2, + })) .sort((a, b) => a.name.localeCompare(b.name)); sendJson(res, { exists: true, diff --git a/content/develop/use-cases/feature-store/redis-py/demo_server.py b/content/develop/use-cases/feature-store/redis-py/demo_server.py index d09f48ad00..fd8713a3f3 100644 --- a/content/develop/use-cases/feature-store/redis-py/demo_server.py +++ b/content/develop/use-cases/feature-store/redis-py/demo_server.py @@ -649,12 +649,22 @@ def _handle_inspect(self, query: dict[str, list[str]]) -> None: 200, ) return - ttls = self.store.field_ttls_seconds(user, full.keys()) + # Iterate the known schema (batch + streaming) plus any + # extras the hash happens to carry. This makes expired + # streaming fields surface as ttl_seconds=-2 in the UI + # instead of silently disappearing, which is exactly the + # debugging view someone hits "Inspect" for. 
+ all_names = list(DEFAULT_BATCH_FIELDS) + list(DEFAULT_STREAMING_FIELDS) + for n in full: + if n not in all_names: + all_names.append(n) + ttls = self.store.field_ttls_seconds(user, all_names) key_ttl = self.store.key_ttl_seconds(user) fields = sorted( [ - {"name": n, "value": v, "ttl_seconds": ttls.get(n, -1)} - for n, v in full.items() + {"name": n, "value": full.get(n, ""), + "ttl_seconds": ttls.get(n, -2)} + for n in all_names ], key=lambda r: r["name"], ) From 2d5d1d4f1571f691c94e23e44d377c2ef32ed401 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 10:27:42 +0100 Subject: [PATCH 08/20] DOC-6661 draft Rust and .NET examples --- .gitignore | 8 + .../develop/use-cases/feature-store/_index.md | 2 + .../feature-store/dotnet/BuildFeatures.cs | 127 ++++ .../feature-store/dotnet/FeatureStore.cs | 442 ++++++++++++ .../dotnet/FeatureStoreDemo.csproj | 17 + .../feature-store/dotnet/HtmlTemplate.cs | 370 ++++++++++ .../use-cases/feature-store/dotnet/Program.cs | 287 ++++++++ .../feature-store/dotnet/StreamingWorker.cs | 236 +++++++ .../use-cases/feature-store/dotnet/_index.md | 666 ++++++++++++++++++ .../use-cases/feature-store/rust/Cargo.toml | 28 + .../use-cases/feature-store/rust/_index.md | 653 +++++++++++++++++ .../feature-store/rust/build_features.rs | 116 +++ .../feature-store/rust/build_features_bin.rs | 14 + .../feature-store/rust/demo_server.rs | 466 ++++++++++++ .../feature-store/rust/demo_template.html | 342 +++++++++ .../feature-store/rust/feature_store.rs | 482 +++++++++++++ .../use-cases/feature-store/rust/lib.rs | 29 + .../feature-store/rust/streaming_worker.rs | 275 ++++++++ 18 files changed, 4560 insertions(+) create mode 100644 content/develop/use-cases/feature-store/dotnet/BuildFeatures.cs create mode 100644 content/develop/use-cases/feature-store/dotnet/FeatureStore.cs create mode 100644 content/develop/use-cases/feature-store/dotnet/FeatureStoreDemo.csproj create mode 100644 
content/develop/use-cases/feature-store/dotnet/HtmlTemplate.cs create mode 100644 content/develop/use-cases/feature-store/dotnet/Program.cs create mode 100644 content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs create mode 100644 content/develop/use-cases/feature-store/dotnet/_index.md create mode 100644 content/develop/use-cases/feature-store/rust/Cargo.toml create mode 100644 content/develop/use-cases/feature-store/rust/_index.md create mode 100644 content/develop/use-cases/feature-store/rust/build_features.rs create mode 100644 content/develop/use-cases/feature-store/rust/build_features_bin.rs create mode 100644 content/develop/use-cases/feature-store/rust/demo_server.rs create mode 100644 content/develop/use-cases/feature-store/rust/demo_template.html create mode 100644 content/develop/use-cases/feature-store/rust/feature_store.rs create mode 100644 content/develop/use-cases/feature-store/rust/lib.rs create mode 100644 content/develop/use-cases/feature-store/rust/streaming_worker.rs diff --git a/.gitignore b/.gitignore index 29e1ed27e9..5c4cc19956 100644 --- a/.gitignore +++ b/.gitignore @@ -19,3 +19,11 @@ package-lock.json # Rust docs demos /content/develop/use-cases/rate-limiter/rust/target/ /content/develop/use-cases/rate-limiter/rust/Cargo.lock +/content/develop/use-cases/**/rust/target/ +/content/develop/use-cases/**/rust/Cargo.lock +# Java / Maven build output for the docs demos +/content/develop/use-cases/**/java-jedis/target/ +/content/develop/use-cases/**/java-lettuce/target/ +# .NET build output for the docs demos +/content/develop/use-cases/**/dotnet/bin/ +/content/develop/use-cases/**/dotnet/obj/ diff --git a/content/develop/use-cases/feature-store/_index.md b/content/develop/use-cases/feature-store/_index.md index 6e99909fd4..4f65e0d2b1 100644 --- a/content/develop/use-cases/feature-store/_index.md +++ b/content/develop/use-cases/feature-store/_index.md @@ -160,3 +160,5 @@ for a single user under 1 ms, and pipeline batch reads across 
a hundred users. * [go-redis (Go)]({{< relref "/develop/use-cases/feature-store/go" >}}) * [Jedis (Java)]({{< relref "/develop/use-cases/feature-store/java-jedis" >}}) * [Lettuce (Java)]({{< relref "/develop/use-cases/feature-store/java-lettuce" >}}) +* [redis-rs (Rust)]({{< relref "/develop/use-cases/feature-store/rust" >}}) +* [StackExchange.Redis (C#)]({{< relref "/develop/use-cases/feature-store/dotnet" >}}) diff --git a/content/develop/use-cases/feature-store/dotnet/BuildFeatures.cs b/content/develop/use-cases/feature-store/dotnet/BuildFeatures.cs new file mode 100644 index 0000000000..e95c0003e0 --- /dev/null +++ b/content/develop/use-cases/feature-store/dotnet/BuildFeatures.cs @@ -0,0 +1,127 @@ +using StackExchange.Redis; + +namespace FeatureStoreDemo; + +/// +/// Synthesize a small batch of users with realistic-looking features +/// and bulk-load them into Redis with a 24-hour key-level TTL. +/// +/// +/// Stands in for the nightly Spark / Feast materialization job in a +/// real deployment. In production the equivalent of this script lives +/// in an offline pipeline that reads from the offline store and +/// writes the serving-time hashes into Redis via HSET + +/// EXPIRE. +/// +public static class BuildFeatures +{ + private static readonly string[] CountryChoices = { + "US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL", + }; + private static readonly string[] RiskSegments = { "low", "medium", "high" }; + private static readonly int[] RiskWeights = { 70, 25, 5 }; + private static readonly int[] ChargebackBuckets = { 0, 1, 2, 3 }; + private static readonly int[] ChargebackWeights = { 85, 10, 4, 1 }; + + /// + /// Generate synthetic user feature rows. 
+ /// + public static Dictionary> SynthesizeUsers( + int count, int seed) + { + var rng = new Random(seed); + var users = new Dictionary>(count); + for (int i = 1; i <= count; i++) + { + var uid = $"u{i:D4}"; + users[uid] = new Dictionary + { + ["country_iso"] = CountryChoices[rng.Next(CountryChoices.Length)], + ["risk_segment"] = WeightedChoice(rng, RiskSegments, RiskWeights), + ["account_age_days"] = 7 + rng.Next(2394), + ["tx_count_7d"] = rng.Next(81), + ["avg_amount_30d"] = Math.Round(5.0 + rng.NextDouble() * 345.0, 2), + ["chargeback_count_180d"] = WeightedChoiceInt(rng, ChargebackBuckets, ChargebackWeights), + }; + } + return users; + } + + /// + /// CLI entry point. Run with: + /// dotnet run --project . -- --mode build-features --count 500 + /// + public static async Task RunCliAsync(string[] args) + { + var redisUri = "localhost:6379"; + var count = 200; + var ttlSeconds = 24L * 60L * 60L; + var keyPrefix = "fs:user:"; + var seed = 42; + + for (int i = 0; i < args.Length; i++) + { + switch (args[i]) + { + case "--redis-uri" when i + 1 < args.Length: + redisUri = args[++i]; break; + case "--count" when i + 1 < args.Length: + count = int.Parse(args[++i]); break; + case "--ttl-seconds" when i + 1 < args.Length: + ttlSeconds = long.Parse(args[++i]); break; + case "--key-prefix" when i + 1 < args.Length: + keyPrefix = args[++i]; break; + case "--seed" when i + 1 < args.Length: + seed = int.Parse(args[++i]); break; + case "-h": + case "--help": + Console.WriteLine( + "Usage: dotnet run -- --mode build-features [--redis-uri URI] " + + "[--count N] [--ttl-seconds S] [--key-prefix PREFIX] [--seed N]"); + return 0; + } + } + + var mux = await ConnectionMultiplexer.ConnectAsync(redisUri); + try + { + var store = new FeatureStore(mux, keyPrefix, ttlSeconds, + FeatureStore.DefaultStreamingTtlSeconds); + var rows = SynthesizeUsers(count, seed); + var loaded = await store.BulkLoadAsync(rows, ttlSeconds); + Console.WriteLine( + $"Materialized {loaded} users at 
{keyPrefix}* with a {ttlSeconds}s key-level TTL."); + } + finally + { + await mux.CloseAsync(); + } + return 0; + } + + private static string WeightedChoice(Random rng, string[] items, int[] weights) + { + int total = 0; + foreach (var w in weights) total += w; + int r = rng.Next(total); + for (int i = 0; i < items.Length; i++) + { + r -= weights[i]; + if (r < 0) return items[i]; + } + return items[^1]; + } + + private static int WeightedChoiceInt(Random rng, int[] items, int[] weights) + { + int total = 0; + foreach (var w in weights) total += w; + int r = rng.Next(total); + for (int i = 0; i < items.Length; i++) + { + r -= weights[i]; + if (r < 0) return items[i]; + } + return items[^1]; + } +} diff --git a/content/develop/use-cases/feature-store/dotnet/FeatureStore.cs b/content/develop/use-cases/feature-store/dotnet/FeatureStore.cs new file mode 100644 index 0000000000..46985f2fd4 --- /dev/null +++ b/content/develop/use-cases/feature-store/dotnet/FeatureStore.cs @@ -0,0 +1,442 @@ +using System.Collections.Concurrent; +using StackExchange.Redis; + +namespace FeatureStoreDemo; + +/// +/// Redis online feature store backed by per-entity Hashes +/// (StackExchange.Redis). +/// +/// +/// Each entity (here, a user) lives at a deterministic key such as +/// fs:user:{id}. The hash holds every feature for that entity +/// as one field per feature — batch-materialized aggregates +/// (refreshed on a daily cycle) alongside streaming-updated signals +/// (refreshed every few seconds). One HMGET returns whichever +/// subset the model needs in one network round trip. +/// +/// Two TTL layers solve the mixed staleness problem: +/// +/// +/// A key-level EXPIRE aligned with the batch +/// materialization cycle. +/// A per-field HEXPIRE on each streaming field gives +/// that field its own shorter expiry, independent of the rest of +/// the hash. +/// +/// +/// HEXPIRE and HTTL require Redis 7.4 or later. +/// StackExchange.Redis 2.8+ exposes them as +/// and +/// . 
The demo pins +/// 2.13.17. +/// +/// The shared ConnectionMultiplexer is thread-safe and +/// multiplexed — one instance serves the whole process, and every +/// handler in the ASP.NET Core thread pool plus the streaming +/// worker call into it without coordination. +/// +public sealed class FeatureStore +{ + public static readonly IReadOnlyList DefaultBatchFields = new[] + { + "country_iso", + "risk_segment", + "account_age_days", + "tx_count_7d", + "avg_amount_30d", + "chargeback_count_180d", + }; + + public static readonly IReadOnlyList DefaultStreamingFields = new[] + { + "last_login_ts", + "last_device_id", + "tx_count_5m", + "failed_logins_15m", + "session_country", + }; + + public const long DefaultBatchTtlSeconds = 24L * 60L * 60L; + public const long DefaultStreamingTtlSeconds = 5L * 60L; + public const string DefaultKeyPrefix = "fs:user:"; + + private readonly IConnectionMultiplexer _mux; + private readonly IDatabase _db; + public string KeyPrefix { get; } + public long BatchTtlSeconds { get; } + public long StreamingTtlSeconds { get; } + + private long _batchWritesTotal; + private long _streamingWritesTotal; + private long _readsTotal; + private long _readFieldsTotal; + + public FeatureStore( + IConnectionMultiplexer mux, + string keyPrefix = DefaultKeyPrefix, + long batchTtlSeconds = DefaultBatchTtlSeconds, + long streamingTtlSeconds = DefaultStreamingTtlSeconds) + { + _mux = mux; + _db = mux.GetDatabase(); + KeyPrefix = keyPrefix; + BatchTtlSeconds = batchTtlSeconds; + StreamingTtlSeconds = streamingTtlSeconds; + } + + public string KeyFor(string entityId) => KeyPrefix + entityId; + + // --------------------------------------------------------------- + // Batch ingestion (materialization) + // --------------------------------------------------------------- + + /// + /// Materialize a batch of entities into Redis. 
+ /// + /// + /// One HSET plus one EXPIRE per entity, all queued + /// through an IBatch so the whole batch ships in a single + /// network round trip. + /// + public async Task BulkLoadAsync( + IReadOnlyDictionary> rows, + long ttlSeconds) + { + if (rows.Count == 0) return 0; + var batch = _db.CreateBatch(); + var tasks = new List(rows.Count * 2); + foreach (var (entityId, fields) in rows) + { + var key = (RedisKey)KeyFor(entityId); + var entries = new HashEntry[fields.Count]; + int i = 0; + foreach (var (name, value) in fields) + { + entries[i++] = new HashEntry(name, EncodeValue(value)); + } + tasks.Add(batch.HashSetAsync(key, entries)); + tasks.Add(batch.KeyExpireAsync(key, TimeSpan.FromSeconds(ttlSeconds))); + } + batch.Execute(); + await Task.WhenAll(tasks); + Interlocked.Add(ref _batchWritesTotal, rows.Count); + return rows.Count; + } + + // --------------------------------------------------------------- + // Streaming ingestion + // --------------------------------------------------------------- + + /// + /// Write streaming features with a per-field TTL. + /// + /// + /// HSET and HEXPIRE are queued in the same + /// IBatch so Redis runs them in pipeline order: the + /// HSET first creates or overwrites the fields, then + /// HEXPIRE attaches a TTL to each of those same fields. + /// + /// + /// returns one + /// per field: + /// + /// Success (= Redis code 1): TTL set / updated. + /// Due (= 2): the expiry was 0 or in the past, so + /// Redis deleted the field instead of applying a TTL. + /// ConditionNotMet (= 0): NX/XX/GT/LT condition + /// not met (we never use one here). + /// NotExist (= -2): no such field, or no such key. + /// + /// We always follow HSET with HEXPIRE so any code + /// other than Success means the per-field TTL invariant + /// didn't hold — the helper throws rather than silently leaving a + /// streaming field with no expiry attached. 
+ /// + /// + public async Task UpdateStreamingAsync( + string entityId, + IReadOnlyDictionary fields, + long ttlSeconds) + { + if (fields.Count == 0) return; + var key = (RedisKey)KeyFor(entityId); + var entries = new HashEntry[fields.Count]; + var names = new RedisValue[fields.Count]; + int i = 0; + foreach (var (name, value) in fields) + { + entries[i] = new HashEntry(name, EncodeValue(value)); + names[i] = name; + i++; + } + + var batch = _db.CreateBatch(); + var hsetTask = batch.HashSetAsync(key, entries); + var hexpireTask = batch.HashFieldExpireAsync( + key, names, TimeSpan.FromSeconds(ttlSeconds)); + batch.Execute(); + await hsetTask; + var codes = await hexpireTask; + foreach (var code in codes) + { + if (code != ExpireResult.Success) + { + throw new InvalidOperationException( + $"HEXPIRE did not set every field TTL for {key}: [{string.Join(",", codes)}]"); + } + } + Interlocked.Add(ref _streamingWritesTotal, fields.Count); + } + + // --------------------------------------------------------------- + // Inference reads + // --------------------------------------------------------------- + + /// + /// Retrieve a subset of features for one entity with HMGET. + /// Returns only the fields that actually exist on the hash; + /// missing fields are dropped from the result. + /// + public async Task> GetFeaturesAsync( + string entityId, IReadOnlyList fieldNames) + { + var key = (RedisKey)KeyFor(entityId); + var out_ = new Dictionary(); + if (fieldNames.Count == 0) return out_; + var values = await _db.HashGetAsync( + key, fieldNames.Select(f => (RedisValue)f).ToArray()); + for (int i = 0; i < fieldNames.Count; i++) + { + if (!values[i].IsNull) + { + out_[fieldNames[i]] = values[i].ToString(); + } + } + Interlocked.Increment(ref _readsTotal); + Interlocked.Add(ref _readFieldsTotal, out_.Count); + return out_; + } + + /// + /// Full-hash read via HGETALL. Useful for debugging but + /// the model server should always go through + /// with an explicit field list. 
+ /// + public async Task> GetAllFeaturesAsync(string entityId) + { + var entries = await _db.HashGetAllAsync(KeyFor(entityId)); + var dict = new Dictionary(entries.Length); + foreach (var e in entries) + { + dict[e.Name.ToString()] = e.Value.ToString(); + } + Interlocked.Increment(ref _readsTotal); + Interlocked.Add(ref _readFieldsTotal, entries.Length); + return dict; + } + + /// + /// Pipeline HMGET across many entities for batch scoring. + /// One round trip for the whole batch via IBatch. + /// + public async Task>> BatchGetFeaturesAsync( + IReadOnlyList entityIds, IReadOnlyList fieldNames) + { + if (entityIds.Count == 0 || fieldNames.Count == 0) + return new Dictionary>(); + + var fieldValues = fieldNames.Select(f => (RedisValue)f).ToArray(); + var batch = _db.CreateBatch(); + var tasks = new Task[entityIds.Count]; + for (int i = 0; i < entityIds.Count; i++) + { + tasks[i] = batch.HashGetAsync(KeyFor(entityIds[i]), fieldValues); + } + batch.Execute(); + var rows = await Task.WhenAll(tasks); + + var out_ = new Dictionary>(); + long seen = 0; + for (int i = 0; i < entityIds.Count; i++) + { + var row = new Dictionary(); + for (int j = 0; j < fieldNames.Count; j++) + { + if (!rows[i][j].IsNull) + { + row[fieldNames[j]] = rows[i][j].ToString(); + seen++; + } + } + out_[entityIds[i]] = row; + } + Interlocked.Add(ref _readsTotal, entityIds.Count); + Interlocked.Add(ref _readFieldsTotal, seen); + return out_; + } + + // --------------------------------------------------------------- + // TTL inspection (used by the demo UI) + // --------------------------------------------------------------- + + /// + /// Seconds until the entity key expires. Returns -1 if no TTL is + /// set, -2 if the key doesn't exist. + /// + public async Task KeyTtlSecondsAsync(string entityId) + { + var ttl = await _db.KeyTimeToLiveAsync(KeyFor(entityId)); + if (ttl == null) + { + // StackExchange.Redis returns null both for "no TTL" and + // for "key doesn't exist". 
Disambiguate with KeyExists. + return await _db.KeyExistsAsync(KeyFor(entityId)) ? -1L : -2L; + } + return (long)ttl.Value.TotalSeconds; + } + + /// + /// Per-field TTL via HTTL (Redis 7.4+). Values are in + /// seconds (the StackExchange.Redis return is milliseconds; we + /// convert here for consistency with the other clients): + /// positive seconds remaining, -1 no field TTL, -2 field + /// (or key) missing. + /// + public async Task> FieldTtlsSecondsAsync( + string entityId, IReadOnlyList fieldNames) + { + var out_ = new Dictionary(); + if (fieldNames.Count == 0) return out_; + var values = fieldNames.Select(f => (RedisValue)f).ToArray(); + var ms = await _db.HashFieldGetTimeToLiveAsync(KeyFor(entityId), values); + for (int i = 0; i < fieldNames.Count; i++) + { + // HTTL returns ms remaining; negative sentinels pass + // through. Convert positive durations to whole seconds + // for parity with the other clients' helpers. + long v = ms[i]; + out_[fieldNames[i]] = v < 0 ? v : v / 1000; + } + return out_; + } + + // --------------------------------------------------------------- + // Demo housekeeping + // --------------------------------------------------------------- + + /// + /// Enumerate up to entity IDs by + /// scanning keyPrefix*. SCAN is non-blocking; the + /// demo uses it for UI dropdowns, not as a serving primitive. + /// + public List ListEntityIds(int limit) + { + var ids = new List(Math.Min(limit, 1024)); + foreach (var endPoint in _mux.GetEndPoints()) + { + var server = _mux.GetServer(endPoint); + // pageSize=200 mirrors the other clients' SCAN COUNT + foreach (var key in server.Keys( + pattern: KeyPrefix + "*", pageSize: 200)) + { + var k = key.ToString(); + if (k.Length > KeyPrefix.Length) + { + ids.Add(k[KeyPrefix.Length..]); + if (ids.Count >= limit) break; + } + } + if (ids.Count >= limit) break; + } + ids.Sort(StringComparer.Ordinal); + return ids; + } + + /// + /// Count every entity under the key prefix. 
Iterates SCAN without + /// an in-memory cap so the UI can show the true total even when + /// more keys exist than returns. + /// + public long CountEntities() + { + long count = 0; + foreach (var endPoint in _mux.GetEndPoints()) + { + var server = _mux.GetServer(endPoint); + foreach (var _ in server.Keys( + pattern: KeyPrefix + "*", pageSize: 500)) + { + count++; + } + } + return count; + } + + public Task DeleteEntityAsync(string entityId) => + _db.KeyDeleteAsync(KeyFor(entityId)).ContinueWith(t => t.Result ? 1L : 0L); + + /// + /// Drop every entity under the key prefix. Used by the demo + /// reset path; SCANs and DELs in batches of 500. + /// + public async Task ResetAsync() + { + long deleted = 0; + foreach (var endPoint in _mux.GetEndPoints()) + { + var server = _mux.GetServer(endPoint); + var batch = new List(500); + foreach (var key in server.Keys( + pattern: KeyPrefix + "*", pageSize: 500)) + { + batch.Add(key); + if (batch.Count >= 500) + { + deleted += await _db.KeyDeleteAsync(batch.ToArray()); + batch.Clear(); + } + } + if (batch.Count > 0) + { + deleted += await _db.KeyDeleteAsync(batch.ToArray()); + } + } + return deleted; + } + + public Stats StatsSnapshot() => new( + Interlocked.Read(ref _batchWritesTotal), + Interlocked.Read(ref _streamingWritesTotal), + Interlocked.Read(ref _readsTotal), + Interlocked.Read(ref _readFieldsTotal)); + + public void ResetStats() + { + Interlocked.Exchange(ref _batchWritesTotal, 0); + Interlocked.Exchange(ref _streamingWritesTotal, 0); + Interlocked.Exchange(ref _readsTotal, 0); + Interlocked.Exchange(ref _readFieldsTotal, 0); + } + + public record Stats( + long BatchWritesTotal, + long StreamingWritesTotal, + long ReadsTotal, + long ReadFieldsTotal); + + /// + /// Render a feature value as a string for hash storage. Booleans + /// become "true"/"false" so they round-trip cleanly through other + /// clients and redis-cli. + /// + public static string EncodeValue(object? 
value) => value switch + { + null => "", + bool b => b ? "true" : "false", + double d when d == Math.Floor(d) => d.ToString("F1", System.Globalization.CultureInfo.InvariantCulture), + double d => d.ToString(System.Globalization.CultureInfo.InvariantCulture), + float f => f.ToString(System.Globalization.CultureInfo.InvariantCulture), + _ => value.ToString() ?? "", + }; +} diff --git a/content/develop/use-cases/feature-store/dotnet/FeatureStoreDemo.csproj b/content/develop/use-cases/feature-store/dotnet/FeatureStoreDemo.csproj new file mode 100644 index 0000000000..957c3cc258 --- /dev/null +++ b/content/develop/use-cases/feature-store/dotnet/FeatureStoreDemo.csproj @@ -0,0 +1,17 @@ + + + + net8.0 + enable + enable + FeatureStoreDemo + + + + + + + + diff --git a/content/develop/use-cases/feature-store/dotnet/HtmlTemplate.cs b/content/develop/use-cases/feature-store/dotnet/HtmlTemplate.cs new file mode 100644 index 0000000000..f4b808b737 --- /dev/null +++ b/content/develop/use-cases/feature-store/dotnet/HtmlTemplate.cs @@ -0,0 +1,370 @@ +using System.Text.Json; + +namespace FeatureStoreDemo; + +/// +/// Inlined HTML page for the demo. Same UI shape as every other +/// feature-store demo (Python, Node.js, Go, Java, Rust). +/// +internal static class HtmlTemplate +{ + public static string Render(string keyPrefix, long streamingTtl, int usersPerTick) + { + var batchJson = JsonSerializer.Serialize(FeatureStore.DefaultBatchFields); + var streamJson = JsonSerializer.Serialize(FeatureStore.DefaultStreamingFields); + return Template + .Replace("__KEY_PREFIX__", keyPrefix) + .Replace("__STREAM_TTL__", streamingTtl.ToString()) + .Replace("__USERS_PER_TICK__", usersPerTick.ToString()) + .Replace("__BATCH_FIELDS_JSON__", batchJson) + .Replace("__STREAM_FIELDS_JSON__", streamJson); + } + + // C# 11 raw string literals (""") let the JS template literals + // (`backticks`) survive without escapes. 
+ private const string Template = """ + + + + + + Redis Feature Store Demo (.NET) + + + +
+
StackExchange.Redis + ASP.NET Core minimal API
+

Redis Feature Store Demo

+

+ A small fraud-scoring feature store. Each user is one Redis hash + at __KEY_PREFIX__{id} with a batch-materialized + batch half (daily aggregates, + 24-hour key-level EXPIRE) and a streaming + streaming half (real-time + signals, __STREAM_TTL__s per-field HEXPIRE). + Inference reads any subset with one HMGET; batch + scoring pipelines HMGET across N users through one + IBatch. +

+ +
+
+

Store state

+
Loading...
+
+ +
+

Materialize batch features

+

Calls HSET + EXPIRE for each user + through one IBatch — the whole batch ships in + one round trip.

+ + + + +

+ Drop the TTL to e.g. 30 s and watch entities disappear on + schedule — the same thing that happens if a daily refresher + fails. +

+ + +
+ +
+

Streaming worker

+

Picks __USERS_PER_TICK__ users per tick, writes the + streaming features, applies HEXPIRE + __STREAM_TTL__s per field. Pause it and the + streaming fields drop out via per-field TTL while the batch + fields stay populated.

+
+ +
+ +
+

Inference read (HMGET)

+

Pick a user and a feature subset. One HMGET + round trip returns whatever the model needs.

+
+
+ + +
+
+ + +
+
+

Feature subset

+

+ Tick to include in the HMGET. Per-field TTL is + shown next to each field in the result table. +

+
+
+

Pick a user and click Read features.

+
+
+ +
+

Batch scoring

+

Pipelined HMGET across N random users via + IBatch. One network round trip for the whole + batch.

+ + + +
+

(no batch read yet)

+
+
+ +
+

Inspect one user

+

HGETALL plus per-field HTTL and + key-level TTL. Useful for spotting which + streaming fields have already expired.

+ + + +
+

(pick a user and click Inspect)

+
+
+
+ +
+
+ + + + +"""; +} diff --git a/content/develop/use-cases/feature-store/dotnet/Program.cs b/content/develop/use-cases/feature-store/dotnet/Program.cs new file mode 100644 index 0000000000..59af580f91 --- /dev/null +++ b/content/develop/use-cases/feature-store/dotnet/Program.cs @@ -0,0 +1,287 @@ +using System.Diagnostics; +using FeatureStoreDemo; +using Microsoft.AspNetCore.Mvc; +using StackExchange.Redis; + +// CLI: `--mode build-features` shells out to the batch materializer +// without spinning up the HTTP server. Defaults to running the demo +// server. +for (int i = 0; i < args.Length; i++) +{ + if (args[i] == "--mode" && i + 1 < args.Length && args[i + 1] == "build-features") + { + var sub = args.Where((_, idx) => idx != i && idx != i + 1).ToArray(); + return await BuildFeatures.RunCliAsync(sub); + } +} + +var host = "127.0.0.1"; +var port = 8091; +var redisUri = "localhost:6379"; +var keyPrefix = "fs:user:"; +var batchTtlSeconds = FeatureStore.DefaultBatchTtlSeconds; +var streamingTtlSeconds = FeatureStore.DefaultStreamingTtlSeconds; +var usersPerTick = 5; +var seedUsers = 200; +var resetOnStart = true; + +for (int i = 0; i < args.Length; i++) +{ + switch (args[i]) + { + case "--host" when i + 1 < args.Length: host = args[++i]; break; + case "--port" when i + 1 < args.Length: port = int.Parse(args[++i]); break; + case "--redis-uri" when i + 1 < args.Length: redisUri = args[++i]; break; + case "--key-prefix" when i + 1 < args.Length: keyPrefix = args[++i]; break; + case "--batch-ttl-seconds" when i + 1 < args.Length: batchTtlSeconds = long.Parse(args[++i]); break; + case "--streaming-ttl-seconds" when i + 1 < args.Length: streamingTtlSeconds = long.Parse(args[++i]); break; + case "--users-per-tick" when i + 1 < args.Length: usersPerTick = int.Parse(args[++i]); break; + case "--seed-users" when i + 1 < args.Length: seedUsers = int.Parse(args[++i]); break; + case "--no-reset": resetOnStart = false; break; + case "-h": + case "--help": + Console.WriteLine("Usage: 
dotnet run [--host H] [--port P] [--redis-uri URI] " + + "[--key-prefix PFX] [--batch-ttl-seconds S] [--streaming-ttl-seconds S] " + + "[--users-per-tick N] [--seed-users N] [--no-reset] " + + "[--mode build-features (...)]"); + return 0; + } +} + +var muxOptions = ConfigurationOptions.Parse(redisUri); +muxOptions.AllowAdmin = true; // server.Keys() requires AllowAdmin +var mux = await ConnectionMultiplexer.ConnectAsync(muxOptions); + +var store = new FeatureStore(mux, keyPrefix, batchTtlSeconds, streamingTtlSeconds); +var worker = new StreamingWorker(store, TimeSpan.FromSeconds(1), usersPerTick, 1337); +// Serializes materialize / reset / toggle-worker against each other +// so the pause-and-wait-for-idle dance can't race with a concurrent +// bulk-load. +var demoLock = new SemaphoreSlim(1, 1); +var demoSeed = 42; + +if (resetOnStart) +{ + Console.WriteLine($"Dropping any existing users under '{keyPrefix}*' for a clean demo run (pass --no-reset to keep them)."); + await store.ResetAsync(); + store.ResetStats(); +} +var seeded = await store.BulkLoadAsync( + BuildFeatures.SynthesizeUsers(seedUsers, demoSeed), + batchTtlSeconds); + +worker.Start(); + +var builder = WebApplication.CreateBuilder(args); +builder.WebHost.UseUrls($"http://{host}:{port}"); +builder.Logging.ClearProviders(); +builder.Logging.AddConsole(o => o.LogToStandardErrorThreshold = LogLevel.Warning); +var app = builder.Build(); + +string IndexHtml() => + HtmlTemplate.Render(store.KeyPrefix, store.StreamingTtlSeconds, worker.UsersPerTick); + +app.MapGet("/", () => Results.Content(IndexHtml(), "text/html; charset=utf-8")); + +app.MapGet("/state", () => +{ + var ids = store.ListEntityIds(500); + var count = store.CountEntities(); + return Results.Json(new + { + key_prefix = store.KeyPrefix, + batch_ttl_seconds = store.BatchTtlSeconds, + streaming_ttl_seconds = store.StreamingTtlSeconds, + entity_count = count, + entity_ids = ids, + stats = store.StatsSnapshot(), + worker = worker.StatsSnapshot(), + }); 
+}); + +app.MapGet("/inspect", async ([FromQuery] string user) => +{ + if (string.IsNullOrWhiteSpace(user)) + return Results.BadRequest(new { error = "user is required" }); + + var full = await store.GetAllFeaturesAsync(user); + var keyTtl = await store.KeyTtlSecondsAsync(user); + if (full.Count == 0) + { + return Results.Json(new { exists = false, key_ttl_seconds = keyTtl }); + } + // Iterate the known schema (batch + streaming) plus any extras + // the hash carries so expired streaming fields surface as + // ttl_seconds=-2 in the Inspect view rather than silently + // disappearing. + var names = new List(FeatureStore.DefaultBatchFields); + names.AddRange(FeatureStore.DefaultStreamingFields); + foreach (var k in full.Keys) if (!names.Contains(k)) names.Add(k); + var ttls = await store.FieldTtlsSecondsAsync(user, names); + var fields = names + .OrderBy(n => n, StringComparer.Ordinal) + .Select(n => new + { + name = n, + value = full.TryGetValue(n, out var v) ? v : "", + ttl_seconds = ttls.TryGetValue(n, out var t) ? t : -2L, + }) + .ToArray(); + return Results.Json(new + { + exists = true, + key_ttl_seconds = keyTtl, + fields, + }); +}); + +app.MapPost("/bulk-load", async (HttpRequest req) => +{ + await demoLock.WaitAsync(); + try + { + var form = await req.ReadFormAsync(); + var count = Clamp(IntOr(form["count"], 200), 1, 2000); + var ttl = (long)Clamp(IntOr(form["ttl"], 86400), 5, 172_800); + var rows = BuildFeatures.SynthesizeUsers(count, demoSeed); + var sw = Stopwatch.StartNew(); + var loaded = await store.BulkLoadAsync(rows, ttl); + sw.Stop(); + return Results.Json(new + { + loaded, + ttl_seconds = ttl, + elapsed_ms = sw.Elapsed.TotalMilliseconds, + }); + } + finally { demoLock.Release(); } +}); + +app.MapPost("/reset", async () => +{ + await demoLock.WaitAsync(); + try + { + // Pause + wait-for-idle around the DEL sweep so a concurrent + // tick can't recreate a user that was just enumerated for + // deletion (streaming HSET creates the key if it's missing). 
+ var wasPaused = worker.IsPaused; + if (worker.IsRunning) + { + if (!wasPaused) worker.Pause(); + await worker.WaitForIdleAsync(); + } + try + { + var deleted = await store.ResetAsync(); + store.ResetStats(); + worker.ResetStats(); + return Results.Json(new { deleted }); + } + finally + { + if (worker.IsRunning && !wasPaused) worker.Resume(); + } + } + finally { demoLock.Release(); } +}); + +app.MapPost("/worker/toggle", async () => +{ + await demoLock.WaitAsync(); + try + { + if (!worker.IsRunning) worker.Start(); + if (worker.IsPaused) worker.Resume(); + else worker.Pause(); + return Results.Json(new { paused = worker.IsPaused, running = worker.IsRunning }); + } + finally { demoLock.Release(); } +}); + +app.MapPost("/read", async (HttpRequest req) => +{ + var form = await req.ReadFormAsync(); + var user = form["user"].ToString().Trim(); + if (string.IsNullOrEmpty(user)) + return Results.BadRequest(new { error = "user is required" }); + var fields = form["field"].Where(f => !string.IsNullOrEmpty(f)).ToList(); + var sw = Stopwatch.StartNew(); + var values = fields.Count > 0 + ? await store.GetFeaturesAsync(user, fields!) + : new Dictionary(); + sw.Stop(); + var ttls = fields.Count > 0 + ? await store.FieldTtlsSecondsAsync(user, fields!) 
+ : new Dictionary(); + var keyTtl = await store.KeyTtlSecondsAsync(user); + return Results.Json(new + { + requested = fields, + values, + ttls, + key_ttl_seconds = keyTtl, + returned_count = values.Count, + elapsed_ms = sw.Elapsed.TotalMilliseconds, + }); +}); + +app.MapPost("/batch-read", async (HttpRequest req) => +{ + var form = await req.ReadFormAsync(); + var count = Clamp(IntOr(form["count"], 100), 1, 500); + var fields = form["field"].Where(f => !string.IsNullOrEmpty(f)).Cast().ToList(); + if (fields.Count == 0) + { + fields = new List(FeatureStore.DefaultStreamingFields) { "risk_segment" }; + } + var ids = store.ListEntityIds(Math.Max(count * 2, 2000)); + if (ids.Count > count) ids = ids.Take(count).ToList(); + var sw = Stopwatch.StartNew(); + var rows = await store.BatchGetFeaturesAsync(ids, fields); + sw.Stop(); + var sample = ids.Take(10) + .Select(id => new + { + id, + field_count = rows.TryGetValue(id, out var r) ? r.Count : 0, + }) + .ToArray(); + return Results.Json(new + { + entity_count = ids.Count, + field_count = fields.Count, + elapsed_ms = sw.Elapsed.TotalMilliseconds, + sample, + }); +}); + +Console.WriteLine($"Redis feature-store demo server listening on http://{host}:{port}"); +Console.WriteLine($"Using Redis at {redisUri} with key prefix '{keyPrefix}' " + + $"(batch TTL {batchTtlSeconds}s, streaming TTL {streamingTtlSeconds}s)"); +Console.WriteLine($"Materialized {seeded} user(s); streaming worker running."); + +var appTask = app.RunAsync(); + +Console.CancelKeyPress += async (_, e) => +{ + e.Cancel = true; + Console.WriteLine("\nShutting down..."); + await worker.StopAsync(); + await app.StopAsync(); + await mux.CloseAsync(); +}; + +await appTask; +return 0; + +// --------------------------------------------------------------- +// Helpers +// --------------------------------------------------------------- + +static int Clamp(int v, int lo, int hi) => Math.Max(lo, Math.Min(hi, v)); + +static int 
IntOr(Microsoft.Extensions.Primitives.StringValues sv, int def) +{ + return int.TryParse(sv.ToString(), out var n) ? n : def; +} diff --git a/content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs b/content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs new file mode 100644 index 0000000000..1d4b1a84f6 --- /dev/null +++ b/content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs @@ -0,0 +1,236 @@ +namespace FeatureStoreDemo; + +/// +/// Streaming feature updater for the demo. +/// +/// +/// Stands in for whatever Flink, Kafka Streams, or bespoke service +/// computes the real-time features in a real deployment. In +/// production this code lives in the streaming layer; here it runs +/// as a background next to the demo server so the +/// UI can start, pause, and resume it. +/// +/// +/// Every tick the worker picks a few random users and writes a new +/// value for each streaming feature, with a per-field HEXPIRE +/// so the field self-expires if the worker is paused. Pause it for +/// longer than StreamingTtlSeconds and the streaming fields +/// drop out of the hash while the batch fields remain populated +/// under the longer key-level TTL — the mixed staleness +/// story made visible. 
+/// +/// +public sealed class StreamingWorker +{ + private static readonly string[] DeviceIds = { + "ios-1a4c", "ios-9f02", "and-7b21", "and-2d18", + "web-chr-1", "web-saf-1", "web-ff-2", + }; + private static readonly string[] SessionCountries = { + "US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL", + }; + private static readonly int[] FailedLoginBuckets = { 0, 1, 2, 5 }; + private static readonly int[] FailedLoginWeights = { 70, 20, 8, 2 }; + + private readonly FeatureStore _store; + private readonly TimeSpan _tick; // delay between ticks; the constructor substitutes 1 s for non-positive values + public int UsersPerTick { get; } // entities rewritten per tick; the constructor substitutes 5 for non-positive values + private readonly Random _rng; + private readonly object _rngLock = new(); + + // All three lifecycle flags are read by the worker task and the + // public API (HTTP handlers + Reset), so they have to be + // volatile or Interlocked. + private int _running; + private int _paused; + private int _tickInFlight; // 1 from just before RunAsync's pause check until that tick's finally block + private long _tickCount; + private long _writesCount; + + private CancellationTokenSource? _cts; + private Task? _task; + + public StreamingWorker(FeatureStore store, TimeSpan tick, int usersPerTick, int seed) + { + _store = store; + _tick = tick <= TimeSpan.Zero ? TimeSpan.FromSeconds(1) : tick; + UsersPerTick = usersPerTick > 0 ?
usersPerTick : 5; + _rng = new Random(seed); + } + + // --------------------------------------------------------------- + // Lifecycle + // --------------------------------------------------------------- + + public void Start() // no-op when already running; otherwise clears any pause and launches the tick loop + { + if (Interlocked.CompareExchange(ref _running, 1, 0) != 0) return; + Interlocked.Exchange(ref _paused, 0); + var token = (_cts = new CancellationTokenSource()).Token; // read the token now: the lambda must not re-read _cts, which a later Start() can replace before the task runs + _task = Task.Run(() => RunAsync(token)); + } + + public async Task StopAsync() // no-op when not running; otherwise cancels the loop and waits for it to exit + { + if (Interlocked.Exchange(ref _running, 0) != 1) return; + _cts?.Cancel(); + try { if (_task is not null) await _task; } + catch (OperationCanceledException) { /* expected */ } + _task = null; + Interlocked.Exchange(ref _tickInFlight, 0); + } + + public void Pause() => Interlocked.Exchange(ref _paused, 1); + public void Resume() => Interlocked.Exchange(ref _paused, 0); + + public bool IsRunning => Volatile.Read(ref _running) == 1; + public bool IsPaused => Volatile.Read(ref _paused) == 1; + + /// <summary> + /// Block until any in-flight tick has finished. + /// <see cref="Pause"/> only stops future ticks from running; callers (a reset + /// that's about to DEL every entity, for example) use this to + /// flush a mid-flight tick before they touch state the tick + /// might still be writing to.
+ /// </summary> + public async Task WaitForIdleAsync() + { + while (Volatile.Read(ref _tickInFlight) == 1) + { + await Task.Delay(20); // poll the flag every 20 ms + } + } + + public WorkerStats StatsSnapshot() => new( + IsRunning, + IsPaused, + Interlocked.Read(ref _tickCount), + Interlocked.Read(ref _writesCount)); + + public void ResetStats() + { + Interlocked.Exchange(ref _tickCount, 0); + Interlocked.Exchange(ref _writesCount, 0); + } + + public record WorkerStats(bool Running, bool Paused, long TickCount, long WritesCount); + + // --------------------------------------------------------------- + // Tick + // --------------------------------------------------------------- + + private async Task RunAsync(CancellationToken ct) + { + try + { + while (!ct.IsCancellationRequested) + { + try { await Task.Delay(_tick, ct); } + catch (OperationCanceledException) { break; } + if (ct.IsCancellationRequested) break; + + // Set tick_in_flight *before* the pause check so a + // concurrent pause+wait can never see + // tick_in_flight=0 in the window between the pause + // check and the actual DoTick call. The finally + // block clears the flag whether we paused, succeeded, + // or threw. + Interlocked.Exchange(ref _tickInFlight, 1); + try + { + if (Volatile.Read(ref _paused) == 0) + { + await DoTickAsync(); + } + } + catch (Exception e) + { + Console.Error.WriteLine($"[streaming-worker] tick failed: {e.Message}"); + } + finally + { + Interlocked.Exchange(ref _tickInFlight, 0); + } + } + } + finally + { + // Clear running and tick_in_flight no matter how the + // task exits so a later Start() can spin a fresh task.
+ Interlocked.Exchange(ref _running, 0); + Interlocked.Exchange(ref _tickInFlight, 0); + } + } + + private async Task DoTickAsync() // one tick: rewrite the five streaming fields for a random sample of listed entities + { + var ids = _store.ListEntityIds(500); // NOTE(review): 500 looks like a cap on the ids returned — confirm in FeatureStore + if (ids.Count == 0) return; + var picks = Sample(ids, UsersPerTick); + var nowMs = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(); + long writes = 0; + foreach (var id in picks) + { + var fields = new Dictionary<string, object> + { + ["last_login_ts"] = nowMs, + ["last_device_id"] = Choice(DeviceIds), + ["tx_count_5m"] = Intn(13), // uniform 0..12 + ["failed_logins_15m"] = WeightedInt(FailedLoginBuckets, FailedLoginWeights), + ["session_country"] = Choice(SessionCountries), + }; + await _store.UpdateStreamingAsync(id, fields, _store.StreamingTtlSeconds); + writes += fields.Count; + } + Interlocked.Increment(ref _tickCount); // reached only when every write above succeeded; a throw leaves both counters unchanged + Interlocked.Add(ref _writesCount, writes); + } + + // --------------------------------------------------------------- + // RNG helpers (locked so the worker stays deterministic across + // concurrent toggles). 
+ // --------------------------------------------------------------- + + private List<string> Sample(List<string> items, int k) // up to k distinct picks, drawn without replacement from a copy of items + { + lock (_rngLock) + { + var n = Math.Min(k, items.Count); + var pool = new List<string>(items); + var outList = new List<string>(n); + for (int i = 0; i < n; i++) + { + int idx = _rng.Next(pool.Count); + outList.Add(pool[idx]); + pool.RemoveAt(idx); + } + return outList; + } + } + + private string Choice(string[] items) + { + lock (_rngLock) { return items[_rng.Next(items.Length)]; } + } + + private int Intn(int n) + { + lock (_rngLock) { return _rng.Next(n); } + } + + private int WeightedInt(int[] items, int[] weights) // returns items[i] with probability weights[i] / sum(weights) + { + lock (_rngLock) + { + int total = 0; + foreach (var w in weights) total += w; + int r = _rng.Next(total); + for (int i = 0; i < items.Length; i++) + { + r -= weights[i]; + if (r < 0) return items[i]; + } + return items[^1]; // fallback; not reached when weights are positive and as long as items + } + } +} diff --git a/content/develop/use-cases/feature-store/dotnet/_index.md b/content/develop/use-cases/feature-store/dotnet/_index.md new file mode
100644 index 0000000000..8d8aa55cd0 --- /dev/null +++ b/content/develop/use-cases/feature-store/dotnet/_index.md @@ -0,0 +1,666 @@ +--- +categories: +- docs +- develop +- stack +- oss +- rs +- rc +description: Build a Redis-backed online feature store in .NET with StackExchange.Redis +linkTitle: StackExchange.Redis example (C#) +title: Redis feature store with StackExchange.Redis +weight: 7 +--- + +This guide shows you how to build a small Redis-backed online feature store +in .NET with [StackExchange.Redis]({{< relref "/develop/clients/dotnet" >}}). +The demo runs on top of ASP.NET Core's minimal-API web framework so you can +bulk-load a batch of users with a key-level TTL, run a streaming worker that +overwrites real-time features with per-field TTL, retrieve any subset of +features for one user under 2 ms, and pipeline `HMGET` across a hundred +users for batch scoring. + +## Overview + +Each entity (here, a user) is one Redis +[Hash]({{< relref "/develop/data-types/hashes" >}}) at a deterministic key — +`fs:user:{id}`. The hash holds every feature for that entity as one field per +feature: batch-materialized aggregates (refreshed once a day) alongside +streaming-updated signals (refreshed every few seconds). One +[`HMGET`]({{< relref "/commands/hmget" >}}) returns whichever subset the +model needs in one network round trip. + +Two TTL layers solve the *mixed staleness* problem without an +application-side cleaner: + +* A **key-level** [`EXPIRE`]({{< relref "/commands/expire" >}}) aligned with + the batch materialization cycle (24 hours in the demo). If the batch + refresher fails, the whole entity disappears at the next cycle and + inference sees a missing entity — which the model handler can detect and + fall back on — rather than silently outdated values. +* A **per-field** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) (Redis 7.4+) + on each streaming feature gives that field its own shorter expiry, + independent of the rest of the hash. 
If the streaming pipeline stops + updating a feature, the field self-cleans while the batch fields stay + populated. + +That gives you: + +* A single round trip for retrieval — any subset of features for one entity + in one [`HMGET`]({{< relref "/commands/hmget" >}}). +* Sub-millisecond hot path. The Redis-side work is microseconds; in practice + the bottleneck is the network round trip plus the model's own + feature-prep. +* Pipelined batch scoring — one round trip for `N` users at once. +* Independent freshness per feature, expressed as a server-side TTL rather + than as application logic. +* Self-cleanup on pipeline failure: a stalled batch refresher lets entities + expire on schedule, and a stalled streaming worker lets each affected + field expire on its own timer. + +## How StackExchange.Redis fits the demo + +Three client facts shape the helper: + +* **`ConnectionMultiplexer` is a single, shared, thread-safe object.** One + instance serves the whole process — every HTTP handler in the ASP.NET + Core thread pool and the streaming worker pull an `IDatabase` from the + same multiplexer with `mux.GetDatabase()`. There is no pool to manage and + no per-call connection borrow. +* **`IBatch` is the canonical pipelining handle.** `db.CreateBatch()` + returns a builder; you call the async methods to queue commands (each + returns a `Task` that completes when the batch is flushed), then + `batch.Execute()` ships the lot in one round trip. The pattern is "fire + all the async methods, *then* call Execute, *then* await the Tasks." +* **Per-field TTL is typed.** StackExchange.Redis 2.8+ exposes + `IDatabase.HashFieldExpireAsync` (returns `ExpireResult[]` — an enum + whose values map 1:1 to Redis's HEXPIRE return codes) and + `IDatabase.HashFieldGetTimeToLiveAsync` (returns `long[]` in + milliseconds). The demo pins 2.13.17. 
+ +In this example, the batch features describe a user's longer-term shape +(`country_iso`, `risk_segment`, `account_age_days`, `tx_count_7d`, +`avg_amount_30d`, `chargeback_count_180d`) and are bulk-loaded by the +`BuildFeatures` static class. The streaming features describe what the user +is doing right now (`last_login_ts`, `last_device_id`, `tx_count_5m`, +`failed_logins_15m`, `session_country`) and are written by a `StreamingWorker` +background task. The HTTP handlers in `Program.cs` read any subset of those +features through `FeatureStore`'s helper class. + +## The feature-store helper + +The `FeatureStore` class wraps the read/write paths +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/dotnet/FeatureStore.cs)): + +```csharp +using StackExchange.Redis; +using FeatureStoreDemo; + +var muxOptions = ConfigurationOptions.Parse("localhost:6379"); +muxOptions.AllowAdmin = true; // needed for SCAN via IServer.Keys() +var mux = await ConnectionMultiplexer.ConnectAsync(muxOptions); + +var store = new FeatureStore( + mux, + "fs:user:", + batchTtlSeconds: 24 * 60 * 60, // whole-entity TTL aligned with the daily batch cycle + streamingTtlSeconds: 5 * 60 // per-field TTL on each streaming feature +); + +// Batch materialization: one HSET + EXPIRE per user, all pipelined. +var rows = new Dictionary> +{ + ["u0001"] = new Dictionary + { + ["country_iso"] = "US", ["risk_segment"] = "low", + ["tx_count_7d"] = 14, ["avg_amount_30d"] = 92.40, + ["account_age_days"] = 612, ["chargeback_count_180d"] = 0, + }, +}; +await store.BulkLoadAsync(rows, 24 * 60 * 60); + +// Streaming write: HSET + HEXPIRE on just the fields that changed. 
+await store.UpdateStreamingAsync("u0001", new Dictionary +{ + ["last_login_ts"] = DateTimeOffset.UtcNow.ToUnixTimeMilliseconds(), + ["last_device_id"] = "ios-9f02", + ["tx_count_5m"] = 3, + ["failed_logins_15m"] = 0, + ["session_country"] = "US", +}, 5 * 60); + +// Inference read: HMGET of whatever the model needs. +var features = await store.GetFeaturesAsync("u0001", new[] +{ + "risk_segment", "tx_count_7d", "avg_amount_30d", + "tx_count_5m", "failed_logins_15m", +}); + +// Batch scoring: pipelined HMGET across many users. +var batch = await store.BatchGetFeaturesAsync( + new[] { "u0001", "u0002", "u0003" }, + new[] { "risk_segment", "tx_count_5m", "failed_logins_15m" }); +``` + +### Data model + +Each user is one Redis Hash. Every value is stored as a string — Redis hash +fields are bytes on the wire, so `FeatureStore.EncodeValue` renders +booleans as `"true"` / `"false"` and uses `Object.ToString()` (with +`InvariantCulture` for doubles, so a `92.40` doesn't become `"92,40"` in +locales that use a comma decimal separator). The model server is responsible +for parsing back to the right type, the same way it would when reading any +serialized feature store. + +```text +fs:user:u0001 TTL = 86400 s (key-level) + country_iso=US + risk_segment=low + account_age_days=612 + tx_count_7d=14 + avg_amount_30d=92.40 + chargeback_count_180d=0 + last_login_ts=1716998413541 TTL = 300 s (per field, HEXPIRE) + last_device_id=ios-9f02 TTL = 300 s (per field, HEXPIRE) + tx_count_5m=3 TTL = 300 s (per field, HEXPIRE) + failed_logins_15m=0 TTL = 300 s (per field, HEXPIRE) + session_country=US TTL = 300 s (per field, HEXPIRE) +``` + +### Bulk-loading batch features + +`BulkLoadAsync` queues one `HSET` and one `EXPIRE` per user through an +`IBatch`, then `Execute()` ships the whole batch in one round trip. 
+ +```csharp +public async Task BulkLoadAsync( + IReadOnlyDictionary> rows, + long ttlSeconds) +{ + if (rows.Count == 0) return 0; + var batch = _db.CreateBatch(); + var tasks = new List(rows.Count * 2); + foreach (var (entityId, fields) in rows) + { + var key = (RedisKey)KeyFor(entityId); + var entries = new HashEntry[fields.Count]; + int i = 0; + foreach (var (name, value) in fields) + entries[i++] = new HashEntry(name, EncodeValue(value)); + tasks.Add(batch.HashSetAsync(key, entries)); + tasks.Add(batch.KeyExpireAsync(key, TimeSpan.FromSeconds(ttlSeconds))); + } + batch.Execute(); + await Task.WhenAll(tasks); + ... +} +``` + +Two things worth noticing: + +1. **Call the async methods *before* `Execute()`.** They don't run anything + yet — they just queue the command and return a `Task` that completes + when the batch is flushed. Order matters: a `batch.HashSetAsync(...)` + queued after `batch.Execute()` is not sent with that flush; it stays + queued until `Execute()` is called again, so awaiting its `Task` + before then never completes. +2. **`Task.WhenAll(tasks)` after `Execute()`** is how you wait for the + server to acknowledge the whole batch. Skipping it would leave any + per-command errors (a malformed `EXPIRE`, for example) unobserved + instead of surfacing them from `BulkLoadAsync`. + +In production, the equivalent of this script runs as an offline pipeline +(a Spark or Feast `materialize` job) that reads from the warehouse and +writes into Redis. The +[Feast `RedisOnlineStore`](https://docs.feast.dev/reference/online-stores/redis) +provider does exactly this under the hood; the in-house +[Redis Feature Form]({{< relref "/develop/ai/featureform" >}}) integration +covers the materialize + serve path end-to-end.
+ +### Streaming writes with per-field TTL + +`UpdateStreamingAsync` is the linchpin of the mixed-staleness story: + +```csharp +public async Task UpdateStreamingAsync( + string entityId, + IReadOnlyDictionary fields, + long ttlSeconds) +{ + if (fields.Count == 0) return; + var key = (RedisKey)KeyFor(entityId); + var entries = new HashEntry[fields.Count]; + var names = new RedisValue[fields.Count]; + int i = 0; + foreach (var (name, value) in fields) + { + entries[i] = new HashEntry(name, EncodeValue(value)); + names[i] = name; + i++; + } + + var batch = _db.CreateBatch(); + var hsetTask = batch.HashSetAsync(key, entries); + var hexpireTask = batch.HashFieldExpireAsync( + key, names, TimeSpan.FromSeconds(ttlSeconds)); + batch.Execute(); + await hsetTask; + var codes = await hexpireTask; + foreach (var code in codes) + { + if (code != ExpireResult.Success) + { + throw new InvalidOperationException( + $"HEXPIRE did not set every field TTL for {key}: [{string.Join(",", codes)}]"); + } + } + ... +} +``` + +[`HEXPIRE`]({{< relref "/commands/hexpire" >}}) sets the TTL on +*individual* hash fields, not on the whole key. The two commands are +queued under one `IBatch` so Redis runs them in pipeline order: the +`HSET` first creates or overwrites the fields, then `HEXPIRE` attaches a +TTL to each of those same fields. `HashFieldExpireAsync` returns one +`ExpireResult` per field: + +* `ExpireResult.Success` (= Redis code `1`): TTL set / updated. +* `ExpireResult.Due` (= `2`): the expiry was 0 or in the past, so Redis + deleted the field instead of applying a TTL. +* `ExpireResult.ConditionNotMet` (= `0`): an `NX | XX | GT | LT` + conditional flag was specified and not met (we never use one here). +* `ExpireResult.NoSuchField` (= `-2`): no such field, or no such key.
+ +We always follow `HSET` with `HEXPIRE` so any code other than `Success` +means the per-field TTL invariant didn't hold — the helper throws an +`InvalidOperationException` rather than silently leaving a streaming +field with no expiry attached. + +If a streaming pipeline stops, the streaming fields drop out one by one +as their per-field TTLs elapse. `FieldTtlsSecondsAsync` (which wraps +`HashFieldGetTimeToLiveAsync`) lets the model side inspect the +remaining TTL on any field. Note that the StackExchange.Redis return is +in **milliseconds** — the helper divides by 1000 to match the +`TTL` / `HTTL` second-based convention used by every other client in +this use case (and `redis-cli`). + +> **HEXPIRE requires Redis 7.4 or later.** `HEXPIRE` and the field-level +> TTL commands (`HTTL`, `HPERSIST`, `HEXPIREAT`, `HPEXPIRE`, +> `HPEXPIREAT`, `HPTTL`, `HEXPIRETIME`, `HPEXPIRETIME`) were added in +> Redis 7.4. StackExchange.Redis 2.8 was the first release with the +> typed bindings; the demo pins 2.13.17. + +### Inference reads with HMGET + +`GetFeaturesAsync` is one `HMGET`: + +```csharp +public async Task> GetFeaturesAsync( + string entityId, IReadOnlyList fieldNames) +{ + var key = (RedisKey)KeyFor(entityId); + var out_ = new Dictionary(); + if (fieldNames.Count == 0) return out_; + var values = await _db.HashGetAsync( + key, fieldNames.Select(f => (RedisValue)f).ToArray()); + for (int i = 0; i < fieldNames.Count; i++) + { + if (!values[i].IsNull) + out_[fieldNames[i]] = values[i].ToString(); + } + ... +} +``` + +`db.HashGetAsync(key, RedisValue[] fields)` issues `HMGET` and returns +a `RedisValue[]` aligned with the input order. Missing fields come back +as `RedisValue.Null` (which `IsNull` detects); the helper drops them +from the result dict so the caller sees only the features that actually +exist on the hash. 
+ +### Batch scoring with pipelined HMGET + +For batch inference, the same `HMGET` shape pipelines across users +through one `IBatch`: + +```csharp +public async Task>> BatchGetFeaturesAsync( + IReadOnlyList entityIds, IReadOnlyList fieldNames) +{ + if (entityIds.Count == 0 || fieldNames.Count == 0) + return new Dictionary>(); + + var fieldValues = fieldNames.Select(f => (RedisValue)f).ToArray(); + var batch = _db.CreateBatch(); + var tasks = new Task[entityIds.Count]; + for (int i = 0; i < entityIds.Count; i++) + tasks[i] = batch.HashGetAsync(KeyFor(entityIds[i]), fieldValues); + batch.Execute(); + var rows = await Task.WhenAll(tasks); + ... +} +``` + +One round trip for the whole batch. The demo returns a 30-user batch in +~2 ms against a local Redis after the first-call JIT/connection warm-up. + +A Redis Cluster is different: an `IBatch` is bound to one shard, +because all queued commands ship through one connection to one node. +For batch reads on a cluster, the +[StackExchange.Redis cluster client]({{< relref "/develop/clients/dotnet/connect" >}}) +routes non-batched `HashGetAsync` calls to the right shard +automatically — fan out parallel calls with `Task.WhenAll` and the +multiplexer handles per-shard routing. For tighter control, group +entity IDs by hash slot ahead of time and use one `CreateBatch` per +shard's connection in parallel. A hash tag like `fs:user:{vip}:u0001` +forces a known set of keys onto the same shard so one batch can cover +them all. + +## The streaming worker + +`StreamingWorker.cs` is the demo's stand-in for whatever Flink, Kafka +Streams, or bespoke service computes the real-time features +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs)). +It runs as a background `Task` next to the demo server so the UI can +start, pause, and resume it. 
+ +```csharp +private async Task RunAsync(CancellationToken ct) +{ + try + { + while (!ct.IsCancellationRequested) + { + try { await Task.Delay(_tick, ct); } + catch (OperationCanceledException) { break; } + if (ct.IsCancellationRequested) break; + + // Set tick_in_flight *before* the pause check so a + // concurrent pause+wait can never see tick_in_flight=0 + // in the window between the pause check and the actual + // DoTick call. The finally block clears the flag whether + // we paused, succeeded, or threw. + Interlocked.Exchange(ref _tickInFlight, 1); + try + { + if (Volatile.Read(ref _paused) == 0) + await DoTickAsync(); + } + catch (Exception e) + { + Console.Error.WriteLine($"[streaming-worker] tick failed: {e.Message}"); + } + finally + { + Interlocked.Exchange(ref _tickInFlight, 0); + } + } + } + finally + { + // Clear running and tick_in_flight no matter how the task + // exits so a later Start() can spin a fresh task. + Interlocked.Exchange(ref _running, 0); + Interlocked.Exchange(ref _tickInFlight, 0); + } +} +``` + +The same pre-flight `_tickInFlight` + `finally`-clear pattern as every +other client in this use case closes the pause/in-flight race: a reset +that's about to `DEL` every key calls `worker.Pause()` to stop *future* +ticks *and* `await worker.WaitForIdleAsync()` to flush a mid-flight tick +before issuing the DEL sweep. + +Pausing the worker is what shows off the mixed-staleness behavior: leave +it paused for longer than `StreamingTtlSeconds` and the streaming fields +disappear from every user's hash one by one, while the batch fields +remain under the longer key-level `EXPIRE`. The demo's +`Pause / resume` button lets you see this happen in real time. + +## The batch builder + +`BuildFeatures.cs` is the demo's nightly materializer +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/dotnet/BuildFeatures.cs)). +It generates synthetic feature rows and calls `store.BulkLoadAsync` +once. 
The synthesis itself is not the point — in a real deployment the +equivalent code reads from the offline store (Snowflake, BigQuery, +Iceberg) and writes the resulting hashes into Redis. + +Run the builder on its own (independently of the demo server) to +populate Redis from the command line: + +```bash +dotnet run --project . -- --mode build-features --count 500 --ttl-seconds 3600 +``` + +That writes 500 users at `fs:user:*` with a one-hour key-level TTL, +which is how a typical operator would pre-seed a feature store from the +command line when debugging. + +## The interactive demo + +`Program.cs` runs the ASP.NET Core minimal-API server on port 8091. The +HTML page lets you: + +* **Bulk-load** any number of users (default 200) with a configurable + key-level TTL. +* See the **store state**: user count, batch / streaming TTLs, + cumulative read/write counters. +* See the **streaming worker** status and **pause or resume** it. +* Run an **inference read** for any user with a chosen feature subset, + and see the value, the per-field TTL, and the read latency. +* Run **batch scoring** with a pipelined `HMGET` across `N` users. +* **Inspect** any user's full hash with field-level TTLs and the + key-level TTL. + +The server holds one `FeatureStore`, one `StreamingWorker`, and one +`ConnectionMultiplexer` for the lifetime of the process. Every handler +in the ASP.NET Core thread pool and the streaming worker share that +multiplexer — StackExchange.Redis handles the per-call multiplexing +across the underlying socket. Endpoints: + +| Endpoint | What it does | +|---------------------------|-------------------------------------------------------------------------------------| +| `GET /state` | User count, TTL config, stats counters, worker status. | +| `POST /bulk-load` | Pipelined `HSET` + `EXPIRE` over N synthetic users with a chosen TTL. | +| `POST /worker/toggle` | Pause / resume the streaming worker. 
| +| `POST /read` | `HMGET` a chosen feature subset for one user; report latency and per-field TTLs. | +| `POST /batch-read` | Pipeline `HMGET` across N users; report total latency and per-entity field counts. | +| `GET /inspect` | `HGETALL` + `HTTL` for one user; full hash view with per-field TTLs. | +| `POST /reset` | Drop every user under the key prefix (used by the demo's reset button). | + +## Prerequisites + +* **Redis 7.4 or later.** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) + and [`HTTL`]({{< relref "/commands/httl" >}}) were added in Redis 7.4; + the demo relies on per-field TTL for the mixed-staleness story. +* **.NET 8 SDK or later.** +* **StackExchange.Redis 2.8 or later.** The demo's csproj pins 2.13.17. + Typed bindings for the field-TTL commands ship from 2.8. + +The connection multiplexer is opened with `AllowAdmin = true` because +the demo uses `IServer.Keys()` (SCAN under the hood) to populate UI +dropdowns and to power the reset path. In a production read/write +service you would not enable `AllowAdmin`; instead, maintain an external +index of user IDs (a small Redis Set, say, keyed by tenant) and read it +to discover entities. The demo's `SCAN` use is purely a UI convenience. + +If your Redis server is running elsewhere, start the demo with +`--redis-uri host:port`. + +## Running the demo + +### Get the source files + +The demo lives in a small csproj under +[`feature-store/dotnet`](https://github.com/redis/docs/tree/main/content/develop/use-cases/feature-store/dotnet). +Clone the repo or copy the directory: + +```bash +git clone https://github.com/redis/docs.git +cd docs/content/develop/use-cases/feature-store/dotnet +dotnet build -c Release +``` + +### Start the demo server + +From the project directory: + +```bash +dotnet run -c Release +``` + +You should see: + +```text +Dropping any existing users under 'fs:user:*' for a clean demo run (pass --no-reset to keep them). 
+Redis feature-store demo server listening on http://127.0.0.1:8091 +Using Redis at localhost:6379 with key prefix 'fs:user:' (batch TTL 86400s, streaming TTL 300s) +Materialized 200 user(s); streaming worker running. +``` + +Open [http://127.0.0.1:8091](http://127.0.0.1:8091). Useful things to try: + +* Pick a user and click **Read features** with a mixed batch/streaming + subset — you'll see batch fields with no per-field TTL (covered by the + key-level TTL) and streaming fields with a positive per-field TTL. +* Click **Pipeline HMGET** with `count=100` to see the latency of a + 100-user batch read. +* Click **Pause / resume** on the streaming worker and leave it paused + for ~5 minutes (or restart the server with + `--streaming-ttl-seconds 30` to make it visible in seconds). Re-run + **Read features** on any user and watch the streaming fields + disappear while the batch fields stay. +* Click **Inspect** on a user to see the full hash with field-level + TTLs. +* Click **Reset** to drop every user and start over. + +## Production usage + +The guidance below focuses on the production concerns specific to +running a feature store on Redis. For the generic +StackExchange.Redis production checklist — +[`ConfigurationOptions`]({{< relref "/develop/clients/dotnet/connect" >}}) +tuning, AUTH/ACL, retry/backoff, error handling — see the +[client guide]({{< relref "/develop/clients/dotnet" >}}). For TLS +specifically, follow the +[connect-with-TLS recipe]({{< relref "/develop/clients/dotnet/connect#connect-to-your-production-redis-with-tls" >}}). +The feature-store demo runs against `localhost` with the defaults; a real +deployment should harden the client first. + +### Pick the batch TTL to outlast a failed refresher + +The whole-entity `EXPIRE` is your safety net against silent staleness +from a broken batch pipeline. 
Set it longer than your worst-case batch +outage so a single missed run doesn't take the feature store offline, +but short enough that a sustained outage causes loud failures (missing +entities) rather than quiet ones (yesterday's features being scored as +today's). The standard choice is one cycle of "expected refresh +interval × 2" — for a daily batch, 48 hours; for a 6-hour batch, 12 +hours. + +The same logic applies to the per-field streaming TTL: a few times the +expected update interval so a slow-but-alive streaming worker doesn't +churn features needlessly, but short enough that a stalled worker +causes visible freshness failures. + +### Co-locate the online store with serving, not with training + +The online store's hash representation does *not* have to match the +schema in your offline store. The batch materialization step is your +chance to flatten joins, encode categoricals, and project to whatever +shape the model server wants — so the request path is exactly one +`HMGET` and zero transforms. + +The training pipeline reads from the offline store with its own +schema; the serving pipeline reads from Redis with the flattened +serving schema. Keeping those two pipelines as the same code path is +what prevents training-serving skew. + +### Pipeline batch reads across shards + +On a single Redis instance, an `IBatch` of `HMGET`s across `N` users is +one round trip. A Redis Cluster is different: an `IBatch` is bound to +one shard, so on a cluster you need to either fan out the per-user +`HashGetAsync` calls with `Task.WhenAll` (the multiplexer routes each +one to the right shard) or group entity IDs by hash slot and create +one `IBatch` per shard's connection in parallel. + +A hash tag like `fs:user:{vip}:u0001` forces a known set of keys onto +the same shard so one `IBatch` can cover them all in a single round +trip. 
+ +### Make HEXPIRE part of every streaming write + +The single biggest correctness lever in this design is that the +streaming write applies `HEXPIRE` *every time*. If a streaming worker +writes a field without renewing its TTL, the field carries whatever +expiry was there before — possibly none, possibly stale — and the +mixed-staleness invariant breaks. Keep the `HSET` and `HEXPIRE` in the +same `IBatch` (or, even safer, in the same +[Lua script]({{< relref "/develop/programmability/eval-intro" >}}) if +you don't trust the call site). + +### Avoid HGETALL on the request path + +`HGETALL` reads every field on the hash, including ones the model +doesn't need. With dozens of features per entity, that is wasted +serialization work on the server and wasted bandwidth on the wire. +Always specify the field list explicitly with `HashGetAsync(key, RedisValue[])` +in the model server. + +The exception is debugging and feature-set discovery, where you +genuinely want the full hash. The demo's "Inspect" button uses +`HashGetAllAsync` for exactly this reason. + +### Inspect the store directly with redis-cli + +When testing or troubleshooting, the cli tells you everything: + +```bash +# How many users currently in the store +redis-cli --scan --pattern 'fs:user:*' | wc -l + +# One user's full hash and key-level TTL +redis-cli HGETALL fs:user:u0001 +redis-cli TTL fs:user:u0001 + +# Per-field TTL on the streaming fields +redis-cli HTTL fs:user:u0001 FIELDS 5 \ + last_login_ts last_device_id tx_count_5m failed_logins_15m session_country + +# Sample HMGET as the model would issue it +redis-cli HMGET fs:user:u0001 risk_segment tx_count_7d avg_amount_30d tx_count_5m +``` + +A streaming field that returns `-2` from `HTTL` doesn't exist on the +hash (either it was never written, or it expired); `-1` means the +field has no TTL set (and is therefore covered only by the key-level +`EXPIRE`); any positive value is the remaining TTL in seconds. 
+ +## Learn more + +This example uses the following Redis commands: + +* [`HSET`]({{< relref "/commands/hset" >}}) to write a feature or a + whole feature row in one call. +* [`HMGET`]({{< relref "/commands/hmget" >}}) to retrieve any subset of + features for one entity in one round trip. +* [`HGETALL`]({{< relref "/commands/hgetall" >}}) for debugging and + feature-set discovery. +* [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) for per-field TTL on + streaming features (Redis 7.4+). +* [`EXPIRE`]({{< relref "/commands/expire" >}}) and + [`TTL`]({{< relref "/commands/ttl" >}}) for the whole-entity TTL + aligned with the batch materialization cycle. +* Pipelined `HMGET` across many entities for batch scoring with one + network round trip — see + [transactions and pipelining]({{< relref "/develop/clients/dotnet/transpipe" >}}). + +See the [StackExchange.Redis documentation]({{< relref "/develop/clients/dotnet" >}}) +for the full client reference, and the +[Hashes overview]({{< relref "/develop/data-types/hashes" >}}) for the +deeper conceptual model. diff --git a/content/develop/use-cases/feature-store/rust/Cargo.toml b/content/develop/use-cases/feature-store/rust/Cargo.toml new file mode 100644 index 0000000000..4ea9e05835 --- /dev/null +++ b/content/develop/use-cases/feature-store/rust/Cargo.toml @@ -0,0 +1,28 @@ +[package] +name = "feature-store-demo" +version = "0.1.0" +edition = "2021" + +[lib] +name = "feature_store_demo" +path = "lib.rs" + +[[bin]] +name = "demo_server" +path = "demo_server.rs" + +[[bin]] +name = "build_features" +path = "build_features_bin.rs" + +[dependencies] +# redis-rs 0.27+ ships the tokio + connection-manager features the +# demo uses. HEXPIRE/HTTL are issued via `redis::cmd("HEXPIRE")` +# because the typed bindings don't ship on this client line yet — +# the wire protocol is the same. 
+redis = { version = "0.27", features = ["tokio-comp", "aio", "connection-manager"] } +tokio = { version = "1", features = ["full"] } +axum = "0.7" +serde = { version = "1.0", features = ["derive"] } +serde_json = "1.0" +rand = "0.8" diff --git a/content/develop/use-cases/feature-store/rust/_index.md b/content/develop/use-cases/feature-store/rust/_index.md new file mode 100644 index 0000000000..e3e1ef15fc --- /dev/null +++ b/content/develop/use-cases/feature-store/rust/_index.md @@ -0,0 +1,653 @@ +--- +categories: +- docs +- develop +- stack +- oss +- rs +- rc +description: Build a Redis-backed online feature store in Rust with redis-rs +linkTitle: redis-rs example (Rust) +title: Redis feature store with redis-rs +weight: 6 +--- + +This guide shows you how to build a small Redis-backed online feature store in +Rust with the async [redis-rs]({{< relref "/develop/clients/rust" >}}) crate +and `tokio`. The demo runs on top of the [axum](https://docs.rs/axum/) +web framework so you can bulk-load a batch of users with a key-level TTL, run +a streaming worker that overwrites real-time features with per-field TTL, +retrieve any subset of features for one user under 2 ms, and pipeline `HMGET` +across a hundred users for batch scoring. + +## Overview + +Each entity (here, a user) is one Redis +[Hash]({{< relref "/develop/data-types/hashes" >}}) at a deterministic key — +`fs:user:{id}`. The hash holds every feature for that entity as one field per +feature: batch-materialized aggregates (refreshed once a day) alongside +streaming-updated signals (refreshed every few seconds). One +[`HMGET`]({{< relref "/commands/hmget" >}}) returns whichever subset the model +needs in one network round trip. + +Two TTL layers solve the *mixed staleness* problem without an application-side +cleaner: + +* A **key-level** [`EXPIRE`]({{< relref "/commands/expire" >}}) aligned with the + batch materialization cycle (24 hours in the demo). 
If the batch refresher + fails, the whole entity disappears at the next cycle and inference sees a + missing entity — which the model handler can detect and fall back on — + rather than silently outdated values. +* A **per-field** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) (Redis 7.4+) on + each streaming feature gives that field its own shorter expiry, independent + of the rest of the hash. If the streaming pipeline stops updating a feature, + the field self-cleans while the batch fields stay populated. + +That gives you: + +* A single round trip for retrieval — any subset of features for one entity + in one [`HMGET`]({{< relref "/commands/hmget" >}}). +* Sub-millisecond hot path. The Redis-side work is microseconds; in practice + the bottleneck is the network round trip plus the model's own feature-prep. +* Pipelined batch scoring — one round trip for `N` users at once. +* Independent freshness per feature, expressed as a server-side TTL rather + than as application logic. +* Self-cleanup on pipeline failure: a stalled batch refresher lets entities + expire on schedule, and a stalled streaming worker lets each affected field + expire on its own timer. + +## How redis-rs fits the demo + +Two crate facts shape the helper: + +* **`ConnectionManager` is the canonical async connection.** It owns a + multiplexed `MultiplexedConnection` underneath and transparently reconnects + on a dropped socket. The type is `Clone` — handing it to one HTTP handler, + the streaming worker, and the batch builder is just three `clone()` calls, + and they all share the same underlying connection. There's no pool to + manage. +* **The `redis::cmd("HEXPIRE")` builder is how you reach commands not yet + typed on the `AsyncCommands` trait.** Per-field TTL bindings (`hexpire`, + `httl`, `hpersist`) aren't part of the typed surface on redis-rs 0.27, so + the helper issues them with the generic command builder. The wire bytes + are identical to the typed form. 
+ +In this example, the batch features describe a user's longer-term shape +(`country_iso`, `risk_segment`, `account_age_days`, `tx_count_7d`, +`avg_amount_30d`, `chargeback_count_180d`) and are bulk-loaded by +`build_features.rs` — the demo's stand-in for a nightly Spark / Feast +materialization job. The streaming features describe what the user is doing +right now (`last_login_ts`, `last_device_id`, `tx_count_5m`, +`failed_logins_15m`, `session_country`) and are written by +`streaming_worker.rs` — a tokio task that stands in for a Flink / Kafka +Streams job. The HTTP handlers in `demo_server.rs` read any subset of those +features through `feature_store.rs`'s helper struct. + +## The feature-store helper + +The `FeatureStore` struct wraps the read/write paths +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/rust/feature_store.rs)): + +```rust +use redis::aio::ConnectionManager; +use feature_store_demo::feature_store::{FeatureStore, FeatureMap, FeatureValue}; +use std::collections::BTreeMap; + +let client = redis::Client::open("redis://127.0.0.1/")?; +let conn = ConnectionManager::new(client).await?; +let store = FeatureStore::new( + conn, + "fs:user:", + 24 * 60 * 60, // whole-entity TTL aligned with the daily batch cycle + 5 * 60, // per-field TTL on each streaming feature +); + +// Batch materialization: one HSET + EXPIRE per user, all pipelined +// through one round trip. +let mut row: FeatureMap = BTreeMap::new(); +row.insert("country_iso".into(), FeatureValue::Str("US".into())); +row.insert("risk_segment".into(), FeatureValue::Str("low".into())); +row.insert("tx_count_7d".into(), FeatureValue::Int(14)); +row.insert("avg_amount_30d".into(), FeatureValue::Float(92.40)); +store.bulk_load(&[("u0001".into(), row)], 24 * 60 * 60).await?; + +// Streaming write: HSET + HEXPIRE on just the fields that changed. 
+let mut s: FeatureMap = BTreeMap::new(); +s.insert("last_login_ts".into(), FeatureValue::Int(1716998413541)); +s.insert("tx_count_5m".into(), FeatureValue::Int(3)); +store.update_streaming("u0001", &s, 5 * 60).await?; + +// Inference read: HMGET of whatever the model needs. +let features = store.get_features( + "u0001", + &["risk_segment", "tx_count_7d", "avg_amount_30d", + "tx_count_5m", "last_login_ts"], +).await?; + +// Batch scoring: pipelined HMGET across many users. +let batch = store.batch_get_features( + &["u0001".into(), "u0002".into()], + &["risk_segment", "tx_count_5m"], +).await?; +``` + +### Project layout + +The crate is a small lib + two binaries: + +```text +feature-store/rust/ +├── Cargo.toml +├── lib.rs (pub mod feature_store; pub mod streaming_worker; pub mod build_features;) +├── feature_store.rs — FeatureStore struct + methods +├── streaming_worker.rs — async tokio task worker +├── build_features.rs — SynthesizeUsers + cli_main() +├── demo_server.rs — main() for the demo server (axum) +├── build_features_bin.rs — main() for the CLI builder +└── demo_template.html — HTML page, baked in via include_str! +``` + +Run with `cargo run --release --bin demo_server` or +`cargo run --release --bin build_features -- --count 500`. + +### Data model + +Each user is one Redis Hash. Every value is stored as a string — Redis hash +fields are bytes on the wire, so the helper encodes booleans as `"true"` / +`"false"` and renders numbers via `i64::to_string` / `f64::to_string`. The +model server is responsible for parsing back to the right type, the same way +it would when reading any serialized feature store. 
+ +```text +fs:user:u0001 TTL = 86400 s (key-level) + country_iso=US + risk_segment=low + account_age_days=612 + tx_count_7d=14 + avg_amount_30d=92.40 + chargeback_count_180d=0 + last_login_ts=1716998413541 TTL = 300 s (per field, HEXPIRE) + last_device_id=ios-9f02 TTL = 300 s (per field, HEXPIRE) + tx_count_5m=3 TTL = 300 s (per field, HEXPIRE) + failed_logins_15m=0 TTL = 300 s (per field, HEXPIRE) + session_country=US TTL = 300 s (per field, HEXPIRE) +``` + +### Bulk-loading batch features + +`bulk_load` pipelines one `HSET` and one `EXPIRE` per user into a single +non-transactional batch through `redis::pipe()`. With 500 users that's 1000 +commands in one network call — Redis processes them sequentially on the +server side but the client only pays one RTT. + +```rust +pub async fn bulk_load( + &self, + rows: &[(String, FeatureMap)], + ttl_seconds: u64, +) -> RedisResult { + if rows.is_empty() { return Ok(0); } + let mut pipe = redis::pipe(); + for (entity_id, fields) in rows { + let key = self.key_for(entity_id); + let encoded: Vec<(&str, String)> = fields + .iter().map(|(k, v)| (k.as_str(), v.encode())).collect(); + pipe.hset_multiple(&key, &encoded).ignore(); + pipe.expire(&key, ttl_seconds as i64).ignore(); + } + let mut conn = self.conn.clone(); + pipe.query_async::<()>(&mut conn).await?; + ... +} +``` + +`redis::pipe()` is a non-transactional builder: commands queue up and ship in +one round trip, but they don't run inside a `MULTI/EXEC` block. That's the +right choice here because each user's `HSET` + `EXPIRE` pair is independent +of every other user's, and an all-or-nothing transaction would block the +server for the duration of the batch. For the rare case where the pair has +to be inseparable, swap to `redis::pipe().atomic()` (which wraps in +`MULTI/EXEC`) or a Lua script via +[`EVAL`]({{< relref "/commands/eval" >}}) / +[Eval scripting]({{< relref "/develop/programmability/eval-intro" >}}). 
+ +In production, the equivalent of this script runs as an offline pipeline +(a Spark or Feast `materialize` job) that reads from the warehouse and +writes into Redis. The +[Feast `RedisOnlineStore`](https://docs.feast.dev/reference/online-stores/redis) +provider does exactly this under the hood; the in-house +[Redis Feature Form]({{< relref "/develop/ai/featureform" >}}) integration +covers the materialize + serve path end-to-end. + +### Streaming writes with per-field TTL + +`update_streaming` is the linchpin of the mixed-staleness story: + +```rust +pub async fn update_streaming( + &self, + entity_id: &str, + fields: &FeatureMap, + ttl_seconds: u64, +) -> RedisResult<()> { + if fields.is_empty() { return Ok(()); } + let key = self.key_for(entity_id); + let encoded: Vec<(&str, String)> = fields.iter() + .map(|(k, v)| (k.as_str(), v.encode())).collect(); + let names: Vec<&str> = fields.keys().map(|s| s.as_str()).collect(); + + let mut pipe = redis::pipe(); + pipe.hset_multiple(&key, &encoded).ignore(); + // HEXPIRE wire form: HEXPIRE key seconds FIELDS count field... + let mut hexpire = redis::cmd("HEXPIRE"); + hexpire.arg(&key).arg(ttl_seconds).arg("FIELDS").arg(names.len()); + for n in &names { hexpire.arg(n); } + pipe.add_command(hexpire); + + let mut conn = self.conn.clone(); + // Pipeline returns one entry per non-ignored command; HSET's + // reply was dropped with .ignore(), so the only remaining entry + // is HEXPIRE's per-field code list. + let pipe_result: Vec<Vec<i64>> = pipe.query_async(&mut conn).await?; + let codes = pipe_result.into_iter().next().unwrap_or_default(); + for code in &codes { + if *code != 1 { + return Err(redis::RedisError::from(( + redis::ErrorKind::ResponseError, + "HEXPIRE invariant violated", + format!("HEXPIRE did not set every field TTL for {key}: {codes:?}"), + ))); + } + } + ... +} +``` + +[`HEXPIRE`]({{< relref "/commands/hexpire" >}}) sets the TTL on *individual* +hash fields, not on the whole key. 
The two commands are queued under one +flush so Redis runs them in pipeline order: the `HSET` first creates or +overwrites the fields, then `HEXPIRE` attaches a TTL to each of those same +fields. `HEXPIRE` returns one status code per field — `1` if the TTL was +set, `2` if the expiry was 0 or in the past (so Redis deleted the field +instead), `0` if an `NX | XX | GT | LT` conditional flag was specified and +not met (we never use one here), `-2` if the field doesn't exist on the +key. The helper returns a `RedisError` if any code is anything other than +`1`, so the "every streaming write renews its TTL" invariant fails loudly +rather than silently leaving a streaming field with no expiry attached. + +The pipeline reply shape — `Vec<Vec<i64>>` — is the one tricky bit. redis-rs +wraps each non-ignored command's reply in the outer `Vec`, even when there +is only one such command. The HEXPIRE reply itself is an array, so we +end up with one outer `Vec` containing one inner `Vec` of codes. + +If a streaming pipeline stops, the streaming fields drop out one by one as +their per-field TTLs elapse. `field_ttls_seconds` (which wraps `HTTL`) lets +the model side inspect the remaining TTL on any field — useful both for +debugging and as a freshness signal in the model itself. + +> **HEXPIRE requires Redis 7.4 or later.** `HEXPIRE` and the field-level +> TTL commands (`HTTL`, `HPERSIST`, `HEXPIREAT`, `HPEXPIRE`, `HPEXPIREAT`, +> `HPTTL`, `HEXPIRETIME`, `HPEXPIRETIME`) were added in Redis 7.4. The +> demo's `Cargo.toml` pins `redis = "0.27"` and uses +> `redis::cmd("HEXPIRE")` because the typed binding doesn't ship on that +> client line yet — the wire bytes are identical. 
+ +### Inference reads with HMGET + +`get_features` is one `HMGET`: + +```rust +pub async fn get_features( + &self, + entity_id: &str, + field_names: &[&str], +) -> RedisResult<BTreeMap<String, String>> { + if field_names.is_empty() { return Ok(BTreeMap::new()); } + let key = self.key_for(entity_id); + let mut conn = self.conn.clone(); + let values: Vec<Option<String>> = conn.hget(&key, field_names).await?; + let mut out = BTreeMap::new(); + for (n, v) in field_names.iter().zip(values.into_iter()) { + if let Some(s) = v { out.insert((*n).to_string(), s); } + } + ... +} +``` + +`conn.hget` with a slice of field names is redis-rs's way of issuing +`HMGET` (the typed `hmget` and `hget(slice)` produce the same wire bytes). +The reply is `Vec<Option<String>>` — fields that don't exist on the hash +come back as `None`, which the helper drops from the result map. + +### Batch scoring with pipelined HMGET + +For batch inference, the same `HMGET` shape pipelines across users: + +```rust +pub async fn batch_get_features( + &self, + entity_ids: &[String], + field_names: &[&str], +) -> RedisResult<BTreeMap<String, BTreeMap<String, String>>> { + if entity_ids.is_empty() || field_names.is_empty() { + return Ok(BTreeMap::new()); + } + let mut pipe = redis::pipe(); + for id in entity_ids { + pipe.hget(self.key_for(id), field_names); + } + let mut conn = self.conn.clone(); + let rows: Vec<Vec<Option<String>>> = pipe.query_async(&mut conn).await?; + ... +} +``` + +One round trip for the whole batch. The demo returns a 30-user batch in +under 1 ms against a local Redis. + +A Redis Cluster is different: a single `redis::pipe()` is bound to one +connection, and a `ConnectionManager` holds one connection to one node. 
+For batch reads on a cluster, use redis-rs's +[`cluster_async`](https://docs.rs/redis/0.27/redis/cluster_async/index.html) +client and either fan out parallel `hget` calls (the cluster client routes +each one to the right shard) or, for tighter control, group entity IDs by +hash slot and run one pipeline per shard in parallel. 
A hash tag like +`fs:user:{vip}:u0001` forces a known set of keys onto the same shard so +one pipeline can cover them all in a single round trip. + +## The streaming worker + +`streaming_worker.rs` is the demo's stand-in for whatever Flink, Kafka +Streams, or bespoke service computes the real-time features +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/rust/streaming_worker.rs)). +It runs as a tokio task next to the demo server so the UI can start, +pause, and resume it. + +```rust +async fn run(state: Arc<State>) { + struct Guard<'a>(&'a State); + impl Drop for Guard<'_> { + fn drop(&mut self) { + // Clear running and tick_in_flight no matter how the + // task exits — a panic, a manual stop, anything. + self.0.running.store(false, Ordering::Relaxed); + self.0.tick_in_flight.store(false, Ordering::Relaxed); + } + } + let _guard = Guard(&state); + + let mut interval = time::interval(state.tick); + interval.set_missed_tick_behavior(time::MissedTickBehavior::Skip); + interval.tick().await; // skip the first immediate tick + + loop { + if state.stop.load(Ordering::Relaxed) { return; } + interval.tick().await; + + // Set tick_in_flight *before* the pause check so a concurrent + // pause()+wait_for_idle() can never see tick_in_flight=false + // in the window between the pause check and the actual + // do_tick call. 
+ state.tick_in_flight.store(true, Ordering::Relaxed); + let result = if !state.paused.load(Ordering::Relaxed) { + do_tick(&state).await + } else { Ok(()) }; + state.tick_in_flight.store(false, Ordering::Relaxed); + if let Err(e) = result { + eprintln!("[streaming-worker] tick failed: {e}"); + } + } +} +``` + +The same pre-flight-`tick_in_flight` + drop-`Guard` pattern as every other +client in this use case closes the pause/in-flight race: a reset that's +about to `DEL` every key calls `worker.pause()` to stop *future* ticks +*and* `worker.wait_for_idle().await` to flush a mid-flight tick before +issuing the DEL sweep. 
+ +Pausing the worker is what shows off the mixed-staleness behavior: leave +it paused for longer than `streaming_ttl_seconds` and the streaming fields +disappear from every user's hash one by one, while the batch fields remain +under the longer key-level `EXPIRE`. The demo's `Pause / resume` button +lets you see this happen in real time. + +## The batch builder + +`build_features.rs` is the demo's nightly materializer +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/rust/build_features.rs)). +It generates synthetic feature rows and calls `store.bulk_load` once. The +synthesis itself is not the point — in a real deployment the equivalent +code reads from the offline store (Snowflake, BigQuery, Iceberg) and writes +the resulting hashes into Redis. + +Run the builder on its own (independently of the demo server) to populate +Redis from the command line: + +```bash +cargo run --release --bin build_features -- --count 500 --ttl-seconds 3600 +``` + +That writes 500 users at `fs:user:*` with a one-hour key-level TTL, which +is how a typical operator would pre-seed a feature store from the command +line when debugging. + +## The interactive demo + +`demo_server.rs` runs the axum HTTP server on port 8090. The HTML page lets +you: + +* **Bulk-load** any number of users (default 200) with a configurable + key-level TTL. +* See the **store state**: user count, batch / streaming TTLs, cumulative + read/write counters. +* See the **streaming worker** status and **pause or resume** it. +* Run an **inference read** for any user with a chosen feature subset, and + see the value, the per-field TTL, and the read latency. +* Run **batch scoring** with a pipelined `HMGET` across `N` users. +* **Inspect** any user's full hash with field-level TTLs and the key-level + TTL. + +The server holds one `FeatureStore` and one `StreamingWorker` for the +lifetime of the process. 
Both wrap clones of the same `ConnectionManager`, +so every HTTP handler and the streaming worker share the underlying +multiplexed socket. Endpoints: + +| Endpoint | What it does | +|---------------------------|-------------------------------------------------------------------------------------| +| `GET /state` | User count, TTL config, stats counters, worker status. | +| `POST /bulk-load` | Pipelined `HSET` + `EXPIRE` over N synthetic users with a chosen TTL. | +| `POST /worker/toggle` | Pause / resume the streaming worker. | +| `POST /read` | `HMGET` a chosen feature subset for one user; report latency and per-field TTLs. | +| `POST /batch-read` | Pipeline `HMGET` across N users; report total latency and per-entity field counts. | +| `GET /inspect` | `HGETALL` + `HTTL` for one user; full hash view with per-field TTLs. | +| `POST /reset` | Drop every user under the key prefix (used by the demo's reset button). | + +## Prerequisites + +* **Redis 7.4 or later.** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) were added in Redis 7.4; the + demo relies on per-field TTL for the mixed-staleness story. +* **Rust 1.75 or later.** The demo uses `async fn` in traits (stabilized in + Rust 1.75), `let-else`, and other recent ergonomics. Earlier stable Rust + may compile after small tweaks. +* **redis-rs 0.27 or later.** The demo's `Cargo.toml` pins 0.27 with the + `tokio-comp`, `aio`, and `connection-manager` features. Per-field TTL + commands are issued via `redis::cmd("HEXPIRE")`. + +If your Redis server is running elsewhere, start the demo with +`--redis-url redis://host:port/`. + +## Running the demo + +### Get the source files + +The demo lives in a small Cargo project under +[`feature-store/rust`](https://github.com/redis/docs/tree/main/content/develop/use-cases/feature-store/rust). 
+Clone the repo or copy the directory: + +```bash +git clone https://github.com/redis/docs.git +cd docs/content/develop/use-cases/feature-store/rust +cargo build --release +``` + +### Start the demo server + +From the project directory: + +```bash +cargo run --release --bin demo_server +``` + +You should see: + +```text +Dropping any existing users under 'fs:user:*' for a clean demo run (pass --no-reset to keep them). +Redis feature-store demo server listening on http://127.0.0.1:8090 +Using Redis at redis://127.0.0.1/ with key prefix 'fs:user:' (batch TTL 86400s, streaming TTL 300s) +Materialized 200 user(s); streaming worker running. +``` + +Open [http://127.0.0.1:8090](http://127.0.0.1:8090). Useful things to try: + +* Pick a user and click **Read features** with a mixed batch/streaming + subset — you'll see batch fields with no per-field TTL (covered by the + key-level TTL) and streaming fields with a positive per-field TTL. +* Click **Pipeline HMGET** with `count=100` to see the latency of a + 100-user batch read. +* Click **Pause / resume** on the streaming worker and leave it paused + for ~5 minutes (or restart the server with + `--streaming-ttl-seconds 30` to make it visible in seconds). Re-run + **Read features** on any user and watch the streaming fields disappear + while the batch fields stay. +* Click **Inspect** on a user to see the full hash with field-level TTLs. +* Click **Reset** to drop every user and start over. + +## Production usage + +The guidance below focuses on the production concerns specific to running +a feature store on Redis. For the generic redis-rs production checklist +— TLS, AUTH, retry/backoff, error handling — see the +[redis-rs client guide]({{< relref "/develop/clients/rust" >}}) and the +[error-handling notes]({{< relref "/develop/clients/rust/error-handling" >}}). +The feature-store demo runs against `localhost` with the defaults; a real +deployment should harden the client first. 
+ +### Pick the batch TTL to outlast a failed refresher + +The whole-entity `EXPIRE` is your safety net against silent staleness +from a broken batch pipeline. Set it longer than your worst-case batch +outage so a single missed run doesn't take the feature store offline, +but short enough that a sustained outage causes loud failures (missing +entities) rather than quiet ones (yesterday's features being scored as +today's). The standard choice is one cycle of "expected refresh +interval × 2" — for a daily batch, 48 hours; for a 6-hour batch, 12 +hours. + +The same logic applies to the per-field streaming TTL: a few times the +expected update interval so a slow-but-alive streaming worker doesn't +churn features needlessly, but short enough that a stalled worker +causes visible freshness failures. + +### Co-locate the online store with serving, not with training + +The online store's hash representation does *not* have to match the +schema in your offline store. The batch materialization step is your +chance to flatten joins, encode categoricals, and project to whatever +shape the model server wants — so the request path is exactly one +`HMGET` and zero transforms. + +The training pipeline reads from the offline store with its own schema; +the serving pipeline reads from Redis with the flattened serving +schema. Keeping those two pipelines as the same code path is what +prevents training-serving skew. + +### Pipeline batch reads across shards + +On a single Redis instance, a pipelined `HMGET` across `N` users is one +round trip. A Redis Cluster is different: a single `redis::pipe()` ships +through one connection to one node, so on a cluster you need redis-rs's +[`cluster_async`](https://docs.rs/redis/0.27/redis/cluster_async/index.html) +client. Either fan out parallel `hget` calls (the cluster client routes +each one to the right shard) or group entity IDs by hash slot and issue +one pipeline against each shard in parallel. 
+ +A hash tag like `fs:user:{vip}:u0001` forces a known set of keys onto +the same shard so one pipeline can cover them all in a single round +trip. + +### Make HEXPIRE part of every streaming write + +The single biggest correctness lever in this design is that the +streaming write applies `HEXPIRE` *every time*. If a streaming worker +writes a field without renewing its TTL, the field carries whatever +expiry was there before — possibly none, possibly stale — and the +mixed-staleness invariant breaks. Keep the `HSET` and `HEXPIRE` in the +same pipeline (or, even safer, in the same +[Lua script]({{< relref "/develop/programmability/eval-intro" >}}) if +you don't trust the call site). + +### Avoid HGETALL on the request path + +`HGETALL` reads every field on the hash, including ones the model +doesn't need. With dozens of features per entity, that is wasted +serialization work on the server and wasted bandwidth on the wire. +Always specify the field list explicitly with `hget(&[...])` (or the +typed `hmget`) in the model server. + +The exception is debugging and feature-set discovery, where you +genuinely want the full hash. The demo's "Inspect" button uses +`hgetall` for exactly this reason. 
+ +### Inspect the store directly with redis-cli + +When testing or troubleshooting, the cli tells you everything: + +```bash +# How many users currently in the store +redis-cli --scan --pattern 'fs:user:*' | wc -l + +# One user's full hash and key-level TTL +redis-cli HGETALL fs:user:u0001 +redis-cli TTL fs:user:u0001 + +# Per-field TTL on the streaming fields +redis-cli HTTL fs:user:u0001 FIELDS 5 \ + last_login_ts last_device_id tx_count_5m failed_logins_15m session_country + +# Sample HMGET as the model would issue it +redis-cli HMGET fs:user:u0001 risk_segment tx_count_7d avg_amount_30d tx_count_5m +``` + +A streaming field that returns `-2` from `HTTL` doesn't exist on the +hash (either it was never written, or it expired); `-1` means the field +has no TTL set (and is therefore covered only by the key-level +`EXPIRE`); any positive value is the remaining TTL in seconds. + +## Learn more + +This example uses the following Redis commands: + +* [`HSET`]({{< relref "/commands/hset" >}}) to write a feature or a whole + feature row in one call. +* [`HMGET`]({{< relref "/commands/hmget" >}}) to retrieve any subset of + features for one entity in one round trip. +* [`HGETALL`]({{< relref "/commands/hgetall" >}}) for debugging and + feature-set discovery. +* [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) for per-field TTL on + streaming features (Redis 7.4+). +* [`EXPIRE`]({{< relref "/commands/expire" >}}) and + [`TTL`]({{< relref "/commands/ttl" >}}) for the whole-entity TTL + aligned with the batch materialization cycle. +* Pipelined `HMGET` across many entities for batch scoring with one + network round trip — see + [transactions and pipelining]({{< relref "/develop/clients/rust/transpipe" >}}). 
+ +See the [redis-rs documentation]({{< relref "/develop/clients/rust" >}}) +for the full client reference, and the +[Hashes overview]({{< relref "/develop/data-types/hashes" >}}) for the +deeper conceptual model — including the listpack encoding that makes +small hashes particularly compact in memory, which matters at +feature-store scale. diff --git a/content/develop/use-cases/feature-store/rust/build_features.rs b/content/develop/use-cases/feature-store/rust/build_features.rs new file mode 100644 index 0000000000..e6bdf5b637 --- /dev/null +++ b/content/develop/use-cases/feature-store/rust/build_features.rs @@ -0,0 +1,116 @@ +//! Synthesize a small batch of users with realistic-looking features +//! and bulk-load them into Redis with a 24-hour key-level TTL. +//! +//! Stands in for the nightly Spark / Feast materialization job in a +//! real deployment. In production the equivalent of this script lives +//! in an offline pipeline that reads from the offline store and +//! writes the serving-time hashes into Redis via `HSET` + `EXPIRE`. + +use std::collections::BTreeMap; + +use rand::rngs::StdRng; +use rand::{Rng, SeedableRng}; + +use crate::feature_store::{FeatureMap, FeatureValue}; + +const COUNTRY_CHOICES: &[&str] = &[ + "US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL", +]; +const RISK_SEGMENTS: &[&str] = &["low", "medium", "high"]; +const RISK_WEIGHTS: &[u32] = &[70, 25, 5]; +const CHARGEBACK_BUCKETS: &[i64] = &[0, 1, 2, 3]; +const CHARGEBACK_WEIGHTS: &[u32] = &[85, 10, 4, 1]; + +/// Generate `count` synthetic user feature rows. The shape mirrors a +/// small fraud-scoring feature set: country and risk segment as +/// TAG-like categorical features, plus a few numeric aggregates over +/// recent windows. 
+pub fn synthesize_users(count: usize, seed: u64) -> Vec<(String, FeatureMap)> {
+    // A seeded StdRng makes the generated rows reproducible for a given seed.
+    let mut rng = StdRng::seed_from_u64(seed);
+    let mut users = Vec::with_capacity(count);
+    for i in 1..=count {
+        // Entity ids are 1-based and zero-padded to four digits: u0001, u0002, ...
+        let uid = format!("u{:04}", i);
+        let mut row: FeatureMap = BTreeMap::new();
+        row.insert("country_iso".into(), FeatureValue::Str(
+            COUNTRY_CHOICES[rng.gen_range(0..COUNTRY_CHOICES.len())].into()));
+        row.insert("risk_segment".into(), FeatureValue::Str(
+            weighted_str(&mut rng, RISK_SEGMENTS, RISK_WEIGHTS).into()));
+        row.insert("account_age_days".into(), FeatureValue::Int(rng.gen_range(7..=2400)));
+        row.insert("tx_count_7d".into(), FeatureValue::Int(rng.gen_range(0..=80)));
+        // Round the sampled amount to two decimal places.
+        let avg = (rng.gen_range(5.0..350.0_f64) * 100.0).round() / 100.0;
+        row.insert("avg_amount_30d".into(), FeatureValue::Float(avg));
+        row.insert(
+            "chargeback_count_180d".into(),
+            FeatureValue::Int(weighted_int(&mut rng, CHARGEBACK_BUCKETS, CHARGEBACK_WEIGHTS)),
+        );
+        users.push((uid, row));
+    }
+    users
+}
+
+/// Pick one entry of `items` with probability proportional to the entry at
+/// the same index in `weights`. Draws a value in `0..sum(weights)` and walks
+/// the weights, subtracting each one until the draw falls inside a bucket.
+/// `gen_range` panics on an empty range, so `weights` must not sum to zero.
+fn weighted_str<R: Rng>(rng: &mut R, items: &[&'static str], weights: &[u32]) -> &'static str {
+    let total: u32 = weights.iter().sum();
+    let mut r = rng.gen_range(0..total);
+    for (i, w) in weights.iter().enumerate() {
+        if r < *w { return items[i]; }
+        r -= w;
+    }
+    // Fallback for the case where the loop did not return.
+    items[items.len() - 1]
+}
+
+/// Integer counterpart of `weighted_str`: same weighted draw, but over a
+/// slice of `i64` buckets.
+fn weighted_int<R: Rng>(rng: &mut R, items: &[i64], weights: &[u32]) -> i64 {
+    let total: u32 = weights.iter().sum();
+    let mut r = rng.gen_range(0..total);
+    for (i, w) in weights.iter().enumerate() {
+        if r < *w { return items[i]; }
+        r -= w;
+    }
+    // Fallback for the case where the loop did not return.
+    items[items.len() - 1]
+}
+
+/// CLI entry point invoked by `build_features_bin.rs`. Parses flags,
+/// opens a one-shot `ConnectionManager`, and calls `bulk_load`.
+pub async fn cli_main() -> redis::RedisResult<()> {
+    /// Return the value that follows the flag at `args[i]`. A flag given as
+    /// the last argument has no value; report that and exit with the same
+    /// status as an unknown argument instead of panicking on an
+    /// out-of-bounds index.
+    fn value_for(args: &[String], i: usize) -> &str {
+        match args.get(i + 1) {
+            Some(value) => value.as_str(),
+            None => {
+                eprintln!("Missing value for argument: {}", args[i]);
+                std::process::exit(2);
+            }
+        }
+    }
+
+    // Defaults used when the corresponding flag is absent.
+    let mut redis_url = "redis://127.0.0.1/".to_string();
+    let mut count: usize = 200;
+    let mut ttl_seconds: u64 = 24 * 60 * 60;
+    let mut key_prefix = "fs:user:".to_string();
+    let mut seed: u64 = 42;
+
+    let args: Vec<String> = std::env::args().skip(1).collect();
+    let mut i = 0usize;
+    while i < args.len() {
+        // Every value-taking flag consumes two argv slots. A numeric value
+        // that fails to parse leaves the current default in place.
+        match args[i].as_str() {
+            "--redis-url" => { redis_url = value_for(&args, i).to_string(); i += 2; }
+            "--count" => { count = value_for(&args, i).parse().unwrap_or(count); i += 2; }
+            "--ttl-seconds" => { ttl_seconds = value_for(&args, i).parse().unwrap_or(ttl_seconds); i += 2; }
+            "--key-prefix" => { key_prefix = value_for(&args, i).to_string(); i += 2; }
+            "--seed" => { seed = value_for(&args, i).parse().unwrap_or(seed); i += 2; }
+            "-h" | "--help" => {
+                println!("Usage: build_features [--redis-url URL] [--count N] [--ttl-seconds S] [--key-prefix PREFIX] [--seed N]");
+                return Ok(());
+            }
+            other => {
+                eprintln!("Unknown argument: {other}");
+                std::process::exit(2);
+            }
+        }
+    }
+
+    let client = redis::Client::open(redis_url.as_str())?;
+    let conn = redis::aio::ConnectionManager::new(client).await?;
+    // The same TTL is passed to the store constructor and to bulk_load below.
+    let store = crate::feature_store::FeatureStore::new(
+        conn,
+        &key_prefix,
+        ttl_seconds,
+        crate::feature_store::DEFAULT_STREAMING_TTL_SECONDS,
+    );
+
+    let rows = synthesize_users(count, seed);
+    let loaded = store.bulk_load(&rows, ttl_seconds).await?;
+    println!(
+        "Materialized {} users at {}* with a {}s key-level TTL.",
+        loaded, key_prefix, ttl_seconds
+    );
+    Ok(())
+}
diff --git a/content/develop/use-cases/feature-store/rust/build_features_bin.rs b/content/develop/use-cases/feature-store/rust/build_features_bin.rs new file mode 100644 index 0000000000..72e7776e16 --- /dev/null +++ b/content/develop/use-cases/feature-store/rust/build_features_bin.rs @@ -0,0 +1,14 @@ +//! Tiny shim that runs the batch materializer in the +//! ``feature_store_demo`` library. Run with: +//! +//! 
cargo run --release --bin build_features -- --count 500 --ttl-seconds 3600 +fn main() { + let rt = tokio::runtime::Builder::new_multi_thread() + .enable_all() + .build() + .expect("tokio runtime"); + if let Err(e) = rt.block_on(feature_store_demo::build_features::cli_main()) { + eprintln!("build_features failed: {e}"); + std::process::exit(1); + } +} diff --git a/content/develop/use-cases/feature-store/rust/demo_server.rs b/content/develop/use-cases/feature-store/rust/demo_server.rs new file mode 100644 index 0000000000..9d6e625a30 --- /dev/null +++ b/content/develop/use-cases/feature-store/rust/demo_server.rs @@ -0,0 +1,466 @@ +//! Redis feature-store demo server (Rust + redis-rs + axum + tokio). +//! +//! Run with `cargo run --release --bin demo_server` and visit +//! to watch an online feature store at work: +//! a batch materialization loads N users with a 24-hour key-level +//! TTL, a streaming worker overwrites a handful of users' real-time +//! features every second with a per-field `HEXPIRE`, and the +//! inference panel reads any subset of features for any user with +//! `HMGET` in a single round trip. + +use std::net::SocketAddr; +use std::sync::Arc; +use std::time::{Duration, Instant}; + +use axum::{ + body::Bytes, + extract::{Query, State}, + http::StatusCode, + response::{Html, IntoResponse, Json}, + routing::{get, post}, + Router, +}; +use serde::{Deserialize, Serialize}; +use tokio::sync::Mutex; + +use feature_store_demo::build_features::synthesize_users; +use feature_store_demo::feature_store::{ + FeatureStore, Stats, DEFAULT_BATCH_FIELDS, DEFAULT_STREAMING_FIELDS, +}; +use feature_store_demo::streaming_worker::{StreamingWorker, WorkerStats}; + +#[derive(Clone)] +struct AppState { + store: FeatureStore, + worker: StreamingWorker, + key_prefix: String, + /// Serializes materialize / reset / toggle-worker against each + /// other so the streaming-worker pause-and-wait-idle dance can't + /// race with a concurrent bulk-load. 
+ demo_lock: Arc>, + seed: u64, +} + +#[derive(Debug, Clone)] +struct Args { + host: String, + port: u16, + redis_url: String, + key_prefix: String, + batch_ttl_seconds: u64, + streaming_ttl_seconds: u64, + users_per_tick: usize, + seed_users: usize, + reset_on_start: bool, +} + +impl Default for Args { + fn default() -> Self { + Self { + host: "127.0.0.1".into(), + port: 8090, + redis_url: "redis://127.0.0.1/".into(), + key_prefix: "fs:user:".into(), + batch_ttl_seconds: 24 * 60 * 60, + streaming_ttl_seconds: 5 * 60, + users_per_tick: 5, + seed_users: 200, + reset_on_start: true, + } + } +} + +fn parse_args() -> Args { + let mut a = Args::default(); + let argv: Vec = std::env::args().skip(1).collect(); + let mut i = 0usize; + while i < argv.len() { + match argv[i].as_str() { + "--host" => { a.host = argv[i + 1].clone(); i += 2; } + "--port" => { a.port = argv[i + 1].parse().unwrap_or(a.port); i += 2; } + "--redis-url" => { a.redis_url = argv[i + 1].clone(); i += 2; } + "--key-prefix" => { a.key_prefix = argv[i + 1].clone(); i += 2; } + "--batch-ttl-seconds" => { a.batch_ttl_seconds = argv[i + 1].parse().unwrap_or(a.batch_ttl_seconds); i += 2; } + "--streaming-ttl-seconds" => { a.streaming_ttl_seconds = argv[i + 1].parse().unwrap_or(a.streaming_ttl_seconds); i += 2; } + "--users-per-tick" => { a.users_per_tick = argv[i + 1].parse().unwrap_or(a.users_per_tick); i += 2; } + "--seed-users" => { a.seed_users = argv[i + 1].parse().unwrap_or(a.seed_users); i += 2; } + "--no-reset" => { a.reset_on_start = false; i += 1; } + "-h" | "--help" => { + println!("Usage: demo_server [--host H] [--port P] [--redis-url URL] [--key-prefix PFX] [--batch-ttl-seconds S] [--streaming-ttl-seconds S] [--users-per-tick N] [--seed-users N] [--no-reset]"); + std::process::exit(0); + } + other => { + eprintln!("Unknown argument: {other}"); + std::process::exit(2); + } + } + } + a +} + +#[tokio::main] +async fn main() -> Result<(), Box> { + let args = parse_args(); + let client = 
redis::Client::open(args.redis_url.as_str())?; + let conn = redis::aio::ConnectionManager::new(client).await?; + let store = FeatureStore::new( + conn, + args.key_prefix.clone(), + args.batch_ttl_seconds, + args.streaming_ttl_seconds, + ); + + if args.reset_on_start { + println!( + "Dropping any existing users under '{}*' for a clean demo run (pass --no-reset to keep them).", + args.key_prefix + ); + store.reset().await?; + store.reset_stats(); + } + + let rows = synthesize_users(args.seed_users, 42); + let seeded = store.bulk_load(&rows, args.batch_ttl_seconds).await?; + + let worker = StreamingWorker::new( + store.clone(), + Duration::from_secs(1), + args.users_per_tick, + 1337, + ); + worker.start().await; + + let state = AppState { + store, + worker, + key_prefix: args.key_prefix.clone(), + demo_lock: Arc::new(Mutex::new(())), + seed: 42, + }; + + let app = Router::new() + .route("/", get(index)) + .route("/state", get(get_state)) + .route("/inspect", get(inspect)) + .route("/bulk-load", post(bulk_load)) + .route("/reset", post(reset)) + .route("/worker/toggle", post(toggle_worker)) + .route("/read", post(read)) + .route("/batch-read", post(batch_read)) + .with_state(state); + + let addr: SocketAddr = format!("{}:{}", args.host, args.port).parse()?; + println!("Redis feature-store demo server listening on http://{}", addr); + println!( + "Using Redis at {} with key prefix '{}' (batch TTL {}s, streaming TTL {}s)", + args.redis_url, args.key_prefix, args.batch_ttl_seconds, args.streaming_ttl_seconds + ); + println!("Materialized {} user(s); streaming worker running.", seeded); + + let listener = tokio::net::TcpListener::bind(addr).await?; + axum::serve(listener, app).await?; + Ok(()) +} + +// --------------------------------------------------------------- +// Handlers +// --------------------------------------------------------------- + +async fn index(State(state): State) -> Html { + Html(render_html_page(&state.key_prefix, + state.store.streaming_ttl_seconds(), 
+ state.worker.users_per_tick())) +} + +#[derive(Serialize)] +struct StateResponse { + key_prefix: String, + batch_ttl_seconds: u64, + streaming_ttl_seconds: u64, + entity_count: i64, + entity_ids: Vec, + stats: Stats, + worker: WorkerStats, +} + +async fn get_state(State(state): State) -> impl IntoResponse { + let ids = state.store.list_entity_ids(500).await.unwrap_or_default(); + let count = state.store.count_entities().await.unwrap_or(0); + Json(StateResponse { + key_prefix: state.store.key_prefix().to_string(), + batch_ttl_seconds: state.store.batch_ttl_seconds(), + streaming_ttl_seconds: state.store.streaming_ttl_seconds(), + entity_count: count, + entity_ids: ids, + stats: state.store.stats(), + worker: state.worker.stats(), + }) +} + +#[derive(Deserialize)] +struct InspectParams { user: Option } + +async fn inspect( + State(state): State, + Query(params): Query, +) -> impl IntoResponse { + let user = params.user.unwrap_or_default(); + if user.is_empty() { + return (StatusCode::BAD_REQUEST, + Json(serde_json::json!({"error": "user is required"}))); + } + let full = match state.store.get_all_features(&user).await { + Ok(m) => m, + Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, + Json(serde_json::json!({"error": e.to_string()}))), + }; + let key_ttl = state.store.key_ttl_seconds(&user).await.unwrap_or(-2); + if full.is_empty() { + return (StatusCode::OK, + Json(serde_json::json!({"exists": false, "key_ttl_seconds": key_ttl}))); + } + // Iterate the known schema (batch + streaming) plus any extras + // the hash carries so expired streaming fields surface as + // ttl_seconds=-2 in the Inspect view rather than silently + // disappearing. 
+ let mut names: Vec = DEFAULT_BATCH_FIELDS.iter().map(|s| s.to_string()).collect(); + names.extend(DEFAULT_STREAMING_FIELDS.iter().map(|s| s.to_string())); + for k in full.keys() { + if !names.contains(k) { names.push(k.clone()); } + } + let names_ref: Vec<&str> = names.iter().map(|s| s.as_str()).collect(); + let ttls = state + .store + .field_ttls_seconds(&user, &names_ref) + .await + .unwrap_or_default(); + let mut fields: Vec = names + .iter() + .map(|n| serde_json::json!({ + "name": n, + "value": full.get(n).cloned().unwrap_or_default(), + "ttl_seconds": ttls.get(n).copied().unwrap_or(-2), + })) + .collect(); + fields.sort_by(|a, b| a["name"].as_str().cmp(&b["name"].as_str())); + (StatusCode::OK, Json(serde_json::json!({ + "exists": true, + "key_ttl_seconds": key_ttl, + "fields": fields, + }))) +} + +async fn bulk_load( + State(state): State, + body: Bytes, +) -> impl IntoResponse { + let _guard = state.demo_lock.lock().await; + let form = parse_form_multi(&body); + let count = clamp(parse_int(form.get("count").and_then(|v| v.first()), 200), 1, 2000) as usize; + let ttl = clamp(parse_int(form.get("ttl").and_then(|v| v.first()), 86400), 5, 172_800) as u64; + let rows = synthesize_users(count, state.seed); + let start = Instant::now(); + let loaded = match state.store.bulk_load(&rows, ttl).await { + Ok(n) => n, + Err(e) => return (StatusCode::INTERNAL_SERVER_ERROR, + Json(serde_json::json!({"error": e.to_string()}))), + }; + let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0; + (StatusCode::OK, Json(serde_json::json!({ + "loaded": loaded, + "ttl_seconds": ttl, + "elapsed_ms": elapsed_ms, + }))) +} + +async fn reset(State(state): State) -> impl IntoResponse { + let _guard = state.demo_lock.lock().await; + // Pause + wait-for-idle around the DEL sweep so a concurrent + // tick can't recreate a user that was just enumerated for + // deletion (streaming HSET creates the key if it's missing, + // leaving a streaming-only hash with no key-level TTL). 
+ let was_paused = state.worker.is_paused(); + if state.worker.is_running() { + if !was_paused { state.worker.pause(); } + state.worker.wait_for_idle().await; + } + let result = state.store.reset().await; + state.store.reset_stats(); + state.worker.reset_stats(); + if state.worker.is_running() && !was_paused { state.worker.resume(); } + match result { + Ok(n) => (StatusCode::OK, Json(serde_json::json!({"deleted": n}))), + Err(e) => (StatusCode::INTERNAL_SERVER_ERROR, + Json(serde_json::json!({"error": e.to_string()}))), + } +} + +async fn toggle_worker(State(state): State) -> impl IntoResponse { + let _guard = state.demo_lock.lock().await; + if !state.worker.is_running() { + state.worker.start().await; + } + if state.worker.is_paused() { + state.worker.resume(); + } else { + state.worker.pause(); + } + Json(serde_json::json!({ + "paused": state.worker.is_paused(), + "running": state.worker.is_running(), + })) +} + +async fn read( + State(state): State, + body: Bytes, +) -> impl IntoResponse { + let form = parse_form_multi(&body); + let user = form + .get("user") + .and_then(|v| v.first()) + .cloned() + .unwrap_or_default(); + if user.is_empty() { + return (StatusCode::BAD_REQUEST, + Json(serde_json::json!({"error": "user is required"}))); + } + let fields: Vec = form + .get("field") + .cloned() + .unwrap_or_default() + .into_iter() + .filter(|f| !f.is_empty()) + .collect(); + let field_refs: Vec<&str> = fields.iter().map(|s| s.as_str()).collect(); + let start = Instant::now(); + let values = state + .store + .get_features(&user, &field_refs) + .await + .unwrap_or_default(); + let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0; + let ttls = state + .store + .field_ttls_seconds(&user, &field_refs) + .await + .unwrap_or_default(); + let key_ttl = state.store.key_ttl_seconds(&user).await.unwrap_or(-2); + (StatusCode::OK, Json(serde_json::json!({ + "requested": fields, + "values": values, + "ttls": ttls, + "key_ttl_seconds": key_ttl, + "returned_count": 
values.len(), + "elapsed_ms": elapsed_ms, + }))) +} + +async fn batch_read( + State(state): State, + body: Bytes, +) -> impl IntoResponse { + let form = parse_form_multi(&body); + let count = clamp(parse_int(form.get("count").and_then(|v| v.first()), 100), 1, 500) as usize; + let mut fields: Vec = form + .get("field") + .cloned() + .unwrap_or_default() + .into_iter() + .filter(|f| !f.is_empty()) + .collect(); + if fields.is_empty() { + fields = DEFAULT_STREAMING_FIELDS.iter().map(|s| s.to_string()).collect(); + fields.push("risk_segment".into()); + } + let field_refs: Vec<&str> = fields.iter().map(|s| s.as_str()).collect(); + + let all_ids = state + .store + .list_entity_ids(count.saturating_mul(2).max(2000)) + .await + .unwrap_or_default(); + let ids: Vec = all_ids.into_iter().take(count).collect(); + let start = Instant::now(); + let rows = state + .store + .batch_get_features(&ids, &field_refs) + .await + .unwrap_or_default(); + let elapsed_ms = start.elapsed().as_secs_f64() * 1000.0; + let sample: Vec = ids + .iter() + .take(10) + .map(|id| serde_json::json!({ + "id": id, + "field_count": rows.get(id).map(|r| r.len()).unwrap_or(0), + })) + .collect(); + (StatusCode::OK, Json(serde_json::json!({ + "entity_count": ids.len(), + "field_count": fields.len(), + "elapsed_ms": elapsed_ms, + "sample": sample, + }))) +} + +fn clamp(v: i64, lo: i64, hi: i64) -> i64 { + v.max(lo).min(hi) +} + +fn parse_int(s: Option<&String>, def: i64) -> i64 { + s.and_then(|v| v.parse::().ok()).unwrap_or(def) +} + +/// Parse an `application/x-www-form-urlencoded` body into a +/// multi-value map. `axum::Form` uses `serde_urlencoded` under the +/// hood, which silently drops repeated keys — we need every +/// `field=` entry, so we parse the body manually. 
+fn parse_form_multi(body: &Bytes) -> std::collections::HashMap> { + let mut out: std::collections::HashMap> = std::collections::HashMap::new(); + let s = std::str::from_utf8(body).unwrap_or(""); + if s.is_empty() { return out; } + for pair in s.split('&') { + let (k, v) = match pair.split_once('=') { + Some((k, v)) => (urldecode(k), urldecode(v)), + None => (urldecode(pair), String::new()), + }; + out.entry(k).or_default().push(v); + } + out +} + +/// Tiny URL decoder for `+` (space) and `%XX` (hex byte). UTF-8 safe +/// because we buffer bytes and convert at the end. +fn urldecode(s: &str) -> String { + let bytes = s.as_bytes(); + let mut out: Vec = Vec::with_capacity(bytes.len()); + let mut i = 0; + while i < bytes.len() { + match bytes[i] { + b'+' => { out.push(b' '); i += 1; } + b'%' if i + 2 < bytes.len() => { + if let Ok(byte) = u8::from_str_radix(&s[i + 1..i + 3], 16) { + out.push(byte); i += 3; + } else { + out.push(bytes[i]); i += 1; + } + } + c => { out.push(c); i += 1; } + } + } + String::from_utf8_lossy(&out).into_owned() +} + +fn render_html_page(key_prefix: &str, streaming_ttl: u64, users_per_tick: usize) -> String { + let batch_json = serde_json::to_string(&DEFAULT_BATCH_FIELDS).unwrap(); + let stream_json = serde_json::to_string(&DEFAULT_STREAMING_FIELDS).unwrap(); + HTML_TEMPLATE + .replace("__KEY_PREFIX__", key_prefix) + .replace("__STREAM_TTL__", &streaming_ttl.to_string()) + .replace("__USERS_PER_TICK__", &users_per_tick.to_string()) + .replace("__BATCH_FIELDS_JSON__", &batch_json) + .replace("__STREAM_FIELDS_JSON__", &stream_json) +} + +const HTML_TEMPLATE: &str = include_str!("./demo_template.html"); diff --git a/content/develop/use-cases/feature-store/rust/demo_template.html b/content/develop/use-cases/feature-store/rust/demo_template.html new file mode 100644 index 0000000000..3e5b0ddcd1 --- /dev/null +++ b/content/develop/use-cases/feature-store/rust/demo_template.html @@ -0,0 +1,342 @@ + + + + + + Redis Feature Store Demo (Rust) + + + +
+
redis-rs + axum + tokio
+

Redis Feature Store Demo

+

+ A small fraud-scoring feature store. Each user is one Redis hash + at __KEY_PREFIX__{id} with a batch-materialized + batch half (daily aggregates, + 24-hour key-level EXPIRE) and a streaming + streaming half (real-time + signals, __STREAM_TTL__s per-field HEXPIRE). + Inference reads any subset with one HMGET; batch + scoring pipelines HMGET across N users. +

+ +
+
+

Store state

+
Loading...
+
+ +
+

Materialize batch features

+

Calls HSET + EXPIRE for each user + through one redis::pipe() — the whole batch + ships in one round trip.

+ + + + +

+ Drop the TTL to e.g. 30 s and watch entities disappear on + schedule — the same thing that happens if a daily refresher + fails. +

+ + +
+ +
+

Streaming worker

+

Picks __USERS_PER_TICK__ users per tick, writes the + streaming features, applies HEXPIRE + __STREAM_TTL__s per field. Pause it and the + streaming fields drop out via per-field TTL while the batch + fields stay populated.

+
+ +
+ +
+

Inference read (HMGET)

+

Pick a user and a feature subset. One HMGET + round trip returns whatever the model needs.

+
+
+ + +
+
+ + +
+
+

Feature subset

+

+ Tick to include in the HMGET. Per-field TTL is + shown next to each field in the result table. +

+
+
+

Pick a user and click Read features.

+
+
+ +
+

Batch scoring

+

Pipelined HMGET across N users via + redis::pipe(). One network round trip for the + whole batch.

+ + + +
+

(no batch read yet)

+
+
+ +
+

Inspect one user

+

HGETALL plus per-field HTTL and + key-level TTL. Useful for spotting which + streaming fields have already expired.

+ + + +
+

(pick a user and click Inspect)

+
+
+
+ +
+
+ + + + diff --git a/content/develop/use-cases/feature-store/rust/feature_store.rs b/content/develop/use-cases/feature-store/rust/feature_store.rs new file mode 100644 index 0000000000..4e63c10226 --- /dev/null +++ b/content/develop/use-cases/feature-store/rust/feature_store.rs @@ -0,0 +1,482 @@ +//! `FeatureStore` async helper around a `redis::aio::ConnectionManager`. +//! +//! ConnectionManager is the canonical "single-connection across many +//! async tasks" abstraction in redis-rs: it owns the multiplexed +//! `MultiplexedConnection` underneath and transparently reconnects on +//! a closed socket. The struct holds `Clone` handles, so wiring one +//! into HTTP handlers and a streaming worker is just `clone()`s. + +use std::collections::BTreeMap; +use std::sync::atomic::{AtomicI64, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +use redis::aio::ConnectionManager; +use redis::{AsyncCommands, FromRedisValue, RedisResult, Value}; +use serde::Serialize; + +/// Default batch feature schema. Daily aggregates computed offline +/// and bulk-loaded once per materialization cycle. +pub const DEFAULT_BATCH_FIELDS: &[&str] = &[ + "country_iso", + "risk_segment", + "account_age_days", + "tx_count_7d", + "avg_amount_30d", + "chargeback_count_180d", +]; + +/// Default streaming feature schema. Updated by the streaming worker +/// as new events arrive, with a per-field TTL so each field +/// self-expires when its upstream pipeline stops. +pub const DEFAULT_STREAMING_FIELDS: &[&str] = &[ + "last_login_ts", + "last_device_id", + "tx_count_5m", + "failed_logins_15m", + "session_country", +]; + +pub const DEFAULT_BATCH_TTL_SECONDS: u64 = 24 * 60 * 60; +pub const DEFAULT_STREAMING_TTL_SECONDS: u64 = 5 * 60; +pub const DEFAULT_KEY_PREFIX: &str = "fs:user:"; + +/// Hard cap on how long a single redis call can block before the +/// `tokio::time::timeout` wrapper bails out. Stops a stuck server from +/// hanging the demo's web UI indefinitely. 
+pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); + +/// One feature value the helper knows how to encode. +#[derive(Debug, Clone)] +pub enum FeatureValue { + Str(String), + Bool(bool), + Int(i64), + Float(f64), +} + +impl FeatureValue { + /// Render the value as the string Redis hash fields require. + /// Booleans become "true"/"false" so they round-trip cleanly + /// through redis-cli and the other clients. + pub fn encode(&self) -> String { + match self { + FeatureValue::Str(s) => s.clone(), + FeatureValue::Bool(b) => (if *b { "true" } else { "false" }).into(), + FeatureValue::Int(i) => i.to_string(), + FeatureValue::Float(f) => { + if f.fract() == 0.0 { + format!("{:.1}", f) + } else { + f.to_string() + } + } + } + } +} + +impl From<&str> for FeatureValue { + fn from(s: &str) -> Self { FeatureValue::Str(s.into()) } +} +impl From for FeatureValue { + fn from(s: String) -> Self { FeatureValue::Str(s) } +} +impl From for FeatureValue { + fn from(b: bool) -> Self { FeatureValue::Bool(b) } +} +impl From for FeatureValue { + fn from(i: i32) -> Self { FeatureValue::Int(i as i64) } +} +impl From for FeatureValue { + fn from(i: i64) -> Self { FeatureValue::Int(i) } +} +impl From for FeatureValue { + fn from(i: u64) -> Self { FeatureValue::Int(i as i64) } +} +impl From for FeatureValue { + fn from(f: f64) -> Self { FeatureValue::Float(f) } +} + +/// One row of features for one entity. Insertion order is preserved +/// so the demo's debug views read in a stable order. 
pub type FeatureMap = BTreeMap<String, FeatureValue>;

/// Point-in-time copy of the store's in-process counters.
#[derive(Debug, Clone, Serialize)]
pub struct Stats {
    pub batch_writes_total: i64,
    pub streaming_writes_total: i64,
    pub reads_total: i64,
    pub read_fields_total: i64,
}

/// Handle to the feature store. Clones share the same counters because
/// they sit behind an `Arc`.
#[derive(Clone)]
pub struct FeatureStore {
    conn: ConnectionManager,
    key_prefix: String,
    batch_ttl_seconds: u64,
    streaming_ttl_seconds: u64,
    counters: Arc<Counters>,
}

/// Atomic counters behind `Stats`.
struct Counters {
    batch_writes_total: AtomicI64,
    streaming_writes_total: AtomicI64,
    reads_total: AtomicI64,
    read_fields_total: AtomicI64,
}

impl FeatureStore {
    /// Build a store over `conn`. Entity keys are `key_prefix` followed
    /// by the entity id; all counters start at zero.
    pub fn new(
        conn: ConnectionManager,
        key_prefix: impl Into<String>,
        batch_ttl_seconds: u64,
        streaming_ttl_seconds: u64,
    ) -> Self {
        Self {
            conn,
            key_prefix: key_prefix.into(),
            batch_ttl_seconds,
            streaming_ttl_seconds,
            counters: Arc::new(Counters {
                batch_writes_total: AtomicI64::new(0),
                streaming_writes_total: AtomicI64::new(0),
                reads_total: AtomicI64::new(0),
                read_fields_total: AtomicI64::new(0),
            }),
        }
    }

    pub fn key_prefix(&self) -> &str { &self.key_prefix }
    pub fn batch_ttl_seconds(&self) -> u64 { self.batch_ttl_seconds }
    pub fn streaming_ttl_seconds(&self) -> u64 { self.streaming_ttl_seconds }

    /// Redis key for one entity: the prefix followed by the id.
    pub fn key_for(&self, entity_id: &str) -> String {
        format!("{}{}", self.key_prefix, entity_id)
    }

    // ---------------------------------------------------------------
    // Batch ingestion (materialization)
    // ---------------------------------------------------------------

    /// Materialize a batch of entities into Redis.
    ///
    /// One `HSET` plus one `EXPIRE` per entity, all queued through a
    /// non-transactional pipeline so the whole batch ships in a
    /// single network round trip.
+ pub async fn bulk_load( + &self, + rows: &[(String, FeatureMap)], + ttl_seconds: u64, + ) -> RedisResult { + if rows.is_empty() { return Ok(0); } + let mut pipe = redis::pipe(); + for (entity_id, fields) in rows { + let key = self.key_for(entity_id); + let encoded: Vec<(&str, String)> = fields + .iter() + .map(|(k, v)| (k.as_str(), v.encode())) + .collect(); + pipe.hset_multiple(&key, &encoded).ignore(); + pipe.expire(&key, ttl_seconds as i64).ignore(); + } + let mut conn = self.conn.clone(); + pipe.query_async::<()>(&mut conn).await?; + self.counters + .batch_writes_total + .fetch_add(rows.len() as i64, Ordering::Relaxed); + Ok(rows.len()) + } + + pub async fn update_batch_feature( + &self, + entity_id: &str, + field: &str, + value: FeatureValue, + ) -> RedisResult<()> { + let mut conn = self.conn.clone(); + let _: () = conn.hset(self.key_for(entity_id), field, value.encode()).await?; + self.counters + .batch_writes_total + .fetch_add(1, Ordering::Relaxed); + Ok(()) + } + + // --------------------------------------------------------------- + // Streaming ingestion + // --------------------------------------------------------------- + + /// Write streaming features with a per-field TTL. + /// + /// `HSET` + `HEXPIRE` are queued in a single pipeline so Redis + /// runs them in order on the server: the `HSET` first creates or + /// overwrites the fields, then `HEXPIRE` attaches a TTL to each + /// of those same fields. + /// + /// `HEXPIRE` returns one status code per field: + /// `1` = TTL set, `2` = the expiry was 0 or in the past (so Redis + /// deleted the field instead), `0` = an NX/XX/GT/LT conditional + /// flag wasn't met, `-2` = no such field or key. We always + /// follow `HSET` with `HEXPIRE` so any code other than `1` means + /// the per-field TTL invariant didn't hold — the helper returns + /// a `RedisError` rather than silently leaving a streaming field + /// without an expiry attached. 
+ pub async fn update_streaming( + &self, + entity_id: &str, + fields: &FeatureMap, + ttl_seconds: u64, + ) -> RedisResult<()> { + if fields.is_empty() { return Ok(()); } + let key = self.key_for(entity_id); + let encoded: Vec<(&str, String)> = fields + .iter() + .map(|(k, v)| (k.as_str(), v.encode())) + .collect(); + let names: Vec<&str> = fields.keys().map(|s| s.as_str()).collect(); + + let mut pipe = redis::pipe(); + pipe.hset_multiple(&key, &encoded).ignore(); + // HEXPIRE wire form: HEXPIRE key seconds FIELDS count field... + let mut hexpire = redis::cmd("HEXPIRE"); + hexpire.arg(&key).arg(ttl_seconds).arg("FIELDS").arg(names.len()); + for n in &names { hexpire.arg(n); } + pipe.add_command(hexpire); + + let mut conn = self.conn.clone(); + // The pipeline returns one entry per non-ignored command; + // HSET's reply was dropped with .ignore() above, so there is + // exactly one entry left — HEXPIRE's per-field code list + // (itself a Vec). We unwrap the outer Vec to get at + // those codes. + let pipe_result: Vec> = pipe.query_async(&mut conn).await?; + let codes = pipe_result.into_iter().next().unwrap_or_default(); + for code in &codes { + if *code != 1 { + return Err(redis::RedisError::from(( + redis::ErrorKind::ResponseError, + "HEXPIRE invariant violated", + format!( + "HEXPIRE did not set every field TTL for {key}: {codes:?}" + ), + ))); + } + } + self.counters + .streaming_writes_total + .fetch_add(fields.len() as i64, Ordering::Relaxed); + Ok(()) + } + + // --------------------------------------------------------------- + // Inference reads + // --------------------------------------------------------------- + + /// Retrieve a subset of features for one entity with `HMGET`. + /// Returns only the fields that actually exist on the hash — + /// missing fields are dropped from the result. 
+ pub async fn get_features( + &self, + entity_id: &str, + field_names: &[&str], + ) -> RedisResult> { + if field_names.is_empty() { + return Ok(BTreeMap::new()); + } + let key = self.key_for(entity_id); + let mut conn = self.conn.clone(); + let values: Vec> = conn.hget(&key, field_names).await?; + let mut out = BTreeMap::new(); + for (n, v) in field_names.iter().zip(values.into_iter()) { + if let Some(s) = v { + out.insert((*n).to_string(), s); + } + } + self.counters.reads_total.fetch_add(1, Ordering::Relaxed); + self.counters + .read_fields_total + .fetch_add(out.len() as i64, Ordering::Relaxed); + Ok(out) + } + + /// Full-hash read via `HGETALL`. Useful for debugging but the + /// model server should always go through `get_features` with an + /// explicit field list. + pub async fn get_all_features( + &self, + entity_id: &str, + ) -> RedisResult> { + let mut conn = self.conn.clone(); + let map: BTreeMap = conn.hgetall(self.key_for(entity_id)).await?; + self.counters.reads_total.fetch_add(1, Ordering::Relaxed); + self.counters + .read_fields_total + .fetch_add(map.len() as i64, Ordering::Relaxed); + Ok(map) + } + + /// Pipeline `HMGET` across many entities for batch scoring. One + /// round trip for the whole batch. 
+ pub async fn batch_get_features( + &self, + entity_ids: &[String], + field_names: &[&str], + ) -> RedisResult>> { + if entity_ids.is_empty() || field_names.is_empty() { + return Ok(BTreeMap::new()); + } + let mut pipe = redis::pipe(); + for id in entity_ids { + pipe.hget(self.key_for(id), field_names); + } + let mut conn = self.conn.clone(); + let rows: Vec>> = pipe.query_async(&mut conn).await?; + + let mut out = BTreeMap::new(); + let mut seen = 0i64; + for (id, values) in entity_ids.iter().zip(rows.into_iter()) { + let mut row = BTreeMap::new(); + for (n, v) in field_names.iter().zip(values.into_iter()) { + if let Some(s) = v { + row.insert((*n).to_string(), s); + seen += 1; + } + } + out.insert(id.clone(), row); + } + self.counters + .reads_total + .fetch_add(entity_ids.len() as i64, Ordering::Relaxed); + self.counters.read_fields_total.fetch_add(seen, Ordering::Relaxed); + Ok(out) + } + + // --------------------------------------------------------------- + // TTL inspection (used by the demo UI) + // --------------------------------------------------------------- + + /// Seconds until the entity key expires: positive means TTL + /// remaining, `-1` means no key-level TTL set, `-2` means the + /// key doesn't exist. + pub async fn key_ttl_seconds(&self, entity_id: &str) -> RedisResult { + let mut conn = self.conn.clone(); + conn.ttl(self.key_for(entity_id)).await + } + + /// Per-field TTL via `HTTL` (Redis 7.4+). Each value mirrors + /// `TTL`'s convention: positive seconds remaining, `-1` no field + /// TTL, `-2` field (or key) missing. 
+ pub async fn field_ttls_seconds( + &self, + entity_id: &str, + field_names: &[&str], + ) -> RedisResult> { + if field_names.is_empty() { + return Ok(BTreeMap::new()); + } + let key = self.key_for(entity_id); + let mut cmd = redis::cmd("HTTL"); + cmd.arg(&key).arg("FIELDS").arg(field_names.len()); + for n in field_names { cmd.arg(*n); } + let mut conn = self.conn.clone(); + // HTTL on a missing key still returns a flat list of -2s + // (one per requested field), so we don't need a defensive + // shape coercion here the way redis-py and Lettuce do for + // their respective clients. + let values: Value = cmd.query_async(&mut conn).await?; + let codes: Vec = Vec::::from_redis_value(&values).unwrap_or_else(|_| { + // Defensive fallback: if the client ever returns nil for + // a missing key, treat every field as -2. + field_names.iter().map(|_| -2).collect() + }); + let mut out = BTreeMap::new(); + for (n, v) in field_names.iter().zip(codes.into_iter()) { + out.insert((*n).to_string(), v); + } + Ok(out) + } + + // --------------------------------------------------------------- + // Demo housekeeping + // --------------------------------------------------------------- + + /// Enumerate up to `limit` entity IDs by scanning `keyPrefix*`. + /// `SCAN` is non-blocking; the demo uses it for UI dropdowns, + /// not as a serving primitive. Result is sorted. + pub async fn list_entity_ids(&self, limit: usize) -> RedisResult> { + use redis::AsyncIter; + let pattern = format!("{}*", self.key_prefix); + let mut conn = self.conn.clone(); + let mut iter: AsyncIter = conn.scan_match(&pattern).await?; + let mut ids: Vec = Vec::new(); + let prefix_len = self.key_prefix.len(); + while let Some(k) = iter.next_item().await { + if k.len() > prefix_len { + ids.push(k[prefix_len..].to_string()); + } + if ids.len() >= limit { break; } + } + ids.sort(); + Ok(ids) + } + + /// Count every entity under the key prefix. 
Loops `SCAN` without + /// an in-memory cap so the UI can show the true total even when + /// more keys exist than `list_entity_ids` returns. + pub async fn count_entities(&self) -> RedisResult { + use redis::AsyncIter; + let pattern = format!("{}*", self.key_prefix); + let mut conn = self.conn.clone(); + let mut iter: AsyncIter = conn.scan_match(&pattern).await?; + let mut n = 0i64; + while let Some(_k) = iter.next_item().await { + n += 1; + } + Ok(n) + } + + pub async fn delete_entity(&self, entity_id: &str) -> RedisResult { + let mut conn = self.conn.clone(); + conn.del(self.key_for(entity_id)).await + } + + /// Drop every entity under the key prefix. Used by the demo + /// reset path; collects keys with `SCAN` then issues variadic + /// `DEL` in batches of 500. + pub async fn reset(&self) -> RedisResult { + use redis::AsyncIter; + let pattern = format!("{}*", self.key_prefix); + let mut conn = self.conn.clone(); + let mut deleted = 0i64; + let mut batch: Vec = Vec::with_capacity(500); + let mut iter: AsyncIter = conn.scan_match(&pattern).await?; + while let Some(k) = iter.next_item().await { + batch.push(k); + if batch.len() >= 500 { + let n: i64 = self.conn.clone().del(batch.as_slice()).await?; + deleted += n; + batch.clear(); + } + } + if !batch.is_empty() { + let n: i64 = self.conn.clone().del(batch.as_slice()).await?; + deleted += n; + } + Ok(deleted) + } + + pub fn stats(&self) -> Stats { + Stats { + batch_writes_total: self.counters.batch_writes_total.load(Ordering::Relaxed), + streaming_writes_total: self.counters.streaming_writes_total.load(Ordering::Relaxed), + reads_total: self.counters.reads_total.load(Ordering::Relaxed), + read_fields_total: self.counters.read_fields_total.load(Ordering::Relaxed), + } + } + + pub fn reset_stats(&self) { + self.counters.batch_writes_total.store(0, Ordering::Relaxed); + self.counters.streaming_writes_total.store(0, Ordering::Relaxed); + self.counters.reads_total.store(0, Ordering::Relaxed); + 
self.counters.read_fields_total.store(0, Ordering::Relaxed); + } +} diff --git a/content/develop/use-cases/feature-store/rust/lib.rs b/content/develop/use-cases/feature-store/rust/lib.rs new file mode 100644 index 0000000000..44d7c4c2ab --- /dev/null +++ b/content/develop/use-cases/feature-store/rust/lib.rs @@ -0,0 +1,29 @@ +//! Redis online feature store backed by per-entity Hashes. +//! +//! Each entity (here, a user) lives at a deterministic key such as +//! `fs:user:{id}`. The hash holds every feature for that entity as +//! one field per feature — batch-materialized aggregates (refreshed +//! on a daily cycle) alongside streaming-updated signals (refreshed +//! every few seconds). One `HMGET` returns whichever subset the +//! model needs in one network round trip. +//! +//! Two TTL layers solve the *mixed staleness* problem: +//! +//! * A key-level `EXPIRE` aligned with the batch materialization +//! cycle causes the whole entity to disappear if its batch +//! refresher fails, so inference sees a missing entity (which the +//! model handler can detect and fall back on) rather than silently +//! outdated values. +//! * A per-field `HEXPIRE` on each streaming field gives that field +//! its own shorter expiry, independent of the rest of the hash. +//! When the streaming pipeline stops updating a field, the field +//! self-cleans while the rest of the entity stays populated. +//! +//! `HEXPIRE` and `HTTL` require Redis 7.4 or later. The redis-rs +//! crate up to 0.27 doesn't ship typed bindings for the field-TTL +//! commands yet; the helper issues them with the generic +//! `redis::cmd("HEXPIRE")` builder, which sends the same wire bytes. 
+ +pub mod build_features; +pub mod feature_store; +pub mod streaming_worker; diff --git a/content/develop/use-cases/feature-store/rust/streaming_worker.rs b/content/develop/use-cases/feature-store/rust/streaming_worker.rs new file mode 100644 index 0000000000..9e3e0e43ab --- /dev/null +++ b/content/develop/use-cases/feature-store/rust/streaming_worker.rs @@ -0,0 +1,275 @@ +//! Streaming feature updater for the demo. +//! +//! Stands in for whatever Flink, Kafka Streams, or bespoke service +//! computes the real-time features in a real deployment. In +//! production this code lives in the streaming layer; here it runs +//! as a tokio task next to the demo server so the UI can start, +//! pause, and resume it. +//! +//! Every tick the worker picks a few random users and writes a new +//! value for each streaming feature, with a per-field `HEXPIRE` so +//! the field self-expires if the worker is paused. Pause it for +//! longer than `streaming_ttl_seconds` and the streaming fields drop +//! out of the hash while the batch fields remain populated under the +//! longer key-level TTL — the *mixed staleness* story made visible. 
+ +use std::collections::BTreeMap; +use std::sync::atomic::{AtomicBool, AtomicI64, Ordering}; +use std::sync::Arc; +use std::time::Duration; + +use rand::rngs::StdRng; +use rand::seq::SliceRandom; +use rand::{Rng, SeedableRng}; +use serde::Serialize; +use tokio::sync::Mutex; +use tokio::task::JoinHandle; +use tokio::time; + +use crate::feature_store::{FeatureMap, FeatureStore, FeatureValue}; + +const DEVICE_IDS: &[&str] = &[ + "ios-1a4c", "ios-9f02", "and-7b21", "and-2d18", + "web-chr-1", "web-saf-1", "web-ff-2", +]; +const SESSION_COUNTRIES: &[&str] = &[ + "US", "GB", "DE", "FR", "IN", "BR", "JP", "AU", "CA", "NL", +]; +const FAILED_LOGIN_BUCKETS: &[i64] = &[0, 1, 2, 5]; +const FAILED_LOGIN_WEIGHTS: &[u32] = &[70, 20, 8, 2]; + +#[derive(Debug, Clone, Serialize)] +pub struct WorkerStats { + pub running: bool, + pub paused: bool, + pub tick_count: i64, + pub writes_count: i64, +} + +/// Shared state the worker task and the public API both hold. +struct State { + store: FeatureStore, + tick: Duration, + users_per_tick: usize, + rng: Mutex, + running: AtomicBool, + paused: AtomicBool, + tick_in_flight: AtomicBool, + tick_count: AtomicI64, + writes_count: AtomicI64, + /// Signals the task to exit. Tokio doesn't need a separate done + /// channel because awaiting the JoinHandle gives us the same + /// "wait until the task actually exits" semantics. + stop: AtomicBool, +} + +#[derive(Clone)] +pub struct StreamingWorker { + state: Arc, + /// Owned by the controlling code so that Stop() can `await` it. + /// Wrapped in `Mutex>` so `start()` and `stop()` can + /// swap the handle in and out without consuming the worker. 
+ handle: Arc>>>, +} + +impl StreamingWorker { + pub fn new( + store: FeatureStore, + tick: Duration, + users_per_tick: usize, + seed: u64, + ) -> Self { + let tick = if tick.is_zero() { Duration::from_secs(1) } else { tick }; + let users_per_tick = users_per_tick.max(1); + Self { + state: Arc::new(State { + store, + tick, + users_per_tick, + rng: Mutex::new(StdRng::seed_from_u64(seed)), + running: AtomicBool::new(false), + paused: AtomicBool::new(false), + tick_in_flight: AtomicBool::new(false), + tick_count: AtomicI64::new(0), + writes_count: AtomicI64::new(0), + stop: AtomicBool::new(false), + }), + handle: Arc::new(Mutex::new(None)), + } + } + + pub fn users_per_tick(&self) -> usize { self.state.users_per_tick } + + /// Spawn the tick task. No-op if already running. + pub async fn start(&self) { + if self + .state + .running + .compare_exchange(false, true, Ordering::Relaxed, Ordering::Relaxed) + .is_err() + { + return; + } + self.state.paused.store(false, Ordering::Relaxed); + self.state.stop.store(false, Ordering::Relaxed); + let state = self.state.clone(); + let handle = tokio::spawn(async move { run(state).await }); + *self.handle.lock().await = Some(handle); + } + + /// Signal the task to exit, await its JoinHandle, then settle. + pub async fn stop(&self) { + if self + .state + .running + .compare_exchange(true, false, Ordering::Relaxed, Ordering::Relaxed) + .is_err() + { + return; + } + self.state.stop.store(true, Ordering::Relaxed); + let mut slot = self.handle.lock().await; + if let Some(h) = slot.take() { + let _ = h.await; + } + // Final flag reset in case the task panicked before its own + // `finally`-style cleanup ran. 
+ self.state.tick_in_flight.store(false, Ordering::Relaxed); + } + + pub fn pause(&self) { self.state.paused.store(true, Ordering::Relaxed); } + pub fn resume(&self) { self.state.paused.store(false, Ordering::Relaxed); } + + pub fn is_running(&self) -> bool { self.state.running.load(Ordering::Relaxed) } + pub fn is_paused(&self) -> bool { self.state.paused.load(Ordering::Relaxed) } + + /// Block until any in-flight tick has finished. `pause()` only + /// stops *future* ticks from running; this is what callers (a + /// reset that's about to DEL every entity, for example) use to + /// flush a mid-flight tick before they touch state the tick + /// might still be writing to. + pub async fn wait_for_idle(&self) { + while self.state.tick_in_flight.load(Ordering::Relaxed) { + tokio::time::sleep(Duration::from_millis(20)).await; + } + } + + pub fn stats(&self) -> WorkerStats { + WorkerStats { + running: self.is_running(), + paused: self.is_paused(), + tick_count: self.state.tick_count.load(Ordering::Relaxed), + writes_count: self.state.writes_count.load(Ordering::Relaxed), + } + } + + pub fn reset_stats(&self) { + self.state.tick_count.store(0, Ordering::Relaxed); + self.state.writes_count.store(0, Ordering::Relaxed); + } +} + +async fn run(state: Arc) { + // Clear `running` and `tick_in_flight` no matter how this future + // exits — a panic in `do_tick`, a manual stop signal, or anything + // else. Without this, a one-shot failure would leave the worker + // looking like it's running and refusing to restart. + struct Guard<'a>(&'a State); + impl Drop for Guard<'_> { + fn drop(&mut self) { + self.0.running.store(false, Ordering::Relaxed); + self.0.tick_in_flight.store(false, Ordering::Relaxed); + } + } + let _guard = Guard(&state); + + let mut interval = time::interval(state.tick); + // The first tick fires immediately by default; skip it so we + // behave like every other client's worker (wait one tick before + // the first write). 
+ interval.set_missed_tick_behavior(time::MissedTickBehavior::Skip); + interval.tick().await; + + loop { + if state.stop.load(Ordering::Relaxed) { return; } + interval.tick().await; + if state.stop.load(Ordering::Relaxed) { return; } + + // Set tick_in_flight *before* the pause check so a concurrent + // pause()+wait_for_idle() can never observe tick_in_flight=false + // in the window between the pause check and the actual + // do_tick call. The flag is cleared in all exit paths below. + state.tick_in_flight.store(true, Ordering::Relaxed); + let result = if !state.paused.load(Ordering::Relaxed) { + do_tick(&state).await + } else { + Ok(()) + }; + state.tick_in_flight.store(false, Ordering::Relaxed); + if let Err(e) = result { + eprintln!("[streaming-worker] tick failed: {e}"); + } + } +} + +async fn do_tick(state: &State) -> redis::RedisResult<()> { + let ids = state.store.list_entity_ids(500).await?; + if ids.is_empty() { return Ok(()); } + + let now_ms = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .map(|d| d.as_millis() as i64) + .unwrap_or(0); + + // Pick `users_per_tick` random IDs without replacement. 
+ let picks = { + let mut rng = state.rng.lock().await; + let mut pool: Vec<&String> = ids.iter().collect(); + pool.shuffle(&mut *rng); + pool.into_iter() + .take(state.users_per_tick) + .cloned() + .collect::>() + }; + + let mut writes = 0i64; + for id in &picks { + let fields = { + let mut rng = state.rng.lock().await; + let mut m: FeatureMap = BTreeMap::new(); + m.insert("last_login_ts".into(), FeatureValue::Int(now_ms)); + m.insert( + "last_device_id".into(), + FeatureValue::Str(DEVICE_IDS[rng.gen_range(0..DEVICE_IDS.len())].into()), + ); + m.insert("tx_count_5m".into(), FeatureValue::Int(rng.gen_range(0..13))); + m.insert( + "failed_logins_15m".into(), + FeatureValue::Int(weighted_pick(&mut *rng, FAILED_LOGIN_BUCKETS, FAILED_LOGIN_WEIGHTS)), + ); + m.insert( + "session_country".into(), + FeatureValue::Str(SESSION_COUNTRIES[rng.gen_range(0..SESSION_COUNTRIES.len())].into()), + ); + m + }; + state + .store + .update_streaming(id, &fields, state.store.streaming_ttl_seconds()) + .await?; + writes += fields.len() as i64; + } + state.tick_count.fetch_add(1, Ordering::Relaxed); + state.writes_count.fetch_add(writes, Ordering::Relaxed); + Ok(()) +} + +fn weighted_pick(rng: &mut R, items: &[i64], weights: &[u32]) -> i64 { + let total: u32 = weights.iter().sum(); + let mut r = rng.gen_range(0..total); + for (i, w) in weights.iter().enumerate() { + if r < *w { return items[i]; } + r -= w; + } + items[items.len() - 1] +} From 9defcce22552dc8e58dda0c8c2b394fb5e82a32d Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 11:01:47 +0100 Subject: [PATCH 09/20] DOC-6661 Rust and .NET fixes after review --- .../feature-store/dotnet/FeatureStore.cs | 7 +- .../use-cases/feature-store/dotnet/Program.cs | 4 +- .../feature-store/dotnet/StreamingWorker.cs | 43 +++++-- .../use-cases/feature-store/dotnet/_index.md | 87 ++++++++++++- .../use-cases/feature-store/rust/Cargo.toml | 6 + .../use-cases/feature-store/rust/_index.md | 50 ++++++++ 
.../feature-store/rust/build_features.rs | 16 ++- .../feature-store/rust/demo_server.rs | 118 +++++++----------- .../feature-store/rust/feature_store.rs | 6 - 9 files changed, 236 insertions(+), 101 deletions(-) diff --git a/content/develop/use-cases/feature-store/dotnet/FeatureStore.cs b/content/develop/use-cases/feature-store/dotnet/FeatureStore.cs index 46985f2fd4..77b6a8583d 100644 --- a/content/develop/use-cases/feature-store/dotnet/FeatureStore.cs +++ b/content/develop/use-cases/feature-store/dotnet/FeatureStore.cs @@ -146,7 +146,7 @@ public async Task BulkLoadAsync( /// Redis deleted the field instead of applying a TTL. /// ConditionNotMet (= 0): NX/XX/GT/LT condition /// not met (we never use one here). - /// NotExist (= -2): no such field, or no such key. + /// NoSuchField (= -2): no such field, or no such key. /// /// We always follow HSET with HEXPIRE so any code /// other than Success means the per-field TTL invariant @@ -310,12 +310,15 @@ public async Task> FieldTtlsSecondsAsync( if (fieldNames.Count == 0) return out_; var values = fieldNames.Select(f => (RedisValue)f).ToArray(); var ms = await _db.HashFieldGetTimeToLiveAsync(KeyFor(entityId), values); + // SE.Redis 2.13 returns a flat long[] of length == fieldNames.Count + // (filled with -2s for a missing key). Coerce defensively against + // any future version that might return a shorter or empty array. for (int i = 0; i < fieldNames.Count; i++) { // HTTL returns ms remaining; negative sentinels pass // through. Convert positive durations to whole seconds // for parity with the other clients' helpers. - long v = ms[i]; + long v = i < ms.Length ? ms[i] : -2L; out_[fieldNames[i]] = v < 0 ? 
v : v / 1000; } return out_; diff --git a/content/develop/use-cases/feature-store/dotnet/Program.cs b/content/develop/use-cases/feature-store/dotnet/Program.cs index 59af580f91..da9a17bf10 100644 --- a/content/develop/use-cases/feature-store/dotnet/Program.cs +++ b/content/develop/use-cases/feature-store/dotnet/Program.cs @@ -70,7 +70,7 @@ BuildFeatures.SynthesizeUsers(seedUsers, demoSeed), batchTtlSeconds); -worker.Start(); +await worker.StartAsync(); var builder = WebApplication.CreateBuilder(args); builder.WebHost.UseUrls($"http://{host}:{port}"); @@ -191,7 +191,7 @@ string IndexHtml() => await demoLock.WaitAsync(); try { - if (!worker.IsRunning) worker.Start(); + if (!worker.IsRunning) await worker.StartAsync(); if (worker.IsPaused) worker.Resume(); else worker.Pause(); return Results.Json(new { paused = worker.IsPaused, running = worker.IsRunning }); diff --git a/content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs b/content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs index 1d4b1a84f6..67b18c3c7f 100644 --- a/content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs +++ b/content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs @@ -49,6 +49,11 @@ public sealed class StreamingWorker private CancellationTokenSource? _cts; private Task? _task; + // Serializes start/stop so a Ctrl+C-triggered StopAsync can't + // race with a /worker/toggle Start() (or vice versa). Without + // this, the two could each be observing or replacing _cts/_task + // while the other is mid-flight. 
+ private readonly SemaphoreSlim _lifecycleLock = new(1, 1); public StreamingWorker(FeatureStore store, TimeSpan tick, int usersPerTick, int seed) { @@ -62,21 +67,41 @@ public StreamingWorker(FeatureStore store, TimeSpan tick, int usersPerTick, int // Lifecycle // --------------------------------------------------------------- - public void Start() + public async Task StartAsync() { - if (Interlocked.CompareExchange(ref _running, 1, 0) != 0) return; - Interlocked.Exchange(ref _paused, 0); - _cts = new CancellationTokenSource(); - _task = Task.Run(() => RunAsync(_cts.Token)); + await _lifecycleLock.WaitAsync(); + try + { + if (Interlocked.CompareExchange(ref _running, 1, 0) != 0) return; + Interlocked.Exchange(ref _paused, 0); + _cts = new CancellationTokenSource(); + _task = Task.Run(() => RunAsync(_cts.Token)); + } + finally { _lifecycleLock.Release(); } } public async Task StopAsync() { - if (Interlocked.Exchange(ref _running, 0) != 1) return; - _cts?.Cancel(); - try { if (_task is not null) await _task; } + // Capture the task/CTS locally under the lifecycle lock so + // a concurrent StartAsync can't clear them on us before we + // get to await. + Task? task; + CancellationTokenSource? 
cts; + await _lifecycleLock.WaitAsync(); + try + { + if (Interlocked.Exchange(ref _running, 0) != 1) return; + task = _task; + cts = _cts; + _task = null; + _cts = null; + } + finally { _lifecycleLock.Release(); } + + cts?.Cancel(); + try { if (task is not null) await task; } catch (OperationCanceledException) { /* expected */ } - _task = null; + cts?.Dispose(); Interlocked.Exchange(ref _tickInFlight, 0); } diff --git a/content/develop/use-cases/feature-store/dotnet/_index.md b/content/develop/use-cases/feature-store/dotnet/_index.md index 8d8aa55cd0..ccb56ae8b5 100644 --- a/content/develop/use-cases/feature-store/dotnet/_index.md +++ b/content/develop/use-cases/feature-store/dotnet/_index.md @@ -87,6 +87,73 @@ is doing right now (`last_login_ts`, `last_device_id`, `tx_count_5m`, background task. The HTTP handlers in `Program.cs` read any subset of those features through `FeatureStore`'s helper class. +## How it works + +There are three paths: a **batch path** that bulk-loads features once per +materialization cycle, a **streaming path** that updates real-time features +as events arrive, and an **inference path** that reads features on the +request side. + +### Batch path (per materialization cycle) + +1. The batch job calls `BuildFeatures.SynthesizeUsers(N, seed)` (in + production, the equivalent computation lives in an offline pipeline + against the warehouse). The result is + `Dictionary>` keyed by user + ID. +2. `store.BulkLoadAsync(rows, ttlSeconds)` queues one + [`HSET`]({{< relref "/commands/hset" >}}) plus one + [`EXPIRE`]({{< relref "/commands/expire" >}}) per user on an `IBatch`, + calls `batch.Execute()` to ship the whole thing in one round trip, then + `Task.WhenAll` waits for every per-command reply. + +### Streaming path (per event) + +When a user does something (login, transaction, page view) the streaming +layer computes whatever real-time signals fall out of that event and +calls `store.UpdateStreamingAsync(userId, fields, ttlSeconds)`. 
That queues: + +1. An [`HSET`]({{< relref "/commands/hset" >}}) writing the new field values. + Redis is single-threaded per shard, so this is atomic against any + concurrent batch write on the same hash — no version columns, no locks. +2. An [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) over exactly the + fields that were written, with the streaming TTL. Each streaming field + carries its own per-field expiry independent of the rest of the hash. + Stop the worker and these fields drop out one by one as their TTLs + elapse, while the batch fields remain populated under the longer + key-level TTL. + +### Inference path (per request) + +1. The model server picks the feature subset it needs (the schema is owned + by the model, not the store). +2. It calls `store.GetFeaturesAsync(userId, names)`, which is one + [`HMGET`]({{< relref "/commands/hmget" >}}). StackExchange.Redis returns + the values in the same order as the requested fields, with + `RedisValue.Null` for any field that doesn't exist (or has expired). +3. For batch inference, the model server calls + `store.BatchGetFeaturesAsync(userIds, names)`, which pipelines one + [`HMGET`]({{< relref "/commands/hmget" >}}) per user across all `N` + users in a single network round trip via `IBatch`. + +### Project layout + +The csproj sits at the project root with every C# source file next to it, +mirroring every other client demo in this use case: + +```text +feature-store/dotnet/ +├── FeatureStoreDemo.csproj +├── Program.cs — main() + ASP.NET Core minimal-API routes +├── FeatureStore.cs — FeatureStore class + EncodeValue + Stats record +├── BuildFeatures.cs — SynthesizeUsers + RunCliAsync +├── StreamingWorker.cs — background-task worker +└── HtmlTemplate.cs — inlined HTML page (C# 11 raw string literal) +``` + +Build and run with `dotnet run -c Release`. The `--mode build-features` +flag short-circuits to the CLI builder before the HTTP server starts up. 
+ ## The feature-store helper The `FeatureStore` class wraps the read/write paths @@ -270,7 +337,7 @@ TTL to each of those same fields. `HashFieldExpireAsync` returns one deleted the field instead of applying a TTL. * `ExpireResult.ConditionNotMet` (= `0`): an `NX | XX | GT | LT` conditional flag was specified and not met (we never use one here). -* `ExpireResult.NotExist` (= `-2`): no such field, or no such key. +* `ExpireResult.NoSuchField` (= `-2`): no such field, or no such key. We always follow `HSET` with `HEXPIRE` so any code other than `Success` means the per-field TTL invariant didn't hold — the helper throws an @@ -543,13 +610,25 @@ The guidance below focuses on the production concerns specific to running a feature store on Redis. For the generic StackExchange.Redis production checklist — [`ConfigurationOptions`]({{< relref "/develop/clients/dotnet/connect" >}}) -tuning, AUTH/ACL, retry/backoff, error handling — see the -[client guide]({{< relref "/develop/clients/dotnet" >}}). For TLS -specifically, follow the +tuning, AUTH/ACL, retry/backoff, multiplexer lifetime, and exception +handling — see the +[StackExchange.Redis production usage guide]({{< relref "/develop/clients/dotnet/produsage" >}}). +For TLS specifically, follow the [connect-with-TLS recipe]({{< relref "/develop/clients/dotnet/connect#connect-to-your-production-redis-with-tls" >}}). The feature-store demo runs against `localhost` with the defaults; a real deployment should harden the client first. +### Adopting the helper outside ASP.NET Core + +`FeatureStore.cs` omits `.ConfigureAwait(false)` on its `await` calls +because ASP.NET Core 8 has no synchronization context — every `await` +resumes on a thread-pool thread, so the flag is a no-op and just +clutters the source. 
If you copy the helper into a context that *does* +have a synchronization context (a Windows Forms or WPF app, classic +ASP.NET, a Xamarin or MAUI UI thread, or a library that needs to play +nicely with any consumer) add `.ConfigureAwait(false)` after every +`await` to avoid deadlocking the UI thread on the resumption. + ### Pick the batch TTL to outlast a failed refresher The whole-entity `EXPIRE` is your safety net against silent staleness diff --git a/content/develop/use-cases/feature-store/rust/Cargo.toml b/content/develop/use-cases/feature-store/rust/Cargo.toml index 4ea9e05835..b017cfaf14 100644 --- a/content/develop/use-cases/feature-store/rust/Cargo.toml +++ b/content/develop/use-cases/feature-store/rust/Cargo.toml @@ -23,6 +23,12 @@ path = "build_features_bin.rs" redis = { version = "0.27", features = ["tokio-comp", "aio", "connection-manager"] } tokio = { version = "1", features = ["full"] } axum = "0.7" +# axum-extra ships an alternative Form extractor that uses +# `serde_html_form` under the hood; unlike axum's default Form (backed +# by serde_urlencoded) it deserializes repeated keys into `Vec`, +# which is what we need to accept the demo's `field=name1&field=name2` +# request bodies. +axum-extra = { version = "0.9", features = ["form"] } serde = { version = "1.0", features = ["derive"] } serde_json = "1.0" rand = "0.8" diff --git a/content/develop/use-cases/feature-store/rust/_index.md b/content/develop/use-cases/feature-store/rust/_index.md index e3e1ef15fc..ce3fa2b216 100644 --- a/content/develop/use-cases/feature-store/rust/_index.md +++ b/content/develop/use-cases/feature-store/rust/_index.md @@ -83,6 +83,56 @@ right now (`last_login_ts`, `last_device_id`, `tx_count_5m`, Streams job. The HTTP handlers in `demo_server.rs` read any subset of those features through `feature_store.rs`'s helper struct. 
+## How it works + +There are three paths: a **batch path** that bulk-loads features once per +materialization cycle, a **streaming path** that updates real-time features +as events arrive, and an **inference path** that reads features on the +request side. + +### Batch path (per materialization cycle) + +1. The batch job calls `synthesize_users(N, seed)` (in production, the + equivalent computation lives in an offline pipeline against the + warehouse). The result is a `Vec<(String, FeatureMap)>` for every user + in this cycle. +2. `store.bulk_load(&rows, ttl_seconds).await` queues one + [`HSET`]({{< relref "/commands/hset" >}}) plus one + [`EXPIRE`]({{< relref "/commands/expire" >}}) per user through a + non-transactional `redis::pipe()`, so the whole batch ships in a single + round trip. + +### Streaming path (per event) + +When a user does something (login, transaction, page view) the streaming +layer computes whatever real-time signals fall out of that event and +calls `store.update_streaming(user_id, &fields, ttl_seconds).await`. That +queues: + +1. An [`HSET`]({{< relref "/commands/hset" >}}) writing the new field + values. Redis is single-threaded per shard, so this is atomic against + any concurrent batch write on the same hash — no version columns, no + locks. +2. An [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) over exactly the + fields that were written, with the streaming TTL. Each streaming field + carries its own per-field expiry independent of the rest of the hash. + Stop the worker and these fields drop out one by one as their TTLs + elapse, while the batch fields remain populated under the longer + key-level TTL. + +### Inference path (per request) + +1. The model server picks the feature subset it needs (the schema is + owned by the model, not the store). +2. It calls `store.get_features(user_id, &names).await`, which is one + [`HMGET`]({{< relref "/commands/hmget" >}}). 
Redis returns the values + in the same order as the requested fields, with `None` for any field + that doesn't exist (or has expired). +3. For batch inference, the model server calls + `store.batch_get_features(&user_ids, &names).await`, which pipelines + one [`HMGET`]({{< relref "/commands/hmget" >}}) per user across all + `N` users in a single network round trip via `redis::pipe()`. + ## The feature-store helper The `FeatureStore` struct wraps the read/write paths diff --git a/content/develop/use-cases/feature-store/rust/build_features.rs b/content/develop/use-cases/feature-store/rust/build_features.rs index e6bdf5b637..c42956d967 100644 --- a/content/develop/use-cases/feature-store/rust/build_features.rs +++ b/content/develop/use-cases/feature-store/rust/build_features.rs @@ -78,14 +78,20 @@ pub async fn cli_main() -> redis::RedisResult<()> { let mut seed: u64 = 42; let args: Vec = std::env::args().skip(1).collect(); + let need_value = |flag: &str, idx: usize| -> &String { + args.get(idx + 1).unwrap_or_else(|| { + eprintln!("Missing value for {flag}"); + std::process::exit(2); + }) + }; let mut i = 0usize; while i < args.len() { match args[i].as_str() { - "--redis-url" => { redis_url = args[i + 1].clone(); i += 2; } - "--count" => { count = args[i + 1].parse().unwrap_or(count); i += 2; } - "--ttl-seconds" => { ttl_seconds = args[i + 1].parse().unwrap_or(ttl_seconds); i += 2; } - "--key-prefix" => { key_prefix = args[i + 1].clone(); i += 2; } - "--seed" => { seed = args[i + 1].parse().unwrap_or(seed); i += 2; } + "--redis-url" => { redis_url = need_value("--redis-url", i).clone(); i += 2; } + "--count" => { count = need_value("--count", i).parse().unwrap_or(count); i += 2; } + "--ttl-seconds" => { ttl_seconds = need_value("--ttl-seconds", i).parse().unwrap_or(ttl_seconds); i += 2; } + "--key-prefix" => { key_prefix = need_value("--key-prefix", i).clone(); i += 2; } + "--seed" => { seed = need_value("--seed", i).parse().unwrap_or(seed); i += 2; } "-h" | "--help" 
=> { println!("Usage: build_features [--redis-url URL] [--count N] [--ttl-seconds S] [--key-prefix PREFIX] [--seed N]"); return Ok(()); diff --git a/content/develop/use-cases/feature-store/rust/demo_server.rs b/content/develop/use-cases/feature-store/rust/demo_server.rs index 9d6e625a30..6b2bcf5900 100644 --- a/content/develop/use-cases/feature-store/rust/demo_server.rs +++ b/content/develop/use-cases/feature-store/rust/demo_server.rs @@ -13,13 +13,17 @@ use std::sync::Arc; use std::time::{Duration, Instant}; use axum::{ - body::Bytes, extract::{Query, State}, http::StatusCode, response::{Html, IntoResponse, Json}, routing::{get, post}, Router, }; +// `axum-extra`'s Form extractor wraps `serde_html_form`, which (unlike +// axum's default `Form`/`serde_urlencoded`) keeps every value when a +// form key repeats. That's what lets `field=a&field=b` deserialize as +// `Vec` in `ReadForm` and `BatchReadForm` below. +use axum_extra::extract::Form; use serde::{Deserialize, Serialize}; use tokio::sync::Mutex; @@ -73,17 +77,23 @@ impl Default for Args { fn parse_args() -> Args { let mut a = Args::default(); let argv: Vec = std::env::args().skip(1).collect(); + let need_value = |flag: &str, idx: usize| -> &String { + argv.get(idx + 1).unwrap_or_else(|| { + eprintln!("Missing value for {flag}"); + std::process::exit(2); + }) + }; let mut i = 0usize; while i < argv.len() { match argv[i].as_str() { - "--host" => { a.host = argv[i + 1].clone(); i += 2; } - "--port" => { a.port = argv[i + 1].parse().unwrap_or(a.port); i += 2; } - "--redis-url" => { a.redis_url = argv[i + 1].clone(); i += 2; } - "--key-prefix" => { a.key_prefix = argv[i + 1].clone(); i += 2; } - "--batch-ttl-seconds" => { a.batch_ttl_seconds = argv[i + 1].parse().unwrap_or(a.batch_ttl_seconds); i += 2; } - "--streaming-ttl-seconds" => { a.streaming_ttl_seconds = argv[i + 1].parse().unwrap_or(a.streaming_ttl_seconds); i += 2; } - "--users-per-tick" => { a.users_per_tick = argv[i + 
1].parse().unwrap_or(a.users_per_tick); i += 2; } - "--seed-users" => { a.seed_users = argv[i + 1].parse().unwrap_or(a.seed_users); i += 2; } + "--host" => { a.host = need_value("--host", i).clone(); i += 2; } + "--port" => { a.port = need_value("--port", i).parse().unwrap_or(a.port); i += 2; } + "--redis-url" => { a.redis_url = need_value("--redis-url", i).clone(); i += 2; } + "--key-prefix" => { a.key_prefix = need_value("--key-prefix", i).clone(); i += 2; } + "--batch-ttl-seconds" => { a.batch_ttl_seconds = need_value("--batch-ttl-seconds", i).parse().unwrap_or(a.batch_ttl_seconds); i += 2; } + "--streaming-ttl-seconds" => { a.streaming_ttl_seconds = need_value("--streaming-ttl-seconds", i).parse().unwrap_or(a.streaming_ttl_seconds); i += 2; } + "--users-per-tick" => { a.users_per_tick = need_value("--users-per-tick", i).parse().unwrap_or(a.users_per_tick); i += 2; } + "--seed-users" => { a.seed_users = need_value("--seed-users", i).parse().unwrap_or(a.seed_users); i += 2; } "--no-reset" => { a.reset_on_start = false; i += 1; } "-h" | "--help" => { println!("Usage: demo_server [--host H] [--port P] [--redis-url URL] [--key-prefix PFX] [--batch-ttl-seconds S] [--streaming-ttl-seconds S] [--users-per-tick N] [--seed-users N] [--no-reset]"); @@ -250,14 +260,16 @@ async fn inspect( }))) } +#[derive(Deserialize)] +struct BulkLoadForm { count: Option, ttl: Option } + async fn bulk_load( State(state): State, - body: Bytes, + Form(form): Form, ) -> impl IntoResponse { let _guard = state.demo_lock.lock().await; - let form = parse_form_multi(&body); - let count = clamp(parse_int(form.get("count").and_then(|v| v.first()), 200), 1, 2000) as usize; - let ttl = clamp(parse_int(form.get("ttl").and_then(|v| v.first()), 86400), 5, 172_800) as u64; + let count = clamp(form.count.unwrap_or(200), 1, 2000) as usize; + let ttl = clamp(form.ttl.unwrap_or(86400), 5, 172_800) as u64; let rows = synthesize_users(count, state.seed); let start = Instant::now(); let loaded = match 
state.store.bulk_load(&rows, ttl).await { @@ -311,24 +323,24 @@ async fn toggle_worker(State(state): State) -> impl IntoResponse { })) } +#[derive(Deserialize)] +struct ReadForm { + user: Option, + #[serde(default)] + field: Vec, +} + async fn read( State(state): State, - body: Bytes, + Form(form): Form, ) -> impl IntoResponse { - let form = parse_form_multi(&body); - let user = form - .get("user") - .and_then(|v| v.first()) - .cloned() - .unwrap_or_default(); + let user = form.user.unwrap_or_default(); if user.is_empty() { return (StatusCode::BAD_REQUEST, Json(serde_json::json!({"error": "user is required"}))); } let fields: Vec = form - .get("field") - .cloned() - .unwrap_or_default() + .field .into_iter() .filter(|f| !f.is_empty()) .collect(); @@ -356,16 +368,20 @@ async fn read( }))) } +#[derive(Deserialize)] +struct BatchReadForm { + count: Option, + #[serde(default)] + field: Vec, +} + async fn batch_read( State(state): State, - body: Bytes, + Form(form): Form, ) -> impl IntoResponse { - let form = parse_form_multi(&body); - let count = clamp(parse_int(form.get("count").and_then(|v| v.first()), 100), 1, 500) as usize; + let count = clamp(form.count.unwrap_or(100), 1, 500) as usize; let mut fields: Vec = form - .get("field") - .cloned() - .unwrap_or_default() + .field .into_iter() .filter(|f| !f.is_empty()) .collect(); @@ -408,50 +424,6 @@ fn clamp(v: i64, lo: i64, hi: i64) -> i64 { v.max(lo).min(hi) } -fn parse_int(s: Option<&String>, def: i64) -> i64 { - s.and_then(|v| v.parse::().ok()).unwrap_or(def) -} - -/// Parse an `application/x-www-form-urlencoded` body into a -/// multi-value map. `axum::Form` uses `serde_urlencoded` under the -/// hood, which silently drops repeated keys — we need every -/// `field=` entry, so we parse the body manually. 
-fn parse_form_multi(body: &Bytes) -> std::collections::HashMap> { - let mut out: std::collections::HashMap> = std::collections::HashMap::new(); - let s = std::str::from_utf8(body).unwrap_or(""); - if s.is_empty() { return out; } - for pair in s.split('&') { - let (k, v) = match pair.split_once('=') { - Some((k, v)) => (urldecode(k), urldecode(v)), - None => (urldecode(pair), String::new()), - }; - out.entry(k).or_default().push(v); - } - out -} - -/// Tiny URL decoder for `+` (space) and `%XX` (hex byte). UTF-8 safe -/// because we buffer bytes and convert at the end. -fn urldecode(s: &str) -> String { - let bytes = s.as_bytes(); - let mut out: Vec = Vec::with_capacity(bytes.len()); - let mut i = 0; - while i < bytes.len() { - match bytes[i] { - b'+' => { out.push(b' '); i += 1; } - b'%' if i + 2 < bytes.len() => { - if let Ok(byte) = u8::from_str_radix(&s[i + 1..i + 3], 16) { - out.push(byte); i += 3; - } else { - out.push(bytes[i]); i += 1; - } - } - c => { out.push(c); i += 1; } - } - } - String::from_utf8_lossy(&out).into_owned() -} - fn render_html_page(key_prefix: &str, streaming_ttl: u64, users_per_tick: usize) -> String { let batch_json = serde_json::to_string(&DEFAULT_BATCH_FIELDS).unwrap(); let stream_json = serde_json::to_string(&DEFAULT_STREAMING_FIELDS).unwrap(); diff --git a/content/develop/use-cases/feature-store/rust/feature_store.rs b/content/develop/use-cases/feature-store/rust/feature_store.rs index 4e63c10226..d138725880 100644 --- a/content/develop/use-cases/feature-store/rust/feature_store.rs +++ b/content/develop/use-cases/feature-store/rust/feature_store.rs @@ -9,7 +9,6 @@ use std::collections::BTreeMap; use std::sync::atomic::{AtomicI64, Ordering}; use std::sync::Arc; -use std::time::Duration; use redis::aio::ConnectionManager; use redis::{AsyncCommands, FromRedisValue, RedisResult, Value}; @@ -41,11 +40,6 @@ pub const DEFAULT_BATCH_TTL_SECONDS: u64 = 24 * 60 * 60; pub const DEFAULT_STREAMING_TTL_SECONDS: u64 = 5 * 60; pub const 
DEFAULT_KEY_PREFIX: &str = "fs:user:"; -/// Hard cap on how long a single redis call can block before the -/// `tokio::time::timeout` wrapper bails out. Stops a stuck server from -/// hanging the demo's web UI indefinitely. -pub const REQUEST_TIMEOUT: Duration = Duration::from_secs(10); - /// One feature value the helper knows how to encode. #[derive(Debug, Clone)] pub enum FeatureValue { From 5693f278d84126b43fc6a161b1f6f97a57cc31f5 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 12:01:30 +0100 Subject: [PATCH 10/20] DOC-6661 fully reviewed Ruby and PHP examples --- .gitignore | 5 + .../develop/use-cases/feature-store/_index.md | 2 + .../feature-store/php/BuildFeatures.php | 69 ++ .../feature-store/php/FeatureStore.php | 386 +++++++++ .../feature-store/php/StreamingWorker.php | 147 ++++ .../use-cases/feature-store/php/_index.md | 743 ++++++++++++++++++ .../feature-store/php/build_features.php | 48 ++ .../use-cases/feature-store/php/composer.json | 18 + .../feature-store/php/demo_server.php | 451 +++++++++++ .../feature-store/php/demo_template.html | 342 ++++++++ .../feature-store/php/streaming_worker.php | 27 + .../use-cases/feature-store/ruby/Gemfile | 11 + .../use-cases/feature-store/ruby/_index.md | 663 ++++++++++++++++ .../feature-store/ruby/build_features.rb | 89 +++ .../feature-store/ruby/demo_server.rb | 665 ++++++++++++++++ .../feature-store/ruby/feature_store.rb | 294 +++++++ .../feature-store/ruby/streaming_worker.rb | 181 +++++ 17 files changed, 4141 insertions(+) create mode 100644 content/develop/use-cases/feature-store/php/BuildFeatures.php create mode 100644 content/develop/use-cases/feature-store/php/FeatureStore.php create mode 100644 content/develop/use-cases/feature-store/php/StreamingWorker.php create mode 100644 content/develop/use-cases/feature-store/php/_index.md create mode 100644 content/develop/use-cases/feature-store/php/build_features.php create mode 100644 content/develop/use-cases/feature-store/php/composer.json create 
mode 100644 content/develop/use-cases/feature-store/php/demo_server.php create mode 100644 content/develop/use-cases/feature-store/php/demo_template.html create mode 100644 content/develop/use-cases/feature-store/php/streaming_worker.php create mode 100644 content/develop/use-cases/feature-store/ruby/Gemfile create mode 100644 content/develop/use-cases/feature-store/ruby/_index.md create mode 100644 content/develop/use-cases/feature-store/ruby/build_features.rb create mode 100644 content/develop/use-cases/feature-store/ruby/demo_server.rb create mode 100644 content/develop/use-cases/feature-store/ruby/feature_store.rb create mode 100644 content/develop/use-cases/feature-store/ruby/streaming_worker.rb diff --git a/.gitignore b/.gitignore index 5c4cc19956..3897a97956 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,8 @@ package-lock.json # .NET build output for the docs demos /content/develop/use-cases/**/dotnet/bin/ /content/develop/use-cases/**/dotnet/obj/ +# PHP and Ruby build output for the docs demos +/content/develop/use-cases/**/php/vendor/ +/content/develop/use-cases/**/php/composer.lock +/content/develop/use-cases/**/ruby/.gems/ +/content/develop/use-cases/**/ruby/Gemfile.lock diff --git a/content/develop/use-cases/feature-store/_index.md b/content/develop/use-cases/feature-store/_index.md index 4f65e0d2b1..e1cdcd54ab 100644 --- a/content/develop/use-cases/feature-store/_index.md +++ b/content/develop/use-cases/feature-store/_index.md @@ -162,3 +162,5 @@ for a single user under 1 ms, and pipeline batch reads across a hundred users. 
* [Lettuce (Java)]({{< relref "/develop/use-cases/feature-store/java-lettuce" >}}) * [redis-rs (Rust)]({{< relref "/develop/use-cases/feature-store/rust" >}}) * [StackExchange.Redis (C#)]({{< relref "/develop/use-cases/feature-store/dotnet" >}}) +* [Predis (PHP)]({{< relref "/develop/use-cases/feature-store/php" >}}) +* [redis-rb (Ruby)]({{< relref "/develop/use-cases/feature-store/ruby" >}}) diff --git a/content/develop/use-cases/feature-store/php/BuildFeatures.php b/content/develop/use-cases/feature-store/php/BuildFeatures.php new file mode 100644 index 0000000000..c75cce2e8c --- /dev/null +++ b/content/develop/use-cases/feature-store/php/BuildFeatures.php @@ -0,0 +1,69 @@ +> + */ + public static function synthesizeUsers(int $count, int $seed = 42): array + { + mt_srand($seed); + $users = []; + for ($i = 1; $i <= $count; $i++) { + $uid = sprintf('u%04d', $i); + $users[$uid] = [ + 'country_iso' => self::COUNTRY_CHOICES[mt_rand(0, count(self::COUNTRY_CHOICES) - 1)], + 'risk_segment' => self::weightedStr(self::RISK_SEGMENTS, self::RISK_WEIGHTS), + 'account_age_days' => mt_rand(7, 2400), + 'tx_count_7d' => mt_rand(0, 80), + 'avg_amount_30d' => round(5.0 + mt_rand() / mt_getrandmax() * 345.0, 2), + 'chargeback_count_180d' => self::weightedInt(self::CHARGEBACK_BUCKETS, self::CHARGEBACK_WEIGHTS), + ]; + } + return $users; + } + + private static function weightedStr(array $items, array $weights): string + { + $total = array_sum($weights); + $r = mt_rand(0, $total - 1); + foreach ($items as $i => $item) { + $r -= $weights[$i]; + if ($r < 0) return $item; + } + return $items[count($items) - 1]; + } + + private static function weightedInt(array $items, array $weights): int + { + $total = array_sum($weights); + $r = mt_rand(0, $total - 1); + foreach ($items as $i => $item) { + $r -= $weights[$i]; + if ($r < 0) return $item; + } + return $items[count($items) - 1]; + } +} diff --git a/content/develop/use-cases/feature-store/php/FeatureStore.php 
b/content/develop/use-cases/feature-store/php/FeatureStore.php new file mode 100644 index 0000000000..356d0e0f03 --- /dev/null +++ b/content/develop/use-cases/feature-store/php/FeatureStore.php @@ -0,0 +1,386 @@ +redis = $redis; + $this->keyPrefix = $keyPrefix; + $this->batchTtlSeconds = $batchTtlSeconds; + $this->streamingTtlSeconds = $streamingTtlSeconds; + } + + public function keyFor(string $entityId): string + { + return $this->keyPrefix . $entityId; + } + + // ------------------------------------------------------------------ + // Batch ingestion (materialization) + // ------------------------------------------------------------------ + + /** + * Materialize a batch of entities into Redis. + * + * One `HSET` plus one `EXPIRE` per entity, all queued through a + * non-transactional pipeline so the whole batch ships in a single + * round trip. + * + * @param array> $rows + */ + public function bulkLoad(array $rows, ?int $ttlSeconds = null): int + { + if (count($rows) === 0) return 0; + $ttl = $ttlSeconds ?? $this->batchTtlSeconds; + $this->redis->pipeline(function ($pipe) use ($rows, $ttl) { + foreach ($rows as $entityId => $fields) { + $key = $this->keyFor((string)$entityId); + // Predis 3's `hset` accepts variadic field/value + // pairs (key, f1, v1, f2, v2, ...) but not a + // single field=>value map argument the way Predis + // 2 did — flatten the encoded map into that shape. 
+ $flat = []; + foreach ($fields as $name => $value) { + $flat[] = $name; + $flat[] = self::encodeValue($value); + } + $pipe->hset($key, ...$flat); + $pipe->expire($key, $ttl); + } + }); + $this->incrStat('batch_writes_total', count($rows)); + return count($rows); + } + + public function updateBatchFeature(string $entityId, string $field, mixed $value): void + { + $this->redis->hset($this->keyFor($entityId), $field, self::encodeValue($value)); + $this->incrStat('batch_writes_total', 1); + } + + // ------------------------------------------------------------------ + // Streaming ingestion + // ------------------------------------------------------------------ + + /** + * Write streaming features with a per-field TTL. + * + * `HSET` and `HEXPIRE` are queued in the same pipeline so Redis + * runs them in order: the `HSET` first creates or overwrites the + * fields, then `HEXPIRE` attaches a TTL to each of those same + * fields. + * + * `HEXPIRE` returns one status code per field: + * * 1 = TTL set / updated. + * * 2 = the expiry was 0 or in the past (so Redis deleted the + * field instead of applying a TTL). + * * 0 = an NX | XX | GT | LT conditional flag was specified and + * not met (we never use one here). + * * -2 = no such field, or no such key. + * We always follow `HSET` with `HEXPIRE` so any code other than 1 + * means the per-field TTL invariant didn't hold -- throw rather + * than silently leave a streaming field with no expiry attached. + * + * @param array $fields + */ + public function updateStreaming(string $entityId, array $fields, ?int $ttlSeconds = null): void + { + if (count($fields) === 0) return; + $ttl = $ttlSeconds ?? 
$this->streamingTtlSeconds; + $key = $this->keyFor($entityId); + $flat = []; + $names = []; + foreach ($fields as $name => $value) { + $names[] = $name; + $flat[] = $name; + $flat[] = self::encodeValue($value); + } + + $results = $this->redis->pipeline(function ($pipe) use ($key, $flat, $names, $ttl) { + // Predis 3 hset wants variadic field/value pairs, not a + // single array map; spread the flattened list. + $pipe->hset($key, ...$flat); + $pipe->hexpire($key, $ttl, $names); + }); + // $results[0] = HSET reply (count of new fields set) + // $results[1] = HEXPIRE reply (array of per-field codes) + $codes = $results[1] ?? []; + foreach ($codes as $code) { + if ((int)$code !== 1) { + throw new RuntimeException( + "HEXPIRE did not set every field TTL for {$key}: " . json_encode($codes) + ); + } + } + $this->incrStat('streaming_writes_total', count($fields)); + } + + // ------------------------------------------------------------------ + // Inference reads + // ------------------------------------------------------------------ + + /** + * Retrieve a subset of features for one entity with `HMGET`. + * Pass `$fieldNames=null` to fetch the entire hash with `HGETALL` -- + * useful for debugging but rarely the right call on the request + * path. + * + * @return array + */ + public function getFeatures(string $entityId, ?array $fieldNames): array + { + $key = $this->keyFor($entityId); + if ($fieldNames === null) { + $data = $this->redis->hgetall($key); + $this->incrStat('reads_total', 1); + $this->incrStat('read_fields_total', count($data)); + return $data; + } + if (count($fieldNames) === 0) return []; + $values = $this->redis->hmget($key, $fieldNames); + $out = []; + foreach ($fieldNames as $i => $n) { + if ($values[$i] !== null) $out[$n] = (string)$values[$i]; + } + $this->incrStat('reads_total', 1); + $this->incrStat('read_fields_total', count($out)); + return $out; + } + + /** + * Pipeline `HMGET` across many entities for batch scoring. 
One + * round trip for the whole batch. + * + * @return array> + */ + public function batchGetFeatures(array $entityIds, array $fieldNames): array + { + if (count($entityIds) === 0 || count($fieldNames) === 0) return []; + $rows = $this->redis->pipeline(function ($pipe) use ($entityIds, $fieldNames) { + foreach ($entityIds as $id) { + $pipe->hmget($this->keyFor($id), $fieldNames); + } + }); + $out = []; + $seen = 0; + foreach ($entityIds as $i => $id) { + $values = $rows[$i] ?? []; + $row = []; + foreach ($fieldNames as $j => $n) { + if (($values[$j] ?? null) !== null) { + $row[$n] = (string)$values[$j]; + $seen++; + } + } + $out[$id] = $row; + } + $this->incrStat('reads_total', count($entityIds)); + $this->incrStat('read_fields_total', $seen); + return $out; + } + + // ------------------------------------------------------------------ + // TTL inspection (used by the demo UI) + // ------------------------------------------------------------------ + + public function keyTtlSeconds(string $entityId): int + { + return (int)$this->redis->ttl($this->keyFor($entityId)); + } + + /** + * Per-field TTL via `HTTL` (Redis 7.4+). Each value mirrors the + * `TTL` convention: positive seconds remaining, `-1` no field TTL, + * `-2` field (or key) missing. + * + * @return array + */ + public function fieldTtlsSeconds(string $entityId, array $fieldNames): array + { + if (count($fieldNames) === 0) return []; + $codes = $this->redis->httl($this->keyFor($entityId), $fieldNames); + // HTTL on a missing key returns a flat array of -2s. No + // defensive shim needed for this client. + $out = []; + foreach ($fieldNames as $i => $n) { + $out[$n] = isset($codes[$i]) ? 
(int)$codes[$i] : -2; + } + return $out; + } + + // ------------------------------------------------------------------ + // Demo housekeeping + // ------------------------------------------------------------------ + + public function listEntityIds(int $limit = 200): array + { + $ids = []; + $cursor = '0'; + $prefixLen = strlen($this->keyPrefix); + do { + [$cursor, $keys] = $this->redis->scan( + $cursor, + ['MATCH' => $this->keyPrefix . '*', 'COUNT' => 200], + ); + foreach ($keys as $k) { + if (strlen($k) > $prefixLen) { + $ids[] = substr($k, $prefixLen); + if (count($ids) >= $limit) { sort($ids); return $ids; } + } + } + } while ($cursor !== '0'); + sort($ids); + return $ids; + } + + public function countEntities(): int + { + $n = 0; + $cursor = '0'; + do { + [$cursor, $keys] = $this->redis->scan( + $cursor, + ['MATCH' => $this->keyPrefix . '*', 'COUNT' => 500], + ); + $n += count($keys); + } while ($cursor !== '0'); + return $n; + } + + public function deleteEntity(string $entityId): int + { + return (int)$this->redis->del($this->keyFor($entityId)); + } + + /** + * Drop every entity under the key prefix. Used by the demo reset + * path; scans in batches and issues one variadic `DEL` per batch. + */ + public function reset(): int + { + $deleted = 0; + $cursor = '0'; + $batch = []; + do { + [$cursor, $keys] = $this->redis->scan( + $cursor, + ['MATCH' => $this->keyPrefix . '*', 'COUNT' => 500], + ); + foreach ($keys as $k) { + $batch[] = $k; + if (count($batch) >= 500) { + $deleted += (int)$this->redis->del(...$batch); + $batch = []; + } + } + } while ($cursor !== '0'); + if (count($batch) > 0) { + $deleted += (int)$this->redis->del(...$batch); + } + return $deleted; + } + + // ------------------------------------------------------------------ + // Stats — kept in Redis under `fs:stats:*` so the demo server and + // the streaming worker (separate OS processes under `php -S`) can + // both increment and read them. 
+ // ------------------------------------------------------------------ + + public function statsSnapshot(): array + { + return [ + 'batch_writes_total' => (int)$this->redis->get('fs:stats:batch_writes_total'), + 'streaming_writes_total' => (int)$this->redis->get('fs:stats:streaming_writes_total'), + 'reads_total' => (int)$this->redis->get('fs:stats:reads_total'), + 'read_fields_total' => (int)$this->redis->get('fs:stats:read_fields_total'), + ]; + } + + public function resetStats(): void + { + $this->redis->del(...[ + 'fs:stats:batch_writes_total', + 'fs:stats:streaming_writes_total', + 'fs:stats:reads_total', + 'fs:stats:read_fields_total', + ]); + } + + private function incrStat(string $name, int $by): void + { + if ($by <= 0) return; + $this->redis->incrby("fs:stats:{$name}", $by); + } + + /** + * Render a feature value as a string for hash storage. Booleans + * become `"true"`/`"false"` so they round-trip cleanly through + * other clients and redis-cli. + */ + public static function encodeValue(mixed $value): string + { + if ($value === null) return ''; + if (is_bool($value)) return $value ? 'true' : 'false'; + return (string)$value; + } +} diff --git a/content/develop/use-cases/feature-store/php/StreamingWorker.php b/content/develop/use-cases/feature-store/php/StreamingWorker.php new file mode 100644 index 0000000000..116fcbde15 --- /dev/null +++ b/content/develop/use-cases/feature-store/php/StreamingWorker.php @@ -0,0 +1,147 @@ +redis->set('fs:control:worker_pid', (string)getmypid()); + $this->redis->set('fs:control:running', '1'); + + // Trap SIGTERM / SIGINT so a `kill <pid>` from the demo + // server's shutdown path clears the Redis state instead of + // leaving stale `running`/`paused` flags behind. 
+ if (function_exists('pcntl_signal')) { + pcntl_async_signals(true); + $shutdown = function () { + $this->redis->set('fs:control:stop', '1'); + }; + pcntl_signal(SIGTERM, $shutdown); + pcntl_signal(SIGINT, $shutdown); + } + + try { + while (true) { + if ($this->redis->get('fs:control:stop') === '1') break; + $this->microsleep($this->tickSeconds); + if ($this->redis->get('fs:control:stop') === '1') break; + + // Set tick_in_flight *before* the pause check so a + // concurrent pause + wait_for_idle (reset path) can + // never observe tick_in_flight=0 in the window + // between the pause check and the actual tick. The + // finally block clears the flag whether we paused, + // succeeded, or threw. + $this->redis->set('fs:control:tick_in_flight', '1'); + try { + if ($this->redis->get('fs:control:paused') !== '1') { + $this->doTick(); + } + } catch (\Throwable $e) { + fwrite(STDERR, "[streaming-worker] tick failed: " . $e->getMessage() . "\n"); + } finally { + $this->redis->set('fs:control:tick_in_flight', '0'); + } + } + } finally { + // Clear running, tick_in_flight, and stop no matter how + // the loop exits so a later restart can spin a fresh + // worker with a clean slate. 
+ $this->redis->del(...[ + 'fs:control:running', + 'fs:control:tick_in_flight', + 'fs:control:worker_pid', + 'fs:control:stop', + ]); + } + } + + private function doTick(): void + { + $ids = $this->store->listEntityIds(500); + if (count($ids) === 0) return; + $picks = $this->sample($ids, $this->usersPerTick); + $nowMs = (int)(microtime(true) * 1000); + $writes = 0; + foreach ($picks as $id) { + $fields = [ + 'last_login_ts' => $nowMs, + 'last_device_id' => self::DEVICE_IDS[array_rand(self::DEVICE_IDS)], + 'tx_count_5m' => random_int(0, 12), + 'failed_logins_15m' => $this->weighted(self::FAILED_LOGIN_BUCKETS, self::FAILED_LOGIN_WEIGHTS), + 'session_country' => self::SESSION_COUNTRIES[array_rand(self::SESSION_COUNTRIES)], + ]; + $this->store->updateStreaming($id, $fields); + $writes += count($fields); + } + $this->redis->incrby('fs:control:tick_count', 1); + $this->redis->incrby('fs:control:writes_count', $writes); + } + + private function sample(array $items, int $k): array + { + $n = min($k, count($items)); + if ($n === 0) return []; + $keys = array_rand($items, $n); + if (!is_array($keys)) $keys = [$keys]; + $out = []; + foreach ($keys as $key) $out[] = $items[$key]; + return $out; + } + + private function weighted(array $items, array $weights): int + { + $total = array_sum($weights); + $r = random_int(0, $total - 1); + foreach ($items as $i => $item) { + $r -= $weights[$i]; + if ($r < 0) return $item; + } + return $items[count($items) - 1]; + } + + private function microsleep(float $seconds): void + { + usleep((int)($seconds * 1_000_000)); + } +} diff --git a/content/develop/use-cases/feature-store/php/_index.md b/content/develop/use-cases/feature-store/php/_index.md new file mode 100644 index 0000000000..7adee426ae --- /dev/null +++ b/content/develop/use-cases/feature-store/php/_index.md @@ -0,0 +1,743 @@ +--- +categories: +- docs +- develop +- stack +- oss +- rs +- rc +description: Build a Redis-backed online feature store in PHP with Predis +linkTitle: Predis 
example (PHP) +title: Redis feature store with Predis +weight: 9 +--- + +This guide shows you how to build a small Redis-backed online feature store +in PHP with [Predis]({{< relref "/develop/clients/php" >}}). The demo runs +on top of PHP's built-in development server (`php -S`) and uses a detached +CLI process for the streaming worker, so you can bulk-load a batch of users +with a key-level TTL, watch real-time features expire per-field via +`HEXPIRE`, retrieve any subset of features for one user under 2 ms, and +pipeline `HMGET` across a hundred users for batch scoring. + +## Overview + +Each entity (here, a user) is one Redis +[Hash]({{< relref "/develop/data-types/hashes" >}}) at a deterministic key — +`fs:user:{id}`. The hash holds every feature for that entity as one field per +feature: batch-materialized aggregates (refreshed once a day) alongside +streaming-updated signals (refreshed every few seconds). One +[`HMGET`]({{< relref "/commands/hmget" >}}) returns whichever subset the +model needs in one network round trip. + +Two TTL layers solve the *mixed staleness* problem without an +application-side cleaner: + +* A **key-level** [`EXPIRE`]({{< relref "/commands/expire" >}}) aligned with + the batch materialization cycle (24 hours in the demo). If the batch + refresher fails, the whole entity disappears at the next cycle and + inference sees a missing entity — which the model handler can detect and + fall back on — rather than silently outdated values. +* A **per-field** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) (Redis 7.4+) + on each streaming feature gives that field its own shorter expiry, + independent of the rest of the hash. If the streaming pipeline stops + updating a feature, the field self-cleans while the batch fields stay + populated. + +That gives you: + +* A single round trip for retrieval — any subset of features for one entity + in one [`HMGET`]({{< relref "/commands/hmget" >}}). +* Sub-millisecond hot path. 
The Redis-side work is microseconds; in + practice the bottleneck is the network round trip plus the model's own + feature-prep. +* Pipelined batch scoring — one round trip for `N` users at once. +* Independent freshness per feature, expressed as a server-side TTL rather + than as application logic. +* Self-cleanup on pipeline failure: a stalled batch refresher lets entities + expire on schedule, and a stalled streaming worker lets each affected + field expire on its own timer. + +## How PHP's request model shapes the demo + +PHP's hosting model is different from every other client in this use case. +`php -S` gives each request a fresh PHP execution context, so a long-lived +streaming worker can't live inside the demo router the way it does in +Python, Node.js, Go, or Java. The demo handles this by spawning the +streaming worker as a **separate, detached CLI process** the first time +the demo server is hit. The router and the worker then share state through +Redis itself: + +* `fs:control:worker_pid` — PID of the running worker. The router checks + it on every request and respawns the worker if the PID is no longer + alive. +* `fs:control:paused` — `1` while paused, `0` otherwise. The worker polls + this between ticks. +* `fs:control:tick_in_flight` — set by the worker *before* each tick and + cleared after. The router's `/reset` handler waits for this to flip to + `0` before it issues the `DEL` sweep. +* `fs:control:tick_count` / `fs:control:writes_count` — counters the + router reads to populate the UI. +* `fs:control:stop` — graceful-shutdown flag the worker checks each tick. + +This is the same race-free pause-and-wait-idle pattern as every other +client; it's just implemented through Redis primitives because there's no +shared memory between the router and the worker. + +Predis-specific notes: + +* Predis 3 ships typed `hexpire()` and `httl()` methods. The helper uses + them directly. 
`HEXPIRE` returns one status code per field (`1` set, + `2` deleted because TTL was 0/past, `0` conditional flag not met, `-2` + no such field/key). +* Predis 3's `hset()` accepts variadic `field, value, field, value, ...` + pairs but **not** a single field=>value map argument the way Predis 2 + did. The helper flattens the encoded map and spreads it: + `$pipe->hset($key, ...$flat)`. + +## How it works + +There are three paths: a **batch path** that bulk-loads features once per +materialization cycle, a **streaming path** that updates real-time features +as events arrive, and an **inference path** that reads features on the +request side. + +### Batch path (per materialization cycle) + +1. The batch job calls `BuildFeatures::synthesizeUsers($count, $seed)` + (in production, the equivalent computation lives in an offline + pipeline against the warehouse). The result is + `array>` keyed by user ID. +2. `$store->bulkLoad($rows, $ttlSeconds)` queues one + [`HSET`]({{< relref "/commands/hset" >}}) plus one + [`EXPIRE`]({{< relref "/commands/expire" >}}) per user through + `$redis->pipeline(function ($pipe) { ... })`, so the whole batch ships + in a single round trip. + +### Streaming path (per tick) + +The detached `streaming_worker.php` process polls Redis once per tick and +calls `$store->updateStreaming($userId, $fields)` for a handful of random +users. That queues: + +1. An [`HSET`]({{< relref "/commands/hset" >}}) writing the new field + values. Redis is single-threaded per shard, so this is atomic against + any concurrent batch write on the same hash — no version columns, no + locks. +2. An [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) over exactly the + fields that were written, with the streaming TTL. Each streaming field + carries its own per-field expiry independent of the rest of the hash. 
+ Pause the worker (or stop it entirely) and these fields drop out one + by one as their TTLs elapse, while the batch fields remain populated + under the longer key-level TTL. + +### Inference path (per HTTP request) + +1. The model server picks the feature subset it needs (the schema is + owned by the model, not the store). +2. It calls `$store->getFeatures($userId, $names)`, which is one + [`HMGET`]({{< relref "/commands/hmget" >}}). Redis returns the values + in the same order as the requested fields, with `null` for any field + that doesn't exist (or has expired). +3. For batch inference, the model server calls + `$store->batchGetFeatures($userIds, $names)`, which pipelines one + [`HMGET`]({{< relref "/commands/hmget" >}}) per user across all `N` + users in a single network round trip. + +### Project layout + +```text +feature-store/php/ +├── composer.json — predis/predis ^3, PHP >= 8.1 +├── FeatureStore.php — FeatureStore class +├── StreamingWorker.php — worker tick loop (used by the CLI process) +├── BuildFeatures.php — synthesizeUsers + helpers +├── build_features.php — CLI entry point for the materializer +├── streaming_worker.php — CLI entry point for the worker process +├── demo_server.php — php -S router (HTTP routes + worker spawn) +└── demo_template.html — HTML page, loaded by file_get_contents +``` + +Run the demo with `composer install && composer start`, or directly: +`php -S 127.0.0.1:8094 demo_server.php`. + +## The feature-store helper + +The `FeatureStore` class wraps the read/write paths +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/php/FeatureStore.php)): + +```php +bulkLoad([ + 'u0001' => [ + 'country_iso' => 'US', 'risk_segment' => 'low', + 'tx_count_7d' => 14, 'avg_amount_30d' => 92.40, + 'account_age_days' => 612, 'chargeback_count_180d' => 0, + ], +], 24 * 60 * 60); + +// Streaming write: HSET + HEXPIRE on just the fields that changed. 
+$store->updateStreaming('u0001', [ + 'last_login_ts' => (int)(microtime(true) * 1000), + 'last_device_id' => 'ios-9f02', + 'tx_count_5m' => 3, + 'failed_logins_15m' => 0, + 'session_country' => 'US', +], 5 * 60); + +// Inference read: HMGET of whatever the model needs. +$features = $store->getFeatures('u0001', [ + 'risk_segment', 'tx_count_7d', 'avg_amount_30d', + 'tx_count_5m', 'failed_logins_15m', +]); + +// Batch scoring: pipelined HMGET across many users. +$batch = $store->batchGetFeatures( + ['u0001', 'u0002', 'u0003'], + ['risk_segment', 'tx_count_5m', 'failed_logins_15m'], +); +``` + +### Data model + +Each user is one Redis Hash. Every value is stored as a string — Redis +hash fields are bytes on the wire, so the helper encodes booleans as +`'true'` / `'false'` (`FeatureStore::encodeValue()`) and uses +`(string)$value` for everything else. The model server is responsible +for parsing back to the right type, the same way it would when reading +any serialized feature store. + +```text +fs:user:u0001 TTL = 86400 s (key-level) + country_iso=US + risk_segment=low + account_age_days=612 + tx_count_7d=14 + avg_amount_30d=92.40 + chargeback_count_180d=0 + last_login_ts=1716998413541 TTL = 300 s (per field, HEXPIRE) + last_device_id=ios-9f02 TTL = 300 s (per field, HEXPIRE) + tx_count_5m=3 TTL = 300 s (per field, HEXPIRE) + failed_logins_15m=0 TTL = 300 s (per field, HEXPIRE) + session_country=US TTL = 300 s (per field, HEXPIRE) +``` + +### Bulk-loading batch features + +`bulkLoad` queues one `HSET` and one `EXPIRE` per user through +`$redis->pipeline(...)`, so the whole batch ships in a single round trip. + +```php +public function bulkLoad(array $rows, ?int $ttlSeconds = null): int +{ + if (count($rows) === 0) return 0; + $ttl = $ttlSeconds ?? 
$this->batchTtlSeconds; + $this->redis->pipeline(function ($pipe) use ($rows, $ttl) { + foreach ($rows as $entityId => $fields) { + $key = $this->keyFor((string)$entityId); + // Predis 3's `hset` accepts variadic field/value pairs + // (key, f1, v1, f2, v2, ...) but not a single field=>value + // map argument the way Predis 2 did — flatten the encoded + // map into that shape. + $flat = []; + foreach ($fields as $name => $value) { + $flat[] = $name; + $flat[] = self::encodeValue($value); + } + $pipe->hset($key, ...$flat); + $pipe->expire($key, $ttl); + } + }); + ... +} +``` + +`$redis->pipeline(callable)` is a non-transactional batch: commands queue +up and ship in one round trip but they don't run inside a `MULTI/EXEC` +block. That's the right choice here because each user's `HSET` + +`EXPIRE` pair is independent of every other user's. For the rare case +where the pair has to be inseparable, use `$redis->transaction(...)` (or +a Lua script via [`EVAL`]({{< relref "/commands/eval" >}}) / +[Eval scripting]({{< relref "/develop/programmability/eval-intro" >}})). + +In production, the equivalent of this script runs as an offline pipeline +(a Spark or Feast `materialize` job) that reads from the warehouse and +writes into Redis. The +[Feast `RedisOnlineStore`](https://docs.feast.dev/reference/online-stores/redis) +provider does exactly this under the hood; the in-house +[Redis Feature Form]({{< relref "/develop/ai/featureform" >}}) integration +covers the materialize + serve path end-to-end. + +### Streaming writes with per-field TTL + +`updateStreaming` is the linchpin of the mixed-staleness story: + +```php +public function updateStreaming(string $entityId, array $fields, ?int $ttlSeconds = null): void +{ + if (count($fields) === 0) return; + $ttl = $ttlSeconds ?? 
$this->streamingTtlSeconds; + $key = $this->keyFor($entityId); + $flat = []; + $names = []; + foreach ($fields as $name => $value) { + $names[] = $name; + $flat[] = $name; + $flat[] = self::encodeValue($value); + } + + $results = $this->redis->pipeline(function ($pipe) use ($key, $flat, $names, $ttl) { + $pipe->hset($key, ...$flat); + $pipe->hexpire($key, $ttl, $names); + }); + $codes = $results[1] ?? []; + foreach ($codes as $code) { + if ((int)$code !== 1) { + throw new RuntimeException( + "HEXPIRE did not set every field TTL for {$key}: " . json_encode($codes) + ); + } + } + ... +} +``` + +[`HEXPIRE`]({{< relref "/commands/hexpire" >}}) sets the TTL on +*individual* hash fields, not on the whole key. The two commands are +queued in the same `pipeline` callback so Redis runs them in order: the +`HSET` first creates or overwrites the fields, then `HEXPIRE` attaches a +TTL to each of those same fields. `hexpire()` returns one status code +per field: + +* `1` — TTL set / updated. +* `2` — the expiry was 0 or in the past, so Redis deleted the field + instead of applying a TTL. +* `0` — an `NX | XX | GT | LT` conditional flag was specified and not + met (we never use one here). +* `-2` — no such field, or no such key. + +The helper throws if any code is anything other than `1`, so the "every +streaming write renews its TTL" invariant fails loudly rather than +silently leaving a streaming field with no expiry attached. + +If a streaming pipeline stops, the streaming fields drop out one by one +as their per-field TTLs elapse. `fieldTtlsSeconds` (which wraps `httl()`) +lets the model side inspect the remaining TTL on any field — useful +both for debugging and as a freshness signal in the model itself. + +> **HEXPIRE requires Redis 7.4 or later.** `HEXPIRE` and the field-level +> TTL commands were added in Redis 7.4. Predis 3.0 was the first major +> release with the typed bindings; the demo's `composer.json` pins +> `^3.0`. 
+ +### Inference reads with HMGET + +`getFeatures` is one `HMGET`: + +```php +public function getFeatures(string $entityId, ?array $fieldNames): array +{ + $key = $this->keyFor($entityId); + if ($fieldNames === null) { + return $this->redis->hgetall($key); + } + if (count($fieldNames) === 0) return []; + $values = $this->redis->hmget($key, $fieldNames); + $out = []; + foreach ($fieldNames as $i => $n) { + if ($values[$i] !== null) $out[$n] = (string)$values[$i]; + } + return $out; +} +``` + +The model knows exactly which features it consumes, so the request path +always takes the `hmget` branch with an explicit field list — that's +the sub-millisecond path. `hgetall` is the right call for debugging +(which is what the demo's "Inspect" panel does) but not for serving: +it forces Redis to serialize every field, including ones the model +doesn't need. + +Fields that don't exist (because they were never written, or because +they expired) come back as `null`. The helper drops them from the +result array so the caller sees only the features that are actually +available. + +### Batch scoring with pipelined HMGET + +For batch inference, the same `HMGET` shape pipelines across users: + +```php +public function batchGetFeatures(array $entityIds, array $fieldNames): array +{ + if (count($entityIds) === 0 || count($fieldNames) === 0) return []; + $rows = $this->redis->pipeline(function ($pipe) use ($entityIds, $fieldNames) { + foreach ($entityIds as $id) { + $pipe->hmget($this->keyFor($id), $fieldNames); + } + }); + ... +} +``` + +One round trip for the whole batch. The demo regularly returns a +30-user batch in ~1 ms against a local Redis. + +A Redis Cluster is different: a single `pipeline()` block ships through +one connection to one node. 
For batch reads on a cluster, configure +Predis with a cluster connection profile and fan out parallel +non-pipelined `hmget` calls (the cluster client routes each one to the +right shard), or group entity IDs by hash slot and run one pipeline +against each shard's node-connection in parallel. A hash tag like +`fs:user:{vip}:u0001` forces a known set of keys onto the same shard +so one pipeline can cover them all in a single round trip. + +## The streaming worker + +`streaming_worker.php` is a small CLI shim that loads `StreamingWorker` +and runs its tick loop until the demo server flips +`fs:control:stop` to `1` (or SIGTERM lands). The class itself lives in +`StreamingWorker.php` +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/php/StreamingWorker.php)): + +```php +public function run(): void +{ + $this->redis->set('fs:control:worker_pid', (string)getmypid()); + $this->redis->set('fs:control:running', '1'); + // SIGTERM / SIGINT trap so the demo server's shutdown path + // can cleanly kill us via `posix_kill($pid, SIGTERM)`. + ... + try { + while (true) { + if ($this->redis->get('fs:control:stop') === '1') break; + $this->microsleep($this->tickSeconds); + if ($this->redis->get('fs:control:stop') === '1') break; + // Set tick_in_flight *before* the pause check so a + // concurrent pause + wait_for_idle (reset path) can + // never observe tick_in_flight=0 in the window between + // the pause check and the actual tick call. + $this->redis->set('fs:control:tick_in_flight', '1'); + try { + if ($this->redis->get('fs:control:paused') !== '1') { + $this->doTick(); + } + } catch (\Throwable $e) { + fwrite(STDERR, "[streaming-worker] tick failed: " . $e->getMessage() . "\n"); + } finally { + $this->redis->set('fs:control:tick_in_flight', '0'); + } + } + } finally { + // Clear running, tick_in_flight, stop no matter how the loop + // exits so a later restart can spin a fresh worker with a + // clean slate. 
+ $this->redis->del(...[ + 'fs:control:running', 'fs:control:tick_in_flight', + 'fs:control:worker_pid', 'fs:control:stop', + ]); + } +} +``` + +The pre-flight `tick_in_flight = 1` before the pause check, and the +outer `finally` block that clears the worker's lifecycle keys (`running`, `tick_in_flight`, `worker_pid`, `stop`) on every exit +path, are the same correctness levers as every other client in this +use case. The only difference is that the flags live in Redis rather +than in process memory. + +The demo server's `/reset` handler reads the same Redis keys: it sets +`fs:control:paused = 1`, polls `fs:control:tick_in_flight` until it +sees `0`, then issues the `DEL` sweep. That's the cross-process +equivalent of `worker.pause() + worker.wait_for_idle()` in the +single-process clients. + +`demo_server.php` spawns the worker on the first request with +`nohup ... &` (detached so it survives the per-request `php -S` +process) and checks `pid_alive($pid)` on every subsequent request. +If the worker has died, it's respawned on the next request. + +To shut the worker down cleanly from outside the demo (the detached +process isn't tied to the foreground `php -S`), flip the stop flag +with `redis-cli`: + +```bash +redis-cli SET fs:control:stop 1 +``` + +The worker's tick loop checks `fs:control:stop` at the top of every +iteration and exits, clearing its lifecycle keys (`running`, `tick_in_flight`, `worker_pid`, `stop`) on the way +out so the next demo run starts with a fresh worker; the pause flag and the tick / write counters are left in place. + +## The batch builder + +`build_features.php` is the demo's nightly materializer +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/php/build_features.php)). +It generates synthetic feature rows and calls `$store->bulkLoad()` +once. The synthesis itself is not the point — in a real deployment the +equivalent code reads from the offline store (Snowflake, BigQuery, +Iceberg) and writes the resulting hashes into Redis. 
+ +Run the builder on its own (independently of the demo server) to +populate Redis from the command line: + +```bash +php build_features.php --count 500 --ttl-seconds 3600 +``` + +That writes 500 users at `fs:user:*` with a one-hour key-level TTL, +which is how a typical operator would pre-seed a feature store from +the command line when debugging. + +## The interactive demo + +`demo_server.php` runs as a router script under `php -S` on port 8094. +The HTML page (loaded via `file_get_contents` from +`demo_template.html`) lets you: + +* **Bulk-load** any number of users (default 200) with a configurable + key-level TTL. +* See the **store state**: user count, batch / streaming TTLs, + cumulative read/write counters. +* See the **streaming worker** status and **pause or resume** it. The + pause flag goes into Redis at `fs:control:paused`; the detached + worker process reads it between ticks. +* Run an **inference read** for any user with a chosen feature subset, + and see the value, the per-field TTL, and the read latency. +* Run **batch scoring** with a pipelined `HMGET` across `N` users. +* **Inspect** any user's full hash with field-level TTLs and the + key-level TTL. + +Endpoints: + +| Endpoint | What it does | +|---------------------------|-------------------------------------------------------------------------------------| +| `GET /state` | User count, TTL config, stats counters, worker status. | +| `POST /bulk-load` | Pipelined `HSET` + `EXPIRE` over N synthetic users with a chosen TTL. | +| `POST /worker/toggle` | Pause / resume the streaming worker. | +| `POST /read` | `HMGET` a chosen feature subset for one user; report latency and per-field TTLs. | +| `POST /batch-read` | Pipeline `HMGET` across N users; report total latency and per-entity field counts. | +| `GET /inspect` | `HGETALL` + `HTTL` for one user; full hash view with per-field TTLs. | +| `POST /reset` | Drop every user under the key prefix (used by the demo's reset button). 
| + +> **PHP's `$_POST` doesn't preserve repeated keys.** The demo's `/read` +> and `/batch-read` handlers parse the raw `php://input` body +> manually via `parse_multi_form()` so the model can request several +> features in one call (`field=a&field=b&field=c`). PHP's built-in +> form-parsing would keep only the last `field=` value. + +## Prerequisites + +* **Redis 7.4 or later.** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) + and [`HTTL`]({{< relref "/commands/httl" >}}) were added in Redis 7.4; + the demo relies on per-field TTL for the mixed-staleness story. +* **PHP 8.1 or later.** The demo uses readonly properties, named + arguments, and first-class callable syntax. +* **Predis 3.0 or later.** The demo's `composer.json` pins `^3.0`. + Typed bindings for the field-TTL commands ship from 3.0. +* **A POSIX shell environment** for the worker spawn (`nohup`, + `posix_kill`). The demo has been tested on macOS and Linux; Windows + would need a different process-detach approach. + +If your Redis server is running elsewhere, start the demo with +`REDIS_URI=tcp://host:port php -S 127.0.0.1:8094 demo_server.php`. + +## Running the demo + +### Get the source files + +The demo lives in a small Composer project under +[`feature-store/php`](https://github.com/redis/docs/tree/main/content/develop/use-cases/feature-store/php). +Clone the repo or copy the directory: + +```bash +git clone https://github.com/redis/docs.git +cd docs/content/develop/use-cases/feature-store/php +composer install +``` + +### Start the demo server + +From the project directory: + +```bash +composer start +# or, equivalently: +php -S 127.0.0.1:8094 demo_server.php +``` + +The first request to the server triggers the one-time bootstrap +(reset + seed the store, spawn the streaming worker). You should see: + +```text +[Mon Jun 1 ...] PHP 8.4 Development Server (http://127.0.0.1:8094) started +[Mon Jun 1 ...] 127.0.0.1:... Accepted +``` + +Open [http://127.0.0.1:8094](http://127.0.0.1:8094). 
Useful things to +try: + +* Pick a user and click **Read features** with a mixed batch/streaming + subset — you'll see batch fields with no per-field TTL (covered by + the key-level TTL) and streaming fields with a positive per-field + TTL. +* Click **Pipeline HMGET** with `count=100` to see the latency of a + 100-user batch read. +* Click **Pause / resume** on the streaming worker and leave it + paused for ~5 minutes (or restart the server with + `STREAMING_TTL_SECONDS=30` to make it visible in seconds). Re-run + **Read features** on any user and watch the streaming fields + disappear while the batch fields stay. +* Click **Inspect** on a user to see the full hash with field-level + TTLs. +* Click **Reset** to drop every user and start over. + +## Production usage + +The guidance below focuses on the production concerns specific to +running a feature store on Redis. For the generic Predis production +checklist — connection options, +[transactions and pipelining]({{< relref "/develop/clients/php/transpipe" >}}), +TLS, AUTH, error handling — see the +[Predis client guide]({{< relref "/develop/clients/php" >}}) and the +[connect recipe]({{< relref "/develop/clients/php/connect" >}}). + +### Don't run `php -S` in production + +The built-in PHP development server is single-threaded and not +production-grade. A real deployment runs PHP-FPM behind nginx or +Apache, with the streaming worker as a separate systemd / supervisord / +Kubernetes-cron-job process. The router script in `demo_server.php` is +shaped for the demo; for production, extract the route handlers into a +proper PHP framework (Symfony, Laravel, Slim) that pools `Predis\Client` +connections per-worker. + +### Pick the batch TTL to outlast a failed refresher + +The whole-entity `EXPIRE` is your safety net against silent staleness +from a broken batch pipeline. 
Set it longer than your worst-case batch +outage so a single missed run doesn't take the feature store offline, +but short enough that a sustained outage causes loud failures (missing +entities) rather than quiet ones (yesterday's features being scored as +today's). A common choice is twice the expected refresh +interval — for a daily batch, 48 hours; for a 6-hour batch, 12 +hours. + +The same logic applies to the per-field streaming TTL: a few times the +expected update interval so a slow-but-alive streaming worker doesn't +churn features needlessly, but short enough that a stalled worker +causes visible freshness failures. + +### Co-locate the online store with serving, not with training + +The online store's hash representation does *not* have to match the +schema in your offline store. The batch materialization step is your +chance to flatten joins, encode categoricals, and project to whatever +shape the model server wants — so the request path is exactly one +`HMGET` and zero transforms. + +The training pipeline reads from the offline store with its own +schema; the serving pipeline reads from Redis with the flattened +serving schema. Driving both pipelines from the same transform code is +what prevents training-serving skew. + +### Run the streaming worker under a real process supervisor + +The demo spawns the worker with `nohup ... &` because it's the +simplest portable thing that works under `php -S`. In production, +manage the worker process with systemd / supervisord / Kubernetes — +something that restarts it on crash, captures its logs properly, and +gives you a clean shutdown path. The Redis-backed `fs:control:*` +state (pause flag, in-flight flag, counters) keeps working +unchanged — that's the point of putting it in Redis. + +### Make HEXPIRE part of every streaming write + +The single biggest correctness lever in this design is that the +streaming write applies `HEXPIRE` *every time*. 
If a streaming worker +writes a field without renewing its TTL, the `HSET` clears whatever +per-field expiry was there before, so the field lives on until the key-level TTL — and the +mixed-staleness invariant breaks. Keep the `HSET` and `HEXPIRE` in +the same pipeline (or, even safer, in the same +[Lua script]({{< relref "/develop/programmability/eval-intro" >}}) if +you don't trust the call site). + +### Avoid HGETALL on the request path + +`HGETALL` reads every field on the hash, including ones the model +doesn't need. With dozens of features per entity, that is wasted +serialization work on the server and wasted bandwidth on the wire. +Always specify the field list explicitly with `hmget` in the model +server. + +The exception is debugging and feature-set discovery, where you +genuinely want the full hash. The demo's "Inspect" button uses +`hgetall` for exactly this reason. + +### Inspect the store directly with redis-cli + +When testing or troubleshooting, `redis-cli` tells you everything: + +```bash +# How many users currently in the store +redis-cli --scan --pattern 'fs:user:*' | wc -l + +# One user's full hash and key-level TTL +redis-cli HGETALL fs:user:u0001 +redis-cli TTL fs:user:u0001 + +# Per-field TTL on the streaming fields +redis-cli HTTL fs:user:u0001 FIELDS 5 \ + last_login_ts last_device_id tx_count_5m failed_logins_15m session_country + +# Sample HMGET as the model would issue it +redis-cli HMGET fs:user:u0001 risk_segment tx_count_7d avg_amount_30d tx_count_5m + +# Inspect the worker's control state +redis-cli MGET fs:control:worker_pid fs:control:paused \ + fs:control:tick_in_flight fs:control:tick_count +``` + +A streaming field that returns `-2` from `HTTL` doesn't exist on the +hash (either it was never written, or it expired); `-1` means the +field has no TTL set (and is therefore covered only by the key-level +`EXPIRE`); any positive value is the remaining TTL in seconds. 
+ +## Learn more + +This example uses the following Redis commands: + +* [`HSET`]({{< relref "/commands/hset" >}}) to write a feature or a + whole feature row in one call. +* [`HMGET`]({{< relref "/commands/hmget" >}}) to retrieve any subset + of features for one entity in one round trip. +* [`HGETALL`]({{< relref "/commands/hgetall" >}}) for debugging and + feature-set discovery. +* [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) for per-field TTL on + streaming features (Redis 7.4+). +* [`EXPIRE`]({{< relref "/commands/expire" >}}) and + [`TTL`]({{< relref "/commands/ttl" >}}) for the whole-entity TTL + aligned with the batch materialization cycle. +* Pipelined `HMGET` across many entities for batch scoring with one + network round trip — see + [transactions and pipelining]({{< relref "/develop/clients/php/transpipe" >}}). + +See the [Predis documentation]({{< relref "/develop/clients/php" >}}) +for the full client reference, and the +[Hashes overview]({{< relref "/develop/data-types/hashes" >}}) for the +deeper conceptual model. 
diff --git a/content/develop/use-cases/feature-store/php/build_features.php b/content/develop/use-cases/feature-store/php/build_features.php new file mode 100644 index 0000000000..704b675460 --- /dev/null +++ b/content/develop/use-cases/feature-store/php/build_features.php @@ -0,0 +1,48 @@ +bulkLoad($rows, $ttlSeconds); +echo "Materialized {$loaded} users at {$keyPrefix}* with a {$ttlSeconds}s key-level TTL.\n"; diff --git a/content/develop/use-cases/feature-store/php/composer.json b/content/develop/use-cases/feature-store/php/composer.json new file mode 100644 index 0000000000..2fc1f41ff8 --- /dev/null +++ b/content/develop/use-cases/feature-store/php/composer.json @@ -0,0 +1,18 @@ +{ + "name": "redis/feature-store-php-demo", + "description": "Redis online feature store demo using Predis (PHP).", + "require": { + "php": ">=8.1", + "predis/predis": "^3.0" + }, + "autoload": { + "psr-4": { + "RedisFeatureStoreDemo\\": "src/" + } + }, + "scripts": { + "start": "php -S 127.0.0.1:8094 demo_server.php", + "build-features": "php build_features.php", + "worker": "php streaming_worker.php" + } +} diff --git a/content/develop/use-cases/feature-store/php/demo_server.php b/content/develop/use-cases/feature-store/php/demo_server.php new file mode 100644 index 0000000000..632964675a --- /dev/null +++ b/content/develop/use-cases/feature-store/php/demo_server.php @@ -0,0 +1,451 @@ +get('fs:control:bootstrapped') === '1') return; + + // Use a short-lived lock so concurrent first requests don't both + // run the seed path. + $lockAcquired = $redis->set('fs:control:bootstrap_lock', '1', 'EX', 30, 'NX'); + if (!$lockAcquired) { + // Wait briefly for the other request to finish. 
+ for ($i = 0; $i < 30; $i++) { + usleep(100_000); + if ($redis->get('fs:control:bootstrapped') === '1') return; + } + return; // give up; we'll try again next request + } + try { + if ($resetOnStart) { + $store->reset(); + $store->resetStats(); + $redis->del(...['fs:control:tick_count', 'fs:control:writes_count']); + } + $rows = BuildFeatures::synthesizeUsers($seedUsers, 42); + $store->bulkLoad($rows, $batchTtlSeconds); + $redis->set('fs:control:bootstrapped', '1'); + } finally { + $redis->del(...['fs:control:bootstrap_lock']); + } +} + +function spawn_worker_if_needed( + Client $redis, string $redisUri, string $keyPrefix, + int $batchTtl, int $streamingTtl, int $usersPerTick +): void { + $pid = (int)$redis->get('fs:control:worker_pid'); + if ($pid > 0 && pid_alive($pid)) return; + + // SET NX guards against two concurrent requests both observing a + // dead PID and both spawning a worker. The 30 s expiry releases + // the lock if this request dies before clearing it. + $lock = $redis->set('fs:control:spawn_lock', '1', 'EX', 30, 'NX'); + if ($lock !== true) return; + try { + // Re-check inside the lock: another request may have spawned + // a worker between the first check and the lock acquisition. + $pid = (int)$redis->get('fs:control:worker_pid'); + if ($pid > 0 && pid_alive($pid)) return; + + // Stale state cleanup, then spawn. + $redis->del(...[ + 'fs:control:worker_pid', 'fs:control:running', + 'fs:control:tick_in_flight', 'fs:control:stop', + ]); + + $script = escapeshellarg(__DIR__ . '/streaming_worker.php'); + $logFile = escapeshellarg(sys_get_temp_dir() . '/fs-streaming-worker.log'); + $env = [ + 'REDIS_URI=' . escapeshellarg($redisUri), + 'KEY_PREFIX=' . escapeshellarg($keyPrefix), + 'BATCH_TTL_SECONDS=' . (int)$batchTtl, + 'STREAMING_TTL_SECONDS=' . (int)$streamingTtl, + 'USERS_PER_TICK=' . (int)$usersPerTick, + ]; + $envStr = implode(' ', $env); + + // `nohup ... 
&` is the portable way to spawn a detached child + // that survives the dying `php -S` request handler. `` truncates on each respawn so a crash + // loop can't fill the disk). macOS doesn't have GNU setsid, + // so we intentionally avoid it. + $cmd = "/usr/bin/env {$envStr} nohup /usr/bin/env php {$script} " + . ">{$logFile} 2>&1 get('fs:control:worker_pid'); + if ($pid > 0 && pid_alive($pid)) return; + } + } finally { + $redis->del(...['fs:control:spawn_lock']); + } +} + +function pid_alive(int $pid): bool +{ + if ($pid <= 0) return false; + if (function_exists('posix_kill')) { + return @posix_kill($pid, 0); + } + // Fallback: ps lookup. + return (int)trim(@shell_exec("ps -p {$pid} -o pid= 2>/dev/null") ?? '') === $pid; +} + +// ---------------------------------------------------------------------- +// Handlers +// ---------------------------------------------------------------------- + +function build_state(Client $redis, FeatureStore $store): array +{ + $ids = $store->listEntityIds(500); + $count = $store->countEntities(); + return [ + 'key_prefix' => $store->keyPrefix, + 'batch_ttl_seconds' => $store->batchTtlSeconds, + 'streaming_ttl_seconds' => $store->streamingTtlSeconds, + 'entity_count' => $count, + 'entity_ids' => $ids, + 'stats' => $store->statsSnapshot(), + 'worker' => worker_stats($redis), + ]; +} + +function worker_stats(Client $redis): array +{ + $pid = (int)$redis->get('fs:control:worker_pid'); + $running = $pid > 0 && pid_alive($pid); + return [ + 'running' => $running, + 'paused' => $redis->get('fs:control:paused') === '1', + 'tick_count' => (int)$redis->get('fs:control:tick_count'), + 'writes_count' => (int)$redis->get('fs:control:writes_count'), + ]; +} + +function handle_inspect(Client $redis, FeatureStore $store): void +{ + $user = trim((string)($_GET['user'] ?? 
'')); + if ($user === '') { + send_json(400, ['error' => 'user is required']); + return; + } + $full = $store->getFeatures($user, null); + $keyTtl = $store->keyTtlSeconds($user); + if (count($full) === 0) { + send_json(200, ['exists' => false, 'key_ttl_seconds' => $keyTtl]); + return; + } + // Iterate the known schema (batch + streaming) plus any extras + // the hash carries so expired streaming fields surface as + // ttl_seconds=-2 in the Inspect view rather than silently + // disappearing. + $names = array_merge( + FeatureStore::DEFAULT_BATCH_FIELDS, + FeatureStore::DEFAULT_STREAMING_FIELDS, + ); + foreach ($full as $k => $_) { + if (!in_array($k, $names, true)) $names[] = $k; + } + $ttls = $store->fieldTtlsSeconds($user, $names); + sort($names); + $fields = []; + foreach ($names as $n) { + $fields[] = [ + 'name' => $n, + 'value' => $full[$n] ?? '', + 'ttl_seconds' => $ttls[$n] ?? -2, + ]; + } + send_json(200, [ + 'exists' => true, + 'key_ttl_seconds' => $keyTtl, + 'fields' => $fields, + ]); +} + +function handle_bulk_load(Client $redis, FeatureStore $store): void +{ + $count = clamp_int($_POST['count'] ?? 200, 1, 2000); + $ttl = clamp_int($_POST['ttl'] ?? 86400, 5, 172_800); + $t0 = microtime(true); + $loaded = $store->bulkLoad( + BuildFeatures::synthesizeUsers($count, 42), + $ttl, + ); + $elapsedMs = (microtime(true) - $t0) * 1000.0; + send_json(200, [ + 'loaded' => $loaded, + 'ttl_seconds' => $ttl, + 'elapsed_ms' => $elapsedMs, + ]); +} + +function handle_reset( + Client $redis, FeatureStore $store, + int $batchTtl, int $streamTtl, int $usersPerTick +): void { + // Pause the streaming worker (paused=1) and wait for its + // current tick to drain (tick_in_flight=0). Same race-protection + // pattern as every other client in this use case. + $wasPaused = $redis->get('fs:control:paused') === '1'; + $redis->set('fs:control:paused', '1'); + try { + // Drain any in-flight tick before the DEL sweep. 
Five seconds + // is well above the one-second tick interval (so a slow tick + // can finish) but short enough that a hung worker doesn't + // wedge the demo indefinitely. + $deadline = microtime(true) + 5.0; + while ($redis->get('fs:control:tick_in_flight') === '1') { + if (microtime(true) >= $deadline) { + send_json(503, ['error' => 'streaming worker did not become idle']); + return; + } + usleep(20_000); + } + $deleted = $store->reset(); + $store->resetStats(); + $redis->del(...['fs:control:tick_count', 'fs:control:writes_count']); + send_json(200, ['deleted' => $deleted]); + } finally { + if (!$wasPaused) $redis->set('fs:control:paused', '0'); + } +} + +function handle_worker_toggle( + Client $redis, string $redisUri, string $keyPrefix, + int $batchTtl, int $streamTtl, int $usersPerTick +): void { + // If the worker process died, respawn it. + spawn_worker_if_needed($redis, $redisUri, $keyPrefix, $batchTtl, $streamTtl, $usersPerTick); + $paused = $redis->get('fs:control:paused') === '1'; + $redis->set('fs:control:paused', $paused ? '0' : '1'); + send_json(200, [ + 'paused' => !$paused, + 'running' => true, + ]); +} + +function handle_read(Client $redis, FeatureStore $store): void +{ + // PHP's $_POST collapses repeated keys (the last `field=` wins), + // but the demo sends `field=a&field=b&field=c` so the model can + // request several features in one call. Parse the raw body + // ourselves to keep every value. + $form = parse_multi_form(); + $user = trim($form['user'][0] ?? ''); + if ($user === '') { + send_json(400, ['error' => 'user is required']); + return; + } + $fields = array_values(array_filter($form['field'] ?? [], fn($f) => $f !== '')); + $t0 = microtime(true); + $values = count($fields) > 0 ? $store->getFeatures($user, $fields) : []; + $elapsedMs = (microtime(true) - $t0) * 1000.0; + $ttls = count($fields) > 0 ? 
$store->fieldTtlsSeconds($user, $fields) : []; + $keyTtl = $store->keyTtlSeconds($user); + send_json(200, [ + 'requested' => $fields, + 'values' => (object)$values, + 'ttls' => (object)$ttls, + 'key_ttl_seconds' => $keyTtl, + 'returned_count' => count($values), + 'elapsed_ms' => $elapsedMs, + ]); +} + +function handle_batch_read(Client $redis, FeatureStore $store): void +{ + $form = parse_multi_form(); + $count = clamp_int($form['count'][0] ?? 100, 1, 500); + $fields = array_values(array_filter($form['field'] ?? [], fn($f) => $f !== '')); + if (count($fields) === 0) { + $fields = array_merge(FeatureStore::DEFAULT_STREAMING_FIELDS, ['risk_segment']); + } + $ids = $store->listEntityIds(max($count * 2, 2000)); + if (count($ids) > $count) $ids = array_slice($ids, 0, $count); + $t0 = microtime(true); + $rows = $store->batchGetFeatures($ids, $fields); + $elapsedMs = (microtime(true) - $t0) * 1000.0; + $sample = []; + foreach (array_slice($ids, 0, 10) as $id) { + $sample[] = ['id' => $id, 'field_count' => count($rows[$id] ?? [])]; + } + send_json(200, [ + 'entity_count' => count($ids), + 'field_count' => count($fields), + 'elapsed_ms' => $elapsedMs, + 'sample' => $sample, + ]); +} + +// ---------------------------------------------------------------------- +// Helpers +// ---------------------------------------------------------------------- + +function send_json(int $status, mixed $payload): void +{ + http_response_code($status); + header('Content-Type: application/json'); + echo json_encode($payload, JSON_UNESCAPED_SLASHES); +} + +function clamp_int(mixed $value, int $lo, int $hi): int +{ + $n = is_numeric($value) ? (int)$value : $lo; + return max($lo, min($hi, $n)); +} + +/** + * Parse the raw application/x-www-form-urlencoded request body into a + * multi-value map (`['field' => ['a', 'b', 'c'], ...]`). 
PHP's + * built-in `$_POST` collapses repeated keys (`field=a&field=b` keeps + * only `b`); the demo sends repeated `field=` for the inference and + * batch-read forms, so we parse the body ourselves to preserve every + * value. + * + * @return array> + */ +function parse_multi_form(): array +{ + $body = file_get_contents('php://input'); + $out = []; + if (!is_string($body) || $body === '') return $out; + foreach (explode('&', $body) as $pair) { + if ($pair === '') continue; + $eq = strpos($pair, '='); + if ($eq === false) { + $k = urldecode($pair); $v = ''; + } else { + $k = urldecode(substr($pair, 0, $eq)); + $v = urldecode(substr($pair, $eq + 1)); + } + $out[$k] ??= []; + $out[$k][] = $v; + } + return $out; +} + +function render_html(string $keyPrefix, int $streamingTtl, int $usersPerTick): string +{ + $tpl = file_get_contents(__DIR__ . '/demo_template.html'); + return strtr($tpl, [ + '__KEY_PREFIX__' => $keyPrefix, + '__STREAM_TTL__' => (string)$streamingTtl, + '__USERS_PER_TICK__' => (string)$usersPerTick, + '__BATCH_FIELDS_JSON__' => json_encode(FeatureStore::DEFAULT_BATCH_FIELDS), + '__STREAM_FIELDS_JSON__' => json_encode(FeatureStore::DEFAULT_STREAMING_FIELDS), + ]); +} diff --git a/content/develop/use-cases/feature-store/php/demo_template.html b/content/develop/use-cases/feature-store/php/demo_template.html new file mode 100644 index 0000000000..25770d5bcd --- /dev/null +++ b/content/develop/use-cases/feature-store/php/demo_template.html @@ -0,0 +1,342 @@ + + + + + + Redis Feature Store Demo (PHP) + + + +
+
Predis + PHP built-in server
+

Redis Feature Store Demo

+

+ A small fraud-scoring feature store. Each user is one Redis hash + at __KEY_PREFIX__{id} with a batch-materialized + batch half (daily aggregates, + 24-hour key-level EXPIRE) and a streaming + streaming half (real-time + signals, __STREAM_TTL__s per-field HEXPIRE). + Inference reads any subset with one HMGET; batch + scoring pipelines HMGET across N users. +

+ +
+
+

Store state

+
Loading...
+
+ +
+

Materialize batch features

+

Calls HSET + EXPIRE for each user + through one Predis pipeline — the whole batch + ships in one round trip.

+ + + + +

+ Drop the TTL to e.g. 30 s and watch entities disappear on + schedule — the same thing that happens if a daily refresher + fails. +

+ + +
+ +
+

Streaming worker

+

Picks __USERS_PER_TICK__ users per tick, writes the + streaming features, applies HEXPIRE + __STREAM_TTL__s per field. Pause it and the + streaming fields drop out via per-field TTL while the batch + fields stay populated.

+
+ +
+ +
+

Inference read (HMGET)

+

Pick a user and a feature subset. One HMGET + round trip returns whatever the model needs.

+
+
+ + +
+
+ + +
+
+

Feature subset

+

+ Tick to include in the HMGET. Per-field TTL is + shown next to each field in the result table. +

+
+
+

Pick a user and click Read features.

+
+
+ +
+

Batch scoring

+

Pipelined HMGET across N random users via + Predis pipelines. One network round trip for the + whole batch.

+ + + +
+

(no batch read yet)

+
+
+ +
+

Inspect one user

+

HGETALL plus per-field HTTL and + key-level TTL. Useful for spotting which + streaming fields have already expired.

+ + + +
+

(pick a user and click Inspect)

+
+
+
+ +
+
+ + + + diff --git a/content/develop/use-cases/feature-store/php/streaming_worker.php b/content/develop/use-cases/feature-store/php/streaming_worker.php new file mode 100644 index 0000000000..69e2c6f168 --- /dev/null +++ b/content/develop/use-cases/feature-store/php/streaming_worker.php @@ -0,0 +1,27 @@ +run(); diff --git a/content/develop/use-cases/feature-store/ruby/Gemfile b/content/develop/use-cases/feature-store/ruby/Gemfile new file mode 100644 index 0000000000..89fe44407a --- /dev/null +++ b/content/develop/use-cases/feature-store/ruby/Gemfile @@ -0,0 +1,11 @@ +source 'https://rubygems.org' + +# Redis 5.4+ ships HEXPIRE/HTTL semantics, but the typed `hexpire` +# binding mis-orders the FIELDS clause. The helper issues the +# field-TTL commands with `Redis#call` directly, which is also why +# we don't pin the typed-hexpire patch release. +gem 'redis', '~> 5.4' + +# WEBrick was removed from Ruby's default-gem set in 3.0; install it +# explicitly so the demo runs cleanly on modern Rubies. +gem 'webrick', '~> 1.9' diff --git a/content/develop/use-cases/feature-store/ruby/_index.md b/content/develop/use-cases/feature-store/ruby/_index.md new file mode 100644 index 0000000000..41b8d1a6f7 --- /dev/null +++ b/content/develop/use-cases/feature-store/ruby/_index.md @@ -0,0 +1,663 @@ +--- +categories: +- docs +- develop +- stack +- oss +- rs +- rc +description: Build a Redis-backed online feature store in Ruby with redis-rb +linkTitle: redis-rb example (Ruby) +title: Redis feature store with redis-rb +weight: 8 +--- + +This guide shows you how to build a small Redis-backed online feature store +in Ruby with the [`redis`]({{< relref "/develop/clients/ruby" >}}) gem. 
The +demo runs on top of WEBrick (a small pure-Ruby HTTP server, installed as a gem) so you can bulk-load a +batch of users with a key-level TTL, run a streaming worker that overwrites +real-time features with per-field TTL, retrieve any subset of features for +one user under 2 ms, and pipeline `HMGET` across a hundred users for batch +scoring. + +## Overview + +Each entity (here, a user) is one Redis +[Hash]({{< relref "/develop/data-types/hashes" >}}) at a deterministic key — +`fs:user:{id}`. The hash holds every feature for that entity as one field per +feature: batch-materialized aggregates (refreshed once a day) alongside +streaming-updated signals (refreshed every few seconds). One +[`HMGET`]({{< relref "/commands/hmget" >}}) returns whichever subset the +model needs in one network round trip. + +Two TTL layers solve the *mixed staleness* problem without an +application-side cleaner: + +* A **key-level** [`EXPIRE`]({{< relref "/commands/expire" >}}) aligned with + the batch materialization cycle (24 hours in the demo). If the batch + refresher fails, the whole entity disappears at the next cycle and + inference sees a missing entity — which the model handler can detect and + fall back on — rather than silently outdated values. +* A **per-field** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) (Redis 7.4+) + on each streaming feature gives that field its own shorter expiry, + independent of the rest of the hash. If the streaming pipeline stops + updating a feature, the field self-cleans while the batch fields stay + populated. + +That gives you: + +* A single round trip for retrieval — any subset of features for one entity + in one [`HMGET`]({{< relref "/commands/hmget" >}}). +* Sub-millisecond hot path. The Redis-side work is microseconds; in practice + the bottleneck is the network round trip plus the model's own + feature-prep. +* Pipelined batch scoring — one round trip for `N` users at once. 
+* Independent freshness per feature, expressed as a server-side TTL rather + than as application logic. +* Self-cleanup on pipeline failure: a stalled batch refresher lets entities + expire on schedule, and a stalled streaming worker lets each affected + field expire on its own timer. + +## How redis-rb fits the demo + +Two gem facts shape the helper: + +* **One shared `Redis` client serves the whole process.** The `redis` gem + uses a single TCP connection per `Redis` instance — and the instance is + thread-safe (synchronized with a mutex). Handing the same `Redis` to + every WEBrick worker thread and the streaming worker is fine and is the + canonical way to run this kind of demo. +* **`Redis#call` is the escape hatch for commands not yet typed on the + gem.** redis-rb 5.4 ships no stable typed helpers for the per-field + TTL commands. The helper sends `HEXPIRE` and `HTTL` with + `r.call('HEXPIRE', key, ttl, 'FIELDS', count, *fields)` so the wire + bytes match the protocol exactly regardless of which patch release + is installed. + +In this example, the batch features describe a user's longer-term shape +(`country_iso`, `risk_segment`, `account_age_days`, `tx_count_7d`, +`avg_amount_30d`, `chargeback_count_180d`) and are bulk-loaded by +`build_features.rb` — the demo's stand-in for a nightly Spark / Feast +materialization job. The streaming features describe what the user is doing +right now (`last_login_ts`, `last_device_id`, `tx_count_5m`, +`failed_logins_15m`, `session_country`) and are written by +`streaming_worker.rb` — a daemon Ruby thread that stands in for a +Flink / Kafka Streams job. The WEBrick servlet in `demo_server.rb` reads +any subset of those features through `feature_store.rb`'s helper class. 
+ +## How it works + +There are three paths: a **batch path** that bulk-loads features once per +materialization cycle, a **streaming path** that updates real-time features +as events arrive, and an **inference path** that reads features on the +request side. + +### Batch path (per materialization cycle) + +1. The batch job calls `synthesize_users(N, seed)` (in production, the + equivalent computation lives in an offline pipeline against the + warehouse). The result is `{user_id => {field => value, ...}}` for every + user in this cycle. +2. `store.bulk_load(rows, ttl_seconds:)` queues one + [`HSET`]({{< relref "/commands/hset" >}}) plus one + [`EXPIRE`]({{< relref "/commands/expire" >}}) per user through + `redis.pipelined`, so the whole batch ships in a single round trip. + +### Streaming path (per event) + +When a user does something (login, transaction, page view) the streaming +layer computes whatever real-time signals fall out of that event and calls +`store.update_streaming(user_id, fields)`. That queues: + +1. An [`HSET`]({{< relref "/commands/hset" >}}) writing the new field + values. Redis is single-threaded per shard, so this is atomic against + any concurrent batch write on the same hash — no version columns, no + locks. +2. An [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) over exactly the + fields that were written, with the streaming TTL. Each streaming field + carries its own per-field expiry independent of the rest of the hash. + Stop the worker and these fields drop out one by one as their TTLs + elapse, while the batch fields remain populated under the longer + key-level TTL. + +### Inference path (per request) + +1. The model server picks the feature subset it needs (the schema is + owned by the model, not the store). +2. It calls `store.get_features(user_id, names)`, which is one + [`HMGET`]({{< relref "/commands/hmget" >}}). 
Redis returns the values + in the same order as the requested fields, with `nil` for any field + that doesn't exist (or has expired). +3. For batch inference, the model server calls + `store.batch_get_features(user_ids, names)`, which pipelines one + [`HMGET`]({{< relref "/commands/hmget" >}}) per user across all `N` + users in a single network round trip. + +### Project layout + +```text +feature-store/ruby/ +├── Gemfile — redis ~> 5.4, webrick ~> 1.9 +├── feature_store.rb — FeatureStore class +├── streaming_worker.rb — daemon-thread worker +├── build_features.rb — synthesize_users + CLI main +└── demo_server.rb — WEBrick servlet + HTML page (single file) +``` + +Run with `bundle exec ruby demo_server.rb` or +`bundle exec ruby build_features.rb --count 500`. + +## The feature-store helper + +The `FeatureStore` class wraps the read/write paths +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/ruby/feature_store.rb)): + +```ruby +require 'redis' +require_relative 'feature_store' + +redis = Redis.new(url: 'redis://localhost:6379') +store = FeatureStore.new( + redis: redis, + key_prefix: 'fs:user:', + batch_ttl_seconds: 24 * 60 * 60, # whole-entity TTL aligned with the daily batch cycle + streaming_ttl_seconds: 5 * 60, # per-field TTL on each streaming feature +) + +# Batch materialization: one HSET + EXPIRE per user, all pipelined. +store.bulk_load({ + 'u0001' => { + 'country_iso' => 'US', 'risk_segment' => 'low', + 'tx_count_7d' => 14, 'avg_amount_30d' => 92.40, + 'account_age_days' => 612, 'chargeback_count_180d' => 0, + }, +}, ttl_seconds: 24 * 60 * 60) + +# Streaming write: HSET + HEXPIRE on just the fields that changed. +store.update_streaming('u0001', { + 'last_login_ts' => (Time.now.to_f * 1000).to_i, + 'last_device_id' => 'ios-9f02', + 'tx_count_5m' => 3, + 'failed_logins_15m' => 0, + 'session_country' => 'US', +}) + +# Inference read: HMGET of whatever the model needs. 
+features = store.get_features('u0001', [ + 'risk_segment', 'tx_count_7d', 'avg_amount_30d', + 'tx_count_5m', 'failed_logins_15m', +]) + +# Batch scoring: pipelined HMGET across many users. +batch = store.batch_get_features( + %w[u0001 u0002 u0003], + %w[risk_segment tx_count_5m failed_logins_15m], +) +``` + +### Data model + +Each user is one Redis Hash. Every value is stored as a string — Redis +hash fields are bytes on the wire, so the helper renders booleans as +`'true'` / `'false'` and uses `value.to_s` for everything else. The model +server is responsible for parsing back to the right type, the same way it +would when reading any serialized feature store. + +```text +fs:user:u0001 TTL = 86400 s (key-level) + country_iso=US + risk_segment=low + account_age_days=612 + tx_count_7d=14 + avg_amount_30d=92.40 + chargeback_count_180d=0 + last_login_ts=1716998413541 TTL = 300 s (per field, HEXPIRE) + last_device_id=ios-9f02 TTL = 300 s (per field, HEXPIRE) + tx_count_5m=3 TTL = 300 s (per field, HEXPIRE) + failed_logins_15m=0 TTL = 300 s (per field, HEXPIRE) + session_country=US TTL = 300 s (per field, HEXPIRE) +``` + +### Bulk-loading batch features + +`bulk_load` pipelines one `HSET` and one `EXPIRE` per user into a single +round trip via `redis.pipelined`. With 500 users that's 1000 commands in +one network call — Redis processes them sequentially on the server side +but the client only pays one RTT. + +```ruby +def bulk_load(rows, ttl_seconds: nil) + return 0 if rows.empty? + ttl = ttl_seconds || @batch_ttl_seconds + @redis.pipelined do |pipe| + rows.each do |entity_id, fields| + key = key_for(entity_id) + encoded = fields.transform_values { |v| encode_value(v) } + pipe.hset(key, encoded) + pipe.expire(key, ttl) + end + end + ... +end +``` + +`Redis#pipelined` is a non-transactional batch: commands queue up and ship +in one round trip but they don't run inside a `MULTI/EXEC` block. 
That's +the right choice here because each user's `HSET` + `EXPIRE` pair is +independent of every other user's, and an all-or-nothing transaction +would block the server for the duration of the batch. For the rare case +where the pair has to be inseparable, use `redis.multi do |tx| ... end` +or a Lua script via +[`EVAL`]({{< relref "/commands/eval" >}}) / +[Eval scripting]({{< relref "/develop/programmability/eval-intro" >}}). + +In production, the equivalent of this script runs as an offline pipeline +(a Spark or Feast `materialize` job) that reads from the warehouse and +writes into Redis. The +[Feast `RedisOnlineStore`](https://docs.feast.dev/reference/online-stores/redis) +provider does exactly this under the hood; the in-house +[Redis Feature Form]({{< relref "/develop/ai/featureform" >}}) integration +covers the materialize + serve path end-to-end. + +### Streaming writes with per-field TTL + +`update_streaming` is the linchpin of the mixed-staleness story: + +```ruby +def update_streaming(entity_id, fields, ttl_seconds: nil) + return if fields.empty? + ttl = ttl_seconds || @streaming_ttl_seconds + key = key_for(entity_id) + encoded = fields.transform_values { |v| encode_value(v) } + names = encoded.keys + + results = @redis.pipelined do |pipe| + pipe.hset(key, encoded) + pipe.call('HEXPIRE', key, ttl, 'FIELDS', names.size, *names) + end + codes = results[1] || [] + codes.each do |code| + unless code == 1 + raise "HEXPIRE did not set every field TTL for #{key}: #{codes.inspect}" + end + end + ... +end +``` + +[`HEXPIRE`]({{< relref "/commands/hexpire" >}}) sets the TTL on +*individual* hash fields, not on the whole key. The two commands are +queued in the same `pipelined` block so Redis runs them in order: the +`HSET` first creates or overwrites the fields, then `HEXPIRE` attaches a +TTL to each of those same fields. `HEXPIRE` returns one status code per +field: + +* `1` — TTL set / updated. 
+* `2` — the expiry was 0 or in the past, so Redis deleted the field + instead of applying a TTL. +* `0` — an `NX | XX | GT | LT` conditional flag was specified and not + met (we never use one here). +* `-2` — no such field, or no such key. + +The helper raises if any code is anything other than `1`, so the "every +streaming write renews its TTL" invariant fails loudly rather than +silently leaving a streaming field with no expiry attached. + +Why `redis.call('HEXPIRE', ...)` instead of a typed `redis.hexpire`? +redis-rb 5.4 ships no stable typed helpers for the per-field TTL +commands, so `Redis#call` is the canonical way to issue them. The wire +bytes match the protocol exactly. The same `r.call('HTTL', ...)` shape +appears in `field_ttls_seconds`. + +If a streaming pipeline stops, the streaming fields drop out one by one +as their per-field TTLs elapse. `field_ttls_seconds` lets the model side +inspect the remaining TTL on any field — useful both for debugging +("why is this feature missing?" → "it expired three seconds ago") and as +a freshness signal in the model itself. + +> **HEXPIRE requires Redis 7.4 or later.** `HEXPIRE` and the field-level +> TTL commands were added in Redis 7.4. The demo's `Gemfile` pins +> `redis ~> 5.4` and sends the field-TTL commands through `Redis#call`. + +### Inference reads with HMGET + +`get_features` is one `HMGET`: + +```ruby +def get_features(entity_id, field_names = nil) + key = key_for(entity_id) + if field_names.nil? + return @redis.hgetall(key) + end + return {} if field_names.empty? + values = @redis.hmget(key, *field_names) + out = {} + field_names.each_with_index do |n, i| + out[n] = values[i] unless values[i].nil? + end + out +end +``` + +The model knows exactly which features it consumes, so the request path +always takes the `hmget` branch with an explicit field list — that's the +sub-millisecond path. 
`hgetall` is the right call for debugging (which is +what the demo's "Inspect" panel does) but not for serving: it forces +Redis to serialize every field, including ones the model doesn't need. + +Fields that don't exist (because they were never written, or because they +expired) come back as `nil`. The helper drops them from the result hash +so the caller sees only the features that are actually available. + +### Batch scoring with pipelined HMGET + +For batch inference, the same `HMGET` shape pipelines across users: + +```ruby +def batch_get_features(entity_ids, field_names) + return {} if entity_ids.empty? || field_names.empty? + rows = @redis.pipelined do |pipe| + entity_ids.each { |id| pipe.hmget(key_for(id), *field_names) } + end + out = {} + entity_ids.each_with_index do |id, i| + values = rows[i] || [] + row = {} + field_names.each_with_index do |n, j| + row[n] = values[j] unless values[j].nil? + end + out[id] = row + end + out +end +``` + +One round trip for the whole batch. The demo returns a 30-user batch in +~2 ms against a local Redis. + +A Redis Cluster is different: a single `redis.pipelined` block ships +through one connection to one node. For batch reads on a cluster, use +the [`redis-clustering`](https://github.com/redis/redis-rb-cluster) gem +and either fan out parallel `hmget` calls (the cluster client routes +each one to the right shard) or, for tighter control, group entity IDs +by hash slot and run one `pipelined` block per shard in parallel. + +## The streaming worker + +`streaming_worker.rb` is the demo's stand-in for whatever Flink, Kafka +Streams, or bespoke service computes the real-time features +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/ruby/streaming_worker.rb)). +It runs as a daemon `Thread` next to the WEBrick server so the UI can +start, pause, and resume it. + +The lifecycle (start / stop / pause / resume / wait_for_idle) is the same +as every other client in this use case. 
The two correctness levers: + +```ruby +def run + until @stop + sleep(@tick) + break if @stop + # Set tick_in_flight *before* the pause check so a concurrent + # pause + wait_for_idle can never observe tick_in_flight=false + # in the window between the pause check and the actual tick call. + @tick_in_flight = true + begin + do_tick unless @paused + rescue => e + warn "[streaming-worker] tick failed: #{e.class}: #{e.message}" + ensure + @tick_in_flight = false + end + end +ensure + # Clear running and tick_in_flight no matter how the thread exits + # so a later start can spin a fresh thread. + @running = false + @tick_in_flight = false +end +``` + +The same pre-flight `@tick_in_flight = true` before the pause check and +the outer `ensure` block that clears both flags on every exit path +appears in every other client demo, for the same reason: a reset that's +about to `DEL` every key needs to be able to call `worker.pause` to stop +*future* ticks AND `worker.wait_for_idle` to flush a mid-flight tick +before issuing the DEL sweep. + +Pausing the worker is what shows off the mixed-staleness behavior: leave +it paused for longer than `streaming_ttl_seconds` and the streaming +fields disappear from every user's hash one by one, while the batch +fields remain under the longer key-level `EXPIRE`. The demo's +`Pause / resume` button lets you see this happen in real time. + +## The batch builder + +`build_features.rb` is the demo's nightly materializer +([source](https://github.com/redis/docs/blob/main/content/develop/use-cases/feature-store/ruby/build_features.rb)). +It generates synthetic feature rows and calls `store.bulk_load` once. 
+ +Run the builder on its own (independently of the demo server) to +populate Redis from the command line: + +```bash +bundle exec ruby build_features.rb --count 500 --ttl-seconds 3600 +``` + +That writes 500 users at `fs:user:*` with a one-hour key-level TTL, +which is how a typical operator would pre-seed a feature store from the +command line when debugging. + +## The interactive demo + +`demo_server.rb` runs a WEBrick server on port 8093. The HTML page lets +you: + +* **Bulk-load** any number of users (default 200) with a configurable + key-level TTL. +* See the **store state**: user count, batch / streaming TTLs, + cumulative read/write counters. +* See the **streaming worker** status and **pause or resume** it. +* Run an **inference read** for any user with a chosen feature subset, + and see the value, the per-field TTL, and the read latency. +* Run **batch scoring** with a pipelined `HMGET` across `N` users. +* **Inspect** any user's full hash with field-level TTLs and the + key-level TTL. + +The server holds one `Redis` client, one `FeatureStore`, and one +`StreamingWorker` for the lifetime of the process. Every WEBrick request +thread shares the same `Redis` (the gem synchronizes its own access). +Endpoints: + +| Endpoint | What it does | +|---------------------------|-------------------------------------------------------------------------------------| +| `GET /state` | User count, TTL config, stats counters, worker status. | +| `POST /bulk-load` | Pipelined `HSET` + `EXPIRE` over N synthetic users with a chosen TTL. | +| `POST /worker/toggle` | Pause / resume the streaming worker. | +| `POST /read` | `HMGET` a chosen feature subset for one user; report latency and per-field TTLs. | +| `POST /batch-read` | Pipeline `HMGET` across N users; report total latency and per-entity field counts. | +| `GET /inspect` | `HGETALL` + `HTTL` for one user; full hash view with per-field TTLs. 
| +| `POST /reset` | Drop every user under the key prefix (used by the demo's reset button). | + +## Prerequisites + +* **Redis 7.4 or later.** [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) + and [`HTTL`]({{< relref "/commands/httl" >}}) were added in Redis 7.4; + the demo relies on per-field TTL for the mixed-staleness story. +* **Ruby 3.0 or later.** +* The `redis` and `webrick` gems. The demo's `Gemfile` pins + `redis ~> 5.4` and `webrick ~> 1.9`. WEBrick was removed from Ruby's + default-gem set in 3.0, so the explicit pin keeps the demo runnable + on modern Rubies. + +If your Redis server is running elsewhere, start the demo with +`--redis-url redis://host:port`. + +## Running the demo + +### Get the source files + +The demo lives in a small directory under +[`feature-store/ruby`](https://github.com/redis/docs/tree/main/content/develop/use-cases/feature-store/ruby). +Clone the repo or copy the directory: + +```bash +git clone https://github.com/redis/docs.git +cd docs/content/develop/use-cases/feature-store/ruby +bundle install +``` + +### Start the demo server + +From the project directory: + +```bash +bundle exec ruby demo_server.rb +``` + +You should see: + +```text +Dropping any existing users under 'fs:user:*' for a clean demo run (pass --no-reset to keep them). +Redis feature-store demo server listening on http://127.0.0.1:8093 +Using Redis at redis://localhost:6379 with key prefix 'fs:user:' (batch TTL 86400s, streaming TTL 300s) +Materialized 200 user(s); streaming worker running. +``` + +Open [http://127.0.0.1:8093](http://127.0.0.1:8093). Useful things to try: + +* Pick a user and click **Read features** with a mixed batch/streaming + subset — you'll see batch fields with no per-field TTL (covered by the + key-level TTL) and streaming fields with a positive per-field TTL. +* Click **Pipeline HMGET** with `count=100` to see the latency of a + 100-user batch read. 
+* Click **Pause / resume** on the streaming worker and leave it paused + for ~5 minutes (or restart the server with + `--streaming-ttl-seconds 30` to make it visible in seconds). Re-run + **Read features** on any user and watch the streaming fields + disappear while the batch fields stay. +* Click **Inspect** on a user to see the full hash with field-level + TTLs. +* Click **Reset** to drop every user and start over. + +## Production usage + +The guidance below focuses on the production concerns specific to +running a feature store on Redis. For the generic redis-rb production +checklist — connection options, TLS, AUTH, retry policy — see the +[`redis` gem documentation]({{< relref "/develop/clients/ruby" >}}). +The feature-store demo runs against `localhost` with the defaults; a +real deployment should harden the client first. + +### Pick the batch TTL to outlast a failed refresher + +The whole-entity `EXPIRE` is your safety net against silent staleness +from a broken batch pipeline. Set it longer than your worst-case batch +outage so a single missed run doesn't take the feature store offline, +but short enough that a sustained outage causes loud failures (missing +entities) rather than quiet ones (yesterday's features being scored as +today's). The standard choice is one cycle of "expected refresh +interval × 2" — for a daily batch, 48 hours; for a 6-hour batch, 12 +hours. + +The same logic applies to the per-field streaming TTL: a few times the +expected update interval so a slow-but-alive streaming worker doesn't +churn features needlessly, but short enough that a stalled worker +causes visible freshness failures. + +### Co-locate the online store with serving, not with training + +The online store's hash representation does *not* have to match the +schema in your offline store. 
The batch materialization step is your +chance to flatten joins, encode categoricals, and project to whatever +shape the model server wants — so the request path is exactly one +`HMGET` and zero transforms. + +The training pipeline reads from the offline store with its own schema; +the serving pipeline reads from Redis with the flattened serving +schema. Keeping those two pipelines as the same code path is what +prevents training-serving skew. + +### Use redis-clustering for cluster deployments + +A single `redis.pipelined` block ships through one connection to one +node. On a Redis Cluster you need the +[`redis-clustering`](https://github.com/redis/redis-rb-cluster) gem, +which routes each command to the right shard transparently. For batch +reads on a cluster, either fan out parallel `hmget` calls (each routed +per-shard) or group entity IDs by hash slot ahead of time and run one +`pipelined` block per shard in parallel. + +A hash tag like `fs:user:{vip}:u0001` forces a known set of keys onto +the same shard so one pipeline can cover them all in a single round +trip. + +### Make HEXPIRE part of every streaming write + +The single biggest correctness lever in this design is that the +streaming write applies `HEXPIRE` *every time*. If a streaming worker +writes a field without renewing its TTL, the overwrite clears the +field's previous expiry, so the value lingers until the key-level TTL, and the +mixed-staleness invariant breaks. Keep the `HSET` and `HEXPIRE` in the +same pipeline (or, even safer, in the same +[Lua script]({{< relref "/develop/programmability/eval-intro" >}}) if +you don't trust the call site). + +### Avoid HGETALL on the request path + +`HGETALL` reads every field on the hash, including ones the model +doesn't need. With dozens of features per entity, that is wasted +serialization work on the server and wasted bandwidth on the wire. +Always specify the field list explicitly with `hmget` in the model +server. 
+ +The exception is debugging and feature-set discovery, where you +genuinely want the full hash. The demo's "Inspect" button uses +`hgetall` for exactly this reason. + +### Inspect the store directly with redis-cli + +When testing or troubleshooting, the cli tells you everything: + +```bash +# How many users currently in the store +redis-cli --scan --pattern 'fs:user:*' | wc -l + +# One user's full hash and key-level TTL +redis-cli HGETALL fs:user:u0001 +redis-cli TTL fs:user:u0001 + +# Per-field TTL on the streaming fields +redis-cli HTTL fs:user:u0001 FIELDS 5 \ + last_login_ts last_device_id tx_count_5m failed_logins_15m session_country + +# Sample HMGET as the model would issue it +redis-cli HMGET fs:user:u0001 risk_segment tx_count_7d avg_amount_30d tx_count_5m +``` + +A streaming field that returns `-2` from `HTTL` doesn't exist on the +hash (either it was never written, or it expired); `-1` means the +field has no TTL set (and is therefore covered only by the key-level +`EXPIRE`); any positive value is the remaining TTL in seconds. + +## Learn more + +This example uses the following Redis commands: + +* [`HSET`]({{< relref "/commands/hset" >}}) to write a feature or a + whole feature row in one call. +* [`HMGET`]({{< relref "/commands/hmget" >}}) to retrieve any subset + of features for one entity in one round trip. +* [`HGETALL`]({{< relref "/commands/hgetall" >}}) for debugging and + feature-set discovery. +* [`HEXPIRE`]({{< relref "/commands/hexpire" >}}) and + [`HTTL`]({{< relref "/commands/httl" >}}) for per-field TTL on + streaming features (Redis 7.4+). +* [`EXPIRE`]({{< relref "/commands/expire" >}}) and + [`TTL`]({{< relref "/commands/ttl" >}}) for the whole-entity TTL + aligned with the batch materialization cycle. + +See the [`redis` gem documentation]({{< relref "/develop/clients/ruby" >}}) +for the full client reference, and the +[Hashes overview]({{< relref "/develop/data-types/hashes" >}}) for the +deeper conceptual model. 
diff --git a/content/develop/use-cases/feature-store/ruby/build_features.rb b/content/develop/use-cases/feature-store/ruby/build_features.rb new file mode 100644 index 0000000000..4134108117 --- /dev/null +++ b/content/develop/use-cases/feature-store/ruby/build_features.rb @@ -0,0 +1,89 @@ +# Synthesize a small batch of users with realistic-looking features +# and bulk-load them into Redis with a 24-hour key-level TTL. +# +# Stands in for the nightly Spark / Feast materialization job in a +# real deployment. In production the equivalent of this script lives +# in an offline pipeline that reads from the offline store and writes +# the serving-time hashes into Redis via HSET + EXPIRE. + +require 'optparse' +require 'redis' +require_relative 'feature_store' + +COUNTRY_CHOICES = %w[US GB DE FR IN BR JP AU CA NL].freeze +RISK_SEGMENTS = %w[low medium high].freeze +RISK_WEIGHTS = [70, 25, 5].freeze +CHARGEBACK_BUCKETS = [0, 1, 2, 3].freeze +CHARGEBACK_WEIGHTS = [85, 10, 4, 1].freeze + +# Generate `count` synthetic user feature rows. The shape mirrors a +# small fraud-scoring feature set. 
+def synthesize_users(count, seed = 42) + rng = Random.new(seed) + users = {} + (1..count).each do |i| + uid = format('u%04d', i) + users[uid] = { + 'country_iso' => COUNTRY_CHOICES.sample(random: rng), + 'risk_segment' => weighted_str(rng, RISK_SEGMENTS, RISK_WEIGHTS), + 'account_age_days' => rng.rand(7..2400), + 'tx_count_7d' => rng.rand(0..80), + 'avg_amount_30d' => (rng.rand(5.0..350.0) * 100).round / 100.0, + 'chargeback_count_180d' => weighted_int(rng, CHARGEBACK_BUCKETS, CHARGEBACK_WEIGHTS), + } + end + users +end + +def weighted_str(rng, items, weights) + total = weights.sum + r = rng.rand(total) + items.each_with_index do |item, i| + r -= weights[i] + return item if r < 0 + end + items.last +end + +def weighted_int(rng, items, weights) + total = weights.sum + r = rng.rand(total) + items.each_with_index do |item, i| + r -= weights[i] + return item if r < 0 + end + items.last +end + +# CLI entry point. +def build_features_main(argv = ARGV) + redis_url = 'redis://localhost:6379' + count = 200 + ttl_seconds = 24 * 60 * 60 + key_prefix = 'fs:user:' + seed = 42 + + OptionParser.new do |opts| + opts.banner = 'Usage: ruby build_features.rb [options]' + opts.on('--redis-url URL') { |v| redis_url = v } + opts.on('--count N', Integer) { |v| count = v } + opts.on('--ttl-seconds S', Integer) { |v| ttl_seconds = v } + opts.on('--key-prefix PREFIX') { |v| key_prefix = v } + opts.on('--seed N', Integer) { |v| seed = v } + opts.on('-h', '--help') do + puts opts + exit + end + end.parse!(argv) + + redis = Redis.new(url: redis_url) + store = FeatureStore.new(redis: redis, key_prefix: key_prefix, + batch_ttl_seconds: ttl_seconds) + rows = synthesize_users(count, seed) + loaded = store.bulk_load(rows, ttl_seconds: ttl_seconds) + puts "Materialized #{loaded} users at #{key_prefix}* with a #{ttl_seconds}s key-level TTL." 
+ensure + redis&.close +end + +build_features_main if $PROGRAM_NAME == __FILE__ diff --git a/content/develop/use-cases/feature-store/ruby/demo_server.rb b/content/develop/use-cases/feature-store/ruby/demo_server.rb new file mode 100644 index 0000000000..6fd4121045 --- /dev/null +++ b/content/develop/use-cases/feature-store/ruby/demo_server.rb @@ -0,0 +1,665 @@ +#!/usr/bin/env ruby +# Redis feature-store demo server (Ruby + redis-rb + WEBrick). +# +# Run with `bundle exec ruby demo_server.rb` and visit +# http://127.0.0.1:8093 to watch an online feature store at work: +# a batch materialization loads N users with a 24-hour key-level TTL, +# a streaming worker overwrites a handful of users' real-time features +# every second with a per-field HEXPIRE, and the inference panel reads +# any subset of features for any user with HMGET in a single round +# trip. + +require 'json' +require 'optparse' +require 'redis' +require 'uri' +require 'webrick' +require 'thread' + +require_relative 'feature_store' +require_relative 'streaming_worker' +require_relative 'build_features' + +# Bundles the FeatureStore + worker with the lifecycle hooks the HTTP +# handlers call into. A single mutex serializes materialize / reset / +# toggle so the worker pause-and-wait-idle dance can't race with a +# concurrent bulk-load. +class FeatureStoreDemo + def initialize(store:, worker:, seed: 42) + @store = store + @worker = worker + @seed = seed + @lock = Mutex.new + end + + def materialize(count, ttl_seconds) + @lock.synchronize do + rows = synthesize_users(count, @seed) + t0 = monotonic_ms + loaded = @store.bulk_load(rows, ttl_seconds: ttl_seconds) + elapsed_ms = monotonic_ms - t0 + { loaded: loaded, ttl_seconds: ttl_seconds, elapsed_ms: elapsed_ms } + end + end + + # Pause + wait-for-idle around the DEL sweep so a concurrent tick + # can't recreate a user that was just enumerated for deletion + # (streaming HSET creates the key if it's missing, leaving a + # streaming-only hash with no key-level TTL). 
+ def reset + @lock.synchronize do + was_paused = @worker.paused? + if @worker.running? + @worker.pause unless was_paused + @worker.wait_for_idle + end + begin + deleted = @store.reset + @store.reset_stats + @worker.reset_stats + { deleted: deleted } + ensure + @worker.resume if @worker.running? && !was_paused + end + end + end + + def toggle_worker + @lock.synchronize do + @worker.start unless @worker.running? + @worker.paused? ? @worker.resume : @worker.pause + { paused: @worker.paused?, running: @worker.running? } + end + end + + private + + def monotonic_ms + Process.clock_gettime(Process::CLOCK_MONOTONIC) * 1000 + end +end + +class FeatureStoreServlet < WEBrick::HTTPServlet::AbstractServlet + def initialize(server, store, worker, demo, html_page) + super(server) + @store = store + @worker = worker + @demo = demo + @html_page = html_page + end + + def do_GET(req, res) + case req.path + when '/', '/index.html' + res.status = 200 + res['Content-Type'] = 'text/html; charset=utf-8' + res.body = @html_page + when '/state' + json_response(res, 200, build_state) + when '/inspect' + handle_inspect(req, res) + else + res.status = 404 + res.body = 'Not Found' + end + end + + def do_POST(req, res) + case req.path + when '/bulk-load' then handle_bulk_load(req, res) + when '/reset' then json_response(res, 200, @demo.reset) + when '/worker/toggle' then json_response(res, 200, @demo.toggle_worker) + when '/read' then handle_read(req, res) + when '/batch-read' then handle_batch_read(req, res) + else + res.status = 404 + res.body = 'Not Found' + end + end + + private + + def build_state + ids = @store.list_entity_ids(limit: 500) + { + key_prefix: @store.key_prefix, + batch_ttl_seconds: @store.batch_ttl_seconds, + streaming_ttl_seconds: @store.streaming_ttl_seconds, + # list_entity_ids caps at 500 for the UI dropdown; report the + # true count separately so the page doesn't silently understate + # how many users are in the store. 
+ entity_count: @store.count_entities, + entity_ids: ids, + stats: @store.stats, + worker: @worker.stats, + } + end + + def handle_inspect(req, res) + user = (req.query['user'] || '').strip + return json_response(res, 400, { error: 'user is required' }) if user.empty? + + full = @store.get_features(user, nil) + key_ttl = @store.key_ttl_seconds(user) + if full.empty? + return json_response(res, 200, { exists: false, key_ttl_seconds: key_ttl }) + end + # Iterate the known schema (batch + streaming) plus any extras + # the hash carries so expired streaming fields surface as + # ttl_seconds=-2 in the Inspect view rather than silently + # disappearing. + names = FeatureStore::DEFAULT_BATCH_FIELDS + FeatureStore::DEFAULT_STREAMING_FIELDS + full.each_key { |k| names << k unless names.include?(k) } + ttls = @store.field_ttls_seconds(user, names) + fields = names.sort.map do |n| + { name: n, value: full[n] || '', ttl_seconds: ttls[n] || -2 } + end + json_response(res, 200, { exists: true, key_ttl_seconds: key_ttl, fields: fields }) + end + + def handle_bulk_load(req, res) + form = parse_form(req.body) + count = clamp(int_or(form['count'], 200), 1, 2000) + ttl = clamp(int_or(form['ttl'], 86_400), 5, 172_800) + json_response(res, 200, @demo.materialize(count, ttl)) + end + + def handle_read(req, res) + form = parse_form(req.body) + user = first(form['user']).to_s.strip + return json_response(res, 400, { error: 'user is required' }) if user.empty? + fields = (form['field'] || []).reject(&:empty?) + t0 = monotonic_ms + values = fields.empty? ? {} : @store.get_features(user, fields) + elapsed_ms = monotonic_ms - t0 + ttls = fields.empty? ? 
{} : @store.field_ttls_seconds(user, fields) + key_ttl = @store.key_ttl_seconds(user) + json_response(res, 200, { + requested: fields, + values: values, + ttls: ttls, + key_ttl_seconds: key_ttl, + returned_count: values.size, + elapsed_ms: elapsed_ms, + }) + end + + def handle_batch_read(req, res) + form = parse_form(req.body) + count = clamp(int_or(form['count'], 100), 1, 500) + fields = (form['field'] || []).reject(&:empty?) + fields = FeatureStore::DEFAULT_STREAMING_FIELDS + ['risk_segment'] if fields.empty? + ids = @store.list_entity_ids(limit: [count * 2, 2000].max) + ids = ids.first(count) if ids.size > count + t0 = monotonic_ms + rows = @store.batch_get_features(ids, fields) + elapsed_ms = monotonic_ms - t0 + sample = ids.first(10).map do |id| + { id: id, field_count: (rows[id] || {}).size } + end + json_response(res, 200, { + entity_count: ids.size, + field_count: fields.size, + elapsed_ms: elapsed_ms, + sample: sample, + }) + end + + # ---------- helpers ---------- + + # Parse an application/x-www-form-urlencoded body into a multi-value + # Hash>. URI.decode_www_form preserves + # repeated keys (field=a&field=b), which is what we need for the + # inference + batch-read forms. CGI.parse was removed in Ruby 4. + def parse_form(body) + out = Hash.new { |h, k| h[k] = [] } + return out if body.to_s.empty? + URI.decode_www_form(body.to_s).each { |k, v| out[k] << v } + out + end + + def first(values); values.is_a?(Array) ? values.first : values; end + def int_or(values, default); v = first(values); (v && !v.to_s.empty?) ? v.to_i : default; end + def clamp(v, lo, hi); v < lo ? lo : (v > hi ? 
hi : v); end + + def json_response(res, status, payload) + res.status = status + res['Content-Type'] = 'application/json' + res.body = JSON.generate(payload) + end + + def monotonic_ms + Process.clock_gettime(Process::CLOCK_MONOTONIC) * 1000 + end +end + +# ---------------------------------------------------------------------- +# HTML page +# ---------------------------------------------------------------------- + +HTML_TEMPLATE = <<~'HTML' + + + + + + Redis Feature Store Demo (Ruby) + + + +
+
redis-rb + WEBrick
+

Redis Feature Store Demo

+

+ A small fraud-scoring feature store. Each user is one Redis hash + at __KEY_PREFIX__{id} with a batch-materialized + batch half (daily aggregates, + 24-hour key-level EXPIRE) and a streaming + streaming half (real-time + signals, __STREAM_TTL__s per-field HEXPIRE). + Inference reads any subset with one HMGET; batch + scoring pipelines HMGET across N users. +

+ +
+
+

Store state

+
Loading...
+
+ +
+

Materialize batch features

+

Calls HSET + EXPIRE for each user + through one redis.pipelined block, so the whole + batch ships in one round trip.

+ + + + +

+ Drop the TTL to e.g. 30 s and watch entities disappear on + schedule. +

+ + +
+ +
+

Streaming worker

+

Picks __USERS_PER_TICK__ users per tick, writes the + streaming features, applies HEXPIRE + __STREAM_TTL__s per field. Pause it and the + streaming fields drop out via per-field TTL while the batch + fields stay populated.

+
+ +
+ +
+

Inference read (HMGET)

+

Pick a user and a feature subset. One HMGET + round trip returns whatever the model needs.

+
+
+ + +
+
+ + +
+
+

Feature subset

+

+ Tick to include in the HMGET. Per-field TTL is + shown next to each field in the result table. +

+
+
+

Pick a user and click Read features.

+
+
+ +
+

Batch scoring

+

Pipelined HMGET across N random users via + redis.pipelined. One network round trip for the + whole batch.

+ + + +
+

(no batch read yet)

+
+
+ +
+

Inspect one user

+

HGETALL plus per-field HTTL and + key-level TTL. Useful for spotting which + streaming fields have already expired.

+ + + +
+

(pick a user and click Inspect)

+
+
+
+ +
+
+ + + + +HTML + +# ---------------------------------------------------------------------- +# main +# ---------------------------------------------------------------------- + +def parse_args(argv) + opts = { + host: '127.0.0.1', port: 8093, redis_url: 'redis://localhost:6379', + key_prefix: 'fs:user:', + batch_ttl_seconds: 24 * 60 * 60, + streaming_ttl_seconds: 5 * 60, + users_per_tick: 5, seed_users: 200, reset_on_start: true, + } + OptionParser.new do |o| + o.on('--host H') { |v| opts[:host] = v } + o.on('--port P', Integer) { |v| opts[:port] = v } + o.on('--redis-url URL') { |v| opts[:redis_url] = v } + o.on('--key-prefix PFX') { |v| opts[:key_prefix] = v } + o.on('--batch-ttl-seconds S', Integer) { |v| opts[:batch_ttl_seconds] = v } + o.on('--streaming-ttl-seconds S', Integer) { |v| opts[:streaming_ttl_seconds] = v } + o.on('--users-per-tick N', Integer) { |v| opts[:users_per_tick] = v } + o.on('--seed-users N', Integer) { |v| opts[:seed_users] = v } + o.on('--no-reset') { opts[:reset_on_start] = false } + end.parse!(argv) + opts +end + +if $PROGRAM_NAME == __FILE__ + opts = parse_args(ARGV) + redis = Redis.new(url: opts[:redis_url]) + store = FeatureStore.new( + redis: redis, + key_prefix: opts[:key_prefix], + batch_ttl_seconds: opts[:batch_ttl_seconds], + streaming_ttl_seconds: opts[:streaming_ttl_seconds], + ) + if opts[:reset_on_start] + puts "Dropping any existing users under '#{opts[:key_prefix]}*' for a clean demo run (pass --no-reset to keep them)." 
+ store.reset + store.reset_stats + end + seeded = store.bulk_load( + synthesize_users(opts[:seed_users], 42), + ttl_seconds: opts[:batch_ttl_seconds], + ) + + worker = StreamingWorker.new( + store: store, + tick_seconds: 1.0, + users_per_tick: opts[:users_per_tick], + seed: 1337, + ) + worker.start + + demo = FeatureStoreDemo.new(store: store, worker: worker) + + html_page = HTML_TEMPLATE + .gsub('__KEY_PREFIX__', opts[:key_prefix]) + .gsub('__STREAM_TTL__', opts[:streaming_ttl_seconds].to_s) + .gsub('__USERS_PER_TICK__', opts[:users_per_tick].to_s) + .gsub('__BATCH_FIELDS_JSON__', JSON.generate(FeatureStore::DEFAULT_BATCH_FIELDS)) + .gsub('__STREAM_FIELDS_JSON__', JSON.generate(FeatureStore::DEFAULT_STREAMING_FIELDS)) + + server = WEBrick::HTTPServer.new( + BindAddress: opts[:host], + Port: opts[:port], + Logger: WEBrick::Log.new($stderr, WEBrick::Log::WARN), + AccessLog: [], + ) + server.mount('/', FeatureStoreServlet, store, worker, demo, html_page) + + trap('INT') do + puts "\nShutting down..." + worker.stop + server.shutdown + end + + puts "Redis feature-store demo server listening on http://#{opts[:host]}:#{opts[:port]}" + puts "Using Redis at #{opts[:redis_url]} with key prefix '#{opts[:key_prefix]}' (batch TTL #{opts[:batch_ttl_seconds]}s, streaming TTL #{opts[:streaming_ttl_seconds]}s)" + puts "Materialized #{seeded} user(s); streaming worker running." + + server.start +end diff --git a/content/develop/use-cases/feature-store/ruby/feature_store.rb b/content/develop/use-cases/feature-store/ruby/feature_store.rb new file mode 100644 index 0000000000..2000a80186 --- /dev/null +++ b/content/develop/use-cases/feature-store/ruby/feature_store.rb @@ -0,0 +1,294 @@ +# Redis online feature store backed by per-entity Hashes (redis-rb). +# +# Each entity (here, a user) lives at a deterministic key such as +# `fs:user:{id}`. 
The hash holds every feature for that entity as one +# field per feature -- batch-materialized aggregates (refreshed on a +# daily cycle) alongside streaming-updated signals (refreshed every +# few seconds). One `HMGET` returns whichever subset the model needs +# in one network round trip. +# +# Two TTL layers solve the *mixed staleness* problem: +# +# * A key-level `EXPIRE` aligned with the batch materialization +# cycle causes the whole entity to disappear if its batch +# refresher fails, so inference sees a missing entity (which the +# model handler can detect and fall back on) rather than silently +# outdated values. +# * A per-field `HEXPIRE` on each streaming field gives that field +# its own shorter expiry, independent of the rest of the hash. +# When the streaming pipeline stops updating a field, the field +# self-cleans while the rest of the entity stays populated. +# +# `HEXPIRE` and `HTTL` require Redis 7.4 or later. redis-rb 5.4 ships +# no stable typed helpers for the per-field TTL commands, so the +# helper issues them with `Redis#call` directly. The wire bytes are +# identical to what a typed binding would produce. 
+ +require 'redis' +require 'thread' + +class FeatureStore + DEFAULT_BATCH_FIELDS = %w[ + country_iso + risk_segment + account_age_days + tx_count_7d + avg_amount_30d + chargeback_count_180d + ].freeze + + DEFAULT_STREAMING_FIELDS = %w[ + last_login_ts + last_device_id + tx_count_5m + failed_logins_15m + session_country + ].freeze + + DEFAULT_BATCH_TTL_SECONDS = 24 * 60 * 60 + DEFAULT_STREAMING_TTL_SECONDS = 5 * 60 + DEFAULT_KEY_PREFIX = 'fs:user:' + + attr_reader :key_prefix, :batch_ttl_seconds, :streaming_ttl_seconds + + def initialize(redis:, key_prefix: DEFAULT_KEY_PREFIX, + batch_ttl_seconds: DEFAULT_BATCH_TTL_SECONDS, + streaming_ttl_seconds: DEFAULT_STREAMING_TTL_SECONDS) + @redis = redis + @key_prefix = key_prefix + @batch_ttl_seconds = batch_ttl_seconds + @streaming_ttl_seconds = streaming_ttl_seconds + @stats_lock = Mutex.new + @batch_writes_total = 0 + @streaming_writes_total = 0 + @reads_total = 0 + @read_fields_total = 0 + end + + def key_for(entity_id) + "#{@key_prefix}#{entity_id}" + end + + # ------------------------------------------------------------------ + # Batch ingestion (materialization) + # ------------------------------------------------------------------ + + # Materialize a batch of entities into Redis. + # + # rows is `{entity_id => {field => value, ...}}`. One HSET plus one + # EXPIRE per entity, all queued through `Redis#pipelined` so the + # whole batch ships in a single round trip. The key-level EXPIRE + # is what makes the entity disappear if a future batch run fails. + def bulk_load(rows, ttl_seconds: nil) + return 0 if rows.empty? 
+ ttl = ttl_seconds || @batch_ttl_seconds + @redis.pipelined do |pipe| + rows.each do |entity_id, fields| + key = key_for(entity_id) + encoded = fields.transform_values { |v| encode_value(v) } + pipe.hset(key, encoded) + pipe.expire(key, ttl) + end + end + @stats_lock.synchronize { @batch_writes_total += rows.size } + rows.size + end + + def update_batch_feature(entity_id, field, value) + @redis.hset(key_for(entity_id), field, encode_value(value)) + @stats_lock.synchronize { @batch_writes_total += 1 } + end + + # ------------------------------------------------------------------ + # Streaming ingestion + # ------------------------------------------------------------------ + + # Write streaming features with a per-field TTL. + # + # HSET and HEXPIRE are queued in the same pipeline so Redis runs + # them in order: the HSET first creates or overwrites the fields, + # then HEXPIRE attaches a TTL to each of those same fields. + # + # HEXPIRE returns one status code per field: + # 1 = TTL set, 2 = the expiry was 0 or in the past (field deleted), + # 0 = an NX|XX|GT|LT condition was not met (we never use one), + # -2 = no such field or no such key. + # We always follow HSET with HEXPIRE, so any code other than 1 + # means the per-field TTL invariant didn't hold -- raise rather + # than silently leave a streaming field with no expiry attached. + def update_streaming(entity_id, fields, ttl_seconds: nil) + return if fields.empty? 
+ ttl = ttl_seconds || @streaming_ttl_seconds + key = key_for(entity_id) + encoded = fields.transform_values { |v| encode_value(v) } + names = encoded.keys + + results = @redis.pipelined do |pipe| + pipe.hset(key, encoded) + pipe.call('HEXPIRE', key, ttl, 'FIELDS', names.size, *names) + end + # results[0] = HSET fields-set count (ignored) + # results[1] = HEXPIRE per-field codes + codes = results[1] || [] + codes.each do |code| + unless code == 1 + raise "HEXPIRE did not set every field TTL for #{key}: #{codes.inspect}" + end + end + @stats_lock.synchronize { @streaming_writes_total += fields.size } + end + + # ------------------------------------------------------------------ + # Inference reads + # ------------------------------------------------------------------ + + # Retrieve a subset of features for one entity. Pass field_names=nil + # to fetch the entire hash with HGETALL -- useful for debugging but + # rarely the right call on the request path. + def get_features(entity_id, field_names = nil) + key = key_for(entity_id) + if field_names.nil? + data = @redis.hgetall(key) + @stats_lock.synchronize do + @reads_total += 1 + @read_fields_total += data.size + end + return data + end + return {} if field_names.empty? + values = @redis.hmget(key, *field_names) + out = {} + field_names.each_with_index do |n, i| + out[n] = values[i] unless values[i].nil? + end + @stats_lock.synchronize do + @reads_total += 1 + @read_fields_total += out.size + end + out + end + + # Pipeline HMGET across many entities for batch scoring. + def batch_get_features(entity_ids, field_names) + return {} if entity_ids.empty? || field_names.empty? + rows = @redis.pipelined do |pipe| + entity_ids.each { |id| pipe.hmget(key_for(id), *field_names) } + end + out = {} + seen = 0 + entity_ids.each_with_index do |id, i| + values = rows[i] || [] + row = {} + field_names.each_with_index do |n, j| + row[n] = values[j] unless values[j].nil? 
+ end + out[id] = row + seen += row.size + end + @stats_lock.synchronize do + @reads_total += entity_ids.size + @read_fields_total += seen + end + out + end + + # ------------------------------------------------------------------ + # TTL inspection (used by the demo UI) + # ------------------------------------------------------------------ + + def key_ttl_seconds(entity_id) + @redis.ttl(key_for(entity_id)) + end + + # Per-field TTL via HTTL (Redis 7.4+). Each value mirrors the TTL + # convention: positive seconds remaining, -1 no field TTL, -2 + # field/key missing. + def field_ttls_seconds(entity_id, field_names) + return {} if field_names.empty? + codes = @redis.call('HTTL', key_for(entity_id), 'FIELDS', + field_names.size, *field_names) + # HTTL on a missing key returns a flat array of -2s. No + # defensive shim needed for this client. + codes ||= field_names.map { |_| -2 } + out = {} + field_names.each_with_index do |n, i| + out[n] = codes[i] || -2 + end + out + end + + # ------------------------------------------------------------------ + # Demo housekeeping + # ------------------------------------------------------------------ + + def list_entity_ids(limit: 200) + ids = [] + pattern = "#{@key_prefix}*" + prefix_len = @key_prefix.length + @redis.scan_each(match: pattern, count: 200) do |key| + ids << key[prefix_len..-1] if key.length > prefix_len + break if ids.size >= limit + end + ids.sort + end + + def count_entities + n = 0 + pattern = "#{@key_prefix}*" + @redis.scan_each(match: pattern, count: 500) { |_| n += 1 } + n + end + + def delete_entity(entity_id) + @redis.del(key_for(entity_id)) + end + + # Drop every entity under the key prefix. Used by the demo reset + # path; scans in batches and issues one variadic DEL per batch. 
+ def reset + deleted = 0 + pattern = "#{@key_prefix}*" + batch = [] + @redis.scan_each(match: pattern, count: 500) do |key| + batch << key + if batch.size >= 500 + deleted += @redis.del(*batch) + batch.clear + end + end + deleted += @redis.del(*batch) unless batch.empty? + deleted + end + + def stats + @stats_lock.synchronize do + { + batch_writes_total: @batch_writes_total, + streaming_writes_total: @streaming_writes_total, + reads_total: @reads_total, + read_fields_total: @read_fields_total, + } + end + end + + def reset_stats + @stats_lock.synchronize do + @batch_writes_total = 0 + @streaming_writes_total = 0 + @reads_total = 0 + @read_fields_total = 0 + end + end + + # Render a feature value as a string for hash storage. Booleans + # become "true"/"false" so they round-trip cleanly through other + # clients and redis-cli. + def encode_value(value) + case value + when true then 'true' + when false then 'false' + when nil then '' + else value.to_s + end + end +end diff --git a/content/develop/use-cases/feature-store/ruby/streaming_worker.rb b/content/develop/use-cases/feature-store/ruby/streaming_worker.rb new file mode 100644 index 0000000000..ac685271f4 --- /dev/null +++ b/content/develop/use-cases/feature-store/ruby/streaming_worker.rb @@ -0,0 +1,181 @@ +# Streaming feature updater for the demo. +# +# Stands in for whatever Flink, Kafka Streams, or bespoke service +# computes the real-time features in a real deployment. In production +# this code lives in the streaming layer; here it runs as a daemon +# thread next to the demo server so the UI can start, pause, and +# resume it. 
+
+require 'thread'
+
+class StreamingWorker
+  # Value pools the synthetic streaming features are drawn from.
+  DEVICE_IDS = %w[ios-1a4c ios-9f02 and-7b21 and-2d18 web-chr-1 web-saf-1 web-ff-2].freeze
+  SESSION_COUNTRIES = %w[US GB DE FR IN BR JP AU CA NL].freeze
+  # failed_logins_15m is drawn from BUCKETS with the matching WEIGHTS
+  # (the weights sum to 100, so they read as percentages).
+  FAILED_LOGIN_BUCKETS = [0, 1, 2, 5].freeze
+  FAILED_LOGIN_WEIGHTS = [70, 20, 8, 2].freeze
+
+  attr_reader :users_per_tick
+
+  # store is the FeatureStore written to on each tick; tick_seconds is
+  # the sleep between ticks; seed fixes the random sequence.
+  def initialize(store:, tick_seconds: 1.0, users_per_tick: 5, seed: 1337)
+    @store = store
+    @tick = tick_seconds
+    @users_per_tick = users_per_tick
+    @rng = Random.new(seed)
+    @rng_lock = Mutex.new
+
+    @lifecycle_lock = Mutex.new
+    @running = false
+    @paused = false
+    @tick_in_flight = false
+    @tick_count = 0
+    @writes_count = 0
+    @stop = false
+    @thread = nil
+  end
+
+  # ------------------------------------------------------------------
+  # Lifecycle
+  # ------------------------------------------------------------------
+
+  # Start the tick thread. A no-op when already running; otherwise the
+  # worker always comes up unpaused.
+  def start
+    @lifecycle_lock.synchronize do
+      return if @running
+      @running = true
+      @paused = false
+      @stop = false
+      @thread = Thread.new { run }
+      @thread.report_on_exception = true
+    end
+  end
+
+  # Ask the tick thread to exit and wait for it. The first join is
+  # bounded at 2 seconds; on timeout a warning is logged and the join
+  # is repeated without a limit, so this never returns while the
+  # thread is still alive. A no-op when the worker is not running.
+  def stop
+    thread = nil
+    @lifecycle_lock.synchronize do
+      return unless @running
+      @stop = true
+      @running = false
+      thread = @thread
+      @thread = nil
+    end
+    # Join outside the lock so the lock is not held while waiting.
+    if thread && !thread.join(2)
+      warn '[streaming-worker] stop timed out; waiting for tick to complete'
+      thread.join
+    end
+    # Defensive clear: the run loop's outer ensure already does this,
+    # but the thread may have exited via a path that skips it.
+    @tick_in_flight = false
+  end
+
+  # pause / resume only flip the flag the tick loop checks; neither
+  # waits for a tick that is already running (see wait_for_idle).
+  def pause; @paused = true; end
+  def resume; @paused = false; end
+
+  def running?; @running; end
+  def paused?; @paused; end
+
+  # Block until any in-flight tick has finished. `pause` only stops
+  # *future* ticks; callers (a reset that's about to DEL every
+  # entity, for example) use this to flush a mid-flight tick before
+  # they touch state the tick might still be writing to.
+  def wait_for_idle
+    sleep(0.02) while @tick_in_flight
+  end
+
+  # Current lifecycle flags and counters, read without a lock.
+  def stats
+    {
+      running: @running,
+      paused: @paused,
+      tick_count: @tick_count,
+      writes_count: @writes_count,
+    }
+  end
+
+  # Zero the tick and write counters.
+  def reset_stats
+    @tick_count = 0
+    @writes_count = 0
+  end
+
+  # ------------------------------------------------------------------
+  # Tick loop
+  # ------------------------------------------------------------------
+
+  private
+
+  # Thread body: sleep one tick interval, then run a tick unless
+  # paused, until @stop is set. A tick that raises is logged and the
+  # loop carries on.
+  def run
+    until @stop
+      sleep(@tick)
+      break if @stop
+
+      # Set tick_in_flight *before* the pause check so a concurrent
+      # pause+wait_for_idle can never observe tick_in_flight=false in
+      # the window between the pause check and the actual tick call.
+      # The ensure block clears the flag whether we paused, succeeded,
+      # or raised.
+      @tick_in_flight = true
+      begin
+        do_tick unless @paused
+      rescue => e
+        warn "[streaming-worker] tick failed: #{e.class}: #{e.message}"
+      ensure
+        @tick_in_flight = false
+      end
+    end
+  ensure
+    # Clear running and tick_in_flight no matter how the thread
+    # exits so a later start can spin a fresh thread.
+    @running = false
+    @tick_in_flight = false
+  end
+
+  # One tick: pick up to users_per_tick ids from the first 500 the
+  # store lists and write a fresh set of streaming fields for each.
+  # The counters are only updated once every pick has been written,
+  # so a tick that raises part-way leaves them unchanged.
+  def do_tick
+    ids = @store.list_entity_ids(limit: 500)
+    return if ids.empty?
+    picks = sample(ids, @users_per_tick)
+    now_ms = (Time.now.to_f * 1000).to_i
+    writes = 0
+    picks.each do |id|
+      fields = {
+        'last_login_ts' => now_ms,
+        'last_device_id' => choice(DEVICE_IDS),
+        'tx_count_5m' => intn(13),
+        'failed_logins_15m' => weighted_int(FAILED_LOGIN_BUCKETS, FAILED_LOGIN_WEIGHTS),
+        'session_country' => choice(SESSION_COUNTRIES),
+      }
+      @store.update_streaming(id, fields)
+      writes += fields.size
+    end
+    @tick_count += 1
+    @writes_count += writes
+  end
+
+  # Draw up to k distinct items without replacement. Works on a copy,
+  # so the caller's array is not modified.
+  def sample(items, k)
+    @rng_lock.synchronize do
+      pool = items.dup
+      out = []
+      [k, pool.size].min.times do
+        idx = @rng.rand(pool.size)
+        out << pool.delete_at(idx)
+      end
+      out
+    end
+  end
+
+  # One uniformly chosen item.
+  def choice(items)
+    @rng_lock.synchronize { items[@rng.rand(items.size)] }
+  end
+
+  # Random integer in 0...n.
+  def intn(n)
+    @rng_lock.synchronize { @rng.rand(n) }
+  end
+
+  # Pick an item with probability proportional to its weight: draw r
+  # in 0...total and walk the weights until r goes negative.
+  def weighted_int(items, weights)
+    @rng_lock.synchronize do
+      total = weights.sum
+      r = @rng.rand(total)
+      items.each_with_index do |item, i|
+        r -= weights[i]
+        return item if r < 0
+      end
+      items.last
+    end
+  end
+end
From 716d88ffcda2dd85a15f4d217901c2085fb7e1b2 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 14:40:08 +0100 Subject: [PATCH 11/20] DOC-6661 improvement to redis-py example learned from implementation experience --- .../use-cases/feature-store/redis-py/streaming_worker.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/content/develop/use-cases/feature-store/redis-py/streaming_worker.py b/content/develop/use-cases/feature-store/redis-py/streaming_worker.py index 410a180690..ba5ef79de4 100644 --- a/content/develop/use-cases/feature-store/redis-py/streaming_worker.py +++ b/content/develop/use-cases/feature-store/redis-py/streaming_worker.py @@ -16,6 +16,7 @@ from __future__ import annotations +import logging import random import threading import time @@ -73,6 +74,11 @@ def stop(self) -> None: thread = self._thread if thread is not None: thread.join(timeout=2.0) + if thread.is_alive(): +
logging.getLogger(__name__).warning( + "stop timed out; waiting for tick to complete", + ) + thread.join() self._thread = None def pause(self) -> None: From 3e6418a2cc2110041a0f1b57c143a63bf8d3df0e Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 14:49:39 +0100 Subject: [PATCH 12/20] DOC-6661 Claude's lessons learned -> skill --- .agents/skills/redis-use-case-ports/SKILL.md | 1 + .../assets/audit-checklist.md | 68 +++++++++++++++++++ .../assets/redis-conventions.md | 65 ++++++++++++++++++ 3 files changed, 134 insertions(+) diff --git a/.agents/skills/redis-use-case-ports/SKILL.md b/.agents/skills/redis-use-case-ports/SKILL.md index 6c772f5995..8714386934 100644 --- a/.agents/skills/redis-use-case-ports/SKILL.md +++ b/.agents/skills/redis-use-case-ports/SKILL.md @@ -164,3 +164,4 @@ Keep `SKILL.md` itself focused on the workflow. The concrete artefacts live in ` - [`content/develop/use-cases/job-queue/`](../../../content/develop/use-cases/job-queue/) — the project that introduced rows 11–13 of [`audit-checklist.md`](assets/audit-checklist.md) (token-checked atomic state transitions, crash-window fallback timer, shared-keyspace collision in parallel smoke tests). - [`content/develop/use-cases/pub-sub/`](../../../content/develop/use-cases/pub-sub/) — the first non-keyspace use case ported. Introduced rows 14–18 of [`audit-checklist.md`](assets/audit-checklist.md) (subscribe-ack race, concurrent-name reservation, detached-worker PID capture, silent timeout fallthrough, server-wide PUBSUB introspection) plus the pub/sub conventions section in [`redis-conventions.md`](assets/redis-conventions.md). Also the project that motivated adding Phase 4b (independent review) after Codex caught four real bugs that Phase 4 cleared. - [`content/develop/use-cases/recommendation-engine/`](../../../content/develop/use-cases/recommendation-engine/) — the first ML / vector-search use case. 
Introduced the **ML / vector-search use cases** section in [`redis-conventions.md`](assets/redis-conventions.md) (per-client embedding library table, pre-computed `catalog.json` wire format, FFI / Ruby-version setup blockers, per-port deviation conventions) and rows 24–28 of [`audit-checklist.md`](assets/audit-checklist.md) (vector dim mismatch in client-side blend helpers, L2 normalisation silently skipped by the embedding wrapper, TAG escape must include the backslash itself, connection-wide state toggle race on a shared client, weight=0 must disable not normalise to default). Each of the five new rows came from a real bug — bugbot or Codex caught all of them; the Python reference shipped with the TAG-escape bug originally. +- [`content/develop/use-cases/feature-store/`](../../../content/develop/use-cases/feature-store/) — the first streaming-feature-store use case (HEXPIRE / HTTL per-field TTL + a long-lived in-process worker thread next to the demo server). Introduced the **Streaming-worker / background-task patterns** section in [`redis-conventions.md`](assets/redis-conventions.md) (pre-flight in-flight flag, worker lifetime decoupled from request lifetime, stop semantics, per-client HEXPIRE pipeline reply-shape table) and rows 29–31 of [`audit-checklist.md`](assets/audit-checklist.md) (HEXPIRE / HTTL per-field reply-code checking, pause-and-wait-idle race in worker-thread reset paths, worker stop with bounded join + silent thread abandonment). The reference Python implementation shipped without the in-flight flag and the stop-timeout recovery; Codex caught both on later ports. The stop-timeout recovery was then retrofitted to Python (row 31); the in-flight flag was not (row 30).
diff --git a/.agents/skills/redis-use-case-ports/assets/audit-checklist.md b/.agents/skills/redis-use-case-ports/assets/audit-checklist.md index ccfa7351b8..2df3865c3c 100644 --- a/.agents/skills/redis-use-case-ports/assets/audit-checklist.md +++ b/.agents/skills/redis-use-case-ports/assets/audit-checklist.md @@ -480,6 +480,74 @@ The first form lets a caller pass `0` to bypass the bonus entirely (and a downst --- +## 29. HEXPIRE / HTTL per-field reply-code checking + +**What to scan for:** every call site of `HEXPIRE`, `HEXPIREAT`, `HPEXPIRE`, `HPEXPIREAT`, `HTTL`, `HPTTL`, or any client-library typed wrapper around them. Look at how the per-field array reply is consumed. + +**Pass criterion:** `HEXPIRE`-family commands return one status code per requested field, not a single success/failure. Each code is: + +* `1` — TTL set / updated. +* `2` — the expiry was `0` or in the past, so Redis deleted the field instead of attaching a TTL. +* `0` — an `NX | XX | GT | LT` conditional flag was specified and not met. +* `-2` — no such field, or no such key. + +The helper must **iterate the reply array and raise/throw on any code other than `1`** (when no conditional flag is in use), so the "every streaming write renews its TTL" invariant fails loudly rather than silently leaving a field with no expiry attached. A naked `await client.hexpire(...)` (or `pipe.hexpire(...)` whose result is discarded) is the wrong shape — the call can "succeed" at the RESP level and still have left every field un-TTL'd. + +`HTTL` returns the same array shape (per-field integer seconds, with `-2` for missing fields and `-1` for fields with no TTL). When the key is missing entirely, some libraries return a list-of-`-2` of the right length, others return `nil` / `None` / `null`. The helper must normalise to a per-field array of integers, defaulting missing/short replies to `-2` so callers never index out of range. 
+ +**Sample audit prompt:** + +> For each port under `content/develop/use-cases/{{USE_CASE_NAME}}/`, locate every `HEXPIRE` (or family) call site and every `HTTL` call site. For HEXPIRE: confirm the helper iterates the per-field array and raises / throws on any code other than `1` (or documents why a specific non-`1` code is acceptable). A discarded reply or a check that only looks at the first element is a bug. For HTTL: confirm the helper normalises the reply to a per-field array even when the key is missing, with `-2` as the default for missing slots. Flag any port where a partial or `null` reply could cause an index-out-of-range error, a silent loss of the dead-letter signal, or a per-field TTL that never actually got set. + +**Why on list:** Feature-store use case, Codex independent review. The Python reference originally awaited `hexpire(...)` and discarded the per-field reply; for the streaming-feature-store pattern to work, every streaming write **must** renew the per-field TTL on every call. A single code of `2` (which means "Redis deleted the field because the expiry was already in the past") looks like success but is actually data loss. The defensive shim for HTTL was needed because redis-rs's typed wrapper, redis-rb's `call`-style return, and several of the pipelined clients all surface partial / `nil` arrays differently when the key has expired between the caller's check and the HTTL itself. + +--- + +## 30. Pause-and-wait-idle race in worker-thread reset paths + +**What to scan for:** every worker-thread tick loop that supports `pause()` plus an external `reset` / `clear` / `purge` path. Look at where the in-flight flag (`tick_in_flight`, `_tickInFlight`, `Volatile.Read(ref _tickInFlight)`, etc.) is set relative to the `paused` check inside the tick loop. + +**Pass criterion:** the in-flight flag must be set to `true` (or `1`) **before** the pause check, with a `finally` / `defer` / `ensure` block clearing it on every exit path. 
The combination lets an external caller do: + +``` +worker.pause() # stop future ticks +worker.wait_for_idle() # wait for the current tick to drain +store.reset() # safe to delete keys now +worker.resume() +``` + +If the in-flight flag is set **inside** the `if not paused: ...` branch, there is a window between the pause check and the actual tick where a concurrent `pause()` + `wait_for_idle()` observes `tick_in_flight=false` AND `paused=true`, falls straight through, and runs the `DEL` sweep while the tick is mid-write. The streaming write then recreates a hash entry that was just enumerated for deletion — leaving a streaming-only hash with no key-level TTL. Symptom: "0 leftover keys" smoke test fails sporadically, often only under load. + +The lifecycle flags (`running`, `tick_in_flight`) must be cleared in an **outer** `try/finally` / `defer` (around the whole tick loop, not just one iteration) so a thread that exits via an uncaught exception or a panic leaves the worker in a state where `start()` can spin a fresh thread. Without the outer clear, the demo's "is the worker running?" indicator gets stuck on, and a subsequent `start()` becomes a no-op. + +**Sample audit prompt:** + +> Audit every worker-thread tick loop in the 9 client implementations under `content/develop/use-cases/{{USE_CASE_NAME}}/`. For each, verify (a) the in-flight flag is set to true BEFORE the `paused` check, not inside the `not paused` branch; (b) a finally / defer / ensure clears the in-flight flag on every exit path including the paused-and-skipped path; (c) an outer try/finally around the whole tick loop clears both `running` and the in-flight flag so a panic / uncaught exception doesn't strand the lifecycle state. Run a quick stress test: 5x `reset` + `bulk-load` against an active streaming worker; the final keyspace must contain 0 leftover streaming-only hashes. Flag any port where (a), (b), or (c) is missing — those ports can produce ghost entries under concurrent reset. 
+ +**Why on list:** Feature-store use case. Codex flagged the bug first on the Go port; once articulated, the same shape needed fixing in 7 of the 8 sibling ports (only Node.js's single-threaded event loop was immune). The reference Python implementation **shipped without the fix** — Codex caught it on a later client, and Python wasn't retrofitted. Future Phase 1 reference implementations of streaming-worker-style use cases must adopt the pattern from the start. + +--- + +## 31. Worker stop with bounded join + silent thread abandonment + +**What to scan for:** every `stop()` / `Stop()` / `StopAsync()` / shutdown method on a worker that owns a thread, task, or goroutine. Look at how the parent waits for the worker to exit. + +**Pass criterion:** if the wait is bounded (`thread.join(timeout=2.0)`, `worker.join(2000)`, `task.Wait(2000)`, etc.), the timeout-expired path must escalate, not silently move on. Acceptable shapes: + +* **Warn + indefinite wait.** Log a warning and call `thread.join()` (no timeout) so the parent at least observes that the stop took longer than the budget but never returns while the thread is still alive. This is the right shape for demos and well-behaved workers. +* **Force-interrupt + wait.** Cancel the task's cancellation token, send `Thread.interrupt()`, send `SIGTERM`, etc., and only then return. The right shape for production code where the worker might be stuck in a blocking I/O call. +* **Recovery via the in-flight flag.** Pair the bounded join with a `waitForIdle()` (polling the in-flight flag) that runs after the join. The in-flight flag's lifecycle (per row 30) is the eventual truth — even if the thread is still alive, once `tick_in_flight=false` the worker is safe to operate on. This is how Jedis and Lettuce ship in the feature-store ports. + +A bare `thread.join(timeout=N); self._thread = None` (drop the handle, move on) is the wrong shape. 
The thread is still running, holding a Redis connection, potentially writing during the next bulk-load. The demo "works" because Python daemon threads die when the process exits — but `stop()` was supposed to be a clean shutdown, and silently abandoning the thread defeats every test that relies on it. + +**Sample audit prompt:** + +> For each port under `content/develop/use-cases/{{USE_CASE_NAME}}/`, locate the worker's stop / shutdown method. If it uses a bounded join / wait (any timeout, any unit), verify one of these three recovery paths is present: (a) on timeout, log a warning and join indefinitely; (b) on timeout, force-interrupt the worker and then wait; (c) on timeout, fall through to a `waitForIdle()` (or equivalent in-flight-flag poll) that provides the actual safety guarantee. Flag any port where the timeout path is "set the handle to null and return" — that's silent thread abandonment, regardless of how the demo behaves under normal load. + +**Why on list:** Feature-store use case, Codex independent review of the Ruby port. The same shape was already in the Python reference (`thread.join(timeout=2.0)` then `self._thread = None`) but no earlier audit flagged it; Codex caught it on Ruby and the Python retrofit followed. Jedis / Lettuce had the bounded join but were saved by an explicit `waitForIdle()` after it — that's recovery shape (c) above, and it's the reason the bug never surfaced in those clients. Go / .NET / Rust / Node.js / PHP all use unbounded waits and are fine. The bug class is real even when masked by the in-flight-flag recovery; future ports should pick one shape and apply it consistently. 
+ +--- + ## How to add a new row When a bug class is identified after this skill has been used: diff --git a/.agents/skills/redis-use-case-ports/assets/redis-conventions.md b/.agents/skills/redis-use-case-ports/assets/redis-conventions.md index da5559408b..fbbae3ada9 100644 --- a/.agents/skills/redis-use-case-ports/assets/redis-conventions.md +++ b/.agents/skills/redis-use-case-ports/assets/redis-conventions.md @@ -416,6 +416,71 @@ The reference Python pattern is an `_emit_change_locked(...)` helper called insi See audit-checklist row 16 for the audit prompt. +## Streaming-worker / background-task patterns + +These apply to any use case whose demo runs a long-lived in-process worker thread alongside the HTTP handler (streaming-feature-store updaters, background sync workers, CDC consumers, scheduled reindexers). The shape is similar to the pub/sub workers above but without the network primitive — the cross-port traps are about thread lifecycle and pause / wait-idle race conditions, not about ack handshakes. + +### Pre-flight in-flight flag before the pause check + +Every worker that exposes both `pause()` and `wait_for_idle()` (or equivalent) must set its `tick_in_flight` flag to `true` **before** the `paused` check inside the tick loop, with a `finally` / `defer` / `ensure` block that clears it on every exit path. The reference shape (Python-style pseudocode) is: + +```python +while not self._stop_event.is_set(): + self._stop_event.wait(timeout=self.tick_seconds) + if self._stop_event.is_set(): + break + self._tick_in_flight.set() + try: + if not self._paused.is_set(): + self._tick() + except Exception: + ... + finally: + self._tick_in_flight.clear() +``` + +The point is that an external caller can do `pause() + wait_for_idle() + reset()` and be guaranteed the reset's `DEL` sweep runs only after the in-flight tick has drained. 
If the flag is set **inside** the `not paused` branch, a concurrent `pause` + `wait_for_idle` can fall straight through while the tick is still mid-write, and the streaming `HSET` recreates an entry the reset just enumerated for deletion — leaving a streaming-only hash with no key-level TTL. Audit-checklist row 30 covers this. + +The outer `try/finally` (or `defer`, or `ensure`) wrapping the **whole tick loop** must also clear `running` and `tick_in_flight` on every exit path, so a worker that exits via an uncaught exception leaves the lifecycle state where the next `start()` can spin a fresh thread. + +### Worker lifetime decoupled from request lifetime + +Workers spawned from an HTTP request handler must not inherit the request's cancellation context. In Go specifically, calling `worker.Start(ctx)` with the `*http.Request.Context()` kills the worker as soon as the request completes — a particularly easy mistake because the Go community style strongly encourages passing `ctx` through everything. The fix is for `Start` to derive `context.Background()` (or use `context.WithCancel(context.Background())` for its own cancellation) internally; the HTTP `ctx` is only for the request's own work. + +The same shape applies to .NET (`CancellationToken` from the request) and Rust (`tokio_util::sync::CancellationToken` from the handler). The lifecycle of the worker is the lifetime of the **demo server process**, not the lifetime of any single request. + +### Worker stop semantics + +If the stop path uses a bounded `join` / `wait` / `await`, the timeout-expired branch must escalate — log + indefinite join, interrupt + wait, or fall through to a `waitForIdle()` on the in-flight flag. A bare `thread.join(timeout=N); thread = None` (drop the handle, move on) is silent thread abandonment, regardless of whether the daemon-thread shape lets the process exit cleanly. 
Audit-checklist row 31 covers this; the reference Python implementation shipped without it and was retrofitted after Codex flagged the same shape in the Ruby port. + +### HEXPIRE pipeline reply shapes vary across clients + +`HEXPIRE` returns one status code per requested field. Inside a pipeline / multi block the per-client decode shape varies: + +| Client | Pipeline-mode HEXPIRE reply | Notes | +|---|---|---| +| redis-py | `[int, int, ...]` flat list | `await pipe.execute()` gives back the per-field codes directly. | +| node-redis 5.x | `MultiErrorReply` per failed code | Inside `multi/exec`. Use `execAsPipeline()` for a plain array reply if no transactional guarantee is needed. | +| go-redis v9 | `[]int64` from `cmd.Result()` | Inspect via `redis.IntSliceCmd`. | +| Jedis | `List<Long>` from `Response.get()` | After `pipeline.sync()`. | +| Lettuce | `List<Long>` from `RedisFuture<List<Long>>.get()` | Use `async()` then `awaitAll` or `awaitOrCancel`. | +| StackExchange.Redis | `RedisResult[]` — one per field | `(long)results[i]` to read each code. | +| Predis 3 | `array` from the `pipeline()` callback's return value | The typed `hexpire()` decode preserves the per-field shape. | +| redis-rb | `Array` from `redis.pipelined { ... }` | Use `redis.call('HEXPIRE', key, ttl, 'FIELDS', n, *names)`; the typed binding is not stable on 5.4. | +| redis-rs | `Vec<Vec<i64>>` — outer pipeline wraps the inner array, take `[0]` | `pipe.cmd("HEXPIRE")...query_async::<Vec<Vec<i64>>>(&mut conn)`. | + +In every client the helper must iterate the per-field codes and raise / throw on anything other than `1` (assuming no `NX | XX | GT | LT` flag is in use). A discarded reply or a check that only looks at the first element silently leaves the rest of the fields un-TTL'd. Audit-checklist row 29 covers this. + +`HTTL` follows the same per-field-array shape with `-1` (no TTL) and `-2` (missing field/key) sentinels.
The helper must normalise to a per-field array even when the reply is `nil` / `None` / `null` for a missing key — default missing slots to `-2` so callers never index out of range. + +### Stats counters across stateless and stateful runtimes + +Worker tick counts, write counts, and reads-per-second counters live in process memory for the threaded ports (Python, Node, Go, Jedis, Lettuce, Rust, .NET, Ruby). For PHP under `php -S`, where the demo server and the worker run as separate processes, the counters move into Redis under a `fs:control:*` / `<prefix>:control:*` keyspace and the demo server / worker both `INCRBY` / `GET` against them. This is the same pattern as the prefetch-cache PHP port's cross-process pause flags (row 5 + the PHP-specific section above) but generalised to any counter the UI needs to display. + +### Reference projects + +* [`content/develop/use-cases/feature-store/`](../../../content/develop/use-cases/feature-store/) — established this section's conventions. Each port has a `StreamingWorker` (or equivalent) implementing the pause-and-wait-idle pattern, the outer lifecycle try/finally, and the per-field HEXPIRE reply check. + ## ML / vector-search use cases These apply to any use case whose helper has to embed text (or any other modality) into a vector and store the bytes in a Redis Search VECTOR field — recommendation engines, semantic search, RAG retrieval, classification feature stores. The shape is fundamentally different from the keyspace use cases because each port has to choose its own embedding library, and the library choice has real implications for setup ergonomics, performance, and which sentence encoder it ships. 
From c0406358b22dbd613257ff7569453c2af05ff52a Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 14:53:51 +0100 Subject: [PATCH 13/20] DOC-6661 Python issue fixed following skill improvement --- .../feature-store/redis-py/demo_server.py | 4 ++ .../redis-py/streaming_worker.py | 50 +++++++++++++++---- 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/content/develop/use-cases/feature-store/redis-py/demo_server.py b/content/develop/use-cases/feature-store/redis-py/demo_server.py index fd8713a3f3..cd0ad15b21 100644 --- a/content/develop/use-cases/feature-store/redis-py/demo_server.py +++ b/content/develop/use-cases/feature-store/redis-py/demo_server.py @@ -517,10 +517,14 @@ def reset(self) -> dict: # tick can't recreate a user that was just enumerated for deletion # (streaming HSET creates the key if it's missing, and that would # leave behind a streaming-only hash with no key-level TTL). + # pause() only stops *future* ticks; wait_for_idle() flushes any + # tick that's already mid-write before the DEL sweep runs. 
was_paused = self.worker.is_paused if self.worker.is_running and not was_paused: self.worker.pause() try: + if self.worker.is_running: + self.worker.wait_for_idle() deleted = self.store.reset() self.store.reset_stats() self.worker.reset_stats() diff --git a/content/develop/use-cases/feature-store/redis-py/streaming_worker.py b/content/develop/use-cases/feature-store/redis-py/streaming_worker.py index ba5ef79de4..d3f3df223e 100644 --- a/content/develop/use-cases/feature-store/redis-py/streaming_worker.py +++ b/content/develop/use-cases/feature-store/redis-py/streaming_worker.py @@ -50,6 +50,7 @@ def __init__( self._thread: Optional[threading.Thread] = None self._stop_event = threading.Event() self._paused = threading.Event() + self._tick_in_flight = threading.Event() self._lock = threading.Lock() self._tick_count = 0 @@ -87,6 +88,22 @@ def pause(self) -> None: def resume(self) -> None: self._paused.clear() + def wait_for_idle(self, timeout: float = 5.0) -> bool: + """Block until any in-flight tick has finished. + + ``pause()`` only stops *future* ticks; callers (a reset that's + about to ``DEL`` every entity, for example) use this to flush a + mid-flight tick before they touch state the tick might still be + writing to. Returns ``True`` if the worker is idle, ``False`` + on timeout. 
+ """ + deadline = time.monotonic() + timeout + while self._tick_in_flight.is_set(): + if time.monotonic() >= deadline: + return False + time.sleep(0.02) + return True + @property def is_paused(self) -> bool: return self._paused.is_set() @@ -100,15 +117,30 @@ def is_running(self) -> bool: # ------------------------------------------------------------------ def _run(self) -> None: - while not self._stop_event.is_set(): - if self._paused.is_set(): - time.sleep(0.05) - continue - try: - self._tick() - except Exception as exc: - print(f"[streaming-worker] tick failed: {exc}") - self._stop_event.wait(timeout=self.tick_seconds) + try: + while not self._stop_event.is_set(): + self._stop_event.wait(timeout=self.tick_seconds) + if self._stop_event.is_set(): + break + # Set tick_in_flight *before* the pause check so a + # concurrent pause + wait_for_idle can never observe + # tick_in_flight=False in the window between the pause + # check and the actual tick call. The finally block + # clears the flag whether we paused, succeeded, or + # raised. + self._tick_in_flight.set() + try: + if not self._paused.is_set(): + self._tick() + except Exception as exc: + print(f"[streaming-worker] tick failed: {exc}") + finally: + self._tick_in_flight.clear() + finally: + # Clear tick_in_flight no matter how the loop exits so a + # later start() can spin a fresh thread with a clean + # in-flight slate. 
+ self._tick_in_flight.clear() def _tick(self) -> None: ids = self.store.list_entity_ids(limit=500) From 1e7045ee043e637963c22240b396ac92b8013d48 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 15:17:52 +0100 Subject: [PATCH 14/20] DOC-6661 restored consistency after merge conflict resolution --- .agents/skills/redis-use-case-ports/SKILL.md | 2 +- .../assets/audit-checklist.md | 133 +++++++++--------- .../assets/redis-conventions.md | 6 +- 3 files changed, 72 insertions(+), 69 deletions(-) diff --git a/.agents/skills/redis-use-case-ports/SKILL.md b/.agents/skills/redis-use-case-ports/SKILL.md index 2e7fbe8d56..4fd80bfdda 100644 --- a/.agents/skills/redis-use-case-ports/SKILL.md +++ b/.agents/skills/redis-use-case-ports/SKILL.md @@ -166,5 +166,5 @@ Keep `SKILL.md` itself focused on the workflow. The concrete artefacts live in ` - [`content/develop/use-cases/job-queue/`](../../../content/develop/use-cases/job-queue/) — the project that introduced rows 11–13 of [`audit-checklist.md`](assets/audit-checklist.md) (token-checked atomic state transitions, crash-window fallback timer, shared-keyspace collision in parallel smoke tests). - [`content/develop/use-cases/pub-sub/`](../../../content/develop/use-cases/pub-sub/) — the first non-keyspace use case ported. Introduced rows 14–18 of [`audit-checklist.md`](assets/audit-checklist.md) (subscribe-ack race, concurrent-name reservation, detached-worker PID capture, silent timeout fallthrough, server-wide PUBSUB introspection) plus the pub/sub conventions section in [`redis-conventions.md`](assets/redis-conventions.md). Also the project that motivated adding Phase 4b (independent review) after Codex caught four real bugs that Phase 4 cleared. - [`content/develop/use-cases/recommendation-engine/`](../../../content/develop/use-cases/recommendation-engine/) — the first ML / vector-search use case. 
Introduced the **ML / vector-search use cases** section in [`redis-conventions.md`](assets/redis-conventions.md) (per-client embedding library table, pre-computed `catalog.json` wire format, FFI / Ruby-version setup blockers, per-port deviation conventions) and rows 24–28 of [`audit-checklist.md`](assets/audit-checklist.md) (vector dim mismatch in client-side blend helpers, L2 normalisation silently skipped by the embedding wrapper, TAG escape must include the backslash itself, connection-wide state toggle race on a shared client, weight=0 must disable not normalise to default). Each of the five new rows came from a real bug — bugbot or Codex caught all of them; the Python reference shipped with the TAG-escape bug originally. -- [`content/develop/use-cases/feature-store/`](../../../content/develop/use-cases/feature-store/) — the first streaming-feature-store use case (HEXPIRE / HTTL per-field TTL + a long-lived in-process worker thread next to the demo server). Introduced the **Streaming-worker / background-task patterns** section in [`redis-conventions.md`](assets/redis-conventions.md) (pre-flight in-flight flag, worker lifetime decoupled from request lifetime, stop semantics, per-client HEXPIRE pipeline reply-shape table) and rows 29–31 of [`audit-checklist.md`](assets/audit-checklist.md) (HEXPIRE / HTTL per-field reply-code checking, pause-and-wait-idle race in worker-thread reset paths, worker stop with bounded join + silent thread abandonment). The reference Python implementation shipped without the in-flight flag and the stop-timeout recovery; Codex caught both on later ports and the Python retrofit followed. +- [`content/develop/use-cases/feature-store/`](../../../content/develop/use-cases/feature-store/) — the first streaming-feature-store use case (HEXPIRE / HTTL per-field TTL + a long-lived in-process worker thread next to the demo server). 
Introduced the **Streaming-worker / background-task patterns** section in [`redis-conventions.md`](assets/redis-conventions.md) (pre-flight in-flight flag, worker lifetime decoupled from request lifetime, stop semantics, per-client HEXPIRE pipeline reply-shape table) and rows 35–37 of [`audit-checklist.md`](assets/audit-checklist.md) (HEXPIRE / HTTL per-field reply-code checking, pause-and-wait-idle race in worker-thread reset paths, worker stop with bounded join + silent thread abandonment). The reference Python implementation shipped without the in-flight flag and the stop-timeout recovery; Codex caught both on later ports and the Python retrofit followed. - [`content/develop/use-cases/semantic-cache/`](../../../content/develop/use-cases/semantic-cache/) — the second ML / vector-search use case. Cache-on-LLM-responses backed by Redis Search KNN with a thresholded hit/miss decision and tenant/locale/model-version metadata filtering. Introduced rows 29–34 of [`audit-checklist.md`](assets/audit-checklist.md): embedder Predictor / Session thread-safety on shared instances (DJL needs `synchronized`, ONNX is fine); library config keys that look real but don't take effect (WEBrick's `MaxRequestBodySize` is not an option name; the body cap must be enforced in user code); lockfile pinning a newer runtime than the manifest declares (composer.lock requiring PHP 8.4 while composer.json said `^8.2`); NaN / Inf parsing via language-specific quirks (PHP `(float)"nan"` → 0.0 silently, must use textual rejection before parsing); per-language strings in HTML that's shared across language demos (badge text, default threshold must be populated via `/state` at first load); docs wire-form snippets must show escaped TAG values (`gpt\-4\.5\-2026`, not `gpt-4.5-2026`). 
Also the project that motivated the Phase 4b note about verifying independent-review findings against the current file before applying — several Jedis and PHP "missing" findings were actually re-discoveries of fixes that had landed minutes earlier. diff --git a/.agents/skills/redis-use-case-ports/assets/audit-checklist.md b/.agents/skills/redis-use-case-ports/assets/audit-checklist.md index 21a3caec01..acd885ef6b 100644 --- a/.agents/skills/redis-use-case-ports/assets/audit-checklist.md +++ b/.agents/skills/redis-use-case-ports/assets/audit-checklist.md @@ -480,71 +480,6 @@ The first form lets a caller pass `0` to bypass the bonus entirely (and a downst --- -## 29. HEXPIRE / HTTL per-field reply-code checking - -**What to scan for:** every call site of `HEXPIRE`, `HEXPIREAT`, `HPEXPIRE`, `HPEXPIREAT`, `HTTL`, `HPTTL`, or any client-library typed wrapper around them. Look at how the per-field array reply is consumed. - -**Pass criterion:** `HEXPIRE`-family commands return one status code per requested field, not a single success/failure. Each code is: - -* `1` — TTL set / updated. -* `2` — the expiry was `0` or in the past, so Redis deleted the field instead of attaching a TTL. -* `0` — an `NX | XX | GT | LT` conditional flag was specified and not met. -* `-2` — no such field, or no such key. - -The helper must **iterate the reply array and raise/throw on any code other than `1`** (when no conditional flag is in use), so the "every streaming write renews its TTL" invariant fails loudly rather than silently leaving a field with no expiry attached. A naked `await client.hexpire(...)` (or `pipe.hexpire(...)` whose result is discarded) is the wrong shape — the call can "succeed" at the RESP level and still have left every field un-TTL'd. - -`HTTL` returns the same array shape (per-field integer seconds, with `-2` for missing fields and `-1` for fields with no TTL). 
When the key is missing entirely, some libraries return a list-of-`-2` of the right length, others return `nil` / `None` / `null`. The helper must normalise to a per-field array of integers, defaulting missing/short replies to `-2` so callers never index out of range. - -**Sample audit prompt:** - -> For each port under `content/develop/use-cases/{{USE_CASE_NAME}}/`, locate every `HEXPIRE` (or family) call site and every `HTTL` call site. For HEXPIRE: confirm the helper iterates the per-field array and raises / throws on any code other than `1` (or documents why a specific non-`1` code is acceptable). A discarded reply or a check that only looks at the first element is a bug. For HTTL: confirm the helper normalises the reply to a per-field array even when the key is missing, with `-2` as the default for missing slots. Flag any port where a partial or `null` reply could cause an index-out-of-range error, a silent loss of the dead-letter signal, or a per-field TTL that never actually got set. - -**Why on list:** Feature-store use case, Codex independent review. The Python reference originally awaited `hexpire(...)` and discarded the per-field reply; for the streaming-feature-store pattern to work, every streaming write **must** renew the per-field TTL on every call. A single code of `2` (which means "Redis deleted the field because the expiry was already in the past") looks like success but is actually data loss. The defensive shim for HTTL was needed because redis-rs's typed wrapper, redis-rb's `call`-style return, and several of the pipelined clients all surface partial / `nil` arrays differently when the key has expired between the caller's check and the HTTL itself. - ---- - -## 30. Pause-and-wait-idle race in worker-thread reset paths - -**What to scan for:** every worker-thread tick loop that supports `pause()` plus an external `reset` / `clear` / `purge` path. 
Look at where the in-flight flag (`tick_in_flight`, `_tickInFlight`, `Volatile.Read(ref _tickInFlight)`, etc.) is set relative to the `paused` check inside the tick loop. - -**Pass criterion:** the in-flight flag must be set to `true` (or `1`) **before** the pause check, with a `finally` / `defer` / `ensure` block clearing it on every exit path. The combination lets an external caller do: - -``` -worker.pause() # stop future ticks -worker.wait_for_idle() # wait for the current tick to drain -store.reset() # safe to delete keys now -worker.resume() -``` - -If the in-flight flag is set **inside** the `if not paused: ...` branch, there is a window between the pause check and the actual tick where a concurrent `pause()` + `wait_for_idle()` observes `tick_in_flight=false` AND `paused=true`, falls straight through, and runs the `DEL` sweep while the tick is mid-write. The streaming write then recreates a hash entry that was just enumerated for deletion — leaving a streaming-only hash with no key-level TTL. Symptom: "0 leftover keys" smoke test fails sporadically, often only under load. - -The lifecycle flags (`running`, `tick_in_flight`) must be cleared in an **outer** `try/finally` / `defer` (around the whole tick loop, not just one iteration) so a thread that exits via an uncaught exception or a panic leaves the worker in a state where `start()` can spin a fresh thread. Without the outer clear, the demo's "is the worker running?" indicator gets stuck on, and a subsequent `start()` becomes a no-op. - -**Sample audit prompt:** - -> Audit every worker-thread tick loop in the 9 client implementations under `content/develop/use-cases/{{USE_CASE_NAME}}/`. 
For each, verify (a) the in-flight flag is set to true BEFORE the `paused` check, not inside the `not paused` branch; (b) a finally / defer / ensure clears the in-flight flag on every exit path including the paused-and-skipped path; (c) an outer try/finally around the whole tick loop clears both `running` and the in-flight flag so a panic / uncaught exception doesn't strand the lifecycle state. Run a quick stress test: 5x `reset` + `bulk-load` against an active streaming worker; the final keyspace must contain 0 leftover streaming-only hashes. Flag any port where (a), (b), or (c) is missing — those ports can produce ghost entries under concurrent reset. - -**Why on list:** Feature-store use case. Codex flagged the bug first on the Go port; once articulated, the same shape needed fixing in 7 of the 8 sibling ports (only Node.js's single-threaded event loop was immune). The reference Python implementation **shipped without the fix** — Codex caught it on a later client, and Python wasn't retrofitted. Future Phase 1 reference implementations of streaming-worker-style use cases must adopt the pattern from the start. - ---- - -## 31. Worker stop with bounded join + silent thread abandonment - -**What to scan for:** every `stop()` / `Stop()` / `StopAsync()` / shutdown method on a worker that owns a thread, task, or goroutine. Look at how the parent waits for the worker to exit. - -**Pass criterion:** if the wait is bounded (`thread.join(timeout=2.0)`, `worker.join(2000)`, `task.Wait(2000)`, etc.), the timeout-expired path must escalate, not silently move on. Acceptable shapes: - -* **Warn + indefinite wait.** Log a warning and call `thread.join()` (no timeout) so the parent at least observes that the stop took longer than the budget but never returns while the thread is still alive. This is the right shape for demos and well-behaved workers. 
-* **Force-interrupt + wait.** Cancel the task's cancellation token, send `Thread.interrupt()`, send `SIGTERM`, etc., and only then return. The right shape for production code where the worker might be stuck in a blocking I/O call. -* **Recovery via the in-flight flag.** Pair the bounded join with a `waitForIdle()` (polling the in-flight flag) that runs after the join. The in-flight flag's lifecycle (per row 30) is the eventual truth — even if the thread is still alive, once `tick_in_flight=false` the worker is safe to operate on. This is how Jedis and Lettuce ship in the feature-store ports. - -A bare `thread.join(timeout=N); self._thread = None` (drop the handle, move on) is the wrong shape. The thread is still running, holding a Redis connection, potentially writing during the next bulk-load. The demo "works" because Python daemon threads die when the process exits — but `stop()` was supposed to be a clean shutdown, and silently abandoning the thread defeats every test that relies on it. - -**Sample audit prompt:** - -> For each port under `content/develop/use-cases/{{USE_CASE_NAME}}/`, locate the worker's stop / shutdown method. If it uses a bounded join / wait (any timeout, any unit), verify one of these three recovery paths is present: (a) on timeout, log a warning and join indefinitely; (b) on timeout, force-interrupt the worker and then wait; (c) on timeout, fall through to a `waitForIdle()` (or equivalent in-flight-flag poll) that provides the actual safety guarantee. Flag any port where the timeout path is "set the handle to null and return" — that's silent thread abandonment, regardless of how the demo behaves under normal load. - -**Why on list:** Feature-store use case, Codex independent review of the Ruby port. The same shape was already in the Python reference (`thread.join(timeout=2.0)` then `self._thread = None`) but no earlier audit flagged it; Codex caught it on Ruby and the Python retrofit followed. 
Jedis / Lettuce had the bounded join but were saved by an explicit `waitForIdle()` after it — that's recovery shape (c) above, and it's the reason the bug never surfaced in those clients. Go / .NET / Rust / Node.js / PHP all use unbounded waits and are fine. The bug class is real even when masked by the in-flight-flag recovery; future ports should pick one shape and apply it consistently. ## 29. Embedder Predictor / Session is not always thread-safe on a shared instance **What to scan for:** the embedding wrapper's `encodeOne` / `encode_many` / `EncodeInternal` methods, and how the wrapper is reached from the HTTP handler. Particularly look at the handler executor (cached thread pool, `Executors.newCachedThreadPool`, async runtime with multiple workers, `HttpListener` callback) — does the wrapper hold any mutable state across calls, and is the underlying library documented as thread-safe? @@ -643,6 +578,74 @@ The robust pattern is **textual rejection before parsing**: lowercase the input, --- +## 35. HEXPIRE / HTTL per-field reply-code checking + +**What to scan for:** every call site of `HEXPIRE`, `HEXPIREAT`, `HPEXPIRE`, `HPEXPIREAT`, `HTTL`, `HPTTL`, or any client-library typed wrapper around them. Look at how the per-field array reply is consumed. + +**Pass criterion:** `HEXPIRE`-family commands return one status code per requested field, not a single success/failure. Each code is: + +* `1` — TTL set / updated. +* `2` — the expiry was `0` or in the past, so Redis deleted the field instead of attaching a TTL. +* `0` — an `NX | XX | GT | LT` conditional flag was specified and not met. +* `-2` — no such field, or no such key. + +The helper must **iterate the reply array and raise/throw on any code other than `1`** (when no conditional flag is in use), so the "every streaming write renews its TTL" invariant fails loudly rather than silently leaving a field with no expiry attached. 
A naked `await client.hexpire(...)` (or `pipe.hexpire(...)` whose result is discarded) is the wrong shape — the call can "succeed" at the RESP level and still have left every field un-TTL'd. + +`HTTL` returns the same array shape (per-field integer seconds, with `-2` for missing fields and `-1` for fields with no TTL). When the key is missing entirely, some libraries return a list-of-`-2` of the right length, others return `nil` / `None` / `null`. The helper must normalise to a per-field array of integers, defaulting missing/short replies to `-2` so callers never index out of range. + +**Sample audit prompt:** + +> For each port under `content/develop/use-cases/{{USE_CASE_NAME}}/`, locate every `HEXPIRE` (or family) call site and every `HTTL` call site. For HEXPIRE: confirm the helper iterates the per-field array and raises / throws on any code other than `1` (or documents why a specific non-`1` code is acceptable). A discarded reply or a check that only looks at the first element is a bug. For HTTL: confirm the helper normalises the reply to a per-field array even when the key is missing, with `-2` as the default for missing slots. Flag any port where a partial or `null` reply could cause an index-out-of-range error, a silent loss of the dead-letter signal, or a per-field TTL that never actually got set. + +**Why on list:** Feature-store use case, Codex independent review. The Python reference originally awaited `hexpire(...)` and discarded the per-field reply; for the streaming-feature-store pattern to work, every streaming write **must** renew the per-field TTL on every call. A single code of `2` (which means "Redis deleted the field because the expiry was already in the past") looks like success but is actually data loss. 
The defensive shim for HTTL was needed because redis-rs's typed wrapper, redis-rb's `call`-style return, and several of the pipelined clients all surface partial / `nil` arrays differently when the key has expired between the caller's check and the HTTL itself. + +--- + +## 36. Pause-and-wait-idle race in worker-thread reset paths + +**What to scan for:** every worker-thread tick loop that supports `pause()` plus an external `reset` / `clear` / `purge` path. Look at where the in-flight flag (`tick_in_flight`, `_tickInFlight`, `Volatile.Read(ref _tickInFlight)`, etc.) is set relative to the `paused` check inside the tick loop. + +**Pass criterion:** the in-flight flag must be set to `true` (or `1`) **before** the pause check, with a `finally` / `defer` / `ensure` block clearing it on every exit path. The combination lets an external caller do: + +``` +worker.pause() # stop future ticks +worker.wait_for_idle() # wait for the current tick to drain +store.reset() # safe to delete keys now +worker.resume() +``` + +If the in-flight flag is set **inside** the `if not paused: ...` branch, there is a window between the pause check and the actual tick where a concurrent `pause()` + `wait_for_idle()` observes `tick_in_flight=false` AND `paused=true`, falls straight through, and runs the `DEL` sweep while the tick is mid-write. The streaming write then recreates a hash entry that was just enumerated for deletion — leaving a streaming-only hash with no key-level TTL. Symptom: "0 leftover keys" smoke test fails sporadically, often only under load. + +The lifecycle flags (`running`, `tick_in_flight`) must be cleared in an **outer** `try/finally` / `defer` (around the whole tick loop, not just one iteration) so a thread that exits via an uncaught exception or a panic leaves the worker in a state where `start()` can spin a fresh thread. Without the outer clear, the demo's "is the worker running?" indicator gets stuck on, and a subsequent `start()` becomes a no-op. 
+ +**Sample audit prompt:** + +> Audit every worker-thread tick loop in the 9 client implementations under `content/develop/use-cases/{{USE_CASE_NAME}}/`. For each, verify (a) the in-flight flag is set to true BEFORE the `paused` check, not inside the `not paused` branch; (b) a finally / defer / ensure clears the in-flight flag on every exit path including the paused-and-skipped path; (c) an outer try/finally around the whole tick loop clears both `running` and the in-flight flag so a panic / uncaught exception doesn't strand the lifecycle state. Run a quick stress test: 5x `reset` + `bulk-load` against an active streaming worker; the final keyspace must contain 0 leftover streaming-only hashes. Flag any port where (a), (b), or (c) is missing — those ports can produce ghost entries under concurrent reset. + +**Why on list:** Feature-store use case. Codex flagged the bug first on the Go port; once articulated, the same shape needed fixing in 7 of the 8 sibling ports (only Node.js's single-threaded event loop was immune). The reference Python implementation **shipped without the fix** — Codex caught it on a later client, and Python was retrofitted to match (the in-flight `threading.Event`, the pre-flight set, and the `wait_for_idle()` recovery now match the other 8 ports). Future Phase 1 reference implementations of streaming-worker-style use cases must adopt the pattern from the start. + +--- + +## 37. Worker stop with bounded join + silent thread abandonment + +**What to scan for:** every `stop()` / `Stop()` / `StopAsync()` / shutdown method on a worker that owns a thread, task, or goroutine. Look at how the parent waits for the worker to exit. + +**Pass criterion:** if the wait is bounded (`thread.join(timeout=2.0)`, `worker.join(2000)`, `task.Wait(2000)`, etc.), the timeout-expired path must escalate, not silently move on. 
Acceptable shapes: + +* **Warn + indefinite wait.** Log a warning and call `thread.join()` (no timeout) so the parent at least observes that the stop took longer than the budget but never returns while the thread is still alive. This is the right shape for demos and well-behaved workers. +* **Force-interrupt + wait.** Cancel the task's cancellation token, send `Thread.interrupt()`, send `SIGTERM`, etc., then wait for the worker to exit, and only then return. This is the right shape for production code where the worker might be stuck in a blocking I/O call. +* **Recovery via the in-flight flag.** Pair the bounded join with a `waitForIdle()` (polling the in-flight flag) that runs after the join. The in-flight flag's lifecycle (per row 36) is the eventual truth — even if the thread is still alive, once `tick_in_flight=false` the worker is safe to operate on. This is how Jedis and Lettuce ship in the feature-store ports. + +A bare `thread.join(timeout=N); self._thread = None` (drop the handle, move on) is the wrong shape. The thread is still running, holding a Redis connection, potentially writing during the next bulk-load. The demo "works" because Python daemon threads die when the process exits — but `stop()` was supposed to be a clean shutdown, and silently abandoning the thread defeats every test that relies on it. + +**Sample audit prompt:** + +> For each port under `content/develop/use-cases/{{USE_CASE_NAME}}/`, locate the worker's stop / shutdown method. If it uses a bounded join / wait (any timeout, any unit), verify one of these three recovery paths is present: (a) on timeout, log a warning and join indefinitely; (b) on timeout, force-interrupt the worker and then wait; (c) on timeout, fall through to a `waitForIdle()` (or equivalent in-flight-flag poll) that provides the actual safety guarantee. Flag any port where the timeout path is "set the handle to null and return" — that's silent thread abandonment, regardless of how the demo behaves under normal load. 
+ +**Why on list:** Feature-store use case, Codex independent review of the Ruby port. The same shape was already in the Python reference (`thread.join(timeout=2.0)` then `self._thread = None`) but no earlier audit flagged it; Codex caught it on Ruby and the Python retrofit followed. Jedis / Lettuce had the bounded join but were saved by an explicit `waitForIdle()` after it — that's recovery shape (c) above, and it's the reason the bug never surfaced in those clients. Go / .NET / Rust / Node.js / PHP all use unbounded waits and are fine. The bug class is real even when masked by the in-flight-flag recovery; future ports should pick one shape and apply it consistently. + +--- + ## How to add a new row When a bug class is identified after this skill has been used: diff --git a/.agents/skills/redis-use-case-ports/assets/redis-conventions.md b/.agents/skills/redis-use-case-ports/assets/redis-conventions.md index fbbae3ada9..5ca5d9e39b 100644 --- a/.agents/skills/redis-use-case-ports/assets/redis-conventions.md +++ b/.agents/skills/redis-use-case-ports/assets/redis-conventions.md @@ -439,7 +439,7 @@ while not self._stop_event.is_set(): self._tick_in_flight.clear() ``` -The point is that an external caller can do `pause() + wait_for_idle() + reset()` and be guaranteed the reset's `DEL` sweep runs only after the in-flight tick has drained. If the flag is set **inside** the `not paused` branch, a concurrent `pause` + `wait_for_idle` can fall straight through while the tick is still mid-write, and the streaming `HSET` recreates an entry the reset just enumerated for deletion — leaving a streaming-only hash with no key-level TTL. Audit-checklist row 30 covers this. +The point is that an external caller can do `pause() + wait_for_idle() + reset()` and be guaranteed the reset's `DEL` sweep runs only after the in-flight tick has drained. 
If the flag is set **inside** the `not paused` branch, a concurrent `pause` + `wait_for_idle` can fall straight through while the tick is still mid-write, and the streaming `HSET` recreates an entry the reset just enumerated for deletion — leaving a streaming-only hash with no key-level TTL. Audit-checklist row 36 covers this. The outer `try/finally` (or `defer`, or `ensure`) wrapping the **whole tick loop** must also clear `running` and `tick_in_flight` on every exit path, so a worker that exits via an uncaught exception leaves the lifecycle state where the next `start()` can spin a fresh thread. @@ -451,7 +451,7 @@ The same shape applies to .NET (`CancellationToken` from the request) and Rust ( ### Worker stop semantics -If the stop path uses a bounded `join` / `wait` / `await`, the timeout-expired branch must escalate — log + indefinite join, interrupt + wait, or fall through to a `waitForIdle()` on the in-flight flag. A bare `thread.join(timeout=N); thread = None` (drop the handle, move on) is silent thread abandonment, regardless of whether the daemon-thread shape lets the process exit cleanly. Audit-checklist row 31 covers this; the reference Python implementation shipped without it and was retrofitted after Codex flagged the same shape in the Ruby port. +If the stop path uses a bounded `join` / `wait` / `await`, the timeout-expired branch must escalate — log + indefinite join, interrupt + wait, or fall through to a `waitForIdle()` on the in-flight flag. A bare `thread.join(timeout=N); thread = None` (drop the handle, move on) is silent thread abandonment, regardless of whether the daemon-thread shape lets the process exit cleanly. Audit-checklist row 37 covers this; the reference Python implementation shipped without it and was retrofitted after Codex flagged the same shape in the Ruby port. 
### HEXPIRE pipeline reply shapes vary across clients @@ -469,7 +469,7 @@ If the stop path uses a bounded `join` / `wait` / `await`, the timeout-expired b | redis-rb | `Array` from `redis.pipelined { ... }` | Use `redis.call('HEXPIRE', key, ttl, 'FIELDS', n, *names)`; the typed binding is not stable on 5.4. | | redis-rs | `Vec<Vec<i64>>` — outer pipeline wraps the inner array, take `[0]` | `pipe.cmd("HEXPIRE")...query_async::<Vec<Vec<i64>>>(&mut conn)`. | -In every client the helper must iterate the per-field codes and raise / throw on anything other than `1` (assuming no `NX | XX | GT | LT` flag is in use). A discarded reply or a check that only looks at the first element silently leaves the rest of the fields un-TTL'd. Audit-checklist row 29 covers this. +In every client the helper must iterate the per-field codes and raise / throw on anything other than `1` (assuming no `NX | XX | GT | LT` flag is in use). A discarded reply or a check that only looks at the first element silently leaves the rest of the fields un-TTL'd. Audit-checklist row 35 covers this. `HTTL` follows the same per-field-array shape with `-1` (no TTL) and `-2` (missing field/key) sentinels. The helper must normalise to a per-field array even when the reply is `nil` / `None` / `null` for a missing key — default missing slots to `-2` so callers never index out of range. 
From 0531c47214137c53e04843ff6514349102e3f7e4 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 15:36:46 +0100 Subject: [PATCH 15/20] DOC-6661 fix broken link in semantic cache example --- content/develop/use-cases/semantic-cache/_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/develop/use-cases/semantic-cache/_index.md b/content/develop/use-cases/semantic-cache/_index.md index 28c6349abf..4cdb1e8775 100644 --- a/content/develop/use-cases/semantic-cache/_index.md +++ b/content/develop/use-cases/semantic-cache/_index.md @@ -56,7 +56,7 @@ Redis provides the following features that make it a good fit for a semantic cac The following libraries, frameworks, and managed services build on Redis for semantic caching: -- **Python**: [RedisVL](https://github.com/redis/redis-vl-python) provides the `SemanticCache` API with built-in embedding, distance thresholds, TTL, and metadata filters. See the [RedisVL LLM cache user guide]({{< relref "/develop/ai/redisvl/user_guide/llmcache" >}}) and the [LangCache integration guide]({{< relref "/develop/ai/redisvl/user_guide/how_to_guides/langcache_semantic_cache" >}}). +- **Python**: [RedisVL](https://github.com/redis/redis-vl-python) provides the `SemanticCache` API with built-in embedding, distance thresholds, TTL, and metadata filters. See the [RedisVL LLM cache user guide]({{< relref "/develop/ai/redisvl/user_guide/how_to_guides/llmcache" >}}) and the [LangCache integration guide]({{< relref "/develop/ai/redisvl/user_guide/how_to_guides/langcache_semantic_cache" >}}). - **Frameworks**: [LangChain](https://python.langchain.com/docs/integrations/llm_caching/#redis-cache) (Redis as an LLM cache and vector store), [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/vector_stores/RedisIndexDemo/), and [LangGraph](https://langchain-ai.github.io/langgraph/) for agent memory and response caching. 
- **Managed**: [Redis LangCache]({{< relref "/develop/ai/context-engine/langcache" >}}) is a fully managed semantic cache with a REST API, configurable distance thresholds, automatic eviction, and built-in metrics — no index management or embedding wiring required. From 1f48b4a80210bf2d35a3c11b657898b9401d50f4 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 15:53:51 +0100 Subject: [PATCH 16/20] DOC-6661 bugbot issues --- .../feature-store/dotnet/FeatureStore.cs | 16 ++++++++++------ .../use-cases/feature-store/dotnet/Program.cs | 6 +++++- .../use-cases/feature-store/go/demo_server.go | 7 +++++-- .../feature-store/java-jedis/DemoServer.java | 6 +++++- .../feature-store/java-lettuce/DemoServer.java | 6 +++++- .../use-cases/feature-store/nodejs/demoServer.js | 14 +++++++++++--- .../use-cases/feature-store/php/demo_server.php | 13 ++++++++++++- .../feature-store/redis-py/demo_server.py | 7 ++++++- .../use-cases/feature-store/ruby/demo_server.rb | 13 +++++++++++-- .../use-cases/feature-store/rust/demo_server.rs | 7 +++++-- 10 files changed, 75 insertions(+), 20 deletions(-) diff --git a/content/develop/use-cases/feature-store/dotnet/FeatureStore.cs b/content/develop/use-cases/feature-store/dotnet/FeatureStore.cs index 77b6a8583d..923ef44fde 100644 --- a/content/develop/use-cases/feature-store/dotnet/FeatureStore.cs +++ b/content/develop/use-cases/feature-store/dotnet/FeatureStore.cs @@ -297,12 +297,16 @@ public async Task KeyTtlSecondsAsync(string entityId) } /// - /// Per-field TTL via HTTL (Redis 7.4+). Values are in - /// seconds (the StackExchange.Redis return is milliseconds; we - /// convert here for consistency with the other clients): - /// positive seconds remaining, -1 no field TTL, -2 field - /// (or key) missing. + /// Per-field TTL helper (Redis 7.4+). Returns whole seconds for + /// parity with the other clients: positive seconds remaining, + /// -1 no field TTL, -2 field (or key) missing. 
/// + /// + /// HashFieldGetTimeToLiveAsync wraps HPTTL (not + /// HTTL), so the API returns milliseconds. We convert to + /// whole seconds here so the JSON shape matches Python, Node.js, + /// Go, Java, Rust, Ruby, and PHP, which all expose seconds. + /// public async Task> FieldTtlsSecondsAsync( string entityId, IReadOnlyList fieldNames) { @@ -315,7 +319,7 @@ public async Task> FieldTtlsSecondsAsync( // any future version that might return a shorter or empty array. for (int i = 0; i < fieldNames.Count; i++) { - // HTTL returns ms remaining; negative sentinels pass + // HPTTL returns ms remaining; negative sentinels pass // through. Convert positive durations to whole seconds // for parity with the other clients' helpers. long v = i < ms.Length ? ms[i] : -2L; diff --git a/content/develop/use-cases/feature-store/dotnet/Program.cs b/content/develop/use-cases/feature-store/dotnet/Program.cs index da9a17bf10..bc49babe54 100644 --- a/content/develop/use-cases/feature-store/dotnet/Program.cs +++ b/content/develop/use-cases/feature-store/dotnet/Program.cs @@ -191,8 +191,12 @@ string IndexHtml() => await demoLock.WaitAsync(); try { + // Three states: stopped → start (and leave unpaused); + // running + unpaused → pause; running + paused → resume. + // StartAsync() clears the paused flag, so a fall-through + // would pause the worker we just brought back up. 
if (!worker.IsRunning) await worker.StartAsync(); - if (worker.IsPaused) worker.Resume(); + else if (worker.IsPaused) worker.Resume(); else worker.Pause(); return Results.Json(new { paused = worker.IsPaused, running = worker.IsRunning }); } diff --git a/content/develop/use-cases/feature-store/go/demo_server.go b/content/develop/use-cases/feature-store/go/demo_server.go index 8ce1b63e23..d086990643 100644 --- a/content/develop/use-cases/feature-store/go/demo_server.go +++ b/content/develop/use-cases/feature-store/go/demo_server.go @@ -122,10 +122,13 @@ func (d *FeatureStoreDemo) Reset(ctx context.Context) (int64, error) { func (d *FeatureStoreDemo) ToggleWorker() (paused, running bool) { d.mu.Lock() defer d.mu.Unlock() + // Three states: stopped → start (and leave unpaused); + // running + unpaused → pause; running + paused → resume. + // Start() clears the paused flag, so a fall-through pauses the + // worker we just brought back up. if !d.worker.IsRunning() { d.worker.Start() - } - if d.worker.IsPaused() { + } else if d.worker.IsPaused() { d.worker.Resume() } else { d.worker.Pause() diff --git a/content/develop/use-cases/feature-store/java-jedis/DemoServer.java b/content/develop/use-cases/feature-store/java-jedis/DemoServer.java index 8822a3e186..a8939a60eb 100644 --- a/content/develop/use-cases/feature-store/java-jedis/DemoServer.java +++ b/content/develop/use-cases/feature-store/java-jedis/DemoServer.java @@ -189,8 +189,12 @@ public long reset() { public Map toggleWorker() { lock.lock(); try { + // Three states: stopped → start (and leave unpaused); + // running + unpaused → pause; running + paused → resume. + // start() clears the paused flag, so a fall-through + // would pause the worker we just brought back up. 
if (!worker.isRunning()) worker.start(); - if (worker.isPaused()) worker.resume(); + else if (worker.isPaused()) worker.resume(); else worker.pause(); return Map.of( "paused", worker.isPaused(), diff --git a/content/develop/use-cases/feature-store/java-lettuce/DemoServer.java b/content/develop/use-cases/feature-store/java-lettuce/DemoServer.java index cee9427c88..86ae61ab3d 100644 --- a/content/develop/use-cases/feature-store/java-lettuce/DemoServer.java +++ b/content/develop/use-cases/feature-store/java-lettuce/DemoServer.java @@ -200,8 +200,12 @@ public long reset() { public Map toggleWorker() { lock.lock(); try { + // Three states: stopped → start (and leave unpaused); + // running + unpaused → pause; running + paused → resume. + // start() clears the paused flag, so a fall-through + // would pause the worker we just brought back up. if (!worker.isRunning()) worker.start(); - if (worker.isPaused()) worker.resume(); + else if (worker.isPaused()) worker.resume(); else worker.pause(); return Map.of( "paused", worker.isPaused(), diff --git a/content/develop/use-cases/feature-store/nodejs/demoServer.js b/content/develop/use-cases/feature-store/nodejs/demoServer.js index be79f6efaa..d0e3b167a5 100644 --- a/content/develop/use-cases/feature-store/nodejs/demoServer.js +++ b/content/develop/use-cases/feature-store/nodejs/demoServer.js @@ -526,9 +526,17 @@ class FeatureStoreDemo { } toggleWorker() { - if (!this.worker.running) this.worker.start(); - if (this.worker.paused) this.worker.resume(); - else this.worker.pause(); + // Three states: stopped → start (and leave unpaused); + // running + unpaused → pause; running + paused → resume. + // start() clears the paused flag, so a fall-through pauses the + // worker we just brought back up. 
+ if (!this.worker.running) { + this.worker.start(); + } else if (this.worker.paused) { + this.worker.resume(); + } else { + this.worker.pause(); + } return { paused: this.worker.paused, running: this.worker.running }; } } diff --git a/content/develop/use-cases/feature-store/php/demo_server.php b/content/develop/use-cases/feature-store/php/demo_server.php index 632964675a..9c80221b4e 100644 --- a/content/develop/use-cases/feature-store/php/demo_server.php +++ b/content/develop/use-cases/feature-store/php/demo_server.php @@ -328,8 +328,19 @@ function handle_worker_toggle( Client $redis, string $redisUri, string $keyPrefix, int $batchTtl, int $streamTtl, int $usersPerTick ): void { - // If the worker process died, respawn it. + // Three states: stopped → respawn (and leave unpaused); + // running + unpaused → pause; running + paused → resume. + // Capture liveness BEFORE the respawn so we can tell whether the + // call is "bring the worker back" vs. "flip the pause flag". + // Otherwise a respawn lands with whatever `fs:control:paused` was + // last set to, and the toggle pauses the worker we just spawned. + $wasRunning = pid_alive((int)$redis->get('fs:control:worker_pid')); spawn_worker_if_needed($redis, $redisUri, $keyPrefix, $batchTtl, $streamTtl, $usersPerTick); + if (!$wasRunning) { + $redis->set('fs:control:paused', '0'); + send_json(200, ['paused' => false, 'running' => true]); + return; + } $paused = $redis->get('fs:control:paused') === '1'; $redis->set('fs:control:paused', $paused ? 
'0' : '1'); send_json(200, [ diff --git a/content/develop/use-cases/feature-store/redis-py/demo_server.py b/content/develop/use-cases/feature-store/redis-py/demo_server.py index cd0ad15b21..c1ec232437 100644 --- a/content/develop/use-cases/feature-store/redis-py/demo_server.py +++ b/content/develop/use-cases/feature-store/redis-py/demo_server.py @@ -534,9 +534,14 @@ def reset(self) -> dict: return {"deleted": deleted} def toggle_worker(self) -> dict: + # Three states: stopped → start (and leave unpaused); + # running + unpaused → pause; running + paused → resume. + # Avoid falling through start into the pause branch — start() + # clears the paused flag, so a fall-through pauses the worker + # we just brought back up. if not self.worker.is_running: self.worker.start() - if self.worker.is_paused: + elif self.worker.is_paused: self.worker.resume() else: self.worker.pause() diff --git a/content/develop/use-cases/feature-store/ruby/demo_server.rb b/content/develop/use-cases/feature-store/ruby/demo_server.rb index 6fd4121045..e105a1bc7f 100644 --- a/content/develop/use-cases/feature-store/ruby/demo_server.rb +++ b/content/develop/use-cases/feature-store/ruby/demo_server.rb @@ -66,8 +66,17 @@ def reset def toggle_worker @lock.synchronize do - @worker.start unless @worker.running? - @worker.paused? ? @worker.resume : @worker.pause + # Three states: stopped -> start (and leave unpaused); + # running + unpaused -> pause; running + paused -> resume. + # start clears the paused flag, so a fall-through would pause + # the worker we just brought back up. + if !@worker.running? + @worker.start + elsif @worker.paused? + @worker.resume + else + @worker.pause + end { paused: @worker.paused?, running: @worker.running? 
} end end diff --git a/content/develop/use-cases/feature-store/rust/demo_server.rs b/content/develop/use-cases/feature-store/rust/demo_server.rs index 6b2bcf5900..a4384a68b7 100644 --- a/content/develop/use-cases/feature-store/rust/demo_server.rs +++ b/content/develop/use-cases/feature-store/rust/demo_server.rs @@ -309,10 +309,13 @@ async fn reset(State(state): State) -> impl IntoResponse { async fn toggle_worker(State(state): State) -> impl IntoResponse { let _guard = state.demo_lock.lock().await; + // Three states: stopped → start (and leave unpaused); + // running + unpaused → pause; running + paused → resume. + // start() clears the paused flag, so a fall-through would pause + // the worker we just brought back up. if !state.worker.is_running() { state.worker.start().await; - } - if state.worker.is_paused() { + } else if state.worker.is_paused() { state.worker.resume(); } else { state.worker.pause(); From 587631d633cc460c259cc14370cfcd8e468c715b Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 16:16:46 +0100 Subject: [PATCH 17/20] DOC-6661 more from the bugbot --- .../feature-store/dotnet/StreamingWorker.cs | 27 ++++++++++--------- .../feature-store/go/streaming_worker.go | 16 +++++++++++ 2 files changed, 30 insertions(+), 13 deletions(-) diff --git a/content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs b/content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs index 67b18c3c7f..dfcb7cbcb6 100644 --- a/content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs +++ b/content/develop/use-cases/feature-store/dotnet/StreamingWorker.cs @@ -82,27 +82,28 @@ public async Task StartAsync() public async Task StopAsync() { - // Capture the task/CTS locally under the lifecycle lock so - // a concurrent StartAsync can't clear them on us before we - // get to await. - Task? task; - CancellationTokenSource? cts; + // Hold the lifecycle lock across the entire stop, including + // the await for the task to drain. 
Releasing the lock before + // the await would let a concurrent StartAsync spawn a + // successor task while the old task's outer finally is still + // about to run; the old finally then clears _running, leaving + // the new task running with IsRunning=false and unstoppable. await _lifecycleLock.WaitAsync(); try { if (Interlocked.Exchange(ref _running, 0) != 1) return; - task = _task; - cts = _cts; + var task = _task; + var cts = _cts; _task = null; _cts = null; + cts?.Cancel(); + try { if (task is not null) await task; } + catch (OperationCanceledException) { /* expected */ } + cts?.Dispose(); + // The awaited task's outer finally already cleared + // _tickInFlight; nothing extra to do here. } finally { _lifecycleLock.Release(); } - - cts?.Cancel(); - try { if (task is not null) await task; } - catch (OperationCanceledException) { /* expected */ } - cts?.Dispose(); - Interlocked.Exchange(ref _tickInFlight, 0); } public void Pause() => Interlocked.Exchange(ref _paused, 1); diff --git a/content/develop/use-cases/feature-store/go/streaming_worker.go b/content/develop/use-cases/feature-store/go/streaming_worker.go index f061382149..b070627bae 100644 --- a/content/develop/use-cases/feature-store/go/streaming_worker.go +++ b/content/develop/use-cases/feature-store/go/streaming_worker.go @@ -49,6 +49,13 @@ type StreamingWorker struct { rng *rand.Rand rngMu sync.Mutex + // lifecycleMu serialises Start and Stop so a concurrent Start + // (e.g. from /worker/toggle) can't spawn a successor goroutine + // while a Stop is mid-wait on doneCh. Without it, the old + // goroutine's deferred running.Store(false) would clobber the + // new goroutine's running flag, leaving IsRunning() false and + // the new goroutine unstoppable. 
+ lifecycleMu sync.Mutex running atomic.Bool paused atomic.Bool tickInFlight atomic.Bool @@ -84,6 +91,8 @@ func NewStreamingWorker(store *FeatureStore, tick time.Duration, usersPerTick in // response completes, which would kill the worker on the very next // tick. Lifecycle is owned by ``Stop`` (and the internal ``stopCh``). func (w *StreamingWorker) Start() { + w.lifecycleMu.Lock() + defer w.lifecycleMu.Unlock() if !w.running.CompareAndSwap(false, true) { return } @@ -95,7 +104,14 @@ func (w *StreamingWorker) Start() { // Stop signals the worker to exit and waits for any in-flight tick // to settle. Safe to call multiple times. +// +// Holds lifecycleMu across the doneCh wait so a concurrent Start +// can't reassign stopCh/doneCh while we're waiting on them — that +// would leak the old goroutine and have the deferred +// running.Store(false) clobber the new goroutine's running flag. func (w *StreamingWorker) Stop() { + w.lifecycleMu.Lock() + defer w.lifecycleMu.Unlock() if !w.running.CompareAndSwap(true, false) { return } From a2027d8e9a41190d07f343f6a06b01cb7830a8e6 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 16:21:30 +0100 Subject: [PATCH 18/20] DOC-6661 fixed style guide contravention --- content/develop/use-cases/feature-store/_index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/content/develop/use-cases/feature-store/_index.md b/content/develop/use-cases/feature-store/_index.md index e1cdcd54ab..51f6e7f0ff 100644 --- a/content/develop/use-cases/feature-store/_index.md +++ b/content/develop/use-cases/feature-store/_index.md @@ -35,7 +35,7 @@ obvious workarounds have real drawbacks: on one side and not the other. - **Disk-backed online stores** hit a throughput wall when every user action has to update a dozen features simultaneously across millions of entities — - the IO mix of small concurrent writes is exactly what they are slowest at. + the I/O mix of small concurrent writes is exactly what they are slowest at. 
- **Single-TTL stores** can't handle mixed staleness: batch features refreshed nightly coexist with streaming features updated every few seconds, and a single per-key expiry can't express both. Worse, a failed ingestion From 75b4fc1cc488727384c3bfcf6c9ff2cd46e0668d Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Mon, 1 Jun 2026 16:54:58 +0100 Subject: [PATCH 19/20] DOC-6661 more bugbot stuff --- .gitignore | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitignore b/.gitignore index 3897a97956..384d5c9585 100644 --- a/.gitignore +++ b/.gitignore @@ -17,8 +17,6 @@ package-lock.json .DS_Store .idea # Rust docs demos -/content/develop/use-cases/rate-limiter/rust/target/ -/content/develop/use-cases/rate-limiter/rust/Cargo.lock /content/develop/use-cases/**/rust/target/ /content/develop/use-cases/**/rust/Cargo.lock # Java / Maven build output for the docs demos From 663da47096d2d0aad3fdc698b8da07a73a6082b5 Mon Sep 17 00:00:00 2001 From: Andy Stark Date: Tue, 2 Jun 2026 09:36:54 +0100 Subject: [PATCH 20/20] DOC-6661 another bugbot issue --- .../use-cases/feature-store/dotnet/Program.cs | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/content/develop/use-cases/feature-store/dotnet/Program.cs b/content/develop/use-cases/feature-store/dotnet/Program.cs index bc49babe54..5b47042d5e 100644 --- a/content/develop/use-cases/feature-store/dotnet/Program.cs +++ b/content/develop/use-cases/feature-store/dotnet/Program.cs @@ -266,16 +266,24 @@ string IndexHtml() => Console.WriteLine($"Materialized {seeded} user(s); streaming worker running."); var appTask = app.RunAsync(); +var shutdownTcs = new TaskCompletionSource(); -Console.CancelKeyPress += async (_, e) => +// Synchronous handler only — `async void` here would swallow any +// exception from StopAsync into an unobserved task and return to +// the runtime before the awaited cleanup completes. 
Signal a +// completion source instead and let the main flow await the +// shutdown chain in order, with normal exception propagation. +Console.CancelKeyPress += (_, e) => { e.Cancel = true; Console.WriteLine("\nShutting down..."); - await worker.StopAsync(); - await app.StopAsync(); - await mux.CloseAsync(); + shutdownTcs.TrySetResult(); }; +await Task.WhenAny(appTask, shutdownTcs.Task); +await worker.StopAsync(); +await app.StopAsync(); +await mux.CloseAsync(); await appTask; return 0;