-
Notifications
You must be signed in to change notification settings - Fork 44
feat(auth-cache): add hit/miss/eviction metrics to authentication cache #281
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
+308
−6
Merged
Changes from all commits
Commits
Show all changes
3 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,114 @@ | ||
| """ | ||
| Metrics instrumentation for the in-process authentication/authorization caches. | ||
|
|
||
| Mirrors the dual-emit pattern in ``src/utils/db_metrics.py``: | ||
|
|
||
| - When an OTLP endpoint is configured (``OTEL_EXPORTER_OTLP_ENDPOINT``), counters | ||
| are recorded through the OpenTelemetry SDK. | ||
| - When the Datadog Agent is reachable (``DD_AGENT_HOST``), the same events are | ||
| emitted as StatsD counters. | ||
| - When neither is configured, every function here is a cheap no-op. | ||
|
|
||
| The goal is to make cache effectiveness observable: hit rate per cache, the | ||
| reason for misses (expired vs never-seen), and capacity-driven evictions. These | ||
| are exactly the signals needed to tell whether a low hit rate is a TTL problem, | ||
| a key-cardinality problem, or a capacity problem. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| import os | ||
| from typing import TYPE_CHECKING, Literal | ||
|
|
||
| from datadog import statsd | ||
|
|
||
| from src.utils.logging import make_logger | ||
| from src.utils.otel_metrics import get_meter | ||
|
|
||
| if TYPE_CHECKING: | ||
| from opentelemetry.metrics import Counter | ||
|
|
||
| logger = make_logger(__name__) | ||
|
|
||
| # StatsD is only emitted if the Datadog Agent host is configured. | ||
| _STATSD_ENABLED = bool(os.environ.get("DD_AGENT_HOST")) | ||
|
|
||
| # Outcome of a single cache read. "hit" = present and fresh; "miss_expired" = | ||
| # present but past its TTL (TTL too short / churn); "miss_absent" = never cached | ||
| # or evicted (cold cache, key never repeats, or capacity eviction). | ||
| CacheResult = Literal["hit", "miss_expired", "miss_absent"] | ||
|
|
||
| # Lazily-created OTel instruments (created once, on first use). | ||
| _access_counter: Counter | None = None | ||
| _eviction_counter: Counter | None = None | ||
| _instruments_initialized = False | ||
|
|
||
|
|
||
| def _ensure_instruments() -> None: | ||
| """Create OTel counters on first use. No-op if OTel is not configured.""" | ||
| global _access_counter, _eviction_counter, _instruments_initialized | ||
|
|
||
| if _instruments_initialized: | ||
| return | ||
| _instruments_initialized = True | ||
|
|
||
| meter = get_meter("agentex.auth_cache") | ||
| if meter is None: | ||
| # OTel not configured; OTel path stays disabled. StatsD may still emit. | ||
| return | ||
|
|
||
| _access_counter = meter.create_counter( | ||
| name="auth_cache.access", | ||
| description="Authentication/authorization cache reads, tagged by cache and result", | ||
| unit="{access}", | ||
| ) | ||
| _eviction_counter = meter.create_counter( | ||
| name="auth_cache.eviction", | ||
| description="LRU evictions from authentication/authorization caches", | ||
| unit="{eviction}", | ||
| ) | ||
|
|
||
|
|
||
| def record_cache_access(cache_name: str, result: CacheResult) -> None: | ||
| """ | ||
| Record a single cache read. | ||
|
|
||
| Args: | ||
| cache_name: Logical cache name (e.g. "auth_gateway", "agent_api_key"). | ||
| result: One of "hit", "miss_expired", "miss_absent". | ||
|
|
||
| Never raises: emission failures (e.g. a StatsD UDP socket error or an OTel | ||
| SDK fault) are swallowed so instrumentation can never disrupt a caller on | ||
| the critical auth path. | ||
| """ | ||
| try: | ||
| _ensure_instruments() | ||
|
|
||
| if _access_counter is not None: | ||
| _access_counter.add(1, {"cache": cache_name, "result": result}) | ||
|
|
||
| if _STATSD_ENABLED: | ||
| statsd.increment( | ||
| "auth_cache.access", | ||
| tags=[f"cache:{cache_name}", f"result:{result}"], | ||
| ) | ||
| except Exception: | ||
| logger.debug("Failed to emit auth_cache.access metric", exc_info=True) | ||
|
|
||
|
|
||
| def record_cache_eviction(cache_name: str) -> None: | ||
| """ | ||
| Record a single capacity-driven (LRU) eviction. | ||
|
|
||
| Never raises: see ``record_cache_access``. | ||
| """ | ||
| try: | ||
| _ensure_instruments() | ||
|
|
||
| if _eviction_counter is not None: | ||
| _eviction_counter.add(1, {"cache": cache_name}) | ||
|
|
||
| if _STATSD_ENABLED: | ||
| statsd.increment("auth_cache.eviction", tags=[f"cache:{cache_name}"]) | ||
| except Exception: | ||
| logger.debug("Failed to emit auth_cache.eviction metric", exc_info=True) |
92 changes: 92 additions & 0 deletions
92
agentex/tests/unit/api/test_authentication_cache_metrics.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,92 @@ | ||
| """Tests for cache hit/miss/eviction instrumentation in ``authentication_cache``. | ||
|
|
||
| Asserts that ``AsyncTTLCache`` records the correct metric for each of the three | ||
| read outcomes (hit / miss_expired / miss_absent) and emits an eviction metric on | ||
| capacity-driven LRU eviction. The emission backend (OTel / StatsD) is covered | ||
| separately; here we patch the recorder functions at their import site in | ||
| ``authentication_cache`` and assert the calls. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from unittest.mock import patch | ||
|
|
||
| import pytest | ||
| from src.api.authentication_cache import AsyncTTLCache, AuthenticationCache | ||
|
|
||
|
|
||
| @pytest.mark.unit | ||
| @pytest.mark.asyncio | ||
| class TestAsyncTTLCacheMetrics: | ||
| async def test_hit_records_hit(self): | ||
| cache = AsyncTTLCache(name="agent_api_key", ttl_seconds=300) | ||
| await cache.set("k", "v") | ||
|
|
||
| with patch("src.api.authentication_cache.record_cache_access") as record_access: | ||
| result = await cache.get("k") | ||
|
|
||
| assert result == "v" | ||
| record_access.assert_called_once_with("agent_api_key", "hit") | ||
|
|
||
| async def test_absent_key_records_miss_absent(self): | ||
| cache = AsyncTTLCache(name="auth_gateway", ttl_seconds=300) | ||
|
|
||
| with patch("src.api.authentication_cache.record_cache_access") as record_access: | ||
| result = await cache.get("never-set") | ||
|
|
||
| assert result is None | ||
| record_access.assert_called_once_with("auth_gateway", "miss_absent") | ||
|
|
||
| async def test_expired_entry_records_miss_expired(self): | ||
| cache = AsyncTTLCache(name="auth_gateway", ttl_seconds=60) | ||
| await cache.set("k", "v") | ||
| # Force expiry deterministically by backdating the stored timestamp, | ||
| # avoiding any reliance on wall-clock timing in the test. | ||
| value, _ = cache.cache["k"] | ||
| cache.cache["k"] = (value, 0.0) | ||
|
|
||
| with patch("src.api.authentication_cache.record_cache_access") as record_access: | ||
| result = await cache.get("k") | ||
|
|
||
| assert result is None | ||
| record_access.assert_called_once_with("auth_gateway", "miss_expired") | ||
| # The expired entry should also have been purged from the cache. | ||
| assert "k" not in cache.cache | ||
|
|
||
| async def test_capacity_eviction_records_eviction(self): | ||
| cache = AsyncTTLCache(name="authorization_check", max_size=1, ttl_seconds=300) | ||
| await cache.set("first", "v1") | ||
|
|
||
| with patch( | ||
| "src.api.authentication_cache.record_cache_eviction" | ||
| ) as record_eviction: | ||
| # Inserting a second distinct key evicts the oldest (LRU). | ||
| await cache.set("second", "v2") | ||
|
|
||
| record_eviction.assert_called_once_with("authorization_check") | ||
| assert "first" not in cache.cache | ||
| assert "second" in cache.cache | ||
|
|
||
| async def test_overwriting_existing_key_does_not_evict(self): | ||
| cache = AsyncTTLCache(name="authorization_check", max_size=1, ttl_seconds=300) | ||
| await cache.set("k", "v1") | ||
|
|
||
| with patch( | ||
| "src.api.authentication_cache.record_cache_eviction" | ||
| ) as record_eviction: | ||
| # Re-setting an existing key is an update, not a capacity eviction. | ||
| await cache.set("k", "v2") | ||
|
|
||
| record_eviction.assert_not_called() | ||
| assert await cache.get("k") == "v2" | ||
|
|
||
|
|
||
| @pytest.mark.unit | ||
| def test_authentication_cache_assigns_distinct_names(): | ||
| """Each sub-cache carries the name used as the ``cache`` metric tag.""" | ||
| cache = AuthenticationCache() | ||
|
|
||
| assert cache.agent_identity_cache.name == "agent_identity" | ||
| assert cache.agent_api_key_cache.name == "agent_api_key" | ||
| assert cache.auth_gateway_cache.name == "auth_gateway" | ||
| assert cache.authorization_check_cache.name == "authorization_check" |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,75 @@ | ||
| """Tests for the auth-cache metrics emitter. | ||
|
|
||
| Covers the two paths that matter operationally: the no-op path (neither OTel nor | ||
| StatsD configured, which is the default in tests and local dev) must never | ||
| raise, and the StatsD path must emit a counter with the expected name and tags. | ||
| """ | ||
|
|
||
| from __future__ import annotations | ||
|
|
||
| from unittest.mock import patch | ||
|
|
||
| import pytest | ||
| from src.utils import cache_metrics | ||
|
|
||
|
|
||
| @pytest.mark.unit | ||
| def test_record_functions_are_noop_when_unconfigured(): | ||
| # With no OTLP endpoint and no DD_AGENT_HOST, both calls must be harmless. | ||
| with ( | ||
| patch.object(cache_metrics, "_STATSD_ENABLED", False), | ||
| patch.object(cache_metrics, "_access_counter", None), | ||
| patch.object(cache_metrics, "_eviction_counter", None), | ||
| patch.object(cache_metrics, "_instruments_initialized", True), | ||
| ): | ||
| cache_metrics.record_cache_access("auth_gateway", "hit") | ||
| cache_metrics.record_cache_eviction("auth_gateway") | ||
|
|
||
|
|
||
| @pytest.mark.unit | ||
| def test_record_functions_swallow_emission_errors(): | ||
| # A failing backend must never propagate to the caller (critical auth path). | ||
| with ( | ||
| patch.object(cache_metrics, "_STATSD_ENABLED", True), | ||
| patch.object(cache_metrics, "_instruments_initialized", True), | ||
| patch.object(cache_metrics, "_access_counter", None), | ||
| patch.object(cache_metrics, "_eviction_counter", None), | ||
| patch.object(cache_metrics, "statsd") as mock_statsd, | ||
| ): | ||
| mock_statsd.increment.side_effect = OSError("socket in a bad state") | ||
|
|
||
| # Neither call should raise despite the backend blowing up. | ||
| cache_metrics.record_cache_access("auth_gateway", "hit") | ||
| cache_metrics.record_cache_eviction("auth_gateway") | ||
|
|
||
|
|
||
| @pytest.mark.unit | ||
| def test_record_cache_access_emits_statsd_when_enabled(): | ||
| with ( | ||
| patch.object(cache_metrics, "_STATSD_ENABLED", True), | ||
| patch.object(cache_metrics, "_instruments_initialized", True), | ||
| patch.object(cache_metrics, "_access_counter", None), | ||
| patch.object(cache_metrics, "statsd") as mock_statsd, | ||
| ): | ||
| cache_metrics.record_cache_access("auth_gateway", "miss_absent") | ||
|
|
||
| mock_statsd.increment.assert_called_once_with( | ||
| "auth_cache.access", | ||
| tags=["cache:auth_gateway", "result:miss_absent"], | ||
| ) | ||
|
|
||
|
|
||
| @pytest.mark.unit | ||
| def test_record_cache_eviction_emits_statsd_when_enabled(): | ||
| with ( | ||
| patch.object(cache_metrics, "_STATSD_ENABLED", True), | ||
| patch.object(cache_metrics, "_instruments_initialized", True), | ||
| patch.object(cache_metrics, "_eviction_counter", None), | ||
| patch.object(cache_metrics, "statsd") as mock_statsd, | ||
| ): | ||
| cache_metrics.record_cache_eviction("agent_api_key") | ||
|
|
||
| mock_statsd.increment.assert_called_once_with( | ||
| "auth_cache.eviction", | ||
| tags=["cache:agent_api_key"], | ||
| ) |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.