Skip to content

Commit

Permalink
perf: Optimize get_operation_by_reference method performance
Browse files Browse the repository at this point in the history
Signed-off-by: Dmitry Dygalo <dmitry@dygalo.dev>
  • Loading branch information
Stranger6667 committed May 15, 2024
1 parent 66e847f commit 95c268c
Show file tree
Hide file tree
Showing 4 changed files with 100 additions and 20 deletions.
1 change: 1 addition & 0 deletions docs/changelog.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Changelog
**Performance**

- Optimize `get_operation_by_id` method performance and reduce memory usage.
- Optimize `get_operation_by_reference` method performance.

.. _v3.28.1:

Expand Down
70 changes: 51 additions & 19 deletions src/schemathesis/specs/openapi/_cache.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from __future__ import annotations

from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, Tuple

if TYPE_CHECKING:
from ...models import APIOperation
Expand All @@ -20,35 +20,42 @@ class OperationCacheEntry:
__slots__ = ("path", "method", "scope", "shared_parameters", "operation")


# During traversal, we need to keep track of the scope, path, and method
TraversalKey = Tuple[str, str, str]
OperationId = str
Reference = str


@dataclass
class OperationCache:
"""Cache for Open API operations.
This cache contains multiple levels to avoid unnecessary parsing of the schema.
The first level cache contains operation IDs and their metadata. The second level cache contains
initialized operation instances.
The first level is populated eagerly because it is cheap. It is mostly a dict traversal and
a bit of reference resolving. The entries there does not own the data, they are just references to the schema.
The second level is populated lazily because it is more expensive. It requires parsing the schema, its parameters
and some more elaborate reference resolution.
"""

_ids_to_definitions: dict[OperationId, OperationCacheEntry] = field(default_factory=dict)
_ids_to_operations: dict[OperationId, APIOperation] = field(default_factory=dict)
# Cache to avoid schema traversal on every access
_id_to_definition: dict[OperationId, OperationCacheEntry] = field(default_factory=dict)
# Map map between 1st & 2nd level cache keys
# Even though 1st level keys could be directly mapped to Python objects in memory, we need to keep them separate
# to ensure a single owner of the operation instance.
_id_to_operation: dict[OperationId, int] = field(default_factory=dict)
_traversal_key_to_operation: dict[TraversalKey, int] = field(default_factory=dict)
_reference_to_operation: dict[Reference, int] = field(default_factory=dict)
# The actual operations
_operations: list[APIOperation] = field(default_factory=list)

@property
def known_operation_ids(self) -> list[str]:
return list(self._ids_to_definitions)
return list(self._id_to_definition)

@property
def has_ids_to_definitions(self) -> bool:
return bool(self._ids_to_definitions)
return bool(self._id_to_definition)

def _append_operation(self, operation: APIOperation) -> int:
idx = len(self._operations)
self._operations.append(operation)
return idx

def insert_definition_by_id(
self,
Expand All @@ -60,19 +67,44 @@ def insert_definition_by_id(
operation: dict[str, Any],
) -> None:
"""Insert a new operation definition into cache."""
self._ids_to_definitions[operation_id] = OperationCacheEntry(
self._id_to_definition[operation_id] = OperationCacheEntry(
path=path, method=method, scope=scope, shared_parameters=shared_parameters, operation=operation
)

def get_definition_by_id(self, operation_id: str) -> OperationCacheEntry:
"""Get an operation definition by its ID."""
# TODO: Avoid KeyError in the future
return self._ids_to_definitions[operation_id]
return self._id_to_definition[operation_id]

def insert_operation_by_id(self, operation_id: str, operation: APIOperation) -> None:
"""Insert a new operation into cache."""
self._ids_to_operations[operation_id] = operation
"""Insert a new operation into cache by ID."""
self._id_to_operation[operation_id] = self._append_operation(operation)

def insert_operation_by_reference(self, reference: str, operation: APIOperation) -> None:
"""Insert a new operation into cache by reference."""
self._reference_to_operation[reference] = self._append_operation(operation)

def insert_operation_by_traversal_key(self, scope: str, path: str, method: str, operation: APIOperation) -> None:
"""Insert a new operation into cache by traversal key."""
self._traversal_key_to_operation[(scope, path, method)] = self._append_operation(operation)

def get_operation_by_id(self, operation_id: str) -> APIOperation | None:
"""Get an operation by its ID."""
return self._ids_to_operations.get(operation_id)
idx = self._id_to_operation.get(operation_id)
if idx is not None:
return self._operations[idx]
return None

def get_operation_by_reference(self, reference: str) -> APIOperation | None:
"""Get an operation by its reference."""
idx = self._reference_to_operation.get(reference)
if idx is not None:
return self._operations[idx]
return None

def get_operation_by_traversal_key(self, scope: str, path: str, method: str) -> APIOperation | None:
"""Get an operation by its traverse key."""
idx = self._traversal_key_to_operation.get((scope, path, method))
if idx is not None:
return self._operations[idx]
return None
19 changes: 18 additions & 1 deletion src/schemathesis/specs/openapi/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,10 @@ def get_operation_by_id(self, operation_id: str) -> APIOperation:
except KeyError as exc:
matches = get_close_matches(operation_id, cache.known_operation_ids)
self._on_missing_operation(operation_id, exc, matches)
# It could'be been already accessed in a different place
instance = cache.get_operation_by_traversal_key(entry.scope, entry.path, entry.method)
if instance is not None:
return instance
shared_parameters = self.resolver.resolve_all(entry.shared_parameters, RECURSION_DEPTH_LIMIT - 8)
self.resolver.push_scope(entry.scope)
try:
Expand All @@ -397,6 +401,7 @@ def get_operation_by_id(self, operation_id: str) -> APIOperation:
parameters = self.collect_parameters(raw_parameters, resolved)
definition = OperationDefinition(entry.operation, resolved, entry.scope)
initialized = self.make_operation(entry.path, entry.method, parameters, definition)
cache.insert_operation_by_traversal_key(entry.scope, entry.path, entry.method, initialized)
cache.insert_operation_by_id(operation_id, initialized)
return initialized

Expand Down Expand Up @@ -427,9 +432,18 @@ def get_operation_by_reference(self, reference: str) -> APIOperation:
Reference example: #/paths/~1users~1{user_id}/patch
"""
cache = self._operation_cache
operation = cache.get_operation_by_reference(reference)
if operation is not None:
return operation
scope, data = self.resolver.resolve(reference)
path, method = scope.rsplit("/", maxsplit=2)[-2:]
path = path.replace("~1", "/").replace("~0", "~")
# Check the traversal cache as it could've been populated in other places
traversal_key = (self.resolver.resolution_scope, path, method)
operation = cache.get_operation_by_traversal_key(*traversal_key)
if operation is not None:
return operation
resolved_definition = self.resolver.resolve_all(data)
parent_ref, _ = reference.rsplit("/", maxsplit=1)
_, methods = self.resolver.resolve(parent_ref)
Expand All @@ -438,7 +452,10 @@ def get_operation_by_reference(self, reference: str) -> APIOperation:
itertools.chain(resolved_definition.get("parameters", ()), common_parameters), resolved_definition
)
raw_definition = OperationDefinition(data, resolved_definition, scope)
return self.make_operation(path, method, parameters, raw_definition)
initialized = self.make_operation(path, method, parameters, raw_definition)
cache.insert_operation_by_reference(reference, initialized)
cache.insert_operation_by_traversal_key(*traversal_key, initialized)
return initialized

def get_case_strategy(
self,
Expand Down
30 changes: 30 additions & 0 deletions test/specs/openapi/test_schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,10 @@
@pytest.mark.operations("get_user", "update_user")
def test_get_operation_via_remote_reference(openapi_version, schema_url):
schema = schemathesis.from_uri(schema_url)
first = schema.get_operation_by_reference(f"{schema_url}#/paths/~1users~1{{user_id}}/patch")
resolved = schema.get_operation_by_reference(f"{schema_url}#/paths/~1users~1{{user_id}}/patch")
# Check caching
assert resolved is first
assert isinstance(resolved, APIOperation)
assert resolved.path == "/users/{user_id}"
assert resolved.method.upper() == "PATCH"
Expand All @@ -30,6 +33,33 @@ def test_get_operation_via_remote_reference(openapi_version, schema_url):
}


@pytest.mark.operations("get_user", "update_user")
def test_operation_cache_sharing(mocker, schema_url):
# When the same operation is accessed via different methods
# The second access should use cache

def setup_mock(schema):
mocker.patch.object(
schema._operation_cache, "insert_operation_by_traversal_key", side_effect=ValueError("Not cached")
)

reference = f"{schema_url}#/paths/~1users~1{{user_id}}/patch"
operation_id = "updateUser"

schema = schemathesis.from_uri(schema_url)
first = schema.get_operation_by_reference(reference)
# After accessing by reference, there should not be an attempt to insert it again
setup_mock(schema)
second = schema.get_operation_by_id(operation_id)
assert first is second
# And the other way around
schema = schemathesis.from_uri(schema_url)
first = schema.get_operation_by_id(operation_id)
setup_mock(schema)
second = schema.get_operation_by_reference(reference)
assert first is second


SINGLE_METHOD_PATHS = {
"/test-2": {"get": {"responses": {"200": {"description": "OK"}}}},
}
Expand Down

0 comments on commit 95c268c

Please sign in to comment.