Optimize KCFG memory usage (#872)

This optimization works by ensuring that a single object is used for each
group of equal subterms across a KCFG's nodes.

This is implemented by keeping a map from (optimized) subterms to IDs,
and another map from subterm IDs to the actual subterms. Whenever a new
node is added to the KCFG, its subterms are traversed bottom-up, and each
one is replaced with its cached version if one exists, or added to the
cache otherwise.
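
As a rough illustration of the scheme (a minimal, hypothetical sketch that
models terms as plain tuples rather than pyk's actual classes):

```python
# Hypothetical sketch of the two-map caching scheme; terms are modeled as
# nested tuples (label, child, ...), not as pyk's KInner classes.
class Interner:
    def __init__(self) -> None:
        self._ids: dict[tuple, int] = {}  # (optimized) subterm -> ID
        self._terms: list[tuple] = []     # ID -> the single shared subterm object

    def intern(self, term: tuple) -> tuple:
        idx = self._ids.get(term)
        if idx is None:
            idx = len(self._terms)
            self._ids[term] = idx
            self._terms.append(term)
        return self._terms[idx]


def optimize(term: tuple, interner: Interner) -> tuple:
    # Bottom-up traversal: children are cached first, then the rebuilt parent,
    # so equal subtrees from different nodes collapse onto the same objects.
    label, *children = term
    rebuilt = (label, *(optimize(c, interner) for c in children))
    return interner.intern(rebuilt)


interner = Interner()
t1 = optimize(('add', ('x',), ('y',)), interner)
t2 = optimize(('mul', ('x',), ('y',)), interner)
assert t1[1] is t2[1]  # the equal subterm ('x',) is now a single shared object
```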

Memory usage
---------------

For the add-liquidity proof, when running with Kasmer, the KCFG object
uses 50-60 GB of memory without this PR; with it, the KCFG uses only 1 GB.

Educated guess: In general, for proofs whose configurations do not change
much between consecutive nodes (usually configurations of medium or high
complexity), I expect memory usage with this PR to be higher than before
only while the KCFG holds 1-2 nodes, and lower afterwards. For proofs whose
configurations change significantly between nodes (usually simple
configurations, e.g. with only the `<k>` cell), this PR may increase memory
usage.
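
For such cases, the optimization can be disabled through the new
`optimize_memory` flag, which defaults to `True`. A small usage sketch,
mirroring the constructors changed below (import path as used in the tests):

```python
from pyk.kcfg.kcfg import KCFG

cfg = KCFG()                             # structural sharing enabled by default
cfg_plain = KCFG(optimize_memory=False)  # fall back to a plain dict node store

# The flag is also threaded through the (de)serialization helpers:
reloaded = KCFG.from_json(cfg.to_json(), optimize_memory=False)
```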

Runtime
--------

* Running a simple proof (i.e. most of the time is spent in the Booster
backend): 6 min 59 sec with the PR, 6 min 56 sec before the PR
* Loading a small KCFG (29M on disk) from JSON: 5.5 sec with the PR, 4.9
sec before the PR
* Loading a large KCFG (6.1G on disk) from JSON: 1606 sec with the PR,
1425 sec before the PR

---------

Co-authored-by: devops <devops@runtimeverification.com>
Co-authored-by: Tamás Tóth <tothtamas28@users.noreply.github.com>
Co-authored-by: Lisandra <lisandrasilva18@hotmail.com>
Co-authored-by: Noah Watson <107630091+nwatson22@users.noreply.github.com>
Co-authored-by: rv-jenkins <admin@runtimeverification.com>
Co-authored-by: Everett Hildenbrandt <everett.hildenbrandt@gmail.com>
7 people committed Feb 27, 2024
1 parent 6fd119b commit a8a749d
Showing 7 changed files with 236 additions and 14 deletions.
4 changes: 2 additions & 2 deletions docs/conf.py
@@ -9,8 +9,8 @@
project = 'pyk'
author = 'Runtime Verification, Inc'
copyright = '2024, Runtime Verification, Inc'
version = '0.1.653'
release = '0.1.653'
version = '0.1.654'
release = '0.1.654'

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
2 changes: 1 addition & 1 deletion package/version
@@ -1 +1 @@
0.1.653
0.1.654
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"

[tool.poetry]
name = "pyk"
version = "0.1.653"
version = "0.1.654"
description = ""
authors = [
"Runtime Verification, Inc. <contact@runtimeverification.com>",
25 changes: 15 additions & 10 deletions src/pyk/kcfg/kcfg.py
@@ -25,7 +25,7 @@
from ..utils import ensure_dir_path

if TYPE_CHECKING:
from collections.abc import Iterable, Mapping
from collections.abc import Iterable, Mapping, MutableMapping
from pathlib import Path
from types import TracebackType
from typing import Any
@@ -203,7 +203,7 @@ def edges(self) -> tuple[KCFG.Edge, ...]:
return tuple(KCFG.Edge(self.source, target, 1, ()) for target in self.targets)

_node_id: int
_nodes: dict[int, Node]
_nodes: MutableMapping[int, Node]

_created_nodes: set[int]
_deleted_nodes: set[int]
@@ -218,9 +218,14 @@ def edges(self) -> tuple[KCFG.Edge, ...]:
_lock: RLock
cfg_dir: Path | None

def __init__(self, cfg_dir: Path | None = None) -> None:
def __init__(self, cfg_dir: Path | None = None, optimize_memory: bool = True) -> None:
self._node_id = 1
self._nodes = {}
if optimize_memory:
from .store import OptimizedNodeStore

self._nodes = OptimizedNodeStore()
else:
self._nodes = {}
self._created_nodes = set()
self._deleted_nodes = set()
self._edges = {}
@@ -293,9 +298,9 @@ def uncovered(self) -> list[Node]:

@staticmethod
def from_claim(
defn: KDefinition, claim: KClaim, cfg_dir: Path | None = None
defn: KDefinition, claim: KClaim, cfg_dir: Path | None = None, optimize_memory: bool = True
) -> tuple[KCFG, NodeIdLike, NodeIdLike]:
cfg = KCFG(cfg_dir=cfg_dir)
cfg = KCFG(cfg_dir=cfg_dir, optimize_memory=optimize_memory)
claim_body = claim.body
claim_body = defn.instantiate_cell_vars(claim_body)
claim_body = rename_generated_vars(claim_body)
@@ -382,8 +387,8 @@ def to_dict(self) -> dict[str, Any]:
return {k: v for k, v in res.items() if v}

@staticmethod
def from_dict(dct: Mapping[str, Any]) -> KCFG:
cfg = KCFG()
def from_dict(dct: Mapping[str, Any], optimize_memory: bool = True) -> KCFG:
cfg = KCFG(optimize_memory=optimize_memory)

max_id = 0
for node_dict in dct.get('nodes') or []:
@@ -440,8 +445,8 @@ def to_json(self) -> str:
return json.dumps(self.to_dict(), sort_keys=True)

@staticmethod
def from_json(s: str) -> KCFG:
return KCFG.from_dict(json.loads(s))
def from_json(s: str, optimize_memory: bool = True) -> KCFG:
return KCFG.from_dict(json.loads(s), optimize_memory=optimize_memory)

def to_rules(self, priority: int = 20) -> list[KRuleLike]:
return [e.to_rule('BASIC-BLOCK', priority=priority) for e in self.edges()]
134 changes: 134 additions & 0 deletions src/pyk/kcfg/store.py
@@ -0,0 +1,134 @@
from __future__ import annotations

import threading
from abc import ABC, abstractmethod
from collections.abc import Hashable, MutableMapping
from dataclasses import dataclass
from typing import TYPE_CHECKING, Generic, TypeVar, final

from ..cterm import CTerm
from ..kast.inner import KApply, KSequence, KToken, KVariable, bottom_up_with_summary
from .kcfg import KCFG

if TYPE_CHECKING:
from collections.abc import Iterator

from ..kast.inner import KInner, KLabel


A = TypeVar('A', bound=Hashable)


class OptimizedNodeStore(MutableMapping[int, KCFG.Node]):
_nodes: dict[int, KCFG.Node]
_optimized_terms: _Cache[_OptInner]
_klabels: _Cache[KLabel]
_terms: list[KInner]

_lock: threading.Lock

def __init__(self) -> None:
self._nodes = {}
self._optimized_terms = _Cache()
self._klabels = _Cache()
self._terms = []

self._lock = threading.Lock()

def __getitem__(self, key: int) -> KCFG.Node:
return self._nodes[key]

def __setitem__(self, key: int, node: KCFG.Node) -> None:
old_cterm = node.cterm
new_config = self._optimize(old_cterm.config)
new_constraints = tuple(self._optimize(c) for c in old_cterm.constraints)
new_node = KCFG.Node(node.id, CTerm(new_config, new_constraints))
self._nodes[key] = new_node

def __delitem__(self, key: int) -> None:
del self._nodes[key]

def __iter__(self) -> Iterator[int]:
return iter(self._nodes)

def __len__(self) -> int:
return len(self._nodes)

def _optimize(self, term: KInner) -> KInner:
def optimizer(to_optimize: KInner, children: list[int]) -> tuple[KInner, int]:
if isinstance(to_optimize, KToken) or isinstance(to_optimize, KVariable):
optimized_id = self._cache(_OptBasic(to_optimize))
elif isinstance(to_optimize, KApply):
klabel_id = self._klabels.cache(to_optimize.label)
optimized_id = self._cache(_OptApply(klabel_id, tuple(children)))
elif isinstance(to_optimize, KSequence):
optimized_id = self._cache(_OptKSequence(tuple(children)))
else:
raise ValueError('Unknown term type: ' + str(type(to_optimize)))
return (self._terms[optimized_id], optimized_id)

with self._lock:
optimized, _ = bottom_up_with_summary(optimizer, term)
return optimized

def _cache(self, term: _OptInner) -> int:
id = self._optimized_terms.cache(term)
assert id <= len(self._terms)
if id == len(self._terms):
self._terms.append(term.build(self._klabels, self._terms))
return id


class _Cache(Generic[A]):
_value_to_id: dict[A, int]
_values: list[A]

def __init__(self) -> None:
self._value_to_id = {}
self._values = []

def cache(self, value: A) -> int:
idx = self._value_to_id.get(value)
if idx is not None:
return idx
idx = len(self._values)
self._value_to_id[value] = idx
self._values.append(value)
return idx

def get(self, idx: int) -> A:
return self._values[idx]


class _OptInner(ABC):
@abstractmethod
def build(self, klabels: _Cache[KLabel], terms: list[KInner]) -> KInner:
...


@final
@dataclass(eq=True, frozen=True)
class _OptBasic(_OptInner):
term: KInner

def build(self, klabels: _Cache[KLabel], terms: list[KInner]) -> KInner:
return self.term


@final
@dataclass(eq=True, frozen=True)
class _OptApply(_OptInner):
label: int
children: tuple[int, ...]

def build(self, klabels: _Cache[KLabel], terms: list[KInner]) -> KInner:
return KApply(klabels.get(self.label), tuple(terms[child] for child in self.children))


@final
@dataclass(eq=True, frozen=True)
class _OptKSequence(_OptInner):
children: tuple[int, ...]

def build(self, klabels: _Cache[KLabel], terms: list[KInner]) -> KInner:
return KSequence(tuple(terms[child] for child in self.children))
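
To illustrate the intended sharing (an illustrative sketch in the spirit of the
tests below, not code from this commit): after two nodes with structurally
equal configurations are stored, the optimized configurations are not merely
equal but the very same object, so common fragments are held in memory once.

```python
from pyk.cterm import CTerm
from pyk.kast.inner import KApply
from pyk.kcfg.kcfg import KCFG
from pyk.kcfg.store import OptimizedNodeStore
from pyk.prelude.utils import token

store = OptimizedNodeStore()
# Two distinct nodes whose configurations are structurally equal.
store[1] = KCFG.Node(1, CTerm(KApply('<cell>', KApply('f', token(1))), ()))
store[2] = KCFG.Node(2, CTerm(KApply('<cell>', KApply('f', token(1))), ()))

# Equal configurations are rebuilt from the cache, so they share one object.
assert store[1].cterm.config is store[2].cterm.config
```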
Empty file added src/tests/unit/kcfg/__init__.py
Empty file.
83 changes: 83 additions & 0 deletions src/tests/unit/kcfg/test_store.py
@@ -0,0 +1,83 @@
from __future__ import annotations

from itertools import count
from typing import TYPE_CHECKING

import pytest

from pyk.cterm import CTerm
from pyk.kast.inner import KApply, KSequence
from pyk.kcfg.kcfg import KCFG
from pyk.kcfg.store import OptimizedNodeStore, _Cache
from pyk.prelude.utils import token

from ..utils import a, b, c, f

if TYPE_CHECKING:
from typing import Final

from pyk.kast import KInner


EQUAL_TEST_DATA: Final[tuple[tuple[KInner, KInner], ...]] = (
(token(1), token(1)),
(token('a'), token('a')),
(a, a),
(f(a), f(a)),
(KSequence([a, b]), KSequence([a, b])),
)


@pytest.mark.parametrize('term1,term2', EQUAL_TEST_DATA, ids=count())
def test_use_cached(term1: KInner, term2: KInner) -> None:
# Given
cached_values: _Cache[KInner] = _Cache()

# When
id1 = cached_values.cache(term1)
id2 = cached_values.cache(term2)

# Then
assert id1 == id2


NOT_EQUAL_TEST_DATA: Final[tuple[tuple[KInner, KInner], ...]] = (
(token(1), token(2)),
(token(1), token('1')),
(a, b),
(f(a), f(b)),
(KSequence([a, b]), KSequence([a, c])),
)


@pytest.mark.parametrize('term1,term2', NOT_EQUAL_TEST_DATA, ids=count())
def test_not_use_cached(term1: KInner, term2: KInner) -> None:
# Given
cached_values: _Cache[KInner] = _Cache()

# When
id1 = cached_values.cache(term1)
id2 = cached_values.cache(term2)

# Then
assert term1 != term2
assert id1 != id2


OPTIMIZE_TEST_DATA: Final[tuple[KInner, ...]] = (
token(1),
token('a'),
a,
f(a),
KSequence([a, token(3)]),
)


def test_optimized_store() -> None:
store = OptimizedNodeStore()

for idx, item in zip(range(0, len(OPTIMIZE_TEST_DATA)), OPTIMIZE_TEST_DATA, strict=True):
store[idx] = KCFG.Node(idx, CTerm(KApply('<cell>', item), ()))

for idx, item in zip(range(0, len(OPTIMIZE_TEST_DATA)), OPTIMIZE_TEST_DATA, strict=True):
assert KCFG.Node(idx, CTerm(KApply('<cell>', item), ())) == store[idx]
