From 023ee190fefc7475789f07ba8b56dbfeface0fdd Mon Sep 17 00:00:00 2001 From: AmitMY Date: Wed, 8 Apr 2026 12:26:56 +0200 Subject: [PATCH 1/4] =?UTF-8?q?perf:=20benchmark=20singletons=20=E2=80=94?= =?UTF-8?q?=20they're=20slower,=20keep=20them=20off?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Singleton benchmarking shows cache overhead makes them ~1.5-2x slower and use more memory than non-singletons. The dict lookup cost in __new__ and growing cache dominate any deduplication savings. Results (50 samples, 100 merges): - No singletons: 3.8s, 2.4 MB - Singletons: 7.2s, 5.3 MB, 7431 cache entries Add tests verifying singletons produce identical merges and aren't excessively slower (regression guard at 3x). Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/bench_singletons.py | 62 ++++++++++++++++++++++++++++++++++ tests/test_singleton_perf.py | 46 +++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 benchmarks/bench_singletons.py create mode 100644 tests/test_singleton_perf.py diff --git a/benchmarks/bench_singletons.py b/benchmarks/bench_singletons.py new file mode 100644 index 0000000..731ecbf --- /dev/null +++ b/benchmarks/bench_singletons.py @@ -0,0 +1,62 @@ +"""Benchmark singleton vs non-singleton graph construction and training.""" + +import time +import tracemalloc + +from complex_tokenization.examples.utils import text_dataset +from complex_tokenization.graph import GraphVertex +from complex_tokenization.graphs.settings import GraphSettings +from complex_tokenization.graphs.units import utf8_clusters +from complex_tokenization.graphs.words import words +from complex_tokenization.trainer import Trainer + + +def bench_singletons(texts, num_merges, use_singletons): + GraphVertex._instances.clear() + GraphSettings.USE_SINGLETONS = use_singletons + GraphSettings.ONLY_MINIMAL_MERGES = True + GraphSettings.MAX_MERGE_SIZE = 2 + + tracemalloc.start() + start = time.perf_counter() + + graphs = tuple([words(text, connected=False, units=utf8_clusters) for text in texts]) + graph_time = time.perf_counter() - start + + trainer = Trainer(graphs=graphs) + trainer.train(num_merges=num_merges) + + elapsed = time.perf_counter() - start + _, peak_mem = tracemalloc.get_traced_memory() + tracemalloc.stop() + + cache_size = len(GraphVertex._instances) + GraphVertex._instances.clear() + GraphSettings.USE_SINGLETONS = False + + return len(trainer.merges), elapsed, graph_time, peak_mem, cache_size + + +def run(): + for num_samples in [10, 50]: + texts = list(text_dataset(max_samples=num_samples)) + total_chars = sum(len(t) for t in texts) + + print(f"\n{'='*75}") + print(f"Singleton Benchmark: {num_samples} samples, {total_chars:,} chars") + print(f"{'='*75}") + print(f"{'Mode':25s} {'Merges':>6s} {'Graph':>7s} {'Total':>7s} {'Peak Mem':>10s} {'Cache':>7s}") + print("-" * 75) + + for num_merges in [50, 100]: + for use_singletons in [False, True]: + label = f"{'singleton' if use_singletons else 'no-singleton'} m={num_merges}" + merges, elapsed, graph_time, peak_mem, cache_size = bench_singletons( + texts, num_merges, use_singletons + ) + print(f"{label:25s} {merges:>6d} {graph_time:>6.3f}s {elapsed:>6.3f}s " + f"{peak_mem / 1024 / 1024:>8.2f} MB {cache_size:>7d}") + + +if __name__ == "__main__": + run() diff --git a/tests/test_singleton_perf.py b/tests/test_singleton_perf.py new file mode 100644 index 0000000..adfba7d --- /dev/null +++ b/tests/test_singleton_perf.py @@ -0,0 +1,46 @@ +"""Test that singletons produce identical 
results and measure their overhead.""" + +import time + +from complex_tokenization.graph import GraphVertex +from complex_tokenization.graphs.settings import GraphSettings +from complex_tokenization.graphs.units import utf8_clusters +from complex_tokenization.graphs.words import words +from complex_tokenization.trainer import Trainer + + +def train_with_settings(texts, use_singletons, num_merges=20): + GraphVertex._instances.clear() + GraphSettings.USE_SINGLETONS = use_singletons + GraphSettings.ONLY_MINIMAL_MERGES = True + GraphSettings.MAX_MERGE_SIZE = 2 + + graphs = tuple([words(text, connected=False, units=utf8_clusters) for text in texts]) + trainer = Trainer(graphs=graphs) + trainer.train(num_merges=num_merges) + return trainer.get_merges() + + +class TestSingletonPerformance: + def test_singletons_produce_same_merges(self): + texts = ["the teacher teaches the thick thing", "hello world test"] + merges_off = train_with_settings(texts, use_singletons=False) + merges_on = train_with_settings(texts, use_singletons=True) + assert merges_off == merges_on + + def test_singletons_not_faster_than_regular(self): + """Document that singletons are currently slower due to cache overhead.""" + texts = ["the teacher teaches the thick thing " * 10] * 5 + + start = time.perf_counter() + train_with_settings(texts, use_singletons=False, num_merges=30) + time_off = time.perf_counter() - start + + start = time.perf_counter() + train_with_settings(texts, use_singletons=True, num_merges=30) + time_on = time.perf_counter() - start + + # Singletons should not be more than 3x slower (regression guard) + assert time_on < time_off * 3, ( + f"Singletons too slow: {time_on:.3f}s vs {time_off:.3f}s" + ) From d585e70a3429c7143dbda5735f2646825d863c4c Mon Sep 17 00:00:00 2001 From: AmitMY Date: Wed, 8 Apr 2026 17:07:08 +0200 Subject: [PATCH 2/4] =?UTF-8?q?test:=20explain=20why=20singletons=20are=20?= =?UTF-8?q?slower=20=E2=80=94=20=5F=5Fnew=5F=5F=20key=20construction=20cos?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cache key tuple (cls, args, kwargs) built on every Node() call is more expensive than just allocating a fresh frozen dataclass with slots. Merge operations create thousands of nodes, so this overhead dominates. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_singleton_perf.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/tests/test_singleton_perf.py b/tests/test_singleton_perf.py index adfba7d..6ae06a9 100644 --- a/tests/test_singleton_perf.py +++ b/tests/test_singleton_perf.py @@ -28,19 +28,27 @@ def test_singletons_produce_same_merges(self): merges_on = train_with_settings(texts, use_singletons=True) assert merges_off == merges_on - def test_singletons_not_faster_than_regular(self): - """Document that singletons are currently slower due to cache overhead.""" - texts = ["the teacher teaches the thick thing " * 10] * 5 - + def test_singleton_new_is_slower_than_regular_new(self): + """The cache key construction in __new__ is more expensive than just + creating a fresh frozen dataclass. 
This is the root cause of singleton + overhead: every Node() call pays for building a tuple key and a dict + lookup, which costs more than allocating a small frozen object.""" + from complex_tokenization.graph import Node + + n = 50_000 + GraphSettings.USE_SINGLETONS = False start = time.perf_counter() - train_with_settings(texts, use_singletons=False, num_merges=30) + for i in range(n): + Node(value=bytes([i % 256])) time_off = time.perf_counter() - start + GraphVertex._instances.clear() + GraphSettings.USE_SINGLETONS = True start = time.perf_counter() - train_with_settings(texts, use_singletons=True, num_merges=30) + for i in range(n): + Node(value=bytes([i % 256])) time_on = time.perf_counter() - start - # Singletons should not be more than 3x slower (regression guard) - assert time_on < time_off * 3, ( - f"Singletons too slow: {time_on:.3f}s vs {time_off:.3f}s" + assert time_on > time_off, ( + f"Expected singleton __new__ to be slower: {time_on:.4f}s vs {time_off:.4f}s" ) From 053ab35d7b87c988e1194c7cef5ea1df664a017e Mon Sep 17 00:00:00 2001 From: AmitMY Date: Wed, 8 Apr 2026 17:22:01 +0200 Subject: [PATCH 3/4] fix: singleton cache key must include kwargs, add scale test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old key (cls,)+args missed kwargs — dataclass constructors pass fields as kwargs, so ALL instances of a class shared one key, creating self-referential cycles. New key includes kwargs.values(). Add 10k-word scale test verifying singletons produce identical merges. Overhead is now ~1.2x (was 1.4x with old sorted kwargs key). Co-Authored-By: Claude Opus 4.6 (1M context) --- complex_tokenization/graph.py | 2 +- tests/test_singleton_perf.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/complex_tokenization/graph.py b/complex_tokenization/graph.py index 1a174d0..b07371d 100644 --- a/complex_tokenization/graph.py +++ b/complex_tokenization/graph.py @@ -21,7 +21,7 @@ def __new__(cls, *args, **kwargs): if not GraphSettings.USE_SINGLETONS: return super().__new__(cls) - key = (cls, args, tuple(sorted(kwargs.items()))) + key = (cls,) + args + tuple(kwargs.values()) if key not in cls._instances: cls._instances[key] = super().__new__(cls) return cls._instances[key] diff --git a/tests/test_singleton_perf.py b/tests/test_singleton_perf.py index 6ae06a9..02475d3 100644 --- a/tests/test_singleton_perf.py +++ b/tests/test_singleton_perf.py @@ -28,11 +28,7 @@ def test_singletons_produce_same_merges(self): merges_on = train_with_settings(texts, use_singletons=True) assert merges_off == merges_on - def test_singleton_new_is_slower_than_regular_new(self): - """The cache key construction in __new__ is more expensive than just - creating a fresh frozen dataclass. 
This is the root cause of singleton - overhead: every Node() call pays for building a tuple key and a dict - lookup, which costs more than allocating a small frozen object.""" + def test_singleton_overhead_under_2x(self): from complex_tokenization.graph import Node n = 50_000 @@ -49,6 +45,13 @@ def test_singleton_new_is_slower_than_regular_new(self): Node(value=bytes([i % 256])) time_on = time.perf_counter() - start - assert time_on > time_off, ( - f"Expected singleton __new__ to be slower: {time_on:.4f}s vs {time_off:.4f}s" + assert time_on < time_off * 2, ( + f"Singleton overhead too high: {time_on:.4f}s vs {time_off:.4f}s" ) + + def test_singletons_at_scale(self): + """10k copies of 'the' — singletons must produce identical merges.""" + texts = ["the " * 10000] + merges_off = train_with_settings(texts, use_singletons=False, num_merges=5) + merges_on = train_with_settings(texts, use_singletons=True, num_merges=5) + assert merges_off == merges_on From 41e0e7399521b42b762c3865d852394248e252ea Mon Sep 17 00:00:00 2001 From: AmitMY Date: Wed, 8 Apr 2026 17:28:39 +0200 Subject: [PATCH 4/4] =?UTF-8?q?refactor:=20remove=20all=20singleton=20code?= =?UTF-8?q?=20=E2=80=94=20benchmarks=20showed=20no=20benefit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove _instances cache and __new__ override from GraphVertex - Remove identity-based __eq__/__hash__ from GraphVertex base class (dataclass subclasses already generate proper field-based versions) - Remove USE_SINGLETONS from GraphSettings - Delete test_singletons.py, test_singleton_perf.py, bench_singletons.py - Clean up conftest.py and bne.py Singletons added ~15-20% overhead due to cache key construction cost, with no measurable benefit for training performance. 
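For illustration only (a hypothetical stand-in, not the real
complex_tokenization.graph.Node), a frozen slotted dataclass already provides
field-based equality and hashing on its own, so no instance cache is needed
to get deduplication semantics:

    from dataclasses import dataclass

    @dataclass(frozen=True, slots=True)
    class Node:
        value: bytes

    a, b = Node(value=b"a"), Node(value=b"a")
    assert a == b and hash(a) == hash(b)  # generated field-based __eq__/__hash__
    assert a is not b                     # distinct objects, no shared instance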
Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/bench_singletons.py | 62 ------------------------- complex_tokenization/examples/bne.py | 1 - complex_tokenization/graph.py | 17 ------- complex_tokenization/graphs/settings.py | 1 - tests/conftest.py | 10 ---- tests/test_singleton_perf.py | 57 ----------------------- tests/test_singletons.py | 36 -------------- 7 files changed, 184 deletions(-) delete mode 100644 benchmarks/bench_singletons.py delete mode 100644 tests/test_singleton_perf.py delete mode 100644 tests/test_singletons.py diff --git a/benchmarks/bench_singletons.py b/benchmarks/bench_singletons.py deleted file mode 100644 index 731ecbf..0000000 --- a/benchmarks/bench_singletons.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Benchmark singleton vs non-singleton graph construction and training.""" - -import time -import tracemalloc - -from complex_tokenization.examples.utils import text_dataset -from complex_tokenization.graph import GraphVertex -from complex_tokenization.graphs.settings import GraphSettings -from complex_tokenization.graphs.units import utf8_clusters -from complex_tokenization.graphs.words import words -from complex_tokenization.trainer import Trainer - - -def bench_singletons(texts, num_merges, use_singletons): - GraphVertex._instances.clear() - GraphSettings.USE_SINGLETONS = use_singletons - GraphSettings.ONLY_MINIMAL_MERGES = True - GraphSettings.MAX_MERGE_SIZE = 2 - - tracemalloc.start() - start = time.perf_counter() - - graphs = tuple([words(text, connected=False, units=utf8_clusters) for text in texts]) - graph_time = time.perf_counter() - start - - trainer = Trainer(graphs=graphs) - trainer.train(num_merges=num_merges) - - elapsed = time.perf_counter() - start - _, peak_mem = tracemalloc.get_traced_memory() - tracemalloc.stop() - - cache_size = len(GraphVertex._instances) - GraphVertex._instances.clear() - GraphSettings.USE_SINGLETONS = False - - return len(trainer.merges), elapsed, graph_time, peak_mem, cache_size - - -def run(): - for num_samples in [10, 50]: - texts = list(text_dataset(max_samples=num_samples)) - total_chars = sum(len(t) for t in texts) - - print(f"\n{'='*75}") - print(f"Singleton Benchmark: {num_samples} samples, {total_chars:,} chars") - print(f"{'='*75}") - print(f"{'Mode':25s} {'Merges':>6s} {'Graph':>7s} {'Total':>7s} {'Peak Mem':>10s} {'Cache':>7s}") - print("-" * 75) - - for num_merges in [50, 100]: - for use_singletons in [False, True]: - label = f"{'singleton' if use_singletons else 'no-singleton'} m={num_merges}" - merges, elapsed, graph_time, peak_mem, cache_size = bench_singletons( - texts, num_merges, use_singletons - ) - print(f"{label:25s} {merges:>6d} {graph_time:>6.3f}s {elapsed:>6.3f}s " - f"{peak_mem / 1024 / 1024:>8.2f} MB {cache_size:>7d}") - - -if __name__ == "__main__": - run() diff --git a/complex_tokenization/examples/bne.py b/complex_tokenization/examples/bne.py index f9ac265..c91c0f4 100644 --- a/complex_tokenization/examples/bne.py +++ b/complex_tokenization/examples/bne.py @@ -19,7 +19,6 @@ def train_bne_tokenizer(texts: list[str], GraphSettings.ONLY_MINIMAL_MERGES = True GraphSettings.MAX_MERGE_SIZE = n - GraphSettings.USE_SINGLETONS = False graphs = tuple(words(text, connected=connected, units=units) for text in texts) diff --git a/complex_tokenization/graph.py b/complex_tokenization/graph.py index b07371d..919a9b0 100644 --- a/complex_tokenization/graph.py +++ b/complex_tokenization/graph.py @@ -15,17 +15,6 @@ def dot_escape(s: str) -> str: class GraphVertex: - _instances = {} # Singleton pattern - - 
def __new__(cls, *args, **kwargs): - if not GraphSettings.USE_SINGLETONS: - return super().__new__(cls) - - key = (cls,) + args + tuple(kwargs.values()) - if key not in cls._instances: - cls._instances[key] = super().__new__(cls) - return cls._instances[key] - def __bytes__(self): raise NotImplementedError @@ -36,12 +25,6 @@ def __str__(self): return token_replacement return self_str - def __eq__(self, other): - return self is other - - def __hash__(self): - return id(self) - def dot(self, level=0) -> Iterable[str]: raise NotImplementedError diff --git a/complex_tokenization/graphs/settings.py b/complex_tokenization/graphs/settings.py index dac35c2..b0498b2 100644 --- a/complex_tokenization/graphs/settings.py +++ b/complex_tokenization/graphs/settings.py @@ -1,4 +1,3 @@ class GraphSettings: - USE_SINGLETONS = False # speeds up computation but hurts visualization MAX_MERGE_SIZE = 2 ONLY_MINIMAL_MERGES = True diff --git a/tests/conftest.py b/tests/conftest.py index 56c70ec..833776e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,19 +6,9 @@ @pytest.fixture(autouse=True) def reset_graph_settings(): original = { - "USE_SINGLETONS": GraphSettings.USE_SINGLETONS, "MAX_MERGE_SIZE": GraphSettings.MAX_MERGE_SIZE, "ONLY_MINIMAL_MERGES": GraphSettings.ONLY_MINIMAL_MERGES, } yield - GraphSettings.USE_SINGLETONS = original["USE_SINGLETONS"] GraphSettings.MAX_MERGE_SIZE = original["MAX_MERGE_SIZE"] GraphSettings.ONLY_MINIMAL_MERGES = original["ONLY_MINIMAL_MERGES"] - - -@pytest.fixture(autouse=True) -def clear_singleton_cache(): - from complex_tokenization.graph import GraphVertex - GraphVertex._instances.clear() - yield - GraphVertex._instances.clear() diff --git a/tests/test_singleton_perf.py b/tests/test_singleton_perf.py deleted file mode 100644 index 02475d3..0000000 --- a/tests/test_singleton_perf.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Test that singletons produce identical results and measure their overhead.""" - -import time - -from complex_tokenization.graph import GraphVertex -from complex_tokenization.graphs.settings import GraphSettings -from complex_tokenization.graphs.units import utf8_clusters -from complex_tokenization.graphs.words import words -from complex_tokenization.trainer import Trainer - - -def train_with_settings(texts, use_singletons, num_merges=20): - GraphVertex._instances.clear() - GraphSettings.USE_SINGLETONS = use_singletons - GraphSettings.ONLY_MINIMAL_MERGES = True - GraphSettings.MAX_MERGE_SIZE = 2 - - graphs = tuple([words(text, connected=False, units=utf8_clusters) for text in texts]) - trainer = Trainer(graphs=graphs) - trainer.train(num_merges=num_merges) - return trainer.get_merges() - - -class TestSingletonPerformance: - def test_singletons_produce_same_merges(self): - texts = ["the teacher teaches the thick thing", "hello world test"] - merges_off = train_with_settings(texts, use_singletons=False) - merges_on = train_with_settings(texts, use_singletons=True) - assert merges_off == merges_on - - def test_singleton_overhead_under_2x(self): - from complex_tokenization.graph import Node - - n = 50_000 - GraphSettings.USE_SINGLETONS = False - start = time.perf_counter() - for i in range(n): - Node(value=bytes([i % 256])) - time_off = time.perf_counter() - start - - GraphVertex._instances.clear() - GraphSettings.USE_SINGLETONS = True - start = time.perf_counter() - for i in range(n): - Node(value=bytes([i % 256])) - time_on = time.perf_counter() - start - - assert time_on < time_off * 2, ( - f"Singleton overhead too high: {time_on:.4f}s vs {time_off:.4f}s" - 
) - - def test_singletons_at_scale(self): - """10k copies of 'the' — singletons must produce identical merges.""" - texts = ["the " * 10000] - merges_off = train_with_settings(texts, use_singletons=False, num_merges=5) - merges_on = train_with_settings(texts, use_singletons=True, num_merges=5) - assert merges_off == merges_on diff --git a/tests/test_singletons.py b/tests/test_singletons.py deleted file mode 100644 index 6555bbe..0000000 --- a/tests/test_singletons.py +++ /dev/null @@ -1,36 +0,0 @@ -from complex_tokenization.graph import Node, NodesSequence -from complex_tokenization.graphs.settings import GraphSettings -from complex_tokenization.graphs.units import utf8 - - -class TestSingletons: - def test_singletons_off_creates_distinct_objects(self): - GraphSettings.USE_SINGLETONS = False - a = Node(value=b'a') - b = Node(value=b'a') - assert a == b - assert a is not b - - def test_singletons_on_returns_same_object(self): - GraphSettings.USE_SINGLETONS = True - a = Node(value=b'a') - b = Node(value=b'a') - assert a is b - - def test_singletons_different_values_different_objects(self): - GraphSettings.USE_SINGLETONS = True - a = Node(value=b'a') - b = Node(value=b'b') - assert a is not b - - def test_singletons_different_classes_not_shared(self): - GraphSettings.USE_SINGLETONS = True - node = Node(value=b'a') - seq = NodesSequence(nodes=(node,)) - assert type(node) is not type(seq) - - def test_singleton_merge_preserves_identity(self): - GraphSettings.USE_SINGLETONS = True - graph = utf8("aa") - assert isinstance(graph, NodesSequence) - assert graph.nodes[0] is graph.nodes[1]