From 023ee190fefc7475789f07ba8b56dbfeface0fdd Mon Sep 17 00:00:00 2001 From: AmitMY Date: Wed, 8 Apr 2026 12:26:56 +0200 Subject: [PATCH 1/4] =?UTF-8?q?perf:=20benchmark=20singletons=20=E2=80=94?= =?UTF-8?q?=20they're=20slower,=20keep=20them=20off?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Singleton benchmarking shows cache overhead makes them ~1.5-2x slower and use more memory than non-singletons. The dict lookup cost in __new__ and growing cache dominate any deduplication savings. Results (50 samples, 100 merges): - No singletons: 3.8s, 2.4 MB - Singletons: 7.2s, 5.3 MB, 7431 cache entries Add tests verifying singletons produce identical merges and aren't excessively slower (regression guard at 3x). Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/bench_singletons.py | 62 ++++++++++++++++++++++++++++++++++ tests/test_singleton_perf.py | 46 +++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 benchmarks/bench_singletons.py create mode 100644 tests/test_singleton_perf.py diff --git a/benchmarks/bench_singletons.py b/benchmarks/bench_singletons.py new file mode 100644 index 0000000..731ecbf --- /dev/null +++ b/benchmarks/bench_singletons.py @@ -0,0 +1,62 @@ +"""Benchmark singleton vs non-singleton graph construction and training.""" + +import time +import tracemalloc + +from complex_tokenization.examples.utils import text_dataset +from complex_tokenization.graph import GraphVertex +from complex_tokenization.graphs.settings import GraphSettings +from complex_tokenization.graphs.units import utf8_clusters +from complex_tokenization.graphs.words import words +from complex_tokenization.trainer import Trainer + + +def bench_singletons(texts, num_merges, use_singletons): + GraphVertex._instances.clear() + GraphSettings.USE_SINGLETONS = use_singletons + GraphSettings.ONLY_MINIMAL_MERGES = True + GraphSettings.MAX_MERGE_SIZE = 2 + + tracemalloc.start() + start = time.perf_counter() + + graphs = tuple([words(text, connected=False, units=utf8_clusters) for text in texts]) + graph_time = time.perf_counter() - start + + trainer = Trainer(graphs=graphs) + trainer.train(num_merges=num_merges) + + elapsed = time.perf_counter() - start + _, peak_mem = tracemalloc.get_traced_memory() + tracemalloc.stop() + + cache_size = len(GraphVertex._instances) + GraphVertex._instances.clear() + GraphSettings.USE_SINGLETONS = False + + return len(trainer.merges), elapsed, graph_time, peak_mem, cache_size + + +def run(): + for num_samples in [10, 50]: + texts = list(text_dataset(max_samples=num_samples)) + total_chars = sum(len(t) for t in texts) + + print(f"\n{'='*75}") + print(f"Singleton Benchmark: {num_samples} samples, {total_chars:,} chars") + print(f"{'='*75}") + print(f"{'Mode':25s} {'Merges':>6s} {'Graph':>7s} {'Total':>7s} {'Peak Mem':>10s} {'Cache':>7s}") + print("-" * 75) + + for num_merges in [50, 100]: + for use_singletons in [False, True]: + label = f"{'singleton' if use_singletons else 'no-singleton'} m={num_merges}" + merges, elapsed, graph_time, peak_mem, cache_size = bench_singletons( + texts, num_merges, use_singletons + ) + print(f"{label:25s} {merges:>6d} {graph_time:>6.3f}s {elapsed:>6.3f}s " + f"{peak_mem / 1024 / 1024:>8.2f} MB {cache_size:>7d}") + + +if __name__ == "__main__": + run() diff --git a/tests/test_singleton_perf.py b/tests/test_singleton_perf.py new file mode 100644 index 0000000..adfba7d --- /dev/null +++ b/tests/test_singleton_perf.py @@ -0,0 +1,46 @@ +"""Test that singletons produce identical 
results and measure their overhead.""" + +import time + +from complex_tokenization.graph import GraphVertex +from complex_tokenization.graphs.settings import GraphSettings +from complex_tokenization.graphs.units import utf8_clusters +from complex_tokenization.graphs.words import words +from complex_tokenization.trainer import Trainer + + +def train_with_settings(texts, use_singletons, num_merges=20): + GraphVertex._instances.clear() + GraphSettings.USE_SINGLETONS = use_singletons + GraphSettings.ONLY_MINIMAL_MERGES = True + GraphSettings.MAX_MERGE_SIZE = 2 + + graphs = tuple([words(text, connected=False, units=utf8_clusters) for text in texts]) + trainer = Trainer(graphs=graphs) + trainer.train(num_merges=num_merges) + return trainer.get_merges() + + +class TestSingletonPerformance: + def test_singletons_produce_same_merges(self): + texts = ["the teacher teaches the thick thing", "hello world test"] + merges_off = train_with_settings(texts, use_singletons=False) + merges_on = train_with_settings(texts, use_singletons=True) + assert merges_off == merges_on + + def test_singletons_not_faster_than_regular(self): + """Document that singletons are currently slower due to cache overhead.""" + texts = ["the teacher teaches the thick thing " * 10] * 5 + + start = time.perf_counter() + train_with_settings(texts, use_singletons=False, num_merges=30) + time_off = time.perf_counter() - start + + start = time.perf_counter() + train_with_settings(texts, use_singletons=True, num_merges=30) + time_on = time.perf_counter() - start + + # Singletons should not be more than 3x slower (regression guard) + assert time_on < time_off * 3, ( + f"Singletons too slow: {time_on:.3f}s vs {time_off:.3f}s" + ) From d585e70a3429c7143dbda5735f2646825d863c4c Mon Sep 17 00:00:00 2001 From: AmitMY Date: Wed, 8 Apr 2026 17:07:08 +0200 Subject: [PATCH 2/4] =?UTF-8?q?test:=20explain=20why=20singletons=20are=20?= =?UTF-8?q?slower=20=E2=80=94=20=5F=5Fnew=5F=5F=20key=20construction=20cos?= =?UTF-8?q?t?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The cache key tuple (cls, args, kwargs) built on every Node() call is more expensive than just allocating a fresh frozen dataclass with slots. Merge operations create thousands of nodes, so this overhead dominates. Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/test_singleton_perf.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/tests/test_singleton_perf.py b/tests/test_singleton_perf.py index adfba7d..6ae06a9 100644 --- a/tests/test_singleton_perf.py +++ b/tests/test_singleton_perf.py @@ -28,19 +28,27 @@ def test_singletons_produce_same_merges(self): merges_on = train_with_settings(texts, use_singletons=True) assert merges_off == merges_on - def test_singletons_not_faster_than_regular(self): - """Document that singletons are currently slower due to cache overhead.""" - texts = ["the teacher teaches the thick thing " * 10] * 5 - + def test_singleton_new_is_slower_than_regular_new(self): + """The cache key construction in __new__ is more expensive than just + creating a fresh frozen dataclass. 
This is the root cause of singleton + overhead: every Node() call pays for building a tuple key and a dict + lookup, which costs more than allocating a small frozen object.""" + from complex_tokenization.graph import Node + + n = 50_000 + GraphSettings.USE_SINGLETONS = False start = time.perf_counter() - train_with_settings(texts, use_singletons=False, num_merges=30) + for i in range(n): + Node(value=bytes([i % 256])) time_off = time.perf_counter() - start + GraphVertex._instances.clear() + GraphSettings.USE_SINGLETONS = True start = time.perf_counter() - train_with_settings(texts, use_singletons=True, num_merges=30) + for i in range(n): + Node(value=bytes([i % 256])) time_on = time.perf_counter() - start - # Singletons should not be more than 3x slower (regression guard) - assert time_on < time_off * 3, ( - f"Singletons too slow: {time_on:.3f}s vs {time_off:.3f}s" + assert time_on > time_off, ( + f"Expected singleton __new__ to be slower: {time_on:.4f}s vs {time_off:.4f}s" ) From 053ab35d7b87c988e1194c7cef5ea1df664a017e Mon Sep 17 00:00:00 2001 From: AmitMY Date: Wed, 8 Apr 2026 17:22:01 +0200 Subject: [PATCH 3/4] fix: singleton cache key must include kwargs, add scale test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The old key (cls,)+args missed kwargs — dataclass constructors pass fields as kwargs, so ALL instances of a class shared one key, creating self-referential cycles. New key includes kwargs.values(). Add 10k-word scale test verifying singletons produce identical merges. Overhead is now ~1.2x (was 1.4x with old sorted kwargs key). Co-Authored-By: Claude Opus 4.6 (1M context) --- complex_tokenization/graph.py | 2 +- tests/test_singleton_perf.py | 17 ++++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/complex_tokenization/graph.py b/complex_tokenization/graph.py index 1a174d0..b07371d 100644 --- a/complex_tokenization/graph.py +++ b/complex_tokenization/graph.py @@ -21,7 +21,7 @@ def __new__(cls, *args, **kwargs): if not GraphSettings.USE_SINGLETONS: return super().__new__(cls) - key = (cls, args, tuple(sorted(kwargs.items()))) + key = (cls,) + args + tuple(kwargs.values()) if key not in cls._instances: cls._instances[key] = super().__new__(cls) return cls._instances[key] diff --git a/tests/test_singleton_perf.py b/tests/test_singleton_perf.py index 6ae06a9..02475d3 100644 --- a/tests/test_singleton_perf.py +++ b/tests/test_singleton_perf.py @@ -28,11 +28,7 @@ def test_singletons_produce_same_merges(self): merges_on = train_with_settings(texts, use_singletons=True) assert merges_off == merges_on - def test_singleton_new_is_slower_than_regular_new(self): - """The cache key construction in __new__ is more expensive than just - creating a fresh frozen dataclass. 
This is the root cause of singleton - overhead: every Node() call pays for building a tuple key and a dict - lookup, which costs more than allocating a small frozen object.""" + def test_singleton_overhead_under_2x(self): from complex_tokenization.graph import Node n = 50_000 @@ -49,6 +45,13 @@ def test_singleton_new_is_slower_than_regular_new(self): Node(value=bytes([i % 256])) time_on = time.perf_counter() - start - assert time_on > time_off, ( - f"Expected singleton __new__ to be slower: {time_on:.4f}s vs {time_off:.4f}s" + assert time_on < time_off * 2, ( + f"Singleton overhead too high: {time_on:.4f}s vs {time_off:.4f}s" ) + + def test_singletons_at_scale(self): + """10k copies of 'the' — singletons must produce identical merges.""" + texts = ["the " * 10000] + merges_off = train_with_settings(texts, use_singletons=False, num_merges=5) + merges_on = train_with_settings(texts, use_singletons=True, num_merges=5) + assert merges_off == merges_on From 41e0e7399521b42b762c3865d852394248e252ea Mon Sep 17 00:00:00 2001 From: AmitMY Date: Wed, 8 Apr 2026 17:28:39 +0200 Subject: [PATCH 4/4] =?UTF-8?q?refactor:=20remove=20all=20singleton=20code?= =?UTF-8?q?=20=E2=80=94=20benchmarks=20showed=20no=20benefit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove _instances cache and __new__ override from GraphVertex - Remove identity-based __eq__/__hash__ from GraphVertex base class (dataclass subclasses already generate proper field-based versions) - Remove USE_SINGLETONS from GraphSettings - Delete test_singletons.py, test_singleton_perf.py, bench_singletons.py - Clean up conftest.py and bne.py Singletons added ~15-20% overhead due to cache key construction cost, with no measurable benefit for training performance. 
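For illustration only (a hypothetical stand-in, not the real
complex_tokenization.graph.Node), a frozen slotted dataclass already provides
field-based equality and hashing on its own, so no instance cache is needed
to get deduplication semantics:

    from dataclasses import dataclass

    @dataclass(frozen=True, slots=True)
    class Node:
        value: bytes

    a, b = Node(value=b"a"), Node(value=b"a")
    assert a == b and hash(a) == hash(b)  # generated field-based __eq__/__hash__
    assert a is not b                     # distinct objects, no shared instance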
Co-Authored-By: Claude Opus 4.6 (1M context) --- benchmarks/bench_singletons.py | 62 ------------------------- complex_tokenization/examples/bne.py | 1 - complex_tokenization/graph.py | 17 ------- complex_tokenization/graphs/settings.py | 1 - tests/conftest.py | 10 ---- tests/test_singleton_perf.py | 57 ----------------------- tests/test_singletons.py | 36 -------------- 7 files changed, 184 deletions(-) delete mode 100644 benchmarks/bench_singletons.py delete mode 100644 tests/test_singleton_perf.py delete mode 100644 tests/test_singletons.py diff --git a/benchmarks/bench_singletons.py b/benchmarks/bench_singletons.py deleted file mode 100644 index 731ecbf..0000000 --- a/benchmarks/bench_singletons.py +++ /dev/null @@ -1,62 +0,0 @@ -"""Benchmark singleton vs non-singleton graph construction and training.""" - -import time -import tracemalloc - -from complex_tokenization.examples.utils import text_dataset -from complex_tokenization.graph import GraphVertex -from complex_tokenization.graphs.settings import GraphSettings -from complex_tokenization.graphs.units import utf8_clusters -from complex_tokenization.graphs.words import words -from complex_tokenization.trainer import Trainer - - -def bench_singletons(texts, num_merges, use_singletons): - GraphVertex._instances.clear() - GraphSettings.USE_SINGLETONS = use_singletons - GraphSettings.ONLY_MINIMAL_MERGES = True - GraphSettings.MAX_MERGE_SIZE = 2 - - tracemalloc.start() - start = time.perf_counter() - - graphs = tuple([words(text, connected=False, units=utf8_clusters) for text in texts]) - graph_time = time.perf_counter() - start - - trainer = Trainer(graphs=graphs) - trainer.train(num_merges=num_merges) - - elapsed = time.perf_counter() - start - _, peak_mem = tracemalloc.get_traced_memory() - tracemalloc.stop() - - cache_size = len(GraphVertex._instances) - GraphVertex._instances.clear() - GraphSettings.USE_SINGLETONS = False - - return len(trainer.merges), elapsed, graph_time, peak_mem, cache_size - - -def run(): - for num_samples in [10, 50]: - texts = list(text_dataset(max_samples=num_samples)) - total_chars = sum(len(t) for t in texts) - - print(f"\n{'='*75}") - print(f"Singleton Benchmark: {num_samples} samples, {total_chars:,} chars") - print(f"{'='*75}") - print(f"{'Mode':25s} {'Merges':>6s} {'Graph':>7s} {'Total':>7s} {'Peak Mem':>10s} {'Cache':>7s}") - print("-" * 75) - - for num_merges in [50, 100]: - for use_singletons in [False, True]: - label = f"{'singleton' if use_singletons else 'no-singleton'} m={num_merges}" - merges, elapsed, graph_time, peak_mem, cache_size = bench_singletons( - texts, num_merges, use_singletons - ) - print(f"{label:25s} {merges:>6d} {graph_time:>6.3f}s {elapsed:>6.3f}s " - f"{peak_mem / 1024 / 1024:>8.2f} MB {cache_size:>7d}") - - -if __name__ == "__main__": - run() diff --git a/complex_tokenization/examples/bne.py b/complex_tokenization/examples/bne.py index f9ac265..c91c0f4 100644 --- a/complex_tokenization/examples/bne.py +++ b/complex_tokenization/examples/bne.py @@ -19,7 +19,6 @@ def train_bne_tokenizer(texts: list[str], GraphSettings.ONLY_MINIMAL_MERGES = True GraphSettings.MAX_MERGE_SIZE = n - GraphSettings.USE_SINGLETONS = False graphs = tuple(words(text, connected=connected, units=units) for text in texts) diff --git a/complex_tokenization/graph.py b/complex_tokenization/graph.py index b07371d..919a9b0 100644 --- a/complex_tokenization/graph.py +++ b/complex_tokenization/graph.py @@ -15,17 +15,6 @@ def dot_escape(s: str) -> str: class GraphVertex: - _instances = {} # Singleton pattern - - 
def __new__(cls, *args, **kwargs): - if not GraphSettings.USE_SINGLETONS: - return super().__new__(cls) - - key = (cls,) + args + tuple(kwargs.values()) - if key not in cls._instances: - cls._instances[key] = super().__new__(cls) - return cls._instances[key] - def __bytes__(self): raise NotImplementedError @@ -36,12 +25,6 @@ def __str__(self): return token_replacement return self_str - def __eq__(self, other): - return self is other - - def __hash__(self): - return id(self) - def dot(self, level=0) -> Iterable[str]: raise NotImplementedError diff --git a/complex_tokenization/graphs/settings.py b/complex_tokenization/graphs/settings.py index dac35c2..b0498b2 100644 --- a/complex_tokenization/graphs/settings.py +++ b/complex_tokenization/graphs/settings.py @@ -1,4 +1,3 @@ class GraphSettings: - USE_SINGLETONS = False # speeds up computation but hurts visualization MAX_MERGE_SIZE = 2 ONLY_MINIMAL_MERGES = True diff --git a/tests/conftest.py b/tests/conftest.py index 56c70ec..833776e 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -6,19 +6,9 @@ @pytest.fixture(autouse=True) def reset_graph_settings(): original = { - "USE_SINGLETONS": GraphSettings.USE_SINGLETONS, "MAX_MERGE_SIZE": GraphSettings.MAX_MERGE_SIZE, "ONLY_MINIMAL_MERGES": GraphSettings.ONLY_MINIMAL_MERGES, } yield - GraphSettings.USE_SINGLETONS = original["USE_SINGLETONS"] GraphSettings.MAX_MERGE_SIZE = original["MAX_MERGE_SIZE"] GraphSettings.ONLY_MINIMAL_MERGES = original["ONLY_MINIMAL_MERGES"] - - -@pytest.fixture(autouse=True) -def clear_singleton_cache(): - from complex_tokenization.graph import GraphVertex - GraphVertex._instances.clear() - yield - GraphVertex._instances.clear() diff --git a/tests/test_singleton_perf.py b/tests/test_singleton_perf.py deleted file mode 100644 index 02475d3..0000000 --- a/tests/test_singleton_perf.py +++ /dev/null @@ -1,57 +0,0 @@ -"""Test that singletons produce identical results and measure their overhead.""" - -import time - -from complex_tokenization.graph import GraphVertex -from complex_tokenization.graphs.settings import GraphSettings -from complex_tokenization.graphs.units import utf8_clusters -from complex_tokenization.graphs.words import words -from complex_tokenization.trainer import Trainer - - -def train_with_settings(texts, use_singletons, num_merges=20): - GraphVertex._instances.clear() - GraphSettings.USE_SINGLETONS = use_singletons - GraphSettings.ONLY_MINIMAL_MERGES = True - GraphSettings.MAX_MERGE_SIZE = 2 - - graphs = tuple([words(text, connected=False, units=utf8_clusters) for text in texts]) - trainer = Trainer(graphs=graphs) - trainer.train(num_merges=num_merges) - return trainer.get_merges() - - -class TestSingletonPerformance: - def test_singletons_produce_same_merges(self): - texts = ["the teacher teaches the thick thing", "hello world test"] - merges_off = train_with_settings(texts, use_singletons=False) - merges_on = train_with_settings(texts, use_singletons=True) - assert merges_off == merges_on - - def test_singleton_overhead_under_2x(self): - from complex_tokenization.graph import Node - - n = 50_000 - GraphSettings.USE_SINGLETONS = False - start = time.perf_counter() - for i in range(n): - Node(value=bytes([i % 256])) - time_off = time.perf_counter() - start - - GraphVertex._instances.clear() - GraphSettings.USE_SINGLETONS = True - start = time.perf_counter() - for i in range(n): - Node(value=bytes([i % 256])) - time_on = time.perf_counter() - start - - assert time_on < time_off * 2, ( - f"Singleton overhead too high: {time_on:.4f}s vs {time_off:.4f}s" - 
) - - def test_singletons_at_scale(self): - """10k copies of 'the' — singletons must produce identical merges.""" - texts = ["the " * 10000] - merges_off = train_with_settings(texts, use_singletons=False, num_merges=5) - merges_on = train_with_settings(texts, use_singletons=True, num_merges=5) - assert merges_off == merges_on diff --git a/tests/test_singletons.py b/tests/test_singletons.py deleted file mode 100644 index 6555bbe..0000000 --- a/tests/test_singletons.py +++ /dev/null @@ -1,36 +0,0 @@ -from complex_tokenization.graph import Node, NodesSequence -from complex_tokenization.graphs.settings import GraphSettings -from complex_tokenization.graphs.units import utf8 - - -class TestSingletons: - def test_singletons_off_creates_distinct_objects(self): - GraphSettings.USE_SINGLETONS = False - a = Node(value=b'a') - b = Node(value=b'a') - assert a == b - assert a is not b - - def test_singletons_on_returns_same_object(self): - GraphSettings.USE_SINGLETONS = True - a = Node(value=b'a') - b = Node(value=b'a') - assert a is b - - def test_singletons_different_values_different_objects(self): - GraphSettings.USE_SINGLETONS = True - a = Node(value=b'a') - b = Node(value=b'b') - assert a is not b - - def test_singletons_different_classes_not_shared(self): - GraphSettings.USE_SINGLETONS = True - node = Node(value=b'a') - seq = NodesSequence(nodes=(node,)) - assert type(node) is not type(seq) - - def test_singleton_merge_preserves_identity(self): - GraphSettings.USE_SINGLETONS = True - graph = utf8("aa") - assert isinstance(graph, NodesSequence) - assert graph.nodes[0] is graph.nodes[1]