From 05398ab1c6607609ced8a44354dc1f84537366f0 Mon Sep 17 00:00:00 2001 From: AmitMY Date: Wed, 8 Apr 2026 12:10:13 +0200 Subject: [PATCH] test: add GraphSettings isolation fixture and edge case tests - Add conftest.py with autouse fixtures to reset GraphSettings and clear singleton cache between tests, preventing cross-test contamination - Add singleton tests: identity, distinctness, cross-class isolation - Add trainer tests: error handling, merge exhaustion, single node, multiple graphs, readable merge output - Add characters() bytes roundtrip tests for ASCII and non-ASCII Co-Authored-By: Claude Opus 4.6 (1M context) --- tests/conftest.py | 24 ++++++++++++ tests/test_singletons.py | 36 ++++++++++++++++++ tests/test_trainer.py | 79 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 139 insertions(+) create mode 100644 tests/conftest.py create mode 100644 tests/test_singletons.py create mode 100644 tests/test_trainer.py diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..56c70ec --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,24 @@ +import pytest + +from complex_tokenization.graphs.settings import GraphSettings + + +@pytest.fixture(autouse=True) +def reset_graph_settings(): + original = { + "USE_SINGLETONS": GraphSettings.USE_SINGLETONS, + "MAX_MERGE_SIZE": GraphSettings.MAX_MERGE_SIZE, + "ONLY_MINIMAL_MERGES": GraphSettings.ONLY_MINIMAL_MERGES, + } + yield + GraphSettings.USE_SINGLETONS = original["USE_SINGLETONS"] + GraphSettings.MAX_MERGE_SIZE = original["MAX_MERGE_SIZE"] + GraphSettings.ONLY_MINIMAL_MERGES = original["ONLY_MINIMAL_MERGES"] + + +@pytest.fixture(autouse=True) +def clear_singleton_cache(): + from complex_tokenization.graph import GraphVertex + GraphVertex._instances.clear() + yield + GraphVertex._instances.clear() diff --git a/tests/test_singletons.py b/tests/test_singletons.py new file mode 100644 index 0000000..fd24696 --- /dev/null +++ b/tests/test_singletons.py @@ -0,0 +1,36 @@ +from complex_tokenization.graph import GraphVertex, Node, NodesSequence +from complex_tokenization.graphs.settings import GraphSettings +from complex_tokenization.graphs.units import utf8 + + +class TestSingletons: + def test_singletons_off_creates_distinct_objects(self): + GraphSettings.USE_SINGLETONS = False + a = Node(value=b'a') + b = Node(value=b'a') + assert a == b + assert a is not b + + def test_singletons_on_returns_same_object(self): + GraphSettings.USE_SINGLETONS = True + a = Node(value=b'a') + b = Node(value=b'a') + assert a is b + + def test_singletons_different_values_different_objects(self): + GraphSettings.USE_SINGLETONS = True + a = Node(value=b'a') + b = Node(value=b'b') + assert a is not b + + def test_singletons_different_classes_not_shared(self): + GraphSettings.USE_SINGLETONS = True + node = Node(value=b'a') + seq = NodesSequence(nodes=(node,)) + assert type(node) is not type(seq) + + def test_singleton_merge_preserves_identity(self): + GraphSettings.USE_SINGLETONS = True + graph = utf8("aa") + assert isinstance(graph, NodesSequence) + assert graph.nodes[0] is graph.nodes[1] diff --git a/tests/test_trainer.py b/tests/test_trainer.py new file mode 100644 index 0000000..c0c4191 --- /dev/null +++ b/tests/test_trainer.py @@ -0,0 +1,79 @@ +import pytest + +from complex_tokenization.graph import Node, NodesSequence +from complex_tokenization.graphs.settings import GraphSettings +from complex_tokenization.graphs.units import utf8 +from complex_tokenization.trainer import Trainer + + +class TestTrainer: + def test_trainer_requires_graph_or_graphs(self): + with pytest.raises(ValueError, match="Must provide either graph or graphs"): + Trainer() + + def test_trainer_rejects_both_graph_and_graphs(self): + graph = utf8("test") + with pytest.raises(ValueError, match="Must provide either graph or graphs, not both"): + Trainer(graph=graph, graphs=(graph,)) + + def test_train_single_node_no_merges(self): + GraphSettings.MAX_MERGE_SIZE = 2 + GraphSettings.ONLY_MINIMAL_MERGES = True + node = Node(value=b'a') + trainer = Trainer(graph=node) + trainer.train(num_merges=10) + assert len(trainer.merges) == 0 + + def test_train_stops_when_no_merges_left(self): + GraphSettings.MAX_MERGE_SIZE = 2 + GraphSettings.ONLY_MINIMAL_MERGES = True + graph = utf8("ab") + trainer = Trainer(graph=graph) + trainer.train(num_merges=100) + assert len(trainer.merges) == 1 + + def test_train_merge_reduces_graph(self): + GraphSettings.MAX_MERGE_SIZE = 2 + GraphSettings.ONLY_MINIMAL_MERGES = True + graph = utf8("aaa") + trainer = Trainer(graph=graph) + trainer.train(num_merges=1) + assert len(trainer.merges) == 1 + assert isinstance(trainer.graph, NodesSequence) + + def test_train_full_merge_to_single_node(self): + GraphSettings.MAX_MERGE_SIZE = 2 + GraphSettings.ONLY_MINIMAL_MERGES = True + graph = utf8("aa") + trainer = Trainer(graph=graph) + trainer.train(num_merges=1) + assert len(trainer.merges) == 1 + assert isinstance(trainer.graph, Node) + + def test_get_merges_returns_readable(self): + GraphSettings.MAX_MERGE_SIZE = 2 + GraphSettings.ONLY_MINIMAL_MERGES = True + graph = utf8("abab") + trainer = Trainer(graph=graph) + trainer.train(num_merges=1) + merges = trainer.get_merges() + assert len(merges) == 1 + assert merges[0] == ('a', 'b') + + def test_train_with_multiple_graphs(self): + GraphSettings.MAX_MERGE_SIZE = 2 + GraphSettings.ONLY_MINIMAL_MERGES = True + graphs = (utf8("ab"), utf8("ab"), utf8("cd")) + trainer = Trainer(graphs=graphs) + trainer.train(num_merges=1) + assert trainer.get_merges()[0] == ('a', 'b') + + def test_characters_produce_valid_bytes(self): + from complex_tokenization.graphs.units import characters + graph = characters("hello") + assert bytes(graph) == b"hello" + + def test_characters_non_ascii_produce_valid_bytes(self): + from complex_tokenization.graphs.units import characters + graph = characters("שלום") + assert bytes(graph) == "שלום".encode("utf-8")