From 0eff950a283aa8a50edca68ea6681e936e0e4bb9 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sun, 26 Oct 2025 21:37:54 +0000 Subject: [PATCH 01/12] feat: add DuckDB store support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add DuckDBStore implementation with BaseStore and BaseContextManagerStore - Support both in-memory (:memory:) and persistent (file-based) storage - Include seed support following current store patterns - Add comprehensive test suite (294 tests passing) - Update pyproject.toml with duckdb>=1.0.0 optional dependency - Mark as unstable API (_stable_api = False) Implements #11 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-authored-by: William Easton --- key-value/key-value-aio/pyproject.toml | 3 +- .../key_value/aio/stores/duckdb/__init__.py | 3 + .../src/key_value/aio/stores/duckdb/store.py | 188 ++++++++++++++++++ .../tests/stores/duckdb/__init__.py | 1 + .../tests/stores/duckdb/test_duckdb.py | 142 +++++++++++++ uv.lock | 44 +++- 6 files changed, 376 insertions(+), 5 deletions(-) create mode 100644 key-value/key-value-aio/src/key_value/aio/stores/duckdb/__init__.py create mode 100644 key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py create mode 100644 key-value/key-value-aio/tests/stores/duckdb/__init__.py create mode 100644 key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py diff --git a/key-value/key-value-aio/pyproject.toml b/key-value/key-value-aio/pyproject.toml index 199261c9..5fc7c1d3 100644 --- a/key-value/key-value-aio/pyproject.toml +++ b/key-value/key-value-aio/pyproject.toml @@ -48,6 +48,7 @@ rocksdb = [ "rocksdict>=0.3.24 ; python_version >= '3.12'", # RocksDB 0.3.24 is the first version to support Python 3.13 "rocksdict>=0.3.2 ; python_version < '3.12'" ] +duckdb = ["duckdb>=1.0.0"] wrappers-encryption = ["cryptography>=45.0.0"] [tool.pytest.ini_options] @@ -67,7 +68,7 @@ env_files = [".env"] [dependency-groups] dev = [ - "py-key-value-aio[memory,disk,redis,elasticsearch,memcached,mongodb,vault,dynamodb,rocksdb]", + "py-key-value-aio[memory,disk,redis,elasticsearch,memcached,mongodb,vault,dynamodb,rocksdb,duckdb]", "py-key-value-aio[valkey]; platform_system != 'Windows'", "py-key-value-aio[keyring]", "py-key-value-aio[pydantic]", diff --git a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/__init__.py b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/__init__.py new file mode 100644 index 00000000..797a7fb4 --- /dev/null +++ b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/__init__.py @@ -0,0 +1,3 @@ +from key_value.aio.stores.duckdb.store import DuckDBStore + +__all__ = ["DuckDBStore"] diff --git a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py new file mode 100644 index 00000000..1ea761a2 --- /dev/null +++ b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py @@ -0,0 +1,188 @@ +from pathlib import Path +from typing import overload + +from key_value.shared.utils.managed_entry import ManagedEntry +from typing_extensions import override + +from key_value.aio.stores.base import SEED_DATA_TYPE, BaseContextManagerStore, BaseStore + +try: + import duckdb +except ImportError as e: + msg = "DuckDBStore requires py-key-value-aio[duckdb]" + raise ImportError(msg) from e + + +class DuckDBStore(BaseContextManagerStore, BaseStore): + """A DuckDB-based key-value store supporting both in-memory and persistent
storage. + + DuckDB is an in-process SQL OLAP database that provides excellent performance + for analytical workloads while supporting standard SQL operations. This store + can operate in memory-only mode or persist data to disk. + """ + + _connection: duckdb.DuckDBPyConnection + _is_closed: bool + + @overload + def __init__( + self, + *, + connection: duckdb.DuckDBPyConnection, + default_collection: str | None = None, + seed: SEED_DATA_TYPE | None = None, + ) -> None: + """Initialize the DuckDB store with an existing connection. + + Args: + connection: An existing DuckDB connection to use. + default_collection: The default collection to use if no collection is provided. + seed: Optional seed data to pre-populate the store. + """ + + @overload + def __init__( + self, + *, + database_path: Path | str | None = None, + default_collection: str | None = None, + seed: SEED_DATA_TYPE | None = None, + ) -> None: + """Initialize the DuckDB store with a database path. + + Args: + database_path: Path to the database file. If None or ':memory:', uses in-memory database. + default_collection: The default collection to use if no collection is provided. + seed: Optional seed data to pre-populate the store. + """ + + def __init__( + self, + *, + connection: duckdb.DuckDBPyConnection | None = None, + database_path: Path | str | None = None, + default_collection: str | None = None, + seed: SEED_DATA_TYPE | None = None, + ) -> None: + """Initialize the DuckDB store. + + Args: + connection: An existing DuckDB connection to use. + database_path: Path to the database file. If None or ':memory:', uses in-memory database. + default_collection: The default collection to use if no collection is provided. + seed: Optional seed data to pre-populate the store. + """ + if connection is not None and database_path is not None: + msg = "Provide only one of connection or database_path" + raise ValueError(msg) + + if connection is not None: + self._connection = connection + else: + # Convert Path to string if needed + if isinstance(database_path, Path): + database_path = str(database_path) + + # Use in-memory database if no path specified + if database_path is None or database_path == ":memory:": + self._connection = duckdb.connect(":memory:") + else: + self._connection = duckdb.connect(database=database_path) + + self._is_closed = False + self._stable_api = False + + super().__init__(default_collection=default_collection, seed=seed) + + @override + async def _setup(self) -> None: + """Initialize the database schema for key-value storage.""" + # Create the main table for storing key-value entries + self._connection.execute(""" + CREATE TABLE IF NOT EXISTS kv_entries ( + collection VARCHAR NOT NULL, + key VARCHAR NOT NULL, + value_json TEXT NOT NULL, + created_at DOUBLE, + ttl DOUBLE, + expires_at DOUBLE, + PRIMARY KEY (collection, key) + ) + """) + + # Create index for efficient collection queries + self._connection.execute(""" + CREATE INDEX IF NOT EXISTS idx_kv_collection + ON kv_entries(collection) + """) + + # Create index for expiration-based queries + self._connection.execute(""" + CREATE INDEX IF NOT EXISTS idx_kv_expires_at + ON kv_entries(expires_at) + """) + + @override + async def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry | None: + """Retrieve a managed entry by key from the specified collection.""" + result = self._connection.execute( + "SELECT value_json FROM kv_entries WHERE collection = ? 
AND key = ?", + [collection, key], + ).fetchone() + + if result is None: + return None + + value_json = result[0] + return ManagedEntry.from_json(json_str=value_json) + + @override + async def _put_managed_entry( + self, + *, + key: str, + collection: str, + managed_entry: ManagedEntry, + ) -> None: + """Store a managed entry by key in the specified collection.""" + # Insert or replace the entry + self._connection.execute( + """ + INSERT OR REPLACE INTO kv_entries + (collection, key, value_json, created_at, ttl, expires_at) + VALUES (?, ?, ?, ?, ?, ?) + """, + [ + collection, + key, + managed_entry.to_json(), + managed_entry.created_at.timestamp() if managed_entry.created_at else None, + managed_entry.ttl, + managed_entry.expires_at.timestamp() if managed_entry.expires_at else None, + ], + ) + + @override + async def _delete_managed_entry(self, *, key: str, collection: str) -> bool: + """Delete a managed entry by key from the specified collection.""" + result = self._connection.execute( + "DELETE FROM kv_entries WHERE collection = ? AND key = ? RETURNING key", + [collection, key], + ) + + # Check if any rows were deleted by counting returned rows + deleted_rows = result.fetchall() + return len(deleted_rows) > 0 + + @override + async def _close(self) -> None: + """Close the DuckDB connection.""" + if not self._is_closed: + self._connection.close() + self._is_closed = True + + def __del__(self) -> None: + """Clean up the DuckDB connection on deletion.""" + if not self._is_closed: + self._connection.close() + self._is_closed = True diff --git a/key-value/key-value-aio/tests/stores/duckdb/__init__.py b/key-value/key-value-aio/tests/stores/duckdb/__init__.py new file mode 100644 index 00000000..d735ddb5 --- /dev/null +++ b/key-value/key-value-aio/tests/stores/duckdb/__init__.py @@ -0,0 +1 @@ +# DuckDB store tests diff --git a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py new file mode 100644 index 00000000..c54d3a83 --- /dev/null +++ b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py @@ -0,0 +1,142 @@ +from collections.abc import AsyncGenerator +from pathlib import Path +from tempfile import TemporaryDirectory + +import pytest +from typing_extensions import override + +from key_value.aio.stores.base import BaseStore +from key_value.aio.stores.duckdb import DuckDBStore +from tests.stores.base import BaseStoreTests, ContextManagerStoreTestMixin + + +class TestDuckDBStore(ContextManagerStoreTestMixin, BaseStoreTests): + @override + @pytest.fixture + async def store(self) -> AsyncGenerator[DuckDBStore, None]: + """Test with in-memory DuckDB database.""" + duckdb_store = DuckDBStore() + yield duckdb_store + await duckdb_store.close() + + @pytest.mark.skip(reason="Local disk stores are unbounded") + @override + async def test_not_unbounded(self, store: BaseStore): ... + + +class TestDuckDBStorePersistent(ContextManagerStoreTestMixin, BaseStoreTests): + @override + @pytest.fixture + async def store(self) -> AsyncGenerator[DuckDBStore, None]: + """Test with persistent DuckDB database file.""" + with TemporaryDirectory() as temp_dir: + db_path = Path(temp_dir) / "test.db" + duckdb_store = DuckDBStore(database_path=db_path) + yield duckdb_store + await duckdb_store.close() + + @pytest.mark.skip(reason="Local disk stores are unbounded") + @override + async def test_not_unbounded(self, store: BaseStore): ... 
class TestDuckDBStoreSpecific: """Test DuckDB-specific functionality.""" @pytest.fixture async def store(self) -> AsyncGenerator[DuckDBStore, None]: """Provide DuckDB store instance.""" duckdb_store = DuckDBStore() yield duckdb_store await duckdb_store.close() async def test_database_path_initialization(self): """Test that store can be initialized with different database path options.""" # In-memory (default) store1 = DuckDBStore() await store1.put(collection="test", key="key1", value={"test": "value1"}) result1 = await store1.get(collection="test", key="key1") assert result1 == {"test": "value1"} await store1.close() # Explicit in-memory store2 = DuckDBStore(database_path=":memory:") await store2.put(collection="test", key="key2", value={"test": "value2"}) result2 = await store2.get(collection="test", key="key2") assert result2 == {"test": "value2"} await store2.close() async def test_persistent_database(self): """Test that data persists across store instances when using file database.""" with TemporaryDirectory() as temp_dir: db_path = Path(temp_dir) / "persist_test.db" # Store data in first instance store1 = DuckDBStore(database_path=db_path) await store1.put(collection="test", key="persist_key", value={"data": "persistent"}) await store1.close() # Create second instance with same database file store2 = DuckDBStore(database_path=db_path) result = await store2.get(collection="test", key="persist_key") await store2.close() assert result == {"data": "persistent"} async def test_sql_injection_protection(self, store: DuckDBStore): """Test that the store is protected against SQL injection attacks.""" malicious_collection = "test'; DROP TABLE kv_entries; --" malicious_key = "key'; DELETE FROM kv_entries; --" # These operations should not cause SQL injection await store.put(collection=malicious_collection, key=malicious_key, value={"safe": "data"}) result = await store.get(collection=malicious_collection, key=malicious_key) assert result == {"safe": "data"} # Verify the table still exists and other data is safe await store.put(collection="normal", key="normal_key", value={"normal": "data"}) normal_result = await store.get(collection="normal", key="normal_key") assert normal_result == {"normal": "data"} async def test_large_data_storage(self, store: DuckDBStore): """Test storing and retrieving large data values.""" # Create a large value (1MB of data) large_value = {"large_data": "x" * (1024 * 1024)} await store.put(collection="test", key="large_key", value=large_value) result = await store.get(collection="test", key="large_key") assert result == large_value async def test_unicode_support(self, store: DuckDBStore): """Test that the store properly handles Unicode characters.""" unicode_data = { "english": "Hello World", "chinese": "你好世界", "japanese": "こんにちは世界", "arabic": "مرحبا بالعالم", "emoji": "🌍🚀💻", "special": "Special chars: !@#$%^&*()_+-={}[]|\\:;\"'<>?,./", } await store.put(collection="unicode_test", key="unicode_key", value=unicode_data) result = await store.get(collection="unicode_test", key="unicode_key") assert result == unicode_data async def test_connection_initialization(self): """Test that store can be initialized with existing DuckDB connection.""" import duckdb conn = duckdb.connect(":memory:") store = DuckDBStore(connection=conn) await
store.put(collection="test", key="conn_test", value={"test": "value"}) + result = await store.get(collection="test", key="conn_test") + assert result == {"test": "value"} + + await store.close() + + @pytest.mark.skip(reason="Local disk stores are unbounded") + @override + async def test_not_unbounded(self, store: BaseStore): ... diff --git a/uv.lock b/uv.lock index 448b117e..47adb93a 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 2 +revision = 3 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.12' and sys_platform != 'win32'", @@ -674,6 +674,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774, upload-time = "2024-05-23T11:13:55.01Z" }, ] +[[package]] +name = "duckdb" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ea/e7/21cf50a3d52ffceee1f0bcc3997fa96a5062e6bab705baee4f6c4e33cce5/duckdb-1.4.1.tar.gz", hash = "sha256:f903882f045d057ebccad12ac69975952832edfe133697694854bb784b8d6c76", size = 18461687, upload-time = "2025-10-07T10:37:28.605Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/cc/00a07de0e33d16763edd4132d7c8a2f9efd57a2f296a25a948f239a1fadf/duckdb-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:296b4fff3908fb4c47b0aa1d77bd1933375e75401009d2dc81af8e7a0b8a05b4", size = 29062814, upload-time = "2025-10-07T10:36:14.261Z" }, + { url = "https://files.pythonhosted.org/packages/17/ea/fb0fda8886d1928f1b2a53a1163ef94f6f4b41f6d8b29eee457acfc2fa67/duckdb-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0b4182800092115feee5d71a8691efb283d3c9f5eb0b36362b308ef007a12222", size = 16161652, upload-time = "2025-10-07T10:36:17.358Z" }, + { url = "https://files.pythonhosted.org/packages/b4/5f/052e6436a71f461e61cd3a982954c029145a84b58cefa1dfb3eb2d96e4fc/duckdb-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:67cc3b6c7f7ba07a69e9331b8ccea7a60cbcd4204bb473e5da9b71588bd2eca9", size = 13753030, upload-time = "2025-10-07T10:36:19.782Z" }, + { url = "https://files.pythonhosted.org/packages/c2/fd/3ae3c89d0f6ad54c0be4430e572306fbfc9f173c97b23c5025a540449325/duckdb-1.4.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0cef0cee7030b561640cb9af718f8841b19cdd2aa020d53561057b5743bea90b", size = 18487683, upload-time = "2025-10-07T10:36:22.375Z" }, + { url = "https://files.pythonhosted.org/packages/d4/3c/eef454cd7c3880c2d55b50e18a9c7a213bf91ded79efcfb573d8d6dd8a47/duckdb-1.4.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2bf93347f37a46bacce6ac859d651dbf5731e2c94a64ab358300425b09e3de23", size = 20487080, upload-time = "2025-10-07T10:36:24.692Z" }, + { url = "https://files.pythonhosted.org/packages/bb/5b/b619f4c986a1cb0b06315239da9ce5fd94a20c07a344d03e2635d56a6967/duckdb-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:2e60d2361f978908a3d96eebaf1f4b346f283afcc467351aae50ea45ca293a2b", size = 12324436, upload-time = "2025-10-07T10:36:27.458Z" }, + { url = "https://files.pythonhosted.org/packages/d9/52/606f13fa9669a24166d2fe523e28982d8ef9039874b4de774255c7806d1f/duckdb-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:605d563c1d5203ca992497cd33fb386ac3d533deca970f9dcf539f62a34e22a9", size = 29065894, upload-time = "2025-10-07T10:36:29.837Z" }, + { url = 
"https://files.pythonhosted.org/packages/84/57/138241952ece868b9577e607858466315bed1739e1fbb47205df4dfdfd88/duckdb-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d3305c7c4b70336171de7adfdb50431f23671c000f11839b580c4201d9ce6ef5", size = 16163720, upload-time = "2025-10-07T10:36:32.241Z" }, + { url = "https://files.pythonhosted.org/packages/a3/81/afa3a0a78498a6f4acfea75c48a70c5082032d9ac87822713d7c2d164af1/duckdb-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a063d6febbe34b32f1ad2e68822db4d0e4b1102036f49aaeeb22b844427a75df", size = 13756223, upload-time = "2025-10-07T10:36:34.673Z" }, + { url = "https://files.pythonhosted.org/packages/47/dd/5f6064fbd9248e37a3e806a244f81e0390ab8f989d231b584fb954f257fc/duckdb-1.4.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1ffcaaf74f7d1df3684b54685cbf8d3ce732781c541def8e1ced304859733ae", size = 18487022, upload-time = "2025-10-07T10:36:36.759Z" }, + { url = "https://files.pythonhosted.org/packages/a1/10/b54969a1c42fd9344ad39228d671faceb8aa9f144b67cd9531a63551757f/duckdb-1.4.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:685d3d1599dc08160e0fa0cf09e93ac4ff8b8ed399cb69f8b5391cd46b5b207c", size = 20491004, upload-time = "2025-10-07T10:36:39.318Z" }, + { url = "https://files.pythonhosted.org/packages/ed/d5/7332ae8f804869a4e895937821b776199a283f8d9fc775fd3ae5a0558099/duckdb-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:78f1d28a15ae73bd449c43f80233732adffa49be1840a32de8f1a6bb5b286764", size = 12327619, upload-time = "2025-10-07T10:36:41.509Z" }, + { url = "https://files.pythonhosted.org/packages/0e/6c/906a3fe41cd247b5638866fc1245226b528de196588802d4df4df1e6e819/duckdb-1.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cd1765a7d180b7482874586859fc23bc9969d7d6c96ced83b245e6c6f49cde7f", size = 29076820, upload-time = "2025-10-07T10:36:43.782Z" }, + { url = "https://files.pythonhosted.org/packages/66/c7/01dd33083f01f618c2a29f6dd068baf16945b8cbdb132929d3766610bbbb/duckdb-1.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8ed7a86725185470953410823762956606693c0813bb64e09c7d44dbd9253a64", size = 16167558, upload-time = "2025-10-07T10:36:46.003Z" }, + { url = "https://files.pythonhosted.org/packages/81/e2/f983b4b7ae1dfbdd2792dd31dee9a0d35f88554452cbfc6c9d65e22fdfa9/duckdb-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8a189bdfc64cfb9cc1adfbe4f2dcfde0a4992ec08505ad8ce33c886e4813f0bf", size = 13762226, upload-time = "2025-10-07T10:36:48.55Z" }, + { url = "https://files.pythonhosted.org/packages/ed/34/fb69a7be19b90f573b3cc890961be7b11870b77514769655657514f10a98/duckdb-1.4.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a9090089b6486f7319c92acdeed8acda022d4374032d78a465956f50fc52fabf", size = 18500901, upload-time = "2025-10-07T10:36:52.445Z" }, + { url = "https://files.pythonhosted.org/packages/e4/a5/1395d7b49d5589e85da9a9d7ffd8b50364c9d159c2807bef72d547f0ad1e/duckdb-1.4.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:142552ea3e768048e0e8c832077a545ca07792631c59edaee925e3e67401c2a0", size = 20514177, upload-time = "2025-10-07T10:36:55.358Z" }, + { url = "https://files.pythonhosted.org/packages/c0/21/08f10706d30252753349ec545833fc0cea67c11abd0b5223acf2827f1056/duckdb-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:567f3b3a785a9e8650612461893c49ca799661d2345a6024dda48324ece89ded", size = 12336422, upload-time = "2025-10-07T10:36:57.521Z" }, + { url = 
"https://files.pythonhosted.org/packages/d7/08/705988c33e38665c969f7876b3ca4328be578554aa7e3dc0f34158da3e64/duckdb-1.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:46496a2518752ae0c6c5d75d4cdecf56ea23dd098746391176dd8e42cf157791", size = 29077070, upload-time = "2025-10-07T10:36:59.83Z" }, + { url = "https://files.pythonhosted.org/packages/99/c5/7c9165f1e6b9069441bcda4da1e19382d4a2357783d37ff9ae238c5c41ac/duckdb-1.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1c65ae7e9b541cea07d8075343bcfebdecc29a3c0481aa6078ee63d51951cfcd", size = 16167506, upload-time = "2025-10-07T10:37:02.24Z" }, + { url = "https://files.pythonhosted.org/packages/38/46/267f4a570a0ee3ae6871ddc03435f9942884284e22a7ba9b7cb252ee69b6/duckdb-1.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:598d1a314e34b65d9399ddd066ccce1eeab6a60a2ef5885a84ce5ed62dbaf729", size = 13762330, upload-time = "2025-10-07T10:37:04.581Z" }, + { url = "https://files.pythonhosted.org/packages/15/7b/c4f272a40c36d82df20937d93a1780eb39ab0107fe42b62cba889151eab9/duckdb-1.4.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2f16b8def782d484a9f035fc422bb6f06941ed0054b4511ddcdc514a7fb6a75", size = 18504687, upload-time = "2025-10-07T10:37:06.991Z" }, + { url = "https://files.pythonhosted.org/packages/17/fc/9b958751f0116d7b0406406b07fa6f5a10c22d699be27826d0b896f9bf51/duckdb-1.4.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5a7d0aed068a5c33622a8848857947cab5cfb3f2a315b1251849bac2c74c492", size = 20513823, upload-time = "2025-10-07T10:37:09.349Z" }, + { url = "https://files.pythonhosted.org/packages/30/79/4f544d73fcc0513b71296cb3ebb28a227d22e80dec27204977039b9fa875/duckdb-1.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:280fd663dacdd12bb3c3bf41f3e5b2e5b95e00b88120afabb8b8befa5f335c6f", size = 12336460, upload-time = "2025-10-07T10:37:12.154Z" }, +] + [[package]] name = "elastic-transport" version = "9.2.0" @@ -1379,6 +1411,9 @@ disk = [ { name = "diskcache" }, { name = "pathvalidate" }, ] +duckdb = [ + { name = "duckdb" }, +] dynamodb = [ { name = "aioboto3" }, { name = "types-aiobotocore-dynamodb" }, @@ -1426,7 +1461,7 @@ wrappers-encryption = [ [package.dev-dependencies] dev = [ { name = "py-key-value", extra = ["dev"] }, - { name = "py-key-value-aio", extra = ["disk", "dynamodb", "elasticsearch", "keyring", "memcached", "memory", "mongodb", "pydantic", "redis", "rocksdb", "vault", "wrappers-encryption"] }, + { name = "py-key-value-aio", extra = ["disk", "duckdb", "dynamodb", "elasticsearch", "keyring", "memcached", "memory", "mongodb", "pydantic", "redis", "rocksdb", "vault", "wrappers-encryption"] }, { name = "py-key-value-aio", extra = ["valkey"], marker = "sys_platform != 'win32'" }, ] @@ -1440,6 +1475,7 @@ requires-dist = [ { name = "cryptography", marker = "extra == 'wrappers-encryption'", specifier = ">=45.0.0" }, { name = "dbus-python", marker = "extra == 'keyring-linux'", specifier = ">=1.4.0" }, { name = "diskcache", marker = "extra == 'disk'", specifier = ">=5.6.0" }, + { name = "duckdb", marker = "extra == 'duckdb'", specifier = ">=1.0.0" }, { name = "elasticsearch", marker = "extra == 'elasticsearch'", specifier = ">=9.0.0" }, { name = "hvac", marker = "extra == 'vault'", specifier = ">=2.3.0" }, { name = "keyring", marker = "extra == 'keyring'", specifier = ">=25.6.0" }, @@ -1455,13 +1491,13 @@ requires-dist = [ { name = "types-hvac", marker = "extra == 'vault'", specifier = ">=2.3.0" }, { name = "valkey-glide", marker = "extra == 'valkey'", 
specifier = ">=2.1.0" }, ] -provides-extras = ["memory", "disk", "redis", "mongodb", "valkey", "vault", "memcached", "elasticsearch", "dynamodb", "keyring", "keyring-linux", "pydantic", "rocksdb", "wrappers-encryption"] +provides-extras = ["memory", "disk", "redis", "mongodb", "valkey", "vault", "memcached", "elasticsearch", "dynamodb", "keyring", "keyring-linux", "pydantic", "rocksdb", "duckdb", "wrappers-encryption"] [package.metadata.requires-dev] dev = [ { name = "py-key-value", extras = ["dev"], editable = "." }, { name = "py-key-value-aio", extras = ["keyring"] }, - { name = "py-key-value-aio", extras = ["memory", "disk", "redis", "elasticsearch", "memcached", "mongodb", "vault", "dynamodb", "rocksdb"] }, + { name = "py-key-value-aio", extras = ["memory", "disk", "redis", "elasticsearch", "memcached", "mongodb", "vault", "dynamodb", "rocksdb", "duckdb"] }, { name = "py-key-value-aio", extras = ["pydantic"] }, { name = "py-key-value-aio", extras = ["valkey"], marker = "sys_platform != 'win32'" }, { name = "py-key-value-aio", extras = ["wrappers-encryption"] }, From 5194565b52bf18be102e1773d953e95f6d8010f1 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sun, 26 Oct 2025 23:47:29 +0000 Subject: [PATCH 02/12] fix: Address CodeRabbit PR review feedback for DuckDB store MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add connection ownership tracking with _owns_connection flag - Add closed-state guards to prevent use-after-close errors - Document metadata column duplication rationale - Add error handling to __del__ with try-except - Remove invalid @override decorators from test methods All 294 tests passing ✅ Co-authored-by: William Easton --- .../src/key_value/aio/stores/duckdb/store.py | 50 ++++++++++++++++--- .../tests/stores/duckdb/test_duckdb.py | 3 -- 2 files changed, 44 insertions(+), 9 deletions(-) diff --git a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py index 1ea761a2..67a3044b 100644 --- a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py +++ b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py @@ -19,10 +19,15 @@ class DuckDBStore(BaseContextManagerStore, BaseStore): DuckDB is an in-process SQL OLAP database that provides excellent performance for analytical workloads while supporting standard SQL operations. This store can operate in memory-only mode or persist data to disk. + + Note on connection ownership: When you provide an existing connection, the store + will take ownership and close it when the store is closed or garbage collected. + If you need to reuse a connection, create separate DuckDB connections for each store. """ _connection: duckdb.DuckDBPyConnection _is_closed: bool + _owns_connection: bool @overload def __init__( @@ -34,6 +39,10 @@ def __init__( ) -> None: """Initialize the DuckDB store with an existing connection. + Warning: The store will take ownership of the connection and close it + when the store is closed or garbage collected. If you need to reuse + a connection, create separate DuckDB connections for each store. + Args: connection: An existing DuckDB connection to use. default_collection: The default collection to use if no collection is provided.
@@ -78,6 +87,7 @@ def __init__( if connection is not None: self._connection = connection + self._owns_connection = True # We take ownership even of provided connections else: # Convert Path to string if needed if isinstance(database_path, Path): @@ -88,6 +98,7 @@ def __init__( self._connection = duckdb.connect(":memory:") else: self._connection = duckdb.connect(database=database_path) + self._owns_connection = True self._is_closed = False self._stable_api = False @@ -96,7 +107,18 @@ def __init__( @override async def _setup(self) -> None: - """Initialize the database schema for key-value storage.""" + """Initialize the database schema for key-value storage. + + Note: The schema stores created_at, ttl, and expires_at as separate columns + in addition to the serialized ManagedEntry in value_json. This duplication + is intentional for future features: + - The expires_at column with its index enables efficient expiration-based + cleanup queries (e.g., DELETE FROM kv_entries WHERE expires_at < now()) + - The separate columns allow for metadata queries without deserializing JSON + - Currently, only value_json is read during _get_managed_entry + + This design trades storage space for query flexibility and future extensibility. + """ # Create the main table for storing key-value entries self._connection.execute(""" CREATE TABLE IF NOT EXISTS kv_entries ( @@ -116,7 +138,7 @@ async def _setup(self) -> None: ON kv_entries(collection) """) - # Create index for expiration-based queries + # Create index for expiration-based queries (for future cleanup features) self._connection.execute(""" CREATE INDEX IF NOT EXISTS idx_kv_expires_at ON kv_entries(expires_at) @@ -125,6 +147,10 @@ async def _setup(self) -> None: @override async def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry | None: """Retrieve a managed entry by key from the specified collection.""" + if self._is_closed: + msg = "Cannot operate on closed DuckDBStore" + raise RuntimeError(msg) + result = self._connection.execute( "SELECT value_json FROM kv_entries WHERE collection = ? AND key = ?", [collection, key], @@ -145,6 +171,10 @@ async def _put_managed_entry( managed_entry: ManagedEntry, ) -> None: """Store a managed entry by key in the specified collection.""" + if self._is_closed: + msg = "Cannot operate on closed DuckDBStore" + raise RuntimeError(msg) + # Insert or replace the entry self._connection.execute( """ @@ -165,6 +195,10 @@ async def _put_managed_entry( @override async def _delete_managed_entry(self, *, key: str, collection: str) -> bool: """Delete a managed entry by key from the specified collection.""" + if self._is_closed: + msg = "Cannot operate on closed DuckDBStore" + raise RuntimeError(msg) + result = self._connection.execute( "DELETE FROM kv_entries WHERE collection = ? AND key = ? 
RETURNING key", [collection, key], @@ -177,12 +211,16 @@ async def _delete_managed_entry(self, *, key: str, collection: str) -> bool: @override async def _close(self) -> None: """Close the DuckDB connection.""" - if not self._is_closed: + if not self._is_closed and self._owns_connection: self._connection.close() self._is_closed = True def __del__(self) -> None: """Clean up the DuckDB connection on deletion.""" - if not self._is_closed: - self._connection.close() - self._is_closed = True + try: + if not self._is_closed and self._owns_connection and hasattr(self, "_connection"): + self._connection.close() + self._is_closed = True + except Exception: # noqa: S110 + # Suppress errors during cleanup to avoid issues during interpreter shutdown + pass diff --git a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py index c54d3a83..9b8c5efa 100644 --- a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py +++ b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py @@ -20,7 +20,6 @@ async def store(self) -> AsyncGenerator[DuckDBStore, None]: await duckdb_store.close() @pytest.mark.skip(reason="Local disk stores are unbounded") - @override async def test_not_unbounded(self, store: BaseStore): ... @@ -36,7 +35,6 @@ async def store(self) -> AsyncGenerator[DuckDBStore, None]: await duckdb_store.close() @pytest.mark.skip(reason="Local disk stores are unbounded") - @override async def test_not_unbounded(self, store: BaseStore): ... @@ -138,5 +136,4 @@ async def test_connection_initialization(self): await store.close() @pytest.mark.skip(reason="Local disk stores are unbounded") - @override async def test_not_unbounded(self, store: BaseStore): ... From 7cb568147508278e3e7789ff22832b5b0716b764 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Mon, 27 Oct 2025 00:48:04 +0000 Subject: [PATCH 03/12] Update uv.lock with relaxed version constraints and DuckDB Co-authored-by: William Easton --- uv.lock | 42 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/uv.lock b/uv.lock index 4d470a35..3163dcb4 100644 --- a/uv.lock +++ b/uv.lock @@ -674,6 +674,38 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", size = 147774, upload-time = "2024-05-23T11:13:55.01Z" }, ] +[[package]] +name = "duckdb" +version = "1.4.1" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/ea/e7/21cf50a3d52ffceee1f0bcc3997fa96a5062e6bab705baee4f6c4e33cce5/duckdb-1.4.1.tar.gz", hash = "sha256:f903882f045d057ebccad12ac69975952832edfe133697694854bb784b8d6c76", size = 18461687, upload-time = "2025-10-07T10:37:28.605Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/68/cc/00a07de0e33d16763edd4132d7c8a2f9efd57a2f296a25a948f239a1fadf/duckdb-1.4.1-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:296b4fff3908fb4c47b0aa1d77bd1933375e75401009d2dc81af8e7a0b8a05b4", size = 29062814, upload-time = "2025-10-07T10:36:14.261Z" }, + { url = "https://files.pythonhosted.org/packages/17/ea/fb0fda8886d1928f1b2a53a1163ef94f6f4b41f6d8b29eee457acfc2fa67/duckdb-1.4.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:0b4182800092115feee5d71a8691efb283d3c9f5eb0b36362b308ef007a12222", size = 16161652, upload-time = 
"2025-10-07T10:36:17.358Z" }, + { url = "https://files.pythonhosted.org/packages/b4/5f/052e6436a71f461e61cd3a982954c029145a84b58cefa1dfb3eb2d96e4fc/duckdb-1.4.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:67cc3b6c7f7ba07a69e9331b8ccea7a60cbcd4204bb473e5da9b71588bd2eca9", size = 13753030, upload-time = "2025-10-07T10:36:19.782Z" }, + { url = "https://files.pythonhosted.org/packages/c2/fd/3ae3c89d0f6ad54c0be4430e572306fbfc9f173c97b23c5025a540449325/duckdb-1.4.1-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:0cef0cee7030b561640cb9af718f8841b19cdd2aa020d53561057b5743bea90b", size = 18487683, upload-time = "2025-10-07T10:36:22.375Z" }, + { url = "https://files.pythonhosted.org/packages/d4/3c/eef454cd7c3880c2d55b50e18a9c7a213bf91ded79efcfb573d8d6dd8a47/duckdb-1.4.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:2bf93347f37a46bacce6ac859d651dbf5731e2c94a64ab358300425b09e3de23", size = 20487080, upload-time = "2025-10-07T10:36:24.692Z" }, + { url = "https://files.pythonhosted.org/packages/bb/5b/b619f4c986a1cb0b06315239da9ce5fd94a20c07a344d03e2635d56a6967/duckdb-1.4.1-cp310-cp310-win_amd64.whl", hash = "sha256:2e60d2361f978908a3d96eebaf1f4b346f283afcc467351aae50ea45ca293a2b", size = 12324436, upload-time = "2025-10-07T10:36:27.458Z" }, + { url = "https://files.pythonhosted.org/packages/d9/52/606f13fa9669a24166d2fe523e28982d8ef9039874b4de774255c7806d1f/duckdb-1.4.1-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:605d563c1d5203ca992497cd33fb386ac3d533deca970f9dcf539f62a34e22a9", size = 29065894, upload-time = "2025-10-07T10:36:29.837Z" }, + { url = "https://files.pythonhosted.org/packages/84/57/138241952ece868b9577e607858466315bed1739e1fbb47205df4dfdfd88/duckdb-1.4.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d3305c7c4b70336171de7adfdb50431f23671c000f11839b580c4201d9ce6ef5", size = 16163720, upload-time = "2025-10-07T10:36:32.241Z" }, + { url = "https://files.pythonhosted.org/packages/a3/81/afa3a0a78498a6f4acfea75c48a70c5082032d9ac87822713d7c2d164af1/duckdb-1.4.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:a063d6febbe34b32f1ad2e68822db4d0e4b1102036f49aaeeb22b844427a75df", size = 13756223, upload-time = "2025-10-07T10:36:34.673Z" }, + { url = "https://files.pythonhosted.org/packages/47/dd/5f6064fbd9248e37a3e806a244f81e0390ab8f989d231b584fb954f257fc/duckdb-1.4.1-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:d1ffcaaf74f7d1df3684b54685cbf8d3ce732781c541def8e1ced304859733ae", size = 18487022, upload-time = "2025-10-07T10:36:36.759Z" }, + { url = "https://files.pythonhosted.org/packages/a1/10/b54969a1c42fd9344ad39228d671faceb8aa9f144b67cd9531a63551757f/duckdb-1.4.1-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:685d3d1599dc08160e0fa0cf09e93ac4ff8b8ed399cb69f8b5391cd46b5b207c", size = 20491004, upload-time = "2025-10-07T10:36:39.318Z" }, + { url = "https://files.pythonhosted.org/packages/ed/d5/7332ae8f804869a4e895937821b776199a283f8d9fc775fd3ae5a0558099/duckdb-1.4.1-cp311-cp311-win_amd64.whl", hash = "sha256:78f1d28a15ae73bd449c43f80233732adffa49be1840a32de8f1a6bb5b286764", size = 12327619, upload-time = "2025-10-07T10:36:41.509Z" }, + { url = "https://files.pythonhosted.org/packages/0e/6c/906a3fe41cd247b5638866fc1245226b528de196588802d4df4df1e6e819/duckdb-1.4.1-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:cd1765a7d180b7482874586859fc23bc9969d7d6c96ced83b245e6c6f49cde7f", size = 29076820, upload-time = "2025-10-07T10:36:43.782Z" }, + { url = 
"https://files.pythonhosted.org/packages/66/c7/01dd33083f01f618c2a29f6dd068baf16945b8cbdb132929d3766610bbbb/duckdb-1.4.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:8ed7a86725185470953410823762956606693c0813bb64e09c7d44dbd9253a64", size = 16167558, upload-time = "2025-10-07T10:36:46.003Z" }, + { url = "https://files.pythonhosted.org/packages/81/e2/f983b4b7ae1dfbdd2792dd31dee9a0d35f88554452cbfc6c9d65e22fdfa9/duckdb-1.4.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:8a189bdfc64cfb9cc1adfbe4f2dcfde0a4992ec08505ad8ce33c886e4813f0bf", size = 13762226, upload-time = "2025-10-07T10:36:48.55Z" }, + { url = "https://files.pythonhosted.org/packages/ed/34/fb69a7be19b90f573b3cc890961be7b11870b77514769655657514f10a98/duckdb-1.4.1-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a9090089b6486f7319c92acdeed8acda022d4374032d78a465956f50fc52fabf", size = 18500901, upload-time = "2025-10-07T10:36:52.445Z" }, + { url = "https://files.pythonhosted.org/packages/e4/a5/1395d7b49d5589e85da9a9d7ffd8b50364c9d159c2807bef72d547f0ad1e/duckdb-1.4.1-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:142552ea3e768048e0e8c832077a545ca07792631c59edaee925e3e67401c2a0", size = 20514177, upload-time = "2025-10-07T10:36:55.358Z" }, + { url = "https://files.pythonhosted.org/packages/c0/21/08f10706d30252753349ec545833fc0cea67c11abd0b5223acf2827f1056/duckdb-1.4.1-cp312-cp312-win_amd64.whl", hash = "sha256:567f3b3a785a9e8650612461893c49ca799661d2345a6024dda48324ece89ded", size = 12336422, upload-time = "2025-10-07T10:36:57.521Z" }, + { url = "https://files.pythonhosted.org/packages/d7/08/705988c33e38665c969f7876b3ca4328be578554aa7e3dc0f34158da3e64/duckdb-1.4.1-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:46496a2518752ae0c6c5d75d4cdecf56ea23dd098746391176dd8e42cf157791", size = 29077070, upload-time = "2025-10-07T10:36:59.83Z" }, + { url = "https://files.pythonhosted.org/packages/99/c5/7c9165f1e6b9069441bcda4da1e19382d4a2357783d37ff9ae238c5c41ac/duckdb-1.4.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:1c65ae7e9b541cea07d8075343bcfebdecc29a3c0481aa6078ee63d51951cfcd", size = 16167506, upload-time = "2025-10-07T10:37:02.24Z" }, + { url = "https://files.pythonhosted.org/packages/38/46/267f4a570a0ee3ae6871ddc03435f9942884284e22a7ba9b7cb252ee69b6/duckdb-1.4.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:598d1a314e34b65d9399ddd066ccce1eeab6a60a2ef5885a84ce5ed62dbaf729", size = 13762330, upload-time = "2025-10-07T10:37:04.581Z" }, + { url = "https://files.pythonhosted.org/packages/15/7b/c4f272a40c36d82df20937d93a1780eb39ab0107fe42b62cba889151eab9/duckdb-1.4.1-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e2f16b8def782d484a9f035fc422bb6f06941ed0054b4511ddcdc514a7fb6a75", size = 18504687, upload-time = "2025-10-07T10:37:06.991Z" }, + { url = "https://files.pythonhosted.org/packages/17/fc/9b958751f0116d7b0406406b07fa6f5a10c22d699be27826d0b896f9bf51/duckdb-1.4.1-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a5a7d0aed068a5c33622a8848857947cab5cfb3f2a315b1251849bac2c74c492", size = 20513823, upload-time = "2025-10-07T10:37:09.349Z" }, + { url = "https://files.pythonhosted.org/packages/30/79/4f544d73fcc0513b71296cb3ebb28a227d22e80dec27204977039b9fa875/duckdb-1.4.1-cp313-cp313-win_amd64.whl", hash = "sha256:280fd663dacdd12bb3c3bf41f3e5b2e5b95e00b88120afabb8b8befa5f335c6f", size = 12336460, upload-time = "2025-10-07T10:37:12.154Z" }, +] + [[package]] name = "elastic-transport" version = 
"9.2.0" @@ -1379,6 +1411,9 @@ disk = [ { name = "diskcache" }, { name = "pathvalidate" }, ] +duckdb = [ + { name = "duckdb" }, +] dynamodb = [ { name = "aioboto3" }, { name = "types-aiobotocore-dynamodb" }, @@ -1426,7 +1461,7 @@ wrappers-encryption = [ [package.dev-dependencies] dev = [ { name = "py-key-value", extra = ["dev"] }, - { name = "py-key-value-aio", extra = ["disk", "dynamodb", "elasticsearch", "keyring", "memcached", "memory", "mongodb", "pydantic", "redis", "rocksdb", "vault", "wrappers-encryption"] }, + { name = "py-key-value-aio", extra = ["disk", "duckdb", "dynamodb", "elasticsearch", "keyring", "memcached", "memory", "mongodb", "pydantic", "redis", "rocksdb", "vault", "wrappers-encryption"] }, { name = "py-key-value-aio", extra = ["valkey"], marker = "sys_platform != 'win32'" }, ] @@ -1440,6 +1475,7 @@ requires-dist = [ { name = "cryptography", marker = "extra == 'wrappers-encryption'", specifier = ">=45.0.0" }, { name = "dbus-python", marker = "extra == 'keyring-linux'", specifier = ">=1.4.0" }, { name = "diskcache", marker = "extra == 'disk'", specifier = ">=5.0.0" }, + { name = "duckdb", marker = "extra == 'duckdb'", specifier = ">=1.0.0" }, { name = "elasticsearch", marker = "extra == 'elasticsearch'", specifier = ">=8.0.0" }, { name = "hvac", marker = "extra == 'vault'", specifier = ">=2.3.0" }, { name = "keyring", marker = "extra == 'keyring'", specifier = ">=25.6.0" }, @@ -1455,13 +1491,13 @@ requires-dist = [ { name = "types-hvac", marker = "extra == 'vault'", specifier = ">=2.3.0" }, { name = "valkey-glide", marker = "extra == 'valkey'", specifier = ">=2.1.0" }, ] -provides-extras = ["memory", "disk", "redis", "mongodb", "valkey", "vault", "memcached", "elasticsearch", "dynamodb", "keyring", "keyring-linux", "pydantic", "rocksdb", "wrappers-encryption"] +provides-extras = ["memory", "disk", "redis", "mongodb", "valkey", "vault", "memcached", "elasticsearch", "dynamodb", "keyring", "keyring-linux", "pydantic", "rocksdb", "duckdb", "wrappers-encryption"] [package.metadata.requires-dev] dev = [ { name = "py-key-value", extras = ["dev"], editable = "." }, { name = "py-key-value-aio", extras = ["keyring"] }, - { name = "py-key-value-aio", extras = ["memory", "disk", "redis", "elasticsearch", "memcached", "mongodb", "vault", "dynamodb", "rocksdb"] }, + { name = "py-key-value-aio", extras = ["memory", "disk", "redis", "elasticsearch", "memcached", "mongodb", "vault", "dynamodb", "rocksdb", "duckdb"] }, { name = "py-key-value-aio", extras = ["pydantic"] }, { name = "py-key-value-aio", extras = ["valkey"], marker = "sys_platform != 'win32'" }, { name = "py-key-value-aio", extras = ["wrappers-encryption"] }, From 75317fded736c38d043896e768ab513c6e0af009 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Tue, 28 Oct 2025 19:00:37 +0000 Subject: [PATCH 04/12] fix: revert unrelated changes from bad merge Remove unrelated documentation and workflow changes that were accidentally included in the DuckDB store PR. These changes included license modifications, wrapper documentation updates, and workflow instruction changes that should be in separate PRs. 
Co-authored-by: William Easton --- .github/workflows/claude-on-mention.yml | 3 +++ .github/workflows/claude-on-open-label.yml | 1 + AGENTS.md | 9 +++++---- Makefile | 2 +- README.md | 8 +++++--- 5 files changed, 15 insertions(+), 8 deletions(-) diff --git a/.github/workflows/claude-on-mention.yml b/.github/workflows/claude-on-mention.yml index e817b7f1..3cad9fb5 100644 --- a/.github/workflows/claude-on-mention.yml +++ b/.github/workflows/claude-on-mention.yml @@ -47,6 +47,9 @@ jobs: 1. Call the generate_agents_md tool to get a high-level summary of the project you're working in 2. Get the ${{ github.event_name == 'issues' && 'issue' || 'pull request' }} ${{ github.event.issue.number || github.event.pull_request.number }} in the GitHub repository: ${{ github.repository }}. 3. Don't forget about your MCP tools to call search_code, get_files, etc. to search the repository and other repositories to identify the related classes, methods, docs, tests, etc that are relevant to the code. + 4. Be Thorough! Go the extra mile! Do great work! + 5. If anything you needed or wanted to do during your work was not possible, document the problem in a `Problems Encountered` section + of your response. Especially if it was something you were asked to do like research the web, run tests, lint, etc. # Using Make and Git Commands You can run make commands (e.g., `make lint`, `make typecheck`, `make sync`) to build, test, or lint the code. You can also run git commands (e.g., `git status`, `git log`, `git diff`) to inspect the repository. You cannot run arbitrary bash commands - only make and git commands are allowed. diff --git a/.github/workflows/claude-on-open-label.yml b/.github/workflows/claude-on-open-label.yml index 309de013..81a3cb2f 100644 --- a/.github/workflows/claude-on-open-label.yml +++ b/.github/workflows/claude-on-open-label.yml @@ -45,6 +45,7 @@ jobs: 3. You will avoid speculation and only assert facts that are deeply rooted (traceable) to the codebase, language/framework conventions, related issues, related pull requests, etc. 4. The main branch of the repository has been cloned locally, but changes will not be accepted and you are not allowed to make pull requests or other changes. You can search the local repository for relevant code. You will use the available MCP Server tools identify related issues and pull requests (search_issues and search_pull_requests) and you can use search_code to look at the code in relevant dependent packages. For example, you can use search_code to look at a particular store's SDK `https://github.com/elastic/elasticsearch-py` to see how it implements a certain class or function relevant to the issue at hand. 5. You can run make commands (e.g., `make lint`, `make typecheck`, `make sync`) to build, test, or lint the code. You can also run git commands (e.g., `git status`, `git log`, `git diff`) to inspect the repository. You cannot run arbitrary bash commands - only make and git commands are allowed. + 6. Be Thorough! Go the extra mile! # Getting Started 1. Call the generate_agents_md tool to get a high-level summary of the project you're working in diff --git a/AGENTS.md b/AGENTS.md index 04d2b7ec..00dd6d83 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -7,13 +7,14 @@ py-key-value project. For human developers, see [DEVELOPING.md](DEVELOPING.md). ### Required Pre-commit Checks -All three checks must pass before committing: +All four checks must pass before committing: 1. `make lint` - Runs Ruff formatting and linting (Python + Markdown) 2. 
`make typecheck` - Runs Basedpyright type checking 3. `make codegen` - Regenerates sync library from async +4. `make lint` - Runs Ruff formatting and linting after the other checks have run -Or run all three together: +Or run all four together: ```bash make precommit @@ -173,8 +174,8 @@ Wrappers add functionality to stores and are located in: - Async: `key-value/key-value-aio/src/key_value/aio/wrappers/` - Sync: `key-value/key-value-sync/src/key_value/sync/code_gen/wrappers/` -Wrappers include: Compression, Encryption, Logging, Statistics, Retry, -Timeout, Cache, Prefix, TTL clamping, and more. +Wrappers include: Compression, DefaultValue, Encryption, Logging, Statistics, +Retry, Timeout, Cache, Prefix, TTL clamping, and more. ## Adapters diff --git a/Makefile b/Makefile index 45f57cc7..073f0170 100644 --- a/Makefile +++ b/Makefile @@ -119,4 +119,4 @@ else @cd key-value/key-value-shared && uv build . endif -precommit: lint typecheck codegen \ No newline at end of file +precommit: lint codegen lint typecheck \ No newline at end of file diff --git a/README.md b/README.md index 7dfa2cf3..152fbbd6 100644 --- a/README.md +++ b/README.md @@ -135,7 +135,7 @@ from key_value.aio.stores.memory import MemoryStore async def example(key_value: AsyncKeyValue) -> None: await key_value.put(key="123", value={"name": "Alice"}, collection="users", ttl=3600) - value = await store.get(key="123", collection="users") + value = await key_value.get(key="123", collection="users") await key_value.delete(key="123", collection="users") @@ -300,6 +300,7 @@ The following wrappers are available: |---------|---------------|-----| | CollectionRoutingWrapper | Route operations to different stores based on a collection name. | `CollectionRoutingWrapper(collection_map={"sessions": redis_store, "users": dynamo_store}, default_store=memory_store)` | | CompressionWrapper | Compress values before storing and decompress on retrieval. | `CompressionWrapper(key_value=memory_store, min_size_to_compress=0)` | +| DefaultValueWrapper | Return a default value when key is missing. | `DefaultValueWrapper(key_value=memory_store, default_value={})` | | FernetEncryptionWrapper | Encrypt values before storing and decrypt on retrieval. | `FernetEncryptionWrapper(key_value=memory_store, source_material="your-source-material", salt="your-salt")` | | FallbackWrapper | Fallback to a secondary store when the primary store fails. | `FallbackWrapper(primary_key_value=memory_store, fallback_key_value=memory_store)` | | LimitSizeWrapper | Limit the size of entries stored in the cache. | `LimitSizeWrapper(key_value=memory_store, max_size=1024, raise_on_too_large=True)` | @@ -311,8 +312,9 @@ The following wrappers are available: | RetryWrapper | Retry failed operations with exponential backoff. | `RetryWrapper(key_value=memory_store, max_retries=3, initial_delay=0.1, max_delay=10.0, exponential_base=2.0)` | | RoutingWrapper | Route operations to different stores based on a routing function. | `RoutingWrapper(routing_function=lambda collection: redis_store if collection == "sessions" else dynamo_store, default_store=memory_store)` | | SingleCollectionWrapper | Wrap a store to only use a single collection. | `SingleCollectionWrapper(key_value=memory_store, single_collection="users")` | -| TTLClampWrapper | Clamp the TTL to a given range. | `TTLClampWrapper(key_value=memory_store, min_ttl=60, max_ttl=3600)` | | StatisticsWrapper | Track operation statistics for the store.
| `StatisticsWrapper(key_value=memory_store)` | +| TimeoutWrapper | Add timeout protection to store operations. | `TimeoutWrapper(key_value=redis_store, timeout=5.0)` | +| TTLClampWrapper | Clamp the TTL to a given range. | `TTLClampWrapper(key_value=memory_store, min_ttl=60, max_ttl=3600)` | Wrappers can be stacked on top of each other to create more complex functionality. @@ -409,4 +411,4 @@ Contributions welcome but may not be accepted. File an issue before submitting a pull request. If you do not get agreement on your proposal before making a pull request you may have a bad time. -MIT licensed. +Apache 2.0 licensed. From 5c3ec025745ca8b6e4979c0b5ddf71bae7ea320d Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Tue, 28 Oct 2025 19:53:19 +0000 Subject: [PATCH 05/12] feat: add native storage support to DuckDB store with JSON column option - Add use_json_column parameter (defaults to True) for native JSON vs TEXT storage - Update schema to use native DuckDB types (JSON, TIMESTAMP) for better queryability - Store value data separately from metadata to eliminate duplication - Metadata (created_at, ttl, expires_at) in native columns for efficient SQL queries - Add comprehensive tests for both JSON and TEXT modes - Add test_native_sql_queryability() demonstrating direct SQL queries on stored data - Fix timezone handling to ensure consistency with ManagedEntry expectations - All 411 tests passing (404 passed, 7 skipped) This enables users to query the DuckDB database directly with SQL for analytics and data exploration while maintaining full compatibility with the key-value API. Co-authored-by: William Easton --- .../src/key_value/aio/stores/duckdb/store.py | 115 ++++++++++++++---- .../tests/stores/duckdb/test_duckdb.py | 85 +++++++++++++ 2 files changed, 176 insertions(+), 24 deletions(-) diff --git a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py index 67a3044b..cf718769 100644 --- a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py +++ b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py @@ -20,6 +20,9 @@ class DuckDBStore(BaseContextManagerStore, BaseStore): for analytical workloads while supporting standard SQL operations. This store can operate in memory-only mode or persist data to disk. + The store uses native DuckDB types (JSON, TIMESTAMP) to enable efficient SQL queries + on stored data. Users can query the database directly for analytics or data exploration. + Note on connection ownership: When you provide an existing connection, the store will take ownership and close it when the store is closed or garbage collected. If you need to reuse a connection, create separate DuckDB connections for each store. @@ -28,6 +31,7 @@ class DuckDBStore(BaseContextManagerStore, BaseStore): _connection: duckdb.DuckDBPyConnection _is_closed: bool _owns_connection: bool + _use_json_column: bool @overload def __init__( @@ -36,6 +40,7 @@ def __init__( connection: duckdb.DuckDBPyConnection, default_collection: str | None = None, seed: SEED_DATA_TYPE | None = None, + use_json_column: bool = True, ) -> None: """Initialize the DuckDB store with an existing connection. @@ -47,6 +52,8 @@ def __init__( connection: An existing DuckDB connection to use. default_collection: The default collection to use if no collection is provided. seed: Optional seed data to pre-populate the store. 
+ use_json_column: If True, use native JSON column type; if False, use TEXT. + Default is True for better queryability and native type support. """ @overload @@ -56,6 +63,7 @@ def __init__( database_path: Path | str | None = None, default_collection: str | None = None, seed: SEED_DATA_TYPE | None = None, + use_json_column: bool = True, ) -> None: """Initialize the DuckDB store with a database path. @@ -63,6 +71,8 @@ def __init__( database_path: Path to the database file. If None or ':memory:', uses in-memory database. default_collection: The default collection to use if no collection is provided. seed: Optional seed data to pre-populate the store. + use_json_column: If True, use native JSON column type; if False, use TEXT. + Default is True for better queryability and native type support. """ def __init__( @@ -72,6 +82,7 @@ def __init__( database_path: Path | str | None = None, default_collection: str | None = None, seed: SEED_DATA_TYPE | None = None, + use_json_column: bool = True, ) -> None: """Initialize the DuckDB store. @@ -80,6 +91,8 @@ def __init__( database_path: Path to the database file. If None or ':memory:', uses in-memory database. default_collection: The default collection to use if no collection is provided. seed: Optional seed data to pre-populate the store. + use_json_column: If True, use native JSON column type; if False, use TEXT. + Default is True for better queryability and native type support. """ if connection is not None and database_path is not None: msg = "Provide only one of connection or database_path" @@ -101,6 +114,7 @@ def __init__( self._owns_connection = True self._is_closed = False + self._use_json_column = use_json_column self._stable_api = False super().__init__(default_collection=default_collection, seed=seed) @@ -109,25 +123,30 @@ def __init__( async def _setup(self) -> None: """Initialize the database schema for key-value storage. - Note: The schema stores created_at, ttl, and expires_at as separate columns - in addition to the serialized ManagedEntry in value_json. This duplication - is intentional for future features: - - The expires_at column with its index enables efficient expiration-based - cleanup queries (e.g., DELETE FROM kv_entries WHERE expires_at < now()) - - The separate columns allow for metadata queries without deserializing JSON - - Currently, only value_json is read during _get_managed_entry - - This design trades storage space for query flexibility and future extensibility. 
+ The schema uses native DuckDB types for efficient querying: + - value: JSON or TEXT column storing the actual value data (not full ManagedEntry) + - created_at: TIMESTAMP for native datetime operations + - ttl: DOUBLE for time-to-live in seconds + - expires_at: TIMESTAMP for native expiration queries + + This design enables: + - Direct SQL queries on the database for analytics + - Efficient expiration cleanup: DELETE FROM kv_entries WHERE expires_at < now() + - Metadata queries without JSON deserialization + - No data duplication (metadata in columns, value in JSON/TEXT) """ + # Determine column type based on use_json_column setting + value_column_type = "JSON" if self._use_json_column else "TEXT" + # Create the main table for storing key-value entries - self._connection.execute(""" + self._connection.execute(f""" CREATE TABLE IF NOT EXISTS kv_entries ( collection VARCHAR NOT NULL, key VARCHAR NOT NULL, - value_json TEXT NOT NULL, - created_at DOUBLE, + value {value_column_type} NOT NULL, + created_at TIMESTAMP, ttl DOUBLE, - expires_at DOUBLE, + expires_at TIMESTAMP, PRIMARY KEY (collection, key) ) """) @@ -138,7 +157,7 @@ async def _setup(self) -> None: ON kv_entries(collection) """) - # Create index for expiration-based queries (for future cleanup features) + # Create index for expiration-based queries self._connection.execute(""" CREATE INDEX IF NOT EXISTS idx_kv_expires_at ON kv_entries(expires_at) @@ -146,21 +165,50 @@ async def _setup(self) -> None: @override async def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry | None: - """Retrieve a managed entry by key from the specified collection.""" + """Retrieve a managed entry by key from the specified collection. + + Reconstructs the ManagedEntry from the value column and metadata columns. + The value column contains only the value data (not the full ManagedEntry), + and metadata (created_at, ttl, expires_at) is stored in separate columns. + """ if self._is_closed: msg = "Cannot operate on closed DuckDBStore" raise RuntimeError(msg) result = self._connection.execute( - "SELECT value_json FROM kv_entries WHERE collection = ? AND key = ?", + "SELECT value, created_at, ttl, expires_at FROM kv_entries WHERE collection = ? AND key = ?", [collection, key], ).fetchone() if result is None: return None - value_json = result[0] - return ManagedEntry.from_json(json_str=value_json) + value_data, created_at, ttl, expires_at = result + + # Convert value from JSON/TEXT to dict + # If it's already a dict (from JSON column), use it; otherwise parse from string + if isinstance(value_data, str): + import json + value = json.loads(value_data) + else: + value = value_data + + # DuckDB always returns naive timestamps, but ManagedEntry expects timezone-aware ones + # Convert to timezone-aware UTC timestamps + from datetime import timezone + + if created_at is not None and created_at.tzinfo is None: + created_at = created_at.replace(tzinfo=timezone.utc) + if expires_at is not None and expires_at.tzinfo is None: + expires_at = expires_at.replace(tzinfo=timezone.utc) + + # Reconstruct ManagedEntry with metadata from columns + return ManagedEntry( + value=value, + created_at=created_at, + ttl=ttl, + expires_at=expires_at, + ) @override async def _put_managed_entry( @@ -170,25 +218,44 @@ async def _put_managed_entry( collection: str, managed_entry: ManagedEntry, ) -> None: - """Store a managed entry by key in the specified collection.""" + """Store a managed entry by key in the specified collection. 
+ + Stores the value and metadata separately: + - value: JSON string of just the value data (not full ManagedEntry) + - created_at, ttl, expires_at: Stored in native columns for efficient querying + """ if self._is_closed: msg = "Cannot operate on closed DuckDBStore" raise RuntimeError(msg) - # Insert or replace the entry + # Get just the value as JSON (not the full ManagedEntry) + value_json = managed_entry.value_as_json + + # Ensure timestamps are timezone-aware (convert naive to UTC if needed) + from datetime import timezone + + created_at = managed_entry.created_at + if created_at is not None and created_at.tzinfo is None: + created_at = created_at.replace(tzinfo=timezone.utc) + + expires_at = managed_entry.expires_at + if expires_at is not None and expires_at.tzinfo is None: + expires_at = expires_at.replace(tzinfo=timezone.utc) + + # Insert or replace the entry with metadata in separate columns self._connection.execute( """ INSERT OR REPLACE INTO kv_entries - (collection, key, value_json, created_at, ttl, expires_at) + (collection, key, value, created_at, ttl, expires_at) VALUES (?, ?, ?, ?, ?, ?) """, [ collection, key, - managed_entry.to_json(), - managed_entry.created_at.timestamp() if managed_entry.created_at else None, + value_json, + created_at, managed_entry.ttl, - managed_entry.expires_at.timestamp() if managed_entry.expires_at else None, + expires_at, ], ) diff --git a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py index 9b8c5efa..c5c061ba 100644 --- a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py +++ b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py @@ -38,6 +38,21 @@ async def store(self) -> AsyncGenerator[DuckDBStore, None]: async def test_not_unbounded(self, store: BaseStore): ... +class TestDuckDBStoreTextMode(ContextManagerStoreTestMixin, BaseStoreTests): + """Test DuckDB store with TEXT column mode instead of JSON.""" + + @override + @pytest.fixture + async def store(self) -> AsyncGenerator[DuckDBStore, None]: + """Test with in-memory DuckDB database using TEXT column.""" + duckdb_store = DuckDBStore(use_json_column=False) + yield duckdb_store + await duckdb_store.close() + + @pytest.mark.skip(reason="Local disk stores are unbounded") + async def test_not_unbounded(self, store: BaseStore): ... 
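
# A hedged editorial sketch, not part of the patch: how the two column modes
# introduced above might be exercised end to end. It assumes the async API used
# throughout this series (put/get/close); collection and key names here are
# illustrative only. Note that use_json_column is renamed to native_storage in a
# later patch in this series.
import asyncio

from key_value.aio.stores.duckdb import DuckDBStore


async def demo() -> None:
    # Native JSON column (the default): stored values can be queried in SQL
    # via expressions like value->'name'.
    json_store = DuckDBStore(use_json_column=True)
    await json_store.put(collection="demo", key="a", value={"name": "Widget"})
    print(await json_store.get(collection="demo", key="a"))
    await json_store.close()

    # TEXT column: values are stored as opaque stringified JSON.
    text_store = DuckDBStore(use_json_column=False)
    await text_store.put(collection="demo", key="b", value={"name": "Gadget"})
    print(await text_store.get(collection="demo", key="b"))
    await text_store.close()


asyncio.run(demo())
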
+ + class TestDuckDBStoreSpecific: """Test DuckDB-specific functionality.""" @@ -48,6 +63,76 @@ async def store(self) -> AsyncGenerator[DuckDBStore, None]: yield duckdb_store await duckdb_store.close() + async def test_native_sql_queryability(self): + """Test that users can query the database directly with SQL.""" + store = DuckDBStore(use_json_column=True) + + # Store some test data with known metadata + await store.put(collection="products", key="item1", value={"name": "Widget", "price": 10.99}, ttl=3600) + await store.put(collection="products", key="item2", value={"name": "Gadget", "price": 25.50}, ttl=7200) + await store.put(collection="orders", key="order1", value={"total": 100.00, "items": 3}) + + # Query directly via SQL to verify native storage + # Check that value is stored as JSON (can extract fields) + result = store._connection.execute(""" + SELECT key, value->'name' as name, value->'price' as price + FROM kv_entries + WHERE collection = 'products' + ORDER BY key + """).fetchall() + + assert len(result) == 2 + assert result[0][0] == "item1" + assert result[0][1] == '"Widget"' # JSON strings are quoted + assert result[1][0] == "item2" + + # Query by expiration timestamp + result = store._connection.execute(""" + SELECT COUNT(*) + FROM kv_entries + WHERE expires_at > now() OR expires_at IS NULL + """).fetchone() + + assert result[0] == 3 # All 3 entries should not be expired + + # Query metadata columns directly + result = store._connection.execute(""" + SELECT key, ttl, created_at IS NOT NULL as has_created + FROM kv_entries + WHERE collection = 'products' AND ttl > 3600 + """).fetchall() + + assert len(result) == 1 # Only item2 has ttl > 3600 + assert result[0][0] == "item2" + assert result[0][1] == 7200 + assert result[0][2] is True # has_created + + await store.close() + + async def test_text_mode_storage(self): + """Test that TEXT mode stores value as string instead of native JSON.""" + store = DuckDBStore(use_json_column=False) + + await store.put(collection="test", key="key1", value={"data": "value"}) + + # Query to check column type - in TEXT mode, value should be a string + result = store._connection.execute(""" + SELECT value, typeof(value) as value_type + FROM kv_entries + WHERE collection = 'test' AND key = 'key1' + """).fetchone() + + assert result is not None + value_str, value_type = result + + # In TEXT mode, value should be stored as VARCHAR/TEXT + assert value_type in ("VARCHAR", "TEXT") + # Value should be a JSON string + assert isinstance(value_str, str) + assert "data" in value_str + + await store.close() + async def test_database_path_initialization(self): """Test that store can be initialized with different database path options.""" # In-memory (default) From ae38e744442b919146172e04e9f9fcd8d42fe2ef Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Tue, 28 Oct 2025 20:12:41 +0000 Subject: [PATCH 06/12] feat: add table_name parameter and split value storage columns in DuckDB store - Add table_name parameter with default 'kv_entries' for multi-tenancy - Rename use_json_column to native_storage for consistency with Elasticsearch - Split value storage into separate value_json (TEXT) and value_dict (JSON) columns - Add SQL helper functions for generating CREATE, SELECT, INSERT, DELETE statements - Update tests to use native_storage parameter and verify both storage modes - Add test_custom_table_name() and test_native_vs_stringified_storage() - All 406 tests passing, 7 skipped Following MongoDB/Elasticsearch 
pattern with separated value columns. Co-authored-by: William Easton --- .../src/key_value/aio/stores/duckdb/store.py | 208 +++++++++++++----- .../tests/stores/duckdb/test_duckdb.py | 94 ++++++-- 2 files changed, 228 insertions(+), 74 deletions(-) diff --git a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py index cf718769..c3448bba 100644 --- a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py +++ b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py @@ -1,5 +1,5 @@ from pathlib import Path -from typing import overload +from typing import Any, cast, overload from key_value.shared.utils.managed_entry import ManagedEntry from typing_extensions import override @@ -23,6 +23,10 @@ class DuckDBStore(BaseContextManagerStore, BaseStore): The store uses native DuckDB types (JSON, TIMESTAMP) to enable efficient SQL queries on stored data. Users can query the database directly for analytics or data exploration. + Storage modes: + - native_storage=True: Stores values in a JSON column as native dicts for queryability + - native_storage=False: Stores values as stringified JSON in a TEXT column + Note on connection ownership: When you provide an existing connection, the store will take ownership and close it when the store is closed or garbage collected. If you need to reuse a connection, create separate DuckDB connections for each store. @@ -31,16 +35,18 @@ class DuckDBStore(BaseContextManagerStore, BaseStore): _connection: duckdb.DuckDBPyConnection _is_closed: bool _owns_connection: bool - _use_json_column: bool + _native_storage: bool + _table_name: str @overload def __init__( self, *, connection: duckdb.DuckDBPyConnection, + table_name: str = "kv_entries", + native_storage: bool = True, default_collection: str | None = None, seed: SEED_DATA_TYPE | None = None, - use_json_column: bool = True, ) -> None: """Initialize the DuckDB store with an existing connection. @@ -50,10 +56,11 @@ def __init__( Args: connection: An existing DuckDB connection to use. + table_name: Name of the table to store key-value entries. Defaults to "kv_entries". + native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON. + Default is True for better queryability and native type support. default_collection: The default collection to use if no collection is provided. seed: Optional seed data to pre-populate the store. - use_json_column: If True, use native JSON column type; if False, use TEXT. - Default is True for better queryability and native type support. """ @overload @@ -61,18 +68,20 @@ def __init__( self, *, database_path: Path | str | None = None, + table_name: str = "kv_entries", + native_storage: bool = True, default_collection: str | None = None, seed: SEED_DATA_TYPE | None = None, - use_json_column: bool = True, ) -> None: """Initialize the DuckDB store with a database path. Args: database_path: Path to the database file. If None or ':memory:', uses in-memory database. + table_name: Name of the table to store key-value entries. Defaults to "kv_entries". + native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON. + Default is True for better queryability and native type support. default_collection: The default collection to use if no collection is provided. seed: Optional seed data to pre-populate the store. - use_json_column: If True, use native JSON column type; if False, use TEXT. 
- Default is True for better queryability and native type support. """ def __init__( @@ -80,19 +89,21 @@ def __init__( *, connection: duckdb.DuckDBPyConnection | None = None, database_path: Path | str | None = None, + table_name: str = "kv_entries", + native_storage: bool = True, default_collection: str | None = None, seed: SEED_DATA_TYPE | None = None, - use_json_column: bool = True, ) -> None: """Initialize the DuckDB store. Args: connection: An existing DuckDB connection to use. database_path: Path to the database file. If None or ':memory:', uses in-memory database. + table_name: Name of the table to store key-value entries. Defaults to "kv_entries". + native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON. + Default is True for better queryability and native type support. default_collection: The default collection to use if no collection is provided. seed: Optional seed data to pre-populate the store. - use_json_column: If True, use native JSON column type; if False, use TEXT. - Default is True for better queryability and native type support. """ if connection is not None and database_path is not None: msg = "Provide only one of connection or database_path" @@ -114,84 +125,157 @@ def __init__( self._owns_connection = True self._is_closed = False - self._use_json_column = use_json_column + self._native_storage = native_storage + self._table_name = table_name self._stable_api = False super().__init__(default_collection=default_collection, seed=seed) + def _get_create_table_sql(self) -> str: + """Generate SQL for creating the key-value entries table. + + Returns: + SQL CREATE TABLE statement. + """ + return f""" + CREATE TABLE IF NOT EXISTS {self._table_name} ( + collection VARCHAR NOT NULL, + key VARCHAR NOT NULL, + value_json TEXT, + value_dict JSON, + created_at TIMESTAMP, + ttl DOUBLE, + expires_at TIMESTAMP, + PRIMARY KEY (collection, key) + ) + """ + + def _get_create_collection_index_sql(self) -> str: + """Generate SQL for creating index on collection column. + + Returns: + SQL CREATE INDEX statement. + """ + return f""" + CREATE INDEX IF NOT EXISTS idx_{self._table_name}_collection + ON {self._table_name}(collection) + """ + + def _get_create_expires_index_sql(self) -> str: + """Generate SQL for creating index on expires_at column. + + Returns: + SQL CREATE INDEX statement. + """ + return f""" + CREATE INDEX IF NOT EXISTS idx_{self._table_name}_expires_at + ON {self._table_name}(expires_at) + """ + + def _get_select_sql(self) -> str: + """Generate SQL for selecting an entry by collection and key. + + Returns: + SQL SELECT statement with placeholders. + """ + return f""" + SELECT value_json, value_dict, created_at, ttl, expires_at + FROM {self._table_name} + WHERE collection = ? AND key = ? + """ # noqa: S608 + + def _get_insert_sql(self) -> str: + """Generate SQL for inserting or replacing an entry. + + Returns: + SQL INSERT OR REPLACE statement with placeholders. + """ + return f""" + INSERT OR REPLACE INTO {self._table_name} + (collection, key, value_json, value_dict, created_at, ttl, expires_at) + VALUES (?, ?, ?, ?, ?, ?, ?) + """ # noqa: S608 + + def _get_delete_sql(self) -> str: + """Generate SQL for deleting an entry by collection and key. + + Returns: + SQL DELETE statement with RETURNING clause. + """ + return f""" + DELETE FROM {self._table_name} + WHERE collection = ? AND key = ? + RETURNING key + """ # noqa: S608 + @override async def _setup(self) -> None: """Initialize the database schema for key-value storage. 
The schema uses native DuckDB types for efficient querying: - - value: JSON or TEXT column storing the actual value data (not full ManagedEntry) + - value_json: TEXT column storing stringified JSON (used when native_storage=False) + - value_dict: JSON column storing native dicts (used when native_storage=True) - created_at: TIMESTAMP for native datetime operations - ttl: DOUBLE for time-to-live in seconds - expires_at: TIMESTAMP for native expiration queries - This design enables: - - Direct SQL queries on the database for analytics - - Efficient expiration cleanup: DELETE FROM kv_entries WHERE expires_at < now() + This design follows the Elasticsearch/MongoDB pattern of separating native and stringified + storage, enabling: + - Direct SQL queries on the database for analytics (when using native storage) + - Efficient expiration cleanup: DELETE FROM table WHERE expires_at < now() - Metadata queries without JSON deserialization - - No data duplication (metadata in columns, value in JSON/TEXT) + - Flexibility to choose between native dict storage and stringified JSON """ - # Determine column type based on use_json_column setting - value_column_type = "JSON" if self._use_json_column else "TEXT" - # Create the main table for storing key-value entries - self._connection.execute(f""" - CREATE TABLE IF NOT EXISTS kv_entries ( - collection VARCHAR NOT NULL, - key VARCHAR NOT NULL, - value {value_column_type} NOT NULL, - created_at TIMESTAMP, - ttl DOUBLE, - expires_at TIMESTAMP, - PRIMARY KEY (collection, key) - ) - """) + self._connection.execute(self._get_create_table_sql()) # Create index for efficient collection queries - self._connection.execute(""" - CREATE INDEX IF NOT EXISTS idx_kv_collection - ON kv_entries(collection) - """) + self._connection.execute(self._get_create_collection_index_sql()) # Create index for expiration-based queries - self._connection.execute(""" - CREATE INDEX IF NOT EXISTS idx_kv_expires_at - ON kv_entries(expires_at) - """) + self._connection.execute(self._get_create_expires_index_sql()) @override async def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry | None: """Retrieve a managed entry by key from the specified collection. - Reconstructs the ManagedEntry from the value column and metadata columns. - The value column contains only the value data (not the full ManagedEntry), - and metadata (created_at, ttl, expires_at) is stored in separate columns. + Reconstructs the ManagedEntry from value columns and metadata columns. + Tries value_dict first (native storage), falls back to value_json (stringified). """ if self._is_closed: msg = "Cannot operate on closed DuckDBStore" raise RuntimeError(msg) result = self._connection.execute( - "SELECT value, created_at, ttl, expires_at FROM kv_entries WHERE collection = ? 
AND key = ?", + self._get_select_sql(), [collection, key], ).fetchone() if result is None: return None - value_data, created_at, ttl, expires_at = result + value_json, value_dict, created_at, ttl, expires_at = result + + # Determine which value column to use (prefer value_dict if present) + import json - # Convert value from JSON/TEXT to dict - # If it's already a dict (from JSON column), use it; otherwise parse from string - if isinstance(value_data, str): - import json - value = json.loads(value_data) + value: dict[str, Any] + if value_dict is not None: + # Native storage mode - value_dict can be dict or string (DuckDB JSON returns as string) + if isinstance(value_dict, dict): + value = cast(dict[str, Any], value_dict) + elif isinstance(value_dict, str): + # DuckDB sometimes returns JSON as string + value = json.loads(value_dict) + else: + msg = f"value_dict has unexpected type: {type(value_dict)}" + raise TypeError(msg) + elif value_json is not None: + # Stringified JSON mode - parse from string + value = json.loads(value_json) else: - value = value_data + # Neither column has data - this shouldn't happen + return None # DuckDB always returns naive timestamps, but ManagedEntry expects timezone-aware ones # Convert to timezone-aware UTC timestamps @@ -221,15 +305,24 @@ async def _put_managed_entry( """Store a managed entry by key in the specified collection. Stores the value and metadata separately: - - value: JSON string of just the value data (not full ManagedEntry) + - value_json/value_dict: Stores value based on native_storage setting - created_at, ttl, expires_at: Stored in native columns for efficient querying """ if self._is_closed: msg = "Cannot operate on closed DuckDBStore" raise RuntimeError(msg) - # Get just the value as JSON (not the full ManagedEntry) - value_json = managed_entry.value_as_json + # Store in appropriate column based on native_storage setting + value_json: str | None = None + value_dict: str | None = None + + if self._native_storage: + # Native storage: store as JSON string in JSON column (DuckDB will handle as JSON type) + # We use value_as_json to ensure serialization errors are caught + value_dict = managed_entry.value_as_json + else: + # Stringified storage: store JSON string in TEXT column + value_json = managed_entry.value_as_json # Ensure timestamps are timezone-aware (convert naive to UTC if needed) from datetime import timezone @@ -244,15 +337,12 @@ async def _put_managed_entry( # Insert or replace the entry with metadata in separate columns self._connection.execute( - """ - INSERT OR REPLACE INTO kv_entries - (collection, key, value, created_at, ttl, expires_at) - VALUES (?, ?, ?, ?, ?, ?) - """, + self._get_insert_sql(), [ collection, key, value_json, + value_dict, created_at, managed_entry.ttl, expires_at, @@ -267,7 +357,7 @@ async def _delete_managed_entry(self, *, key: str, collection: str) -> bool: raise RuntimeError(msg) result = self._connection.execute( - "DELETE FROM kv_entries WHERE collection = ? AND key = ? RETURNING key", + self._get_delete_sql(), [collection, key], ) diff --git a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py index c5c061ba..2cf80cc4 100644 --- a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py +++ b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py @@ -39,13 +39,13 @@ async def test_not_unbounded(self, store: BaseStore): ... 
class TestDuckDBStoreTextMode(ContextManagerStoreTestMixin, BaseStoreTests): - """Test DuckDB store with TEXT column mode instead of JSON.""" + """Test DuckDB store with TEXT column mode (stringified JSON) instead of native JSON.""" @override @pytest.fixture async def store(self) -> AsyncGenerator[DuckDBStore, None]: - """Test with in-memory DuckDB database using TEXT column.""" - duckdb_store = DuckDBStore(use_json_column=False) + """Test with in-memory DuckDB database using TEXT column for stringified JSON.""" + duckdb_store = DuckDBStore(native_storage=False) yield duckdb_store await duckdb_store.close() @@ -65,7 +65,7 @@ async def store(self) -> AsyncGenerator[DuckDBStore, None]: async def test_native_sql_queryability(self): """Test that users can query the database directly with SQL.""" - store = DuckDBStore(use_json_column=True) + store = DuckDBStore(native_storage=True) # Store some test data with known metadata await store.put(collection="products", key="item1", value={"name": "Widget", "price": 10.99}, ttl=3600) @@ -73,9 +73,9 @@ async def test_native_sql_queryability(self): await store.put(collection="orders", key="order1", value={"total": 100.00, "items": 3}) # Query directly via SQL to verify native storage - # Check that value is stored as JSON (can extract fields) + # Check that value_dict is stored as JSON (can extract fields) result = store._connection.execute(""" - SELECT key, value->'name' as name, value->'price' as price + SELECT key, value_dict->'name' as name, value_dict->'price' as price FROM kv_entries WHERE collection = 'products' ORDER BY key @@ -110,26 +110,28 @@ async def test_native_sql_queryability(self): await store.close() async def test_text_mode_storage(self): - """Test that TEXT mode stores value as string instead of native JSON.""" - store = DuckDBStore(use_json_column=False) + """Test that TEXT mode stores value as stringified JSON instead of native JSON.""" + store = DuckDBStore(native_storage=False) await store.put(collection="test", key="key1", value={"data": "value"}) - # Query to check column type - in TEXT mode, value should be a string + # Query to check column type - in TEXT mode, value_json should be populated result = store._connection.execute(""" - SELECT value, typeof(value) as value_type + SELECT value_json, value_dict, typeof(value_json) as json_type, typeof(value_dict) as dict_type FROM kv_entries WHERE collection = 'test' AND key = 'key1' """).fetchone() assert result is not None - value_str, value_type = result + value_json, value_dict, json_type, _dict_type = result - # In TEXT mode, value should be stored as VARCHAR/TEXT - assert value_type in ("VARCHAR", "TEXT") + # In TEXT mode (native_storage=False), value_json should be populated, value_dict should be NULL + assert value_json is not None + assert value_dict is None + assert json_type in ("VARCHAR", "TEXT") # Value should be a JSON string - assert isinstance(value_str, str) - assert "data" in value_str + assert isinstance(value_json, str) + assert "data" in value_json await store.close() @@ -220,5 +222,67 @@ async def test_connection_initialization(self): await store.close() + async def test_custom_table_name(self): + """Test that store can use custom table name.""" + custom_table = "my_custom_kv_table" + store = DuckDBStore(table_name=custom_table) + + # Store some data + await store.put(collection="test", key="key1", value={"data": "value"}) + + # Verify the custom table exists and contains the data + result = store._connection.execute(f""" + SELECT key, collection + FROM 
{custom_table} + WHERE key = 'key1' + """).fetchone() # noqa: S608 + + assert result is not None + assert result[0] == "key1" + assert result[1] == "test" + + # Verify default table doesn't exist + tables = store._connection.execute(""" + SELECT table_name + FROM information_schema.tables + WHERE table_name = 'kv_entries' + """).fetchall() + + assert len(tables) == 0 + + await store.close() + + async def test_native_vs_stringified_storage(self): + """Test that native and stringified storage modes work correctly.""" + # Native storage (default) + store_native = DuckDBStore(native_storage=True) + await store_native.put(collection="test", key="key1", value={"name": "native"}) + + result = store_native._connection.execute(""" + SELECT value_dict, value_json + FROM kv_entries + WHERE key = 'key1' + """).fetchone() + + assert result[0] is not None # value_dict should be populated + assert result[1] is None # value_json should be NULL + + await store_native.close() + + # Stringified storage + store_string = DuckDBStore(native_storage=False) + await store_string.put(collection="test", key="key2", value={"name": "stringified"}) + + result = store_string._connection.execute(""" + SELECT value_dict, value_json + FROM kv_entries + WHERE key = 'key2' + """).fetchone() + + assert result[0] is None # value_dict should be NULL + assert result[1] is not None # value_json should be populated + + await store_string.close() + @pytest.mark.skip(reason="Local disk stores are unbounded") async def test_not_unbounded(self, store: BaseStore): ... From 9e40074b7edb11a9849538646931d2d2954e108f Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Fri, 31 Oct 2025 18:03:08 +0000 Subject: [PATCH 07/12] refactor: update DuckDB store to use SerializationAdapter pattern - Add DuckDBSerializationAdapter class following MongoDB/Elasticsearch patterns - Move inline imports (json, timezone) to module level - Update _get_managed_entry() to use adapter.load_dict() - Update _put_managed_entry() to use adapter.dump_dict() - Fix floating point precision issue in SQL queryability test - All tests passing (405 passed, 7 skipped) Aligns DuckDB store with SerializationAdapter refactoring from PR #184 Co-authored-by: William Easton --- .../src/key_value/aio/stores/duckdb/store.py | 190 +++++++++++------- .../tests/stores/duckdb/test_duckdb.py | 2 +- 2 files changed, 121 insertions(+), 71 deletions(-) diff --git a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py index c3448bba..b4de1529 100644 --- a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py +++ b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py @@ -1,7 +1,11 @@ +import json +from datetime import timezone from pathlib import Path -from typing import Any, cast, overload +from typing import Any, overload +from key_value.shared.errors import DeserializationError from key_value.shared.utils.managed_entry import ManagedEntry +from key_value.shared.utils.serialization import SerializationAdapter from typing_extensions import override from key_value.aio.stores.base import SEED_DATA_TYPE, BaseContextManagerStore, BaseStore @@ -13,6 +17,95 @@ raise ImportError(msg) from e +class DuckDBSerializationAdapter(SerializationAdapter): + """Adapter for DuckDB with support for native JSON and TEXT storage modes.""" + + _native_storage: bool + _value_column: str + + def __init__(self, *, native_storage: bool = True) -> None: + """Initialize 
the DuckDB adapter. + + Args: + native_storage: If True, use JSON column for native dict storage. + If False, use TEXT column for stringified JSON. + """ + super().__init__() + + self._native_storage = native_storage + self._date_format = "datetime" + # Always use string format - DuckDB needs JSON strings for both TEXT and JSON columns + self._value_format = "string" + self._value_column = "value_dict" if native_storage else "value_json" + + @override + def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]: + """Prepare data for dumping to DuckDB. + + Moves the value to the appropriate column (value_dict or value_json) + and sets the other column to None. + """ + value = data.pop("value") + + # Set both columns to None, then populate the appropriate one + data["value_json"] = None + data["value_dict"] = None + + if self._native_storage: + # For native storage, we pass the JSON string to DuckDB's JSON column + # DuckDB will parse it and store it as native JSON + data["value_dict"] = value + else: + # For TEXT storage, value should be a JSON string + data["value_json"] = value + + return data + + @override + def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]: + """Prepare data loaded from DuckDB for conversion to ManagedEntry. + + Extracts value from the appropriate column and handles timezone conversion + for DuckDB's naive timestamps. + """ + value_json = data.pop("value_json", None) + value_dict = data.pop("value_dict", None) + + # Determine which value column to use (prefer value_dict if present) + if value_dict is not None: + # Native storage mode - value_dict can be dict or string (DuckDB JSON returns as string) + if isinstance(value_dict, dict): + data["value"] = value_dict + elif isinstance(value_dict, str): + # DuckDB sometimes returns JSON as string, parse it + data["value"] = json.loads(value_dict) + else: + msg = f"value_dict has unexpected type: {type(value_dict)}" + raise DeserializationError(message=msg) + elif value_json is not None: + # Stringified JSON mode - parse from string + if isinstance(value_json, str): + data["value"] = json.loads(value_json) + else: + msg = f"value_json has unexpected type: {type(value_json)}" + raise DeserializationError(message=msg) + else: + msg = "Neither value_dict nor value_json column contains data" + raise DeserializationError(message=msg) + + # DuckDB always returns naive timestamps, but ManagedEntry expects timezone-aware ones + # Convert to timezone-aware UTC timestamps. Handle None values explicitly. + created_at = data.get("created_at") + if created_at is not None and created_at.tzinfo is None: + data["created_at"] = created_at.replace(tzinfo=timezone.utc) + + expires_at = data.get("expires_at") + if expires_at is not None and expires_at.tzinfo is None: + data["expires_at"] = expires_at.replace(tzinfo=timezone.utc) + + return data + + class DuckDBStore(BaseContextManagerStore, BaseStore): """A DuckDB-based key-value store supporting both in-memory and persistent storage. 
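
# A hedged editorial sketch, not part of the patch: the roundtrip the adapter
# above is meant to support. dump_dict and load_dict come from the
# SerializationAdapter base class exactly as they are called later in this
# diff; constructing ManagedEntry(value=...) follows earlier patches in the
# series. The assertions reflect the prepare_dump/prepare_load logic shown
# above and are illustrative, not a definitive contract.
from key_value.shared.utils.managed_entry import ManagedEntry

adapter = DuckDBSerializationAdapter(native_storage=True)

# prepare_dump routes the serialized value into value_dict (the JSON column)
# and leaves value_json as None; with native_storage=False the roles swap.
row = adapter.dump_dict(entry=ManagedEntry(value={"name": "Widget"}), exclude_none=False)
assert row["value_json"] is None
assert row["value_dict"] is not None

# prepare_load accepts either a dict or a JSON string in value_dict and
# normalizes naive timestamps to UTC before rebuilding the entry.
restored = adapter.load_dict(data=row)
assert restored.value == {"name": "Widget"}
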
@@ -35,7 +128,7 @@ class DuckDBStore(BaseContextManagerStore, BaseStore): _connection: duckdb.DuckDBPyConnection _is_closed: bool _owns_connection: bool - _native_storage: bool + _adapter: SerializationAdapter _table_name: str @overload @@ -125,7 +218,7 @@ def __init__( self._owns_connection = True self._is_closed = False - self._native_storage = native_storage + self._adapter = DuckDBSerializationAdapter(native_storage=native_storage) self._table_name = table_name self._stable_api = False @@ -239,8 +332,8 @@ async def _setup(self) -> None: async def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry | None: """Retrieve a managed entry by key from the specified collection. - Reconstructs the ManagedEntry from value columns and metadata columns. - Tries value_dict first (native storage), falls back to value_json (stringified). + Reconstructs the ManagedEntry from value columns and metadata columns + using the serialization adapter. """ if self._is_closed: msg = "Cannot operate on closed DuckDBStore" @@ -254,45 +347,23 @@ async def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry if result is None: return None - value_json, value_dict, created_at, ttl, expires_at = result + value_json, value_dict, created_at, _ttl, expires_at = result - # Determine which value column to use (prefer value_dict if present) - import json - - value: dict[str, Any] - if value_dict is not None: - # Native storage mode - value_dict can be dict or string (DuckDB JSON returns as string) - if isinstance(value_dict, dict): - value = cast(dict[str, Any], value_dict) - elif isinstance(value_dict, str): - # DuckDB sometimes returns JSON as string - value = json.loads(value_dict) - else: - msg = f"value_dict has unexpected type: {type(value_dict)}" - raise TypeError(msg) - elif value_json is not None: - # Stringified JSON mode - parse from string - value = json.loads(value_json) - else: - # Neither column has data - this shouldn't happen - return None + # Build document dict for the adapter (exclude None values) + document: dict[str, Any] = { + "value_json": value_json, + "value_dict": value_dict, + } - # DuckDB always returns naive timestamps, but ManagedEntry expects timezone-aware ones - # Convert to timezone-aware UTC timestamps - from datetime import timezone + if created_at is not None: + document["created_at"] = created_at + if expires_at is not None: + document["expires_at"] = expires_at - if created_at is not None and created_at.tzinfo is None: - created_at = created_at.replace(tzinfo=timezone.utc) - if expires_at is not None and expires_at.tzinfo is None: - expires_at = expires_at.replace(tzinfo=timezone.utc) - - # Reconstruct ManagedEntry with metadata from columns - return ManagedEntry( - value=value, - created_at=created_at, - ttl=ttl, - expires_at=expires_at, - ) + try: + return self._adapter.load_dict(data=document) + except DeserializationError: + return None @override async def _put_managed_entry( @@ -304,36 +375,15 @@ async def _put_managed_entry( ) -> None: """Store a managed entry by key in the specified collection. - Stores the value and metadata separately: - - value_json/value_dict: Stores value based on native_storage setting - - created_at, ttl, expires_at: Stored in native columns for efficient querying + Uses the serialization adapter to convert the ManagedEntry to the + appropriate storage format. 
""" if self._is_closed: msg = "Cannot operate on closed DuckDBStore" raise RuntimeError(msg) - # Store in appropriate column based on native_storage setting - value_json: str | None = None - value_dict: str | None = None - - if self._native_storage: - # Native storage: store as JSON string in JSON column (DuckDB will handle as JSON type) - # We use value_as_json to ensure serialization errors are caught - value_dict = managed_entry.value_as_json - else: - # Stringified storage: store JSON string in TEXT column - value_json = managed_entry.value_as_json - - # Ensure timestamps are timezone-aware (convert naive to UTC if needed) - from datetime import timezone - - created_at = managed_entry.created_at - if created_at is not None and created_at.tzinfo is None: - created_at = created_at.replace(tzinfo=timezone.utc) - - expires_at = managed_entry.expires_at - if expires_at is not None and expires_at.tzinfo is None: - expires_at = expires_at.replace(tzinfo=timezone.utc) + # Use adapter to dump the managed entry to a dict + document = self._adapter.dump_dict(entry=managed_entry, exclude_none=False) # Insert or replace the entry with metadata in separate columns self._connection.execute( @@ -341,11 +391,11 @@ async def _put_managed_entry( [ collection, key, - value_json, - value_dict, - created_at, + document["value_json"], + document["value_dict"], + document.get("created_at"), managed_entry.ttl, - expires_at, + document.get("expires_at"), ], ) diff --git a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py index 2cf80cc4..3aa57e8d 100644 --- a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py +++ b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py @@ -104,7 +104,7 @@ async def test_native_sql_queryability(self): assert len(result) == 1 # Only item2 has ttl > 3600 assert result[0][0] == "item2" - assert result[0][1] == 7200 + assert abs(result[0][1] - 7200) < 1 # TTL should be approximately 7200 (floating point precision) assert result[0][2] is True # has_created await store.close() From 91d55b867e1717f3bc8286d0e5f6b3a6f9693cc6 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sun, 2 Nov 2025 00:52:34 +0000 Subject: [PATCH 08/12] fix: resolve type checking errors in DuckDB tests - Add pyright: ignore comments for intentional _connection access in tests - Add null assertions for fetchone() results - Move type ignore comments outside SQL strings to avoid DuckDB parse errors - All 406 DuckDB tests now passing Co-authored-by: William Easton --- .../tests/stores/duckdb/test_duckdb.py | 35 ++++++++++--------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py index 3aa57e8d..976de18c 100644 --- a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py +++ b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py @@ -79,7 +79,7 @@ async def test_native_sql_queryability(self): FROM kv_entries WHERE collection = 'products' ORDER BY key - """).fetchall() + """).fetchall() # pyright: ignore[reportPrivateUsage] assert len(result) == 2 assert result[0][0] == "item1" @@ -87,20 +87,21 @@ async def test_native_sql_queryability(self): assert result[1][0] == "item2" # Query by expiration timestamp - result = store._connection.execute(""" + count_result = store._connection.execute(""" SELECT COUNT(*) FROM kv_entries WHERE expires_at > now() OR 
expires_at IS NULL - """).fetchone() + """).fetchone() # pyright: ignore[reportPrivateUsage] - assert result[0] == 3 # All 3 entries should not be expired + assert count_result is not None + assert count_result[0] == 3 # All 3 entries should not be expired # Query metadata columns directly result = store._connection.execute(""" SELECT key, ttl, created_at IS NOT NULL as has_created FROM kv_entries WHERE collection = 'products' AND ttl > 3600 - """).fetchall() + """).fetchall() # pyright: ignore[reportPrivateUsage] assert len(result) == 1 # Only item2 has ttl > 3600 assert result[0][0] == "item2" @@ -120,7 +121,7 @@ async def test_text_mode_storage(self): SELECT value_json, value_dict, typeof(value_json) as json_type, typeof(value_dict) as dict_type FROM kv_entries WHERE collection = 'test' AND key = 'key1' - """).fetchone() + """).fetchone() # pyright: ignore[reportPrivateUsage] assert result is not None value_json, value_dict, json_type, _dict_type = result @@ -235,7 +236,7 @@ async def test_custom_table_name(self): SELECT key, collection FROM {custom_table} WHERE key = 'key1' - """).fetchone() # noqa: S608 + """).fetchone() # pyright: ignore[reportPrivateUsage] # noqa: S608 assert result is not None assert result[0] == "key1" @@ -246,7 +247,7 @@ async def test_custom_table_name(self): SELECT table_name FROM information_schema.tables WHERE table_name = 'kv_entries' - """).fetchall() + """).fetchall() # pyright: ignore[reportPrivateUsage] assert len(tables) == 0 @@ -258,14 +259,15 @@ async def test_native_vs_stringified_storage(self): store_native = DuckDBStore(native_storage=True) await store_native.put(collection="test", key="key1", value={"name": "native"}) - result = store_native._connection.execute(""" + result_native = store_native._connection.execute(""" SELECT value_dict, value_json FROM kv_entries WHERE key = 'key1' - """).fetchone() + """).fetchone() # pyright: ignore[reportPrivateUsage] - assert result[0] is not None # value_dict should be populated - assert result[1] is None # value_json should be NULL + assert result_native is not None + assert result_native[0] is not None # value_dict should be populated + assert result_native[1] is None # value_json should be NULL await store_native.close() @@ -273,14 +275,15 @@ async def test_native_vs_stringified_storage(self): store_string = DuckDBStore(native_storage=False) await store_string.put(collection="test", key="key2", value={"name": "stringified"}) - result = store_string._connection.execute(""" + result_string = store_string._connection.execute(""" SELECT value_dict, value_json FROM kv_entries WHERE key = 'key2' - """).fetchone() + """).fetchone() # pyright: ignore[reportPrivateUsage] - assert result[0] is None # value_dict should be NULL - assert result[1] is not None # value_json should be populated + assert result_string is not None + assert result_string[0] is None # value_dict should be NULL + assert result_string[1] is not None # value_json should be populated await store_string.close() From d261b377910d794de389824466c60b8a278f9163 Mon Sep 17 00:00:00 2001 From: William Easton Date: Sat, 1 Nov 2025 22:35:47 -0500 Subject: [PATCH 09/12] Fix DuckDB TTL Handling --- key-value/key-value-aio/pyproject.toml | 2 +- .../src/key_value/aio/stores/duckdb/store.py | 31 +- .../tests/stores/duckdb/test_duckdb.py | 81 ++-- key-value/key-value-sync/pyproject.toml | 3 +- .../sync/code_gen/stores/duckdb/__init__.py | 6 + .../sync/code_gen/stores/duckdb/store.py | 380 ++++++++++++++++++ .../key_value/sync/stores/duckdb/__init__.py | 6 + 
.../tests/code_gen/stores/duckdb/__init__.py | 4 + .../code_gen/stores/duckdb/test_duckdb.py | 321 +++++++++++++++ pyproject.toml | 1 + uv.lock | 25 +- 11 files changed, 809 insertions(+), 51 deletions(-) create mode 100644 key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/__init__.py create mode 100644 key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/store.py create mode 100644 key-value/key-value-sync/src/key_value/sync/stores/duckdb/__init__.py create mode 100644 key-value/key-value-sync/tests/code_gen/stores/duckdb/__init__.py create mode 100644 key-value/key-value-sync/tests/code_gen/stores/duckdb/test_duckdb.py diff --git a/key-value/key-value-aio/pyproject.toml b/key-value/key-value-aio/pyproject.toml index bd8e702b..f5b3830a 100644 --- a/key-value/key-value-aio/pyproject.toml +++ b/key-value/key-value-aio/pyproject.toml @@ -48,7 +48,7 @@ rocksdb = [ "rocksdict>=0.3.24 ; python_version >= '3.12'", # RocksDB 0.3.24 is the first version to support Python 3.13 "rocksdict>=0.3.2 ; python_version < '3.12'" ] -duckdb = ["duckdb>=1.0.0"] +duckdb = ["duckdb>=1.0.0", "pytz>=2025.2"] wrappers-encryption = ["cryptography>=45.0.0"] [tool.pytest.ini_options] diff --git a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py index b4de1529..fee22312 100644 --- a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py +++ b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py @@ -1,5 +1,5 @@ import json -from datetime import timezone +from datetime import datetime, timezone from pathlib import Path from typing import Any, overload @@ -21,7 +21,6 @@ class DuckDBSerializationAdapter(SerializationAdapter): """Adapter for DuckDB with support for native JSON and TEXT storage modes.""" _native_storage: bool - _value_column: str def __init__(self, *, native_storage: bool = True) -> None: """Initialize the DuckDB adapter. @@ -36,7 +35,6 @@ def __init__(self, *, native_storage: bool = True) -> None: self._date_format = "datetime" # Always use string format - DuckDB needs JSON strings for both TEXT and JSON columns self._value_format = "string" - self._value_column = "value_dict" if native_storage else "value_json" @override def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]: @@ -96,12 +94,12 @@ def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]: # DuckDB always returns naive timestamps, but ManagedEntry expects timezone-aware ones # Convert to timezone-aware UTC timestamps. Handle None values explicitly. created_at = data.get("created_at") - if created_at is not None and created_at.tzinfo is None: - data["created_at"] = created_at.replace(tzinfo=timezone.utc) + if created_at is not None and isinstance(created_at, datetime) and created_at.tzinfo is None: + data["created_at"] = created_at.astimezone(tz=timezone.utc) expires_at = data.get("expires_at") - if expires_at is not None and expires_at.tzinfo is None: - data["expires_at"] = expires_at.replace(tzinfo=timezone.utc) + if expires_at is not None and isinstance(expires_at, datetime) and expires_at.tzinfo is None: + data["expires_at"] = expires_at.astimezone(tz=timezone.utc) return data @@ -237,7 +235,6 @@ def _get_create_table_sql(self) -> str: value_json TEXT, value_dict JSON, created_at TIMESTAMP, - ttl DOUBLE, expires_at TIMESTAMP, PRIMARY KEY (collection, key) ) @@ -272,7 +269,7 @@ def _get_select_sql(self) -> str: SQL SELECT statement with placeholders. 
""" return f""" - SELECT value_json, value_dict, created_at, ttl, expires_at + SELECT value_json, value_dict, created_at, expires_at FROM {self._table_name} WHERE collection = ? AND key = ? """ # noqa: S608 @@ -285,8 +282,8 @@ def _get_insert_sql(self) -> str: """ return f""" INSERT OR REPLACE INTO {self._table_name} - (collection, key, value_json, value_dict, created_at, ttl, expires_at) - VALUES (?, ?, ?, ?, ?, ?, ?) + (collection, key, value_json, value_dict, created_at, expires_at) + VALUES (?, ?, ?, ?, ?, ?) """ # noqa: S608 def _get_delete_sql(self) -> str: @@ -309,7 +306,6 @@ async def _setup(self) -> None: - value_json: TEXT column storing stringified JSON (used when native_storage=False) - value_dict: JSON column storing native dicts (used when native_storage=True) - created_at: TIMESTAMP for native datetime operations - - ttl: DOUBLE for time-to-live in seconds - expires_at: TIMESTAMP for native expiration queries This design follows the Elasticsearch/MongoDB pattern of separating native and stringified @@ -347,7 +343,7 @@ async def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry if result is None: return None - value_json, value_dict, created_at, _ttl, expires_at = result + value_json, value_dict, created_at, expires_at = result # Build document dict for the adapter (exclude None values) document: dict[str, Any] = { @@ -355,10 +351,10 @@ async def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry "value_dict": value_dict, } - if created_at is not None: - document["created_at"] = created_at - if expires_at is not None: - document["expires_at"] = expires_at + if created_at is not None and isinstance(created_at, datetime): + document["created_at"] = created_at.astimezone(tz=timezone.utc) + if expires_at is not None and isinstance(expires_at, datetime): + document["expires_at"] = expires_at.astimezone(tz=timezone.utc) try: return self._adapter.load_dict(data=document) @@ -394,7 +390,6 @@ async def _put_managed_entry( document["value_json"], document["value_dict"], document.get("created_at"), - managed_entry.ttl, document.get("expires_at"), ], ) diff --git a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py index 976de18c..24cac26c 100644 --- a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py +++ b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py @@ -3,6 +3,9 @@ from tempfile import TemporaryDirectory import pytest +from _duckdb import DuckDBPyConnection +from duckdb import CatalogException +from inline_snapshot import snapshot from typing_extensions import override from key_value.aio.stores.base import BaseStore @@ -10,6 +13,10 @@ from tests.stores.base import BaseStoreTests, ContextManagerStoreTestMixin +def get_client_from_store(store: DuckDBStore) -> DuckDBPyConnection: + return store._connection # pyright: ignore[reportPrivateUsage] + + class TestDuckDBStore(ContextManagerStoreTestMixin, BaseStoreTests): @override @pytest.fixture @@ -74,12 +81,16 @@ async def test_native_sql_queryability(self): # Query directly via SQL to verify native storage # Check that value_dict is stored as JSON (can extract fields) - result = store._connection.execute(""" + result = ( + get_client_from_store(store) + .execute(""" SELECT key, value_dict->'name' as name, value_dict->'price' as price FROM kv_entries WHERE collection = 'products' ORDER BY key - """).fetchall() # pyright: ignore[reportPrivateUsage] + """) + .fetchall() + ) # pyright: ignore[reportPrivateUsage] 
assert len(result) == 2 assert result[0][0] == "item1" @@ -87,21 +98,29 @@ async def test_native_sql_queryability(self): assert result[1][0] == "item2" # Query by expiration timestamp - count_result = store._connection.execute(""" + count_result = ( + get_client_from_store(store) + .execute(""" SELECT COUNT(*) FROM kv_entries WHERE expires_at > now() OR expires_at IS NULL - """).fetchone() # pyright: ignore[reportPrivateUsage] + """) + .fetchone() + ) # pyright: ignore[reportPrivateUsage] assert count_result is not None assert count_result[0] == 3 # All 3 entries should not be expired # Query metadata columns directly - result = store._connection.execute(""" + result = ( + get_client_from_store(store) + .execute(""" SELECT key, ttl, created_at IS NOT NULL as has_created FROM kv_entries WHERE collection = 'products' AND ttl > 3600 - """).fetchall() # pyright: ignore[reportPrivateUsage] + """) + .fetchall() + ) # pyright: ignore[reportPrivateUsage] assert len(result) == 1 # Only item2 has ttl > 3600 assert result[0][0] == "item2" @@ -117,11 +136,15 @@ async def test_text_mode_storage(self): await store.put(collection="test", key="key1", value={"data": "value"}) # Query to check column type - in TEXT mode, value_json should be populated - result = store._connection.execute(""" + result = ( + get_client_from_store(store) + .execute(""" SELECT value_json, value_dict, typeof(value_json) as json_type, typeof(value_dict) as dict_type FROM kv_entries WHERE collection = 'test' AND key = 'key1' - """).fetchone() # pyright: ignore[reportPrivateUsage] + """) + .fetchone() + ) # pyright: ignore[reportPrivateUsage] assert result is not None value_json, value_dict, json_type, _dict_type = result @@ -232,24 +255,20 @@ async def test_custom_table_name(self): await store.put(collection="test", key="key1", value={"data": "value"}) # Verify the custom table exists and contains the data - result = store._connection.execute(f""" - SELECT key, collection - FROM {custom_table} - WHERE key = 'key1' - """).fetchone() # pyright: ignore[reportPrivateUsage] # noqa: S608 + tables = ( + get_client_from_store(store) + .table(custom_table) + .filter(filter_expr="key = 'key1'") + .select("key", "collection") + .execute() + .fetchone() + ) - assert result is not None - assert result[0] == "key1" - assert result[1] == "test" + assert tables == snapshot(("key1", "test")) # Verify default table doesn't exist - tables = store._connection.execute(""" - SELECT table_name - FROM information_schema.tables - WHERE table_name = 'kv_entries' - """).fetchall() # pyright: ignore[reportPrivateUsage] - - assert len(tables) == 0 + with pytest.raises(CatalogException): + get_client_from_store(store).table("kv_entries") await store.close() @@ -259,11 +278,15 @@ async def test_native_vs_stringified_storage(self): store_native = DuckDBStore(native_storage=True) await store_native.put(collection="test", key="key1", value={"name": "native"}) - result_native = store_native._connection.execute(""" + result_native = ( + get_client_from_store(store_native) + .execute(""" SELECT value_dict, value_json FROM kv_entries WHERE key = 'key1' - """).fetchone() # pyright: ignore[reportPrivateUsage] + """) + .fetchone() + ) # pyright: ignore[reportPrivateUsage] assert result_native is not None assert result_native[0] is not None # value_dict should be populated @@ -275,11 +298,15 @@ async def test_native_vs_stringified_storage(self): store_string = DuckDBStore(native_storage=False) await store_string.put(collection="test", key="key2", value={"name": 
"stringified"}) - result_string = store_string._connection.execute(""" + result_string = ( + get_client_from_store(store_string) + .execute(""" SELECT value_dict, value_json FROM kv_entries WHERE key = 'key2' - """).fetchone() # pyright: ignore[reportPrivateUsage] + """) + .fetchone() + ) # pyright: ignore[reportPrivateUsage] assert result_string is not None assert result_string[0] is None # value_dict should be NULL diff --git a/key-value/key-value-sync/pyproject.toml b/key-value/key-value-sync/pyproject.toml index c268ed04..73927a99 100644 --- a/key-value/key-value-sync/pyproject.toml +++ b/key-value/key-value-sync/pyproject.toml @@ -47,6 +47,7 @@ rocksdb = [ "rocksdict>=0.3.24 ; python_version >= '3.12'", # RocksDB 0.3.24 is the first version to support Python 3.13 "rocksdict>=0.3.2 ; python_version < '3.12'" ] +duckdb = ["duckdb>=1.0.0", "pytz>=2025.2"] wrappers-encryption = ["cryptography>=45.0.0"] [tool.pytest.ini_options] @@ -66,7 +67,7 @@ env_files = [".env"] [dependency-groups] dev = [ - "py-key-value-sync[memory,disk,redis,elasticsearch,memcached,mongodb,vault,rocksdb]", + "py-key-value-sync[memory,disk,redis,elasticsearch,memcached,mongodb,vault,rocksdb,duckdb]", "py-key-value-sync[valkey]; platform_system != 'Windows'", "py-key-value-sync[pydantic]", "py-key-value-sync[keyring]", diff --git a/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/__init__.py b/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/__init__.py new file mode 100644 index 00000000..7cde61a6 --- /dev/null +++ b/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/__init__.py @@ -0,0 +1,6 @@ +# WARNING: this file is auto-generated by 'build_sync_library.py' +# from the original file '__init__.py' +# DO NOT CHANGE! Change the original file instead. +from key_value.sync.code_gen.stores.duckdb.store import DuckDBStore + +__all__ = ["DuckDBStore"] diff --git a/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/store.py b/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/store.py new file mode 100644 index 00000000..3196d3c3 --- /dev/null +++ b/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/store.py @@ -0,0 +1,380 @@ +# WARNING: this file is auto-generated by 'build_sync_library.py' +# from the original file 'store.py' +# DO NOT CHANGE! Change the original file instead. +import json +from datetime import datetime, timezone +from pathlib import Path +from typing import Any, overload + +from key_value.shared.errors import DeserializationError +from key_value.shared.utils.managed_entry import ManagedEntry +from key_value.shared.utils.serialization import SerializationAdapter +from typing_extensions import override + +from key_value.sync.code_gen.stores.base import SEED_DATA_TYPE, BaseContextManagerStore, BaseStore + +try: + import duckdb +except ImportError as e: + msg = "DuckDBStore requires py-key-value-aio[duckdb]" + raise ImportError(msg) from e + + +class DuckDBSerializationAdapter(SerializationAdapter): + """Adapter for DuckDB with support for native JSON and TEXT storage modes.""" + + _native_storage: bool + + def __init__(self, *, native_storage: bool = True) -> None: + """Initialize the DuckDB adapter. + + Args: + native_storage: If True, use JSON column for native dict storage. + If False, use TEXT column for stringified JSON. 
+ """ + super().__init__() + + self._native_storage = native_storage + self._date_format = "datetime" + # Always use string format - DuckDB needs JSON strings for both TEXT and JSON columns + self._value_format = "string" + + @override + def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]: + """Prepare data for dumping to DuckDB. + + Moves the value to the appropriate column (value_dict or value_json) + and sets the other column to None. + """ + value = data.pop("value") + + # Set both columns to None, then populate the appropriate one + data["value_json"] = None + data["value_dict"] = None + + if self._native_storage: + # For native storage, we pass the JSON string to DuckDB's JSON column + # DuckDB will parse it and store it as native JSON + data["value_dict"] = value + else: + # For TEXT storage, value should be a JSON string + data["value_json"] = value + + return data + + @override + def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]: + """Prepare data loaded from DuckDB for conversion to ManagedEntry. + + Extracts value from the appropriate column and handles timezone conversion + for DuckDB's naive timestamps. + """ + value_json = data.pop("value_json", None) + value_dict = data.pop("value_dict", None) + + # Determine which value column to use (prefer value_dict if present) + if value_dict is not None: + # Native storage mode - value_dict can be dict or string (DuckDB JSON returns as string) + if isinstance(value_dict, dict): + data["value"] = value_dict + elif isinstance(value_dict, str): + # DuckDB sometimes returns JSON as string, parse it + data["value"] = json.loads(value_dict) + else: + msg = f"value_dict has unexpected type: {type(value_dict)}" + raise DeserializationError(message=msg) + elif value_json is not None: + # Stringified JSON mode - parse from string + if isinstance(value_json, str): + data["value"] = json.loads(value_json) + else: + msg = f"value_json has unexpected type: {type(value_json)}" + raise DeserializationError(message=msg) + else: + msg = "Neither value_dict nor value_json column contains data" + raise DeserializationError(message=msg) + + # DuckDB always returns naive timestamps, but ManagedEntry expects timezone-aware ones + # Convert to timezone-aware UTC timestamps. Handle None values explicitly. + created_at = data.get("created_at") + if created_at is not None and isinstance(created_at, datetime) and (created_at.tzinfo is None): + data["created_at"] = created_at.astimezone(tz=timezone.utc) + + expires_at = data.get("expires_at") + if expires_at is not None and isinstance(expires_at, datetime) and (expires_at.tzinfo is None): + data["expires_at"] = expires_at.astimezone(tz=timezone.utc) + + return data + + +class DuckDBStore(BaseContextManagerStore, BaseStore): + """A DuckDB-based key-value store supporting both in-memory and persistent storage. + + DuckDB is an in-process SQL OLAP database that provides excellent performance + for analytical workloads while supporting standard SQL operations. This store + can operate in memory-only mode or persist data to disk. + + The store uses native DuckDB types (JSON, TIMESTAMP) to enable efficient SQL queries + on stored data. Users can query the database directly for analytics or data exploration. 
+ + Storage modes: + - native_storage=True: Stores values in a JSON column as native dicts for queryability + - native_storage=False: Stores values as stringified JSON in a TEXT column + + Note on connection ownership: When you provide an existing connection, the store + will take ownership and close it when the store is closed or garbage collected. + If you need to reuse a connection, create separate DuckDB connections for each store. + """ + + _connection: duckdb.DuckDBPyConnection + _is_closed: bool + _owns_connection: bool + _adapter: SerializationAdapter + _table_name: str + + @overload + def __init__( + self, + *, + connection: duckdb.DuckDBPyConnection, + table_name: str = "kv_entries", + native_storage: bool = True, + default_collection: str | None = None, + seed: SEED_DATA_TYPE | None = None, + ) -> None: + """Initialize the DuckDB store with an existing connection. + + Warning: The store will take ownership of the connection and close it + when the store is closed or garbage collected. If you need to reuse + a connection, create separate DuckDB connections for each store. + + Args: + connection: An existing DuckDB connection to use. + table_name: Name of the table to store key-value entries. Defaults to "kv_entries". + native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON. + Default is True for better queryability and native type support. + default_collection: The default collection to use if no collection is provided. + seed: Optional seed data to pre-populate the store. + """ + + @overload + def __init__( + self, + *, + database_path: Path | str | None = None, + table_name: str = "kv_entries", + native_storage: bool = True, + default_collection: str | None = None, + seed: SEED_DATA_TYPE | None = None, + ) -> None: + """Initialize the DuckDB store with a database path. + + Args: + database_path: Path to the database file. If None or ':memory:', uses in-memory database. + table_name: Name of the table to store key-value entries. Defaults to "kv_entries". + native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON. + Default is True for better queryability and native type support. + default_collection: The default collection to use if no collection is provided. + seed: Optional seed data to pre-populate the store. + """ + + def __init__( + self, + *, + connection: duckdb.DuckDBPyConnection | None = None, + database_path: Path | str | None = None, + table_name: str = "kv_entries", + native_storage: bool = True, + default_collection: str | None = None, + seed: SEED_DATA_TYPE | None = None, + ) -> None: + """Initialize the DuckDB store. + + Args: + connection: An existing DuckDB connection to use. + database_path: Path to the database file. If None or ':memory:', uses in-memory database. + table_name: Name of the table to store key-value entries. Defaults to "kv_entries". + native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON. + Default is True for better queryability and native type support. + default_collection: The default collection to use if no collection is provided. + seed: Optional seed data to pre-populate the store. 
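+
+        Example (illustrative usage sketch; the file name is hypothetical):
+
+            store = DuckDBStore(database_path="kv.duckdb")
+            store.put(collection="users", key="u1", value={"name": "Ada"})
+            assert store.get(collection="users", key="u1") == {"name": "Ada"}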
+ """ + if connection is not None and database_path is not None: + msg = "Provide only one of connection or database_path" + raise ValueError(msg) + + if connection is not None: + self._connection = connection + self._owns_connection = True # We take ownership even of provided connections + else: + # Convert Path to string if needed + if isinstance(database_path, Path): + database_path = str(database_path) + # Use in-memory database if no path specified + if database_path is None or database_path == ":memory:": + self._connection = duckdb.connect(":memory:") + else: + self._connection = duckdb.connect(database=database_path) + self._owns_connection = True + + self._is_closed = False + self._adapter = DuckDBSerializationAdapter(native_storage=native_storage) + self._table_name = table_name + self._stable_api = False + + super().__init__(default_collection=default_collection, seed=seed) + + def _get_create_table_sql(self) -> str: + """Generate SQL for creating the key-value entries table. + + Returns: + SQL CREATE TABLE statement. + """ + return f"\n CREATE TABLE IF NOT EXISTS {self._table_name} (\n collection VARCHAR NOT NULL,\n key VARCHAR NOT NULL,\n value_json TEXT,\n value_dict JSON,\n created_at TIMESTAMP,\n expires_at TIMESTAMP,\n PRIMARY KEY (collection, key)\n )\n " + + def _get_create_collection_index_sql(self) -> str: + """Generate SQL for creating index on collection column. + + Returns: + SQL CREATE INDEX statement. + """ + return f"\n CREATE INDEX IF NOT EXISTS idx_{self._table_name}_collection\n ON {self._table_name}(collection)\n " + + def _get_create_expires_index_sql(self) -> str: + """Generate SQL for creating index on expires_at column. + + Returns: + SQL CREATE INDEX statement. + """ + return f"\n CREATE INDEX IF NOT EXISTS idx_{self._table_name}_expires_at\n ON {self._table_name}(expires_at)\n " + + def _get_select_sql(self) -> str: + """Generate SQL for selecting an entry by collection and key. + + Returns: + SQL SELECT statement with placeholders. + """ + return f"\n SELECT value_json, value_dict, created_at, expires_at\n FROM {self._table_name}\n WHERE collection = ? AND key = ?\n " # noqa: S608 + + def _get_insert_sql(self) -> str: + """Generate SQL for inserting or replacing an entry. + + Returns: + SQL INSERT OR REPLACE statement with placeholders. + """ + return f"\n INSERT OR REPLACE INTO {self._table_name}\n (collection, key, value_json, value_dict, created_at, expires_at)\n VALUES (?, ?, ?, ?, ?, ?)\n " # noqa: S608 + + def _get_delete_sql(self) -> str: + """Generate SQL for deleting an entry by collection and key. + + Returns: + SQL DELETE statement with RETURNING clause. + """ + return f"\n DELETE FROM {self._table_name}\n WHERE collection = ? AND key = ?\n RETURNING key\n " # noqa: S608 + + @override + def _setup(self) -> None: + """Initialize the database schema for key-value storage. 
+ + The schema uses native DuckDB types for efficient querying: + - value_json: TEXT column storing stringified JSON (used when native_storage=False) + - value_dict: JSON column storing native dicts (used when native_storage=True) + - created_at: TIMESTAMP for native datetime operations + - expires_at: TIMESTAMP for native expiration queries + + This design follows the Elasticsearch/MongoDB pattern of separating native and stringified + storage, enabling: + - Direct SQL queries on the database for analytics (when using native storage) + - Efficient expiration cleanup: DELETE FROM table WHERE expires_at < now() + - Metadata queries without JSON deserialization + - Flexibility to choose between native dict storage and stringified JSON + """ + # Create the main table for storing key-value entries + self._connection.execute(self._get_create_table_sql()) + + # Create index for efficient collection queries + self._connection.execute(self._get_create_collection_index_sql()) + + # Create index for expiration-based queries + self._connection.execute(self._get_create_expires_index_sql()) + + @override + def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry | None: + """Retrieve a managed entry by key from the specified collection. + + Reconstructs the ManagedEntry from value columns and metadata columns + using the serialization adapter. + """ + if self._is_closed: + msg = "Cannot operate on closed DuckDBStore" + raise RuntimeError(msg) + + result = self._connection.execute(self._get_select_sql(), [collection, key]).fetchone() + + if result is None: + return None + + (value_json, value_dict, created_at, expires_at) = result + + # Build document dict for the adapter (exclude None values) + document: dict[str, Any] = {"value_json": value_json, "value_dict": value_dict} + + if created_at is not None and isinstance(created_at, datetime): + document["created_at"] = created_at.astimezone(tz=timezone.utc) + if expires_at is not None and isinstance(expires_at, datetime): + document["expires_at"] = expires_at.astimezone(tz=timezone.utc) + + try: + return self._adapter.load_dict(data=document) + except DeserializationError: + return None + + @override + def _put_managed_entry(self, *, key: str, collection: str, managed_entry: ManagedEntry) -> None: + """Store a managed entry by key in the specified collection. + + Uses the serialization adapter to convert the ManagedEntry to the + appropriate storage format. 
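+
+        For illustration, the bound parameters follow the column order in
+        _get_insert_sql: [collection, key, value_json, value_dict,
+        created_at, expires_at].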
+ """ + if self._is_closed: + msg = "Cannot operate on closed DuckDBStore" + raise RuntimeError(msg) + + # Use adapter to dump the managed entry to a dict + document = self._adapter.dump_dict(entry=managed_entry, exclude_none=False) + + # Insert or replace the entry with metadata in separate columns + self._connection.execute( + self._get_insert_sql(), + [collection, key, document["value_json"], document["value_dict"], document.get("created_at"), document.get("expires_at")], + ) + + @override + def _delete_managed_entry(self, *, key: str, collection: str) -> bool: + """Delete a managed entry by key from the specified collection.""" + if self._is_closed: + msg = "Cannot operate on closed DuckDBStore" + raise RuntimeError(msg) + + result = self._connection.execute(self._get_delete_sql(), [collection, key]) + + # Check if any rows were deleted by counting returned rows + deleted_rows = result.fetchall() + return len(deleted_rows) > 0 + + @override + def _close(self) -> None: + """Close the DuckDB connection.""" + if not self._is_closed and self._owns_connection: + self._connection.close() + self._is_closed = True + + def __del__(self) -> None: + """Clean up the DuckDB connection on deletion.""" + try: + if not self._is_closed and self._owns_connection and hasattr(self, "_connection"): + self._connection.close() + self._is_closed = True + except Exception: # noqa: S110 + # Suppress errors during cleanup to avoid issues during interpreter shutdown + pass diff --git a/key-value/key-value-sync/src/key_value/sync/stores/duckdb/__init__.py b/key-value/key-value-sync/src/key_value/sync/stores/duckdb/__init__.py new file mode 100644 index 00000000..7cde61a6 --- /dev/null +++ b/key-value/key-value-sync/src/key_value/sync/stores/duckdb/__init__.py @@ -0,0 +1,6 @@ +# WARNING: this file is auto-generated by 'build_sync_library.py' +# from the original file '__init__.py' +# DO NOT CHANGE! Change the original file instead. +from key_value.sync.code_gen.stores.duckdb.store import DuckDBStore + +__all__ = ["DuckDBStore"] diff --git a/key-value/key-value-sync/tests/code_gen/stores/duckdb/__init__.py b/key-value/key-value-sync/tests/code_gen/stores/duckdb/__init__.py new file mode 100644 index 00000000..0b0927be --- /dev/null +++ b/key-value/key-value-sync/tests/code_gen/stores/duckdb/__init__.py @@ -0,0 +1,4 @@ +# WARNING: this file is auto-generated by 'build_sync_library.py' +# from the original file '__init__.py' +# DO NOT CHANGE! Change the original file instead. +# DuckDB store tests diff --git a/key-value/key-value-sync/tests/code_gen/stores/duckdb/test_duckdb.py b/key-value/key-value-sync/tests/code_gen/stores/duckdb/test_duckdb.py new file mode 100644 index 00000000..ec4a8027 --- /dev/null +++ b/key-value/key-value-sync/tests/code_gen/stores/duckdb/test_duckdb.py @@ -0,0 +1,321 @@ +# WARNING: this file is auto-generated by 'build_sync_library.py' +# from the original file 'test_duckdb.py' +# DO NOT CHANGE! Change the original file instead. 
+from collections.abc import Generator
+from pathlib import Path
+from tempfile import TemporaryDirectory
+
+import pytest
+from duckdb import CatalogException, DuckDBPyConnection
+from inline_snapshot import snapshot
+from typing_extensions import override
+
+from key_value.sync.code_gen.stores.base import BaseStore
+from key_value.sync.code_gen.stores.duckdb import DuckDBStore
+from tests.code_gen.stores.base import BaseStoreTests, ContextManagerStoreTestMixin
+
+
+def get_client_from_store(store: DuckDBStore) -> DuckDBPyConnection:
+    return store._connection  # pyright: ignore[reportPrivateUsage]
+
+
+class TestDuckDBStore(ContextManagerStoreTestMixin, BaseStoreTests):
+    @override
+    @pytest.fixture
+    def store(self) -> Generator[DuckDBStore, None, None]:
+        """Test with in-memory DuckDB database."""
+        duckdb_store = DuckDBStore()
+        yield duckdb_store
+        duckdb_store.close()
+
+    @pytest.mark.skip(reason="Local disk stores are unbounded")
+    def test_not_unbounded(self, store: BaseStore): ...
+
+
+class TestDuckDBStorePersistent(ContextManagerStoreTestMixin, BaseStoreTests):
+    @override
+    @pytest.fixture
+    def store(self) -> Generator[DuckDBStore, None, None]:
+        """Test with persistent DuckDB database file."""
+        with TemporaryDirectory() as temp_dir:
+            db_path = Path(temp_dir) / "test.db"
+            duckdb_store = DuckDBStore(database_path=db_path)
+            yield duckdb_store
+            duckdb_store.close()
+
+    @pytest.mark.skip(reason="Local disk stores are unbounded")
+    def test_not_unbounded(self, store: BaseStore): ...
+
+
+class TestDuckDBStoreTextMode(ContextManagerStoreTestMixin, BaseStoreTests):
+    """Test DuckDB store with TEXT column mode (stringified JSON) instead of native JSON."""
+
+    @override
+    @pytest.fixture
+    def store(self) -> Generator[DuckDBStore, None, None]:
+        """Test with in-memory DuckDB database using TEXT column for stringified JSON."""
+        duckdb_store = DuckDBStore(native_storage=False)
+        yield duckdb_store
+        duckdb_store.close()
+
+    @pytest.mark.skip(reason="Local disk stores are unbounded")
+    def test_not_unbounded(self, store: BaseStore): ...
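+
+# Illustrative sketch (editorial, not part of the generated suite): the tests
+# below reach the raw table with plain SQL through the helper defined above,
+# along the lines of:
+#
+#     conn = get_client_from_store(store)
+#     rows = conn.execute("SELECT key, collection FROM kv_entries").fetchall()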
+ + +class TestDuckDBStoreSpecific: + """Test DuckDB-specific functionality.""" + + @pytest.fixture + def store(self) -> Generator[DuckDBStore, None, None]: + """Provide DuckDB store instance.""" + duckdb_store = DuckDBStore() + yield duckdb_store + duckdb_store.close() + + def test_native_sql_queryability(self): + """Test that users can query the database directly with SQL.""" + store = DuckDBStore(native_storage=True) + + # Store some test data with known metadata + store.put(collection="products", key="item1", value={"name": "Widget", "price": 10.99}, ttl=3600) + store.put(collection="products", key="item2", value={"name": "Gadget", "price": 25.5}, ttl=7200) + store.put(collection="orders", key="order1", value={"total": 100.0, "items": 3}) + + # Query directly via SQL to verify native storage + # Check that value_dict is stored as JSON (can extract fields) + result = ( + get_client_from_store(store) + .execute(""" + SELECT key, value_dict->'name' as name, value_dict->'price' as price + FROM kv_entries + WHERE collection = 'products' + ORDER BY key + """) + .fetchall() + ) # pyright: ignore[reportPrivateUsage] + + assert len(result) == 2 + assert result[0][0] == "item1" + assert result[0][1] == '"Widget"' # JSON strings are quoted + assert result[1][0] == "item2" + + # Query by expiration timestamp + count_result = ( + get_client_from_store(store) + .execute(""" + SELECT COUNT(*) + FROM kv_entries + WHERE expires_at > now() OR expires_at IS NULL + """) + .fetchone() + ) # pyright: ignore[reportPrivateUsage] + + assert count_result is not None + assert count_result[0] == 3 # All 3 entries should not be expired + + # Query metadata columns directly + result = ( + get_client_from_store(store) + .execute(""" + SELECT key, ttl, created_at IS NOT NULL as has_created + FROM kv_entries + WHERE collection = 'products' AND ttl > 3600 + """) + .fetchall() + ) # pyright: ignore[reportPrivateUsage] + + assert len(result) == 1 # Only item2 has ttl > 3600 + assert result[0][0] == "item2" + assert abs(result[0][1] - 7200) < 1 # TTL should be approximately 7200 (floating point precision) + assert result[0][2] is True # has_created + + store.close() + + def test_text_mode_storage(self): + """Test that TEXT mode stores value as stringified JSON instead of native JSON.""" + store = DuckDBStore(native_storage=False) + + store.put(collection="test", key="key1", value={"data": "value"}) + + # Query to check column type - in TEXT mode, value_json should be populated + result = ( + get_client_from_store(store) + .execute(""" + SELECT value_json, value_dict, typeof(value_json) as json_type, typeof(value_dict) as dict_type + FROM kv_entries + WHERE collection = 'test' AND key = 'key1' + """) + .fetchone() + ) # pyright: ignore[reportPrivateUsage] + + assert result is not None + (value_json, value_dict, json_type, _dict_type) = result + + # In TEXT mode (native_storage=False), value_json should be populated, value_dict should be NULL + assert value_json is not None + assert value_dict is None + assert json_type in ("VARCHAR", "TEXT") + # Value should be a JSON string + assert isinstance(value_json, str) + assert "data" in value_json + + store.close() + + def test_database_path_initialization(self): + """Test that store can be initialized with different database path options.""" + # In-memory (default) + store1 = DuckDBStore() + store1.put(collection="test", key="key1", value={"test": "value1"}) + result1 = store1.get(collection="test", key="key1") + assert result1 == {"test": "value1"} + store1.close() + + # 
Explicit in-memory + store2 = DuckDBStore(database_path=":memory:") + store2.put(collection="test", key="key2", value={"test": "value2"}) + result2 = store2.get(collection="test", key="key2") + assert result2 == {"test": "value2"} + store2.close() + + def test_persistent_database(self): + """Test that data persists across store instances when using file database.""" + with TemporaryDirectory() as temp_dir: + db_path = Path(temp_dir) / "persist_test.db" + + # Store data in first instance + store1 = DuckDBStore(database_path=db_path) + store1.put(collection="test", key="persist_key", value={"data": "persistent"}) + store1.close() + + # Create second instance with same database file + store2 = DuckDBStore(database_path=db_path) + result = store2.get(collection="test", key="persist_key") + store2.close() + + assert result == {"data": "persistent"} + + def test_sql_injection_protection(self, store: DuckDBStore): + """Test that the store is protected against SQL injection attacks.""" + malicious_collection = "test'; DROP TABLE kv_entries; --" + malicious_key = "key'; DELETE FROM kv_entries; --" + + # These operations should not cause SQL injection + store.put(collection=malicious_collection, key=malicious_key, value={"safe": "data"}) + result = store.get(collection=malicious_collection, key=malicious_key) + assert result == {"safe": "data"} + + # Verify the table still exists and other data is safe + store.put(collection="normal", key="normal_key", value={"normal": "data"}) + normal_result = store.get(collection="normal", key="normal_key") + assert normal_result == {"normal": "data"} + + def test_large_data_storage(self, store: DuckDBStore): + """Test storing and retrieving large data values.""" + # Create a large value (1MB of data) + large_value = {"large_data": "x" * (1024 * 1024)} + + store.put(collection="test", key="large_key", value=large_value) + result = store.get(collection="test", key="large_key") + + assert result == large_value + + def test_unicode_support(self, store: DuckDBStore): + """Test that the store properly handles Unicode characters.""" + unicode_data = { + "english": "Hello World", + "chinese": "ไฝ ๅฅฝไธ–็•Œ", + "japanese": "ใ“ใ‚“ใซใกใฏไธ–็•Œ", + "arabic": "ู…ุฑุญุจุง ุจุงู„ุนุงู„ู…", + "emoji": "๐ŸŒ๐Ÿš€๐Ÿ’ป", + "special": "Special chars: !@#$%^&*()_+-={}[]|\\:;\"'<>?,./", + } + + store.put(collection="unicode_test", key="unicode_key", value=unicode_data) + result = store.get(collection="unicode_test", key="unicode_key") + + assert result == unicode_data + + def test_connection_initialization(self): + """Test that store can be initialized with existing DuckDB connection.""" + import duckdb + + conn = duckdb.connect(":memory:") + store = DuckDBStore(connection=conn) + + store.put(collection="test", key="conn_test", value={"test": "value"}) + result = store.get(collection="test", key="conn_test") + assert result == {"test": "value"} + + store.close() + + def test_custom_table_name(self): + """Test that store can use custom table name.""" + custom_table = "my_custom_kv_table" + store = DuckDBStore(table_name=custom_table) + + # Store some data + store.put(collection="test", key="key1", value={"data": "value"}) + + # Verify the custom table exists and contains the data + tables = ( + get_client_from_store(store) + .table(custom_table) + .filter(filter_expr="key = 'key1'") + .select("key", "collection") + .execute() + .fetchone() + ) + + assert tables == snapshot(("key1", "test")) + + # Verify default table doesn't exist + with pytest.raises(CatalogException): + 
get_client_from_store(store).table("kv_entries") + + store.close() + + def test_native_vs_stringified_storage(self): + """Test that native and stringified storage modes work correctly.""" + # Native storage (default) + store_native = DuckDBStore(native_storage=True) + store_native.put(collection="test", key="key1", value={"name": "native"}) + + result_native = ( + get_client_from_store(store_native) + .execute(""" + SELECT value_dict, value_json + FROM kv_entries + WHERE key = 'key1' + """) + .fetchone() + ) # pyright: ignore[reportPrivateUsage] + + assert result_native is not None + assert result_native[0] is not None # value_dict should be populated + assert result_native[1] is None # value_json should be NULL + + store_native.close() + + # Stringified storage + store_string = DuckDBStore(native_storage=False) + store_string.put(collection="test", key="key2", value={"name": "stringified"}) + + result_string = ( + get_client_from_store(store_string) + .execute(""" + SELECT value_dict, value_json + FROM kv_entries + WHERE key = 'key2' + """) + .fetchone() + ) # pyright: ignore[reportPrivateUsage] + + assert result_string is not None + assert result_string[0] is None # value_dict should be NULL + assert result_string[1] is not None # value_json should be populated + + store_string.close() + + @pytest.mark.skip(reason="Local disk stores are unbounded") + def test_not_unbounded(self, store: BaseStore): ... diff --git a/pyproject.toml b/pyproject.toml index fef6cc86..e78d0e31 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -73,6 +73,7 @@ line-length = 140 "**/code_gen/**/*.py" = [ "ARG001", # Unused argument, Pyright captures this for us "ARG002", # Unused argument, Pyright captures this for us + "E501", # Ignore long lines ] [project.optional-dependencies] diff --git a/uv.lock b/uv.lock index 22953090..cbc8d7b7 100644 --- a/uv.lock +++ b/uv.lock @@ -1,5 +1,5 @@ version = 1 -revision = 3 +revision = 2 requires-python = ">=3.10" resolution-markers = [ "python_full_version >= '3.12' and sys_platform != 'win32'", @@ -1744,6 +1744,7 @@ disk = [ ] duckdb = [ { name = "duckdb" }, + { name = "pytz" }, ] dynamodb = [ { name = "aioboto3" }, @@ -1815,6 +1816,7 @@ requires-dist = [ { name = "py-key-value-shared", editable = "key-value/key-value-shared" }, { name = "pydantic", marker = "extra == 'pydantic'", specifier = ">=2.11.9" }, { name = "pymongo", marker = "extra == 'mongodb'", specifier = ">=4.0.0" }, + { name = "pytz", marker = "extra == 'duckdb'", specifier = ">=2025.2" }, { name = "redis", marker = "extra == 'redis'", specifier = ">=4.3.0" }, { name = "rocksdict", marker = "python_full_version >= '3.12' and extra == 'rocksdb'", specifier = ">=0.3.24" }, { name = "rocksdict", marker = "python_full_version < '3.12' and extra == 'rocksdb'", specifier = ">=0.3.2" }, @@ -1908,6 +1910,10 @@ disk = [ { name = "diskcache" }, { name = "pathvalidate" }, ] +duckdb = [ + { name = "duckdb" }, + { name = "pytz" }, +] elasticsearch = [ { name = "aiohttp" }, { name = "elasticsearch" }, @@ -1951,7 +1957,7 @@ wrappers-encryption = [ [package.dev-dependencies] dev = [ { name = "py-key-value", extra = ["dev"] }, - { name = "py-key-value-sync", extra = ["disk", "elasticsearch", "keyring", "memcached", "memory", "mongodb", "pydantic", "redis", "rocksdb", "vault", "wrappers-encryption"] }, + { name = "py-key-value-sync", extra = ["disk", "duckdb", "elasticsearch", "keyring", "memcached", "memory", "mongodb", "pydantic", "redis", "rocksdb", "vault", "wrappers-encryption"] }, { name = "py-key-value-sync", extra = 
["valkey"], marker = "sys_platform != 'win32'" }, ] @@ -1964,6 +1970,7 @@ requires-dist = [ { name = "cryptography", marker = "extra == 'wrappers-encryption'", specifier = ">=45.0.0" }, { name = "dbus-python", marker = "extra == 'keyring-linux'", specifier = ">=1.4.0" }, { name = "diskcache", marker = "extra == 'disk'", specifier = ">=5.0.0" }, + { name = "duckdb", marker = "extra == 'duckdb'", specifier = ">=1.0.0" }, { name = "elasticsearch", marker = "extra == 'elasticsearch'", specifier = ">=8.0.0" }, { name = "hvac", marker = "extra == 'vault'", specifier = ">=2.3.0" }, { name = "keyring", marker = "extra == 'keyring'", specifier = ">=25.6.0" }, @@ -1972,19 +1979,20 @@ requires-dist = [ { name = "py-key-value-shared", editable = "key-value/key-value-shared" }, { name = "pydantic", marker = "extra == 'pydantic'", specifier = ">=2.11.9" }, { name = "pymongo", marker = "extra == 'mongodb'", specifier = ">=4.0.0" }, + { name = "pytz", marker = "extra == 'duckdb'", specifier = ">=2025.2" }, { name = "redis", marker = "extra == 'redis'", specifier = ">=4.3.0" }, { name = "rocksdict", marker = "python_full_version >= '3.12' and extra == 'rocksdb'", specifier = ">=0.3.24" }, { name = "rocksdict", marker = "python_full_version < '3.12' and extra == 'rocksdb'", specifier = ">=0.3.2" }, { name = "types-hvac", marker = "extra == 'vault'", specifier = ">=2.3.0" }, { name = "valkey-glide-sync", marker = "extra == 'valkey'", specifier = ">=2.1.0" }, ] -provides-extras = ["memory", "disk", "redis", "mongodb", "valkey", "vault", "memcached", "elasticsearch", "pydantic", "keyring", "keyring-linux", "rocksdb", "wrappers-encryption"] +provides-extras = ["memory", "disk", "redis", "mongodb", "valkey", "vault", "memcached", "elasticsearch", "pydantic", "keyring", "keyring-linux", "rocksdb", "duckdb", "wrappers-encryption"] [package.metadata.requires-dev] dev = [ { name = "py-key-value", extras = ["dev"], editable = "." 
}, { name = "py-key-value-sync", extras = ["keyring"] }, - { name = "py-key-value-sync", extras = ["memory", "disk", "redis", "elasticsearch", "memcached", "mongodb", "vault", "rocksdb"] }, + { name = "py-key-value-sync", extras = ["memory", "disk", "redis", "elasticsearch", "memcached", "mongodb", "vault", "rocksdb", "duckdb"] }, { name = "py-key-value-sync", extras = ["pydantic"] }, { name = "py-key-value-sync", extras = ["valkey"], marker = "sys_platform != 'win32'" }, { name = "py-key-value-sync", extras = ["wrappers-encryption"] }, @@ -2324,6 +2332,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" }, ] +[[package]] +name = "pytz" +version = "2025.2" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/f8/bf/abbd3cdfb8fbc7fb3d4d38d320f2441b1e7cbe29be4f23797b4a2b5d8aac/pytz-2025.2.tar.gz", hash = "sha256:360b9e3dbb49a209c21ad61809c7fb453643e048b38924c765813546746e81c3", size = 320884, upload-time = "2025-03-25T02:25:00.538Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/81/c4/34e93fe5f5429d7570ec1fa436f1986fb1f00c3e0f43a589fe2bbcd22c3f/pytz-2025.2-py2.py3-none-any.whl", hash = "sha256:5ddf76296dd8c44c26eb8f4b6f35488f3ccbf6fbbd7adee0b7262d43f0ec2f00", size = 509225, upload-time = "2025-03-25T02:24:58.468Z" }, +] + [[package]] name = "pywin32" version = "311" From 40fbc823e63c2d1805e5400d2aa5840aff1f583b Mon Sep 17 00:00:00 2001 From: William Easton Date: Sun, 2 Nov 2025 08:30:49 -0600 Subject: [PATCH 10/12] updates for duckdb tests --- .../tests/stores/duckdb/test_duckdb.py | 20 ++++--------------- .../code_gen/stores/duckdb/test_duckdb.py | 20 ++++--------------- 2 files changed, 8 insertions(+), 32 deletions(-) diff --git a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py index 24cac26c..d939bc4c 100644 --- a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py +++ b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py @@ -17,6 +17,7 @@ def get_client_from_store(store: DuckDBStore) -> DuckDBPyConnection: return store._connection # pyright: ignore[reportPrivateUsage] +@pytest.mark.filterwarnings("ignore:A configured store is unstable and may change in a backwards incompatible way. Use at your own risk.") class TestDuckDBStore(ContextManagerStoreTestMixin, BaseStoreTests): @override @pytest.fixture @@ -30,6 +31,7 @@ async def store(self) -> AsyncGenerator[DuckDBStore, None]: async def test_not_unbounded(self, store: BaseStore): ... +@pytest.mark.filterwarnings("ignore:A configured store is unstable and may change in a backwards incompatible way. Use at your own risk.") class TestDuckDBStorePersistent(ContextManagerStoreTestMixin, BaseStoreTests): @override @pytest.fixture @@ -45,6 +47,7 @@ async def store(self) -> AsyncGenerator[DuckDBStore, None]: async def test_not_unbounded(self, store: BaseStore): ... +@pytest.mark.filterwarnings("ignore:A configured store is unstable and may change in a backwards incompatible way. 
Use at your own risk.") class TestDuckDBStoreTextMode(ContextManagerStoreTestMixin, BaseStoreTests): """Test DuckDB store with TEXT column mode (stringified JSON) instead of native JSON.""" @@ -60,6 +63,7 @@ async def store(self) -> AsyncGenerator[DuckDBStore, None]: async def test_not_unbounded(self, store: BaseStore): ... +@pytest.mark.filterwarnings("ignore:A configured store is unstable and may change in a backwards incompatible way. Use at your own risk.") class TestDuckDBStoreSpecific: """Test DuckDB-specific functionality.""" @@ -111,22 +115,6 @@ async def test_native_sql_queryability(self): assert count_result is not None assert count_result[0] == 3 # All 3 entries should not be expired - # Query metadata columns directly - result = ( - get_client_from_store(store) - .execute(""" - SELECT key, ttl, created_at IS NOT NULL as has_created - FROM kv_entries - WHERE collection = 'products' AND ttl > 3600 - """) - .fetchall() - ) # pyright: ignore[reportPrivateUsage] - - assert len(result) == 1 # Only item2 has ttl > 3600 - assert result[0][0] == "item2" - assert abs(result[0][1] - 7200) < 1 # TTL should be approximately 7200 (floating point precision) - assert result[0][2] is True # has_created - await store.close() async def test_text_mode_storage(self): diff --git a/key-value/key-value-sync/tests/code_gen/stores/duckdb/test_duckdb.py b/key-value/key-value-sync/tests/code_gen/stores/duckdb/test_duckdb.py index ec4a8027..07c40360 100644 --- a/key-value/key-value-sync/tests/code_gen/stores/duckdb/test_duckdb.py +++ b/key-value/key-value-sync/tests/code_gen/stores/duckdb/test_duckdb.py @@ -20,6 +20,7 @@ def get_client_from_store(store: DuckDBStore) -> DuckDBPyConnection: return store._connection # pyright: ignore[reportPrivateUsage] +@pytest.mark.filterwarnings("ignore:A configured store is unstable and may change in a backwards incompatible way. Use at your own risk.") class TestDuckDBStore(ContextManagerStoreTestMixin, BaseStoreTests): @override @pytest.fixture @@ -33,6 +34,7 @@ def store(self) -> Generator[DuckDBStore, None, None]: def test_not_unbounded(self, store: BaseStore): ... +@pytest.mark.filterwarnings("ignore:A configured store is unstable and may change in a backwards incompatible way. Use at your own risk.") class TestDuckDBStorePersistent(ContextManagerStoreTestMixin, BaseStoreTests): @override @pytest.fixture @@ -48,6 +50,7 @@ def store(self) -> Generator[DuckDBStore, None, None]: def test_not_unbounded(self, store: BaseStore): ... +@pytest.mark.filterwarnings("ignore:A configured store is unstable and may change in a backwards incompatible way. Use at your own risk.") class TestDuckDBStoreTextMode(ContextManagerStoreTestMixin, BaseStoreTests): """Test DuckDB store with TEXT column mode (stringified JSON) instead of native JSON.""" @@ -63,6 +66,7 @@ def store(self) -> Generator[DuckDBStore, None, None]: def test_not_unbounded(self, store: BaseStore): ... +@pytest.mark.filterwarnings("ignore:A configured store is unstable and may change in a backwards incompatible way. 
Use at your own risk.") class TestDuckDBStoreSpecific: """Test DuckDB-specific functionality.""" @@ -114,22 +118,6 @@ def test_native_sql_queryability(self): assert count_result is not None assert count_result[0] == 3 # All 3 entries should not be expired - # Query metadata columns directly - result = ( - get_client_from_store(store) - .execute(""" - SELECT key, ttl, created_at IS NOT NULL as has_created - FROM kv_entries - WHERE collection = 'products' AND ttl > 3600 - """) - .fetchall() - ) # pyright: ignore[reportPrivateUsage] - - assert len(result) == 1 # Only item2 has ttl > 3600 - assert result[0][0] == "item2" - assert abs(result[0][1] - 7200) < 1 # TTL should be approximately 7200 (floating point precision) - assert result[0][2] is True # has_created - store.close() def test_text_mode_storage(self): From 2a2094f2145b85a08e8d3f9e679352e06ababceb Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Fri, 7 Nov 2025 23:43:03 +0000 Subject: [PATCH 11/12] feat: update DuckDB store for PR #204 serialization changes - Add version, key, and collection fields to serialized JSON documents - Update DuckDBSerializationAdapter to store metadata in JSON columns - Split prepare_load into helper methods for better code organization - Add type annotations to satisfy type checker - Regenerate sync library Co-authored-by: William Easton --- .../src/key_value/aio/stores/duckdb/store.py | 90 +++++++++++++------ .../sync/code_gen/stores/duckdb/store.py | 90 +++++++++++++------ 2 files changed, 122 insertions(+), 58 deletions(-) diff --git a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py index fee22312..27339583 100644 --- a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py +++ b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py @@ -1,7 +1,7 @@ import json from datetime import datetime, timezone from pathlib import Path -from typing import Any, overload +from typing import Any, cast, overload from key_value.shared.errors import DeserializationError from key_value.shared.utils.managed_entry import ManagedEntry @@ -41,21 +41,37 @@ def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]: """Prepare data for dumping to DuckDB. Moves the value to the appropriate column (value_dict or value_json) - and sets the other column to None. + and sets the other column to None. Also includes version, key, and collection + fields in the JSON for compatibility with deserialization. 
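+
+        Illustrative shape of the stored JSON document (field values are
+        assumed, for exposition only):
+
+            {"value": {...}, "version": ..., "key": "item1", "collection": "products"}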
""" value = data.pop("value") + # Extract version, key, and collection to include in the JSON + version = data.pop("version", None) + key = data.pop("key", None) + collection_name = data.pop("collection", None) + + # Build the document to store in JSON columns + json_document: dict[str, Any] = {"value": value} + + if version is not None: + json_document["version"] = version + if key is not None: + json_document["key"] = key + if collection_name is not None: + json_document["collection"] = collection_name + # Set both columns to None, then populate the appropriate one data["value_json"] = None data["value_dict"] = None if self._native_storage: - # For native storage, we pass the JSON string to DuckDB's JSON column + # For native storage, convert the document to JSON string for DuckDB's JSON column # DuckDB will parse it and store it as native JSON - data["value_dict"] = value + data["value_dict"] = json.dumps(json_document) else: - # For TEXT storage, value should be a JSON string - data["value_json"] = value + # For TEXT storage, store as JSON string + data["value_json"] = json.dumps(json_document) return data @@ -63,36 +79,54 @@ def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]: def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]: """Prepare data loaded from DuckDB for conversion to ManagedEntry. - Extracts value from the appropriate column and handles timezone conversion - for DuckDB's naive timestamps. + Extracts value, version, key, and collection from the JSON columns + and handles timezone conversion for DuckDB's naive timestamps. """ value_json = data.pop("value_json", None) value_dict = data.pop("value_dict", None) - # Determine which value column to use (prefer value_dict if present) + # Parse the JSON document from the appropriate column + json_document = self._parse_json_column(value_dict, value_json) + + # Extract fields from the JSON document + data["value"] = json_document.get("value") + if "version" in json_document: + data["version"] = json_document["version"] + if "key" in json_document: + data["key"] = json_document["key"] + if "collection" in json_document: + data["collection"] = json_document["collection"] + + # DuckDB always returns naive timestamps, but ManagedEntry expects timezone-aware ones + self._convert_timestamps_to_utc(data) + + return data + + def _parse_json_column(self, value_dict: Any, value_json: Any) -> dict[str, Any]: # noqa: ANN401 + """Parse JSON from value_dict or value_json column.""" if value_dict is not None: # Native storage mode - value_dict can be dict or string (DuckDB JSON returns as string) if isinstance(value_dict, dict): - data["value"] = value_dict - elif isinstance(value_dict, str): - # DuckDB sometimes returns JSON as string, parse it - data["value"] = json.loads(value_dict) - else: - msg = f"value_dict has unexpected type: {type(value_dict)}" - raise DeserializationError(message=msg) - elif value_json is not None: + return cast(dict[str, Any], value_dict) + if isinstance(value_dict, str): + parsed: dict[str, Any] = json.loads(value_dict) + return parsed + msg = f"value_dict has unexpected type: {type(value_dict)}" + raise DeserializationError(message=msg) + + if value_json is not None: # Stringified JSON mode - parse from string if isinstance(value_json, str): - data["value"] = json.loads(value_json) - else: - msg = f"value_json has unexpected type: {type(value_json)}" - raise DeserializationError(message=msg) - else: - msg = "Neither value_dict nor value_json column contains data" + parsed_json: dict[str, Any] 
= json.loads(value_json) + return parsed_json + msg = f"value_json has unexpected type: {type(value_json)}" raise DeserializationError(message=msg) - # DuckDB always returns naive timestamps, but ManagedEntry expects timezone-aware ones - # Convert to timezone-aware UTC timestamps. Handle None values explicitly. + msg = "Neither value_dict nor value_json column contains data" + raise DeserializationError(message=msg) + + def _convert_timestamps_to_utc(self, data: dict[str, Any]) -> None: + """Convert naive timestamps to UTC timezone-aware timestamps.""" created_at = data.get("created_at") if created_at is not None and isinstance(created_at, datetime) and created_at.tzinfo is None: data["created_at"] = created_at.astimezone(tz=timezone.utc) @@ -101,8 +135,6 @@ def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]: if expires_at is not None and isinstance(expires_at, datetime) and expires_at.tzinfo is None: data["expires_at"] = expires_at.astimezone(tz=timezone.utc) - return data - class DuckDBStore(BaseContextManagerStore, BaseStore): """A DuckDB-based key-value store supporting both in-memory and persistent storage. @@ -378,8 +410,8 @@ async def _put_managed_entry( msg = "Cannot operate on closed DuckDBStore" raise RuntimeError(msg) - # Use adapter to dump the managed entry to a dict - document = self._adapter.dump_dict(entry=managed_entry, exclude_none=False) + # Use adapter to dump the managed entry to a dict with key and collection + document = self._adapter.dump_dict(entry=managed_entry, exclude_none=False, key=key, collection=collection) # Insert or replace the entry with metadata in separate columns self._connection.execute( diff --git a/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/store.py b/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/store.py index 3196d3c3..c06571ed 100644 --- a/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/store.py +++ b/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/store.py @@ -4,7 +4,7 @@ import json from datetime import datetime, timezone from pathlib import Path -from typing import Any, overload +from typing import Any, cast, overload from key_value.shared.errors import DeserializationError from key_value.shared.utils.managed_entry import ManagedEntry @@ -44,21 +44,37 @@ def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]: """Prepare data for dumping to DuckDB. Moves the value to the appropriate column (value_dict or value_json) - and sets the other column to None. + and sets the other column to None. Also includes version, key, and collection + fields in the JSON for compatibility with deserialization. 
""" value = data.pop("value") + # Extract version, key, and collection to include in the JSON + version = data.pop("version", None) + key = data.pop("key", None) + collection_name = data.pop("collection", None) + + # Build the document to store in JSON columns + json_document: dict[str, Any] = {"value": value} + + if version is not None: + json_document["version"] = version + if key is not None: + json_document["key"] = key + if collection_name is not None: + json_document["collection"] = collection_name + # Set both columns to None, then populate the appropriate one data["value_json"] = None data["value_dict"] = None if self._native_storage: - # For native storage, we pass the JSON string to DuckDB's JSON column + # For native storage, convert the document to JSON string for DuckDB's JSON column # DuckDB will parse it and store it as native JSON - data["value_dict"] = value + data["value_dict"] = json.dumps(json_document) else: - # For TEXT storage, value should be a JSON string - data["value_json"] = value + # For TEXT storage, store as JSON string + data["value_json"] = json.dumps(json_document) return data @@ -66,36 +82,54 @@ def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]: def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]: """Prepare data loaded from DuckDB for conversion to ManagedEntry. - Extracts value from the appropriate column and handles timezone conversion - for DuckDB's naive timestamps. + Extracts value, version, key, and collection from the JSON columns + and handles timezone conversion for DuckDB's naive timestamps. """ value_json = data.pop("value_json", None) value_dict = data.pop("value_dict", None) - # Determine which value column to use (prefer value_dict if present) + # Parse the JSON document from the appropriate column + json_document = self._parse_json_column(value_dict, value_json) + + # Extract fields from the JSON document + data["value"] = json_document.get("value") + if "version" in json_document: + data["version"] = json_document["version"] + if "key" in json_document: + data["key"] = json_document["key"] + if "collection" in json_document: + data["collection"] = json_document["collection"] + + # DuckDB always returns naive timestamps, but ManagedEntry expects timezone-aware ones + self._convert_timestamps_to_utc(data) + + return data + + def _parse_json_column(self, value_dict: Any, value_json: Any) -> dict[str, Any]: + "Parse JSON from value_dict or value_json column." 
if value_dict is not None: # Native storage mode - value_dict can be dict or string (DuckDB JSON returns as string) if isinstance(value_dict, dict): - data["value"] = value_dict - elif isinstance(value_dict, str): - # DuckDB sometimes returns JSON as string, parse it - data["value"] = json.loads(value_dict) - else: - msg = f"value_dict has unexpected type: {type(value_dict)}" - raise DeserializationError(message=msg) - elif value_json is not None: + return cast("dict[str, Any]", value_dict) + if isinstance(value_dict, str): + parsed: dict[str, Any] = json.loads(value_dict) + return parsed + msg = f"value_dict has unexpected type: {type(value_dict)}" + raise DeserializationError(message=msg) + + if value_json is not None: # Stringified JSON mode - parse from string if isinstance(value_json, str): - data["value"] = json.loads(value_json) - else: - msg = f"value_json has unexpected type: {type(value_json)}" - raise DeserializationError(message=msg) - else: - msg = "Neither value_dict nor value_json column contains data" + parsed_json: dict[str, Any] = json.loads(value_json) + return parsed_json + msg = f"value_json has unexpected type: {type(value_json)}" raise DeserializationError(message=msg) - # DuckDB always returns naive timestamps, but ManagedEntry expects timezone-aware ones - # Convert to timezone-aware UTC timestamps. Handle None values explicitly. + msg = "Neither value_dict nor value_json column contains data" + raise DeserializationError(message=msg) + + def _convert_timestamps_to_utc(self, data: dict[str, Any]) -> None: + """Convert naive timestamps to UTC timezone-aware timestamps.""" created_at = data.get("created_at") if created_at is not None and isinstance(created_at, datetime) and (created_at.tzinfo is None): data["created_at"] = created_at.astimezone(tz=timezone.utc) @@ -104,8 +138,6 @@ def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]: if expires_at is not None and isinstance(expires_at, datetime) and (expires_at.tzinfo is None): data["expires_at"] = expires_at.astimezone(tz=timezone.utc) - return data - class DuckDBStore(BaseContextManagerStore, BaseStore): """A DuckDB-based key-value store supporting both in-memory and persistent storage. 
@@ -340,8 +372,8 @@ def _put_managed_entry(self, *, key: str, collection: str, managed_entry: Manage msg = "Cannot operate on closed DuckDBStore" raise RuntimeError(msg) - # Use adapter to dump the managed entry to a dict - document = self._adapter.dump_dict(entry=managed_entry, exclude_none=False) + # Use adapter to dump the managed entry to a dict with key and collection + document = self._adapter.dump_dict(entry=managed_entry, exclude_none=False, key=key, collection=collection) # Insert or replace the entry with metadata in separate columns self._connection.execute( From 48cb8a022d40b2da7c16faeef9723442006874a3 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Sat, 8 Nov 2025 04:06:08 +0000 Subject: [PATCH 12/12] refactor: simplify DuckDB store to native JSON storage only - Remove native_storage parameter from DuckDBSerializationAdapter and DuckDBStore - Remove value_json TEXT column from schema, use only value_dict JSON column - Update all SQL statements to work with single value_dict column - Remove TestDuckDBStoreTextMode test class - Remove test_text_mode_storage() and test_native_vs_stringified_storage() tests - Update test_native_sql_queryability() to use correct JSON path for nested value - Regenerate sync library with simplified implementation This simplification focuses on native JSON storage for better queryability and removes the complexity of supporting dual storage modes. Co-authored-by: William Easton --- .../src/key_value/aio/stores/duckdb/store.py | 118 ++++++------------ .../tests/stores/duckdb/test_duckdb.py | 92 +------------- .../sync/code_gen/stores/duckdb/store.py | 118 ++++++------------ .../code_gen/stores/duckdb/test_duckdb.py | 92 +------------- 4 files changed, 83 insertions(+), 337 deletions(-) diff --git a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py index 27339583..4ac4fe01 100644 --- a/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py +++ b/key-value/key-value-aio/src/key_value/aio/stores/duckdb/store.py @@ -18,31 +18,22 @@ class DuckDBSerializationAdapter(SerializationAdapter): - """Adapter for DuckDB with support for native JSON and TEXT storage modes.""" + """Adapter for DuckDB with native JSON storage.""" - _native_storage: bool - - def __init__(self, *, native_storage: bool = True) -> None: - """Initialize the DuckDB adapter. - - Args: - native_storage: If True, use JSON column for native dict storage. - If False, use TEXT column for stringified JSON. - """ + def __init__(self) -> None: + """Initialize the DuckDB adapter.""" super().__init__() - self._native_storage = native_storage self._date_format = "datetime" - # Always use string format - DuckDB needs JSON strings for both TEXT and JSON columns + # Use string format - DuckDB needs JSON strings for JSON columns self._value_format = "string" @override def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]: """Prepare data for dumping to DuckDB. - Moves the value to the appropriate column (value_dict or value_json) - and sets the other column to None. Also includes version, key, and collection - fields in the JSON for compatibility with deserialization. + Stores the value in the value_dict JSON column and includes version, key, + and collection fields in the JSON for compatibility with deserialization. 
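+
+        With the value nested under "value", a direct SQL probe would use a
+        path such as value_dict->'value'->'name' (illustrative, assumed path).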
""" value = data.pop("value") @@ -51,7 +42,7 @@ def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]: key = data.pop("key", None) collection_name = data.pop("collection", None) - # Build the document to store in JSON columns + # Build the document to store in JSON column json_document: dict[str, Any] = {"value": value} if version is not None: @@ -61,17 +52,9 @@ def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]: if collection_name is not None: json_document["collection"] = collection_name - # Set both columns to None, then populate the appropriate one - data["value_json"] = None - data["value_dict"] = None - - if self._native_storage: - # For native storage, convert the document to JSON string for DuckDB's JSON column - # DuckDB will parse it and store it as native JSON - data["value_dict"] = json.dumps(json_document) - else: - # For TEXT storage, store as JSON string - data["value_json"] = json.dumps(json_document) + # Store as JSON string for DuckDB's JSON column + # DuckDB will parse it and store it as native JSON + data["value_dict"] = json.dumps(json_document) return data @@ -79,14 +62,13 @@ def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]: def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]: """Prepare data loaded from DuckDB for conversion to ManagedEntry. - Extracts value, version, key, and collection from the JSON columns + Extracts value, version, key, and collection from the JSON column and handles timezone conversion for DuckDB's naive timestamps. """ - value_json = data.pop("value_json", None) value_dict = data.pop("value_dict", None) - # Parse the JSON document from the appropriate column - json_document = self._parse_json_column(value_dict, value_json) + # Parse the JSON document from the value_dict column + json_document = self._parse_json_column(value_dict) # Extract fields from the JSON document data["value"] = json_document.get("value") @@ -102,27 +84,20 @@ def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]: return data - def _parse_json_column(self, value_dict: Any, value_json: Any) -> dict[str, Any]: # noqa: ANN401 - """Parse JSON from value_dict or value_json column.""" - if value_dict is not None: - # Native storage mode - value_dict can be dict or string (DuckDB JSON returns as string) - if isinstance(value_dict, dict): - return cast(dict[str, Any], value_dict) - if isinstance(value_dict, str): - parsed: dict[str, Any] = json.loads(value_dict) - return parsed - msg = f"value_dict has unexpected type: {type(value_dict)}" + def _parse_json_column(self, value_dict: Any) -> dict[str, Any]: + """Parse JSON from value_dict column.""" + if value_dict is None: + msg = "value_dict column contains no data" raise DeserializationError(message=msg) - if value_json is not None: - # Stringified JSON mode - parse from string - if isinstance(value_json, str): - parsed_json: dict[str, Any] = json.loads(value_json) - return parsed_json - msg = f"value_json has unexpected type: {type(value_json)}" - raise DeserializationError(message=msg) + # value_dict can be dict or string (DuckDB JSON returns as string) + if isinstance(value_dict, dict): + return cast("dict[str, Any]", value_dict) + if isinstance(value_dict, str): + parsed: dict[str, Any] = json.loads(value_dict) + return parsed - msg = "Neither value_dict nor value_json column contains data" + msg = f"value_dict has unexpected type: {type(value_dict)}" raise DeserializationError(message=msg) def _convert_timestamps_to_utc(self, data: dict[str, Any]) -> None: @@ -146,9 
+121,8 @@ class DuckDBStore(BaseContextManagerStore, BaseStore): The store uses native DuckDB types (JSON, TIMESTAMP) to enable efficient SQL queries on stored data. Users can query the database directly for analytics or data exploration. - Storage modes: - - native_storage=True: Stores values in a JSON column as native dicts for queryability - - native_storage=False: Stores values as stringified JSON in a TEXT column + Values are stored in a JSON column as native dicts, allowing direct SQL queries + on the stored data for analytics and reporting. Note on connection ownership: When you provide an existing connection, the store will take ownership and close it when the store is closed or garbage collected. @@ -167,7 +141,6 @@ def __init__( *, connection: duckdb.DuckDBPyConnection, table_name: str = "kv_entries", - native_storage: bool = True, default_collection: str | None = None, seed: SEED_DATA_TYPE | None = None, ) -> None: @@ -180,8 +153,6 @@ def __init__( Args: connection: An existing DuckDB connection to use. table_name: Name of the table to store key-value entries. Defaults to "kv_entries". - native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON. - Default is True for better queryability and native type support. default_collection: The default collection to use if no collection is provided. seed: Optional seed data to pre-populate the store. """ @@ -192,7 +163,6 @@ def __init__( *, database_path: Path | str | None = None, table_name: str = "kv_entries", - native_storage: bool = True, default_collection: str | None = None, seed: SEED_DATA_TYPE | None = None, ) -> None: @@ -201,8 +171,6 @@ def __init__( Args: database_path: Path to the database file. If None or ':memory:', uses in-memory database. table_name: Name of the table to store key-value entries. Defaults to "kv_entries". - native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON. - Default is True for better queryability and native type support. default_collection: The default collection to use if no collection is provided. seed: Optional seed data to pre-populate the store. """ @@ -213,7 +181,6 @@ def __init__( connection: duckdb.DuckDBPyConnection | None = None, database_path: Path | str | None = None, table_name: str = "kv_entries", - native_storage: bool = True, default_collection: str | None = None, seed: SEED_DATA_TYPE | None = None, ) -> None: @@ -223,8 +190,6 @@ def __init__( connection: An existing DuckDB connection to use. database_path: Path to the database file. If None or ':memory:', uses in-memory database. table_name: Name of the table to store key-value entries. Defaults to "kv_entries". - native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON. - Default is True for better queryability and native type support. default_collection: The default collection to use if no collection is provided. seed: Optional seed data to pre-populate the store. 
""" @@ -248,7 +213,7 @@ def __init__( self._owns_connection = True self._is_closed = False - self._adapter = DuckDBSerializationAdapter(native_storage=native_storage) + self._adapter = DuckDBSerializationAdapter() self._table_name = table_name self._stable_api = False @@ -264,8 +229,7 @@ def _get_create_table_sql(self) -> str: CREATE TABLE IF NOT EXISTS {self._table_name} ( collection VARCHAR NOT NULL, key VARCHAR NOT NULL, - value_json TEXT, - value_dict JSON, + value_dict JSON NOT NULL, created_at TIMESTAMP, expires_at TIMESTAMP, PRIMARY KEY (collection, key) @@ -301,7 +265,7 @@ def _get_select_sql(self) -> str: SQL SELECT statement with placeholders. """ return f""" - SELECT value_json, value_dict, created_at, expires_at + SELECT value_dict, created_at, expires_at FROM {self._table_name} WHERE collection = ? AND key = ? """ # noqa: S608 @@ -314,8 +278,8 @@ def _get_insert_sql(self) -> str: """ return f""" INSERT OR REPLACE INTO {self._table_name} - (collection, key, value_json, value_dict, created_at, expires_at) - VALUES (?, ?, ?, ?, ?, ?) + (collection, key, value_dict, created_at, expires_at) + VALUES (?, ?, ?, ?, ?) """ # noqa: S608 def _get_delete_sql(self) -> str: @@ -335,17 +299,15 @@ async def _setup(self) -> None: """Initialize the database schema for key-value storage. The schema uses native DuckDB types for efficient querying: - - value_json: TEXT column storing stringified JSON (used when native_storage=False) - - value_dict: JSON column storing native dicts (used when native_storage=True) + - value_dict: JSON column storing native dicts for queryability - created_at: TIMESTAMP for native datetime operations - expires_at: TIMESTAMP for native expiration queries - This design follows the Elasticsearch/MongoDB pattern of separating native and stringified - storage, enabling: - - Direct SQL queries on the database for analytics (when using native storage) + This design enables: + - Direct SQL queries on the database for analytics - Efficient expiration cleanup: DELETE FROM table WHERE expires_at < now() - Metadata queries without JSON deserialization - - Flexibility to choose between native dict storage and stringified JSON + - Native JSON column support for rich querying capabilities """ # Create the main table for storing key-value entries self._connection.execute(self._get_create_table_sql()) @@ -360,7 +322,7 @@ async def _setup(self) -> None: async def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry | None: """Retrieve a managed entry by key from the specified collection. - Reconstructs the ManagedEntry from value columns and metadata columns + Reconstructs the ManagedEntry from value column and metadata columns using the serialization adapter. 
""" if self._is_closed: @@ -375,11 +337,10 @@ async def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry if result is None: return None - value_json, value_dict, created_at, expires_at = result + value_dict, created_at, expires_at = result - # Build document dict for the adapter (exclude None values) + # Build document dict for the adapter document: dict[str, Any] = { - "value_json": value_json, "value_dict": value_dict, } @@ -411,7 +372,7 @@ async def _put_managed_entry( raise RuntimeError(msg) # Use adapter to dump the managed entry to a dict with key and collection - document = self._adapter.dump_dict(entry=managed_entry, exclude_none=False, key=key, collection=collection) + document = self._adapter.dump_dict(entry=managed_entry, key=key, collection=collection) # Insert or replace the entry with metadata in separate columns self._connection.execute( @@ -419,7 +380,6 @@ async def _put_managed_entry( [ collection, key, - document["value_json"], document["value_dict"], document.get("created_at"), document.get("expires_at"), diff --git a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py index d939bc4c..ac78bb23 100644 --- a/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py +++ b/key-value/key-value-aio/tests/stores/duckdb/test_duckdb.py @@ -47,22 +47,6 @@ async def store(self) -> AsyncGenerator[DuckDBStore, None]: async def test_not_unbounded(self, store: BaseStore): ... -@pytest.mark.filterwarnings("ignore:A configured store is unstable and may change in a backwards incompatible way. Use at your own risk.") -class TestDuckDBStoreTextMode(ContextManagerStoreTestMixin, BaseStoreTests): - """Test DuckDB store with TEXT column mode (stringified JSON) instead of native JSON.""" - - @override - @pytest.fixture - async def store(self) -> AsyncGenerator[DuckDBStore, None]: - """Test with in-memory DuckDB database using TEXT column for stringified JSON.""" - duckdb_store = DuckDBStore(native_storage=False) - yield duckdb_store - await duckdb_store.close() - - @pytest.mark.skip(reason="Local disk stores are unbounded") - async def test_not_unbounded(self, store: BaseStore): ... - - @pytest.mark.filterwarnings("ignore:A configured store is unstable and may change in a backwards incompatible way. 
Use at your own risk.") class TestDuckDBStoreSpecific: """Test DuckDB-specific functionality.""" @@ -76,7 +60,7 @@ async def store(self) -> AsyncGenerator[DuckDBStore, None]: async def test_native_sql_queryability(self): """Test that users can query the database directly with SQL.""" - store = DuckDBStore(native_storage=True) + store = DuckDBStore() # Store some test data with known metadata await store.put(collection="products", key="item1", value={"name": "Widget", "price": 10.99}, ttl=3600) @@ -88,7 +72,7 @@ async def test_native_sql_queryability(self): result = ( get_client_from_store(store) .execute(""" - SELECT key, value_dict->'name' as name, value_dict->'price' as price + SELECT key, value_dict->'value'->'name' as name, value_dict->'value'->'price' as price FROM kv_entries WHERE collection = 'products' ORDER BY key @@ -117,36 +101,6 @@ async def test_native_sql_queryability(self): await store.close() - async def test_text_mode_storage(self): - """Test that TEXT mode stores value as stringified JSON instead of native JSON.""" - store = DuckDBStore(native_storage=False) - - await store.put(collection="test", key="key1", value={"data": "value"}) - - # Query to check column type - in TEXT mode, value_json should be populated - result = ( - get_client_from_store(store) - .execute(""" - SELECT value_json, value_dict, typeof(value_json) as json_type, typeof(value_dict) as dict_type - FROM kv_entries - WHERE collection = 'test' AND key = 'key1' - """) - .fetchone() - ) # pyright: ignore[reportPrivateUsage] - - assert result is not None - value_json, value_dict, json_type, _dict_type = result - - # In TEXT mode (native_storage=False), value_json should be populated, value_dict should be NULL - assert value_json is not None - assert value_dict is None - assert json_type in ("VARCHAR", "TEXT") - # Value should be a JSON string - assert isinstance(value_json, str) - assert "data" in value_json - - await store.close() - async def test_database_path_initialization(self): """Test that store can be initialized with different database path options.""" # In-memory (default) @@ -260,47 +214,5 @@ async def test_custom_table_name(self): await store.close() - async def test_native_vs_stringified_storage(self): - """Test that native and stringified storage modes work correctly.""" - # Native storage (default) - store_native = DuckDBStore(native_storage=True) - await store_native.put(collection="test", key="key1", value={"name": "native"}) - - result_native = ( - get_client_from_store(store_native) - .execute(""" - SELECT value_dict, value_json - FROM kv_entries - WHERE key = 'key1' - """) - .fetchone() - ) # pyright: ignore[reportPrivateUsage] - - assert result_native is not None - assert result_native[0] is not None # value_dict should be populated - assert result_native[1] is None # value_json should be NULL - - await store_native.close() - - # Stringified storage - store_string = DuckDBStore(native_storage=False) - await store_string.put(collection="test", key="key2", value={"name": "stringified"}) - - result_string = ( - get_client_from_store(store_string) - .execute(""" - SELECT value_dict, value_json - FROM kv_entries - WHERE key = 'key2' - """) - .fetchone() - ) # pyright: ignore[reportPrivateUsage] - - assert result_string is not None - assert result_string[0] is None # value_dict should be NULL - assert result_string[1] is not None # value_json should be populated - - await store_string.close() - @pytest.mark.skip(reason="Local disk stores are unbounded") async def test_not_unbounded(self, 
diff --git a/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/store.py b/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/store.py
index c06571ed..9de4182a 100644
--- a/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/store.py
+++ b/key-value/key-value-sync/src/key_value/sync/code_gen/stores/duckdb/store.py
@@ -21,31 +21,22 @@


 class DuckDBSerializationAdapter(SerializationAdapter):
-    """Adapter for DuckDB with support for native JSON and TEXT storage modes."""
+    """Adapter for DuckDB with native JSON storage."""

-    _native_storage: bool
-
-    def __init__(self, *, native_storage: bool = True) -> None:
-        """Initialize the DuckDB adapter.
-
-        Args:
-            native_storage: If True, use JSON column for native dict storage.
-                If False, use TEXT column for stringified JSON.
-        """
+    def __init__(self) -> None:
+        """Initialize the DuckDB adapter."""
         super().__init__()
-        self._native_storage = native_storage
         self._date_format = "datetime"
-        # Always use string format - DuckDB needs JSON strings for both TEXT and JSON columns
+        # Use string format - DuckDB needs JSON strings for JSON columns
         self._value_format = "string"

     @override
     def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]:
         """Prepare data for dumping to DuckDB.

-        Moves the value to the appropriate column (value_dict or value_json)
-        and sets the other column to None. Also includes version, key, and collection
-        fields in the JSON for compatibility with deserialization.
+        Stores the value in the value_dict JSON column and includes version, key,
+        and collection fields in the JSON for compatibility with deserialization.
         """
         value = data.pop("value")
@@ -54,7 +45,7 @@ def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]:
         key = data.pop("key", None)
         collection_name = data.pop("collection", None)

-        # Build the document to store in JSON columns
+        # Build the document to store in JSON column
         json_document: dict[str, Any] = {"value": value}

         if version is not None:
@@ -64,17 +55,9 @@ def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]:
         if collection_name is not None:
             json_document["collection"] = collection_name

-        # Set both columns to None, then populate the appropriate one
-        data["value_json"] = None
-        data["value_dict"] = None
-
-        if self._native_storage:
-            # For native storage, convert the document to JSON string for DuckDB's JSON column
-            # DuckDB will parse it and store it as native JSON
-            data["value_dict"] = json.dumps(json_document)
-        else:
-            # For TEXT storage, store as JSON string
-            data["value_json"] = json.dumps(json_document)
+        # Store as JSON string for DuckDB's JSON column
+        # DuckDB will parse it and store it as native JSON
+        data["value_dict"] = json.dumps(json_document)

         return data

@@ -82,14 +65,13 @@ def prepare_dump(self, data: dict[str, Any]) -> dict[str, Any]:
     def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]:
         """Prepare data loaded from DuckDB for conversion to ManagedEntry.

-        Extracts value, version, key, and collection from the JSON columns
+        Extracts value, version, key, and collection from the JSON column
         and handles timezone conversion for DuckDB's naive timestamps.
         """
-        value_json = data.pop("value_json", None)
         value_dict = data.pop("value_dict", None)

-        # Parse the JSON document from the appropriate column
-        json_document = self._parse_json_column(value_dict, value_json)
+        # Parse the JSON document from the value_dict column
+        json_document = self._parse_json_column(value_dict)

         # Extract fields from the JSON document
         data["value"] = json_document.get("value")
@@ -105,27 +87,20 @@ def prepare_load(self, data: dict[str, Any]) -> dict[str, Any]:

         return data

-    def _parse_json_column(self, value_dict: Any, value_json: Any) -> dict[str, Any]:
-        "Parse JSON from value_dict or value_json column."
-        if value_dict is not None:
-            # Native storage mode - value_dict can be dict or string (DuckDB JSON returns as string)
-            if isinstance(value_dict, dict):
-                return cast("dict[str, Any]", value_dict)
-            if isinstance(value_dict, str):
-                parsed: dict[str, Any] = json.loads(value_dict)
-                return parsed
-            msg = f"value_dict has unexpected type: {type(value_dict)}"
+    def _parse_json_column(self, value_dict: Any) -> dict[str, Any]:
+        "Parse JSON from value_dict column."
+        if value_dict is None:
+            msg = "value_dict column contains no data"
             raise DeserializationError(message=msg)

-        if value_json is not None:
-            # Stringified JSON mode - parse from string
-            if isinstance(value_json, str):
-                parsed_json: dict[str, Any] = json.loads(value_json)
-                return parsed_json
-            msg = f"value_json has unexpected type: {type(value_json)}"
-            raise DeserializationError(message=msg)
+        # value_dict can be dict or string (DuckDB JSON returns as string)
+        if isinstance(value_dict, dict):
+            return cast("dict[str, Any]", value_dict)
+        if isinstance(value_dict, str):
+            parsed: dict[str, Any] = json.loads(value_dict)
+            return parsed

-        msg = "Neither value_dict nor value_json column contains data"
+        msg = f"value_dict has unexpected type: {type(value_dict)}"
         raise DeserializationError(message=msg)

     def _convert_timestamps_to_utc(self, data: dict[str, Any]) -> None:
@@ -149,9 +124,8 @@ class DuckDBStore(BaseContextManagerStore, BaseStore):
     The store uses native DuckDB types (JSON, TIMESTAMP) to enable efficient SQL queries
     on stored data. Users can query the database directly for analytics or data exploration.

-    Storage modes:
-    - native_storage=True: Stores values in a JSON column as native dicts for queryability
-    - native_storage=False: Stores values as stringified JSON in a TEXT column
+    Values are stored in a JSON column as native dicts, allowing direct SQL queries
+    on the stored data for analytics and reporting.

     Note on connection ownership: When you provide an existing connection, the store
     will take ownership and close it when the store is closed or garbage collected.
@@ -170,7 +144,6 @@ def __init__(
         *,
         connection: duckdb.DuckDBPyConnection,
         table_name: str = "kv_entries",
-        native_storage: bool = True,
         default_collection: str | None = None,
         seed: SEED_DATA_TYPE | None = None,
     ) -> None:
@@ -183,8 +156,6 @@ def __init__(
         Args:
             connection: An existing DuckDB connection to use.
             table_name: Name of the table to store key-value entries. Defaults to "kv_entries".
-            native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON.
-                Default is True for better queryability and native type support.
             default_collection: The default collection to use if no collection is provided.
             seed: Optional seed data to pre-populate the store.
         """
@@ -195,7 +166,6 @@ def __init__(
         *,
         database_path: Path | str | None = None,
         table_name: str = "kv_entries",
-        native_storage: bool = True,
         default_collection: str | None = None,
         seed: SEED_DATA_TYPE | None = None,
     ) -> None:
@@ -204,8 +174,6 @@ def __init__(
         Args:
             database_path: Path to the database file. If None or ':memory:', uses in-memory database.
             table_name: Name of the table to store key-value entries. Defaults to "kv_entries".
-            native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON.
-                Default is True for better queryability and native type support.
             default_collection: The default collection to use if no collection is provided.
             seed: Optional seed data to pre-populate the store.
         """
@@ -216,7 +184,6 @@ def __init__(
         connection: duckdb.DuckDBPyConnection | None = None,
         database_path: Path | str | None = None,
         table_name: str = "kv_entries",
-        native_storage: bool = True,
         default_collection: str | None = None,
         seed: SEED_DATA_TYPE | None = None,
     ) -> None:
@@ -226,8 +193,6 @@ def __init__(
         Args:
             connection: An existing DuckDB connection to use.
             database_path: Path to the database file. If None or ':memory:', uses in-memory database.
             table_name: Name of the table to store key-value entries. Defaults to "kv_entries".
-            native_storage: If True, use native JSON column for dict storage; if False, use TEXT for stringified JSON.
-                Default is True for better queryability and native type support.
             default_collection: The default collection to use if no collection is provided.
             seed: Optional seed data to pre-populate the store.
         """
@@ -250,7 +215,7 @@ def __init__(
         self._owns_connection = True
         self._is_closed = False

-        self._adapter = DuckDBSerializationAdapter(native_storage=native_storage)
+        self._adapter = DuckDBSerializationAdapter()
         self._table_name = table_name
         self._stable_api = False

@@ -262,7 +227,7 @@ def _get_create_table_sql(self) -> str:
         Returns:
             SQL CREATE TABLE statement.
         """
-        return f"\n            CREATE TABLE IF NOT EXISTS {self._table_name} (\n                collection VARCHAR NOT NULL,\n                key VARCHAR NOT NULL,\n                value_json TEXT,\n                value_dict JSON,\n                created_at TIMESTAMP,\n                expires_at TIMESTAMP,\n                PRIMARY KEY (collection, key)\n            )\n        "
+        return f"\n            CREATE TABLE IF NOT EXISTS {self._table_name} (\n                collection VARCHAR NOT NULL,\n                key VARCHAR NOT NULL,\n                value_dict JSON NOT NULL,\n                created_at TIMESTAMP,\n                expires_at TIMESTAMP,\n                PRIMARY KEY (collection, key)\n            )\n        "

     def _get_create_collection_index_sql(self) -> str:
         """Generate SQL for creating index on collection column.
@@ -286,7 +251,7 @@ def _get_select_sql(self) -> str:
         Returns:
             SQL SELECT statement with placeholders.
         """
-        return f"\n            SELECT value_json, value_dict, created_at, expires_at\n            FROM {self._table_name}\n            WHERE collection = ? AND key = ?\n        "  # noqa: S608
+        return f"\n            SELECT value_dict, created_at, expires_at\n            FROM {self._table_name}\n            WHERE collection = ? AND key = ?\n        "  # noqa: S608

     def _get_insert_sql(self) -> str:
         """Generate SQL for inserting or replacing an entry.
@@ -294,7 +259,7 @@ def _get_insert_sql(self) -> str:
         Returns:
             SQL INSERT OR REPLACE statement with placeholders.
         """
-        return f"\n            INSERT OR REPLACE INTO {self._table_name}\n            (collection, key, value_json, value_dict, created_at, expires_at)\n            VALUES (?, ?, ?, ?, ?, ?)\n        "  # noqa: S608
+        return f"\n            INSERT OR REPLACE INTO {self._table_name}\n            (collection, key, value_dict, created_at, expires_at)\n            VALUES (?, ?, ?, ?, ?)\n        "  # noqa: S608

     def _get_delete_sql(self) -> str:
         """Generate SQL for deleting an entry by collection and key.
@@ -309,17 +274,15 @@ def _setup(self) -> None:
         """Initialize the database schema for key-value storage.

         The schema uses native DuckDB types for efficient querying:
-        - value_json: TEXT column storing stringified JSON (used when native_storage=False)
-        - value_dict: JSON column storing native dicts (used when native_storage=True)
+        - value_dict: JSON column storing native dicts for queryability
        - created_at: TIMESTAMP for native datetime operations
         - expires_at: TIMESTAMP for native expiration queries

-        This design follows the Elasticsearch/MongoDB pattern of separating native and stringified
-        storage, enabling:
-        - Direct SQL queries on the database for analytics (when using native storage)
+        This design enables:
+        - Direct SQL queries on the database for analytics
         - Efficient expiration cleanup: DELETE FROM table WHERE expires_at < now()
         - Metadata queries without JSON deserialization
-        - Flexibility to choose between native dict storage and stringified JSON
+        - Native JSON column support for rich querying capabilities
         """
         # Create the main table for storing key-value entries
         self._connection.execute(self._get_create_table_sql())
@@ -334,7 +297,7 @@ def _setup(self) -> None:
     def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry | None:
         """Retrieve a managed entry by key from the specified collection.

-        Reconstructs the ManagedEntry from value columns and metadata columns
+        Reconstructs the ManagedEntry from value column and metadata columns
         using the serialization adapter.
         """
         if self._is_closed:
@@ -346,10 +309,10 @@ def _get_managed_entry(self, *, key: str, collection: str) -> ManagedEntry | Non
         if result is None:
             return None

-        (value_json, value_dict, created_at, expires_at) = result
+        (value_dict, created_at, expires_at) = result

-        # Build document dict for the adapter (exclude None values)
-        document: dict[str, Any] = {"value_json": value_json, "value_dict": value_dict}
+        # Build document dict for the adapter
+        document: dict[str, Any] = {"value_dict": value_dict}

         if created_at is not None and isinstance(created_at, datetime):
             document["created_at"] = created_at.astimezone(tz=timezone.utc)
@@ -373,12 +336,11 @@ def _put_managed_entry(self, *, key: str, collection: str, managed_entry: Manage
             raise RuntimeError(msg)

         # Use adapter to dump the managed entry to a dict with key and collection
-        document = self._adapter.dump_dict(entry=managed_entry, exclude_none=False, key=key, collection=collection)
+        document = self._adapter.dump_dict(entry=managed_entry, key=key, collection=collection)

         # Insert or replace the entry with metadata in separate columns
         self._connection.execute(
-            self._get_insert_sql(),
-            [collection, key, document["value_json"], document["value_dict"], document.get("created_at"), document.get("expires_at")],
+            self._get_insert_sql(), [collection, key, document["value_dict"], document.get("created_at"), document.get("expires_at")]
         )

     @override
diff --git a/key-value/key-value-sync/tests/code_gen/stores/duckdb/test_duckdb.py b/key-value/key-value-sync/tests/code_gen/stores/duckdb/test_duckdb.py
index 07c40360..75fc3529 100644
--- a/key-value/key-value-sync/tests/code_gen/stores/duckdb/test_duckdb.py
+++ b/key-value/key-value-sync/tests/code_gen/stores/duckdb/test_duckdb.py
@@ -50,22 +50,6 @@ def store(self) -> Generator[DuckDBStore, None, None]:
     def test_not_unbounded(self, store: BaseStore): ...


-@pytest.mark.filterwarnings("ignore:A configured store is unstable and may change in a backwards incompatible way. Use at your own risk.")
-class TestDuckDBStoreTextMode(ContextManagerStoreTestMixin, BaseStoreTests):
-    """Test DuckDB store with TEXT column mode (stringified JSON) instead of native JSON."""
-
-    @override
-    @pytest.fixture
-    def store(self) -> Generator[DuckDBStore, None, None]:
-        """Test with in-memory DuckDB database using TEXT column for stringified JSON."""
-        duckdb_store = DuckDBStore(native_storage=False)
-        yield duckdb_store
-        duckdb_store.close()
-
-    @pytest.mark.skip(reason="Local disk stores are unbounded")
-    def test_not_unbounded(self, store: BaseStore): ...
-
-
 @pytest.mark.filterwarnings("ignore:A configured store is unstable and may change in a backwards incompatible way. Use at your own risk.")
 class TestDuckDBStoreSpecific:
     """Test DuckDB-specific functionality."""
@@ -79,7 +63,7 @@ def store(self) -> Generator[DuckDBStore, None, None]:

     def test_native_sql_queryability(self):
         """Test that users can query the database directly with SQL."""
-        store = DuckDBStore(native_storage=True)
+        store = DuckDBStore()

         # Store some test data with known metadata
         store.put(collection="products", key="item1", value={"name": "Widget", "price": 10.99}, ttl=3600)
@@ -91,7 +75,7 @@ def test_native_sql_queryability(self):
         result = (
             get_client_from_store(store)
             .execute("""
-                SELECT key, value_dict->'name' as name, value_dict->'price' as price
+                SELECT key, value_dict->'value'->'name' as name, value_dict->'value'->'price' as price
                 FROM kv_entries
                 WHERE collection = 'products'
                 ORDER BY key
@@ -120,36 +104,6 @@ def test_native_sql_queryability(self):

         store.close()

-    def test_text_mode_storage(self):
-        """Test that TEXT mode stores value as stringified JSON instead of native JSON."""
-        store = DuckDBStore(native_storage=False)
-
-        store.put(collection="test", key="key1", value={"data": "value"})
-
-        # Query to check column type - in TEXT mode, value_json should be populated
-        result = (
-            get_client_from_store(store)
-            .execute("""
-                SELECT value_json, value_dict, typeof(value_json) as json_type, typeof(value_dict) as dict_type
-                FROM kv_entries
-                WHERE collection = 'test' AND key = 'key1'
-            """)
-            .fetchone()
-        )  # pyright: ignore[reportPrivateUsage]
-
-        assert result is not None
-        (value_json, value_dict, json_type, _dict_type) = result
-
-        # In TEXT mode (native_storage=False), value_json should be populated, value_dict should be NULL
-        assert value_json is not None
-        assert value_dict is None
-        assert json_type in ("VARCHAR", "TEXT")
-        # Value should be a JSON string
-        assert isinstance(value_json, str)
-        assert "data" in value_json
-
-        store.close()
-
     def test_database_path_initialization(self):
         """Test that store can be initialized with different database path options."""
         # In-memory (default)
@@ -263,47 +217,5 @@ def test_custom_table_name(self):

         store.close()

-    def test_native_vs_stringified_storage(self):
-        """Test that native and stringified storage modes work correctly."""
-        # Native storage (default)
-        store_native = DuckDBStore(native_storage=True)
-        store_native.put(collection="test", key="key1", value={"name": "native"})
-
-        result_native = (
-            get_client_from_store(store_native)
-            .execute("""
-                SELECT value_dict, value_json
-                FROM kv_entries
-                WHERE key = 'key1'
-            """)
-            .fetchone()
-        )  # pyright: ignore[reportPrivateUsage]
-
-        assert result_native is not None
-        assert result_native[0] is not None  # value_dict should be populated
-        assert result_native[1] is None  # value_json should be NULL
-
-        store_native.close()
-
-        # Stringified storage
-        store_string = DuckDBStore(native_storage=False)
-        store_string.put(collection="test", key="key2", value={"name": "stringified"})
-
-        result_string = (
-            get_client_from_store(store_string)
-            .execute("""
-                SELECT value_dict, value_json
-                FROM kv_entries
-                WHERE key = 'key2'
-            """)
-            .fetchone()
-        )  # pyright: ignore[reportPrivateUsage]
-
-        assert result_string is not None
-        assert result_string[0] is None  # value_dict should be NULL
-        assert result_string[1] is not None  # value_json should be populated
-
-        store_string.close()
-
     @pytest.mark.skip(reason="Local disk stores are unbounded")
     def test_not_unbounded(self, store: BaseStore): ...
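
For reviewers who want to try the branch locally, here is a minimal, illustrative usage sketch; it is not part of the patch. It assumes the async API introduced above (key_value.aio.stores.duckdb.DuckDBStore, still flagged unstable via _stable_api = False); the raw SQL reuses the same value_dict->'value' drill-down the tests run, and the DELETE is the expiration sweep quoted in the _setup docstring.

    import asyncio

    import duckdb

    from key_value.aio.stores.duckdb import DuckDBStore


    async def main() -> None:
        # Hand the store an existing connection so the same handle can run raw SQL.
        # Per the connection-ownership note in the class docstring, the store takes
        # ownership and closes the connection when the store is closed.
        conn = duckdb.connect(":memory:")
        store = DuckDBStore(connection=conn)

        await store.put(collection="products", key="item1", value={"name": "Widget", "price": 10.99}, ttl=3600)
        await store.put(collection="products", key="item2", value={"name": "Gadget", "price": 24.99})

        # The adapter wraps the user value in a JSON document, so SQL reaches user
        # fields through value_dict->'value' (as in test_native_sql_queryability).
        rows = conn.execute(
            """
            SELECT key, value_dict->'value'->'name' AS name
            FROM kv_entries
            WHERE collection = 'products'
            ORDER BY key
            """
        ).fetchall()
        # '->' yields JSON values, e.g. [('item1', '"Widget"'), ('item2', '"Gadget"')];
        # DuckDB's '->>' operator would return plain text instead.
        print(rows)

        # Expired rows can be swept with the native-timestamp DELETE cited in the
        # _setup docstring; entries written without a ttl have a NULL expires_at
        # and are left alone.
        conn.execute("DELETE FROM kv_entries WHERE expires_at < now()")

        await store.close()  # also closes conn, since ownership was transferred


    asyncio.run(main())

Keeping the whole document (value plus version/key/collection) in a single JSON column is what lets the SELECT and INSERT statements stay symmetric, while created_at/expires_at live in typed TIMESTAMP columns so metadata queries never touch the JSON at all.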