From 05fdbbe7fc8fb3d571856a6a3ee2abdb397e7cf1 Mon Sep 17 00:00:00 2001 From: Larissa Date: Tue, 26 Mar 2024 11:25:32 -0400 Subject: [PATCH 1/8] consolidating metadata for read mode --- polaris/hub/client.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 2aff2cb7..4665dbe1 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -359,6 +359,9 @@ def open_zarr_file( try: store = zarr.storage.FSStore(path, fs=polaris_fs) + if mode in ["r", "r+"]: + zarr.consolidate_metadata(store) + return zarr.open_consolidated(store, mode=mode) return zarr.open(store, mode=mode) except Exception as e: raise PolarisHubError("Error opening Zarr store") from e From 676a09a6f993f7c507af37fd092c827fdd1dc577 Mon Sep 17 00:00:00 2001 From: Larissa Date: Thu, 28 Mar 2024 10:18:37 -0400 Subject: [PATCH 2/8] consolidate metadata on hub upload --- polaris/hub/client.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index c181653c..004bb1fd 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -358,7 +358,7 @@ def get_dataset(self, owner: Union[str, HubOwner], name: str, verify_checksum: b return Dataset(**response) def open_zarr_file( - self, owner: Union[str, HubOwner], name: str, path: str, mode: IOMode + self, owner: Union[str, HubOwner], name: str, path: str, mode: IOMode, as_consolidated: bool = True ) -> zarr.hierarchy.Group: """Open a Zarr file from a Polaris dataset @@ -367,6 +367,7 @@ def open_zarr_file( name: Name of the dataset. path: Path to the Zarr file within the dataset. mode: The mode in which the file is opened. + as_consolidated: Whether to open the store with consolidated metadata for optimized reading. This is only applicable in 'r' and 'r+' modes. Returns: The Zarr object representing the dataset. @@ -379,11 +380,10 @@ def open_zarr_file( try: store = zarr.storage.FSStore(path, fs=polaris_fs) - if mode in ["r", "r+"]: - zarr.consolidate_metadata(store) + if mode in ["r", "r+"] and as_consolidated: return zarr.open_consolidated(store, mode=mode) return zarr.open(store, mode=mode) - + except Exception as e: raise PolarisHubError("Error opening Zarr store") from e @@ -590,15 +590,20 @@ def upload_dataset( if dataset.zarr_root is not None: with tmp_attribute_change(self.settings, "default_timeout", timeout): # Copy the Zarr archive to the hub - # This does not copy the consolidated data dest = self.open_zarr_file( owner=dataset.owner, name=dataset.name, path=dataset_json["zarrRootPath"], mode="w", ) + # Locally consolidate the archive metadata + zarr.consolidate_metadata(dataset.zarr_root.store) + zmetadata_content = dataset.zarr_root.store['.zmetadata'] + dest.store['.zmetadata'] = zmetadata_content + logger.info("Copying Zarr archive to the Hub. This may take a while.") - zarr.copy_all(source=dataset.zarr_root, dest=dest, log=logger.info) + zarr.copy_all(source=dataset.zarr_root, dest=dest, log=logger.info) + logger.success( "Your dataset has been successfully uploaded to the Hub. " From 8c4ce751b24dbaf3acffe6a0549fb8f060ebd2a6 Mon Sep 17 00:00:00 2001 From: Larissa Date: Thu, 28 Mar 2024 10:20:18 -0400 Subject: [PATCH 3/8] formatting --- polaris/hub/client.py | 11 +++++------ polaris/utils/types.py | 6 +++--- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 004bb1fd..f95fdb91 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -383,7 +383,7 @@ def open_zarr_file( if mode in ["r", "r+"] and as_consolidated: return zarr.open_consolidated(store, mode=mode) return zarr.open(store, mode=mode) - + except Exception as e: raise PolarisHubError("Error opening Zarr store") from e @@ -598,12 +598,11 @@ def upload_dataset( ) # Locally consolidate the archive metadata zarr.consolidate_metadata(dataset.zarr_root.store) - zmetadata_content = dataset.zarr_root.store['.zmetadata'] - dest.store['.zmetadata'] = zmetadata_content - + zmetadata_content = dataset.zarr_root.store[".zmetadata"] + dest.store[".zmetadata"] = zmetadata_content + logger.info("Copying Zarr archive to the Hub. This may take a while.") - zarr.copy_all(source=dataset.zarr_root, dest=dest, log=logger.info) - + zarr.copy_all(source=dataset.zarr_root, dest=dest, log=logger.info) logger.success( "Your dataset has been successfully uploaded to the Hub. " diff --git a/polaris/utils/types.py b/polaris/utils/types.py index 94e50982..0cb3ba92 100644 --- a/polaris/utils/types.py +++ b/polaris/utils/types.py @@ -124,9 +124,9 @@ class License(BaseModel): Else it is required to manually specify this. """ - SPDX_LICENSE_DATA_PATH: ClassVar[str] = ( - "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json" - ) + SPDX_LICENSE_DATA_PATH: ClassVar[ + str + ] = "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json" id: str reference: Optional[HttpUrlString] = None From 42a742594c1d66d0d51e7c5fc2bb8b8f5bbb223f Mon Sep 17 00:00:00 2001 From: Larissa Date: Thu, 28 Mar 2024 13:23:18 -0400 Subject: [PATCH 4/8] CR changes --- polaris/hub/client.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/polaris/hub/client.py b/polaris/hub/client.py index f95fdb91..b503b87c 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -372,6 +372,9 @@ def open_zarr_file( Returns: The Zarr object representing the dataset. """ + if as_consolidated and mode not in ["r", "r+"]: + raise ValueError("Consolidated archives can only be used with 'r' or 'r+' mode.") + polaris_fs = PolarisFileSystem( polaris_client=self, dataset_owner=owner, @@ -596,7 +599,10 @@ def upload_dataset( path=dataset_json["zarrRootPath"], mode="w", ) - # Locally consolidate the archive metadata + + # Locally consolidate Zarr archive metadata. Future updates on handling consolidated + # metadata based on Zarr developers' recommendations can be tracked at: + # https://github.com/zarr-developers/zarr-python/issues/1731 zarr.consolidate_metadata(dataset.zarr_root.store) zmetadata_content = dataset.zarr_root.store[".zmetadata"] dest.store[".zmetadata"] = zmetadata_content From 4ddc05c2b0facdbcbf36706ad7724b9c63e3bcfd Mon Sep 17 00:00:00 2001 From: Larissa Date: Thu, 28 Mar 2024 13:42:27 -0400 Subject: [PATCH 5/8] open consolidated version in other parts of code --- polaris/dataset/_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index 5200b2de..e709d180 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -222,7 +222,7 @@ def zarr_root(self): if saved_on_hub: self._zarr_root = self.client.open_zarr_file(self.owner, self.name, self.zarr_root_path, "r+") else: - self._zarr_root = zarr.open(self.zarr_root_path, "r+") + self._zarr_root = zarr.open_consolidated(self.zarr_root_path, "r+") return self._zarr_root @computed_field From 1b30557ab58d210258025850e8f5e428bcd7e1c4 Mon Sep 17 00:00:00 2001 From: Larissa Date: Thu, 28 Mar 2024 14:43:11 -0400 Subject: [PATCH 6/8] fix zarr write consolidate error and try-catch for reading consolidated --- polaris/dataset/_dataset.py | 7 ++++++- polaris/hub/client.py | 1 + 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index e709d180..f32f44e2 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -222,7 +222,12 @@ def zarr_root(self): if saved_on_hub: self._zarr_root = self.client.open_zarr_file(self.owner, self.name, self.zarr_root_path, "r+") else: - self._zarr_root = zarr.open_consolidated(self.zarr_root_path, "r+") + try: + # Attempt to access .zmetadata to open consolidated file + _ = self.zarr_root.store[".zmetadata"] + self._zarr_root = zarr.open_consolidated(self.zarr_root.store, mode="r") + except KeyError: + self._zarr_root = zarr.open(self.zarr_root.store, mode="r+") return self._zarr_root @computed_field diff --git a/polaris/hub/client.py b/polaris/hub/client.py index b503b87c..96d65677 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -598,6 +598,7 @@ def upload_dataset( name=dataset.name, path=dataset_json["zarrRootPath"], mode="w", + as_consolidated=False, ) # Locally consolidate Zarr archive metadata. Future updates on handling consolidated From 4cbc06fceb90c78d728c84df60c11e79a6c0d659 Mon Sep 17 00:00:00 2001 From: Larissa Date: Thu, 28 Mar 2024 15:55:23 -0400 Subject: [PATCH 7/8] throw error for reading unconsolidated files + updated tests --- polaris/dataset/_dataset.py | 16 +++++++--------- tests/conftest.py | 5 +++++ tests/test_dataset.py | 1 + 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index f32f44e2..40ba9f60 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -219,15 +219,13 @@ def zarr_root(self): # We open the archive in read-only mode if it is saved on the Hub if self._zarr_root is None: - if saved_on_hub: - self._zarr_root = self.client.open_zarr_file(self.owner, self.name, self.zarr_root_path, "r+") - else: - try: - # Attempt to access .zmetadata to open consolidated file - _ = self.zarr_root.store[".zmetadata"] - self._zarr_root = zarr.open_consolidated(self.zarr_root.store, mode="r") - except KeyError: - self._zarr_root = zarr.open(self.zarr_root.store, mode="r+") + try: + if saved_on_hub: + self._zarr_root = self.client.open_zarr_file(self.owner, self.name, self.zarr_root_path, "r+") + else: + self._zarr_root = zarr.open_consolidated(self.zarr_root_path, mode="r") + except KeyError as error: + raise InvalidDatasetError("A Zarr archive associated with a Polaris dataset has to be consolidated.") from error return self._zarr_root @computed_field diff --git a/tests/conftest.py b/tests/conftest.py index 8874e473..e5dad5d9 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -74,6 +74,11 @@ def zarr_archive(tmp_path): root = zarr.open_group(tmp_path, mode="w") root.array("A", data=np.random.random((100, 2048))) root.array("B", data=np.random.random((100, 2048))) + + zarr.consolidate_metadata(root.store) + zmetadata_content = root.store[".zmetadata"] + root.store[".zmetadata"] = zmetadata_content + return tmp_path diff --git a/tests/test_dataset.py b/tests/test_dataset.py index 26da2eb8..a1790095 100644 --- a/tests/test_dataset.py +++ b/tests/test_dataset.py @@ -47,6 +47,7 @@ def test_load_data(tmp_path, with_slice, with_caching): root = zarr.open(zarr_path, "w") root.array("A", data=arr) + zarr.consolidate_metadata(root.store) path = "A#0:5" if with_slice else "A#0" table = pd.DataFrame({"A": [path]}, index=[0]) From 607fa356a10b404e9311f7a8ffdf55e672b846b2 Mon Sep 17 00:00:00 2001 From: cwognum Date: Thu, 28 Mar 2024 17:36:06 -0400 Subject: [PATCH 8/8] Formatting and fixing test cases --- polaris/dataset/_dataset.py | 17 ++++++++++++++--- polaris/dataset/_factory.py | 1 + polaris/hub/client.py | 2 +- polaris/utils/types.py | 6 +++--- tests/conftest.py | 8 ++------ 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/polaris/dataset/_dataset.py b/polaris/dataset/_dataset.py index 40ba9f60..07317d55 100644 --- a/polaris/dataset/_dataset.py +++ b/polaris/dataset/_dataset.py @@ -221,11 +221,15 @@ def zarr_root(self): if self._zarr_root is None: try: if saved_on_hub: - self._zarr_root = self.client.open_zarr_file(self.owner, self.name, self.zarr_root_path, "r+") + self._zarr_root = self.client.open_zarr_file( + self.owner, self.name, self.zarr_root_path, "r+" + ) else: - self._zarr_root = zarr.open_consolidated(self.zarr_root_path, mode="r") + self._zarr_root = zarr.open_consolidated(self.zarr_root_path, mode="r+") except KeyError as error: - raise InvalidDatasetError("A Zarr archive associated with a Polaris dataset has to be consolidated.") from error + raise InvalidDatasetError( + "A Zarr archive associated with a Polaris dataset has to be consolidated." + ) from error return self._zarr_root @computed_field @@ -343,6 +347,13 @@ def to_json(self, destination: str) -> str: if self.zarr_root is not None: dest = zarr.open(zarr_archive, "w") zarr.copy_all(source=self.zarr_root, dest=dest) + + # Copy the .zmetadata file + # To track discussions on whether this should be done by copy_all() + # see https://github.com/zarr-developers/zarr-python/issues/1731 + zmetadata_content = self.zarr_root.store.store[".zmetadata"] + dest.store[".zmetadata"] = zmetadata_content + serialized["zarr_root_path"] = zarr_archive self.table.to_parquet(table_path) diff --git a/polaris/dataset/_factory.py b/polaris/dataset/_factory.py index b6dd48e3..c0329e9d 100644 --- a/polaris/dataset/_factory.py +++ b/polaris/dataset/_factory.py @@ -215,6 +215,7 @@ def add_from_file(self, path: str): def build(self) -> Dataset: """Returns a Dataset based on the current state of the factory.""" + zarr.consolidate_metadata(self.zarr_root.store) return Dataset( table=self._table, annotations=self._annotations, diff --git a/polaris/hub/client.py b/polaris/hub/client.py index 96d65677..c7a2feb2 100644 --- a/polaris/hub/client.py +++ b/polaris/hub/client.py @@ -605,7 +605,7 @@ def upload_dataset( # metadata based on Zarr developers' recommendations can be tracked at: # https://github.com/zarr-developers/zarr-python/issues/1731 zarr.consolidate_metadata(dataset.zarr_root.store) - zmetadata_content = dataset.zarr_root.store[".zmetadata"] + zmetadata_content = dataset.zarr_root.store.store[".zmetadata"] dest.store[".zmetadata"] = zmetadata_content logger.info("Copying Zarr archive to the Hub. This may take a while.") diff --git a/polaris/utils/types.py b/polaris/utils/types.py index 0cb3ba92..94e50982 100644 --- a/polaris/utils/types.py +++ b/polaris/utils/types.py @@ -124,9 +124,9 @@ class License(BaseModel): Else it is required to manually specify this. """ - SPDX_LICENSE_DATA_PATH: ClassVar[ - str - ] = "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json" + SPDX_LICENSE_DATA_PATH: ClassVar[str] = ( + "https://raw.githubusercontent.com/spdx/license-list-data/main/json/licenses.json" + ) id: str reference: Optional[HttpUrlString] = None diff --git a/tests/conftest.py b/tests/conftest.py index e5dad5d9..8b9fd7cf 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -70,15 +70,11 @@ def test_dataset(test_data, test_org_owner): @pytest.fixture(scope="function") def zarr_archive(tmp_path): - tmp_path = fs.join(str(tmp_path), "data.zarr") - root = zarr.open_group(tmp_path, mode="w") + tmp_path = fs.join(tmp_path, "data.zarr") + root = zarr.open(tmp_path, mode="w") root.array("A", data=np.random.random((100, 2048))) root.array("B", data=np.random.random((100, 2048))) - zarr.consolidate_metadata(root.store) - zmetadata_content = root.store[".zmetadata"] - root.store[".zmetadata"] = zmetadata_content - return tmp_path