diff --git a/polaris/dataset/zarr/codecs.py b/polaris/dataset/zarr/codecs.py index 2ca118c6..0459b684 100644 --- a/polaris/dataset/zarr/codecs.py +++ b/polaris/dataset/zarr/codecs.py @@ -22,12 +22,14 @@ def encode(self, buf: np.ndarray): """ Encode a chunk of RDKit Mols to byte strings """ - to_encode = np.empty(shape=len(buf), dtype=object) + # NOTE (cwognum): I ran into a Cython issue because we could pass None to the VLenBytes codec. + # Using np.full() ensures all elements are initialized as empty byte strings instead. + to_encode = np.full(fill_value=b"", shape=len(buf), dtype=object) for idx, mol in enumerate(buf): if mol is None or (isinstance(mol, bytes) and len(mol) == 0): continue if not isinstance(mol, Chem.Mol): - raise ValueError(f"Expected an RDKitMol, but got {type(buf)} instead.") + raise ValueError(f"Expected an RDKitMol, but got {type(mol)} instead.") props = Chem.PropertyPickleOptions.AllProps to_encode[idx] = mol.ToBinary(props) diff --git a/tests/test_codecs.py b/tests/test_codecs.py index c38e4dab..a750f4c3 100644 --- a/tests/test_codecs.py +++ b/tests/test_codecs.py @@ -6,6 +6,13 @@ def test_rdkit_mol_codec(): mol = dm.to_mol("C1=CC=CC=C1") - arr = zarr.array([mol, mol], chunks=(2,), dtype=object, object_codec=RDKitMolCodec()) + + arr = zarr.empty(shape=10, chunks=2, dtype=object, object_codec=RDKitMolCodec()) + + arr[0] = mol + arr[1] = mol + arr[2] = mol + assert dm.same_mol(arr[0], mol) assert dm.same_mol(arr[1], mol) + assert dm.same_mol(arr[2], mol)