Skip to content

Commit

Permalink
Merge branch 'master' into adjust-gql-error-handling
Browse files Browse the repository at this point in the history
* master:
  Add AnnData as format (#2974)
  Add unit tests for selector_fn (#3394)
  Quilt3 v5.2.1 (#3399)
  Render status report if no `bucketConfig` (#3396)
  Fix CSV serialization with pandas 2 (#3395)
  • Loading branch information
nl0 committed Apr 6, 2023
2 parents a96d626 + 3d4d22c commit 081914a
Show file tree
Hide file tree
Showing 8 changed files with 187 additions and 20 deletions.
2 changes: 1 addition & 1 deletion api/python/quilt3/VERSION
@@ -1 +1 @@
5.2.0
5.2.1
89 changes: 75 additions & 14 deletions api/python/quilt3/formats.py
Expand Up @@ -69,12 +69,20 @@
import copy
import csv
import gzip
import importlib
import io
import json
import sys
import tempfile
import warnings
from abc import ABC, abstractmethod
from collections import defaultdict
from pathlib import Path

try:
from importlib import metadata as importlib_metadata
except ImportError:
import importlib_metadata

from .util import QuiltException

Expand Down Expand Up @@ -318,6 +326,7 @@ def all_supported_formats(cls):
Python Object Type Serialization Formats
<class 'pandas.core.frame.DataFrame'> [ssv, csv, tsv, parquet]
<class 'anndata.AnnData'> [.h5ad]
<class 'numpy.ndarray'> [npy, npz]
<class 'str'> [md, json, rst, txt]
<class 'dict'> [json]
Expand All @@ -328,19 +337,18 @@ def all_supported_formats(cls):
<class 'float'> [json]
<class 'bytes'> [bin]
"""
try:
import numpy as np
except ImportError:
pass
else:
cls.search(np.ndarray) # Force FormatHandlers to register np.ndarray as a supported object type

try:
import pandas as pd
except ImportError:
pass
else:
cls.search(pd.DataFrame) # Force FormatHandlers to register pd.DataFrame as a supported object type
# Force FormatHandlers to register these classes as supported object types
for mod_name, cls_name in [
('numpy', 'ndarray'),
('pandas', 'DataFrame'),
('anndata', 'AnnData'),
]:
try:
mod = importlib.import_module(mod_name)
except ImportError:
pass
else:
cls.search(getattr(mod, cls_name))

type_map = defaultdict(set)
for handler in cls.registered_handlers:
Expand Down Expand Up @@ -751,7 +759,11 @@ def get_ser_kwargs(self, opts):

name_map = {
'fieldsep': 'sep',
'linesep': 'line_terminator',
'linesep': (
'lineterminator'
if int(importlib_metadata.version('pandas').split('.')[0]) >= 2 else
'line_terminator'
),
'use_index': 'index',
'index_names': 'index_label',
}
Expand Down Expand Up @@ -1024,6 +1036,55 @@ def deserialize(self, bytes_obj, meta=None, ext=None, **format_opts):
ParquetFormatHandler().register() # latest is preferred


class AnnDataFormatHandler(BaseFormatHandler):
    """Format handler for AnnData <--> .h5ad (HDF5-backed annotated data).

    Format Opts:
        The following options may be used anywhere format opts are accepted,
        or directly in metadata under `{'format': {'opts': {...: ...}}}`.

        compression('gzip', 'lzf', None): applies during serialization only.
    """
    name = 'h5ad'
    handled_extensions = ['h5ad']
    opts = ('compression',)
    defaults = dict(
        compression='lzf',
    )

    def handles_type(self, typ: type) -> bool:
        # Don't load the anndata module unless the caller has already
        # imported it — merely registering this handler must not pull in
        # the optional dependency.
        if 'anndata' not in sys.modules:
            return False
        import anndata as ad
        self.handled_types.add(ad.AnnData)
        return super().handles_type(typ)

    def serialize(self, obj, meta=None, ext=None, **format_opts):
        """Serialize an AnnData object to .h5ad bytes.

        AnnData writes only to filesystem paths, so round-trip through a
        temporary file. Returns ``(bytes, updated_meta)``.
        """
        opts = self.get_opts(meta, format_opts)
        opts_with_defaults = copy.deepcopy(self.defaults)
        opts_with_defaults.update(opts)

        with tempfile.TemporaryDirectory() as td:
            path = Path(td) / 'data.h5ad'
            obj.write(path, **opts_with_defaults)
            data = path.read_bytes()

        return data, self._update_meta(meta, additions=opts_with_defaults)

    def deserialize(self, bytes_obj, meta=None, ext=None, **format_opts):
        """Deserialize .h5ad bytes into an AnnData object.

        Raises:
            QuiltException: if the optional ``anndata`` dependency is missing.
        """
        try:
            import anndata as ad
        except ImportError as e:
            # Chain the original ImportError so the root cause stays visible
            # in the traceback instead of an opaque "During handling ..." pair.
            raise QuiltException("Please install quilt3[anndata]") from e

        buf = io.BytesIO(bytes_obj)
        return ad.read_h5ad(buf)


AnnDataFormatHandler().register()


class CompressionRegistry:
"""A collection for organizing `CompressionHandler` objects."""
registered_handlers = []
Expand Down
5 changes: 2 additions & 3 deletions api/python/setup.py
Expand Up @@ -75,10 +75,9 @@ def run(self):
'pandas>=0.19.2',
'pyarrow>=0.14.1', # as of 7/5/19: linux/circleci bugs on 0.14.0
],
'anndata': ['anndata>=0.8.0'],
'tests': [
'numpy>=1.14.0', # required by pandas, but missing from its dependencies.
'pandas>=0.19.2',
'pyarrow>=0.14.1', # as of 7/5/19: linux/circleci bugs on 0.14.0
'quilt3[pyarrow,anndata]',
'pytest==6.*',
'pytest-cov',
'coverage==6.4',
Expand Down
Binary file added api/python/tests/data/test.h5ad
Binary file not shown.
67 changes: 67 additions & 0 deletions api/python/tests/integration/test_packages.py
Expand Up @@ -1817,6 +1817,73 @@ def test_push_dest_fn(self):
BytesIO(push_manifest_mock.call_args[0][2])
)[lk].physical_key == PhysicalKey(dest_bucket, dest_key, version)

@patch('quilt3.workflows.validate', mock.MagicMock(return_value=None))
@patch('quilt3.Package._calculate_top_hash', mock.MagicMock(return_value=mock.sentinel.top_hash))
def test_push_selector_fn_false(self):
    # When selector_fn returns False the entry must NOT be copied to the
    # destination bucket: the pushed manifest keeps pointing at the source.
    pkg_name = 'foo/bar'
    logical_key = 'foo'
    src_bucket = 'src-bucket'
    src_key = 'foo.txt'
    src_version = '1'
    dst_bucket = 'dst-bucket'

    pkg = Package()
    with patch('quilt3.packages.get_size_and_version', return_value=(0, src_version)):
        pkg.set(logical_key, f's3://{src_bucket}/{src_key}')

    selector_fn = mock.MagicMock(return_value=False)
    push_manifest_mock = self.patch_s3_registry('push_manifest')
    self.patch_s3_registry('shorten_top_hash', return_value='7a67ff4')
    with patch('quilt3.packages.calculate_sha256', return_value=["a" * 64]):
        pkg.push(pkg_name, registry=f's3://{dst_bucket}', selector_fn=selector_fn, force=True)

    selector_fn.assert_called_once_with(logical_key, pkg[logical_key])
    push_manifest_mock.assert_called_once_with(pkg_name, mock.sentinel.top_hash, ANY)
    pushed_entry = Package.load(BytesIO(push_manifest_mock.call_args[0][2]))[logical_key]
    assert pushed_entry.physical_key == PhysicalKey(src_bucket, src_key, src_version)

@patch('quilt3.workflows.validate', mock.MagicMock(return_value=None))
@patch('quilt3.Package._calculate_top_hash', mock.MagicMock(return_value=mock.sentinel.top_hash))
def test_push_selector_fn_true(self):
    # When selector_fn returns True the entry IS copied: expect a
    # copy_object call and a manifest pointing at the destination object.
    pkg_name = 'foo/bar'
    logical_key = 'foo'
    src_bucket = 'src-bucket'
    src_key = 'foo.txt'
    src_version = '1'
    dst_bucket = 'dst-bucket'
    dst_key = f'{pkg_name}/{logical_key}'
    dst_version = '2'

    pkg = Package()
    with patch('quilt3.packages.get_size_and_version', return_value=(0, src_version)):
        pkg.set(logical_key, f's3://{src_bucket}/{src_key}')

    selector_fn = mock.MagicMock(return_value=True)
    expected_copy_params = {
        'Bucket': dst_bucket,
        'Key': dst_key,
        'CopySource': {
            'Bucket': src_bucket,
            'Key': src_key,
            'VersionId': src_version,
        },
    }
    self.s3_stubber.add_response(
        method='copy_object',
        service_response={
            'VersionId': dst_version,
        },
        expected_params=expected_copy_params,
    )
    push_manifest_mock = self.patch_s3_registry('push_manifest')
    self.patch_s3_registry('shorten_top_hash', return_value='7a67ff4')
    with patch('quilt3.packages.calculate_sha256', return_value=["a" * 64]):
        pkg.push(pkg_name, registry=f's3://{dst_bucket}', selector_fn=selector_fn, force=True)

    selector_fn.assert_called_once_with(logical_key, pkg[logical_key])
    push_manifest_mock.assert_called_once_with(pkg_name, mock.sentinel.top_hash, ANY)
    pushed_entry = Package.load(BytesIO(push_manifest_mock.call_args[0][2]))[logical_key]
    assert pushed_entry.physical_key == PhysicalKey(dst_bucket, dst_key, dst_version)

def test_package_dump_file_mode(self):
"""
Package.dump() works with both files opened in binary and text mode.
Expand Down
32 changes: 32 additions & 0 deletions api/python/tests/test_formats.py
Expand Up @@ -3,11 +3,13 @@
import numpy as np
import pandas as pd
import pytest
from anndata import AnnData

from quilt3.formats import FormatRegistry
from quilt3.util import QuiltException

# Constants
data_dir = pathlib.Path(__file__).parent / 'data'


# Code
Expand Down Expand Up @@ -146,6 +148,36 @@ def test_formats_csv_roundtrip():
assert df1.equals(df2)


def test_formats_anndata_roundtrip():
    """Deserialize the .h5ad fixture, re-serialize it, and verify the data
    survives the round trip (matrix, obs, and var annotations)."""
    meta = {'format': {'name': 'h5ad'}}
    ad_file = data_dir / 'test.h5ad'
    ad: AnnData = FormatRegistry.deserialize(ad_file.read_bytes(), meta)
    assert isinstance(ad, AnnData)

    # NOTE: renamed from `bin`, which shadowed the builtin.
    serialized, format_meta = FormatRegistry.serialize(ad, meta)
    meta2 = {**meta, **format_meta}
    ad2: AnnData = FormatRegistry.deserialize(serialized, meta2)
    # The original comparisons discarded their boolean results, so the test
    # could never fail on a lossy round trip — assert them explicitly.
    assert np.allclose(ad.X, ad2.X)
    assert ad.obs.equals(ad2.obs)
    assert ad.var.equals(ad2.var)


def test_all_supported_formats():
    """The registry must report exactly this type -> format-names mapping."""
    expected = {
        # rich scientific types
        AnnData: {'h5ad'},
        np.ndarray: {'npy', 'npz'},
        pd.DataFrame: {'csv', 'parquet', 'ssv', 'tsv'},
        # builtin scalar / container types
        bytes: {'bin'},
        dict: {'json'},
        float: {'json'},
        int: {'json'},
        list: {'json'},
        str: {'json', 'md', 'rst', 'txt'},
        tuple: {'json'},
        type(None): {'json'},
    }
    assert FormatRegistry.all_supported_formats() == expected


def test_formats_search_fail_notfound():
# a search that finds nothing should raise with an explanation.
class Foo:
Expand Down
4 changes: 2 additions & 2 deletions catalog/app/components/Preview/loaders/Html.js
Expand Up @@ -23,8 +23,8 @@ function IFrameLoader({ handle, children }) {
return GQL.fold(bucketData, {
fetching: () => children(AsyncResult.Pending()),
error: (e) => children(AsyncResult.Err(e)),
data: ({ bucketConfig: { browsable } }) =>
browsable && inPackage ? (
data: ({ bucketConfig }) =>
bucketConfig?.browsable && inPackage ? (
<IFrame.LoaderBrowsable {...{ handle, children }} />
) : (
<IFrame.LoaderSigned {...{ handle, children }} />
Expand Down
8 changes: 8 additions & 0 deletions docs/CHANGELOG.md
Expand Up @@ -14,6 +14,14 @@ Entries inside each section should be ordered by type:
## Catalog, Lambdas
!-->
# unreleased - YYYY-MM-DD
## Python API
* [Added] Support [AnnData](https://anndata.readthedocs.io/en/latest/) format ([#2974](https://github.com/quiltdata/quilt/pull/2974))

# 5.2.1 - 2023-04-05
## Python API
* [Fixed] CSV serialization with pandas 2 ([#3395](https://github.com/quiltdata/quilt/pull/3395))

# 5.2.0 - 2023-03-27
## Python API
* [Added] Validation of package entries metadata ([#3286](https://github.com/quiltdata/quilt/pull/3286))
Expand Down

0 comments on commit 081914a

Please sign in to comment.