Allow asa calls on non-table GroupNodes (#626)
* Allow asa calls on non-table GroupNodes

Previously, calling _data on GroupNodes was only allowed if all the
node’s children were TableNodes with matching schemas. This change
allows _data to be called with an asa function to return all child-node
paths regardless of their type. Also updated the docs to describe passing an
asa function to a GroupNode whose children are not all DataFrames.
kevinemoore committed May 23, 2018
1 parent 8e15f95 commit 069e403
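
For context, this is roughly what the change enables at the call site. A minimal sketch, assuming a built 'foo/package' (the name the tests below use) whose tree mixes table and plain-file nodes; the import style, node layout, and function name are illustrative, not taken from this commit:

from quilt.data.foo import package as pkg  # assumes 'foo/package' has been built or installed

def collect_paths(node, paths):
    # node is the GroupNode being read; paths lists the object-store files
    # backing every child node, whatever its type.
    return paths

# Previously this raised ValueError("Group contains non-dataframe nodes")
# whenever any child was not a table; with asa= it now returns the paths.
all_paths = pkg(asa=collect_paths)

# Without asa=, merging into a DataFrame still requires all-table children
# with a common schema.
df = pkg.dataframes()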
Showing 3 changed files with 15 additions and 11 deletions.
9 changes: 6 additions & 3 deletions compiler/quilt/nodes.py
@@ -50,7 +50,7 @@ def __init__(self, package, node, data, meta):
     def _data(self, asa=None):
         """
         Returns the contents of the node: a dataframe or a file path, or passes
-        the node and its contents to a callable.
+        the node and its contents to a callable.
         """
         if asa is not None:
             if self._package is None or not self._node.hashes:
@@ -128,14 +128,15 @@ def _data(self, asa=None):
         store = None
         hash_list = []
         stack = [self]
+        alldfs = True
         while stack:
             node = stack.pop()
             if isinstance(node, GroupNode):
                 stack.extend(child for _, child in sorted(node._items(), reverse=True))
             else:
                 if not isinstance(node._node, core.TableNode):
-                    raise ValueError("Group contains non-dataframe nodes")
-                if not node._node.hashes:
+                    alldfs = False
+                if node._node is None or not node._node.hashes:
                     msg = "Can only merge built dataframes. Build this package and try again."
                     raise NotImplementedError(msg)
                 node_store = node._package.get_store()
@@ -148,6 +149,8 @@ def _data(self, asa=None):
         if asa is None:
             if not hash_list:
                 return None
+            if not alldfs:
+                raise ValueError("Group contains non-dataframe nodes")
             return store.load_dataframe(hash_list)
         else:
             if hash_list:
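
To see the new control flow in isolation, here is a small self-contained toy, not Quilt's code: the dict/tuple node representation and the merged-dataframe placeholder are invented, but the logic mirrors the hunks above (collect every leaf's object paths, remember whether every leaf is a table, and only reject mixed groups when no asa callable was supplied):

def group_data(group, asa=None):
    """Collect every leaf's object paths; only reject mixed groups when
    no asa callable was supplied."""
    hash_list = []
    alldfs = True
    stack = [group]
    while stack:
        node = stack.pop()
        if isinstance(node, dict):          # a "group node": children keyed by name
            stack.extend(child for _, child in sorted(node.items(), reverse=True))
        else:                               # a leaf: (kind, [object paths])
            kind, paths = node
            if kind != "table":
                alldfs = False
            hash_list.extend(paths)
    if asa is None:
        if not hash_list:
            return None
        if not alldfs:
            raise ValueError("Group contains non-dataframe nodes")
        return "DataFrame merged from %s" % hash_list   # stand-in for store.load_dataframe
    return asa(group, hash_list)

group = {"csv": ("table", ["objs/aa11"]), "readme": ("file", ["objs/bb22"])}
print(group_data(group, asa=lambda node, paths: paths))  # ['objs/aa11', 'objs/bb22']
# group_data(group)  # without asa= a mixed group still raises ValueError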
15 changes: 8 additions & 7 deletions compiler/quilt/test/test_import.py
@@ -439,10 +439,10 @@ def test_lambda(node, hashes):
             for path in hashes:
                 assert os.path.exists(path)
             return testdata

         mydir = os.path.dirname(__file__)
         build_path = os.path.join(mydir, './build.yml')
-        command.build('foo/package', build_path)
+        command.build('foo/package', build_path)
         pkg = command.load('foo/package')
         assert pkg.dataframes.csv(asa=test_lambda) is testdata

@@ -454,22 +454,23 @@ def test_lambda(node, hashes):
             for path in hashes:
                 assert os.path.exists(path)
             return testdata

         mydir = os.path.dirname(__file__)
         build_path = os.path.join(mydir, './build.yml')
         command.build('foo/package', build_path)

         pkg = command.load('foo/package')
         assert pkg.dataframes(asa=test_lambda) is testdata

+        assert pkg(asa=test_lambda) is testdata

     def test_memory_only_datanode_asa(self):
         testdata = "justatest"
         def test_lambda(node, hashes):
             return testdata

         mydir = os.path.dirname(__file__)
         build_path = os.path.join(mydir, './build.yml')
-        command.build('foo/package', build_path)
+        command.build('foo/package', build_path)
         pkg = command.load('foo/package')
         pkg._set(['dataframes', 'memory'], pd.DataFrame())
         with self.assertRaises(ValueError):
2 changes: 1 addition & 1 deletion docs/api-python.md
@@ -132,7 +132,7 @@ Packages contain three types of nodes:
 * Retrieve the contents of a `DataNode` with `_data()`, or simply `()`: `PACKAGE.NODE.ANOTHER_NODE()`
   * Columnar data (`XLS`, `CSV`, `TSV`, etc.) returns as a `pandas.DataFrame`
   * All other data types return a string to the path of the object in the package store
-* Provide a custom deserializer by passing a function to `data(asa=FUNCTION)` with the signature `function(NODE, LIST_OF_FILE_PATHS)`. A single node can contain data in multiple files (e.g., a DataFrame stored as a set of Parquet files). Calling `data(asa=FUNCTION)` on a GroupNode is currently only allowed for GroupNodes where all children are DataFrames (backed by Parquet files) with a common schema. In that case, FUNCTION is called with the GroupNode object and a list of the paths to all of the Parquet files in all of the child nodes.
+* Provide a custom deserializer by passing a function to `data(asa=FUNCTION)` with the signature `function(NODE, LIST_OF_FILE_PATHS)`. A single node can contain data in multiple files (e.g., a DataFrame stored as a set of Parquet files). Calling `data(asa=FUNCTION)` on a GroupNode calls FUNCTION with the GroupNode object and a list of the paths to all of the objects in all of the child nodes.

 ### Enumerate package contents
 * `quilt.inspect("USER/PACKAGE")` shows package columns, types, and shape
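
As a concrete illustration of the updated docs line, a custom deserializer might look like the following sketch; PACKAGE.GROUP_NODE follows the placeholder convention above, load_raw is a hypothetical helper, and only the `function(NODE, LIST_OF_FILE_PATHS)` signature comes from the docs:

import os

def load_raw(node, paths):
    # asa= signature from the docs: the node object plus the list of file
    # paths backing it; a single node may be split across several files.
    blobs = {}
    for path in paths:
        with open(path, 'rb') as obj:
            blobs[os.path.basename(path)] = obj.read()
    return blobs

# On a DataNode, load_raw sees only that node's file(s); on a GroupNode it
# now sees the files of every child node, regardless of the children's types.
contents = PACKAGE.GROUP_NODE(asa=load_raw)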
