Allow asa calls on non-table GroupNodes (#626)
* Allow asa calls on non-table GroupNodes

Previously, calling _data on GroupNodes was only allowed if all the
node’s children were TableNodes with matching schemas. This change
allows _data to be called with an asa function to return all child-node
paths regardless of their type. Also updated the docs to describe passing an
asa function to a GroupNode whose children are not all DataFrames.
kevinemoore committed May 23, 2018
1 parent 8e15f95 commit 069e403
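
For context, this is roughly what the change enables at the call site. A minimal sketch, assuming a built 'foo/package' (the name the tests below use) whose tree mixes table and plain-file nodes; the import style, node layout, and function name are illustrative, not taken from this commit:

from quilt.data.foo import package as pkg  # assumes 'foo/package' has been built or installed

def collect_paths(node, paths):
    # node is the GroupNode being read; paths lists the object-store files
    # backing every child node, whatever its type.
    return paths

# Previously this raised ValueError("Group contains non-dataframe nodes")
# whenever any child was not a table; with asa= it now returns the paths.
all_paths = pkg(asa=collect_paths)

# Without asa=, merging into a DataFrame still requires all-table children
# with a common schema.
df = pkg.dataframes()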
Showing 3 changed files with 15 additions and 11 deletions.
9 changes: 6 additions & 3 deletions compiler/quilt/nodes.py
@@ -50,7 +50,7 @@ def __init__(self, package, node, data, meta):
     def _data(self, asa=None):
         """
         Returns the contents of the node: a dataframe or a file path, or passes
-        the node and its contents to a callable.
+        the node and its contents to a callable.
         """
         if asa is not None:
             if self._package is None or not self._node.hashes:
@@ -128,14 +128,15 @@ def _data(self, asa=None):
         store = None
         hash_list = []
         stack = [self]
+        alldfs = True
         while stack:
             node = stack.pop()
             if isinstance(node, GroupNode):
                 stack.extend(child for _, child in sorted(node._items(), reverse=True))
             else:
                 if not isinstance(node._node, core.TableNode):
-                    raise ValueError("Group contains non-dataframe nodes")
-                if not node._node.hashes:
+                    alldfs = False
+                if node._node is None or not node._node.hashes:
                     msg = "Can only merge built dataframes. Build this package and try again."
                     raise NotImplementedError(msg)
                 node_store = node._package.get_store()
@@ -148,6 +149,8 @@ def _data(self, asa=None):
         if asa is None:
             if not hash_list:
                 return None
+            if not alldfs:
+                raise ValueError("Group contains non-dataframe nodes")
             return store.load_dataframe(hash_list)
         else:
             if hash_list:
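
To see the new control flow in isolation, here is a small self-contained toy, not Quilt's code: the dict/tuple node representation and the merged-dataframe placeholder are invented, but the logic mirrors the hunks above (collect every leaf's object paths, remember whether every leaf is a table, and only reject mixed groups when no asa callable was supplied):

def group_data(group, asa=None):
    """Collect every leaf's object paths; only reject mixed groups when
    no asa callable was supplied."""
    hash_list = []
    alldfs = True
    stack = [group]
    while stack:
        node = stack.pop()
        if isinstance(node, dict):          # a "group node": children keyed by name
            stack.extend(child for _, child in sorted(node.items(), reverse=True))
        else:                               # a leaf: (kind, [object paths])
            kind, paths = node
            if kind != "table":
                alldfs = False
            hash_list.extend(paths)
    if asa is None:
        if not hash_list:
            return None
        if not alldfs:
            raise ValueError("Group contains non-dataframe nodes")
        return "DataFrame merged from %s" % hash_list   # stand-in for store.load_dataframe
    return asa(group, hash_list)

group = {"csv": ("table", ["objs/aa11"]), "readme": ("file", ["objs/bb22"])}
print(group_data(group, asa=lambda node, paths: paths))  # ['objs/aa11', 'objs/bb22']
# group_data(group)  # without asa= a mixed group still raises ValueError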
15 changes: 8 additions & 7 deletions compiler/quilt/test/test_import.py
@@ -439,10 +439,10 @@ def test_lambda(node, hashes):
             for path in hashes:
                 assert os.path.exists(path)
             return testdata

         mydir = os.path.dirname(__file__)
         build_path = os.path.join(mydir, './build.yml')
-        command.build('foo/package', build_path)
+        command.build('foo/package', build_path)
         pkg = command.load('foo/package')
         assert pkg.dataframes.csv(asa=test_lambda) is testdata

@@ -454,22 +454,23 @@ def test_lambda(node, hashes):
             for path in hashes:
                 assert os.path.exists(path)
             return testdata

         mydir = os.path.dirname(__file__)
         build_path = os.path.join(mydir, './build.yml')
         command.build('foo/package', build_path)

         pkg = command.load('foo/package')
         assert pkg.dataframes(asa=test_lambda) is testdata

+        assert pkg(asa=test_lambda) is testdata

     def test_memory_only_datanode_asa(self):
         testdata = "justatest"
         def test_lambda(node, hashes):
             return testdata

         mydir = os.path.dirname(__file__)
         build_path = os.path.join(mydir, './build.yml')
-        command.build('foo/package', build_path)
+        command.build('foo/package', build_path)
         pkg = command.load('foo/package')
         pkg._set(['dataframes', 'memory'], pd.DataFrame())
         with self.assertRaises(ValueError):
2 changes: 1 addition & 1 deletion docs/api-python.md
@@ -132,7 +132,7 @@ Packages contain three types of nodes:
 * Retrieve the contents of a `DataNode` with `_data()`, or simply `()`: `PACKAGE.NODE.ANOTHER_NODE()`
   * Columnar data (`XLS`, `CSV`, `TSV`, etc.) returns as a `pandas.DataFrame`
   * All other data types return a string to the path of the object in the package store
-* Provide a custom deserializer by passing a function to `data(asa=FUNCTION)` with the signature `function(NODE, LIST_OF_FILE_PATHS)`. A single node can contain data in multiple files (e.g., a DataFrame stored as a set of Parquet files). Calling `data(asa=FUNCTION)` on a GroupNode is currently only allowed for GroupNodes where all children are DataFrames (backed by Parquet files) with a common schema. In that case, FUNCTION is called with the GroupNode object and a list of the paths to all of the Parquet files in all of the child nodes.
+* Provide a custom deserializer by passing a function to `data(asa=FUNCTION)` with the signature `function(NODE, LIST_OF_FILE_PATHS)`. A single node can contain data in multiple files (e.g., a DataFrame stored as a set of Parquet files). Calling `data(asa=FUNCTION)` on a GroupNode calls FUNCTION with the GroupNode object and a list of the paths to all of the objects in all of the child nodes.

 ### Enumerate package contents
 * `quilt.inspect("USER/PACKAGE")` shows package columns, types, and shape
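
As a concrete illustration of the updated docs line, a custom deserializer might look like the following sketch; PACKAGE.GROUP_NODE follows the placeholder convention above, load_raw is a hypothetical helper, and only the `function(NODE, LIST_OF_FILE_PATHS)` signature comes from the docs:

import os

def load_raw(node, paths):
    # asa= signature from the docs: the node object plus the list of file
    # paths backing it; a single node may be split across several files.
    blobs = {}
    for path in paths:
        with open(path, 'rb') as obj:
            blobs[os.path.basename(path)] = obj.read()
    return blobs

# On a DataNode, load_raw sees only that node's file(s); on a GroupNode it
# now sees the files of every child node, regardless of the children's types.
contents = PACKAGE.GROUP_NODE(asa=load_raw)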
