qiime2 · thermokarst · Aug 8, 2019 · Aug 2, 2019 · Aug 8, 2019
diff --git a/q2_gneiss/cluster/_cluster.py b/q2_gneiss/cluster/_cluster.py
@@ -69,6 +69,7 @@ def correlation_clustering(table: pd.DataFrame, pseudocount: float = 0.5
 
 def gradient_clustering(table: pd.DataFrame,
                         gradient: NumericMetadataColumn,
+                        ignore_missing_samples: bool = False,
                         weighted: bool = True) -> skbio.TreeNode:
     """ Builds a tree for features based on a gradient.
 
@@ -78,6 +79,9 @@ def gradient_clustering(table: pd.DataFrame,
        Contingency table where rows are samples and columns are features.
     gradient : qiime2.NumericMetadataColumn
        Continuous vector of measurements corresponding to samples.
+    ignore_missing_samples: bool
+        If False, raise a KeyError when samples present in the table are
+        missing from the gradient metadata; if True, skip that check.
     weighted : bool
        Specifies if abundance or presence/absence information
        should be used to perform the clustering.
@@ -88,6 +92,14 @@ def gradient_clustering(table: pd.DataFrame,
        Represents the partitioning of features with respect to the gradient.
     """
     c = gradient.to_series()
+    if not ignore_missing_samples:
+        difference = set(table.index) - set(c.index)
+        if difference:
+            raise KeyError("There are samples present in the table not "
+                           "present in the gradient metadata column. Override "
+                           "this error by using the `ignore_missing_samples` "
+                           "argument. Offending samples: %r"
+                           % ', '.join(sorted([str(i) for i in difference])))
     if not weighted:
         table = (table > 0).astype(np.float)
     table, c = match(table, c)
@@ -109,7 +121,8 @@ def gradient_clustering(table: pd.DataFrame,
         'table': ('The feature table containing the samples in which '
                   'the columns will be clustered.'),
     },
-    parameters={'gradient': MetadataColumn[Numeric], 'weighted': Bool},
+    parameters={'gradient': MetadataColumn[Numeric], 'weighted': Bool,
+                'ignore_missing_samples': Bool},
     parameter_descriptions={
         'gradient': ('Contains gradient values to sort the '
                      'features and samples.'),

diff --git a/q2_gneiss/cluster/tests/test_cluster.py b/q2_gneiss/cluster/tests/test_cluster.py
@@ -81,6 +81,26 @@ def test_gradient_artifact_weighted(self):
 
         self.assertNotEqual(str(res_clust_uw), str(res_clust_w))
 
+    def test_gradient_missing_samples(self):
+        """A table sample absent from the metadata is named in the error."""
+        from qiime2.plugins.gneiss.methods import gradient_clustering
+        table = pd.DataFrame({"x": 1, "y": 2}, index=["a", "s1"])
+        table = qiime2.Artifact.import_data("FeatureTable[Frequency]", table)
+        metadata = qiime2.Metadata.load(get_data_path("test_metadata.txt"))
+        with self.assertRaisesRegex(KeyError, r"Offending samples: .*\ba\b"):
+            gradient_clustering(table, metadata.get_column("x"))
+
+    def test_gradient_ignore_missing_samples(self):
+        """Extra table samples are tolerated when the override is set."""
+        from qiime2.plugins.gneiss.methods import gradient_clustering
+        table = pd.DataFrame({"x": 1, "y": 2}, index=["a", "s1"])
+        table = qiime2.Artifact.import_data("FeatureTable[Frequency]", table)
+        metadata = qiime2.Metadata.load(get_data_path("test_metadata.txt"))
+        tree, = gradient_clustering(table, metadata.get_column("x"),
+                                    ignore_missing_samples=True)
+        # The call above must not raise; it should yield a result artifact.
+        self.assertIsInstance(tree, qiime2.Artifact)
+
     def test_assign_ids(self):
         from qiime2.plugins.gneiss.methods import assign_ids
         tree_f = get_data_path("tree.qza")