MRG: force continue past tax genome classification errors (#3100)

When we were doing one or a few genome classifications, it made sense to error out completely if there was an issue. Now that we have fastmultigather and can do 10s of thousands at once, It would be nice to be able to continue past errors (logging them). **Changed behavior:** - If there is a failed classification, notify the error and do not write that result. Continue with classification. - Finish classification and write output file, BUT exit with an error code if there were errors, except if --force is used. - Remove some previously useful reporting about the classification ranks, because it's too much output for large-scale classification. --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
sourmash-bio · Apr 12, 2024 · e0d002a · e0d002a
1 parent a387d22
commit e0d002a
Show file tree

Hide file tree

Showing 3 changed files with 131 additions and 30 deletions.
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
@@ -313,6 +313,9 @@ def genome(args):
         sys.exit(-1)
 
     # for each queryResult, summarize at rank and classify according to thresholds, reporting any errors that occur.
+    n_total = len(query_gather_results)
+    classified_results = []
+    found_error = False
     for queryResult in query_gather_results:
         try:
             queryResult.build_classification_result(
@@ -322,10 +325,21 @@ def genome(args):
                 lingroup_ranks=lg_ranks,
                 lingroups=all_lgs,
             )
+            classified_results.append(queryResult)
 
         except ValueError as exc:
-            error(f"ERROR: {str(exc)}")
-            sys.exit(-1)
+            found_error = True
+            notify(f"ERROR: {str(exc)}")
+
+    n_classified = len(classified_results)
+    if n_classified == 0:
+        notify("No queries could be classified. Exiting.")
+        sys.exit(-1)
+    else:
+        classif_perc = (float(n_classified) / float(n_total)) * 100
+        notify(
+            f"classified {n_classified}/{n_total} queries ({classif_perc :.2f}%). Writing results"
+        )
 
     # write outputs
     if "csv_summary" in args.output_format:
@@ -334,7 +348,7 @@ def genome(args):
         )
         with FileOutputCSV(summary_outfile) as out_fp:
             tax_utils.write_summary(
-                query_gather_results,
+                classified_results,
                 out_fp,
                 limit_float_decimals=limit_float,
                 classification=True,
@@ -389,6 +403,11 @@ def genome(args):
         with FileOutputCSV(lineage_outfile) as out_fp:
             tax_utils.write_output(header, lineage_results, out_fp)
 
+    # if there was a classification error, exit with err code
+    if found_error:
+        if not args.force:
+            sys.exit(-1)
+
 
 def annotate(args):
     """

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
@@ -2328,9 +2328,6 @@ def summarize_up_ranks(self, single_rank=None, force_resummarize=False):
                     f"Error: rank '{single_rank}' not in available ranks ({', '.join(self.summarized_ranks)})"
                 )
             self.summarized_ranks = [single_rank]
-        notify(
-            f"Starting summarization up rank(s): {', '.join(self.summarized_ranks)} "
-        )
         for taxres in self.raw_taxresults:
             lininfo = taxres.lineageInfo
             if (

diff --git a/tests/test_tax.py b/tests/test_tax.py
@@ -2700,6 +2700,115 @@ def test_genome_gather_two_files_empty_force(runtmp):
     )
 
 
+def test_genome_gather_two_files_one_classif_fail(runtmp):
+    # if one query cant be classified still get classif for second
+    # no --force = fail but still write file
+    c = runtmp
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    g_res = utils.get_test_data("tax/test1.gather.csv")
+
+    # make test2 results (identical to test1 except query_name and filename)
+    g_res2 = runtmp.output("test2.gather.csv")
+    test2_results = [
+        x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines()
+    ]
+    test2_results[1] = test2_results[1].replace(
+        "0.08815317112086159", "1.1"
+    )  # make test2 f_unique_to_query sum to >1
+    for line in test2_results:
+        print(line)
+    with open(g_res2, "w") as fp:
+        fp.writelines(test2_results)
+
+    with pytest.raises(SourmashCommandFailed):
+        c.run_sourmash(
+            "tax",
+            "genome",
+            "-g",
+            g_res,
+            g_res2,
+            "--taxonomy-csv",
+            taxonomy_csv,
+            "--rank",
+            "species",
+            "--containment-threshold",
+            "0",
+        )
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == -1
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )
+    assert "test2" not in c.last_result.out
+    assert (
+        "ERROR: Summarized fraction is > 100% of the query! This should not be possible. Please check that your input files come directly from a single gather run per query."
+        in c.last_result.err
+    )
+
+
+def test_genome_gather_two_files_one_classif(runtmp):
+    # if one query cant be classified, still get classif for second
+    c = runtmp
+    taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
+    g_res = utils.get_test_data("tax/test1.gather.csv")
+
+    # make test2 results (identical to test1 except query_name and filename)
+    g_res2 = runtmp.output("test2.gather.csv")
+    test2_results = [
+        x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines()
+    ]
+    test2_results[1] = test2_results[1].replace(
+        "0.08815317112086159", "1.1"
+    )  # make test2 f_unique_to_query sum to >1
+    for line in test2_results:
+        print(line)
+    with open(g_res2, "w") as fp:
+        fp.writelines(test2_results)
+
+    c.run_sourmash(
+        "tax",
+        "genome",
+        "-g",
+        g_res,
+        g_res2,
+        "--taxonomy-csv",
+        taxonomy_csv,
+        "--rank",
+        "species",
+        "--containment-threshold",
+        "0",
+        "--force",
+    )
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert (
+        "query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
+        in c.last_result.out
+    )
+    assert (
+        "test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
+        in c.last_result.out
+    )
+    assert "test2" not in c.last_result.out
+    assert (
+        "ERROR: Summarized fraction is > 100% of the query! This should not be possible. Please check that your input files come directly from a single gather run per query."
+        in c.last_result.err
+    )
+
+
 def test_genome_gather_duplicate_filename(runtmp):
     c = runtmp
     taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
@@ -5936,10 +6045,6 @@ def test_metagenome_LIN_lingroups(runtmp):
     print(c.last_result.err)
 
     assert c.last_result.status == 0
-    assert (
-        "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
-        in c.last_result.err
-    )
     assert (
         "Read 5 lingroup rows and found 5 distinct lingroup prefixes."
         in c.last_result.err
@@ -5970,10 +6075,6 @@ def test_metagenome_LIN_human_summary_no_lin_position(runtmp):
     print(c.last_result.err)
 
     assert c.last_result.status == 0
-    assert (
-        "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
-        in c.last_result.err
-    )
     assert "sample name    proportion   cANI   lineage" in c.last_result.out
     assert "-----------    ----------   ----   -------" in c.last_result.out
     assert "test1             86.9%     -      unclassified" in c.last_result.out
@@ -6020,10 +6121,6 @@ def test_metagenome_LIN_human_summary_lin_position_5(runtmp):
     print(c.last_result.err)
 
     assert c.last_result.status == 0
-    assert (
-        "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
-        in c.last_result.err
-    )
     assert "sample name    proportion   cANI   lineage" in c.last_result.out
     assert "-----------    ----------   ----   -------" in c.last_result.out
     assert "test1             86.9%     -      unclassified" in c.last_result.out
@@ -6058,10 +6155,6 @@ def test_metagenome_LIN_krona_lin_position_5(runtmp):
     print(c.last_result.err)
 
     assert c.last_result.status == 0
-    assert (
-        "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
-        in c.last_result.err
-    )
     assert "fraction	0	1	2	3	4	5" in c.last_result.out
     assert "0.08815317112086159	0	0	0	0	0	0" in c.last_result.out
     assert "0.07778220981252493	1	0	0	0	0	0" in c.last_result.out
@@ -6133,10 +6226,6 @@ def test_metagenome_LIN_lingroups_empty_lg_file(runtmp):
     print(c.last_result.err)
 
     assert c.last_result.status != 0
-    assert (
-        "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
-        in c.last_result.err
-    )
     assert (
         f"Cannot read lingroups from '{lg_file}'. Is file empty?" in c.last_result.err
     )
@@ -6302,8 +6391,4 @@ def test_metagenome_LIN_lingroups_lg_only_header(runtmp):
     print(c.last_result.err)
 
     assert c.last_result.status != 0
-    assert (
-        "Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
-        in c.last_result.err
-    )
     assert f"No lingroups loaded from {lg_file}" in c.last_result.err