Skip to content

Commit

Permalink
MRG: force continue past tax genome classification errors (#3100)
Browse files Browse the repository at this point in the history
When we were doing one or a few genome classifications, it made sense to
error out completely if there was an issue. Now that we have
fastmultigather and can do 10s of thousands at once, It would be nice to
be able to continue past errors (logging them).

**Changed behavior:**
- If there is a failed classification, notify the error and do not write
that result. Continue with classification.
- Finish classification and write output file, BUT exit with an error
code if there were errors, except if --force is used.
- Remove some previously useful reporting about the classification
ranks, because it's too much output for large-scale classification.

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
bluegenes and pre-commit-ci[bot] committed Apr 12, 2024
1 parent a387d22 commit e0d002a
Show file tree
Hide file tree
Showing 3 changed files with 131 additions and 30 deletions.
25 changes: 22 additions & 3 deletions src/sourmash/tax/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,9 @@ def genome(args):
sys.exit(-1)

# for each queryResult, summarize at rank and classify according to thresholds, reporting any errors that occur.
n_total = len(query_gather_results)
classified_results = []
found_error = False
for queryResult in query_gather_results:
try:
queryResult.build_classification_result(
Expand All @@ -322,10 +325,21 @@ def genome(args):
lingroup_ranks=lg_ranks,
lingroups=all_lgs,
)
classified_results.append(queryResult)

except ValueError as exc:
error(f"ERROR: {str(exc)}")
sys.exit(-1)
found_error = True
notify(f"ERROR: {str(exc)}")

n_classified = len(classified_results)
if n_classified == 0:
notify("No queries could be classified. Exiting.")
sys.exit(-1)
else:
classif_perc = (float(n_classified) / float(n_total)) * 100
notify(
f"classified {n_classified}/{n_total} queries ({classif_perc :.2f}%). Writing results"
)

# write outputs
if "csv_summary" in args.output_format:
Expand All @@ -334,7 +348,7 @@ def genome(args):
)
with FileOutputCSV(summary_outfile) as out_fp:
tax_utils.write_summary(
query_gather_results,
classified_results,
out_fp,
limit_float_decimals=limit_float,
classification=True,
Expand Down Expand Up @@ -389,6 +403,11 @@ def genome(args):
with FileOutputCSV(lineage_outfile) as out_fp:
tax_utils.write_output(header, lineage_results, out_fp)

# if there was a classification error, exit with err code
if found_error:
if not args.force:
sys.exit(-1)


def annotate(args):
"""
Expand Down
3 changes: 0 additions & 3 deletions src/sourmash/tax/tax_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2328,9 +2328,6 @@ def summarize_up_ranks(self, single_rank=None, force_resummarize=False):
f"Error: rank '{single_rank}' not in available ranks ({', '.join(self.summarized_ranks)})"
)
self.summarized_ranks = [single_rank]
notify(
f"Starting summarization up rank(s): {', '.join(self.summarized_ranks)} "
)
for taxres in self.raw_taxresults:
lininfo = taxres.lineageInfo
if (
Expand Down
133 changes: 109 additions & 24 deletions tests/test_tax.py
Original file line number Diff line number Diff line change
Expand Up @@ -2700,6 +2700,115 @@ def test_genome_gather_two_files_empty_force(runtmp):
)


def test_genome_gather_two_files_one_classif_fail(runtmp):
# if one query cant be classified still get classif for second
# no --force = fail but still write file
c = runtmp
taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
g_res = utils.get_test_data("tax/test1.gather.csv")

# make test2 results (identical to test1 except query_name and filename)
g_res2 = runtmp.output("test2.gather.csv")
test2_results = [
x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines()
]
test2_results[1] = test2_results[1].replace(
"0.08815317112086159", "1.1"
) # make test2 f_unique_to_query sum to >1
for line in test2_results:
print(line)
with open(g_res2, "w") as fp:
fp.writelines(test2_results)

with pytest.raises(SourmashCommandFailed):
c.run_sourmash(
"tax",
"genome",
"-g",
g_res,
g_res2,
"--taxonomy-csv",
taxonomy_csv,
"--rank",
"species",
"--containment-threshold",
"0",
)

print(c.last_result.status)
print(c.last_result.out)
print(c.last_result.err)

assert c.last_result.status == -1
assert (
"query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
in c.last_result.out
)
assert (
"test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
in c.last_result.out
)
assert "test2" not in c.last_result.out
assert (
"ERROR: Summarized fraction is > 100% of the query! This should not be possible. Please check that your input files come directly from a single gather run per query."
in c.last_result.err
)


def test_genome_gather_two_files_one_classif(runtmp):
# if one query cant be classified, still get classif for second
c = runtmp
taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
g_res = utils.get_test_data("tax/test1.gather.csv")

# make test2 results (identical to test1 except query_name and filename)
g_res2 = runtmp.output("test2.gather.csv")
test2_results = [
x.replace("test1", "test2") + "\n" for x in Path(g_res).read_text().splitlines()
]
test2_results[1] = test2_results[1].replace(
"0.08815317112086159", "1.1"
) # make test2 f_unique_to_query sum to >1
for line in test2_results:
print(line)
with open(g_res2, "w") as fp:
fp.writelines(test2_results)

c.run_sourmash(
"tax",
"genome",
"-g",
g_res,
g_res2,
"--taxonomy-csv",
taxonomy_csv,
"--rank",
"species",
"--containment-threshold",
"0",
"--force",
)

print(c.last_result.status)
print(c.last_result.out)
print(c.last_result.err)

assert c.last_result.status == 0
assert (
"query_name,status,rank,fraction,lineage,query_md5,query_filename,f_weighted_at_rank,bp_match_at_rank"
in c.last_result.out
)
assert (
"test1,match,species,0.089,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri,md5,test1.sig,0.057,444000"
in c.last_result.out
)
assert "test2" not in c.last_result.out
assert (
"ERROR: Summarized fraction is > 100% of the query! This should not be possible. Please check that your input files come directly from a single gather run per query."
in c.last_result.err
)


def test_genome_gather_duplicate_filename(runtmp):
c = runtmp
taxonomy_csv = utils.get_test_data("tax/test.taxonomy.csv")
Expand Down Expand Up @@ -5936,10 +6045,6 @@ def test_metagenome_LIN_lingroups(runtmp):
print(c.last_result.err)

assert c.last_result.status == 0
assert (
"Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
in c.last_result.err
)
assert (
"Read 5 lingroup rows and found 5 distinct lingroup prefixes."
in c.last_result.err
Expand Down Expand Up @@ -5970,10 +6075,6 @@ def test_metagenome_LIN_human_summary_no_lin_position(runtmp):
print(c.last_result.err)

assert c.last_result.status == 0
assert (
"Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
in c.last_result.err
)
assert "sample name proportion cANI lineage" in c.last_result.out
assert "----------- ---------- ---- -------" in c.last_result.out
assert "test1 86.9% - unclassified" in c.last_result.out
Expand Down Expand Up @@ -6020,10 +6121,6 @@ def test_metagenome_LIN_human_summary_lin_position_5(runtmp):
print(c.last_result.err)

assert c.last_result.status == 0
assert (
"Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
in c.last_result.err
)
assert "sample name proportion cANI lineage" in c.last_result.out
assert "----------- ---------- ---- -------" in c.last_result.out
assert "test1 86.9% - unclassified" in c.last_result.out
Expand Down Expand Up @@ -6058,10 +6155,6 @@ def test_metagenome_LIN_krona_lin_position_5(runtmp):
print(c.last_result.err)

assert c.last_result.status == 0
assert (
"Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
in c.last_result.err
)
assert "fraction 0 1 2 3 4 5" in c.last_result.out
assert "0.08815317112086159 0 0 0 0 0 0" in c.last_result.out
assert "0.07778220981252493 1 0 0 0 0 0" in c.last_result.out
Expand Down Expand Up @@ -6133,10 +6226,6 @@ def test_metagenome_LIN_lingroups_empty_lg_file(runtmp):
print(c.last_result.err)

assert c.last_result.status != 0
assert (
"Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
in c.last_result.err
)
assert (
f"Cannot read lingroups from '{lg_file}'. Is file empty?" in c.last_result.err
)
Expand Down Expand Up @@ -6302,8 +6391,4 @@ def test_metagenome_LIN_lingroups_lg_only_header(runtmp):
print(c.last_result.err)

assert c.last_result.status != 0
assert (
"Starting summarization up rank(s): 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0"
in c.last_result.err
)
assert f"No lingroups loaded from {lg_file}" in c.last_result.err

0 comments on commit e0d002a

Please sign in to comment.