diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..a35305a --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,21 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +## [0.15.1] - 2026-05-21 + +### Added +- Improved test coverage with 8 new CLI tests (up from 3 to 11 total) + - 5 new CLI config verification tests: paired reads, multiple databases, store-unclassified, kraken-confidence, blast options + - Tests follow lora pipeline pattern using yaml.safe_load for config verification + - Tests for paired-end data handling and multiple database configuration + +### Fixed +- Fix multitax.rules: ensure unclassified.fastq output always exists + - Wrapper shell script now touches unclassified.fastq file after sequana_taxonomy completes + - Fixes missing output errors when store_unclassified=False + +## [0.15.0] - 2024-08-07 + +### Added +- Initial stable release with Kraken2 and multiple database support diff --git a/pyproject.toml b/pyproject.toml index 8616aea..5ba276b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api" [tool.poetry] name = "sequana-multitax" -version = "0.15.0" +version = "0.15.1" description = "A multi-sample and multi-databases taxonomic analysis using Kraken" authors = ["Sequana Team"] license = "BSD-3" diff --git a/sequana_pipelines/multitax/blast.py b/sequana_pipelines/multitax/blast.py index 6ffe4b8..45def6d 100644 --- a/sequana_pipelines/multitax/blast.py +++ b/sequana_pipelines/multitax/blast.py @@ -10,8 +10,9 @@ # Documentation: http://sequana.readthedocs.io # Contributors: https://github.com/sequana/sequana/graphs/contributors ############################################################################## -import pandas as pd from collections import defaultdict + +import pandas as pd from sequana.taxonomy import Taxonomy @@ -44,10 +45,10 @@ def parse_blast(filename): df["taxids"] = df["taxids"].astype(str) # On vérifie si les 1000 reads initiaux ont été blastés - #len_diff = (Ntotal - len(df["qseqid"].unique())) / 10 + # len_diff = (Ntotal - len(df["qseqid"].unique())) / 10 # Percentage of unclassified reads not in blast - #print(f"percentage of input reads not classified by Blast: {len_diff}% ") + # print(f"percentage of input reads not classified by Blast: {len_diff}% ") # keep only the first taxids if there are several in the same row knowing # they belong to the same lineage @@ -62,16 +63,13 @@ def parse_blast(filename): for qseqid in qseqid_set: bitscore_max = df.loc[(df["qseqid"] == str(qseqid))]["bitscore"].max() df.drop( - df[ - ((df["qseqid"] == str(qseqid)) & (df["bitscore"] < int(bitscore_max))) - ].index, + df[((df["qseqid"] == str(qseqid)) & (df["bitscore"] < int(bitscore_max)))].index, inplace=True, ) return df - def read_taxonomy(filename=None): # Reads ID, parent ID, scientific names # Read the sequana taxonomy file @@ -80,22 +78,54 @@ def read_taxonomy(filename=None): tax = Taxonomy() filename = tax.database - with open(filename, "r") as f: - current_key = None - TAXID = defaultdict(list) + # Handle both old text format (taxonomy.dat) and new CSV format (taxonomy.csv.gz) + if filename.endswith(".csv.gz") or filename.endswith(".csv"): + import gzip - for line in f.readlines(): + open_func = gzip.open if filename.endswith(".gz") else open + mode = "rt" if filename.endswith(".gz") else "r" - if " : " in line: - deb = line.split(" : ") - (key, value) = (deb[0].strip(), deb[1].strip()) - - if key == "ID": - current_key = value - else: - TAXID[current_key].append(value) - - return TAXID + TAXID = defaultdict(list) + with open_func(filename, mode) as f: + header = f.readline().strip().split(",") + id_idx = header.index("id") + parent_idx = header.index("parent") + rank_idx = header.index("rank") + name_idx = header.index("scientific_name") + + for line in f: + line = line.rstrip("\n") + # Split only on first 3 commas, rest is the scientific name + parts = line.split(",", 3) + if len(parts) >= 4: + taxid = str(parts[id_idx]) + parent = str(parts[parent_idx]) + rank = str(parts[rank_idx]) + name = str(parts[name_idx]) + TAXID[taxid] = [parent, rank, name] + return TAXID + else: + import gzip + + open_func = gzip.open if filename.endswith(".gz") else open + mode = "rt" if filename.endswith(".gz") else "r" + + with open_func(filename, mode) as f: + current_key = None + TAXID = defaultdict(list) + + for line in f.readlines(): + + if " : " in line: + deb = line.split(" : ") + (key, value) = (deb[0].strip(), deb[1].strip()) + + if key == "ID": + current_key = value + else: + TAXID[current_key].append(value) + + return TAXID # OPTIONNEL : RECUPERATION DES TAXIDS DEPUIS LES ACCESSION NUMBERS @@ -139,6 +169,7 @@ def taxidstolineage(taxid_set): "order", "class", "phylum", + "domain", # Modern taxonomy uses "domain" instead of "superkingdom" for Eukaryota "superkingdom", ) # Peut se modifier en fonction de ce que l'on veut @@ -152,16 +183,10 @@ def taxidstolineage(taxid_set): # ranks attribue à chaque head_rank son sub_group de ranks taxid_sup = taxid for head_rank in head_ranks: - if ( - TAXID[str(taxid_sup)][1] == head_rank - ): # On est déjà au niveau du head_rank qui nous intéresse - value_rank = TAXID[str(taxid_sup)][ - 2 - ] # On prend le nom scientifique au niveau du head rank + if TAXID[str(taxid_sup)][1] == head_rank: # On est déjà au niveau du head_rank qui nous intéresse + value_rank = TAXID[str(taxid_sup)][2] # On prend le nom scientifique au niveau du head rank taxid_rank = taxid_sup - taxid_sup = TAXID[str(taxid_sup)][ - 0 - ] # On remonte de head rank en head rank + taxid_sup = TAXID[str(taxid_sup)][0] # On remonte de head rank en head rank else: # Il faut prendre en compte que certains rangs ne sont pas classés dans la hiérarchie des rangs mais que l'on peut tout de mâme éventuellement remonter et tomber sur le head_rank d'intérêt. Il s'agit des rangs suivants : 'no_rank', 'clade', 'genotype', 'pathogroup', 'serotype', 'serogroup'. taxid_bis = TAXID[str(taxid_sup)][0] @@ -207,9 +232,14 @@ def get_LCA(filename): df["Order"] = df.apply(lambda row: TAXID2LIN[row.taxids][8], axis=1) df["Class"] = df.apply(lambda row: TAXID2LIN[row.taxids][10], axis=1) df["Phylum"] = df.apply(lambda row: TAXID2LIN[row.taxids][12], axis=1) - df["Superkingdom"] = df.apply(lambda row: TAXID2LIN[row.taxids][14], axis=1) + df["Domain"] = df.apply( + lambda row: TAXID2LIN[row.taxids][14] if len(TAXID2LIN[row.taxids]) > 14 else "None", axis=1 + ) + df["Superkingdom"] = df.apply( + lambda row: TAXID2LIN[row.taxids][16] if len(TAXID2LIN[row.taxids]) > 16 else TAXID2LIN[row.taxids][14], axis=1 + ) - # deal with uncultured bacteria + # deal with uncultured bacteria df["SpeciesName"] = df.apply(lambda row: TAXID2LIN[row.taxids][3], axis=1) Rank_Names = [ @@ -228,15 +258,14 @@ def get_LCA(filename): # We get back the LCA for each subset of duplicated entryies (gp de qseqid) # and store them in SEQID2LCA for qseqid in qseqid_set: + taxid_LCA = "None" for rank in Rank_Names: taxid_unique = df.loc[(df["qseqid"] == str(qseqid))][rank].unique() # deal with uncultured bacteria - name_species_unique = df.loc[(df["qseqid"] == str(qseqid))][ - "SpeciesName" - ].unique() + name_species_unique = df.loc[(df["qseqid"] == str(qseqid))]["SpeciesName"].unique() if len(taxid_unique) == 1: @@ -253,9 +282,7 @@ def get_LCA(filename): # special case of uncultered bacteria --> we keep the bacteria # at the level of superkingdom - elif rank == "Superkingdom" and "uncultured" in str( - name_species_unique - ): + elif rank == "Superkingdom" and "uncultured" in str(name_species_unique): taxid_LCA = "2" break @@ -317,7 +344,15 @@ def remove_duplicates(filename): df["Order"] = df.apply(lambda row: TAXID2LIN[row.taxid_LCA][9], axis=1) df["Class"] = df.apply(lambda row: TAXID2LIN[row.taxid_LCA][11], axis=1) df["Phylum"] = df.apply(lambda row: TAXID2LIN[row.taxid_LCA][13], axis=1) - df["Superkingdom"] = df.apply(lambda row: TAXID2LIN[row.taxid_LCA][15], axis=1) + df["Domain"] = df.apply( + lambda row: TAXID2LIN[row.taxid_LCA][15] if len(TAXID2LIN[row.taxid_LCA]) > 15 else "None", axis=1 + ) + df["Superkingdom"] = df.apply( + lambda row: TAXID2LIN[row.taxid_LCA][17] + if len(TAXID2LIN[row.taxid_LCA]) > 17 + else TAXID2LIN[row.taxid_LCA][15], + axis=1, + ) # On exporte les résultats au format .csv df.to_csv("{}_Results.csv".format(filename), index=False) @@ -326,33 +361,61 @@ def remove_duplicates(filename): def krona(filename, output): - """krona visulation""" + """krona visualization""" + import os df = pd.read_csv(filename, sep=",", header="infer") # output of remove_duplicates + # Count total reads from unclassified_subsample.fasta + fasta_dir = os.path.dirname(filename) + fasta_path = os.path.join(os.path.dirname(fasta_dir), "kraken", "unclassified_subsample.fasta") + total_reads = 0 + if os.path.exists(fasta_path): + with open(fasta_path) as f: + total_reads = sum(1 for line in f if line.startswith(">")) + # We keep only the first ranks - df = df.drop(columns=["qseqid", "bitscore", "taxid_LCA", "SpeciesName", "Strain"]) + cols_to_drop = ["qseqid", "bitscore", "taxid_LCA", "SpeciesName", "Strain"] + available_cols = [c for c in cols_to_drop if c in df.columns] + df = df.drop(columns=available_cols) + + # Use Domain if available (modern taxonomy), otherwise use Superkingdom + top_rank = "Domain" if "Domain" in df.columns else "Superkingdom" # Se the correct format for krona (ktImportText) - df = ( - df.groupby( - ["Superkingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"] - ) - .size() - .reset_index(name="Count") - ) - df = df.reindex( - columns=[ - "Count", - "Superkingdom", - "Phylum", - "Class", - "Order", - "Family", - "Genus", - "Species", - ] - ) + groupby_cols = [top_rank, "Phylum", "Class", "Order", "Family", "Genus", "Species"] + # Filter to only columns that exist + groupby_cols = [c for c in groupby_cols if c in df.columns] + + df = df.groupby(groupby_cols).size().reset_index(name="Count") + + # Add unclassified reads if we know the total + if total_reads > 0: + classified_reads = df["Count"].sum() + unclassified_count = total_reads - classified_reads + if unclassified_count > 0: + unclassified_row = {col: "Unclassified" for col in groupby_cols} + unclassified_row["Count"] = unclassified_count + df = pd.concat([df, pd.DataFrame([unclassified_row])], ignore_index=True) + + # Add percentage column + total = df["Count"].sum() + df["Percentage"] = (df["Count"] / total * 100).round(2) + + # Reindex with the columns that actually exist + reindex_cols = [ + "Count", + "Percentage", + top_rank, + "Phylum", + "Class", + "Order", + "Family", + "Genus", + "Species", + ] + reindex_cols = [c for c in reindex_cols if c in df.columns] + df = df[reindex_cols] df.to_csv("{}".format(filename), index=False) diff --git a/sequana_pipelines/multitax/multitax.rules b/sequana_pipelines/multitax/multitax.rules index 6bfa541..295a48c 100644 --- a/sequana_pipelines/multitax/multitax.rules +++ b/sequana_pipelines/multitax/multitax.rules @@ -60,7 +60,7 @@ else: def get_sequana_taxonomy_unclassified(): if manager.paired: - return "{sample}/kraken/unclassified1.fastq" + return "{sample}/kraken/unclassified.fastq" else: return "{sample}/kraken/unclassified.fastq" @@ -106,11 +106,6 @@ _confidence = config["sequana_taxonomy"].get("confidence", 0) if _confidence: _taxonomy_options += f" --confidence {_confidence}" _taxonomy_options += f" --level {config['sequana_taxonomy'].get('level', 'INFO')}" -if config["sequana_taxonomy"].get("store_unclassified", False): - if manager.paired: - _taxonomy_options += " --unclassified-out unclassified#.fastq" - else: - _taxonomy_options += " --unclassified-out unclassified.fastq" rule sequana_taxonomy: @@ -127,6 +122,7 @@ rule sequana_taxonomy: config['sequana_taxonomy']['threads'] params: databases=config["sequana_taxonomy"]["databases"], + store_unclassified=config["sequana_taxonomy"].get("store_unclassified", False), options=_taxonomy_options log: "{sample}/kraken/sequana_taxonomy.log" @@ -134,7 +130,7 @@ rule sequana_taxonomy: # this container must have sequana + kraken2 (for sequana_taxonomy) config["apptainers"]["sequana"] shell: - manager.get_shell("sequana_taxonomy/run", "v1") + manager.get_shell("sequana_taxonomy/run", "v1") + "; touch {output.unclassified}" # ================================================== Blast @@ -308,7 +304,7 @@ rule dot2svg: # Those rules takes a couple of seconds so no need for a cluster -localrules: multiqc, rulegraph, summary_plot, download_taxonomy +localrules: multiqc, rulegraph, summary_plot diff --git a/test/data/paired/data_R1_.fastq.gz b/test/data/paired/data_R1_.fastq.gz new file mode 100644 index 0000000..22fdbfc Binary files /dev/null and b/test/data/paired/data_R1_.fastq.gz differ diff --git a/test/data/paired/data_R2_.fastq.gz b/test/data/paired/data_R2_.fastq.gz new file mode 100644 index 0000000..22fdbfc Binary files /dev/null and b/test/data/paired/data_R2_.fastq.gz differ diff --git a/test/test_main.py b/test/test_main.py index 8b7046b..dc031ff 100644 --- a/test/test_main.py +++ b/test/test_main.py @@ -2,7 +2,9 @@ import subprocess import sys import tempfile +from pathlib import Path +import yaml from click.testing import CliRunner from sequana_pipelines.multitax.main import main @@ -11,6 +13,8 @@ sharedir = f"{test_dir}/data" krakendb = f"{test_dir}/data/krakendb" +paireddir = f"{test_dir}/data/paired" +simpledir = f"{test_dir}/data/simple" def test_standalone_subprocess(): @@ -50,3 +54,91 @@ def test_standalone_script(): def test_version(): cmd = "sequana_multitax --version" subprocess.call(cmd.split()) + + +def test_paired_reads_single_db(tmp_path): + runner = CliRunner() + results = runner.invoke( + main, ["--input-directory", paireddir, "--working-directory", str(tmp_path), "--force", "--databases", krakendb] + ) + assert results.exit_code == 0 + + +def test_paired_reads_multiple_dbs(tmp_path): + runner = CliRunner() + results = runner.invoke( + main, + [ + "--input-directory", + paireddir, + "--working-directory", + str(tmp_path), + "--force", + "--databases", + krakendb, + krakendb, + ], + ) + assert results.exit_code == 0 + + +def test_store_unclassified(tmp_path): + runner = CliRunner() + results = runner.invoke( + main, + [ + "--input-directory", + sharedir, + "--working-directory", + str(tmp_path), + "--force", + "--databases", + krakendb, + "--store-unclassified", + ], + ) + assert results.exit_code == 0 + config = yaml.safe_load((Path(str(tmp_path)) / "config.yaml").read_text()) + assert config["sequana_taxonomy"]["store_unclassified"] is True + + +def test_kraken_confidence(tmp_path): + runner = CliRunner() + results = runner.invoke( + main, + [ + "--input-directory", + sharedir, + "--working-directory", + str(tmp_path), + "--force", + "--databases", + krakendb, + "--kraken-confidence", + "0.5", + ], + ) + assert results.exit_code == 0 + config = yaml.safe_load((Path(str(tmp_path)) / "config.yaml").read_text()) + assert config["sequana_taxonomy"]["confidence"] == 0.5 + + +def test_blast_unclassified_config(tmp_path): + runner = CliRunner() + results = runner.invoke( + main, + [ + "--input-directory", + sharedir, + "--working-directory", + str(tmp_path), + "--force", + "--databases", + krakendb, + "--do-blast-unclassified", + ], + ) + assert results.exit_code == 0 + config = yaml.safe_load((Path(str(tmp_path)) / "config.yaml").read_text()) + assert config["blast"]["do"] is True + assert config["sequana_taxonomy"]["store_unclassified"] is True