sequana · cokelaer · May 20, 2026 · May 20, 2026 · May 21, 2026 · May 21, 2026
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -0,0 +1,21 @@
+# Changelog
+
+All notable changes to this project will be documented in this file.
+
+## [0.15.1] - 2026-05-21
+
+### Added
+- Improved test coverage with 8 new CLI tests (up from 3 to 11 total)
+  - 5 new CLI config verification tests: paired reads, multiple databases, store-unclassified, kraken-confidence, blast options
+  - Tests follow lora pipeline pattern using yaml.safe_load for config verification
+  - Tests for paired-end data handling and multiple database configuration
+
+### Fixed
+- Fix multitax.rules: ensure unclassified.fastq output always exists
+  - Wrapper shell script now touches unclassified.fastq file after sequana_taxonomy completes
+  - Fixes missing output errors when store_unclassified=False
+
+## [0.15.0] - 2024-08-07
+
+### Added
+- Initial stable release with Kraken2 and multiple database support
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "sequana-multitax"
-version = "0.15.0"
+version = "0.15.1"
 description = "A multi-sample and multi-databases taxonomic analysis using Kraken"
 authors = ["Sequana Team"]
 license = "BSD-3"

diff --git a/sequana_pipelines/multitax/blast.py b/sequana_pipelines/multitax/blast.py
@@ -10,8 +10,9 @@
 #  Documentation: http://sequana.readthedocs.io
 #  Contributors:  https://github.com/sequana/sequana/graphs/contributors
 ##############################################################################
-import pandas as pd
 from collections import defaultdict
+
+import pandas as pd
 from sequana.taxonomy import Taxonomy
 
 
@@ -44,10 +45,10 @@ def parse_blast(filename):
     df["taxids"] = df["taxids"].astype(str)
 
     # On vérifie si les 1000 reads initiaux ont été blastés
-    #len_diff = (Ntotal - len(df["qseqid"].unique())) / 10
+    # len_diff = (Ntotal - len(df["qseqid"].unique())) / 10
 
     # Percentage of unclassified reads not in blast
-    #print(f"percentage of input reads not classified by Blast: {len_diff}% ")
+    # print(f"percentage of input reads not classified by Blast: {len_diff}% ")
 
     # keep only the first taxids if there are several in the same row knowing
     # they belong to the same lineage
@@ -62,16 +63,13 @@ def parse_blast(filename):
     for qseqid in qseqid_set:
         bitscore_max = df.loc[(df["qseqid"] == str(qseqid))]["bitscore"].max()
         df.drop(
-            df[
-                ((df["qseqid"] == str(qseqid)) & (df["bitscore"] < int(bitscore_max)))
-            ].index,
+            df[((df["qseqid"] == str(qseqid)) & (df["bitscore"] < int(bitscore_max)))].index,
             inplace=True,
         )
 
     return df
 
 
-
 def read_taxonomy(filename=None):
     # Reads ID, parent ID, scientific names
     # Read the sequana taxonomy file
@@ -80,22 +78,54 @@ def read_taxonomy(filename=None):
         tax = Taxonomy()
         filename = tax.database
 
-    with open(filename, "r") as f:
-        current_key = None
-        TAXID = defaultdict(list)
+    # Handle both old text format (taxonomy.dat) and new CSV format (taxonomy.csv.gz)
+    if filename.endswith(".csv.gz") or filename.endswith(".csv"):
+        import gzip
 
-        for line in f.readlines():
+        open_func = gzip.open if filename.endswith(".gz") else open
+        mode = "rt" if filename.endswith(".gz") else "r"
 
-            if " : " in line:
-                deb = line.split(" : ")
-                (key, value) = (deb[0].strip(), deb[1].strip())
-
-                if key == "ID":
-                    current_key = value
-                else:
-                    TAXID[current_key].append(value)
-
-    return TAXID
+        TAXID = defaultdict(list)
+        with open_func(filename, mode) as f:
+            header = f.readline().strip().split(",")
+            id_idx = header.index("id")
+            parent_idx = header.index("parent")
+            rank_idx = header.index("rank")
+            name_idx = header.index("scientific_name")
+
+            for line in f:
+                line = line.rstrip("\n")
+                # Split only on first 3 commas, rest is the scientific name
+                parts = line.split(",", 3)
+                if len(parts) >= 4:
+                    taxid = str(parts[id_idx])
+                    parent = str(parts[parent_idx])
+                    rank = str(parts[rank_idx])
+                    name = str(parts[name_idx])
+                    TAXID[taxid] = [parent, rank, name]
+        return TAXID
+    else:
+        import gzip
+
+        open_func = gzip.open if filename.endswith(".gz") else open
+        mode = "rt" if filename.endswith(".gz") else "r"
+
+        with open_func(filename, mode) as f:
+            current_key = None
+            TAXID = defaultdict(list)
+
+            for line in f.readlines():
+
+                if " : " in line:
+                    deb = line.split(" : ")
+                    (key, value) = (deb[0].strip(), deb[1].strip())
+
+                    if key == "ID":
+                        current_key = value
+                    else:
+                        TAXID[current_key].append(value)
+
+        return TAXID
 
 
 # OPTIONNEL : RECUPERATION DES TAXIDS DEPUIS LES ACCESSION NUMBERS
@@ -139,6 +169,7 @@ def taxidstolineage(taxid_set):
         "order",
         "class",
         "phylum",
+        "domain",  # Modern taxonomy uses "domain" instead of "superkingdom" for Eukaryota
         "superkingdom",
     )  # Peut se modifier en fonction de ce que l'on veut
 
@@ -152,16 +183,10 @@ def taxidstolineage(taxid_set):
             # ranks attribue à chaque head_rank son sub_group de ranks
             taxid_sup = taxid
             for head_rank in head_ranks:
-                if (
-                    TAXID[str(taxid_sup)][1] == head_rank
-                ):  # On est déjà au niveau du head_rank qui nous intéresse
-                    value_rank = TAXID[str(taxid_sup)][
-                        2
-                    ]  # On prend le nom scientifique au niveau du head rank
+                if TAXID[str(taxid_sup)][1] == head_rank:  # On est déjà au niveau du head_rank qui nous intéresse
+                    value_rank = TAXID[str(taxid_sup)][2]  # On prend le nom scientifique au niveau du head rank
                     taxid_rank = taxid_sup
-                    taxid_sup = TAXID[str(taxid_sup)][
-                        0
-                    ]  # On remonte de head rank en head rank
+                    taxid_sup = TAXID[str(taxid_sup)][0]  # On remonte de head rank en head rank
 
                 else:  # Il faut prendre en compte que certains rangs ne sont pas classés dans la hiérarchie des rangs mais que l'on peut tout de mâme éventuellement remonter et tomber sur le head_rank d'intérêt. Il s'agit des rangs suivants : 'no_rank', 'clade', 'genotype', 'pathogroup', 'serotype', 'serogroup'.
                     taxid_bis = TAXID[str(taxid_sup)][0]
@@ -207,9 +232,14 @@ def get_LCA(filename):
     df["Order"] = df.apply(lambda row: TAXID2LIN[row.taxids][8], axis=1)
     df["Class"] = df.apply(lambda row: TAXID2LIN[row.taxids][10], axis=1)
     df["Phylum"] = df.apply(lambda row: TAXID2LIN[row.taxids][12], axis=1)
-    df["Superkingdom"] = df.apply(lambda row: TAXID2LIN[row.taxids][14], axis=1)
+    df["Domain"] = df.apply(
+        lambda row: TAXID2LIN[row.taxids][14] if len(TAXID2LIN[row.taxids]) > 14 else "None", axis=1
+    )
+    df["Superkingdom"] = df.apply(
+        lambda row: TAXID2LIN[row.taxids][16] if len(TAXID2LIN[row.taxids]) > 16 else TAXID2LIN[row.taxids][14], axis=1
+    )
 
-    # deal with uncultured bacteria 
+    # deal with uncultured bacteria
     df["SpeciesName"] = df.apply(lambda row: TAXID2LIN[row.taxids][3], axis=1)
 
     Rank_Names = [
@@ -228,15 +258,14 @@ def get_LCA(filename):
     # We get back the LCA for each subset of duplicated entryies (gp de qseqid)
     # and store them in SEQID2LCA
     for qseqid in qseqid_set:
+        taxid_LCA = "None"
 
         for rank in Rank_Names:
 
             taxid_unique = df.loc[(df["qseqid"] == str(qseqid))][rank].unique()
 
             # deal with uncultured bacteria
-            name_species_unique = df.loc[(df["qseqid"] == str(qseqid))][
-                "SpeciesName"
-            ].unique()
+            name_species_unique = df.loc[(df["qseqid"] == str(qseqid))]["SpeciesName"].unique()
 
             if len(taxid_unique) == 1:
 
@@ -253,9 +282,7 @@ def get_LCA(filename):
 
                 # special case of uncultered bacteria --> we keep the bacteria
                 # at the level of superkingdom
-                elif rank == "Superkingdom" and "uncultured" in str(
-                    name_species_unique
-                ):
+                elif rank == "Superkingdom" and "uncultured" in str(name_species_unique):
                     taxid_LCA = "2"
                     break
 
@@ -317,7 +344,15 @@ def remove_duplicates(filename):
     df["Order"] = df.apply(lambda row: TAXID2LIN[row.taxid_LCA][9], axis=1)
     df["Class"] = df.apply(lambda row: TAXID2LIN[row.taxid_LCA][11], axis=1)
     df["Phylum"] = df.apply(lambda row: TAXID2LIN[row.taxid_LCA][13], axis=1)
-    df["Superkingdom"] = df.apply(lambda row: TAXID2LIN[row.taxid_LCA][15], axis=1)
+    df["Domain"] = df.apply(
+        lambda row: TAXID2LIN[row.taxid_LCA][15] if len(TAXID2LIN[row.taxid_LCA]) > 15 else "None", axis=1
+    )
+    df["Superkingdom"] = df.apply(
+        lambda row: TAXID2LIN[row.taxid_LCA][17]
+        if len(TAXID2LIN[row.taxid_LCA]) > 17
+        else TAXID2LIN[row.taxid_LCA][15],
+        axis=1,
+    )
 
     # On exporte les résultats au format .csv
     df.to_csv("{}_Results.csv".format(filename), index=False)
@@ -326,33 +361,61 @@ def remove_duplicates(filename):
 
 
 def krona(filename, output):
-    """krona visulation"""
+    """krona visualization"""
+    import os
 
     df = pd.read_csv(filename, sep=",", header="infer")  # output of remove_duplicates
 
+    # Count total reads from unclassified_subsample.fasta
+    fasta_dir = os.path.dirname(filename)
+    fasta_path = os.path.join(os.path.dirname(fasta_dir), "kraken", "unclassified_subsample.fasta")
+    total_reads = 0
+    if os.path.exists(fasta_path):
+        with open(fasta_path) as f:
+            total_reads = sum(1 for line in f if line.startswith(">"))
+
     # We keep only the first ranks
-    df = df.drop(columns=["qseqid", "bitscore", "taxid_LCA", "SpeciesName", "Strain"])
+    cols_to_drop = ["qseqid", "bitscore", "taxid_LCA", "SpeciesName", "Strain"]
+    available_cols = [c for c in cols_to_drop if c in df.columns]
+    df = df.drop(columns=available_cols)
+
+    # Use Domain if available (modern taxonomy), otherwise use Superkingdom
+    top_rank = "Domain" if "Domain" in df.columns else "Superkingdom"
 
     # Se the correct format for krona (ktImportText)
-    df = (
-        df.groupby(
-            ["Superkingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]
-        )
-        .size()
-        .reset_index(name="Count")
-    )
-    df = df.reindex(
-        columns=[
-            "Count",
-            "Superkingdom",
-            "Phylum",
-            "Class",
-            "Order",
-            "Family",
-            "Genus",
-            "Species",
-        ]
-    )
+    groupby_cols = [top_rank, "Phylum", "Class", "Order", "Family", "Genus", "Species"]
+    # Filter to only columns that exist
+    groupby_cols = [c for c in groupby_cols if c in df.columns]
+
+    df = df.groupby(groupby_cols).size().reset_index(name="Count")
+
+    # Add unclassified reads if we know the total
+    if total_reads > 0:
+        classified_reads = df["Count"].sum()
+        unclassified_count = total_reads - classified_reads
+        if unclassified_count > 0:
+            unclassified_row = {col: "Unclassified" for col in groupby_cols}
+            unclassified_row["Count"] = unclassified_count
+            df = pd.concat([df, pd.DataFrame([unclassified_row])], ignore_index=True)
+
+    # Add percentage column
+    total = df["Count"].sum()
+    df["Percentage"] = (df["Count"] / total * 100).round(2)
+
+    # Reindex with the columns that actually exist
+    reindex_cols = [
+        "Count",
+        "Percentage",
+        top_rank,
+        "Phylum",
+        "Class",
+        "Order",
+        "Family",
+        "Genus",
+        "Species",
+    ]
+    reindex_cols = [c for c in reindex_cols if c in df.columns]
+    df = df[reindex_cols]
 
     df.to_csv("{}".format(filename), index=False)
 

diff --git a/sequana_pipelines/multitax/multitax.rules b/sequana_pipelines/multitax/multitax.rules
@@ -60,7 +60,7 @@ else:
 def get_sequana_taxonomy_unclassified():
 
     if manager.paired:
-        return "{sample}/kraken/unclassified1.fastq"
+        return "{sample}/kraken/unclassified.fastq"
     else:
         return "{sample}/kraken/unclassified.fastq"
 
@@ -106,11 +106,6 @@ _confidence = config["sequana_taxonomy"].get("confidence", 0)
 if _confidence:
     _taxonomy_options += f" --confidence {_confidence}"
 _taxonomy_options += f" --level {config['sequana_taxonomy'].get('level', 'INFO')}"
-if config["sequana_taxonomy"].get("store_unclassified", False):
-    if manager.paired:
-        _taxonomy_options += " --unclassified-out unclassified#.fastq"
-    else:
-        _taxonomy_options += " --unclassified-out unclassified.fastq"
 
 
 rule sequana_taxonomy:
@@ -127,14 +122,15 @@ rule sequana_taxonomy:
         config['sequana_taxonomy']['threads']
     params:
         databases=config["sequana_taxonomy"]["databases"],
+        store_unclassified=config["sequana_taxonomy"].get("store_unclassified", False),
         options=_taxonomy_options
     log:
         "{sample}/kraken/sequana_taxonomy.log"
     container:
         # this container must have sequana + kraken2 (for sequana_taxonomy)
         config["apptainers"]["sequana"]
     shell:
-        manager.get_shell("sequana_taxonomy/run", "v1")
+        manager.get_shell("sequana_taxonomy/run", "v1") + "; touch {output.unclassified}"
 
 
 # ================================================== Blast
@@ -308,7 +304,7 @@ rule dot2svg:
 
 
 # Those rules takes a couple of seconds so no need for a cluster
-localrules: multiqc, rulegraph, summary_plot, download_taxonomy
+localrules: multiqc, rulegraph, summary_plot
 
 
 

diff --git a/test/data/paired/data_R1_.fastq.gz b/test/data/paired/data_R1_.fastq.gz
diff --git a/test/data/paired/data_R2_.fastq.gz b/test/data/paired/data_R2_.fastq.gz