From 60df42c66763766875c32cb876ddb5b8a619068b Mon Sep 17 00:00:00 2001 From: Thomas Cokelaer Date: Tue, 14 Apr 2026 22:12:04 +0200 Subject: [PATCH 1/2] Fix import --- sequana_pipelines/laa/laa.rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sequana_pipelines/laa/laa.rules b/sequana_pipelines/laa/laa.rules index 0ecd323..de37ffa 100644 --- a/sequana_pipelines/laa/laa.rules +++ b/sequana_pipelines/laa/laa.rules @@ -589,7 +589,7 @@ rule consensus: params: min_depth = 10 run: - from sequana.laa import Consensus + from sequana_pipelines.laa.consensus import Consensus c = Consensus(input.bases, input.freebayes) c.min_depth = params.min_depth From 41fb6c9bb55a75712f4764d88082833ae17174d8 Mon Sep 17 00:00:00 2001 From: Thomas Cokelaer Date: Tue, 14 Apr 2026 22:20:48 +0200 Subject: [PATCH 2/2] Add missing file --- sequana_pipelines/laa/consensus.py | 106 +++++++++++++++++++++++++++++ 1 file changed, 106 insertions(+) create mode 100644 sequana_pipelines/laa/consensus.py diff --git a/sequana_pipelines/laa/consensus.py b/sequana_pipelines/laa/consensus.py new file mode 100644 index 0000000..a063f24 --- /dev/null +++ b/sequana_pipelines/laa/consensus.py @@ -0,0 +1,106 @@ +"""Local copy of sequana.laa.Consensus fixed for sequana>=0.18 (VariantFile API). + +Kept in the pipeline so the workflow does not depend on an unreleased sequana fix. 
+""" +import colorlog +import pandas as pd + +from sequana.variants import VariantFile + +logger = colorlog.getLogger(__name__) + + +class Consensus: + def __init__(self, bases, freebayes=None): + self.filename_bases = bases + self.min_depth = 10 + self.min_score = 1 + + if freebayes is not None: + v = VariantFile(freebayes) + self.variants = [v._variant_to_dict(x) for x in v.variants] + else: + self.variants = [] + + def identify_deletions(self): + deletions = [] + for variant in self.variants: + alt = variant["alternative"] + ref = variant["reference"] + dp = variant["depth"] + score = variant["freebayes_score"] + + if score < self.min_score: + continue + if dp < self.min_depth: + continue + if len(alt) < 1: + continue + if len(alt) <= len(ref): + continue + + deletions.append(variant) + return deletions + + def get_bases(self, skip_rows=3): + try: + df = pd.read_csv(self.filename_bases, sep="\t", skiprows=skip_rows, header=None) + except pd.errors.ParserError: + df = pd.read_csv(self.filename_bases, sep="\t", skiprows=5, header=None) + + df.columns = ["Pos", "A", "C", "G", "T", "N", "DEL", "INS"] + df = df.set_index("Pos") + + indices = df[df.sum(axis=1) < self.min_depth].index + df.loc[indices] = 0 + df.loc[indices, "N"] = 10000 + return df + + def run(self): + df = self.get_bases() + deletions = self.identify_deletions() + + dd = df.apply(lambda x: x.idxmax(), axis=1) + + for d in deletions: + pos = int(d["position"]) + ref = d["reference"] + if "".join(dd.loc[pos : pos + len(ref) - 1]) != ref: + logger.warning( + "reference string {} not found in consensus at position {}".format(ref, pos) + ) + + for d in deletions: + pos = int(d["position"]) + ref = d["reference"] + alt = d["alternative"] + + dfA = df.loc[0 : pos - 1] + + dfB = df.iloc[0 : len(alt)].copy() + dfB.index = [pos] * len(dfB) + dfB *= 0 + for i, nucleotide in enumerate(alt): + dfB.iloc[i][nucleotide] = 10000 + + dfC = df.loc[pos + len(ref) :] + + df = pd.concat([dfA, dfB, dfC]) + + 
df.reset_index(drop=True, inplace=True) + + dd = df.apply(lambda x: x.idxmax(), axis=1) + return dd + + def save_consensus(self, output, identifier): + dd = self.run() + with open(output, "w") as fout: + data = "".join(dd).replace("DEL", "") + fout.write(">{}\n{}\n".format(identifier, data)) + + def get_population(self, threshold=0.1, Npop=2): + df = self.get_bases() + cols = ["A", "C", "G", "T", "N", "DEL"] + df = df.divide(df[cols].sum(axis=1), axis=0) + selection = df[(df[cols] > threshold).sum(axis=1) >= 2] + return selection