In [2]:
## HUMAN VS VIRUS

#!/usr/bin/env python3
import os, re, glob, pandas as pd
from typing import Optional, Tuple, Dict

# Directory that main() passes on as the place to search for "*.report" files.
RESULTS_DIR = "/blue/simone.marini/share/kraken2/Results"
# Directory scanned for "*_test_only.csv" files; the summary CSV is written here too.
DATA_DIR    = "/blue/simone.marini/share/kraken2/Data/data_prediction"
# Destination of the per-tag summary table written by main().
OUT_CSV     = os.path.join(DATA_DIR, "kraken_Human_vs_Virus_balanced_accuracy_summary.csv")

def split_report_line(line: str):
    """Split one report line into its first six columns.

    The line is split on tabs first. If that yields fewer than six fields it
    is re-split on runs of whitespace, with at most five splits so that the
    last column keeps any internal spaces.

    Args:
        line: One raw line of a report file, with or without a trailing newline.

    Returns:
        A list of six strings, or None when the line has fewer than six columns.
    """
    fields = line.rstrip("\n").split("\t")
    if len(fields) < 6:
        # Drop leading whitespace before the whitespace split: a line that
        # starts with blanks would otherwise produce an empty first field and
        # shift every column one position to the right.
        fields = re.split(r"\s+", line.rstrip("\n").lstrip(), maxsplit=5)
    if len(fields) < 6:
        return None
    return fields[:6]

def extract_pct_taxid(path: str, target_taxid: str) -> Optional[float]:
    """Return the percentage (first column) of the first row whose taxid matches.

    Args:
        path: Report file to scan; a falsy or non-existent path yields None.
        target_taxid: Taxid to look for, compared as a string against column five.

    Returns:
        The percentage as a float (a decimal comma is accepted), or None when
        the file is unavailable, no row matches, or the value cannot be parsed.
    """
    if not (path and os.path.exists(path)):
        return None
    with open(path, "r") as handle:
        parsed_rows = (split_report_line(text) for text in handle)
        # Only the first matching row is considered; unparsable lines are skipped.
        hit = next(
            (cols for cols in parsed_rows if cols and cols[4].strip() == target_taxid),
            None,
        )
    if hit is None:
        return None
    try:
        return float(hit[0].strip().replace(",", "."))
    except ValueError:
        return None

def counts_from_csv(path: str) -> Tuple[int, int]:
    """Count the rows labelled "virus" and "human" in the CSV at `path`.

    Returns:
        (n_virus, n_human) taken from the "label" column; a label that never
        occurs counts as 0.
    """
    tally = pd.read_csv(path)["label"].value_counts()
    n_virus, n_human = (int(tally.get(name, 0)) for name in ("virus", "human"))
    return n_virus, n_human

def norm_key(s: str) -> str:
    """Return `s` lower-cased with all whitespace and underscores removed."""
    pieces = re.split(r"[\s_]+", s)
    return "".join(pieces).lower()

def find_corresponding_report(stem: str, kind: str, results_dir: str) -> Optional[str]:
    """Locate the report file for `stem` and `kind` inside `results_dir`.

    Two passes are made over the directory:
      1. a file whose name equals "<stem>_<kind>.report" once both are passed
         through `norm_key` (whitespace, underscores and case ignored);
      2. otherwise, a file ending in "_<kind>.report" whose normalised name
         starts with the normalised stem.

    Candidates are visited in sorted order so the same file is picked on every
    run when several match; `glob.glob` returns paths in arbitrary order.

    Returns:
        The path of the first matching file, or None when nothing matches.
    """
    want = norm_key(f"{stem}_{kind}.report")
    for p in sorted(glob.glob(os.path.join(results_dir, "*.report"))):
        if norm_key(os.path.basename(p)) == want:
            return p
    # Fallback: tolerant prefix match restricted to files of the requested kind.
    want_prefix = norm_key(stem)
    suffix = f"_{kind}.report"
    for p in sorted(glob.glob(os.path.join(results_dir, f"*{kind}.report"))):
        b = os.path.basename(p)
        if norm_key(b).startswith(want_prefix) and b.lower().endswith(suffix):
            return p
    return None

def compute_for_tag(tag: str, results_dir: str, data_dir: str) -> Dict:
    """Build the human-vs-virus metrics row for one split tag.

    Virus is the positive class and human the negative class. The rates are
    the percentages read from the "virus" and "human" report files for `tag`;
    the class sizes come from "<tag>_test_only.csv"; the confusion-matrix
    counts are estimates obtained by scaling each class size by its rate.

    Args:
        tag: Split identifier (CSV file name without "_test_only.csv").
        results_dir: Directory searched for the report files.
        data_dir: Directory holding the "<tag>_test_only.csv" file.

    Returns:
        A dict with one entry per output column. Any value whose inputs are
        unavailable is None.
    """
    rep_human = find_corresponding_report(tag, "human", results_dir)
    rep_virus = find_corresponding_report(tag, "virus", results_dir)
    csv_path  = os.path.join(data_dir, f"{tag}_test_only.csv")

    # Taxid 9606 first, taxid 9605 only when the 9606 lookup returns None.
    # The check is against None rather than truthiness: 0.0 is a valid
    # percentage and must not trigger the fallback.
    human_pct = extract_pct_taxid(rep_human, "9606")
    if human_pct is None:
        human_pct = extract_pct_taxid(rep_human, "9605")
    virus_pct = extract_pct_taxid(rep_virus, "10239")

    n_virus = n_human = None
    if os.path.exists(csv_path):
        n_virus, n_human = counts_from_csv(csv_path)

    # Rates as fractions in [0, 1]; FNR/FPR are the complements.
    TPR = virus_pct/100.0 if virus_pct is not None else None
    TNR = human_pct/100.0 if human_pct is not None else None
    FNR = None if TPR is None else (1-TPR)
    FPR = None if TNR is None else (1-TNR)

    # Estimated confusion matrix: class size scaled by the matching rate.
    TP = FN = TN = FP = None
    if (n_virus is not None) and (TPR is not None):
        TP = round(TPR * n_virus)
        FN = n_virus - TP
    if (n_human is not None) and (TNR is not None):
        TN = round(TNR * n_human)
        FP = n_human - TN

    bacc = None if (TPR is None or TNR is None) else (TPR + TNR)/2.0

    return {
        "tag": tag,
        "virus_report": rep_virus,
        "human_report": rep_human,
        "csv": csv_path,
        "virus_pct": virus_pct,
        "non_virus_pct": None if virus_pct is None else (100 - virus_pct),
        "human_pct": human_pct,
        "non_homo_pct": None if human_pct is None else (100 - human_pct),
        "n_virus": n_virus, "n_human": n_human,
        "TPR": TPR, "FNR": FNR, "TNR": TNR, "FPR": FPR,
        "TP": TP, "FN": FN, "TN": TN, "FP": FP,
        "balanced_accuracy": bacc
    }

def discover_from_csvs(data_dir: str) -> list:
    """List the tags of every "*_test_only.csv" file found in `data_dir`.

    A tag is the file name with "_test_only.csv" removed. The result is
    de-duplicated and sorted.
    """
    pattern = os.path.join(data_dir, "*_test_only.csv")
    unique_tags = {
        os.path.basename(csv_file).replace("_test_only.csv", "")
        for csv_file in glob.glob(pattern)
    }
    return sorted(unique_tags)

def main():
    """Compute the metrics row for every discovered tag, save and print the table.

    The table is written to OUT_CSV without the index and then printed in
    full with all columns visible.
    """
    column_order = (
        "tag", "virus_pct", "non_virus_pct", "human_pct", "non_homo_pct",
        "n_virus", "n_human", "TP", "FN", "TN", "FP", "TPR", "TNR", "FPR", "FNR",
        "balanced_accuracy", "virus_report", "human_report", "csv",
    )
    records = []
    for tag in discover_from_csvs(DATA_DIR):
        records.append(compute_for_tag(tag, RESULTS_DIR, DATA_DIR))
    summary = pd.DataFrame(records)
    # Keep only the expected columns that are actually present, in display order.
    present = [name for name in column_order if name in summary.columns]
    summary = summary[present]
    summary.to_csv(OUT_CSV, index=False)
    print(f"[OK] Salvato: {OUT_CSV}")
    with pd.option_context("display.max_columns", None):
        print(summary.head(len(summary)))


if __name__ == "__main__":
    main()

[OK] Salvato: /blue/simone.marini/share/kraken2/Data/data_prediction/kraken_Human_vs_Virus_balanced_accuracy_summary.csv
                                               tag  virus_pct  non_virus_pct  \
0  split_Bacteria vs Human vs Virus_run10_seed2034      13.24          86.76   
1  split_Bacteria vs Human vs Virus_run11_seed2035      12.35          87.65   
2  split_Bacteria vs Human vs Virus_run12_seed2036      12.35          87.65   
3  split_Bacteria vs Human vs Virus_run13_seed2037      11.90          88.10   
4  split_Bacteria vs Human vs Virus_run14_seed2038      13.13          86.87   

   homo_pct  non_homo_pct  n_virus  n_human     TP     FN       TN    FP  \
0     99.95          0.05      891    24457  118.0  773.0  24445.0  12.0   
1     99.90          0.10      891    24457  110.0  781.0  24433.0  24.0   
2     99.91          0.09      891    24457  110.0  781.0  24435.0  22.0   
3     99.90          0.10      891    24457  106.0  785.0  24433.0  24.0   
4     99.89       

In [6]:
## BACTERIA VS HUMAN 

#!/usr/bin/env python3
import os, re, glob, pandas as pd
from typing import Optional, Tuple, Dict

# Directory that main() passes on as the place to search for "*.report" files.
RESULTS_DIR = "/blue/simone.marini/share/kraken2/Results"
# Directory scanned for "*_test_only.csv" files; the summary CSV is written here too.
DATA_DIR    = "/blue/simone.marini/share/kraken2/Data/data_prediction"
# Destination of the per-tag summary table written by main().
OUT_CSV     = os.path.join(DATA_DIR, "kraken_Bacteria_vs_Human_balanced_accuracy_summary.csv")

# ---------- parsing .report ----------
def split_report_line(line: str):
    """Split one report line into its first six columns.

    The line is split on tabs first. If that yields fewer than six fields it
    is re-split on runs of whitespace, with at most five splits so that the
    last column keeps any internal spaces.

    Args:
        line: One raw line of a report file, with or without a trailing newline.

    Returns:
        A list of six strings, or None when the line has fewer than six columns.
    """
    fields = line.rstrip("\n").split("\t")
    if len(fields) < 6:
        # Drop leading whitespace before the whitespace split: a line that
        # starts with blanks would otherwise produce an empty first field and
        # shift every column one position to the right.
        fields = re.split(r"\s+", line.rstrip("\n").lstrip(), maxsplit=5)
    if len(fields) < 6:
        return None
    return fields[:6]

def extract_pct_taxid(path: str, target_taxid: str) -> Optional[float]:
    """Return the percentage (first column) of the first row whose taxid matches.

    Args:
        path: Report file to scan; a falsy or non-existent path yields None.
        target_taxid: Taxid to look for, compared as a string against column five.

    Returns:
        The percentage as a float (a decimal comma is accepted), or None when
        the file is unavailable, no row matches, or the value cannot be parsed.
    """
    if not (path and os.path.exists(path)):
        return None
    with open(path, "r") as handle:
        parsed_rows = (split_report_line(text) for text in handle)
        # Only the first matching row is considered; unparsable lines are skipped.
        hit = next(
            (cols for cols in parsed_rows if cols and cols[4].strip() == target_taxid),
            None,
        )
    if hit is None:
        return None
    try:
        return float(hit[0].strip().replace(",", "."))
    except ValueError:
        return None

# ---------- csv counts ----------
def counts_from_csv(path: str) -> Tuple[int, int]:
    """Count the rows labelled "bacteria" and "human" in the CSV at `path`.

    Returns:
        (n_bacteria, n_human) taken from the "label" column; a label that
        never occurs counts as 0.
    """
    tally = pd.read_csv(path)["label"].value_counts()
    n_bacteria, n_human = (int(tally.get(name, 0)) for name in ("bacteria", "human"))
    return n_bacteria, n_human

# ---------- filename helpers ----------
def norm_key(s: str) -> str:
    """Return `s` lower-cased with all whitespace and underscores removed."""
    pieces = re.split(r"[\s_]+", s)
    return "".join(pieces).lower()

def find_corresponding_report(stem: str, kind: str, results_dir: str) -> Optional[str]:
    """Locate the report file for `stem` and `kind` inside `results_dir`.

    Two passes are made over the directory:
      1. a file whose name equals "<stem>_<kind>.report" once both are passed
         through `norm_key` (whitespace, underscores and case ignored);
      2. otherwise, a file ending in "_<kind>.report" whose normalised name
         starts with the normalised stem.

    Candidates are visited in sorted order so the same file is picked on every
    run when several match; `glob.glob` returns paths in arbitrary order.

    Returns:
        The path of the first matching file, or None when nothing matches.
    """
    # Exact match on <stem>_<kind>.report, ignoring spaces/underscores/case.
    want = norm_key(f"{stem}_{kind}.report")
    for p in sorted(glob.glob(os.path.join(results_dir, "*.report"))):
        if norm_key(os.path.basename(p)) == want:
            return p
    # Fallback: prefix match restricted to files of the requested kind.
    want_prefix = norm_key(stem)
    suffix = f"_{kind}.report"
    for p in sorted(glob.glob(os.path.join(results_dir, f"*{kind}.report"))):
        b = os.path.basename(p)
        if norm_key(b).startswith(want_prefix) and b.lower().endswith(suffix):
            return p
    return None

# ---------- tag discovery (FILTER 2-class ONLY) ----------
def is_bacteria_vs_human_tag(tag: str) -> bool:
    """Tell whether `tag` names a two-class Bacteria-vs-Human split.

    Accepts ONLY: split_Bacteria vs Human_run<digits>_seed<digits>
    (spaces and underscores are interchangeable; matching is case-insensitive).
    """
    # Collapse every run of spaces/underscores into a single space, then lower-case.
    words = re.split(r"[_\s]+", tag.strip())
    collapsed = " ".join(words).strip().lower()
    # The pattern is written against the space-normalised form.
    return re.match(r"^split bacteria vs human run\d+ seed\d+$", collapsed) is not None

def discover_from_csvs(data_dir: str) -> list:
    """List the Bacteria-vs-Human tags that have a "*_test_only.csv" in `data_dir`.

    A tag is the file name with "_test_only.csv" removed; only tags accepted by
    `is_bacteria_vs_human_tag` are kept. The result is de-duplicated and sorted.
    """
    pattern = os.path.join(data_dir, "*_test_only.csv")
    candidates = (
        os.path.basename(csv_file).replace("_test_only.csv", "")
        for csv_file in glob.glob(pattern)
    )
    return sorted({name for name in candidates if is_bacteria_vs_human_tag(name)})

# ---------- metrics ----------
def compute_for_tag(tag: str, results_dir: str, data_dir: str) -> Dict:
    """Build the bacteria-vs-human metrics row for one split tag.

    Bacteria is the positive class and human the negative class. The rates are
    the percentages read from the "bacteria" and "human" report files for
    `tag`; the class sizes come from "<tag>_test_only.csv"; the
    confusion-matrix counts are estimates obtained by scaling each class size
    by its rate.

    Args:
        tag: Split identifier (CSV file name without "_test_only.csv").
        results_dir: Directory searched for the report files.
        data_dir: Directory holding the "<tag>_test_only.csv" file.

    Returns:
        A dict with one entry per output column. Any value whose inputs are
        unavailable is None.
    """
    rep_human    = find_corresponding_report(tag, "human", results_dir)
    rep_bacteria = find_corresponding_report(tag, "bacteria", results_dir)
    csv_path     = os.path.join(data_dir, f"{tag}_test_only.csv")

    # Homo sapiens (9606), falling back to Homo (9605) only when the 9606
    # lookup returns None. The check is against None rather than truthiness:
    # 0.0 is a valid percentage and must not trigger the fallback.
    human_pct = extract_pct_taxid(rep_human, "9606")
    if human_pct is None:
        human_pct = extract_pct_taxid(rep_human, "9605")
    # Bacteria superkingdom (taxid = 2)
    bacteria_pct = extract_pct_taxid(rep_bacteria, "2")

    n_bacteria = n_human = None
    if os.path.exists(csv_path):
        n_bacteria, n_human = counts_from_csv(csv_path)

    # Rates as fractions in [0, 1]; bacteria are the positives.
    TPR = bacteria_pct/100.0 if bacteria_pct is not None else None
    TNR = human_pct/100.0    if human_pct    is not None else None
    FNR = None if TPR is None else (1 - TPR)
    FPR = None if TNR is None else (1 - TNR)

    # Estimated confusion matrix: class size scaled by the matching rate.
    TP = FN = TN = FP = None
    if (n_bacteria is not None) and (TPR is not None):
        TP = round(TPR * n_bacteria)
        FN = n_bacteria - TP
    if (n_human is not None) and (TNR is not None):
        TN = round(TNR * n_human)
        FP = n_human - TN

    bacc = None if (TPR is None or TNR is None) else (TPR + TNR) / 2.0

    return {
        "tag": tag,
        "bacteria_report": rep_bacteria,
        "human_report": rep_human,
        "csv": csv_path,
        "bacteria_pct": bacteria_pct,
        "non_bacteria_pct": None if bacteria_pct is None else (100 - bacteria_pct),
        "human_pct": human_pct,
        "non_homo_pct": None if human_pct is None else (100 - human_pct),
        "n_bacteria": n_bacteria, "n_human": n_human,
        "TPR": TPR, "FNR": FNR, "TNR": TNR, "FPR": FPR,
        "TP": TP, "FN": FN, "TN": TN, "FP": FP,
        "balanced_accuracy": bacc,
    }

def main():
    """Compute the metrics row for every discovered tag, save and print the table.

    The table is written to OUT_CSV without the index and then printed in
    full with all columns visible.
    """
    column_order = (
        "tag", "bacteria_pct", "non_bacteria_pct", "human_pct", "non_homo_pct",
        "n_bacteria", "n_human", "TP", "FN", "TN", "FP", "TPR", "TNR", "FPR", "FNR",
        "balanced_accuracy", "bacteria_report", "human_report", "csv",
    )
    records = []
    for tag in discover_from_csvs(DATA_DIR):
        records.append(compute_for_tag(tag, RESULTS_DIR, DATA_DIR))
    summary = pd.DataFrame(records)
    # Keep only the expected columns that are actually present, in display order.
    present = [name for name in column_order if name in summary.columns]
    summary = summary[present]
    summary.to_csv(OUT_CSV, index=False)
    print(f"[OK] Salvato: {OUT_CSV}")
    with pd.option_context("display.max_columns", None):
        print(summary.head(len(summary)))


if __name__ == "__main__":
    main()

[OK] Salvato: /blue/simone.marini/share/kraken2/Data/data_prediction/kraken_Bacteria_vs_Human_balanced_accuracy_summary.csv
                                       tag  bacteria_pct  non_bacteria_pct  \
0   split_Bacteria vs Human_run10_seed2034         97.31              2.69   
1   split_Bacteria vs Human_run11_seed2035         97.25              2.75   
2   split_Bacteria vs Human_run12_seed2036         97.27              2.73   
3   split_Bacteria vs Human_run13_seed2037         97.28              2.72   
4   split_Bacteria vs Human_run14_seed2038         97.27              2.73   
5   split_Bacteria vs Human_run15_seed2039         97.24              2.76   
6   split_Bacteria vs Human_run16_seed2040         97.25              2.75   
7   split_Bacteria vs Human_run17_seed2041         97.26              2.74   
8   split_Bacteria vs Human_run18_seed2042         97.29              2.71   
9   split_Bacteria vs Human_run19_seed2043         97.29              2.71   
10   split_Bacteri

In [2]:
## VIRUS VS BACTERIA

#!/usr/bin/env python3
import os, re, glob, pandas as pd
from typing import Optional, Tuple, Dict

# Directory that main() passes on as the place to search for "*.report" files.
RESULTS_DIR = "/blue/simone.marini/share/kraken2/Results"
# Directory scanned for "*_test_only.csv" files; the summary CSV is written here too.
DATA_DIR    = "/blue/simone.marini/share/kraken2/Data/data_prediction"
# Destination of the per-tag summary table written by main().
OUT_CSV     = os.path.join(DATA_DIR, "kraken_Bacteria_vs_Virus_balanced_accuracy_summary.csv")

# ---------- parsing .report ----------
def split_report_line(line: str):
    """Split one report line into its first six columns.

    The line is split on tabs first. If that yields fewer than six fields it
    is re-split on runs of whitespace, with at most five splits so that the
    last column keeps any internal spaces.

    Args:
        line: One raw line of a report file, with or without a trailing newline.

    Returns:
        A list of six strings, or None when the line has fewer than six columns.
    """
    fields = line.rstrip("\n").split("\t")
    if len(fields) < 6:
        # Drop leading whitespace before the whitespace split: a line that
        # starts with blanks would otherwise produce an empty first field and
        # shift every column one position to the right.
        fields = re.split(r"\s+", line.rstrip("\n").lstrip(), maxsplit=5)
    if len(fields) < 6:
        return None
    return fields[:6]

def extract_pct_taxid(path: str, target_taxid: str) -> Optional[float]:
    """Return the percentage (first column) of the first row whose taxid matches.

    Args:
        path: Report file to scan; a falsy or non-existent path yields None.
        target_taxid: Taxid to look for, compared as a string against column five.

    Returns:
        The percentage as a float (a decimal comma is accepted), or None when
        the file is unavailable, no row matches, or the value cannot be parsed.
    """
    if not (path and os.path.exists(path)):
        return None
    with open(path, "r") as handle:
        parsed_rows = (split_report_line(text) for text in handle)
        # Only the first matching row is considered; unparsable lines are skipped.
        hit = next(
            (cols for cols in parsed_rows if cols and cols[4].strip() == target_taxid),
            None,
        )
    if hit is None:
        return None
    try:
        return float(hit[0].strip().replace(",", "."))
    except ValueError:
        return None

# ---------- csv counts ----------
def counts_from_csv(path: str) -> Tuple[int, int]:
    """Return (n_bacteria, n_virus) from the 'label' column of a *_test_only.csv.

    A label that never occurs counts as 0.
    """
    tally = pd.read_csv(path)["label"].value_counts()
    n_bacteria, n_virus = (int(tally.get(name, 0)) for name in ("bacteria", "virus"))
    return n_bacteria, n_virus

# ---------- filename helpers ----------
def norm_key(s: str) -> str:
    """Return `s` lower-cased with all whitespace and underscores removed."""
    pieces = re.split(r"[\s_]+", s)
    return "".join(pieces).lower()

def find_corresponding_report(stem: str, kind: str, results_dir: str) -> Optional[str]:
    """Locate the report file for `stem` and `kind` inside `results_dir`.

    Two passes are made over the directory:
      1. a file whose name equals "<stem>_<kind>.report" once both are passed
         through `norm_key` (whitespace, underscores and case ignored);
      2. otherwise, a file ending in "_<kind>.report" whose normalised name
         starts with the normalised stem.

    Candidates are visited in sorted order so the same file is picked on every
    run when several match; `glob.glob` returns paths in arbitrary order.

    Returns:
        The path of the first matching file, or None when nothing matches.
    """
    # Exact match on <stem>_<kind>.report, ignoring spaces/underscores/case.
    want = norm_key(f"{stem}_{kind}.report")
    for p in sorted(glob.glob(os.path.join(results_dir, "*.report"))):
        if norm_key(os.path.basename(p)) == want:
            return p
    # Fallback: tolerant prefix match restricted to files of the requested kind.
    want_prefix = norm_key(stem)
    suffix = f"_{kind}.report"
    for p in sorted(glob.glob(os.path.join(results_dir, f"*{kind}.report"))):
        b = os.path.basename(p)
        if norm_key(b).startswith(want_prefix) and b.lower().endswith(suffix):
            return p
    return None

# ---------- tag discovery (ONLY Bacteria vs Virus) ----------
def normalize_tag_spaces(tag: str) -> str:
    """Trim `tag`, turn each run of spaces/underscores into one space, lower-case it."""
    trimmed = tag.strip()
    return " ".join(re.split(r"[_\s]+", trimmed)).lower()

def is_bacteria_vs_virus_tag(tag: str) -> bool:
    """Accept ONLY: split_Bacteria vs Virus_run<digits>_seed<digits> (spaces/underscores interchangeable)."""
    normalized = normalize_tag_spaces(tag)
    matched = re.match(r"^split bacteria vs virus run\d+ seed\d+$", normalized)
    return matched is not None

def discover_from_csvs(data_dir: str) -> list:
    """List the Bacteria-vs-Virus tags that have a "*_test_only.csv" in `data_dir`.

    A tag is the file name with "_test_only.csv" removed; only tags accepted by
    `is_bacteria_vs_virus_tag` are kept. The result is de-duplicated and sorted.
    """
    pattern = os.path.join(data_dir, "*_test_only.csv")
    candidates = (
        os.path.basename(csv_file).replace("_test_only.csv", "")
        for csv_file in glob.glob(pattern)
    )
    return sorted({name for name in candidates if is_bacteria_vs_virus_tag(name)})

# ---------- metrics ----------
def compute_for_tag(tag: str, results_dir: str, data_dir: str) -> Dict:
    """Build the bacteria-vs-virus metrics row for one split tag.

    Virus is the positive class and bacteria the negative class. The rates are
    the percentages read from the "virus" and "bacteria" report files for
    `tag`; the class sizes come from "<tag>_test_only.csv"; the
    confusion-matrix counts are estimates obtained by scaling each class size
    by its rate.

    Args:
        tag: Split identifier (CSV file name without "_test_only.csv").
        results_dir: Directory searched for the report files.
        data_dir: Directory holding the "<tag>_test_only.csv" file.

    Returns:
        A dict with one entry per output column. Any value whose inputs are
        unavailable is None.
    """
    rep_bacteria = find_corresponding_report(tag, "bacteria", results_dir)
    rep_virus    = find_corresponding_report(tag, "virus", results_dir)
    csv_path     = os.path.join(data_dir, f"{tag}_test_only.csv")

    # Target percentages from the reports.
    bacteria_pct = extract_pct_taxid(rep_bacteria, "2")      # Bacteria
    virus_pct    = extract_pct_taxid(rep_virus, "10239")     # Viruses

    # Actual class sizes from the CSV.
    n_bacteria = n_virus = None
    if os.path.exists(csv_path):
        n_bacteria, n_virus = counts_from_csv(csv_path)

    # TPR/TNR as fractions in [0, 1].
    TPR = virus_pct/100.0 if virus_pct is not None else None  # positives = virus
    TNR = bacteria_pct/100.0    if bacteria_pct    is not None else None  # negatives = bacteria

    FNR = None if TPR is None else (1 - TPR)
    FPR = None if TNR is None else (1 - TNR)

    # Estimated confusion matrix: class size scaled by the matching rate.
    TP = FN = TN = FP = None
    if (n_virus is not None) and (TPR is not None):
        TP = round(TPR * n_virus)
        FN = n_virus - TP
    # Guard on n_bacteria, the count this branch actually uses.
    if (n_bacteria is not None) and (TNR is not None):
        TN = round(TNR * n_bacteria)
        FP = n_bacteria - TN

    bacc = None if (TPR is None or TNR is None) else (TPR + TNR) / 2.0

    return {
        "tag": tag,
        "csv": csv_path,
        "bacteria_report": rep_bacteria,
        "virus_report": rep_virus,
        "bacteria_pct": bacteria_pct,
        "virus_pct": virus_pct,
        "non_bacteria_pct": None if bacteria_pct is None else (100 - bacteria_pct),
        "non_virus_pct": None if virus_pct is None else (100 - virus_pct),
        "n_bacteria": n_bacteria, "n_virus": n_virus,
        "TP": TP, "FN": FN, "TN": TN, "FP": FP,
        "TPR": TPR, "TNR": TNR, "FPR": FPR, "FNR": FNR,
        "balanced_accuracy": bacc,
    }

def main():
    """Compute the metrics row for every discovered tag, save and print the table.

    The table is written to OUT_CSV without the index and then printed in
    full with all columns visible.
    """
    column_order = (
        "tag", "bacteria_pct", "virus_pct",
        "non_bacteria_pct", "non_virus_pct",
        "n_bacteria", "n_virus", "TP", "FN", "TN", "FP",
        "TPR", "TNR", "FPR", "FNR", "balanced_accuracy",
        "bacteria_report", "virus_report", "csv",
    )
    records = []
    for tag in discover_from_csvs(DATA_DIR):
        records.append(compute_for_tag(tag, RESULTS_DIR, DATA_DIR))
    summary = pd.DataFrame(records)
    # Keep only the expected columns that are actually present, in display order.
    present = [name for name in column_order if name in summary.columns]
    summary = summary[present]
    summary.to_csv(OUT_CSV, index=False)
    print(f"[OK] Salvato: {OUT_CSV}")
    with pd.option_context("display.max_columns", None):
        print(summary.head(len(summary)))


if __name__ == "__main__":
    main()

[OK] Salvato: /blue/simone.marini/share/kraken2/Data/data_prediction/kraken_Bacteria_vs_Virus_balanced_accuracy_summary.csv
                                       tag  bacteria_pct  virus_pct  \
0   split_Bacteria vs Virus_run10_seed2034         97.31      13.47   
1   split_Bacteria vs Virus_run11_seed2035         97.25      12.91   
2   split_Bacteria vs Virus_run12_seed2036         97.27      12.46   
3   split_Bacteria vs Virus_run13_seed2037         97.28      12.35   
4   split_Bacteria vs Virus_run14_seed2038         97.27      10.33   
5   split_Bacteria vs Virus_run15_seed2039         97.24      12.91   
6   split_Bacteria vs Virus_run16_seed2040         97.25      13.36   
7   split_Bacteria vs Virus_run17_seed2041         97.26      12.68   
8   split_Bacteria vs Virus_run18_seed2042         97.29       9.54   
9   split_Bacteria vs Virus_run19_seed2043         97.29      11.00   
10   split_Bacteria vs Virus_run1_seed2025         97.26      12.46   
11  split_Bacteria vs Vi

In [1]:
#!/usr/bin/env python3
## bacteria vs virus vs human
import os, re, glob, pandas as pd
from typing import Optional, Tuple, Dict

# Directory that main() passes on as the place to search for "*.report" files.
RESULTS_DIR = "/blue/simone.marini/share/kraken2/Results"
# Directory scanned for "*_test_only.csv" files; the summary CSV is written here too.
DATA_DIR    = "/blue/simone.marini/share/kraken2/Data/data_prediction"
# Destination of the per-tag summary table written by main().
OUT_CSV     = os.path.join(DATA_DIR, "kraken_Bacteria_Human_Virus_multiclass_summary.csv")

# ---------- parsing .report ----------
def split_report_line(line: str):
    """Split one report line into its first six columns.

    The line is split on tabs first. If that yields fewer than six fields it
    is re-split on runs of whitespace, with at most five splits so that the
    last column keeps any internal spaces.

    Args:
        line: One raw line of a report file, with or without a trailing newline.

    Returns:
        A list of six strings, or None when the line has fewer than six columns.
    """
    fields = line.rstrip("\n").split("\t")
    if len(fields) < 6:
        # Drop leading whitespace before the whitespace split: a line that
        # starts with blanks would otherwise produce an empty first field and
        # shift every column one position to the right.
        fields = re.split(r"\s+", line.rstrip("\n").lstrip(), maxsplit=5)
    if len(fields) < 6:
        return None
    return fields[:6]

def extract_pct_taxid(path: str, target_taxid: str) -> Optional[float]:
    """Return the percentage (first column) of the first row whose taxid matches.

    Args:
        path: Report file to scan; a falsy or non-existent path yields None.
        target_taxid: Taxid to look for, compared as a string against column five.

    Returns:
        The percentage as a float (a decimal comma is accepted), or None when
        the file is unavailable, no row matches, or the value cannot be parsed.
    """
    if not (path and os.path.exists(path)):
        return None
    with open(path, "r") as handle:
        parsed_rows = (split_report_line(text) for text in handle)
        # Only the first matching row is considered; unparsable lines are skipped.
        hit = next(
            (cols for cols in parsed_rows if cols and cols[4].strip() == target_taxid),
            None,
        )
    if hit is None:
        return None
    try:
        return float(hit[0].strip().replace(",", "."))
    except ValueError:
        return None

# ---------- csv counts ----------
def counts_from_csv(path: str) -> Tuple[int, int, int]:
    """Count the rows labelled "bacteria", "human" and "virus" in the CSV at `path`.

    Returns:
        (n_bacteria, n_human, n_virus) taken from the "label" column; a label
        that never occurs counts as 0.
    """
    tally = pd.read_csv(path)["label"].value_counts()
    n_bacteria, n_human, n_virus = (
        int(tally.get(name, 0)) for name in ("bacteria", "human", "virus")
    )
    return n_bacteria, n_human, n_virus

# ---------- filename helpers ----------
def norm_key(s: str) -> str:
    """Return `s` lower-cased with all whitespace and underscores removed."""
    pieces = re.split(r"[\s_]+", s)
    return "".join(pieces).lower()

def find_corresponding_report(stem: str, kind: str, results_dir: str) -> Optional[str]:
    """Locate the report file for `stem` and `kind` inside `results_dir`.

    Two passes are made over the directory:
      1. a file whose name equals "<stem>_<kind>.report" once both are passed
         through `norm_key` (whitespace, underscores and case ignored);
      2. otherwise, a file ending in "_<kind>.report" whose normalised name
         starts with the normalised stem.

    Candidates are visited in sorted order so the same file is picked on every
    run when several match; `glob.glob` returns paths in arbitrary order.

    Returns:
        The path of the first matching file, or None when nothing matches.
    """
    want = norm_key(f"{stem}_{kind}.report")
    for p in sorted(glob.glob(os.path.join(results_dir, "*.report"))):
        if norm_key(os.path.basename(p)) == want:
            return p
    # Fallback: tolerant prefix match restricted to files of the requested kind.
    want_prefix = norm_key(stem)
    suffix = f"_{kind}.report"
    for p in sorted(glob.glob(os.path.join(results_dir, f"*{kind}.report"))):
        b = os.path.basename(p)
        if norm_key(b).startswith(want_prefix) and b.lower().endswith(suffix):
            return p
    return None

# ---------- tag discovery (ONLY Bacteria vs Human vs Virus) ----------
def normalize_tag_spaces(tag: str) -> str:
    """Trim `tag`, turn each run of spaces/underscores into one space, lower-case it."""
    trimmed = tag.strip()
    return " ".join(re.split(r"[_\s]+", trimmed)).lower()

def is_bhv_tag(tag: str) -> bool:
    """Accept ONLY: split_Bacteria vs Human vs Virus_run<digits>_seed<digits>."""
    normalized = normalize_tag_spaces(tag)
    matched = re.match(r"^split bacteria vs human vs virus run\d+ seed\d+$", normalized)
    return matched is not None

def discover_from_csvs(data_dir: str) -> list:
    """List the three-class tags that have a "*_test_only.csv" in `data_dir`.

    A tag is the file name with "_test_only.csv" removed; only tags accepted by
    `is_bhv_tag` are kept. The result is de-duplicated and sorted.
    """
    pattern = os.path.join(data_dir, "*_test_only.csv")
    candidates = (
        os.path.basename(csv_file).replace("_test_only.csv", "")
        for csv_file in glob.glob(pattern)
    )
    return sorted({name for name in candidates if is_bhv_tag(name)})

# ---------- metrics ----------
def compute_for_tag(tag: str, results_dir: str, data_dir: str) -> Dict:
    """Build the three-class (bacteria / human / virus) metrics row for one tag.

    Each class's recall is the percentage read from that class's report file
    for `tag`. Class sizes come from "<tag>_test_only.csv". TP/FN per class
    are estimates obtained by scaling the class size by its recall.

    Args:
        tag: Split identifier (CSV file name without "_test_only.csv").
        results_dir: Directory searched for the report files.
        data_dir: Directory holding the "<tag>_test_only.csv" file.

    Returns:
        A dict with one entry per output column. Any value whose inputs are
        unavailable is None.
    """
    rep_bacteria = find_corresponding_report(tag, "bacteria", results_dir)
    rep_human    = find_corresponding_report(tag, "human", results_dir)
    rep_virus    = find_corresponding_report(tag, "virus", results_dir)
    csv_path     = os.path.join(data_dir, f"{tag}_test_only.csv")

    # Per-class percentage from the reports (Kraken taxids).
    bacteria_pct = extract_pct_taxid(rep_bacteria, "2")        # Bacteria
    # H. sapiens (9606), falling back to Homo (9605) only when the 9606 lookup
    # returns None. The check is against None rather than truthiness: 0.0 is a
    # valid percentage and must not trigger the fallback.
    human_pct = extract_pct_taxid(rep_human, "9606")
    if human_pct is None:
        human_pct = extract_pct_taxid(rep_human, "9605")
    virus_pct    = extract_pct_taxid(rep_virus, "10239")       # Viruses

    # Per-class TPR (one-vs-rest).
    TPR_bact  = None if bacteria_pct is None else bacteria_pct / 100.0
    TPR_human = None if human_pct    is None else human_pct    / 100.0
    TPR_virus = None if virus_pct    is None else virus_pct    / 100.0

    # Actual class sizes.
    n_bact = n_human = n_virus = None
    if os.path.exists(csv_path):
        n_bact, n_human, n_virus = counts_from_csv(csv_path)
        N = n_bact + n_human + n_virus
    else:
        N = None

    # Estimated TP/FN per class.
    def est_tp_fn(n, tpr):
        """Return (TP, FN) for a class of size `n` and recall `tpr`, or (None, None)."""
        if n is None or tpr is None:
            return None, None
        tp = round(tpr * n)
        return tp, (n - tp)
    TP_bact,  FN_bact  = est_tp_fn(n_bact,  TPR_bact)
    TP_human, FN_human = est_tp_fn(n_human, TPR_human)
    TP_virus, FN_virus = est_tp_fn(n_virus, TPR_virus)

    # Macro balanced accuracy (mean of the recalls); needs all three classes.
    bacc_macro = None
    tprs = [t for t in (TPR_bact, TPR_human, TPR_virus) if t is not None]
    if len(tprs) == 3:
        bacc_macro = sum(tprs) / 3.0

    # Micro accuracy (estimated); skipped when N is None or 0 to avoid dividing by zero.
    micro_acc = None
    if N and all(v is not None for v in (TP_bact, TP_human, TP_virus)):
        micro_acc = (TP_bact + TP_human + TP_virus) / N

    return {
        "tag": tag,
        "csv": csv_path,
        "bacteria_report": rep_bacteria,
        "human_report": rep_human,
        "virus_report": rep_virus,
        "bacteria_pct": bacteria_pct,
        "human_pct": human_pct,
        "virus_pct": virus_pct,
        "TPR_bacteria": TPR_bact,
        "TPR_human": TPR_human,
        "TPR_virus": TPR_virus,
        "n_bacteria": n_bact, "n_human": n_human, "n_virus": n_virus,
        "TP_bacteria": TP_bact, "FN_bacteria": FN_bact,
        "TP_human": TP_human, "FN_human": FN_human,
        "TP_virus": TP_virus, "FN_virus": FN_virus,
        "balanced_accuracy_macro": bacc_macro,
        "accuracy_micro_est": micro_acc,
        # Handy if you want to inspect the complements:
        "non_bacteria_pct": None if bacteria_pct is None else (100 - bacteria_pct),
        "non_human_pct":   None if human_pct    is None else (100 - human_pct),
        "non_virus_pct":   None if virus_pct    is None else (100 - virus_pct),
    }

def main():
    """Compute the metrics row for every discovered tag, save and print the table.

    The table is written to OUT_CSV without the index and then printed in
    full with all columns visible.
    """
    column_order = (
        "tag",
        "bacteria_pct", "human_pct", "virus_pct",
        "TPR_bacteria", "TPR_human", "TPR_virus",
        "n_bacteria", "n_human", "n_virus",
        "TP_bacteria", "FN_bacteria", "TP_human", "FN_human", "TP_virus", "FN_virus",
        "balanced_accuracy_macro", "accuracy_micro_est",
        "non_bacteria_pct", "non_human_pct", "non_virus_pct",
        "bacteria_report", "human_report", "virus_report", "csv",
    )
    records = []
    for tag in discover_from_csvs(DATA_DIR):
        records.append(compute_for_tag(tag, RESULTS_DIR, DATA_DIR))
    summary = pd.DataFrame(records)
    # Keep only the expected columns that are actually present, in display order.
    present = [name for name in column_order if name in summary.columns]
    summary = summary[present]
    summary.to_csv(OUT_CSV, index=False)
    print(f"[OK] Salvato: {OUT_CSV}")
    with pd.option_context("display.max_columns", None):
        print(summary.head(len(summary)))


if __name__ == "__main__":
    main()

[OK] Salvato: /blue/simone.marini/share/kraken2/Data/data_prediction/kraken_Bacteria_Human_Virus_multiclass_summary.csv
                                                tag  bacteria_pct  human_pct  \
0   split_Bacteria vs Human vs Virus_run10_seed2034         97.31      99.95   
1   split_Bacteria vs Human vs Virus_run11_seed2035         97.25      99.90   
2   split_Bacteria vs Human vs Virus_run12_seed2036         97.27      99.91   
3   split_Bacteria vs Human vs Virus_run13_seed2037         97.28      99.90   
4   split_Bacteria vs Human vs Virus_run14_seed2038         97.27      99.89   
5   split_Bacteria vs Human vs Virus_run15_seed2039         97.24      99.94   
6   split_Bacteria vs Human vs Virus_run16_seed2040         97.25      99.91   
7   split_Bacteria vs Human vs Virus_run17_seed2041         97.26      99.93   
8   split_Bacteria vs Human vs Virus_run18_seed2042         97.29      99.94   
9   split_Bacteria vs Human vs Virus_run19_seed2043         97.29      99.90   
