In [None]:
"""Workbook to analyse encode predictions.
"""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, unused-argument, too-many-branches, pointless-statement, unreachable, unused-import

## SETUP

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from __future__ import annotations

import copy
import functools
import gc
import subprocess
from pathlib import Path
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display
from sklearn.metrics import confusion_matrix, f1_score

from epi_ml.core.confusion_matrix import ConfusionMatrixWriter
from epi_ml.utils.classification_merging_utils import merge_dataframes
from epi_ml.utils.notebooks.paper.paper_utilities import (
    ASSAY,
    ASSAY_ORDER,
    CELL_TYPE,
    LIFE_STAGE,
    SEX,
    IHECColorMap,
    MetadataHandler,
    SplitResultsHandler,
    display_perc,
    merge_life_stages,
)

# from plotly.subplots import make_subplots

In [None]:
# Category name used for the cancer status predictions (see the category
# order lists built from ASSAY, CELL_TYPE, SEX, LIFE_STAGE and CANCER below).
CANCER = "harmonized_sample_cancer_high"
# First seven entries of ASSAY_ORDER; used below as the "core7" assay set.
CORE_ASSAYS = ASSAY_ORDER[0:7]

In [None]:
base_dir = Path.home() / "Projects/epiclass/output/paper"
base_data_dir = base_dir / "data"
base_fig_dir = base_dir / "figures"
paper_dir = base_dir

if not base_fig_dir.exists():
    raise FileNotFoundError(f"Directory {base_fig_dir} does not exist.")

In [None]:
metadata_handler = MetadataHandler(paper_dir)
split_results_handler = SplitResultsHandler()

In [None]:
# The instance deliberately reuses the imported class name `IHECColorMap`.
# Only instantiate while the name still refers to the class: once it has been
# rebound to an instance, re-running this cell would otherwise try to call the
# instance and fail.
if isinstance(IHECColorMap, type):
    IHECColorMap = IHECColorMap(base_fig_dir)
# Colour mapping for assays, used by the plots further down.
assay_colors = IHECColorMap.assay_color_map

In [None]:
encode_metadata_dir = base_data_dir / "metadata" / "encode"

In [None]:
encode_predictions_dir = base_data_dir / "training_results" / "predictions" / "encode"

In [None]:
for path in [encode_metadata_dir, encode_predictions_dir]:
    if not path.exists():
        raise FileNotFoundError(f"Directory {path} does not exist.")

In [None]:
# Lower-cased names of the 16 cell types kept for the cell type analyses.
accepted_cts = [
    ct.lower()
    for ct in (
        "T cell",
        "neutrophil",
        "brain",
        "monocyte",
        "lymphocyte of B lineage",
        "myeloid cell",
        "venous blood",
        "macrophage",
        "mesoderm-derived structure",
        "endoderm-derived structure",
        "colon",
        "connective tissue cell",
        "hepatocyte",
        "mammary gland epithelial cell",
        "muscle organ",
        "extraembryonic cell",
    )
]

### Get complete metadata

The complete metadata file was created in `encode_metadata_creation.ipynb` from new web API downloads.

In [None]:
full_metadata_path = (
    encode_metadata_dir / "new_meta" / "encode_full_metadata_2025-02_no_revoked.csv"
)
complete_metadata_df = pd.read_csv(full_metadata_path, low_memory=False)

Value counts for the five classification categories (and related metadata columns).

In [None]:
# Show the value counts (missing values included) of every category column
# that exists in the complete metadata; report the absent ones.
for cat in (
    ASSAY,
    CELL_TYPE,
    "cell_type",
    "epiclass_sample_ontology",
    "donor_life_stage",
    "donor_sex",
    "cancer_status",
):
    if cat not in complete_metadata_df.columns:
        print(f"Column {cat} not found.")
        continue
    print(complete_metadata_df[cat].value_counts(dropna=False), "\n")

In [None]:
# Fallback: when the complete metadata has no "cancer_status" column, take it
# from the older ChIP metadata table and attach it with a left merge.
if "cancer_status" not in complete_metadata_df.columns:
    chip_path = (
        encode_metadata_dir / "old_meta" / "encode_metadata_2023-10-25_clean-v2.csv"
    )
    chip_metadata_df = pd.read_csv(chip_path, low_memory=False)

    cancer_status = chip_metadata_df[["md5sum", "cancer_status"]]
    # NOTE(review): a left merge adds rows if "md5sum" is duplicated in the old
    # metadata; presumably it is unique there (TODO confirm).
    complete_metadata_df = complete_metadata_df.merge(
        cancer_status, how="left", left_on="FILE_accession", right_on="md5sum"
    )
    # "md5sum" was only needed as the join key.
    complete_metadata_df.drop(columns="md5sum", inplace=True)
    del chip_metadata_df

    gc.collect()

### Merge all available predictions

In [None]:
# Load one prediction table per "[category]_1l_3000n" folder.
chip_pred_dfs = {}
nb_chip_files = 9619  # number of rows every loaded prediction file must have
for folder in encode_predictions_dir.glob("*1l_3000n"):
    if not folder.is_dir():
        continue
    cat = folder.name.split("_1l_3000n")[0]  # [category]_1l_3000n
    # The first match is used; an IndexError is raised when nothing matches.
    pred_file = list(folder.rglob("complete_no_valid_oversample*all_augmented.csv"))[0]
    encode_df = pd.read_csv(pred_file)
    chip_pred_dfs[cat] = encode_df
    print(cat, encode_df.shape)
    assert encode_df.shape[0] == nb_chip_files

# Exactly five category folders are expected.
assert len(chip_pred_dfs) == 5

In [None]:
# Load the RNA prediction table of each "[category]_1l_3000n" folder.
# When no augmented file exists yet, it is created by running the epi_ml
# augmentation script on the raw prediction file.
pred_dfs_rna = {}
nb_rna_files = 1790  # number of rows every loaded RNA prediction file must have
saved_marker = "Augmented prediction file saved to "
for folder in encode_predictions_dir.glob("*1l_3000n"):
    if not folder.is_dir():
        continue
    cat = folder.name.split("_1l_3000n")[0]  # [category]_1l_3000n
    try:
        pred_file = list(folder.rglob("complete_no_valid_oversample*rna*augmented*csv"))[
            0
        ]
        print("Using augmented file", pred_file)
    except IndexError as err:
        pred_file = list(folder.rglob("complete_no_valid_oversample*rna*csv"))[0]
        print("Augmenting file", pred_file)

        # Augment the prediction file with some additional columns
        # script.py <prediction_file> <metadata_file>
        # output_template = "Augmented prediction file saved to {new_path}"
        script_path = (
            Path.home()
            / "Projects/sources/epi_ml/src/python/epi_ml/utils/augment_predict_file.py"
        )
        if not script_path.exists():
            raise FileNotFoundError(f"Script {script_path} does not exist.") from err

        # The script receives the JSON version of the metadata, so that is the
        # file whose existence has to be checked (not the CSV).
        metadata_json_path = full_metadata_path.with_suffix(".json")
        if not metadata_json_path.exists():
            raise FileNotFoundError(
                f"Metadata file {metadata_json_path} does not exist."
            ) from err

        output = subprocess.run(
            [
                "python",
                str(script_path),
                "-v",
                str(pred_file),
                str(metadata_json_path),
            ],
            check=False,
            capture_output=True,
        )
        if output.returncode != 0:
            raise RuntimeError(
                f"Error running script: {output.stderr.decode('utf-8')}"
            ) from err

        # Extract the new path from the script output; fail loudly instead of
        # treating arbitrary stdout content as a file path.
        stdout = output.stdout.decode("utf-8")
        _, found_marker, after_marker = stdout.partition(saved_marker)
        new_path_lines = after_marker.strip().splitlines()
        if not found_marker or not new_path_lines:
            raise RuntimeError(
                f"Could not find the augmented file path in script output: {stdout}"
            ) from err
        pred_file = Path(new_path_lines[0].strip()).resolve()

    encode_df = pd.read_csv(pred_file)
    # Keep only the part of the identifier that precedes the first underscore.
    encode_df["md5sum"] = encode_df["md5sum"].str.split(pat="_", n=1, expand=True)[0]
    pred_dfs_rna[cat] = encode_df
    print(cat, encode_df.shape)
    assert encode_df.shape[0] == nb_rna_files

# Exactly five category folders are expected.
assert len(pred_dfs_rna) == 5

In [None]:
# Build and save the merged RNA prediction table, unless it already exists.
rna_results_file = encode_predictions_dir / "full_rna_merged_df.csv"
if not rna_results_file.exists():
    # Work on copies so that `pred_dfs_rna` stays untouched.
    rna_dict = copy.deepcopy(pred_dfs_rna)

    same_col_names = 8
    # Make all different columns have unique relevant names except for the pred vector
    for cat, df in rna_dict.items():
        df = df.drop(columns=["Same?"])
        old_names = df.columns[1 : same_col_names - 1]
        new_names = [f"{old_name} ({cat})" for old_name in old_names]
        df = df.rename(columns=dict(zip(old_names, new_names))).set_index("md5sum")

        # A true class column holding a single value is dropped.
        y_true_col = [col for col in df.columns if "True class" in col][0]
        if df[y_true_col].nunique() == 1:
            df = df.drop(columns=y_true_col)
        rna_dict[cat] = df

    df_order = [ASSAY, CELL_TYPE, SEX, LIFE_STAGE, CANCER]
    df_list = [rna_dict[cat] for cat in df_order]
    full_rna_merged_df = functools.reduce(merge_dataframes, df_list)
    full_rna_merged_df.to_csv(rna_results_file)

In [None]:
# Columns added to the ChIP assay predictions so that they line up with the RNA
# assay predictions (presumably extra output classes of that run; TODO confirm).
assay_diff_columns = ["mrna_seq", "rna_seq", "wgbs-pbat", "wgbs-standard"]

# Stack ChIP and RNA predictions, one combined table per category.
concat_pred_dfs = {}
for cat, chip_results in chip_pred_dfs.items():
    rna_results = pred_dfs_rna[cat]

    if cat == ASSAY:
        # Note: this modifies the dataframe stored in `chip_pred_dfs` in place.
        chip_results.loc[:, assay_diff_columns] = "NA"

    # Both tables must have the same columns in the same order.
    if not chip_results.columns.equals(rna_results.columns):
        raise ValueError(
            f"Columns are not the same for {cat}. Chip: {chip_results.columns}, RNA: {rna_results.columns}"
        )

    all_results = pd.concat([chip_results, rna_results])

    # Sanity check: all rows are kept and no column is added.
    assert all_results.shape == (
        chip_results.shape[0] + rna_results.shape[0],
        chip_results.shape[1],
    )

    concat_pred_dfs[cat] = all_results
    assert len(all_results) == chip_results.shape[0] + rna_results.shape[0]

In [None]:
same_col_names = 8
# Make all different columns have unique relevant names except for the pred vector
for cat, df in concat_pred_dfs.items():
    if df.index.name == "md5sum":
        # Already processed by a previous run of this cell: running the steps
        # again would fail on the missing "Same?" column and rename columns twice.
        continue
    df = df.drop(columns=["Same?"])
    old_names = df.columns[1 : same_col_names - 1]
    new_names = [f"{old_name} ({cat})" for old_name in old_names]
    # Non in-place operations: the stored table is only replaced once complete.
    df = df.rename(columns=dict(zip(old_names, new_names))).set_index("md5sum")
    concat_pred_dfs[cat] = df

In [None]:
df_order = [ASSAY, CELL_TYPE, SEX, LIFE_STAGE, CANCER]
df_list = [concat_pred_dfs[cat] for cat in df_order]
full_merged_df = functools.reduce(merge_dataframes, df_list)

In [None]:
# Attach the complete metadata to the merged predictions. Metadata columns that
# clash with a prediction column receive the "_delete" suffix.
preds_plus_metadata_df: pd.DataFrame = full_merged_df.merge(
    complete_metadata_df,
    left_index=True,
    right_on="FILE_accession",
    how="left",
    suffixes=("", "_delete"),
)

# Report the clashing metadata columns, then drop them in one call.
duplicated_cols = [
    col for col in preds_plus_metadata_df.columns if col.endswith("_delete")
]
for col in duplicated_cols:
    print(col)
preds_plus_metadata_df = preds_plus_metadata_df.drop(columns=duplicated_cols)

In [None]:
assert isinstance(preds_plus_metadata_df, pd.DataFrame)  # pylance being weird

meta_col_order = [
    col for col in complete_metadata_df.columns if col in preds_plus_metadata_df.columns
]
results_col_order = [
    col for col in full_merged_df.columns if col in preds_plus_metadata_df.columns
]

new_order = results_col_order + meta_col_order
preds_plus_metadata_df = preds_plus_metadata_df.loc[:, new_order]

In [None]:
# For each (category, metadata column) pair, copy the metadata column into both
# "True class (<category>)" and a column named after the category itself.
for pairs in [
    (ASSAY, ASSAY),
    (SEX, SEX),
    (LIFE_STAGE, LIFE_STAGE),
    (CANCER, "cancer_status"),
    (CELL_TYPE, "epiclass_sample_ontology"),
]:
    name1 = f"True class ({pairs[0]})"
    name2 = pairs[1]
    print(name1, name2)
    preds_plus_metadata_df[name1] = preds_plus_metadata_df[name2]
    # When both elements of the pair are equal this is a self-assignment.
    preds_plus_metadata_df[pairs[0]] = preds_plus_metadata_df[pairs[1]]

In [None]:
logdir = base_data_dir / "training_results" / "predictions" / "encode"
preds_plus_metadata_df.to_csv(
    logdir / "complete_encode_predictions_augmented_2025-02_metadata.csv", index=False
)

Remove datasets overlapping with EpiATLAS

In [None]:
preds_plus_metadata_df["in_epiatlas"].value_counts(dropna=False)

In [None]:
# Explicit assignment instead of `df[col].fillna(..., inplace=True)`: the chained
# in-place form is deprecated and does not update the dataframe under pandas
# copy-on-write.
preds_plus_metadata_df["in_epiatlas"] = preds_plus_metadata_df["in_epiatlas"].fillna(
    "unknown"
)

# Keep only the datasets explicitly flagged as not being part of EpiATLAS;
# the "unknown" ones are excluded as well.
all_preds_no_epiatlas = preds_plus_metadata_df[
    preds_plus_metadata_df["in_epiatlas"].astype(str) == "False"
]

In [None]:
print(all_preds_no_epiatlas.shape)
display(all_preds_no_epiatlas[ASSAY].value_counts(dropna=False))

## Cell type metrics

In [None]:
# Only keep the predictions for the 16 cell types
# NOTE(review): this list repeats the `accepted_cts` definition from the setup
# section of this notebook; keep both in sync or remove one of them.
accepted_cts = [
    "T cell",
    "neutrophil",
    "brain",
    "monocyte",
    "lymphocyte of B lineage",
    "myeloid cell",
    "venous blood",
    "macrophage",
    "mesoderm-derived structure",
    "endoderm-derived structure",
    "colon",
    "connective tissue cell",
    "hepatocyte",
    "mammary gland epithelial cell",
    "muscle organ",
    "extraembryonic cell",
]
# Labels are compared in lower case.
accepted_cts = [ct.lower() for ct in accepted_cts]

In [None]:
def compute_metrics(
    df: pd.DataFrame, cat_label: str | None = None, min_pred: float | None = None
) -> Tuple[float, float, int]:
    """Return accuracy, macro F1 and sample count of a prediction table.

    Args:
        df: Table with "True class", "Predicted class" and, when `min_pred`
            is used, "Max pred" columns.
        cat_label: When given, the three column names are expected with a
            " (cat_label)" suffix, e.g. "True class (category)".
        min_pred: Keep only rows whose max prediction score is at least this
            value. A falsy value (None or 0) disables the filter.

    Returns:
        (accuracy, macro F1, number of rows evaluated). The macro F1 is
        averaged over the labels present in the predictions only.

    Raises:
        KeyError: `min_pred` is set but the max prediction column is missing.
    """
    suffix = f" ({cat_label})" if cat_label else ""
    true_label = f"True class{suffix}"
    pred_label = f"Predicted class{suffix}"
    max_pred_label = f"Max pred{suffix}"

    sub_df = df.copy()
    if min_pred:
        try:
            confident_rows = sub_df[max_pred_label] >= min_pred
        except KeyError as err:
            raise KeyError(
                f"Column '{max_pred_label}' not found in DataFrame and min_pred is not None."
            ) from err
        sub_df = sub_df[confident_rows]

    y_true, y_pred = sub_df[true_label], sub_df[pred_label]

    acc = (y_true == y_pred).mean()
    f1: float = f1_score(  # type: ignore
        y_true,
        y_pred,
        labels=y_pred.unique(),
        average="macro",
    )
    n_samples = sub_df.shape[0]
    return acc, f1, n_samples

In [None]:
cell_type_df = preds_plus_metadata_df.copy(deep=True)
cell_type_df = cell_type_df[cell_type_df[CELL_TYPE].isin(accepted_cts)]
print(cell_type_df.shape)

In [None]:
cell_type_df[CELL_TYPE].value_counts(dropna=False)

In [None]:
# NOTE: `core_assays` (lower case) is every assay of ASSAY_ORDER plus "mrna_seq";
# it is not the same set as the CORE_ASSAYS constant (ASSAY_ORDER[0:7]).
core_assays = ASSAY_ORDER + ["mrna_seq"]
# Split the accepted cell type predictions on whether the assay is in that list.
cell_type_core_df = cell_type_df[cell_type_df[ASSAY].isin(core_assays)]
cell_type_noncore_df = cell_type_df[~cell_type_df[ASSAY].isin(core_assays)]

In [None]:
for df in [cell_type_core_df, cell_type_noncore_df]:
    print(df.shape)
    N = df.shape[0]
    display(df[CELL_TYPE].value_counts(dropna=False))
    display(df["assay"].value_counts(dropna=False))

In [None]:
for df, name in zip([cell_type_core_df, cell_type_noncore_df], ["core", "noncore"]):
    print(name)
    full_N = df.shape[0]
    for min_pred in [0, 0.6, 0.8, 0.9]:
        acc, f1, N = compute_metrics(df, CELL_TYPE, min_pred)
        print(
            f"Min pred: {min_pred}, N: {N} ({N/full_N:.2%}), Acc: {acc:.3f}, F1: {f1:.3f}"
        )
    print()

### Metrics for other cell type trainings

Includes cell type classifiers trained on a single assay (e.g. only h3k4me1 files).

In [None]:
pred_folder = (
    base_data_dir
    / f"training_results/dfreeze_v2/hg38_100kb_all_none/{CELL_TYPE}_1l_3000n/complete-no_valid-oversampling"
)
if not pred_folder.exists():
    raise FileNotFoundError(f"Directory {pred_folder} does not exist.")

In [None]:
# Load the single prediction CSV of every training folder under `pred_folder`.
# Folders with zero or several CSV files are reported and skipped.
other_ct_dfs = {}
for folder in pred_folder.glob("*"):
    if not folder.is_dir():
        print(f"Skipping {folder}")
        continue
    pred_file = list(folder.glob("predictions/*.csv"))

    if len(pred_file) > 1:
        print(f"More than one prediction file found in {folder}")
        continue

    if len(pred_file) == 0:
        print(f"No prediction file found in {folder}")
        continue

    pred_file = pred_file[0]

    pred_df = pd.read_csv(pred_file)
    # The key is the folder name without the common training prefix.
    name = folder.name.replace("complete_no_valid_oversample_", "")

    # Lower-case the labels so that they compare with `accepted_cts`.
    for col in ["True class", "Predicted class"]:
        pred_df[col] = pred_df[col].str.lower()

    other_ct_dfs[name] = pred_df

In [None]:
other_ct_dfs.keys()

In [None]:
def compute_cell_type_acc(
    metadata_df: pd.DataFrame,
    pred_dfs_dict: Dict[str, pd.DataFrame],
    min_pred: float = 0.6,
) -> None:
    """Print accuracy and macro F1 of the predictions for the accepted cell types.

    Each prediction table is inner-merged with the metadata rows whose
    CELL_TYPE value is in `accepted_cts` ("md5sum" against "FILE_accession").
    The metadata CELL_TYPE column is used as the true label.

    Args:
        metadata_df: Metadata with CELL_TYPE and "FILE_accession" columns.
        pred_dfs_dict: Prediction tables by name; each needs "md5sum",
            "Predicted class" and "Max pred" columns.
        min_pred: Metrics are also reported for rows with "Max pred"
            strictly greater than this value.

    Returns:
        None. Results are printed, one section per prediction table
        (in sorted name order).
    """
    meta_df = metadata_df[metadata_df[CELL_TYPE].isin(accepted_cts)].copy()

    for name, pred_df in sorted(pred_dfs_dict.items()):
        print(name)
        pred_w_ct = pred_df.merge(
            meta_df, left_on="md5sum", right_on="FILE_accession", how="inner"
        )
        N = pred_w_ct.shape[0]
        if N == 0:
            # Nothing to evaluate: avoid dividing by zero and calling the
            # metric on empty data.
            print("No predictions match the given metadata, skipping.\n")
            continue

        # Calculate results for all predictions
        true, pred = pred_w_ct[CELL_TYPE], pred_w_ct["Predicted class"]

        total_correct = (true == pred).sum()
        acc = total_correct / N
        # Macro F1 averaged over the predicted labels only.
        f1 = f1_score(true, pred, labels=pred.unique(), average="macro")

        print(f"Acc (pred>0.0): {total_correct}/{N} ({acc:.2%})")
        print(f"F1 (pred>0.0): {f1:.2f}")

        # Calculate results for predictions above the score threshold
        pred_w_ct_filtered = pred_w_ct[pred_w_ct["Max pred"] > min_pred]
        N_filtered = pred_w_ct_filtered.shape[0]
        diff = N - N_filtered

        if N_filtered == 0:
            print(f"Acc (pred>{min_pred:.1f}): 0/0 (no sample above threshold)")
        else:
            true = pred_w_ct_filtered[CELL_TYPE]
            pred = pred_w_ct_filtered["Predicted class"]

            total_correct_filtered = (true == pred).sum()
            perc_filtered = total_correct_filtered / N_filtered
            f1 = f1_score(true, pred, labels=pred.unique(), average="macro")

            print(
                f"Acc (pred>{min_pred:.1f}): {total_correct_filtered}/{N_filtered} ({perc_filtered:.2%})"
            )
            print(f"F1 (pred>{min_pred}): {f1:.2f}")
        print(f"Samples ignored at {min_pred:.1f}: {diff} ({diff/N:.2%})\n")

In [None]:
mask_core_assays = complete_metadata_df[ASSAY].isin(core_assays)
non_core_metadata_df = complete_metadata_df[~mask_core_assays]
core_metadata_df = complete_metadata_df[mask_core_assays]

compute_cell_type_acc(non_core_metadata_df, other_ct_dfs)
print("\n")
compute_cell_type_acc(core_metadata_df, other_ct_dfs)

#### Confusion matrices

In [None]:
conf_matrix_logdir = (
    base_fig_dir / "encode_predictions" / "confusion_matrices" / CELL_TYPE / "core"
)
if not conf_matrix_logdir.exists():
    conf_matrix_logdir.mkdir(parents=True)

In [None]:
meta_df = core_metadata_df[core_metadata_df[CELL_TYPE].isin(accepted_cts)].copy()

limited_pred_dfs_dict = {k: v for k, v in other_ct_dfs.items() if "-ct16" in k}

In [None]:
# Write confusion matrices for every "-ct16" classifier, with and without the
# RNA assays, at several prediction score thresholds.
for no_rna in [True, False]:
    # Output sub-folder and filename suffix only depend on the RNA switch.
    rna_subdir = "no_rna" if no_rna else "with_rna"
    name_suffix = "-no_rna" if no_rna else ""

    for task_name, df in limited_pred_dfs_dict.items():
        pred_w_ct = df.merge(
            meta_df, left_on="md5sum", right_on="FILE_accession", how="inner"
        )

        if no_rna:
            is_rna = pred_w_ct[ASSAY].str.contains("rna")
            pred_w_ct = pred_w_ct[~is_rna]

        for threshold in [0, 0.6, 0.8]:
            sub_df = pred_w_ct[pred_w_ct["Max pred"] >= threshold]

            true, pred = sub_df[CELL_TYPE], sub_df["Predicted class"]
            cm = confusion_matrix(true, pred, labels=accepted_cts)

            filename = f"{task_name}-core-confusion_matrix-{threshold*100}"
            final_filename = f"{filename}{name_suffix}"
            this_logdir = conf_matrix_logdir / rna_subdir
            this_logdir.mkdir(parents=True, exist_ok=True)

            writer = ConfusionMatrixWriter(labels=accepted_cts, confusion_matrix=cm)
            writer.to_all_formats(
                logdir=this_logdir,
                name=final_filename,
            )
            # Release the figures opened while writing.
            plt.close("all")

## ASSAY metrics


RNA-seq is not included in this section.

Download note
~~~bash
paper_dir="/home/local/USHERBROOKE/rabj2301/Projects/epiclass/output/paper/data/training_results/dfreeze_v2/hg38_100kb_all_none/assay_epiclass_1l_3000n"
cd $paper_dir
base_path="/lustre06/project/6007515/rabyj/epiclass-project/output/epiclass-logs/epiatlas-dfreeze-v2.1/hg38_100kb_all_none/assay_epiclass_1l_3000n"
rsync -avR --exclude "*/EpiLaP/" --exclude "*.png" --exclude "*confusion*" --exclude "*.md5" narval:${base_path}/./*c/complete_no_valid_oversample .

paper_dir="/home/local/USHERBROOKE/rabj2301/Projects/epiclass/output/paper/data/training_results/dfreeze_v2"
cd $paper_dir
base_path="/lustre06/project/6007515/rabyj/epiclass-project/output/epiclass-logs/epiatlas-dfreeze-v2.1"
rsync -avR --exclude "*/EpiLaP/" --exclude "*.png" --exclude "*confusion*" --exclude "*.md5" narval:${base_path}/./hg38_100kb_all_none_w_encode_noncore/assay_epiclass_1l_3000n/complete_no_valid_oversample-0 .

find -type f -name "*.list*.csv" -print0 | xargs -0 rename 's/\.list//g'
~~~

In [None]:
data_dir = base_data_dir / "training_results" / "dfreeze_v2"
assay7_folder = (
    data_dir / f"hg38_100kb_all_none/{ASSAY}_1l_3000n/7c/complete_no_valid_oversample"
)
assay11_folder = (
    data_dir / f"hg38_100kb_all_none/{ASSAY}_1l_3000n/11c/complete_no_valid_oversample"
)
assay13_folder = (
    data_dir
    / f"hg38_100kb_all_none_w_encode_noncore/{ASSAY}_1l_3000n/13c/complete_no_valid_oversample"
)

In [None]:
# Load the ENCODE predictions of each assay classifier (7c, 11c, 13c) and attach
# the complete metadata. Missing folders or ambiguous files are reported and skipped.
pred_dfs_dict = {}
assay_folders = {"7c": assay7_folder, "11c": assay11_folder, "13c": assay13_folder}
for name, folder in assay_folders.items():
    if not folder.exists():
        print(f"Folder {folder} does not exist.")
        continue

    pred_folder = folder / "predictions" / "encode"
    if not pred_folder.exists():
        print(f"Folder {pred_folder} does not exist.")
        continue

    # Exactly one prediction file is expected per folder.
    pred_file = list(pred_folder.glob("*.csv"))
    if len(pred_file) != 1:
        print(f"Found {len(pred_file)} files in {pred_folder}.")
        continue
    pred_file = pred_file[0]

    # "Same?" is dropped when present; its absence is not an error.
    pred_df = pd.read_csv(pred_file, sep=",").drop(columns=["Same?"], errors="ignore")

    # Add assay metadata
    pred_df = pred_df.merge(
        complete_metadata_df, left_on="md5sum", right_on="FILE_accession", how="left"
    )

    # The true label comes from the metadata.
    pred_df["True class"] = pred_df["assay_epiclass"]
    pred_dfs_dict[name] = pred_df

### Core7 preds

In [None]:
# NOTE(review): this chained assignment also rebinds `data_dir`, which an earlier
# cell set to the dfreeze_v2 training results folder. Confirm that is intended.
output_dir = data_dir = (
    base_data_dir
    / "training_results"
    / "predictions"
    / "encode"
    / "assay_epiclass_1l_3000n"
)
# For each assay classifier: print the accuracy on core7, non-EpiAtlas files
# and save the confident (score >= 0.6) mispredictions to CSV.
for name, df in pred_dfs_dict.items():
    print(name)
    print(df.shape)

    # Only consider files already labeled with core7 assays
    df = df[df[ASSAY].isin(CORE_ASSAYS)]

    # Only consider non-EpiAtlas samples
    df = df[df["in_epiatlas"].astype(str) == "False"]

    # Calculate results for all predictions
    correct_pred = df["Predicted class"] == df["True class"]
    total_correct = correct_pred.sum()
    total = df.shape[0]
    perc = total_correct / total
    print(f"Acc (pred>=0.0) {total_correct}/{total} ({perc:.2%})")

    # Per-assay accuracy, restricted to predictions with score >= min_pred
    for assay in CORE_ASSAYS:
        min_pred = 0.6
        df_assay = df[df[ASSAY] == assay]
        df_assay = df_assay[df_assay["Max pred"] >= min_pred]
        correct_pred = df_assay["Predicted class"] == df_assay["True class"]
        total_correct = correct_pred.sum()
        total = df_assay.shape[0]
        perc = total_correct / total
        print(
            f"Acc (pred>={min_pred:.1f}) {assay} = {total_correct}/{total} ({perc:.2%})"
        )

    # Calculate results for predictions with max_pred >= 0.6
    df_filtered = df[df["Max pred"] >= 0.6]
    correct_pred_filtered = df_filtered["Predicted class"] == df_filtered["True class"]
    total_correct_filtered = correct_pred_filtered.sum()
    total_filtered = df_filtered.shape[0]
    perc_filtered = total_correct_filtered / total_filtered
    print(
        f"Acc (pred>=0.6): {total_correct_filtered}/{total_filtered} ({perc_filtered:.2%})"
    )

    # Confident but wrong predictions
    df_filtered_wrong = df_filtered[~correct_pred_filtered]
    # groupby = (
    #     df_filtered_wrong.groupby(["True class", "Predicted class"])
    #     .size()
    #     .sort_values(ascending=False)
    # )
    # display("Mislabels:", groupby)

    df_filtered_wrong.to_csv(
        output_dir / f"encode_only_mislabels_minPred0.6_{name}.csv", index=False
    )

### non-core 7c preds

In [None]:
# Non-EpiATLAS datasets whose assay label is "ctcf" or "non-core".
non_core_preds = all_preds_no_epiatlas[
    all_preds_no_epiatlas[ASSAY].isin(["ctcf", "non-core"])
]
print(f"Non-core datasets: {non_core_preds.shape[0]}")

# Keep only the targets/assays that have more than 3 files.
groupby = non_core_preds.groupby(["assay"]).size()
groupby = groupby[groupby > 3]
selected_assays = groupby.index
# Report the number of retained assays (the file count is printed just below).
print(f"Non-core targets/assays with > 3 files: {len(selected_assays)}")

non_core_preds = non_core_preds[non_core_preds["assay"].isin(selected_assays)]
print(f"Non-core files respecting selected assays: {non_core_preds.shape[0]}")

# Share of the retained files predicted with a score of at least 0.6.
N_high_conf = (non_core_preds["Max pred (assay_epiclass)"] >= 0.6).sum()
N_total = non_core_preds.shape[0]
print(
    f"High confidence non-core predictions: {N_high_conf / N_total:.2%} ({N_high_conf}/{N_total})"
)

In [None]:
# 7c preds on non-core assays
name = "7c"
df = pred_dfs_dict[name].copy(deep=True)
df = df[~df["True class"].isin(ASSAY_ORDER)]
print(df.shape)
print(df["assay"].isna().sum())
display(df["assay"].value_counts(dropna=False))

In [None]:
# NOTE(review): as in the core7 cell, this also rebinds `data_dir`.
output_dir = data_dir = (
    base_data_dir
    / "training_results"
    / "predictions"
    / "encode"
    / "assay_epiclass_1l_3000n"
)
if not output_dir.exists():
    output_dir.mkdir(parents=True)

# 7c preds on non-core assays
name = "7c"
df = pred_dfs_dict[name].copy(deep=True)
# Keep the rows whose true class is not in ASSAY_ORDER and whose "assay" is known.
df = df[~df["True class"].isin(ASSAY_ORDER)]
df = df[~df["assay"].isna()]

# NOTE(review): `non_core_df` is not used in this cell.
non_core_df = all_preds_no_epiatlas[
    all_preds_no_epiatlas[ASSAY].isin(["ctcf", "non-core"])
]
# One CSV per threshold: count of (predicted class, assay) pairs, sorted by
# predicted class then by decreasing count.
for min_pred in [0, 0.6, 0.8]:
    df_filtered = df[df["Max pred"] >= min_pred]
    groupby = (
        df_filtered.groupby(["Predicted class", "assay"])
        .size()
        .reset_index(name="Count")
        .sort_values(by=["Predicted class", "Count"], ascending=[True, False])
        .set_index(["Predicted class", "assay"])["Count"]
    )
    groupby.to_csv(
        output_dir / f"encode_non-core_{name}_predictions_minPred{min_pred}.csv"
    )

In [None]:
# Load the manually curated category of each non-core assay.
encode_metadata_dir = base_data_dir / "metadata/encode"
non_core_categories_path = (
    encode_metadata_dir / "non-core_encode_assay_category_2024-08-29.csv"
)
if not non_core_categories_path.exists():
    raise FileNotFoundError(f"File {non_core_categories_path} does not exist.")

# NOTE(review): with explicit `names` and no `header` argument, the first line
# of the file is read as data; presumably the file has no header row (confirm).
non_core_categories_df = pd.read_csv(
    non_core_categories_path, sep=",", names=["assay", "assay_category", "note"]
)
print(non_core_categories_df.shape)

In [None]:
# Attach the curated category of each non-core assay to the predictions.
df_w_cats = df.merge(
    non_core_categories_df[["assay", "assay_category"]],
    left_on="assay",
    right_on="assay",
    how="left",
)
print(df_w_cats.shape)

# Assays without a curated category are labelled "not_looked". Only the category
# column is filled: filling the whole dataframe would also write this string into
# every other column that has missing values (scores, metadata).
df_w_cats["assay_category"] = df_w_cats["assay_category"].fillna("not_looked")
display(df_w_cats["assay_category"].value_counts(dropna=False))

In [None]:
def create_non_core_preds_df(df: pd.DataFrame, min_pred: float = 0.6):
    """Count the predicted classes of each assay, for confident predictions.

    Args:
        df: Table with "assay", "assay_category", "Max pred" and
            "Predicted class" columns.
        min_pred: Only rows with "Max pred" >= this value are counted.

    Returns:
        DataFrame with one row per assay, one integer column per predicted
        class (0 when the class was never predicted for that assay) and an
        "assay_category" column.
    """
    category_of_assay = dict(zip(df["assay"], df["assay_category"]))

    counts_per_assay = {}
    for assay, assay_rows in df.groupby("assay"):
        confident_rows = assay_rows[assay_rows["Max pred"] >= min_pred]
        # Predicted class -> number of files, most frequent class first.
        class_counts = (
            confident_rows.groupby(["Predicted class"])
            .size()
            .sort_values(ascending=False)
        )
        counts_per_assay[assay] = class_counts.to_dict()

    # Columns are assays at this point; transpose to get assay as row/index.
    result_df = pd.DataFrame(counts_per_assay).fillna(0).astype(int).T
    result_df["assay_category"] = result_df.index.map(category_of_assay)
    return result_df

In [None]:
for min_pred in [0, 0.6, 0.8]:
    predicted_classes_df = create_non_core_preds_df(df_w_cats, min_pred=min_pred)
    predicted_classes_df.to_csv(
        output_dir
        / f"encode_non-core_{name}_predictions_per_assay_minPred{min_pred:.2f}.csv"
    )

In [None]:
def create_structured_dataframe(df_w_cats):
    """Compute the assay category composition of each predicted class per threshold.

    For every predicted class and every minimum score (0 to 0.95 in steps of
    0.05, plus 0.99), rows with "Max pred" >= the score are kept and the share
    of each "assay_category" among them is computed.

    Args:
        df_w_cats: Table with "Predicted class", "Max pred" and
            "assay_category" columns.

    Returns:
        DataFrame indexed by ("Predicted class", "Min pred", "assay_category")
        with "Percentage" (rounded to 2 decimals), "Count" and
        "Total samples" columns. Thresholds that leave no row produce no entry.
    """
    thresholds = list(np.arange(0, 1, 0.05)) + [0.99]
    records = []

    for predicted_class, class_rows in df_w_cats.groupby("Predicted class"):
        for threshold in thresholds:
            kept_rows = class_rows[class_rows["Max pred"] >= threshold]
            category_counts = kept_rows["assay_category"].value_counts(dropna=False)
            n_kept = category_counts.sum()
            category_percents = (category_counts / n_kept * 100).round(2)

            # One record per assay category still present at this threshold.
            records.extend(
                {
                    "Predicted class": predicted_class,
                    "Min pred": threshold,
                    "assay_category": category,
                    "Percentage": percent,
                    "Count": category_counts[category],
                    "Total samples": n_kept,
                }
                for category, percent in category_percents.items()
            )

    index_cols = ["Predicted class", "Min pred", "assay_category"]
    return pd.DataFrame(records).set_index(index_cols)

In [None]:
assay_category_df = create_structured_dataframe(df_w_cats)
assay_category_df.to_csv(output_dir / "encode_non-core_7c_predictions_assay_category.csv")

In [None]:
section_fig_dir = base_fig_dir / "encode_predictions" / "assay_epiclass" / "non-core"
if not section_fig_dir.exists():
    raise FileNotFoundError(f"Directory {section_fig_dir} does not exist.")

#### X = assay_epiclass, stack = assay_category

In [None]:
# Stacked bars: one bar per predicted class, stacked by assay category.
fig_dir = section_fig_dir / "stacked_bar_X_assay_epiclass"
fig_dir.mkdir(parents=False, exist_ok=True)

bar_df = assay_category_df.reset_index()

predicted_class_order = [
    "h3k27ac",
    "h3k4me3",
    "h3k4me1",
    "h3k9me3",
    "h3k27me3",
    "h3k36me3",
    "input",
]
# One colour per assay category, assigned in sorted category order.
# NOTE(review): raises IndexError if there are more categories than colours in
# the palette.
assay_category_color_map = {
    cat: px.colors.qualitative.Safe[i]
    for i, cat in enumerate(sorted(bar_df["assay_category"].unique()))
}

for min_pred in [0, 0.6, 0.8, 0.9]:
    # "Min pred" holds floats generated with np.arange, so the threshold is
    # matched with a +/- 0.01 tolerance instead of an exact comparison.
    sub_df = bar_df[
        (bar_df["Min pred"] > min_pred - 0.01) & (bar_df["Min pred"] < min_pred + 0.01)
    ]
    fig = px.bar(
        sub_df,
        x="Predicted class",
        y="Percentage",
        color="assay_category",
        title=f"Assay Category Composition for Each Predicted Class at predScore >= {min_pred:.2f}",
        labels={"Percentage": "Percentage (%)", "Predicted class": "Predicted Class"},
        barmode="stack",
        category_orders={"Predicted class": predicted_class_order},
        color_discrete_map=assay_category_color_map,
    )

    # Saving is currently disabled; `figname` is only used by the lines below.
    figname = f"histogram_encode_non-core_assay_epiclass_minPred{min_pred:.2f}"
    # fig.write_html(fig_dir / f"{figname}.html")
    # fig.write_image(fig_dir / f"{figname}.png")
    # fig.write_image(fig_dir / f"{figname}.svg")
    fig.show()

#### X = assay_category, stack = assay_epiclass

In [None]:
df = df_w_cats[df_w_cats["assay_category"] != "not_looked"]

In [None]:
assay_epiclass_order = [
    "h3k27ac",
    "h3k4me3",
    "h3k4me1",
    "h3k9me3",
    "h3k27me3",
    "h3k36me3",
    "input",
]
assay_epiclass_order = {assay: i for i, assay in enumerate(assay_epiclass_order)}

In [None]:
# Stacked bars: one bar per assay category, stacked by predicted class.
fig_dir = section_fig_dir / "stacked_bar_X_assay_category"
fig_dir.mkdir(parents=False, exist_ok=True)

assay_categories_order = [
    "trx_reg",
    "heterochrom",
    "polycomb",
    "splicing",
    "insulator",
    "other/mixed",
]

for min_pred in [0, 0.6, 0.8]:
    sub_df = df[df["Max pred"] >= min_pred]
    # Number of files per (assay category, predicted class) pair.
    groupby = (
        sub_df.groupby(["assay_category", "Predicted class"])
        .size()
        .reset_index(name="Count")
        .sort_values(by=["assay_category", "Count"], ascending=[True, False])
    )
    # Share of each predicted class within its assay category.
    groupby["Percentage"] = groupby.groupby("assay_category")["Count"].transform(
        lambda x: (x / x.sum()) * 100
    )

    # Add order for plotting
    groupby["assay_order"] = groupby["Predicted class"].map(assay_epiclass_order)
    groupby = groupby.sort_values(
        by=["assay_category", "assay_order"], ascending=[False, True]
    )

    # Main plot
    fig = px.bar(
        groupby,
        x="assay_category",
        y="Percentage",
        color="Predicted class",
        barmode="stack",
        category_orders={"assay_category": assay_categories_order},
        color_discrete_map=assay_colors,
        title=f"core7 predictions for non-core assays, predScore >= {min_pred:.2f}",
        labels={"Percentage": "Fraction (%)", "assay_category": "Assay Category"},
    )

    # Modify x-axis labels
    total_counts = groupby.groupby("assay_category")["Count"].sum()

    # A category may have no file left at a given threshold: use 0 for it
    # instead of failing with a KeyError.
    ticktext = [
        f"{assay_category} (N={total_counts.get(assay_category, 0)})"
        for assay_category in assay_categories_order
    ]
    fig.update_xaxes(tickvals=assay_categories_order, ticktext=ticktext)

    # Save and display (saving is currently disabled)
    figname = f"histogram_encode_non-core_assay_epiclass_minPred{min_pred:.2f}"
    # fig.write_html(fig_dir / f"{figname}.html")
    # fig.write_image(fig_dir / f"{figname}.png")
    # fig.write_image(fig_dir / f"{figname}.svg")
    fig.show()

#### Assay category evolution with min_predScore

In [None]:
def create_assay_category_graphs(df, output_dir: Path | None = None):
    """Graph assay category distribution for each predicted class.

    One figure is produced per predicted class. Each figure has one line per
    assay category ("Percentage" as a function of "Min pred") plus a dashed
    black "Samples Conserved" line: the "Total samples" at each "Min pred",
    relative to the "Total samples" at "Min pred" == 0, in percent.

    Args:
    - df: Dataframe with a MultiIndex containing the levels "Predicted class",
        "assay_category" and "Min pred", and the columns "Percentage" and
        "Total samples". Rows are selected with `df.loc[predicted_class]`, so
        "Predicted class" is presumably the outermost index level
        (NOTE(review): confirm against the code building this dataframe).
    - output_dir: If given, each figure is written there as png, svg and html.
    """
    predicted_classes = df.index.get_level_values("Predicted class").unique()
    all_categories = sorted(df.index.get_level_values("assay_category").unique())

    # One fixed color per assay category, shared by all figures. The palette is
    # cycled so that having more categories than colors does not raise.
    palette = px.colors.qualitative.Safe
    graph_colors = {
        cat: palette[i % len(palette)] for i, cat in enumerate(all_categories)
    }

    # Create a figure for each predicted class
    for predicted_class in predicted_classes:
        df_class = df.loc[predicted_class]

        # Assay categories actually present for this predicted class
        class_categories = df_class.index.get_level_values("assay_category").unique()

        # Reference sample count (no threshold), used to normalize the dashed line
        total_samples_at_zero = df_class.xs(0, level="Min pred")["Total samples"].iloc[0]

        fig = go.Figure()

        for assay_category in class_categories:
            df_assay = df_class.xs(assay_category, level="assay_category")

            fig.add_trace(
                go.Scatter(
                    x=df_assay.index,
                    y=df_assay["Percentage"],
                    mode="lines+markers",
                    name=assay_category,
                    marker=dict(color=graph_colors[assay_category]),
                )
            )

        # Percentage of samples still present at each threshold
        conserved_percentages = (
            df_class.groupby("Min pred")["Total samples"].first()
            / total_samples_at_zero
            * 100
        )
        fig.add_trace(
            go.Scatter(
                x=conserved_percentages.index,
                y=conserved_percentages.values,
                mode="lines+markers",
                name="Samples Conserved",
                line=dict(dash="dash", color="black"),
            )
        )

        # Update layout
        fig.update_layout(
            title=f"Composition for Predicted Class: {predicted_class}",
            xaxis_title="Min pred",
            yaxis_title="Percentage Composition",
            legend_title="Assay Category",
            hovermode="x unified",
        )

        # Small x margin so markers at 0 and 1 are not clipped
        fig.update_xaxes(range=[-0.01, 1.01])
        fig.update_yaxes(range=[0, 100])

        # Save
        if output_dir:
            filename = f"encode_non-core_7c_predictions_assay_category_{predicted_class}"
            fig.write_image(output_dir / f"{filename}.png")
            fig.write_image(output_dir / f"{filename}.svg")
            fig.write_html(output_dir / f"{filename}.html")
        fig.show()

In [None]:
# Line graphs of the assay-category composition as a function of min_pred,
# one figure per predicted class. `assay_category_df` must already be defined
# by an earlier cell, and the parent folder of `fig_dir` must already exist.
fig_dir = base_fig_dir.joinpath(
    "encode_predictions",
    "assay_epiclass",
    "non-core",
    "line_graphs_over_min_pred",
)
fig_dir.mkdir(parents=False, exist_ok=True)
create_assay_category_graphs(df=assay_category_df, output_dir=fig_dir)

## OTHER - Sex, life stage, cancer

Pool all predictions to compute accuracy/F1 for each of the 5 classifiers, on core and non-core data separately. (The assay task is messier: it cannot be evaluated directly on non-core assays.)

In [None]:
# Coarser life stage classification: embryo, fetal and newborn are grouped
# under "prenatal"; adult and child are kept as is.
# Values missing from the mapping become NaN through `Series.map`.
merge_life_stage = dict(
    adult="adult",
    embryo="prenatal",
    fetal="prenatal",
    newborn="prenatal",
    child="child",
)

# Create the "<label>_merged" version of the metadata, true and predicted columns
for label in (
    LIFE_STAGE,
    f"True class ({LIFE_STAGE})",
    f"Predicted class ({LIFE_STAGE})",
):
    new_label = label.replace(LIFE_STAGE, f"{LIFE_STAGE}_merged")
    merged_values = all_preds_no_epiatlas[label].map(merge_life_stage)
    all_preds_no_epiatlas[new_label] = merged_values

# The merged task reuses the prediction score of the original life stage task
all_preds_no_epiatlas[f"Max pred ({LIFE_STAGE}_merged)"] = all_preds_no_epiatlas[
    f"Max pred ({LIFE_STAGE})"
]

### Accuracies per assay

#### Reformat data for easy plotting

In [None]:
def compute_all_acc_per_assay(all_preds_no_epiatlas: pd.DataFrame) -> Dict[str, Dict]:
    """Compute accuracy for each assay.
    Checks core9 assays (core7, *rna_seq, wgbs-*) + CTCF + non-core

    "mrna_seq" is merged into "rna_seq" beforehand. For each task, samples
    with a missing or "unknown" true/predicted class are ignored, and the
    cell type task is restricted to `accepted_cts`.

    Besides per-assay entries, three pooled entries are computed:
    "avg-core" (assays in ASSAY_ORDER), "avg-non-core" (all other assays) and
    "avg-all" (every sample). For the assay task, only "avg-core" is filled
    ("avg-all" and "avg-non-core" stay empty lists), and the "ctcf" and
    "non-core" entries are omitted.

    Args:
    - all_preds_no_epiatlas: The dataframe containing the predictions.

    Returns:
    - A dictionary with the accuracy for each assay.
        Format: {task_name:{assay: [(min_pred, acc, f1, nb_samples), ...], ...}, ...}
        Assays absent from the data map to an empty list.

    Raises:
    - ValueError: If EpiAtlas samples are present, or an expected column is missing.
    """
    if not (all_preds_no_epiatlas["in_epiatlas"].astype(str) == "False").all():
        raise ValueError("all_preds_no_epiatlas should not contain EpiAtlas samples.")

    df = all_preds_no_epiatlas.copy()
    core_assays = ASSAY_ORDER
    all_assays = ASSAY_ORDER + ["ctcf", "non-core"]
    # Kept as strings: they are stored as-is in the output tuples
    min_preds = ["0.0", "0.6", "0.8", "0.9"]

    # merging rna_seq and mrna_seq
    for col in [ASSAY, f"True class ({ASSAY})", f"Predicted class ({ASSAY})"]:
        try:
            df[col] = df[col].str.replace("mrna_seq", "rna_seq")
        except KeyError as err:
            raise ValueError(f"Column '{col}' not found.") from err

    all_acc_per_assay = {}
    for name in [ASSAY, CELL_TYPE, SEX, LIFE_STAGE, f"{LIFE_STAGE}_merged", CANCER]:
        y_true_col = f"True class ({name})"
        y_pred_col = f"Predicted class ({name})"
        max_pred_label = f"Max pred ({name})"

        if max_pred_label not in df.columns:
            raise ValueError(f"Column '{max_pred_label}' not found.")

        # remove unknown samples
        # Boolean filtering (instead of a chained `fillna(..., inplace=True)`,
        # which is not guaranteed to modify the frame) so that NaN labels are
        # always dropped along with "unknown" ones.
        task_df = df
        for label in [y_true_col, y_pred_col]:
            known = task_df[label].notna() & (task_df[label] != "unknown")
            task_df = task_df[known]

        if name == CELL_TYPE:
            task_df = task_df[task_df[CELL_TYPE].isin(accepted_cts)]

        # Per-assay metrics
        present_assays = set(task_df[ASSAY].unique())
        acc_per_assay: Dict[str, List[Tuple[str, float, float, int]]] = {}
        for label in all_assays:
            if label in ["non-core", "ctcf"] and name == ASSAY:
                continue
            acc_per_assay[label] = []
            if label not in present_assays:
                continue
            assay_df = task_df[task_df[ASSAY] == label]
            for min_pred in min_preds:
                acc, f1, N = compute_metrics(
                    assay_df, cat_label=name, min_pred=float(min_pred)
                )
                acc_per_assay[label].append((min_pred, acc, f1, N))

        # Avg accuracy
        for set_label in ["avg-all", "avg-core", "avg-non-core"]:
            acc_per_assay[set_label] = []

        # The subsets do not depend on the threshold: compute them once
        is_core = task_df[ASSAY].isin(core_assays)
        core_df = task_df[is_core]
        non_core_df = task_df[~is_core]

        for min_pred in min_preds:
            threshold = float(min_pred)

            acc, f1, N = compute_metrics(core_df, cat_label=name, min_pred=threshold)
            acc_per_assay["avg-core"].append((min_pred, acc, f1, N))

            # The assay task cannot be evaluated on non-core assays
            if name == ASSAY:
                continue

            acc, f1, N = compute_metrics(
                non_core_df, cat_label=name, min_pred=threshold
            )
            acc_per_assay["avg-non-core"].append((min_pred, acc, f1, N))

            # The result must be captured here; otherwise "avg-all" would
            # silently repeat the non-core metrics.
            acc, f1, N = compute_metrics(task_df, cat_label=name, min_pred=threshold)
            acc_per_assay["avg-all"].append((min_pred, acc, f1, N))

        all_acc_per_assay[name] = acc_per_assay

    return all_acc_per_assay

In [None]:
def compute_and_save_acc_per_assay(
    preds_no_epiatlas: pd.DataFrame, filename: str
) -> pd.DataFrame:
    """
    Take a dataframe containing predictions for multiple tasks, and compute accuracy for each assay.
    Saves the results to a tsv file, to two hardcoded folders:
    `base_fig_dir / "encode_predictions"` and
    `base_data_dir / "training_results" / "predictions" / "encode"`.

    Args:
    - preds_no_epiatlas: Predictions, forwarded to `compute_all_acc_per_assay`.
    - filename: Name of the tsv file written in both folders.

    Returns:
    - A dataframe with accuracy, f1 , N for each assay
        (columns: task_name, ASSAY, min_predScore, acc, f1-score, nb_samples).
        The f1-score of the rows of the assay task is the string "NA".
    """
    all_acc_per_assay = compute_all_acc_per_assay(preds_no_epiatlas)

    # acc per assay to table: one row per (task, assay, min_pred)
    rows = [
        [name, assay, min_pred, acc, f1, nb_samples]
        for name, acc_per_assay in all_acc_per_assay.items()
        for assay, values in acc_per_assay.items()
        for min_pred, acc, f1, nb_samples in values
    ]
    df_acc_per_assay = pd.DataFrame(
        rows,
        columns=["task_name", ASSAY, "min_predScore", "acc", "f1-score", "nb_samples"],
    )

    # The assay column is created under the name ASSAY, so it is cast under
    # that same name rather than a hard-coded string.
    df_acc_per_assay = df_acc_per_assay.astype(
        {
            "task_name": "str",
            ASSAY: "str",
            "min_predScore": "float",
            "acc": "float",
            "f1-score": "float",
            "nb_samples": "int",
        }
    )

    # f1-score on ASSAY task, per assay, doesn't make sense
    is_assay_task = df_acc_per_assay["task_name"] == ASSAY
    if is_assay_task.any():
        # Explicit cast: writing a string into a float column relies on an
        # implicit upcast that pandas has deprecated.
        df_acc_per_assay["f1-score"] = df_acc_per_assay["f1-score"].astype(object)
        df_acc_per_assay.loc[is_assay_task, "f1-score"] = "NA"

    print(f"Saving {df_acc_per_assay.shape[0]} rows")
    output_paths = (
        base_fig_dir / "encode_predictions" / filename,
        base_data_dir / "training_results" / "predictions" / "encode" / filename,
    )
    for output_path in output_paths:
        df_acc_per_assay.to_csv(output_path, sep="\t", index=False)
        print(f"Saved to {output_path}")

    return df_acc_per_assay

In [None]:
# Metrics per assay on all non-EpiAtlas predictions; the table is also written
# to disk by `compute_and_save_acc_per_assay` and reused for the plots below.
filename = "5tasks_acc_per_assay_NO_EpiAtlas.tsv"
df_acc_per_assay = compute_and_save_acc_per_assay(all_preds_no_epiatlas, filename)

In [None]:
# Restrict the predictions to samples whose cell type is in `accepted_cts`
# (the 16 cell types).
is_accepted_ct = all_preds_no_epiatlas[CELL_TYPE].isin(accepted_cts)
preds_no_epiatlas_16ct = all_preds_no_epiatlas[is_accepted_ct]

In [None]:
# Same metrics, restricted to the accepted cell types. Only the saved file is
# of interest here, so the returned dataframe is discarded.
filename = "5tasks_acc_per_assay_NO_EpiAtlas_16ct.tsv"
_ = compute_and_save_acc_per_assay(preds_no_epiatlas_16ct, filename)

#### Separate min_pred graphing

In [None]:
def plot_encode_metrics_per_assay(
    df_acc_per_assay: pd.DataFrame, min_pred: float = 0, logdir: Path | None = None
) -> None:
    """Plot accuracy+F1 of each classification task, per assay

    Two figures are made (accuracy, then F1-score). In each one, every task
    gets a gray box plot overlaid with one point per assay, colored with
    `assay_colors`. Tasks are ordered by decreasing "avg-core" accuracy, and
    the assay task is left out of the F1-score figure.

    Args:
    - df_acc_per_assay: Table with the columns task_name, ASSAY,
        min_predScore, acc and f1-score.
    - min_pred: Threshold to plot; rows are matched on min_predScore within
        +/- 0.01.
    - logdir: If given, figures are written there as png, svg and html.
    """
    metrics_df = copy.deepcopy(df_acc_per_assay)

    # Rows of the requested threshold (float-tolerant match)
    scores = metrics_df["min_predScore"]
    lower_bound = min_pred - 0.01
    upper_bound = min_pred + 0.01
    to_plot = metrics_df[(scores > lower_bound) & (scores < upper_bound)]

    # Task order: decreasing accuracy of the pooled core assays
    core_avg = to_plot[to_plot[ASSAY] == "avg-core"]
    ranked_tasks = sorted(
        zip(core_avg["acc"], core_avg["task_name"]),
        key=lambda pair: pair[0],
        reverse=True,
    )
    task_order = [task for _, task in ranked_tasks]

    # Only individual core assays (and rna_seq) are drawn
    plotted_assays = CORE_ASSAYS + ["rna_seq"]
    to_plot = to_plot[to_plot[ASSAY].isin(plotted_assays)]

    # Task identifier -> label displayed on the x-axis
    names = {
        ASSAY: "assay",
        LIFE_STAGE: "life stage",
        f"{LIFE_STAGE}_merged": "life stage (merged)",
        CELL_TYPE: "cell type",
        SEX: "sex",
        CANCER: "cancer",
    }

    for metric in ["acc", "f1-score"]:
        fig = go.Figure()
        for task_id in task_order:
            task_df = to_plot[to_plot["task_name"] == task_id]
            task_label = names[task_id]

            # No F1-score for the assay task
            if metric == "f1-score" and task_label == "assay":
                continue

            x_positions = [task_label] * len(task_df)
            metric_values = task_df[metric]
            task_assays = task_df[ASSAY]

            # Distribution over assays
            fig.add_trace(
                go.Box(
                    x=x_positions,
                    y=metric_values,
                    name=metric,
                    boxpoints="outliers",
                    boxmean=True,
                    marker_color="gray",
                    showlegend=False,
                    hoverinfo="skip",
                )
            )

            # One colored point per assay
            point_colors = [assay_colors[assay] for assay in task_assays]
            point_hovertext = [
                f"{assay}: {value:.3f}"
                for assay, value in zip(task_assays, metric_values)
            ]
            fig.add_trace(
                go.Scatter(
                    x=x_positions,
                    y=metric_values,
                    mode="markers",
                    name=task_label,
                    marker_color=point_colors,
                    hoverinfo="text",
                    hovertext=point_hovertext,
                    showlegend=False,
                )
            )

        if metric == "f1-score":
            y_axis_label = "F1-score"
        else:
            y_axis_label = "Accuracy"

        fig.update_layout(
            xaxis_title="Classification task",
            yaxis_title=y_axis_label,
            font=dict(size=18),
            width=800,
            height=600,
            title=f"ENCODE: Task {y_axis_label} per assay (min_predScore={min_pred:.2f})",
        )
        fig.update_yaxes(range=[0, 1.01])

        # Show/Write the plot
        if logdir:
            filename = f"encode_5tasks_metrics_per_assay-{metric}-{min_pred*100:.0f}"
            for extension in ("png", "svg"):
                fig.write_image(logdir / f"{filename}.{extension}")
            fig.write_html(logdir / f"{filename}.html")

        fig.show()

In [None]:
# Output folder for the per-assay metric figures, created if missing.
logdir = base_fig_dir / "encode_predictions" / "metrics_per_assay"
logdir.mkdir(parents=True, exist_ok=True)

# One pair of figures (accuracy, F1-score) per prediction-score threshold.
for min_pred in [0, 0.6, 0.8, 0.9]:
    plot_encode_metrics_per_assay(df_acc_per_assay, min_pred=min_pred, logdir=logdir)

#### Multiple min_predScore

In [None]:
def plot_all_acc_per_assay(graph_df, minY, maxY, logdir: Path | None = None):
    """Plot accuracy per assay, per min_predScore, per scatter_name/task_name,
    for core vs non-core assays.

    Two figures are made: "core" (CORE_ASSAYS + rna_seq) and "non-core" (every
    assay label not in CORE_ASSAYS). In each one, the x-axis is split in
    groups (an "Average" group followed by one group per assay), each group
    containing one position per task.

    Args:
    - graph_df: Table with the columns task_name, ASSAY, min_predScore, acc and
        nb_samples. It is not modified.
    - minY, maxY: Currently ignored; the y-axis range is fixed per figure
        (core: [0.55, 1.001], non-core: [0, 1]). Kept so existing calls keep
        working.
    - logdir: If given, figures are written there as html, png and svg.
    """
    min_predScore_color_map = {"0.0": "blue", "0.6": "orange", "0.9": "red"}

    # Work on a copy so the caller's dataframe does not gain a column
    full_df = graph_df.copy()
    full_df["scatter_name"] = full_df["task_name"].replace(
        "harmonized_", "", regex=True
    )
    full_df = full_df.sort_values(by=[ASSAY, "min_predScore", "scatter_name"])

    for graph_type in ["core", "non-core"]:
        # Subset the function input (not a notebook-level dataframe), so the
        # "scatter_name" column and the sort order above are kept.
        if graph_type == "core":
            type_df = full_df[full_df[ASSAY].isin(CORE_ASSAYS + ["rna_seq"])]
            minY = 0.55
            maxY = 1.001
        else:
            type_df = full_df[~full_df[ASSAY].isin(CORE_ASSAYS)]
            minY = 0
            maxY = 1

        unique_assays = list(type_df[ASSAY].unique())

        # Calculate average over assays
        avg_df = (
            type_df.groupby(["min_predScore", "scatter_name"])["acc"]
            .mean()
            .reset_index()
        )
        avg_df[ASSAY] = "Average"

        # Thresholds are compared numerically: the column may hold floats
        # while the color map is keyed by strings.
        type_scores = type_df["min_predScore"].astype(float)
        avg_scores = avg_df["min_predScore"].astype(float)

        fig = go.Figure()

        for min_pred in ["0.0", "0.6", "0.9"]:
            threshold = float(min_pred)
            df_subset = type_df[np.isclose(type_scores, threshold)]
            avg_subset = avg_df[np.isclose(avg_scores, threshold)]

            # Add average over assay trace
            fig.add_trace(
                go.Scatter(
                    x=["Average - " + name for name in avg_subset["scatter_name"]],
                    y=avg_subset["acc"],
                    mode="markers",
                    name=f"Avg Min Pred Score: {min_pred}",
                    marker=dict(
                        color=min_predScore_color_map[min_pred],
                        size=9,
                        symbol="star",
                    ),
                    hoverinfo="y+x",
                    showlegend=False,
                )
            )

            # Add individual assay traces
            hovertext = list(
                zip(
                    df_subset[ASSAY],
                    df_subset["nb_samples"].apply(lambda x: f"Samples: {x}"),
                )
            )
            fig.add_trace(
                go.Scatter(
                    x=df_subset[ASSAY] + " - " + df_subset["scatter_name"],
                    y=df_subset["acc"],
                    mode="markers",
                    name=f"Min Pred Score: {min_pred}",
                    marker=dict(
                        color=min_predScore_color_map[min_pred],
                        size=9,
                    ),
                    text=hovertext,
                    hoverinfo="text+y+x",
                )
            )

        # Modify x-axis tick labels: task names, repeated for each group.
        # Taken from the whole figure subset (sorted, like the average trace)
        # instead of from whichever threshold was plotted last.
        tick_group = [f"<b>{tick}</b>" for tick in sorted(type_df["scatter_name"].unique())]

        # One group for "Average" + one group per assay
        ticktext = []
        for _ in range(len(unique_assays) + 1):
            ticktext.extend(tick_group)

        fig.update_xaxes(
            tickmode="array", ticktext=ticktext, tickvals=list(range(len(ticktext)))
        )

        # Add assay labels on top + vertical lines between assay groups
        fig.add_annotation(
            x=len(tick_group) / 2 - 0.5,
            y=1.05,
            yref="paper",
            text="Average",
            showarrow=False,
            font=dict(size=14),
        )

        fig.add_vline(
            x=len(tick_group) - 0.5, line_width=2, line_dash="solid", line_color="black"
        )
        fig.add_hline(y=1, line_width=1, line_color="black")

        for i, label in enumerate(unique_assays):
            fig.add_annotation(
                x=(i + 1) * len(tick_group) + len(tick_group) / 2 - 0.5,
                y=1.05,
                yref="paper",
                text=label,
                showarrow=False,
                font=dict(size=14),
            )
            fig.add_vline(
                x=(i + 1) * len(tick_group) - 0.5,
                line_width=1,
                line_dash="dash",
                line_color="black",
            )

        # titles + yaxis range
        fig.update_layout(
            title="ENCODE data - Label match per Assay and Task",
            xaxis_title="Assay - Task",
            yaxis_title="Match %",
            xaxis_tickangle=-45,
            showlegend=True,
            height=600,
            width=1200,
            yaxis=dict(tickformat=".2%", range=[minY, maxY]),
        )

        # Show/Write the plot
        print(f"Graphing {graph_type}")
        if logdir:
            figname = f"encode_{graph_type}_acc_per_assay_minY{minY:.2f}"
            fig.write_html(logdir / f"{figname}.html")
            fig.write_image(logdir / f"{figname}.png")
            fig.write_image(logdir / f"{figname}.svg")
        fig.show()

In [None]:
# this_fig_dir = base_fig_dir / "encode_predictions" / "acc_per_assay"
# if not this_fig_dir.exists():
#     raise FileNotFoundError(f"Folder {this_fig_dir} does not exist")

### Confusion matrices

In [None]:
def graph_confusion_matrix(all_preds: pd.DataFrame, output_dir: Path):
    """Graph confusion matrix for each classification task, for both core and non-core assays

    For each assay group, task and prediction-score threshold (0, 0.6, 0.8,
    0.9), a confusion matrix is written with `ConfusionMatrixWriter` in
    `output_dir / <task name>`. Samples whose metadata label for the task is
    missing or "unknown" are ignored. The cell type task is restricted to
    `accepted_cts` and is skipped for core assays.

    Args:
    - all_preds: Predictions, with the columns ASSAY and, for each task,
        "<task>", "True class (<task>)", "Predicted class (<task>)" and
        "Max pred (<task>)".
    - output_dir: Root folder; one subfolder per task is created if missing.
    """
    df = all_preds.copy()

    # Assay labels belonging to each graph type
    assay_groups = {
        "core": CORE_ASSAYS + ["rna_seq", "mrna_seq"],
        "non-core": ["ctcf", "non-core"],
    }

    for graph_type, group_assays in assay_groups.items():
        print(f"Graphing {graph_type}")
        sub_df = df[df[ASSAY].isin(group_assays)]

        for name in [ASSAY, CELL_TYPE, SEX, LIFE_STAGE, f"{LIFE_STAGE}_merged", CANCER]:
            logdir = output_dir / name
            logdir.mkdir(parents=True, exist_ok=True)

            if name == CELL_TYPE and graph_type == "core":
                continue

            y_true_col = f"True class ({name})"
            y_pred_col = f"Predicted class ({name})"
            max_pred_label = f"Max pred ({name})"

            # Fill only the label columns: filling the whole frame would turn
            # missing prediction scores into strings and break the numeric
            # threshold comparison below.
            task_df = sub_df.copy()
            label_cols = [name, y_true_col, y_pred_col]
            task_df[label_cols] = task_df[label_cols].fillna("unknown")
            task_df = task_df[task_df[name] != "unknown"]

            if name == CELL_TYPE:
                task_df = task_df[task_df[CELL_TYPE].isin(accepted_cts)]

            print(name, task_df.shape)

            for threshold in [0, 0.6, 0.8, 0.9]:
                # Rows with a missing score compare as False and are dropped
                filtered_df = task_df[task_df[max_pred_label] >= threshold]

                true, pred = filtered_df[y_true_col], filtered_df[y_pred_col]
                if name == CELL_TYPE:
                    labels = accepted_cts
                else:
                    labels = list(filtered_df[name].unique())

                # `confusion_matrix` rejects an empty label list
                if len(labels) == 0:
                    print(
                        f"No sample left for {name} ({graph_type}) "
                        f"at threshold {threshold}, skipping."
                    )
                    continue

                cm = confusion_matrix(true, pred, labels=labels)

                writer = ConfusionMatrixWriter(labels=labels, confusion_matrix=cm)
                writer.to_all_formats(
                    logdir=logdir,
                    name=f"{name}-{graph_type}-confusion_matrix-{threshold*100:.0f}",
                )
                # Free the matplotlib figures created while writing
                plt.close("all")

In [None]:
# Write the confusion matrices of every task under `cm_logdir`
# (one subfolder per task, created by `graph_confusion_matrix`).
cm_logdir = base_fig_dir / "encode_predictions" / "confusion_matrices"
graph_confusion_matrix(all_preds_no_epiatlas, cm_logdir)

### track type

In [None]:
# Predictions of the track type classifier on the ENCODE data
track_type_pred_path = base_data_dir.joinpath(
    "training_results",
    "predictions",
    "encode",
    "track_type",
    "split0_test_prediction_100kb_all_none_all.list.csv",
)
track_type_pred_df = pd.read_csv(track_type_pred_path)

In [None]:
# Every column from the 4th one onward is treated as a prediction score
# (NOTE(review): presumably the first three hold the file id, true class and
# predicted class -- confirm against the prediction file).
pred_vector_cols = track_type_pred_df.columns[3:].tolist()

# Highest score of each row
track_type_pred_df["Max_pred_track_type"] = track_type_pred_df[pred_vector_cols].max(
    axis=1
)

In [None]:
# Attach the metadata to the track type predictions. The inner join keeps only
# the predictions whose id ("Unnamed: 0") matches a "FILE_accession".
track_type_df = pd.merge(
    track_type_pred_df,
    complete_metadata_df,
    how="inner",
    left_on="Unnamed: 0",
    right_on="FILE_accession",
)

# Shapes side by side, to see how many rows went through the join
print(track_type_df.shape, encode_df.shape, track_type_pred_df.shape)

In [None]:
# Pivot tables (assay x predicted track type) for several min_pred thresholds,
# meant to be appended one after the other to a single csv file.
# NOTE(review): every write/display statement below is commented out, so this
# cell currently deletes any existing output file and leaves an empty one in
# its place. Re-enable the writes or drop the unlink/open -- confirm intent.
output = track_type_pred_path.parent / "track_type_predictions_pivot.csv"
output.unlink(missing_ok=True)

with open(output, "a", encoding="utf8") as csv_stream:
    for min_pred in [0, 0.6, 0.8]:
        # Predictions at or above the threshold
        df = track_type_df[track_type_df["Max_pred_track_type"] >= min_pred]
        # Number of files per (assay, predicted class); margins=True adds the
        # "All" totals row and column
        pivot = df.pivot_table(
            index=ASSAY,
            columns="Predicted class",
            values="Max_pred_track_type",
            aggfunc="count",
            fill_value=0,
            margins=True,
        ).astype(int)
        # Same table as a percentage of each row total
        relative_pivot = pivot.div(pivot["All"], axis=0) * 100

        # csv_stream.write(f"Count Pivot - Min pred: {min_pred}\n")
        # pivot.to_csv(csv_stream)
        # csv_stream.write("\n")

        # csv_stream.write(f"Relative Pivot - Min pred: {min_pred}\n")
        # relative_pivot.to_csv(csv_stream)
        # csv_stream.write("\n")

        # display(pivot)
        # with pd.option_context("display.float_format", "{:.2f}".format):
        #     display(relative_pivot)

## RNA-Seq Assay

In [None]:
# Predictions of the RNA-seq assays (assay label containing "rna").
# `na=False` treats missing labels as non-matching instead of producing an
# invalid mask; `.copy()` makes `rna_df` independent from
# `all_preds_no_epiatlas`, so later edits apply to `rna_df` only.
rna_df = all_preds_no_epiatlas[
    all_preds_no_epiatlas[ASSAY].str.contains("rna", na=False)
].copy()

In [None]:
# Map the ENCODE RNA-seq assay names to the labels used by the classifier.
rna_assay_names = {
    "total RNA-seq": "rna_seq",
    "polyA plus RNA-seq": "mrna_seq",
}

# Assign the replaced columns back explicitly: an in-place `replace` on a
# `.loc` selection is not guaranteed to modify `rna_df`. The copy keeps the
# assignment away from the dataframe `rna_df` was selected from.
rna_df = rna_df.copy()
for col in [ASSAY, f"True class ({ASSAY})"]:
    rna_df[col] = rna_df[col].replace(rna_assay_names)

In [None]:
# Accuracy of the assay classifier on RNA-seq files when mrna_seq and rna_seq
# are considered two different classes, for several min_pred thresholds.
print("RNA-Seq assay accuracy, if mrna_seq != rna_seq\n")
for min_pred in [0, 0.6, 0.8]:
    df = rna_df[rna_df[f"Max pred ({ASSAY})"] >= min_pred]

    # Fraction of files whose predicted class equals the true class
    is_match = df[f"True class ({ASSAY})"] == df[f"Predicted class ({ASSAY})"]
    acc = int(is_match.sum()) / len(df)
    print(
        f"Min pred: {min_pred}, Accuracy: {acc:.4f}. Samples: {len(df)}/{rna_df.shape[0]}\n"
    )

    # Number of files per (assay, predicted class) pair
    pair_counts = df.groupby([ASSAY, f"Predicted class ({ASSAY})"]).size()
    groupby = pair_counts.reset_index(name="Count").sort_values(
        [ASSAY, "Count"], ascending=False
    )
    print(groupby, "\n")

In [None]:
# Same accuracy computation, with mrna_seq relabeled as rna_seq in the
# metadata, predicted and true class columns.
print("RNA-Seq assay accuracy, if mrna_seq == rna_seq\n")
df = rna_df.copy()
for cat in [ASSAY, f"Predicted class ({ASSAY})", f"True class ({ASSAY})"]:
    is_mrna = df[cat] == "mrna_seq"
    df.loc[is_mrna, cat] = "rna_seq"

for min_pred in [0, 0.6, 0.8]:
    sub_df = df[df[f"Max pred ({ASSAY})"] >= min_pred]

    # Fraction of files whose predicted class equals the true class
    is_match = sub_df[f"True class ({ASSAY})"] == sub_df[f"Predicted class ({ASSAY})"]
    acc = int(is_match.sum()) / len(sub_df)
    print(
        f"Min pred: {min_pred}, Accuracy: {acc:.4f}. Samples: {len(sub_df)}/{rna_df.shape[0]}\n"
    )

    # Number of files per (assay, predicted class) pair
    pair_counts = sub_df.groupby([ASSAY, f"Predicted class ({ASSAY})"]).size()
    groupby = pair_counts.reset_index(name="Count").sort_values(
        by=[ASSAY, "Count"], ascending=[True, False]
    )
    print(groupby, "\n")