In [None]:
"""Workbook to analyse Chip-Atlas predictions, destined for the paper.
"""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, unused-import, unused-argument, too-many-branches, pointless-statement

In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
from __future__ import annotations

from collections import Counter
from pathlib import Path
from typing import List, Tuple

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import upsetplot
from IPython.display import display
from matplotlib import pyplot as plt
from sklearn.metrics import confusion_matrix as sk_cm

from epi_ml.core.confusion_matrix import ConfusionMatrixWriter
from epi_ml.utils.notebooks.paper.paper_utilities import (
    ASSAY,
    ASSAY_ORDER,
    IHECColorMap,
    display_perc,
)

# import plotly.express as px
# from plotly.subplots import make_subplots

## Setup

In [None]:
ASSAY_ORDER

In [6]:
# Root of the paper outputs, resolved relative to the current user's home directory.
base_dir = Path.home() / "Projects/epiclass/output/paper"
base_data_dir = base_dir / "data"
base_fig_dir = base_dir / "figures"
paper_dir = base_dir  # alias of base_dir

# Fail early if the figure directory is missing (it is passed to IHECColorMap in the next cell).
if not base_fig_dir.exists():
    raise FileNotFoundError(f"Directory {base_fig_dir} does not exist.")

In [7]:
# NOTE(review): this rebinds the imported class name `IHECColorMap` to an instance,
# so re-running this cell without re-running the import cell calls the instance,
# not the class — presumably an error; consider a distinct variable name.
IHECColorMap = IHECColorMap(base_fig_dir)
assay_colors = IHECColorMap.assay_color_map

In [None]:
# Directory holding the ChIP-Atlas (C-A) prediction files.
ca_dir = base_data_dir / "training_results" / "predictions" / "C-A" / "assay_epiclass"

# mod2 = add 24NA to 1rst/2nd_prob_diff_cancer, 1rst/2nd_prob_ratio_cancer, 1rst/2nd_prob_diff_disease and 1rst/2nd_prob_ratio_disease
# for samples that have column shift leading to Max_pred_donorlife (and others) having incoherent/string values
ca_filename = "CA_metadata_4DB+all_pred.20240606_mod2.1.tsv"
ca_pred_path = ca_dir / ca_filename
# low_memory=False: read the file in one pass instead of in chunks.
ca_pred_df = pd.read_csv(ca_pred_path, sep="\t", low_memory=False)
print(ca_pred_df.shape)

In [None]:
ca_pred_df.head()

| Assay | Exp Key                               | Nb Files | Training Size | Oversampling | Expected Nb Files                      |
|-------|---------------------------------------|----------|---------------|--------------|---------------------------------------|
| 13c   | dd3710b73c0341af85a17ce1998362d0      | 24989    | 116550        | true         | 24989                                 |
| 11c   | 0f8e5eb996114868a17057bebe64f87c      | 20922    | 46128         | true         | 20922                                 |
| 7c    | 69488630801b4a05a53b5d9e572f0aaa      | 16788    | 34413         | true         | 16788 (cross-checked)                 |

*using hg38_2023-epiatlas-dfreeze_v2.1_w_encode_noncore_2


In [10]:
df_cols = ca_pred_df.columns.to_list()

In [11]:
CORE_ASSAYS = ASSAY_ORDER[0:7]

In [12]:
DB_COLS = ["GEO_mod", "C-A", "Cistrome", "NGS_mod"]

In [13]:
PRED_COLS = [
    "Predicted_class_assay7",
    "Predicted_class_assay11",
    "Predicted_class_assay13",
]

### File correction

CA_metadata_4DB+all_pred.20240606.tsv has some mistakes in GEO_mod and manual_target_consensus. Using values in CA_metadata_mod2.tsv to overwrite.

In [None]:
ca_mod_path = ca_dir / "CA_metadata_mod2.tsv"
ca_mod_df = pd.read_csv(ca_mod_path, sep="\t", low_memory=False)
ca_mod_df.head(n=2)

In [15]:
# Bring in the corrected columns, copy them over the erroneous ones, then drop the helpers.
ca_pred_df = ca_pred_df.merge(ca_mod_df, on="Experimental-id", how="left")
corrected_cols = {
    "manual_target_consensus": "manual_target_consensus2",
    "GEO_mod": "GEO_mod2",
}
for target_col, source_col in corrected_cols.items():
    ca_pred_df[target_col] = ca_pred_df[source_col]
ca_pred_df = ca_pred_df.drop(columns=list(corrected_cols.values()))

In [None]:
ca_pred_df["manual_target_consensus"].value_counts(dropna=False)

Lowercase all targets

In [17]:
# Cast every DB target column to str, then lowercase it.
# NOTE(review): astype(str) turns missing values (NaN) into the literal string "nan",
# so later isna() checks on these columns cannot detect them — confirm that missing
# targets are always encoded as "----" in the input file.
ca_pred_df.loc[:, DB_COLS] = (
    ca_pred_df[DB_COLS].astype(str).apply(lambda x: x.str.lower())
)

Transform all "revxlinkchromatin" target into "input" so they're not counted as different targets.

In [18]:
ca_pred_df.loc[:, DB_COLS] = ca_pred_df[DB_COLS].replace("revxlinkchromatin", "input")

In [19]:
# Checking samples with "ctrl" target. Definition is ambiguous. Will be treated as "core" sample.
# display(ca_pred_df[ca_pred_df[DB_COLS].isin(["ctrl"]).any(axis=1)][["Experimental-id", "Gse-geo", "GSM"] + DB_COLS + ["manual_target_consensus"] + PRED_COLS].sort_values("Gse-geo"))

Correct some NGS_mod annotation errors using the file titles (GSE78801). 

They took h3.3k27m as the target when it is related to the cell line (SF8628 Human DIPG H3.3-K27M Cell Line).

In [20]:
# GSM accession -> corrected NGS_mod target.
to_replace = dict(
    GSM2265634="h3k27me3",
    GSM2265635="h3k27me3",
    GSM2265642="h3k4me1",
)

# Rows whose GSM is one of the accessions to fix.
idx = ca_pred_df["GSM"].isin(list(to_replace))
ca_pred_df.loc[idx, "NGS_mod"] = ca_pred_df["GSM"][idx].map(to_replace)

In [21]:
# Sanity check: no cell of the dataframe may still hold the cell-line marker.
if ca_pred_df.isin(["h3.3k27m"]).any().any():
    raise ValueError("h3.3k27m is still present in the dataframe")

### Examining CTCF labels, to exclude some samples

Our classifier will tend to classify CTCF as input, so we cannot trust it to differentiate between CTCF and input signals.  
Likely CTCF samples need to be excluded from the prediction pool.

In [None]:
# Consensus labels marking samples that are not part of the core subset.
non_core_labels = ["non-core", "CTCF"]
# Explicit .copy(): columns are added to ca_core_df in later cells; without it those
# assignments target a slice of ca_pred_df and raise SettingWithCopyWarning.
ca_core_df = ca_pred_df[
    ~ca_pred_df["manual_target_consensus"].isin(non_core_labels)
].copy()

if ca_core_df["manual_target_consensus"].isna().sum() > 0:
    raise ValueError("There are missing values in the target column.")

# At least one sample must have been removed by the filter.
assert ca_pred_df.shape[0] > ca_core_df.shape[0]

print(ca_core_df.shape)

In [None]:
ca_core_df["manual_target_consensus"].value_counts(dropna=False)

In [None]:
display(
    ca_core_df[ca_core_df[DB_COLS].isin(["ctcf"]).any(axis=1)][
        ["Experimental-id", "Gse-geo", "GSM"]
        + DB_COLS
        + ["manual_target_consensus"]
        + PRED_COLS
    ].sort_values(["Gse-geo", "GSM"])
)

After a review of the experiment descriptions on GEO, it seems GSE102237, GSE108869 and GSE38411 samples marked as CTCF by cistrome are not clearly input (uncertain conclusion), and so should be excluded from our core samples.

As for GSE183379 samples marked as ctcf by C-A (7 samples), it seems none of them are actually CTCF, according to the original files names on GEO, so they don't need to be excluded.

However, to avoid having to explain all of this in the main paper, they will all be excluded. 

In [25]:
# GSM accessions singled out for CTCF-related exclusion.
# NOTE(review): not referenced again in the visible part of the notebook — confirm it is applied downstream.
to_exclude_ctcf = ["GSM2731525", "GSM2731526", "GSM3466332", "GSM3466333"]
# GSE183379 samples that C-A annotates as ctcf ("ctct" in the variable name looks like a typo for "ctcf").
to_include_ctct_mask = (ca_core_df["Gse-geo"] == "GSE183379") & (
    ca_core_df["C-A"] == "ctcf"
)
# Sanity check: exactly 7 such samples are expected.
assert to_include_ctct_mask.sum() == 7

## Analysis

Base dataset used: Chip-Atlas experiments where at least one of the DBs declared a target in core7.

Excluding: samples where at least one of the DBs declared a target outside core7 (except for GSE183379, which seems to have some C-A annotation errors).

In [26]:
def print_target_info(df: pd.DataFrame) -> None:
    """Show the count and the share of each manual_target_consensus value (NaN included)."""
    n_samples = len(df)
    target_counts = df["manual_target_consensus"].value_counts(dropna=False)
    display(target_counts)
    print("Size of the dataset: ", n_samples)
    display_perc(target_counts / n_samples)

In [None]:
ca_pred_df["GEO_mod"].value_counts(dropna=False)

### Database composition

In [28]:
for db_col in DB_COLS:
    col = ca_core_df[db_col]
    # The DB columns were cast with .astype(str) and lowercased earlier, which turns
    # NaN into the literal string "nan"; isna() alone would never flag those rows.
    missing_mask = col.isna() | col.eq("nan")
    if missing_mask.any():
        print("Missing values: ", ca_core_df[missing_mask])

In [None]:
print_target_info(ca_core_df)

In [30]:
no_consensus_df = ca_core_df[ca_core_df["manual_target_consensus"] == "no_consensus"]
# for db_col in DB_COLS:
#     display(no_consensus_df[db_col].value_counts(dropna=False))

#### manual_target_consensus / DBs target details

Create a new category called "identical_DBs_target" (stored in the column named by `SAME_TARGET`) that specifies whether, depending on the available sources, the DBs give the same assay/target.

Treat "Unclassified" from Chip-Atlas as absent samples for the target consensus evaluation.

In [31]:
SAME_TARGET = "core7_DBs_consensus"

In [None]:
# One consensus category per row of ca_core_df, in row order.
id_db_target = []
# Counts of every distinct label combination / of the disagreeing combinations.
unique_labels = Counter()
different_labels = Counter()

# Labels accepted as "core" when judging agreement ("ctrl" is treated as core).
allowed_labels = set(CORE_ASSAYS) | {"ctrl"}

tmp_df = ca_core_df.loc[:, DB_COLS].copy()
# Treat C-A "unclassified" as an absent source. Assign the result back: an in-place
# replace on a column selection is a chained assignment and may leave tmp_df untouched.
tmp_df["C-A"] = tmp_df["C-A"].replace("unclassified", "----")

for labels in tmp_df.values:
    missing_N = sum(label == "----" for label in labels)
    db_labels = set(labels)
    db_labels.discard("----")
    # Sorted tuple: set iteration order is not stable, so identical label sets
    # could otherwise be counted under different keys.
    label_key = tuple(sorted(db_labels))

    if not db_labels <= allowed_labels:
        id_db_target.append("Ignored - Potential non-core")
    elif missing_N == 3:
        id_db_target.append("1 source")
    elif len(db_labels) == 1:
        id_db_target.append("Identical")
    else:
        id_db_target.append("Different")
        different_labels[label_key] += 1

    unique_labels[label_key] += 1

ca_core_df.loc[:, SAME_TARGET] = id_db_target

##### ca pred 2.2 / with "identical_DBs_target"

In [33]:
# Create new metadata that includes a column for target_consensus description overlap

# unmodified_ca_metadata = pd.read_csv(ca_dir / "CA_metadata_4DB+all_pred.20240606_mod2.1.tsv", sep="\t", low_memory=False)
# new_meta = ca_core_df[["Experimental-id", SAME_TARGET]].merge(unmodified_ca_metadata, on="Experimental-id", how="right")
# new_meta.loc[new_meta[SAME_TARGET].isna(), SAME_TARGET] = "non-core/CTCF"
# assert new_meta[SAME_TARGET].isna().sum() == 0

# display(new_meta[SAME_TARGET].value_counts(dropna=False))

# new_path = ca_pred_path.parent / ca_pred_path.name.replace("mod2.1.tsv", "mod2.2.tsv")
# new_meta.to_csv(new_path, sep="\t", index=False)

In [34]:
# for labels, count in h3_labels.most_common():
#     print("\t".join([labels])+f"\t{count}")

In [35]:
# for labels, count in unique_labels.items():
#     if len(labels) > 1 and "input" in labels:
#         print(labels, count)

In [36]:
# target_DB_count = ca_core_df[SAME_TARGET].value_counts(dropna=False)
# display(target_DB_count)
# print()
# display_perc(target_DB_count / target_DB_count.sum() * 100)

In [37]:
# with pd.option_context("display.max_rows", None):
#     display(ca_core_df[ca_core_df[SAME_TARGET] == "Different"][DB_COLS].value_counts())

In [None]:
display(ca_core_df[SAME_TARGET].value_counts(dropna=False))
print(f"Total core7 samples: {ca_core_df.shape[0]}")

In [39]:
outpath = ca_dir / (
    str(Path(ca_filename).stem) + "_core7_DB_not_identical_target_counts.tsv"
)

# ca_core_df[ca_core_df[SAME_TARGET] != "Identical"][
#     DB_COLS + ["manual_target_consensus", SAME_TARGET]
# ].value_counts(dropna=False).to_csv(outpath, sep="\t")

#### Upset plots

In [40]:
fig_dir = base_fig_dir / "fig_C-A" / "DB_upset"

In [41]:
def make_db_upsetplot(
    df: pd.DataFrame, db_cols: List[str], title: str
) -> upsetplot.UpSet:
    """Make an upsetplot of the sample presence in the different databases.

    Args:
        df: Samples to plot; must contain `db_cols` and the SAME_TARGET column.
        db_cols: Database columns; a value of "----" means the sample is absent
            from that database.
        title: Figure title.

    Returns:
        The UpSet object, after it has been plotted on the current figure.

    Raises:
        ValueError: If the SAME_TARGET column is missing from `df`.
    """
    if SAME_TARGET not in df.columns:
        raise ValueError(f"Column '{SAME_TARGET}' not found in DataFrame.")

    # Boolean membership per database (True = sample present in that DB).
    upset_df = df[db_cols].ne("----")
    upset_df[SAME_TARGET] = df[SAME_TARGET]

    # UpSet expects the membership columns as the index.
    upset_df = upset_df.set_index(db_cols)

    upset = upsetplot.UpSet(
        upset_df,
        intersection_plot_elements=0,  # disable the default bar chart
        sort_by="cardinality",
        show_counts=True,
        orientation="horizontal",
    )

    # Replace the default bars with bars stacked by consensus category.
    upset.add_stacked_bars(by=SAME_TARGET, elements=15)

    axes = upset.plot()
    plt.suptitle(title)
    axes["totals"].set_title("Total")
    plt.legend(loc="center left")
    return upset

In [42]:
title = "All core7 ChIP-Atlas samples presence in used DBs\nTarget consensus"
upset = make_db_upsetplot(ca_core_df, DB_COLS, title=title)
# plt.savefig(fig_dir / "upsetplot_DB_core7_samples.svg", bbox_inches="tight")

In [None]:
(ca_core_df["is_EpiAtlas_EpiRR"] != "0").sum()

In [44]:
# no encode
no_encode_df = ca_core_df[ca_core_df["ENCODE"] == 0]
title = "ChIP-Atlas samples presence in used DBs\nTarget Consensus - No ENCODE"
upset = make_db_upsetplot(no_encode_df, DB_COLS, title=title)
# plt.savefig(fig_dir / "upsetplot_DB_core7_samples_noENC.svg", bbox_inches="tight")

In [45]:
# yes encode, no epiatlas
no_epiatlas_df = ca_core_df[ca_core_df["is_EpiAtlas_EpiRR"] == "0"]
title = "ChIP-Atlas core 7 samples presence in used DBs\nTarget Consensus - No EpiAtlas overlap"
upset = make_db_upsetplot(no_epiatlas_df, DB_COLS, title=title)
# plt.savefig(fig_dir / "upsetplot_DB_core7_samples_noEpiAtlas.svg", bbox_inches="tight")

In [46]:
def is_prediction_resolved(row, pred_col: str, db_cols: List[str]) -> bool:
    """Tell whether the value under `pred_col` equals the value of any column in `db_cols`."""
    candidates = tuple(row[name] for name in db_cols)
    return row[pred_col] in candidates

In [None]:
# the classifier was able to resolve xx% of the cases where the target was not identical between the sources
different_targets_df = ca_core_df[ca_core_df[SAME_TARGET] == "Different"]
pred_col = PRED_COLS[0]

for min_pred_score in [0, 0.6]:
    filtered_df = different_targets_df[
        different_targets_df["Max_pred_assay7"] >= min_pred_score
    ]
    # Same subset, without rows where the prediction is labeled as 'input'.
    non_input_df = filtered_df[filtered_df[pred_col] != "input"]

    for suffix, subset_df in [
        ("", filtered_df),
        (", excluding 'input' predictions", non_input_df),
    ]:
        nb_total = len(subset_df)
        if nb_total == 0:
            # apply(axis=1) on an empty frame does not yield a scalar count,
            # and the percentage would divide by zero.
            print(
                f"Resolved (min_predScore >= {min_pred_score}{suffix}): "
                "0 / 0 (no samples)"
            )
            continue

        num_resolved = subset_df.apply(
            is_prediction_resolved, axis=1, args=(pred_col, DB_COLS)
        ).sum()

        print(
            f"Resolved (min_predScore >= {min_pred_score}{suffix}): "
            f"{num_resolved} / {nb_total} "
            f"({num_resolved / nb_total * 100:.2f}%)"
        )

### High-level prediction accuracy breakdown

In [48]:
def print_high_level_pred_info(df: pd.DataFrame) -> None:
    """High level information about the predictions.

    For each core assay, display the percentage of each predicted class for the
    7, 11 and 13 class classifiers and, for the 11/13 class ones, the second
    predicted class of the wrong predictions. Then display the consensus targets
    of the samples predicted as "wgbs-standard" and as "non-core".

    Args:
        df: Predictions with a "manual_target_consensus" column and the
            Predicted_class_* / 2nd_pred_class_* columns used below.
    """
    for assay in CORE_ASSAYS:
        print(f"{assay}")
        assay_df = df[df["manual_target_consensus"] == assay]
        for col in [
            "Predicted_class_assay7",
            "Predicted_class_assay11",
            "Predicted_class_assay13",
        ]:
            # e.g. "Predicted_class_assay11" -> "assay11"
            assay_number = col.rsplit("_", maxsplit=1)[-1]
            display(assay_df[col].value_counts() / len(assay_df) * 100)
            if any(label in col for label in ["11", "13"]):
                wrong_pred = assay_df[assay_df[col] != assay]

                display(
                    wrong_pred[f"2nd_pred_class_{assay_number}"].value_counts()
                    / len(wrong_pred)
                    * 100
                )
        print("\n")

    # Use the `df` argument (not the notebook-level frame) so the function
    # reports on whatever subset the caller passes in.
    print("What is the actual target when wgbs-standard is predicted?")
    for assay_number in ["assay11", "assay13"]:
        print(f"{assay_number}")
        wgbs_dist = df[df[f"Predicted_class_{assay_number}"] == "wgbs-standard"][
            "manual_target_consensus"
        ]
        display(wgbs_dist.value_counts())
        display(wgbs_dist.value_counts() / len(wgbs_dist) * 100)

    print("What is the actual target when non-core is predicted?")
    col = "Predicted_class_assay13"
    non_core_dist = df[df[col] == "non-core"]["manual_target_consensus"]
    display(non_core_dist.value_counts())
    display(non_core_dist.value_counts() / len(non_core_dist) * 100)

In [None]:
print_high_level_pred_info(ca_pred_df)
# print_target_info(ca_pred_df)

In [None]:
for min_pred in [0.6, 0.8]:
    # for min_pred in np.arange(0.6, 1.05, 0.05):
    nb_no_consensus = len(no_consensus_df)
    # no_consensus samples whose assay7 prediction score reaches the threshold
    break_tie_mask = no_consensus_df["Max_pred_assay7"] >= min_pred
    nb_break_tie = break_tie_mask.sum()
    perc_break_tie = nb_break_tie / nb_no_consensus * 100
    print(
        f"Break no_consensus (minPred >= {min_pred:.02f}): {perc_break_tie:.02f}% ({nb_break_tie}/{nb_no_consensus})"
    )
    df = no_consensus_df[break_tie_mask]
    # display(df[no_consensus_df.columns[2:10]])
    # display(df.value_counts("Predicted_class_assay7"))
    nb_not_input = df["Predicted_class_assay7"].ne("input").sum()
    perc_not_input = nb_not_input / len(df) * 100
    print(
        f"non-input tie breakers: {nb_not_input}/{nb_break_tie} ({perc_not_input:.02f}%)\n"
    )
    # print(df["ENCODE"].value_counts(), "\n")

In [None]:
enc_count = ca_core_df["ENCODE"].value_counts()
display(enc_count)
display_perc(enc_count / len(ca_core_df))

In [52]:
def print_pred_within_threshold(
    df: pd.DataFrame, min_pred: float = 0.6, col: str = "Max_pred_assay7"
) -> None:
    """Print the share of rows whose score in `col` is at least `min_pred`.

    Prints a notice and returns when `col` is not a column of `df`.
    """
    try:
        above_threshold = df[col].astype(float) >= min_pred
    except KeyError:
        print(f"Column {col} not found.")
        return

    total = len(df)
    nb_pred = above_threshold.sum()
    # Last "_"-separated token of the column name, e.g. "assay7".
    suffix = col.rsplit("_", 1)[-1]
    percentage = nb_pred / total * 100
    print(
        f"Nb pred {suffix} (pred score >= {min_pred:.02f}): {percentage:.02f}% ({nb_pred}/{total})"
    )

In [53]:
# for min_pred in np.arange(0, 1.05, 0.05):
#     df = ca_core_df[ca_core_df["ENCODE"] == 0]
#     print_pred_within_threshold(df, min_pred=min_pred)

In [54]:
def save_confusion_matrix(
    df: pd.DataFrame,
    fig_dir: Path | str,
    nb_classes: int | str = 7,
    min_pred: float = 0.6,
):
    """Save the confusion matrix for core assays predictions. Does not filter.

    Args:
        df: Predictions; "manual_target_consensus" is the true label and
            f"Predicted_class_assay{nb_classes}" the predicted one.
        fig_dir: Directory passed to the writer as `logdir`.
        nb_classes: Classifier identifier used to select the prediction column.
        min_pred: Only used to build the output file name; `df` is not filtered here.
    """
    col = f"Predicted_class_assay{nb_classes}"
    # Pin the label order: by default sklearn orders rows/columns by sorted label
    # value, which would not line up with the CORE_ASSAYS order given to the writer.
    # Samples whose true or predicted label is outside CORE_ASSAYS are not counted.
    cm = sk_cm(df["manual_target_consensus"], df[col], labels=CORE_ASSAYS)
    cm_writer = ConfusionMatrixWriter(labels=CORE_ASSAYS, confusion_matrix=cm)

    name = f"confusion_matrix_assay{nb_classes}_core7_minPred{min_pred:.02f}"
    # Tag the file name when the subset contains no ENCODE sample.
    if df["ENCODE"].sum() == 0:
        name += "_noENCODE"

    cm_writer.to_all_formats(logdir=fig_dir, name=name)

In [55]:
def print_breakdown_predictions(
    df: pd.DataFrame, min_pred: float = 0.6, nb_classes: int | str = 7
) -> None:
    """Breakdown the predictions, print results.

    Keeps the rows whose f"Max_pred_assay{nb_classes}" is at least `min_pred`, then
    prints the global match/error rates against "manual_target_consensus" and,
    for each core assay, the share of predictions, correct and incorrect ones.
    Ratios with an empty denominator are printed as nan instead of raising.

    Args:
        df: Predictions to analyse.
        min_pred: Minimum prediction score kept.
        nb_classes: Classifier identifier used to select the score/prediction columns.
    """

    def _ratio(numerator, denominator) -> float:
        """Return numerator / denominator, or nan when the denominator is 0."""
        return numerator / denominator if denominator else float("nan")

    df = df[df[f"Max_pred_assay{nb_classes}"] >= min_pred]
    nb_total = len(df)

    pred_col = f"Predicted_class_assay{nb_classes}"
    match_consensus = df["manual_target_consensus"] == df[pred_col]
    nb_match = match_consensus.sum()
    nb_error = (~match_consensus).sum()
    print(
        f"Nb match assay{nb_classes}: {_ratio(nb_match, nb_total):.2%} ({nb_match}/{nb_total})"
    )
    print(
        f"Nb error assay{nb_classes}: {_ratio(nb_error, nb_total):.2%} ({nb_error}/{nb_total})\n"
    )

    correct_pred_df = df[match_consensus]
    incorrect_pred_df = df[~match_consensus]
    nb_incorrect_total = len(incorrect_pred_df)

    print(
        r"Following ratios: % of assay subset OR % of all predictions OR % of all incorrect predictions (potential mislabels).",
        "\n",
    )
    for assay in CORE_ASSAYS:
        nb_assay = len(df[df[pred_col] == assay])
        nb_assay_correct = len(correct_pred_df[correct_pred_df[pred_col] == assay])
        nb_assay_incorrect = len(incorrect_pred_df[incorrect_pred_df[pred_col] == assay])

        print(
            f"Predictions as {assay}: {_ratio(nb_assay, nb_total):.2%} ({nb_assay}/{nb_total})"
        )
        perc_cor = _ratio(nb_assay_correct, nb_assay)
        perc_cor2 = _ratio(nb_assay_correct, nb_total)
        perc_inc = _ratio(nb_assay_incorrect, nb_assay)
        perc_inc2 = _ratio(nb_assay_incorrect, nb_total)
        perc_inc3 = _ratio(nb_assay_incorrect, nb_incorrect_total)

        print(
            f"Correct predictions as {assay}: {perc_cor:.2%} ({nb_assay_correct}/{nb_assay}) OR {perc_cor2:.2%} ({nb_assay_correct}/{nb_total})"
        )
        print(
            f"Incorrect predictions as {assay}: "
            f"{perc_inc:.2%} ({nb_assay_incorrect}/{nb_assay}) OR "
            f"{perc_inc2:.2%} ({nb_assay_incorrect}/{nb_total}) OR "
            f"{perc_inc3:.2%} ({nb_assay_incorrect}/{nb_incorrect_total})\n"
        )

In [None]:
# Output directory for the confusion matrices (rebinds the notebook-level `fig_dir`).
fig_dir = base_fig_dir / "fig_C-A" / "confusion_matrices"
# Two subsets: ENCODE == 0 only, then ENCODE in {0, 1}.
for subset in [[0], [0, 1]]:
    if subset == [0]:
        print("Subset: no ENCODE")
    else:
        print("Subset: Include ENCODE")
    # NOTE(review): `df` stays bound to the last subset (ENCODE in [0, 1]) after the loop.
    df = ca_core_df[ca_core_df["ENCODE"].isin(subset)]
    # The per-threshold work is currently commented out, so this inner loop does nothing.
    for min_pred in [0.6, 0.8, 0.9]:
        # print("Min pred score:", min_pred)
        # print_pred_within_threshold(df, min_pred=min_pred)
        # print_breakdown_predictions(df, min_pred=min_pred)

        # df = ca_core_df[ca_core_df["Max_pred_assay7"] >= min_pred]
        # save_confusion_matrix(df, fig_dir, min_pred=min_pred)
        # print("miaw")
        pass

### Mislabels by GSE

In [57]:
# Columns whose lowercased name contains "id" or "gse".
id_cols = [
    col for col in df_cols if any(string in col.lower() for string in ["id", "gse"])
]
# "Gse-title" matches the filter but is dropped; list.remove raises ValueError if it is absent.
id_cols.remove("Gse-title")

In [None]:
print(id_cols)

In [59]:
GSE = "Gse-geo"

In [None]:
nb_classes = 7
min_pred = 0.6
# Build the input explicitly (core samples with ENCODE in {0, 1}) instead of relying
# on whatever `df` an earlier cell left bound; this cell then gives the same result
# regardless of which cells were run before it.
df = ca_core_df[ca_core_df["ENCODE"].isin([0, 1])]
df = df[df[f"Max_pred_assay{nb_classes}"] >= min_pred]

pred_col = f"Predicted_class_assay{nb_classes}"
match_consensus = df["manual_target_consensus"] == df[pred_col]
nb_match = match_consensus.sum()
nb_error = (~match_consensus).sum()
print(f"Nb match assay{nb_classes}: {nb_match/ len(df):.2%} ({nb_match}/{len(df)})")
print(f"Nb error assay{nb_classes}: {nb_error/ len(df):.2%} ({nb_error}/{len(df)})\n")

# Keep the wrong predictions, except those predicted as "input".
incorrect_pred_df = df[~match_consensus]
incorrect_pred_df = incorrect_pred_df[incorrect_pred_df[pred_col] != "input"]

desired_cols = ["manual_target_consensus", pred_col]

# with pd.option_context("display.max_rows", None, "display.max_columns", None):
#     print("Incorrect predictions, breakdown by GSE count.")
#     gse_count = incorrect_pred_df.groupby(GSE).size().sort_values(ascending=False)  # type: ignore
#     display(gse_count)
#     gse_count.to_csv(
#         ca_dir / "gse_count_incorrect_pred_no_input_20240606_mod2.tsv", sep="\t"
#     )

#     print("Incorrect predictions, breakdown by GSE and target.")
#     gse_target_count = incorrect_pred_df.groupby(GSE)[desired_cols].value_counts()
#     display(gse_target_count)
#     gse_target_count.to_csv(
#         ca_dir / "gse_target_count_incorrect_pred_no_input_20240606_mod2.tsv", sep="\t"
#     )

### Summary

Keeping only core 7.

WITH ENCODE

- Nb pred assay7 (pred score >= 0.60): 89.60% (42314/47226)
- Nb match assay7: 95.74% (40511/42314)
- Nb error assay7: 4.26% (1803/42314)

Of pred score >= 0.60:
- Predictions as input: 39.32% (16636/42314)
- Correct predictions as input: 92.45% (15380/16636) OR 36.35% (15380/42314)
- Incorrect predictions as input: 7.55% (1256/16636) OR 2.97% (1256/42314) OR 69.66% (1256/1803)

NO ENCODE  

- Nb pred assay7 (pred score >= 0.60): 88.56% (35631/40232)
- Nb match assay7: 95.24% (33935/35631)
- Nb error assay7: 4.76% (1696/35631)

Of pred score >= 0.60:
- Predictions as input: 40.69% (14499/35631)
- Correct predictions as input: 91.96% (13334/14499) OR 37.42% (13334/35631)
- Incorrect predictions as input: 8.04% (1165/14499) OR 3.27% (1165/35631) OR 68.69% (1165/1696)

BREAK CONSENSUS (does not contain any ENCODE data)

- Break no_consensus (minPred >= 0.60): 92.59% (462/499) & non-input 189/462 (40.91%)  
- Break no_consensus (minPred >= 0.80): 76.57% (402/525) & non-input 163/402 (40.55%)  


### Varying consensus criterion (nb DB agreeing)

In [None]:
# ca_core_df.groupby(DB_COLS + ["manual_target_consensus", "Predicted_class_assay7"]).size()

reference_column = "manual_target_consensus"
columns_to_check = DB_COLS
# Per row: number of DB columns whose value equals the manual consensus target.
ca_core_df["manual_target_consensus_size"] = (
    ca_core_df[columns_to_check].eq(ca_core_df[reference_column], axis=0)
).sum(axis=1)

# Show counts and proportions for the consensus target and for the consensus size.
for col in ["manual_target_consensus", "manual_target_consensus_size"]:
    val_count = ca_core_df[col].value_counts(dropna=False).sort_index()
    display(val_count)
    display_perc(val_count / len(ca_core_df))

In [None]:
print("Global:")
print_pred_within_threshold(ca_core_df, min_pred=0.6)
print()
print_breakdown_predictions(ca_core_df, min_pred=0.6, nb_classes=7)
sub_df = ca_core_df[ca_core_df["Max_pred_assay7"] >= 0.6]
assay_count = sub_df["manual_target_consensus"].value_counts().sort_index()
# display_perc(assay_count / len(sub_df))


# One report per consensus size (number of DBs agreeing with the manual consensus).
for i in range(1, 5):
    df = ca_core_df[ca_core_df["manual_target_consensus_size"] == i]
    print(f"Consensus defined with {i} DB: {len(df)} files.")
    if df.empty:
        # Nothing to report for this consensus size; the ratios below would be undefined.
        print()
        continue

    # Display % assay
    sub_df = df[df["Max_pred_assay7"] >= 0.6]
    assay_count = df["manual_target_consensus"].value_counts()

    # display_perc(assay_count / len(df))

    # .get: a group without any "input" sample reports 0% instead of raising KeyError.
    input_val = (assay_count / len(df)).get("input", 0.0)
    print(f"input: {input_val:.2%}")

    print_pred_within_threshold(df, min_pred=0.6)
    print()

    print_breakdown_predictions(df, min_pred=0.6, nb_classes=7)

#### Summary

There don't seem to be big differences in accuracy when looking at consensus defined by a different number of DBs.  


Core 7 files, including ENCODE

| Consensus size | Nb files | Nb files (%) | Input size (%) | Accuracy (min_pred >= 0.6) |
|----------------|----------|--------------|----------------|--------------------|
| 1              | 3324     | 6.98%        | 61.85%         | 95.46%             |
| 2              | 25551    | 53.81%       | 42.27%         | 95.83%             |
| 3              | 10275    | 21.60%       | 37.38%         | 95.36%             |
| 4              | 8076     | 17.61%       | 17.62%         | 96.03%             |
| global          | 47226    | 100%         | 38.37%         | 95.74%             |


For assay7 core7, min_pred 0.6:  

Global average
- acc: 95.74%
- %err=input (lowQual): 69.66%
- consensus input = 36.77%

1 to 4 DB consensus

- acc within [95.36%, 96.03%]
- %err=input (lowQual): within [63.43%, 79.39%] = [63.43%, 69.55%, 65.09%, 79.39%]
- consensus input = [60.03%, 41.01%, 35.73%, 15.84%]

### ENCODE core no EpiAtlas + Chip-Atlas (no ENCODE)

In [63]:
encode_dir = (
    base_data_dir
    / "training_results"
    / "predictions"
    / "encode"
    / "assay_epiclass_1l_3000n"
)
encode_pred_path = encode_dir / "encode_only-core-7c_predictions.csv"
encode_pred = pd.read_csv(encode_pred_path)

In [64]:
# Duplicate the ENCODE prediction columns under the names used by the ChIP-Atlas frame.
for alias_col, source_col in {
    "Experimental-id": "md5sum",
    "Predicted_class_assay7": "Predicted class",
    "manual_target_consensus": "True class",
    "Max_pred_assay7": "Max pred",
}.items():
    encode_pred[alias_col] = encode_pred[source_col]

In [65]:
global_pred_df = pd.concat([ca_core_df, encode_pred], ignore_index=True)

In [66]:
# Drop the rows flagged ENCODE == 1; rows without an ENCODE value are kept.
global_pred_df = global_pred_df[global_pred_df["ENCODE"] != 1]

In [None]:
display(global_pred_df["ENCODE"].value_counts(dropna=False))
display(global_pred_df["manual_target_consensus"].value_counts(dropna=False))

In [68]:
global_pred_df = global_pred_df[
    global_pred_df["manual_target_consensus"].isin(CORE_ASSAYS)
]

In [None]:
min_pred = 0.6
print_pred_within_threshold(global_pred_df, min_pred=min_pred)
print_breakdown_predictions(global_pred_df, min_pred=min_pred, nb_classes=7)

In [None]:
base_data_dir

In [71]:
output_dir = base_data_dir / "training_results" / "predictions"

In [72]:
mislabels = global_pred_df[
    global_pred_df["Predicted_class_assay7"] != global_pred_df["manual_target_consensus"]
]
# mislabels.to_csv(output_dir / "mislabels_C-A&ENCODE_assay7.csv", index=False)

#### Summary

Main stats

- Nb pred assay7 (pred score >= 0.60): 89.49% (39910/44599)
- Nb match assay7: 95.65% (38172/39910)
- Nb error assay7: 4.35% (1738/39910)

Following ratios: % of assay subset OR % of all predictions OR % of all incorrect predictions (potential mislabels).   

- Predictions as input: 40.39% (16119/39910)
- Correct predictions as input: 92.75% (14951/16119) OR 37.46% (14951/39910)
- Incorrect predictions as input: 7.25% (1168/16119) OR 2.93% (1168/39910) OR 67.20% (1168/1738)


### Chip-Atlas with ENCODE, excluding intersection of ENCODE and EpiAtlas

In [73]:
encode_metadata_dir = base_data_dir / "metadata" / "encode"
encode_epiatlas_mapping_path = encode_metadata_dir / "ENCODE_IHEC_keys.tsv"
encode_epiatlas_mapping_df = pd.read_csv(encode_epiatlas_mapping_path, sep="\t")

In [None]:
encode_epiatlas_mapping_df.head()

##### ca pred 2.1 / with "is_EpiAtlas_EpiRR"

In [75]:
# Create new metadata that includes a column for EpiAtlas_EpiRR overlap
# enc_df = encode_epiatlas_mapping_df[["is_EpiAtlas_EpiRR", "accession"]]

# current_ca_filename = "CA_metadata_4DB+all_pred.20240606_mod2.tsv"
# mod2_ca_metadata = pd.read_csv(ca_dir / current_ca_filename, sep="\t", low_memory=False)
# new_pred_df = mod2_ca_metadata.merge(enc_df, left_on="ENCODE_GSE", right_on="accession", how="left").drop_duplicates()

# new_pred_df.drop(columns=["accession"], inplace=True)
# new_pred_df["is_EpiAtlas_EpiRR"].fillna(0, inplace=True)

# assert ca_pred_df.shape[0] == new_pred_df.shape[0]

# new_path = ca_dir / current_ca_filename.replace("mod2.tsv", "mod2.1.tsv")
# new_pred_df.to_csv(ca_dir / new_path, sep="\t", index=False)

In [None]:
# Accessions of the mapping rows that have a non-missing is_EpiAtlas_EpiRR value.
to_exclude_encode = encode_epiatlas_mapping_df[
    encode_epiatlas_mapping_df["is_EpiAtlas_EpiRR"].notna()
]["accession"].to_list()

print(len(to_exclude_encode))

In [77]:
ca_core_df_no_epiatlas = ca_core_df[~ca_core_df["ENCODE_GSE"].isin(to_exclude_encode)]

In [None]:
# Report (without raising) when the number of removed rows differs from the number
# of accessions listed for exclusion.
if not ca_core_df_no_epiatlas.shape[0] + len(to_exclude_encode) == ca_core_df.shape[0]:
    print("Mismatch in the number of files.")
    print(
        f"{ca_core_df_no_epiatlas.shape[0]} + {len(to_exclude_encode)} = {ca_core_df_no_epiatlas.shape[0] + len(to_exclude_encode)} != {ca_core_df.shape[0]}"
    )
    print(
        f"Only {ca_core_df.shape[0] - ca_core_df_no_epiatlas.shape[0]}/{len(to_exclude_encode)} files excluded."
    )

In [None]:
min_pred = 0.6
print_pred_within_threshold(ca_core_df_no_epiatlas, min_pred=min_pred)
print_breakdown_predictions(ca_core_df_no_epiatlas, min_pred=min_pred, nb_classes=7)

In [None]:
ca_core_df_no_epiatlas.columns

#### Add 9n-nc

In [81]:
new_9c_nc_preds_path = ca_dir / "C-A_predictions_9c-nc.csv"
new_9c_nc_preds = pd.read_csv(new_9c_nc_preds_path, sep=",")

In [82]:
# Keep the first column name as is; tag every other column with "_assay9nc"
# and replace spaces by underscores.
current_cols = list(new_9c_nc_preds.columns)
new_cols_names = [
    name if position == 0 else f"{name}_assay9nc".replace(" ", "_")
    for position, name in enumerate(current_cols)
]
new_9c_nc_preds.columns = new_cols_names

In [None]:
print(ca_core_df_no_epiatlas.shape)
# Left merge: every ChIP-Atlas row is kept; the 9c-nc prediction columns are added
# where "Experimental-id" matches the "md5sum" of the new predictions.
ca_core_df_no_epiatlas_2 = ca_core_df_no_epiatlas.merge(
    new_9c_nc_preds,
    left_on="Experimental-id",
    right_on="md5sum",
    how="left",
    suffixes=("", "_9nc"),
)
# Drop the suffixed duplicate of the right-hand key column.
# NOTE(review): "md5sum_9nc" only exists if the left frame already has an "md5sum" column — confirm.
ca_core_df_no_epiatlas_2 = ca_core_df_no_epiatlas_2.drop(labels="md5sum_9nc", axis=1)
# The row count should be unchanged if the right-hand keys are unique.
print(ca_core_df_no_epiatlas_2.shape)

In [None]:
ca_core_df_no_epiatlas_2.columns[144:]

In [None]:
# core7 stats
print_pred_within_threshold(
    ca_core_df_no_epiatlas_2, min_pred=min_pred, col="Max_pred_assay9nc"
)
print_breakdown_predictions(ca_core_df_no_epiatlas_2, min_pred=min_pred, nb_classes="9nc")
display(ca_core_df_no_epiatlas_2["Predicted_class_assay9nc"].value_counts())
display(ca_core_df_no_epiatlas_2["manual_target_consensus"].value_counts())

In [None]:
# Per consensus target: for increasing score thresholds, the distribution of the
# SECOND predicted class among samples whose first prediction is "non-core".
results_min_pred = {}

sub_df = ca_core_df_no_epiatlas_2[
    ca_core_df_no_epiatlas_2["Predicted_class_assay9nc"] == "non-core"
].copy()

for min_pred in np.arange(0, 1, 0.05):
    filtered_df = sub_df[sub_df["Max_pred_assay9nc"] >= min_pred]

    # Group by a scalar key, not a one-element list: with a list, recent pandas
    # versions yield 1-tuples as group keys, which breaks the `target != "input"`
    # test and the `pred_count` lookup below.
    for target, df in filtered_df.groupby("manual_target_consensus"):
        target_results = results_min_pred.setdefault(
            target,
            {
                "min_pred": [],
                "input": [],
                "target": [],
                "Total ok": [],
                r"%correct_excluding_input": [],
            },
        )

        # Share of each second-predicted class within the group.
        pred_count = df["2nd_pred_class_assay9nc"].value_counts(normalize=True)
        # .get: a class absent from the group counts as 0 instead of raising KeyError.
        input_perc = pred_count.get("input", 0.0)
        target_perc = pred_count.get(target, 0.0)

        target_results["min_pred"].append(min_pred)
        target_results["input"].append(input_perc * 100)
        target_results["target"].append(target_perc * 100)

        if target != "input" and input_perc < 1:
            target_results["Total ok"].append((input_perc + target_perc) * 100)
            target_results[r"%correct_excluding_input"].append(
                (target_perc / (1 - input_perc)) * 100
            )
        elif target != "input":
            # Every second prediction is "input": nothing remains once input is excluded.
            target_results["Total ok"].append((input_perc + target_perc) * 100)
            target_results[r"%correct_excluding_input"].append(None)
        else:
            target_results["Total ok"].append(None)
            target_results[r"%correct_excluding_input"].append(None)

# Optionally, create a table with all the data
table_data = [
    [
        target,
        data["min_pred"][i],
        data["input"][i],
        data["target"][i],
        data["Total ok"][i],
        data[r"%correct_excluding_input"][i],
    ]
    for target, data in results_min_pred.items()
    for i in range(len(data["min_pred"]))
]

df_results = pd.DataFrame(
    table_data,
    columns=[
        "Target",
        "min_pred",
        "input",
        "target",
        "Total ok",
        r"%correct_excluding_input",
    ],
)
cols = df_results.columns.to_list()
cols.remove("min_pred")

# Only the rows of the lowest threshold (min_pred == 0) are displayed.
# pylint: disable=consider-using-f-string
with pd.option_context("display.float_format", "{:.2f}".format):
    display(df_results[df_results["min_pred"] < 0.05][cols].sort_values(by="Target"))  # type: ignore

In [87]:
# Same second-prediction analysis as for the score cutoff, but thresholding on
# the gap between the first and second prediction probabilities
# ("1rst/2nd_prob_diff_assay9nc") instead.
results_pred_diff = {}

# The "non-core" subset does not depend on the cutoff: compute it once.
non_core_df = ca_core_df_no_epiatlas_2[
    ca_core_df_no_epiatlas_2["Predicted_class_assay9nc"] == "non-core"
]

for min_pred_diff in np.arange(0, 0.6, 0.05):
    sub_df = non_core_df[non_core_df["1rst/2nd_prob_diff_assay9nc"] >= min_pred_diff]

    # Scalar key on purpose: a one-element list makes pandas >= 2.0 yield
    # 1-tuples as group keys, breaking the string comparisons below.
    groupby = sub_df.groupby("manual_target_consensus")
    for target, df in groupby:
        if target not in results_pred_diff:
            results_pred_diff[target] = {
                "min_pred_diff": [],
                r"%correct_excluding_input": [],
            }

        # Fractions (0-1) of each second-predicted class within this group.
        pred_count = df["2nd_pred_class_assay9nc"].value_counts(normalize=True)
        # `.get` avoids a KeyError when "input" (or the target) never appears
        # as a second prediction in this group.
        input_perc = pred_count.get("input", 0)
        target_perc = pred_count.get(target, 0)

        if target != "input":
            non_input_perc = 1 - input_perc
            # Undefined when every second prediction is "input".
            correct_excluding_input = (
                target_perc / non_input_perc * 100 if non_input_perc > 0 else np.nan
            )
            results_pred_diff[target]["min_pred_diff"].append(min_pred_diff)
            results_pred_diff[target][r"%correct_excluding_input"].append(
                correct_excluding_input
            )

In [89]:
# Disabled plotting cell, kept for reference: for each results dict, one line per
# non-"input" target of %correct_excluding_input against the threshold values.
# for df, name in [(results_min_pred, "min_pred"), (results_pred_diff, "min_pred_diff")]:
#     fig = go.Figure()

#     for target, data in df.items():
#         if target != "input":
#             fig.add_trace(
#                 go.Scatter(
#                     x=data[name],
#                     y=data[r"%correct_excluding_input"],
#                     mode="lines+markers",
#                     marker=dict(color=assay_colors[target]),
#                     name=target,
#                 )
#             )

#     fig.update_layout(
#         title=f"Performance by Target and {name}",
#         xaxis_title=name,
#         yaxis_title=r"%correct_excluding_input",
#         legend_title="Target",
#         hovermode="x unified",
#     )

#     fig.update_yaxes(range=[80, 95])

#     fig.show()

In [90]:
# Confusion matrix setup for assay9nc, keeping only predictions whose top score
# reaches `min_pred`. The writer calls themselves are currently disabled.
min_pred = 0.6
col = "Predicted_class_assay9nc"

# NOTE(review): labels are taken from the predicted classes only; if the curated
# targets contain a class that is never predicted, the disabled sk_cm call below
# would produce a matrix that does not line up with `labels` — confirm before
# re-enabling.
labels = ca_core_df_no_epiatlas_2[col].unique().tolist()
labels.sort()

fig_dir = base_fig_dir / "fig_C-A" / "confusion_matrices"
df = ca_core_df_no_epiatlas_2.loc[
    ca_core_df_no_epiatlas_2["Max_pred_assay9nc"] >= min_pred
]

# cm = sk_cm(df["manual_target_consensus"], df[col])
# cm_writer = ConfusionMatrixWriter(labels=labels, confusion_matrix=cm)
# name = f"confusion_matrix_assay9nc_core7_minPred{min_pred:.02f}"
# cm_writer.to_all_formats(logdir=fig_dir, name=name)

#### Summary

Main stats  

assay7  

- Nb pred assay7 (pred score >= 0.60): 89.49% (41326/46179)
- Nb match assay7: 95.66% (39532/41326)
- Nb error assay7: 4.34% (1794/41326)

Following ratios: % of assay subset OR % of all predictions OR % of all incorrect predictions (potential mislabels).   

- Predictions as input: 39.79% (16442/41326)
- Correct predictions as input: 92.42% (15195/16442) OR 36.77% (15195/41326)
- Incorrect predictions as input: 7.58% (1247/16442) OR 3.02% (1247/41326) OR 69.51% (1247/1794)

assay9nc

- Nb pred assay9nc (pred score >= 0.60): 92.10% (42531/46179)
- Nb match assay9nc: 25.88% (11007/42531)
- Nb error assay9nc: 74.12% (31524/42531)  --> mostly non-core preds

Following ratios: % of assay subset OR % of all predictions OR % of all incorrect predictions (potential mislabels).   

- Predictions as input: 0.50% (211/42531)
- Correct predictions as input: 93.36% (197/211) OR 0.46% (197/42531)
- Incorrect predictions as input: 6.64% (14/211) OR 0.03% (14/42531) OR 0.04% (14/31524)


## Metadata complementation

In [None]:
# Collect the category suffix (text after the last underscore) of every
# "True_class" column that is not an assay column, printing each column name.
other_categories = []
for name in ca_core_df.columns:
    if "True_class" not in name or "assay" in name:
        continue
    print(name)
    other_categories.append(name.rsplit("_", maxsplit=1)[-1])

In [None]:
# On throwaway copies of both frames, cast each Max_pred_<category> column to
# float and report the columns that cannot be converted.
for df in [ca_core_df.copy(), ca_core_df_no_epiatlas.copy()]:
    print(df.shape)
    for name in other_categories:
        col = f"Max_pred_{name}"
        try:
            # Replace the column outright. `df.loc[:, col] = ...` writes into the
            # existing array on pandas >= 2.0, so an object column would stay
            # object and the cast would be lost.
            df[col] = df[col].astype(float)
        except ValueError:
            print(f"Problem with {col}")
            continue
        # print(col)
        # col_pred = f"Predicted_class_{name}"
        # preds = df[col_pred].value_counts(dropna=False)
        # display(preds)

        # print_pred_within_threshold(df, min_pred=0.6, col=col)

Chip-Atlas all core7

Predictions with pred score >= 0.60 (of total 47226)
| Category    | % Samples | Nb Samples |
|-------------|-----------|------------|
| cancer2      | 93.95%    | 44371      |
| donor_life_stage   | 94.91%    | 44823  |
| sex3         | 87.15%    | 41156      |
| biomaterial_type      | 79.39%    | 37493      |
| paired_end      | 95.94%    | 45310      |


Chip-Atlas all core7, no epiatlas encode

Predictions with pred score >= 0.60 (of total 46179)
| Category             | % Samples | Nb Samples |
|----------------------|-----------|------------|
| cancer2              | 93.93%    | 43375      |
| donor_life_stage     | 94.88%    | 43815      |
| sex3                 | 87.02%    | 40186      |
| biomaterial_type     | 79.17%    | 36562      |
| paired_end           | 95.96%    | 44313      |


### track type

In [141]:
# Load the track-type predictions for the C-A samples from the paper data folder.
track_type_pred_path = base_data_dir.joinpath(
    "training_results",
    "predictions",
    "C-A",
    "track_type",
    "split0_test_prediction_C-A_100kb_all_none.csv",
)
track_type_pred_df = pd.read_csv(filepath_or_buffer=track_type_pred_path)

In [142]:
# Row-wise maximum over every column from position 3 onward.
# NOTE(review): presumably those are the per-class probability columns — confirm
# the first three columns are the id / true / predicted fields.
track_type_pred_df["Max_pred_track_type"] = track_type_pred_df.iloc[:, 3:].max(axis=1)

In [None]:
# Attach the Chip-Atlas metadata/predictions to the track-type predictions,
# keeping only ids present in both tables, then compare the three shapes.
track_type_df = pd.merge(
    track_type_pred_df,
    ca_pred_df,
    how="inner",
    left_on="Unnamed: 0",
    right_on="Experimental-id",
)

print(*(frame.shape for frame in (track_type_df, ca_pred_df, track_type_pred_df)))

In [149]:
# Write every pivot table, one after the other, into a single CSV file: for each
# score cutoff, a count table (curated target x "Predicted class") followed by
# the same table expressed as a percentage of each row's "All" total.
output = track_type_pred_path.parent / "track_type_predictions_pivot.csv"

# Mode "w" truncates any file left by a previous run, so there is no need to
# delete it first and reopen in append mode.
with open(output, "w", encoding="utf8") as csv_stream:
    for min_pred in [0, 0.6, 0.8]:
        df = track_type_df[track_type_df["Max_pred_track_type"] >= min_pred]
        # margins=True adds the "All" row/column used for the percentages below.
        pivot = df.pivot_table(
            index="manual_target_consensus",
            columns="Predicted class",
            values="Max_pred_track_type",
            aggfunc="count",
            fill_value=0,
            margins=True,
        ).astype(int)
        relative_pivot = pivot.div(pivot["All"], axis=0) * 100

        csv_stream.write(f"Count Pivot - Min pred: {min_pred}\n")
        pivot.to_csv(csv_stream)
        csv_stream.write("\n")

        csv_stream.write(f"Relative Pivot - Min pred: {min_pred}\n")
        relative_pivot.to_csv(csv_stream)
        csv_stream.write("\n")

        # display(pivot)
        # with pd.option_context("display.float_format", "{:.2f}".format):
        #     display(relative_pivot)