In [None]:
"""Workbook to analyse Chip-Atlas predictions, destined for the paper.
"""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, unused-import, unused-argument, too-many-branches, pointless-statement

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from __future__ import annotations

from pathlib import Path

import numpy as np

# import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.metrics import confusion_matrix as sk_cm

from epi_ml.core.confusion_matrix import ConfusionMatrixWriter
from epi_ml.utils.notebooks.paper.paper_utilities import ASSAY_ORDER

# import plotly.express as px
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

## Setup

In [None]:
ASSAY_ORDER

In [None]:
# Root directory of the paper outputs; the data and figure folders hang off it.
base_dir = Path.home().joinpath("Projects", "epiclass", "output", "paper")
paper_dir = base_dir  # alias of base_dir
base_data_dir = base_dir.joinpath("data")
base_fig_dir = base_dir.joinpath("figures")

# Fail early when the figure directory is missing.
figures_dir_found = base_fig_dir.exists()
if not figures_dir_found:
    raise FileNotFoundError(f"Directory {base_fig_dir} does not exist.")

In [None]:
# Tab-separated table of Chip-Atlas metadata with prediction columns.
ca_pred_path = base_data_dir.joinpath(
    "training_results",
    "C-A",
    "CA_metadata_4DB+all_pred_subset.20240606.tsv",
)
# low_memory=False: parse the file in one pass rather than in internal chunks.
ca_pred_df = pd.read_csv(ca_pred_path, sep="\t", low_memory=False)

| Assay | Exp Key                               | Nb Files | Training Size | Oversampling | Expected Nb Files                      |
|-------|---------------------------------------|----------|---------------|--------------|---------------------------------------|
| 13c   | dd3710b73c0341af85a17ce1998362d0      | 24989    | 116550        | true         | 24989                                 |
| 11c   | 0f8e5eb996114868a17057bebe64f87c      | 20922    | 46128         | true         | 20922                                 |
| 7c    | 69488630801b4a05a53b5d9e572f0aaa      | 16788    | 34413         | true         | 16788 (cross-checked)                 |

*using hg38_2023-epiatlas-dfreeze_v2.1_w_encode_noncore_2


In [None]:
ca_pred_df.columns

In [None]:
# The first seven entries of ASSAY_ORDER are used as the "core" assay labels below.
core_assays = ASSAY_ORDER[:7]

In [None]:
# Metadata columns that are compared against `manual_target_consensus` further down;
# presumably one target annotation per source database — TODO confirm.
db_cols = ["GEO_mod", "C-A", "Cistrome", "NGS_mod"]

In [None]:
# Keep only rows whose `Max_pred_assay13` or `Max_pred_assay7` score is strictly
# above `min_pred`; the shape is printed before and after to show how many rows remain.
min_pred = 0.8
print(ca_pred_df.shape)
confident_assay13 = ca_pred_df["Max_pred_assay13"].astype(float).gt(min_pred)
confident_assay7 = ca_pred_df["Max_pred_assay7"].astype(float).gt(min_pred)
ca_pred_df = ca_pred_df.loc[confident_assay13 | confident_assay7]
print(ca_pred_df.shape)

## Analysis

In [None]:
def print_high_level_pred_info(df: pd.DataFrame) -> None:
    """Display high-level information about the predictions in `df`.

    For each label in the module-level `core_assays`, the rows whose
    `manual_target_consensus` equals that label are selected and, for each of
    the `Predicted_class_assay7/11/13` columns, the distribution of predicted
    classes is displayed as percentages. For the assay11 and assay13 columns,
    the distribution of the matching `2nd_pred_class_*` column among the
    mismatching rows is displayed as well.

    Afterwards, the distribution of `manual_target_consensus` is displayed for
    rows predicted as "wgbs-standard" (assay11 and assay13) and for rows
    predicted as "non-core" (assay13), as counts and as percentages.

    Args:
        df: Prediction table. Must contain `manual_target_consensus`,
            `Predicted_class_assay7`, `Predicted_class_assay11`,
            `Predicted_class_assay13`, `2nd_pred_class_assay11` and
            `2nd_pred_class_assay13`.
    """
    for assay in core_assays:
        print(f"{assay}")
        assay_df = df[df["manual_target_consensus"] == assay]
        for col in [
            "Predicted_class_assay7",
            "Predicted_class_assay11",
            "Predicted_class_assay13",
        ]:
            # "Predicted_class_assay11" -> "assay11"
            assay_number = col.rsplit("_", maxsplit=1)[-1]
            display(assay_df[col].value_counts() / len(assay_df) * 100)
            # Second-choice predictions are only inspected for the 11/13 columns.
            if any(label in col for label in ["11", "13"]):
                wrong_pred = assay_df[assay_df[col] != assay]

                display(
                    wrong_pred[f"2nd_pred_class_{assay_number}"].value_counts()
                    / len(wrong_pred)
                    * 100
                )
        print("\n")

    # The summaries below use the `df` argument (not a notebook-level global), so
    # the function reports on whatever table the caller passes in.
    print("What is the actual target when wgbs-standard is predicted?")
    for assay_number in ["assay11", "assay13"]:
        print(f"{assay_number}")
        wgbs_dist = df[df[f"Predicted_class_{assay_number}"] == "wgbs-standard"][
            "manual_target_consensus"
        ]
        display(wgbs_dist.value_counts())
        display(wgbs_dist.value_counts() / len(wgbs_dist) * 100)

    print("What is the actual target when non-core is predicted?")
    col = "Predicted_class_assay13"
    non_core_dist = df[df[col] == "non-core"]["manual_target_consensus"]
    display(non_core_dist.value_counts())
    display(non_core_dist.value_counts() / len(non_core_dist) * 100)

In [None]:
# print_high_level_pred_info(ca_pred_df)

In [None]:
ca_pred_df["manual_target_consensus"].value_counts(dropna=False)

In [None]:
# Labels of `manual_target_consensus` that are excluded from the core-assay analysis.
non_core_labels = ["no_consensus", "non-core", "CTCF"]
# Filter ROWS on the consensus column only. Calling `isin` on the whole frame and
# indexing with the resulting boolean frame would instead blank out (NaN) every
# matching cell in every column, e.g. database columns holding "CTCF".
ca_core_df = (
    ca_pred_df.loc[~ca_pred_df["manual_target_consensus"].isin(non_core_labels)]
    .dropna(subset=["manual_target_consensus"])
    # Independent copy: a later cell adds a column to this frame.
    .copy()
)

In [None]:
ca_core_df.shape

In [None]:
# Rows whose manual target annotation is the literal label "no_consensus".
is_no_consensus = ca_pred_df["manual_target_consensus"].eq("no_consensus")
no_consensus_df = ca_pred_df.loc[is_no_consensus]

In [None]:
no_consensus_df.shape

In [None]:
# For each score threshold, report how many "no_consensus" rows have a
# `Max_pred_assay7` score at or above it, which classes they are predicted as,
# how many of them are not predicted as "input", and their ENCODE value counts.
for min_pred in [0.6, 0.8]:
    # for min_pred in np.arange(0.6, 1.05, 0.05):
    break_tie_mask = no_consensus_df["Max_pred_assay7"] >= min_pred
    nb_break_tie = break_tie_mask.sum()
    print(
        f"Break no_consensus (minPred >= {min_pred:.02f}): {nb_break_tie/ len(no_consensus_df) * 100:.02f}% ({nb_break_tie}/{len(no_consensus_df)})"
    )
    df = no_consensus_df[break_tie_mask]
    # display(df[no_consensus_df.columns[2:10]])
    display(df.value_counts("Predicted_class_assay7"))
    nb_not_input = (df["Predicted_class_assay7"] != "input").sum()
    # The closing parenthesis after the percentage was missing in the message.
    print(
        f"non-input tie breakers: {nb_not_input}/{nb_break_tie} ({nb_not_input/len(df) * 100:.02f}%)\n"
    )
    print(df["ENCODE"].value_counts(), "\n")

### Notes

Keeping only core 7.

WITH ENCODE

Nb pred assay7 (pred score >= 0.60): 93.41% (41323/44236) VS 89% of total predictions (including non-core7)  
Nb match assay7: 96.08% (39704/41323)  
Nb error assay7: 3.92% (1619/41323)  

Of pred score >= 0.60:
- Predictions as input: 39.36% (16266/41323)  
- Correct predictions as input: 93.08% (15141/16266) OR 36.64% (15141/41323)  
- Incorrect predictions as input: 6.92% (1125/16266) OR 2.72% (1125/41323) OR 69.49% (1125/1619)  

NO ENCODE  

Nb pred assay7 (pred score >= 0.60): 92.70%  
Nb match assay7: 95.61% (33303/34832)  
Nb error assay7: 4.39% (1529/34832)  

Of pred score >= 0.60:  
- Predictions as input: 40.61% (14145/34832)  
- Correct predictions as input: 92.60% (13098/14145) OR 37.60% (13098/34832)
- Incorrect predictions as input: 7.40% (1047/14145) OR 3.01% (1047/34832) OR 68.48% (1047/1529)

BREAK CONSENSUS (does not contain any ENCODE data)  

- Break no_consensus (minPred >= 0.60): 92.59% (462/499) & non-input 189/462 (40.91%)  
- Break no_consensus (minPred >= 0.80): 80.56% (402/499) & non-input 163/402 (40.55%)  


In [None]:
ca_core_df["ENCODE"].value_counts()

In [None]:
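# Uncomment to keep only rows where ENCODE == 0 before running the cells below;
# the confusion-matrix cell then appends "_noENCODE" to its output file names.
# ca_core_df = ca_core_df[ca_core_df["ENCODE"] == 0]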

In [None]:
# Share of `ca_core_df` rows whose `Max_pred_assay7` score reaches each threshold
# produced by np.arange(0, 1.05, 0.05).
nb_core_rows = len(ca_core_df)
for min_pred in np.arange(0, 1.05, 0.05):
    nb_pred = ca_core_df["Max_pred_assay7"].ge(min_pred).sum()
    pred_ratio = nb_pred / nb_core_rows
    print(f"Nb pred assay7 (pred score >= {min_pred:.02f}): {pred_ratio:.2%}")

In [None]:
def _safe_ratio(numerator, denominator) -> float:
    """Return `numerator / denominator`, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


# For each minimum prediction score:
#   1. keep the core-assay rows whose `Max_pred_assay7` reaches the threshold,
#   2. report how many predictions match `manual_target_consensus`,
#   3. write the confusion matrix (consensus vs. prediction) to `fig_dir`,
#   4. print, per predicted assay, the share of correct/incorrect predictions.
fig_dir = base_fig_dir / "fig_C-A"
for min_pred in [0.6, 0.8, 0.9]:
    print("Min pred score:", min_pred)
    sub_df = ca_core_df[ca_core_df["Max_pred_assay7"] >= min_pred]
    match_consensus = (
        sub_df["manual_target_consensus"] == sub_df["Predicted_class_assay7"]
    )
    nb_match = match_consensus.sum()
    nb_error = (~match_consensus).sum()
    nb_total = len(sub_df)
    print(f"Nb match assay7: {_safe_ratio(nb_match, nb_total):.2%} ({nb_match}/{nb_total})")
    print(
        f"Nb error assay7: {_safe_ratio(nb_error, nb_total):.2%} ({nb_error}/{nb_total})\n"
    )

    correct_pred_df = sub_df[match_consensus]
    incorrect_pred_df = sub_df[~match_consensus]

    confusion_mat = sk_cm(
        sub_df["manual_target_consensus"],
        sub_df["Predicted_class_assay7"],
        labels=core_assays,
    )

    mat_writer = ConfusionMatrixWriter(labels=core_assays, confusion_matrix=confusion_mat)
    name = f"confusion_matrix_assay7_core7_minPred{min_pred:.02f}"
    # Tag the output files when the `ENCODE` column sums to zero for this subset.
    if sub_df["ENCODE"].sum() == 0:
        name += "_noENCODE"
    mat_writer.to_all_formats(logdir=fig_dir, name=name)

    print(
        r"Following ratios: % of assay subset OR % of all predictions OR % of all incorrect predictions (potential mislabels).",
        "\n",
    )
    nb_incorrect_total = len(incorrect_pred_df)
    for assay in core_assays:
        assay_df = sub_df[sub_df["Predicted_class_assay7"] == assay]
        nb_assay = len(assay_df)

        nb_assay_correct = len(
            correct_pred_df[correct_pred_df["Predicted_class_assay7"] == assay]
        )
        nb_assay_incorrect = len(
            incorrect_pred_df[incorrect_pred_df["Predicted_class_assay7"] == assay]
        )

        print(
            f"Predictions as {assay}: {_safe_ratio(nb_assay, nb_total):.2%} ({nb_assay}/{nb_total})"
        )
        # These counts are plain Python ints, so an assay with no predictions (or a
        # threshold with no errors) would raise ZeroDivisionError without the guard.
        perc_cor = _safe_ratio(nb_assay_correct, nb_assay)
        perc_cor2 = _safe_ratio(nb_assay_correct, nb_total)
        perc_inc = _safe_ratio(nb_assay_incorrect, nb_assay)
        perc_inc2 = _safe_ratio(nb_assay_incorrect, nb_total)
        perc_inc3 = _safe_ratio(nb_assay_incorrect, nb_incorrect_total)

        print(
            f"Correct predictions as {assay}: {perc_cor:.2%} ({nb_assay_correct}/{nb_assay}) OR {perc_cor2:.2%} ({nb_assay_correct}/{nb_total})"
        )
        print(
            f"Incorrect predictions as {assay}: "
            f"{perc_inc:.2%} ({nb_assay_incorrect}/{nb_assay}) OR "
            f"{perc_inc2:.2%} ({nb_assay_incorrect}/{nb_total}) OR "
            f"{perc_inc3:.2%} ({nb_assay_incorrect}/{nb_incorrect_total})\n"
        )

In [None]:
# Number of rows per combination of database labels, consensus target and assay7
# prediction. Only a cell's last expression is rendered automatically, so the
# result is displayed explicitly instead of being silently discarded.
# Note: groupby drops rows with a NaN in any grouping key by default.
display(
    ca_core_df.groupby(
        db_cols + ["manual_target_consensus", "Predicted_class_assay7"]
    ).size()
)

# Per row, count how many of the database columns equal the consensus label.
reference_column = "manual_target_consensus"
columns_to_check = db_cols
ca_core_df["manual_target_consensus_size"] = (
    ca_core_df[columns_to_check].eq(ca_core_df[reference_column], axis=0)
).sum(axis=1)

for col in ["manual_target_consensus", "manual_target_consensus_size"]:
    print(ca_core_df[col].value_counts(dropna=False))

# Rows whose consensus label matches none of the database columns.
wut = ca_core_df[ca_core_df["manual_target_consensus_size"] == 0]
display(wut)