In [None]:
"""Workbook to analyse Chip-Atlas predictions, destined for the paper.
"""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, unused-import, unused-argument, too-many-branches, pointless-statement

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from __future__ import annotations

from pathlib import Path

import numpy as np

# import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.metrics import confusion_matrix as sk_cm

from epi_ml.core.confusion_matrix import ConfusionMatrixWriter
from epi_ml.utils.notebooks.paper.paper_utilities import ASSAY_ORDER

# import plotly.express as px
# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

## Setup

In [None]:
# Bare expression: shows the assay ordering imported from paper_utilities.
ASSAY_ORDER

In [None]:
# Every input and output of this workbook lives under a single paper root
# located in the user's home directory.
base_dir = Path.home().joinpath("Projects/epiclass/output/paper")
base_data_dir = base_dir.joinpath("data")
base_fig_dir = base_dir.joinpath("figures")
paper_dir = base_dir  # second name for the same root

# Fail immediately when the figures directory is missing.
if not base_fig_dir.exists():
    raise FileNotFoundError(f"Directory {base_fig_dir} does not exist.")

In [None]:
# Location of the C-A training results and of the tab-separated predictions file.
ca_dir = base_data_dir / "training_results" / "C-A"
ca_pred_path = ca_dir / "CA_metadata_4DB+all_pred_subset.20240606.tsv"
# low_memory=False: parse the file in one pass rather than in chunks, so dtype
# inference is done on whole columns.
ca_pred_df = pd.read_csv(ca_pred_path, sep="\t", low_memory=False)

| Assay | Exp Key                               | Nb Files | Training Size | Oversampling | Expected Nb Files                      |
|-------|---------------------------------------|----------|---------------|--------------|---------------------------------------|
| 13c   | dd3710b73c0341af85a17ce1998362d0      | 24989    | 116550        | true         | 24989                                 |
| 11c   | 0f8e5eb996114868a17057bebe64f87c      | 20922    | 46128         | true         | 20922                                 |
| 7c    | 69488630801b4a05a53b5d9e572f0aaa      | 16788    | 34413         | true         | 16788 (cross-checked)                 |

*using hg38_2023-epiatlas-dfreeze_v2.1_w_encode_noncore_2


In [None]:
# Bare expression: list the columns available in the predictions table.
ca_pred_df.columns

In [None]:
# Core assays = the first seven entries of ASSAY_ORDER.
core_assays = ASSAY_ORDER[0:7]

In [None]:
# Columns later compared against `manual_target_consensus` to count how many
# of them agree with it. Presumably one target annotation per source database
# -- TODO confirm against the metadata description.
db_cols = ["GEO_mod", "C-A", "Cistrome", "NGS_mod"]

In [None]:
def display_perc(df: pd.DataFrame | pd.Series):
    """Render a DataFrame or Series with floats shown to two decimal places.

    The pandas float format is overridden only for the duration of the call.
    Values are displayed as given: they are not multiplied by 100.
    """
    # pylint: disable=consider-using-f-string
    two_decimals = "{:.2f}".format
    with pd.option_context("display.float_format", two_decimals):
        display(df)

### File correction

`CA_metadata_4DB+all_pred.20240606.tsv` has some mistakes in `GEO_mod` and `manual_target_consensus`. The values from `CA_metadata_mod2.tsv` are used to overwrite them.

In [None]:
# Corrected metadata table, stored next to the predictions file.
ca_mod_path = ca_dir / "CA_metadata_mod2.tsv"
ca_mod_df = pd.read_csv(ca_mod_path, sep="\t", low_memory=False)
# Peek at the first two rows.
ca_mod_df.head(n=2)

In [None]:
# Column to overwrite -> column from the corrected metadata that replaces it.
corrections = {
    "manual_target_consensus": "manual_target_consensus2",
    "GEO_mod": "GEO_mod2",
}
# Left join keeps every prediction row; rows without a match receive NaN.
ca_pred_df = ca_pred_df.merge(ca_mod_df, how="left", on="Experimental-id")
ca_pred_df[list(corrections)] = ca_pred_df[list(corrections.values())]
# The helper columns are no longer needed once their values are copied over.
ca_pred_df = ca_pred_df.drop(columns=list(corrections.values()))

## Analysis

In [None]:
def print_high_level_pred_info(df: pd.DataFrame) -> None:
    """Print and display high level information about the predictions in `df`.

    For every assay in `core_assays`, the rows of `df` whose
    `manual_target_consensus` equals that assay are selected and, for each of
    the 7/11/13-class prediction columns, the distribution of predicted
    classes is displayed (in % of the selected rows). For the 11 and 13 class
    columns, the distribution of the second prediction among the rows that
    were NOT predicted as the assay is displayed as well.

    Then the distribution of `manual_target_consensus` is displayed for rows
    predicted as "wgbs-standard" (11 and 13 classes) and for rows predicted as
    "non-core" (13 classes), as counts and as percentages.

    All statistics are computed from the `df` argument only.

    Args:
        df: Predictions table. Must contain `manual_target_consensus`,
            `Predicted_class_assay{7,11,13}` and `2nd_pred_class_assay{11,13}`.
    """
    pred_cols = [
        "Predicted_class_assay7",
        "Predicted_class_assay11",
        "Predicted_class_assay13",
    ]
    for assay in core_assays:
        print(f"{assay}")
        assay_df = df[df["manual_target_consensus"] == assay]
        for col in pred_cols:
            # "Predicted_class_assay11" -> "assay11"
            assay_number = col.rsplit("_", maxsplit=1)[-1]
            display(assay_df[col].value_counts() / len(assay_df) * 100)
            # Second predictions are only looked up for the 11/13 class columns.
            if any(label in col for label in ["11", "13"]):
                wrong_pred = assay_df[assay_df[col] != assay]

                display(
                    wrong_pred[f"2nd_pred_class_{assay_number}"].value_counts()
                    / len(wrong_pred)
                    * 100
                )
        print("\n")

    print("What is the actual target when wgbs-standard is predicted?")
    for assay_number in ["assay11", "assay13"]:
        print(f"{assay_number}")
        # Use the function argument, not the notebook-level dataframe.
        is_wgbs = df[f"Predicted_class_{assay_number}"] == "wgbs-standard"
        wgbs_dist = df.loc[is_wgbs, "manual_target_consensus"]
        display(wgbs_dist.value_counts())
        display(wgbs_dist.value_counts() / len(wgbs_dist) * 100)

    print("What is the actual target when non-core is predicted?")
    is_non_core = df["Predicted_class_assay13"] == "non-core"
    non_core_dist = df.loc[is_non_core, "manual_target_consensus"]
    display(non_core_dist.value_counts())
    display(non_core_dist.value_counts() / len(non_core_dist) * 100)

In [None]:
# print_high_level_pred_info(ca_pred_df)

In [None]:
def print_target_info(df: pd.DataFrame) -> None:
    """Show how the `manual_target_consensus` values of `df` are distributed.

    Displays the count of each value (missing values included), prints the
    number of rows, then displays each count divided by the number of rows.
    """
    nb_rows = len(df)
    target_counts = df["manual_target_consensus"].value_counts(dropna=False)

    display(target_counts)
    print("Size of the dataset: ", nb_rows)
    display_perc(target_counts / nb_rows)

In [None]:
# Target distribution over the full table, before any filtering.
print_target_info(ca_pred_df)

In [None]:
# Values of `manual_target_consensus` that are excluded from the core analysis.
non_core_labels = ["no_consensus", "non-core", "CTCF"]

# Filter ROWS on the target column only. Indexing the frame with
# `~ca_pred_df.isin(...)` (a boolean DataFrame) does not drop rows: it blanks
# every matching cell in every column to NaN, which silently erases e.g.
# "non-core" predictions or "CTCF" annotations in the other columns.
is_non_core = ca_pred_df["manual_target_consensus"].isin(non_core_labels)
# Rows without any consensus target are dropped as well. `.copy()` makes the
# result an independent frame, since a column is added to it further down.
ca_core_df = (
    ca_pred_df[~is_non_core].dropna(subset=["manual_target_consensus"]).copy()
)
print(ca_core_df.shape)

In [None]:
# Target distribution once the non-core labels and missing targets are removed.
print_target_info(ca_core_df)

In [None]:
# Rows whose consensus target is the literal label "no_consensus".
no_consensus_df = ca_pred_df[ca_pred_df["manual_target_consensus"] == "no_consensus"]
print(no_consensus_df.shape)

In [None]:
# For each score threshold: how many "no_consensus" files get a 7-class
# prediction at or above the threshold, and how many of those predictions are
# something other than "input".
nb_no_consensus = len(no_consensus_df)
for min_pred in (0.6, 0.8):
    break_tie_mask = no_consensus_df["Max_pred_assay7"].ge(min_pred)
    nb_break_tie = break_tie_mask.sum()
    print(
        f"Break no_consensus (minPred >= {min_pred:.02f}): {nb_break_tie / nb_no_consensus * 100:.02f}% ({nb_break_tie}/{nb_no_consensus})"
    )

    df = no_consensus_df[break_tie_mask]
    nb_not_input = df["Predicted_class_assay7"].ne("input").sum()
    print(
        f"non-input tie breakers: {nb_not_input}/{nb_break_tie} ({nb_not_input / len(df) * 100:.02f}%)\n"
    )

In [None]:
# Count of each value of the `ENCODE` column among the core files, then the
# same counts as a fraction of all core files.
enc_count = ca_core_df["ENCODE"].value_counts()
display(enc_count)
display_perc(enc_count / len(ca_core_df))

In [None]:
def print_pred_within_threshold(
    df: pd.DataFrame, min_pred: float = 0.6, nb_classes: str | int = 7
) -> None:
    """Print how many rows of `df` reach the prediction score threshold.

    Args:
        df: Table containing the column `Max_pred_assay{nb_classes}`.
        min_pred: Inclusive lower bound on the prediction score.
        nb_classes: Suffix selecting which classifier's score column is read.
    """
    nb_total = len(df)
    scores = df[f"Max_pred_assay{nb_classes}"]
    nb_pred = (scores >= min_pred).sum()
    print(
        f"Nb pred assay{nb_classes} (pred score >= {min_pred:.02f}): {nb_pred / nb_total * 100:.02f}% ({nb_pred}/{nb_total})"
    )

In [None]:
# The ENCODE == 0 selection does not depend on the threshold: compute it once,
# then sweep the score threshold over the values of np.arange(0, 1.05, 0.05).
df = ca_core_df[ca_core_df["ENCODE"] == 0]
for min_pred in np.arange(0, 1.05, 0.05):
    print_pred_within_threshold(df, min_pred=min_pred, nb_classes=7)

In [None]:
def save_confusion_matrix(
    df: pd.DataFrame,
    fig_dir: Path | str,
    nb_classes: int | str = 7,
    min_pred: float = 0.6,
):
    """Save the confusion matrix for core assays predictions. Does not filter.

    The matrix compares `manual_target_consensus` (true labels) with
    `Predicted_class_assay{nb_classes}` and is written through
    `ConfusionMatrixWriter.to_all_formats`.

    Args:
        df: Predictions to tabulate, with an `ENCODE` column. Filter on the
            prediction score BEFORE calling: `min_pred` is not applied here.
        fig_dir: Output directory handed to the writer.
        nb_classes: Suffix selecting the prediction column.
        min_pred: Only used to build the output file name.

    Note:
        Rows and columns are pinned to `core_assays`. Samples whose true or
        predicted label is outside `core_assays` do not contribute to the
        matrix (scikit-learn behaviour when `labels` is given).
    """
    col = f"Predicted_class_assay{nb_classes}"
    # Without `labels`, scikit-learn orders rows/columns by the sorted labels
    # found in the data, which need not match the `core_assays` order (or
    # length) announced to the writer below.
    cm = sk_cm(df["manual_target_consensus"], df[col], labels=core_assays)
    cm_writer = ConfusionMatrixWriter(labels=core_assays, confusion_matrix=cm)

    name = f"confusion_matrix_assay{nb_classes}_core7_minPred{min_pred:.02f}"
    # Tag the file when no row has a non-zero ENCODE value.
    if df["ENCODE"].sum() == 0:
        name += "_noENCODE"

    cm_writer.to_all_formats(logdir=fig_dir, name=name)

In [None]:
def _safe_ratio(numerator, denominator) -> float:
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float("nan")
    return numerator / denominator


def print_breakdown_predictions(
    df: pd.DataFrame, min_pred: float = 0.6, nb_classes: int | str = 7
) -> None:
    """Breakdown the predictions, print results.

    Only rows whose `Max_pred_assay{nb_classes}` is >= `min_pred` are kept.
    The function prints the global match/error rate against
    `manual_target_consensus`, then, for each assay in `core_assays`, the
    share of predictions made as that assay and how many are correct or
    incorrect.

    Ratios with an empty denominator (no row above the threshold, an assay
    that is never predicted, no incorrect prediction at all) are printed as
    "nan%" instead of raising ZeroDivisionError.

    Args:
        df: Table with `Max_pred_assay{nb_classes}`,
            `Predicted_class_assay{nb_classes}` and `manual_target_consensus`.
        min_pred: Inclusive lower bound on the prediction score.
        nb_classes: Suffix selecting which classifier's columns are read.
    """
    df = df[df[f"Max_pred_assay{nb_classes}"] >= min_pred]
    nb_total = len(df)

    pred_col = f"Predicted_class_assay{nb_classes}"
    match_consensus = df["manual_target_consensus"] == df[pred_col]
    nb_match = match_consensus.sum()
    nb_error = (~match_consensus).sum()
    print(
        f"Nb match assay{nb_classes}: {_safe_ratio(nb_match, nb_total):.2%} ({nb_match}/{nb_total})"
    )
    print(
        f"Nb error assay{nb_classes}: {_safe_ratio(nb_error, nb_total):.2%} ({nb_error}/{nb_total})\n"
    )

    correct_pred_df = df[match_consensus]
    incorrect_pred_df = df[~match_consensus]
    nb_all_incorrect = len(incorrect_pred_df)

    print(
        r"Following ratios: % of assay subset OR % of all predictions OR % of all incorrect predictions (potential mislabels).",
        "\n",
    )
    for assay in core_assays:
        # Everything below is about rows PREDICTED as `assay`.
        nb_assay = len(df[df[pred_col] == assay])
        nb_assay_correct = len(correct_pred_df[correct_pred_df[pred_col] == assay])
        nb_assay_incorrect = len(incorrect_pred_df[incorrect_pred_df[pred_col] == assay])

        print(
            f"Predictions as {assay}: {_safe_ratio(nb_assay, nb_total):.2%} ({nb_assay}/{nb_total})"
        )
        perc_cor = _safe_ratio(nb_assay_correct, nb_assay)
        perc_cor2 = _safe_ratio(nb_assay_correct, nb_total)
        perc_inc = _safe_ratio(nb_assay_incorrect, nb_assay)
        perc_inc2 = _safe_ratio(nb_assay_incorrect, nb_total)
        perc_inc3 = _safe_ratio(nb_assay_incorrect, nb_all_incorrect)

        print(
            f"Correct predictions as {assay}: {perc_cor:.2%} ({nb_assay_correct}/{nb_assay}) OR {perc_cor2:.2%} ({nb_assay_correct}/{nb_total})"
        )
        print(
            f"Incorrect predictions as {assay}: "
            f"{perc_inc:.2%} ({nb_assay_incorrect}/{nb_assay}) OR "
            f"{perc_inc2:.2%} ({nb_assay_incorrect}/{nb_total}) OR "
            f"{perc_inc3:.2%} ({nb_assay_incorrect}/{nb_all_incorrect})\n"
        )

In [None]:
fig_dir = base_fig_dir / "fig_C-A"

# Run the same report without ENCODE files (ENCODE == 0), then with both
# groups (ENCODE in {0, 1}), at several score thresholds.
for subset in [[0], [0, 1]]:
    print("Subset: no ENCODE" if subset == [0] else "Subset: Include ENCODE")
    df = ca_core_df[ca_core_df["ENCODE"].isin(subset)]
    for min_pred in (0.6, 0.8, 0.9):
        print("Min pred score:", min_pred)
        print_pred_within_threshold(df, min_pred=min_pred, nb_classes=7)
        print_breakdown_predictions(df, min_pred=min_pred)

        # To also write the confusion matrix, filter on the score first:
        # df = ca_core_df[ca_core_df["Max_pred_assay7"] >= min_pred]
        # save_confusion_matrix(df, fig_dir, min_pred=min_pred)

### Summary

Keeping only core 7.

WITH ENCODE

- Nb pred assay7 (pred score >= 0.60): 89.60% (42314/47226)
- Nb match assay7: 95.74% (40511/42314)
- Nb error assay7: 4.26% (1803/42314)

Of pred score >= 0.60:
- Predictions as input: 39.32% (16636/42314)
- Correct predictions as input: 92.45% (15380/16636) OR 36.35% (15380/42314)
- Incorrect predictions as input: 7.55% (1256/16636) OR 2.97% (1256/42314) OR 69.66% (1256/1803)

NO ENCODE  

- Nb pred assay7 (pred score >= 0.60): 88.56% (35631/40232)
- Nb match assay7: 95.24% (33935/35631)
- Nb error assay7: 4.76% (1696/35631)

Of pred score >= 0.60:
- Predictions as input: 40.69% (14499/35631)
- Correct predictions as input: 91.96% (13334/14499) OR 37.42% (13334/35631)
- Incorrect predictions as input: 8.04% (1165/14499) OR 3.27% (1165/35631) OR 68.69% (1165/1696)

BREAK CONSENSUS (does not contain any ENCODE data)

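- Break no_consensus (minPred >= 0.60): 92.59% (462/499) & non-input 189/462 (40.91%)  
- Break no_consensus (minPred >= 0.80): 76.57% (402/525) & non-input 163/402 (40.55%)  
- NOTE: the two denominators differ (499 vs 525) although both thresholds are evaluated on the same `no_consensus` subset, so these lines probably come from different runs. TODO: re-run and update.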


### Varying consensus criterion (nb DB agreeing)

In [None]:
# ca_core_df.groupby(db_cols + ["manual_target_consensus", "Predicted_class_assay7"]).size()

reference_column = "manual_target_consensus"
columns_to_check = db_cols

# Compare each checked column to the reference row by row, then count the
# number of columns that agree with the reference for every file.
db_agreement = ca_core_df[columns_to_check].eq(ca_core_df[reference_column], axis=0)
ca_core_df["manual_target_consensus_size"] = db_agreement.sum(axis=1)

# Distribution of the targets and of the consensus sizes: counts, then the
# same counts divided by the number of core files.
nb_core_files = len(ca_core_df)
for col in ["manual_target_consensus", "manual_target_consensus_size"]:
    val_count = ca_core_df[col].value_counts(dropna=False).sort_index()
    display(val_count)
    display_perc(val_count / nb_core_files)

In [None]:
# print("Global:")
# print_pred_within_threshold(ca_core_df, min_pred=0.6, nb_classes=7)
# print()
# print_breakdown_predictions(ca_core_df, min_pred=0.6, nb_classes=7)
# sub_df = ca_core_df[ca_core_df["Max_pred_assay7"] >= 0.6]
# assay_count = sub_df["manual_target_consensus"].value_counts().sort_index()
# display_perc(assay_count / len(sub_df))


# Repeat the threshold report and the prediction breakdown separately for the
# files whose consensus is backed by exactly 1, 2, 3 or 4 columns.
for nb_db in (1, 2, 3, 4):
    df = ca_core_df[ca_core_df["manual_target_consensus_size"].eq(nb_db)]
    print(f"Consensus defined with {nb_db} DB: {len(df)} files.")

    # To inspect the assay distribution of the subset, uncomment:
    # assay_count = df["manual_target_consensus"].value_counts()
    # display_perc(assay_count / len(df))
    # input_val = (assay_count / len(df))["input"]
    # print(f"input: {input_val:.2%}")

    print_pred_within_threshold(df, min_pred=0.6, nb_classes=7)
    print_breakdown_predictions(df, min_pred=0.6, nb_classes=7)

#### Summary

There don't seem to be big differences in accuracy when the consensus is defined by a different number of DBs.  




Core 7 files, including ENCODE

| Consensus size | Nb files | Nb files (%) | Input size (%) |
|----------------|----------|--------------|----------------|
| NA             | 47226    | 100%         | 38.37%         |
| 1              | 3324     | 6.98%        | 61.85%         |
| 2              | 25551    | 53.81%       | 42.27%         |
| 3              | 10275    | 21.60%       | 37.38%         |
| 4              | 8076     | 17.61%       | 17.62%         |

NOTE: the "Nb files (%)" values do not match the listed counts exactly (e.g. 3324/47226 = 7.04%, not 6.98%), so they may come from an earlier run. TODO: re-check.








For assay7 core7, min_pred 0.6:  

Global average
- acc: 95.74%
- %err=input (lowQual): 69.66%
- consensus input = 36.77%

1 to 4 DB consensus

- acc within [95.36%, 96.03%]
- %err=input (lowQual): within [63.43%, 79.39%] = [63.43%, 69.55%, 65.09%, 79.39%]
- consensus input = [60.03%, 41.01%, 35.73%, 15.84%]

### ENCODE core no EpiAtlas + Chip-Atlas (no ENCODE)

In [None]:
# Predictions stored under encode_predictions/assay_epiclass (comma-separated file).
encode_dir = base_data_dir / "training_results" / "encode_predictions" / "assay_epiclass"
encode_pred_path = encode_dir / "encode_only-core-7c_predictions.csv"
encode_pred = pd.read_csv(encode_pred_path)

In [None]:
# Duplicate the ENCODE prediction columns under the column names used by the
# C-A table, so both tables can be concatenated and fed to the same helpers.
encode_aliases = {
    "Experimental-id": "md5sum",
    "Predicted_class_assay7": "Predicted class",
    "manual_target_consensus": "True class",
    "Max_pred_assay7": "Max pred",
}
for alias, source in encode_aliases.items():
    encode_pred[alias] = encode_pred[source]

In [None]:
# Stack the C-A core rows and the ENCODE rows; columns present in only one of
# the two tables are filled with NaN for the other, and the index is rebuilt.
global_pred_df = pd.concat([ca_core_df, encode_pred], ignore_index=True)

In [None]:
# Drop the rows flagged ENCODE == 1. Written as a negation so that rows with a
# missing ENCODE value are kept (NaN == 1 is False).
# Presumably the rows coming from `encode_pred` have no ENCODE value and rely
# on this -- TODO confirm.
global_pred_df = global_pred_df[~(global_pred_df["ENCODE"] == 1)]

In [None]:
# Sanity check after the filter: remaining ENCODE flags and target labels,
# missing values included.
display(global_pred_df["ENCODE"].value_counts(dropna=False))
display(global_pred_df["manual_target_consensus"].value_counts(dropna=False))

In [None]:
# Keep only the rows whose target is one of the core assays.
global_pred_df = global_pred_df[
    global_pred_df["manual_target_consensus"].isin(core_assays)
]

In [None]:
# Same two reports as for the C-A table, on the combined table.
min_pred = 0.6
print_pred_within_threshold(global_pred_df, min_pred=min_pred, nb_classes=7)
print_breakdown_predictions(global_pred_df, min_pred=min_pred, nb_classes=7)

In [None]:
# Bare expression: show the data root the output directory below is built from.
base_data_dir

In [None]:
# Target directory of the (currently commented-out) mislabels export.
output_dir = base_data_dir / "training_results" / "predictions"

In [None]:
# Rows whose 7-class prediction differs from the consensus target.
# NOTE(review): no prediction-score threshold is applied here, unlike the
# breakdown printed above (min_pred = 0.6) -- confirm this is intended.
mislabels = global_pred_df[
    global_pred_df["Predicted_class_assay7"] != global_pred_df["manual_target_consensus"]
]
# mislabels.to_csv(output_dir / "mislabels_C-A&ENCODE_assay7.csv", index=False)

#### Summary

- Nb pred assay7 (pred score >= 0.60): 89.49% (39910/44599)
- Nb match assay7: 95.65% (38172/39910)
- Nb error assay7: 4.35% (1738/39910)

Following ratios: % of assay subset OR % of all predictions OR % of all incorrect predictions (potential mislabels).   

- Predictions as input: 40.39% (16119/39910)
- Correct predictions as input: 92.75% (14951/16119) OR 37.46% (14951/39910)
- Incorrect predictions as input: 7.25% (1168/16119) OR 2.93% (1168/39910) OR 67.20% (1168/1738)
