In [None]:
"""Workbook to analyse classifier predictions on ChIP-Atlas data.
"""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, unused-import, unused-argument, too-many-branches, pointless-statement

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

## Setup

In [None]:
from __future__ import annotations

import ast
from collections import Counter
from pathlib import Path
from typing import Dict, List

import numpy as np
import pandas as pd
import upsetplot
from IPython.display import display
from matplotlib import pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix as sk_cm

from epi_ml.core.confusion_matrix import ConfusionMatrixWriter
from epi_ml.utils.notebooks.paper.metrics_per_assay import MetricsPerAssay
from epi_ml.utils.notebooks.paper.paper_utilities import ASSAY_ORDER, IHECColorMap

In [None]:
display(ASSAY_ORDER)

In [None]:
# Root of the paper outputs; data and figures live in sub-directories of it.
# NOTE(review): the root is hard-coded relative to the current user's home directory.
base_dir = Path.home() / "Projects/epiclass/output/paper"
base_data_dir = base_dir / "data"
base_fig_dir = base_dir / "figures"
paper_dir = base_dir  # alias of base_dir

# Fail early when the figure directory is missing rather than at the first savefig call.
if not base_fig_dir.exists():
    raise FileNotFoundError(f"Directory {base_fig_dir} does not exist.")

In [None]:
# Bind the instance to its own name: rebinding `IHECColorMap` would shadow the
# imported class, so re-running this cell would try to call the instance and fail.
ihec_color_map = IHECColorMap(base_fig_dir)
assay_colors = ihec_color_map.assay_color_map

In [None]:
# Location of the ChIP-Atlas (C-A) prediction table used throughout this notebook.
ca_dir = base_data_dir / "training_results" / "predictions" / "C-A" / "assay_epiclass"

ca_filename = "CA_metadata_4DB+all_pred.20240606_mod3.0.tsv"
ca_pred_path = ca_dir / ca_filename

# low_memory=False makes pandas read the file in one pass when inferring column dtypes.
ca_pred_df = pd.read_csv(ca_pred_path, sep="\t", low_memory=False)

print(ca_pred_df.shape)

| Assay | Exp Key                               | Nb Files | Training Size | Oversampling |
|-------|---------------------------------------|----------|---------------|--------------|
| 13c*   | dd3710b73c0341af85a17ce1998362d0      | 24989    | 116550        | true         |
| 11c   | 0f8e5eb996114868a17057bebe64f87c      | 20922    | 46128         | true         |
| 7c    | 69488630801b4a05a53b5d9e572f0aaa      | 16788    | 34413         | true         |

*using hg38_2023-epiatlas-dfreeze_v2.1_w_encode_noncore_2


In [None]:
df_cols = ca_pred_df.columns.to_list()

In [None]:
# The first seven entries of ASSAY_ORDER are treated as the core assays.
CORE_ASSAYS = ASSAY_ORDER[0:7]

# Columns holding the target label declared by each source database.
# Later cells treat "----" as "no label from this database".
DB_COLS = ["GEO_mod", "C-A", "Cistrome", "NGS_mod"]

# Predicted-class columns, one per assay classifier (7, 11 and 13 classes).
PRED_COLS = [
    "Predicted_class_assay7",
    "Predicted_class_assay11",
    "Predicted_class_assay13",
]

# Column summarising the agreement between databases on the declared target.
SAME_TARGET = "core7_DBs_consensus"

In [None]:
def print_column_content(df: pd.DataFrame, col: str) -> None:
    """Display the absolute and relative frequency of each value of `df[col]`.

    Missing values are counted as their own category and a final "Total" row
    is appended.

    Args:
        df: DataFrame to summarise.
        col: Name of the column whose values are counted.
    """
    # Name the count column explicitly: pandas >= 2.0 calls it "count", while
    # older versions name it after `col`, which would break the lookups below.
    label_count_df = df[col].value_counts(dropna=False).to_frame(name="count")

    label_count_df["relative"] = label_count_df["count"] / len(df)

    label_count_df.loc["Total", "count"] = len(df)
    label_count_df.loc["Total", "relative"] = 1

    # Adding the "Total" row can upcast counts to float, hence the ".0f" format.
    style_map = {
        "count": "{:.0f}",
        "relative": "{:.2%}",
    }

    display(label_count_df.style.format(style_map))  # type: ignore

## Analysis

Base dataset used: ChIP-Atlas experiments where at least one of the DBs declared the target in core7.

Excluding: 
- Samples where at least one of the DBs declared a target outside of core7.
- Samples overlapping with the EpiATLAS dataset (different file creation pipeline, same base bam)

### Database composition

In [None]:
tmp_df = ca_pred_df.loc[:, DB_COLS].copy(deep=True)
# Assign the result back: an in-place replace on `tmp_df["C-A"]` is a chained
# operation that is not guaranteed to modify `tmp_df` (pandas Copy-on-Write).
tmp_df["C-A"] = tmp_df["C-A"].replace("unclassified", "----")

id_db_target = []
unique_labels = Counter()
different_labels = Counter()

for labels in tmp_df.values:
    missing_N = sum(label == "----" for label in labels)
    db_labels = set(labels)
    db_labels.discard("----")  # "----" means the database declared no target

    # Sets have no stable order; sort so that one label combination always
    # maps to the same Counter key. `key=str` tolerates non-string labels.
    label_key = tuple(sorted(db_labels, key=str))

    if missing_N == len(DB_COLS) - 1:
        # Every database but one is missing a label.
        id_db_target.append("1 source")
    elif len(db_labels) == 1:
        id_db_target.append("Identical")
    else:
        id_db_target.append("Different")
        different_labels[label_key] += 1

    unique_labels[label_key] += 1


display(pd.Series(id_db_target).value_counts(dropna=False, normalize=True))

In [None]:
non_core_labels = ["non-core", "CTCF", "ctcf"]
non_core_labels_2 = ["Ignored - Potential non-core", "non-core/CTCF"]

print(f"Starting with {len(ca_pred_df)} rows.")

# Keep only the rows whose database consensus is not flagged as non-core.
is_non_core = ca_pred_df[SAME_TARGET].isin(non_core_labels_2)
ca_core_df = ca_pred_df[~is_non_core]
diff_N = len(ca_pred_df) - len(ca_core_df)

removed_msg = f"Removed {diff_N} rows with {SAME_TARGET} in {non_core_labels_2}."
remaining_msg = f"After this, {len(ca_core_df)} rows remain."
print(removed_msg + "\n" + remaining_msg)

In [None]:
# Rows whose EpiRR flag, read as text, is exactly "0" do not overlap with EpiATLAS.
# NOTE(review): a float-typed flag column would stringify as "0.0" - confirm the dtype.
not_in_epiatlas = ca_core_df["is_EpiAtlas_EpiRR"].astype(str) == "0"
N_diff = len(ca_core_df) - int(not_in_epiatlas.sum())
ca_core_df = ca_core_df[not_in_epiatlas].copy()
print(
    f"Removed {N_diff} rows with EpiATLAS EpiRR overlap. "
    f"After this, {len(ca_core_df)} rows remain."
)

In [None]:
# Sanity check: print the rows where a database label column has a missing value.
for db_col in DB_COLS:
    col = ca_core_df[db_col]
    missing_mask = col.isna()
    if missing_mask.any():
        print("Missing values: ", ca_core_df[missing_mask])

In [None]:
print_column_content(ca_core_df, "manual_target_consensus")

In [None]:
print_column_content(ca_core_df, SAME_TARGET)

In [None]:
no_consensus_df = ca_core_df[ca_core_df["manual_target_consensus"] == "no_consensus"]

#### Upset plots

In [None]:
fig_dir = base_fig_dir / "fig_C-A" / "DB_upset" / "no_EpiATLAS"
# parents=True: the intermediate "fig_C-A/DB_upset" folders may not exist yet.
fig_dir.mkdir(parents=True, exist_ok=True)

In [None]:
def make_db_upsetplot(
    df: pd.DataFrame, db_cols: List[str], title: str
) -> upsetplot.UpSet:
    """Make an upsetplot of the sample presence in the different databases.

    A sample counts as present in a database when its label in that database's
    column differs from "----". Bars are stacked by the `SAME_TARGET` column.

    Args:
        df: Samples table; must contain `db_cols` and the `SAME_TARGET` column.
        db_cols: Database label columns used as the upset plot sets.
        title: Figure title.

    Returns:
        The `upsetplot.UpSet` object, after it has been plotted.

    Raises:
        ValueError: If the `SAME_TARGET` column is missing from `df`.
    """
    if SAME_TARGET not in df.columns:
        # Report the column that is actually checked.
        raise ValueError(f"Column '{SAME_TARGET}' not found in DataFrame.")

    # One boolean column per database (True = the database has a label),
    # plus the consensus category used to stack the bars. `df` is only read.
    upset_df = df[db_cols].ne("----")
    upset_df[SAME_TARGET] = df[SAME_TARGET]

    # UpSet expects set membership as a boolean (multi-)index.
    upset_df = upset_df.set_index(db_cols)

    upset = upsetplot.UpSet(
        upset_df,
        intersection_plot_elements=0,  # disable the default bar chart
        sort_by="cardinality",
        show_counts=True,  # type: ignore
        orientation="horizontal",
    )

    # Replace the default intersection bars with bars stacked by consensus category.
    upset.add_stacked_bars(by=SAME_TARGET, elements=15)

    axes = upset.plot()
    plt.suptitle(title)
    axes["totals"].set_title("Total")
    plt.legend(loc="center left")
    return upset

Everything

In [None]:
# Title fix: the original string carried an unmatched closing parenthesis.
title = "All core7 ChIP-Atlas samples presence in used DBs\nTarget consensus"
upset = make_db_upsetplot(ca_core_df, DB_COLS, title=title)

plt.savefig(fig_dir / "upsetplot_DB_core7_samples.svg", bbox_inches="tight")

No ENCODE EpiRR overlap

In [None]:
print_column_content(ca_core_df, "ENCODE")

In [None]:
# Same upset plot, restricted to the rows whose ENCODE flag equals 0.
no_encode_df = ca_core_df[ca_core_df["ENCODE"] == 0]
title = "ChIP-Atlas samples presence in used DBs\nTarget Consensus - No ENCODE"

upset = make_db_upsetplot(no_encode_df, DB_COLS, title=title)

plt.savefig(fig_dir / "upsetplot_DB_core7_samples_noENC.svg", bbox_inches="tight")

In [None]:
def is_prediction_resolved(row, pred_col: str, db_cols: List[str]) -> bool:
    """Tell whether the predicted label equals at least one database label.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) indexable by column name.
        pred_col: Name of the column holding the predicted label.
        db_cols: Names of the columns holding the database labels.

    Returns:
        True if `row[pred_col]` is found among the values of `db_cols`.
    """
    predicted = row[pred_col]
    declared_targets = tuple(row[name] for name in db_cols)
    return predicted in declared_targets

In [None]:
# How often does the classifier settle cases where the sources disagree on the target?
different_targets_df = ca_core_df[ca_core_df[SAME_TARGET] == "Different"]

for min_pred_score in [0, 0.6]:
    pred_col = PRED_COLS[0]

    filtered_df = different_targets_df[
        different_targets_df["Max_pred_assay7"] >= min_pred_score
    ]
    # Second view of the same subset, without the rows predicted as 'input'.
    non_input_df = filtered_df[filtered_df[PRED_COLS[0]] != "input"]

    # Same report for both subsets; only the wording in the parenthesis differs.
    for subset_df, suffix in (
        (filtered_df, ""),
        (non_input_df, ", excluding 'input' predictions"),
    ):
        num_resolved = subset_df.apply(
            is_prediction_resolved, axis=1, args=(pred_col, DB_COLS)
        ).sum()
        percent_resolved = num_resolved / len(subset_df) * 100

        print(
            f"Resolved (min_predScore >= {min_pred_score}{suffix}): "
            f"{num_resolved} / {len(subset_df)} "
            f"({percent_resolved:.2f}%)"
        )

### High-level prediction accuracy breakdown

Create `epiclass_match_status` column and join it to predictions.

This category represents the agreement between the databases labels and 
the classifier prediction.  

If there is a database consensus and our prediction matches, it's a complete match.  
If there is no database consensus, but our prediction matches one of the databases labels, it's a partial match.  
Otherwise, no match.  

In [None]:
# Label each sample by how the 7-class assay prediction compares with the database labels.
epiclass_match_status = []
for _, row in ca_core_df.iterrows():
    target_vals = [row[col] for col in DB_COLS]
    consensus: str = row["manual_target_consensus"]
    epiclass_target: str = row["Predicted_class_assay7"]

    if epiclass_target == consensus:
        status = "Complete match"  # prediction equals the consensus label
    elif epiclass_target in target_vals:
        status = "Partial match"  # prediction equals at least one database label
    else:
        status = "No match"

    epiclass_match_status.append(status)

ca_core_df["epiclass_match_status"] = epiclass_match_status

In [None]:
for min_pred_score in [0, 0.6]:
    print(f"Prediction agreement, minimum prediction score >= {min_pred_score:.2f}")
    subset_df = ca_core_df[ca_core_df["Max_pred_assay7"] >= min_pred_score]
    print_column_content(subset_df, "epiclass_match_status")

In [None]:
predictions_dir = base_data_dir / "training_results" / "predictions"
pred_path = predictions_dir / "C-A" / "CA_only_pred_20240606.tsv"
pred_df = pd.read_csv(pred_path, sep="\t", low_memory=False)
print(pred_df.shape, pred_df.columns[0])

In [None]:
# Add the match status to the predictions file once; the guard keeps the cell
# idempotent when the file already contains the column.
new_col = "epiclass_match_status"
if new_col not in pred_df.columns:
    # Both tables are joined on their first column.
    # NOTE(review): if the two key columns have different names, the key from
    # `ca_core_df` stays in the merged table and is written to the file - confirm intended.
    index_1 = pred_df.columns[0]
    index_2 = ca_core_df.columns[0]
    pred_df = pd.merge(
        pred_df,
        ca_core_df[[index_2, new_col]],
        how="left",
        left_on=index_1,
        right_on=index_2,
        suffixes=("", "_DROP"),
    )
    pred_df = pred_df.drop(
        columns=[col for col in pred_df.columns if col.endswith("_DROP")]
    )
    # Assign back: an in-place fillna on a selected column is a chained operation
    # that is not guaranteed to modify `pred_df` (pandas Copy-on-Write).
    # NOTE(review): pandas' default parser reads the string "NA" back as missing.
    pred_df[new_col] = pred_df[new_col].fillna("NA")
    pred_df.to_csv(pred_path, sep="\t", index=False)

del pred_df

Detailed prediction stats

In [None]:
def print_high_level_pred_info(df: pd.DataFrame, save_conf_matrix: bool = False) -> None:
    """Display high level information about the assay predictions in `df`.

    For each core assay, shows the distribution (in %) of the predicted classes
    of every assay classifier and, for the 11 and 13 class classifiers, the
    distribution of the second predicted class among the wrong predictions.
    Then shows the consensus targets of the samples predicted as
    "wgbs-standard" or "non-core".

    Args:
        df: Predictions table with the `manual_target_consensus` column, the
            `PRED_COLS` columns and the `2nd_pred_class_assay11/13` columns.
        save_conf_matrix: If True, also write one confusion matrix PNG per
            classifier into the user's Downloads folder.
    """
    for assay in CORE_ASSAYS:
        print(f"{assay}")
        assay_df = df[df["manual_target_consensus"] == assay]
        for col in PRED_COLS:
            assay_number = col.rsplit("_", maxsplit=1)[-1]  # e.g. "assay11"
            display(assay_df[col].value_counts() / len(assay_df) * 100)
            if any(label in col for label in ["11", "13"]):
                wrong_pred = assay_df[assay_df[col] != assay]

                display(
                    wrong_pred[f"2nd_pred_class_{assay_number}"].value_counts()
                    / len(wrong_pred)
                    * 100
                )
        print("\n")

    if save_conf_matrix:
        for col in PRED_COLS:
            # Labels come from the predicted classes only: a true label that is
            # never predicted is left out of the matrix.
            labels = sorted(df[col].unique().tolist())
            cm = sk_cm(
                df["manual_target_consensus"],
                df[col],
                labels=labels,
            )
            cm_writer = ConfusionMatrixWriter(labels=labels, confusion_matrix=cm)
            cm_writer.to_png(
                Path.home() / "Downloads" / f"C-A_confusion_matrix_{col}.png"
            )

    # The sections below read the `df` argument, not the notebook-level
    # prediction table, so the function reports on the data it was given.
    print("What is the actual target when wgbs-standard is predicted?")
    for assay_number in ["assay11", "assay13"]:
        print(f"{assay_number}")
        wgbs_dist = df[df[f"Predicted_class_{assay_number}"] == "wgbs-standard"][
            "manual_target_consensus"
        ]
        display(wgbs_dist.value_counts())
        display(wgbs_dist.value_counts() / len(wgbs_dist) * 100)

    print("What is the actual target when non-core is predicted?")
    col = "Predicted_class_assay13"
    non_core_dist = df[df[col] == "non-core"]["manual_target_consensus"]
    display(non_core_dist.value_counts())
    display(non_core_dist.value_counts() / len(non_core_dist) * 100)

In [None]:
verbose = False

if verbose:
    print_column_content(ca_pred_df, "manual_target_consensus")
    print_high_level_pred_info(ca_pred_df, save_conf_matrix=True)

In [None]:
# For samples without database consensus: how many get a confident assay7 prediction?
for min_pred in [0.6, 0.8]:
    break_tie_mask = no_consensus_df["Max_pred_assay7"] >= min_pred
    nb_break_tie = break_tie_mask.sum()
    break_tie_percent = nb_break_tie / len(no_consensus_df) * 100
    print(
        f"Break no_consensus (minPred >= {min_pred:.02f}): "
        f"{break_tie_percent:.02f}% ({nb_break_tie}/{len(no_consensus_df)})"
    )

    df = no_consensus_df[break_tie_mask]
    nb_not_input = df["Predicted_class_assay7"].ne("input").sum()
    not_input_percent = nb_not_input / len(df) * 100
    print(
        f"non-input tie breakers: {nb_not_input}/{nb_break_tie} "
        f"({not_input_percent:.02f}%)\n"
    )
    print(df["ENCODE"].value_counts(), "\n")

In [None]:
def print_pred_within_threshold(
    df: pd.DataFrame, min_pred: float = 0.6, col: str = "Max_pred_assay7"
) -> None:
    """Print the share of rows whose score in `col` is at least `min_pred`.

    Prints a notice and returns when `col` is not a column of `df`.
    """
    try:
        scores = df[col]
    except KeyError:
        print(f"Column {col} not found.")
        return

    above_threshold = scores.astype(float) >= min_pred
    nb_pred = above_threshold.sum()
    classifier_name = col.rsplit("_", maxsplit=1)[-1]
    percent = nb_pred / len(df) * 100
    print(
        f"Nb pred {classifier_name} (pred score >= {min_pred:.02f}): "
        f"{percent:.02f}% ({nb_pred}/{len(df)})"
    )

In [None]:
def save_confusion_matrix(
    df: pd.DataFrame,
    fig_dir: Path | str,
    nb_classes: int | str = 7,
    min_pred: float = 0.6,
):
    """Write the core-assay confusion matrix of one assay classifier.

    `df` is used as given: `min_pred` only appears in the output file name, so
    the caller is expected to have applied the score threshold already.
    "_noENCODE" is appended to the name when the ENCODE column sums to zero.
    """
    pred_col = f"Predicted_class_assay{nb_classes}"
    matrix = sk_cm(df["manual_target_consensus"], df[pred_col], labels=CORE_ASSAYS)
    writer = ConfusionMatrixWriter(labels=CORE_ASSAYS, confusion_matrix=matrix)

    encode_suffix = "_noENCODE" if df["ENCODE"].sum() == 0 else ""
    name = (
        f"confusion_matrix_assay{nb_classes}_core7_minPred{min_pred:.02f}"
        + encode_suffix
    )

    writer.to_all_formats(logdir=fig_dir, name=name)

In [None]:
def _safe_ratio(numerator: float, denominator: float) -> float:
    """Return numerator / denominator, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def print_breakdown_predictions(
    df: pd.DataFrame,
    min_pred: float = 0.6,
    nb_classes: int | str = 7,
    verbose: bool = True,
    assays: List[str] | None = None,
) -> Dict[str, float]:
    """Break down the predictions of one assay classifier and print the results.

    Only rows with `Max_pred_assay{nb_classes}` >= `min_pred` are considered.
    A prediction is correct when it equals `manual_target_consensus`.

    Args:
        df: Predictions table with `manual_target_consensus`,
            `Predicted_class_assay{nb_classes}` and `Max_pred_assay{nb_classes}`.
        min_pred: Minimum prediction score for a row to be kept.
        nb_classes: Number of classes of the assay classifier to evaluate.
        verbose: If True, also print the per-class breakdown.
        assays: Classes to report on. Defaults to `CORE_ASSAYS`.

    Returns:
        For each class, the fraction of the rows predicted as that class that
        are correct. The value is NaN for a class that is never predicted,
        since the ratio is undefined there.
    """
    if assays is None:
        assays = CORE_ASSAYS

    df = df[df[f"Max_pred_assay{nb_classes}"] >= min_pred]
    nb_total = len(df)

    pred_col = f"Predicted_class_assay{nb_classes}"
    match_consensus = df["manual_target_consensus"] == df[pred_col]
    nb_match = int(match_consensus.sum())
    nb_error = nb_total - nb_match
    print(
        f"Nb match assay{nb_classes}: {_safe_ratio(nb_match, nb_total):.2%} ({nb_match}/{nb_total})"
    )
    print(
        f"Nb error assay{nb_classes}: {_safe_ratio(nb_error, nb_total):.2%} ({nb_error}/{nb_total})\n"
    )

    if verbose:
        print(
            r"Following ratios: % of assay subset OR % of all predictions OR % of all incorrect predictions (potential mislabels).",
            "\n",
        )

    acc_per_class = {}
    for assay in assays:
        predicted_as_assay = df[pred_col] == assay
        nb_assay = int(predicted_as_assay.sum())
        nb_assay_correct = int((predicted_as_assay & match_consensus).sum())
        nb_assay_incorrect = nb_assay - nb_assay_correct

        # Every denominator can be zero (class never predicted, or no error at
        # all); _safe_ratio yields NaN there instead of raising ZeroDivisionError.
        perc_cor = _safe_ratio(nb_assay_correct, nb_assay)
        perc_cor2 = _safe_ratio(nb_assay_correct, nb_total)
        perc_inc = _safe_ratio(nb_assay_incorrect, nb_assay)
        perc_inc2 = _safe_ratio(nb_assay_incorrect, nb_total)
        perc_inc3 = _safe_ratio(nb_assay_incorrect, nb_error)

        if verbose:
            print(
                f"Predictions as {assay}: {_safe_ratio(nb_assay, nb_total):.2%} ({nb_assay}/{nb_total})"
            )
            print(
                f"Correct predictions as {assay}: {perc_cor:.2%} ({nb_assay_correct}/{nb_assay}) OR {perc_cor2:.2%} ({nb_assay_correct}/{nb_total})"
            )
            print(
                f"Incorrect predictions as {assay}: "
                f"{perc_inc:.2%} ({nb_assay_incorrect}/{nb_assay}) OR "
                f"{perc_inc2:.2%} ({nb_assay_incorrect}/{nb_total}) OR "
                f"{perc_inc3:.2%} ({nb_assay_incorrect}/{nb_error})\n"
            )
        acc_per_class[assay] = perc_cor

    return acc_per_class

In [None]:
verbose = False

fig_dir = base_fig_dir / "fig_C-A" / "confusion_matrices"
# Create the output folder up front, like the other output folders of this notebook.
fig_dir.mkdir(parents=True, exist_ok=True)

# First without ENCODE samples (flag 0 only), then with them (flags 0 and 1).
for subset in [[0], [0, 1]]:
    if verbose:
        if subset == [0]:
            print("Subset: no ENCODE")
        else:
            print("Subset: Include ENCODE")

    df = ca_core_df[ca_core_df["ENCODE"].isin(subset)]

    for min_pred in [0.6, 0.8, 0.9]:
        if verbose:
            print("Min pred score:", min_pred)
            print_pred_within_threshold(df, min_pred=min_pred)
            print_breakdown_predictions(df, min_pred=min_pred)

        # save_confusion_matrix does not filter, so apply the threshold here.
        sub_df = df[df["Max_pred_assay7"] >= min_pred]
        save_confusion_matrix(sub_df, fig_dir, min_pred=min_pred)

### Mislabels by GSE

In [None]:
logdir = ca_dir / "GSE_mispred"
logdir.mkdir(exist_ok=True, parents=True)

In [None]:
GSE = "Gse-geo"

In [None]:
# Parameters of the "Mislabels by GSE" analysis, read by the cells that follow.
verbose = False

nb_classes = 7  # assay classifier to evaluate
min_pred = 0.6  # minimum prediction score for a row to be kept
pred_col = f"Predicted_class_assay{nb_classes}"
max_pred_col = f"Max_pred_assay{nb_classes}"

# Row filters applied by the following cells.
excluding_no_consensus = True
excluding_ENCODE = False

In [None]:
df = ca_core_df.copy(deep=True)

In [None]:
if excluding_no_consensus:
    # Drop the samples for which the databases reached no consensus target.
    has_consensus = df["manual_target_consensus"] != "no_consensus"
    N_diff = len(df) - int(has_consensus.sum())
    df = df[has_consensus]
    print(f"Removed {N_diff} rows with no consensus.\nLeft with {len(df)} rows.")

In [None]:
# Results go to a sub-folder named after the ENCODE filtering choice.
if excluding_ENCODE:
    nb_rows_before = len(df)
    df = df[df["ENCODE"] == 0]
    N_diff = nb_rows_before - len(df)
    print(f"Removed {N_diff} rows with ENCODE.")

this_logdir = logdir / ("excluding_ENCODE" if excluding_ENCODE else "including_ENCODE")
this_logdir.mkdir(exist_ok=True, parents=True)

In [None]:
# Keep only the predictions whose score reaches the configured threshold.
N_total = len(df)
df = df[df[max_pred_col] >= min_pred]
N_diff = N_total - len(df)

print(
    f"Removed {N_diff}/{N_total} ({N_diff/N_total:.2%}) rows with pred score < {min_pred}\n"
    f"Left with {len(df)} rows."
)

In [None]:
verbose = True

In [None]:
# Accuracy is computed against the consensus, so 'no_consensus' rows must be gone.
if (df["manual_target_consensus"] == "no_consensus").any():
    raise ValueError("'no_consensus' present in df, cannot compute accuracy.")

no_match = df["epiclass_match_status"] == "No match"
nb_error = no_match.sum()
nb_match = (~no_match).sum()
print(f"Nb match assay{nb_classes}: {nb_match/ len(df):.2%} ({nb_match}/{len(df)})")
print(f"Nb mismatch assay{nb_classes}: {nb_error/ len(df):.2%} ({nb_error}/{len(df)})\n")

incorrect_pred_df = df[no_match]

if verbose:
    print("Incorrect predictions, breakdown by predicted class:")
    display(incorrect_pred_df[pred_col].value_counts(normalize=True))

# From here on, mismatches predicted as 'input' are left out.
incorrect_pred_df = incorrect_pred_df[incorrect_pred_df[pred_col] != "input"]

print(
    f"Excluding input predictions. Left with {len(incorrect_pred_df)} complete mismatches.\n"
)

desired_cols = ["manual_target_consensus", pred_col]

# Mismatches per GSE, largest first, with the running total and its share.
gse_count = (
    incorrect_pred_df.groupby(GSE)
    .size()
    .sort_values(ascending=False)
    .to_frame(name="Nb of mismatches")
)
gse_count["cumsum"] = gse_count["Nb of mismatches"].cumsum()
gse_count["cumsum (%)"] = (
    gse_count["cumsum"] * 100 / gse_count["Nb of mismatches"].sum()
)

# Mismatches per GSE and per (consensus target, predicted class) pair.
gse_target_count = incorrect_pred_df.groupby(GSE)[desired_cols].value_counts(dropna=False)  # type: ignore

# Show the tables in full, without row or column truncation.
with pd.option_context("display.max_rows", None, "display.max_columns", None):
    if verbose:
        print(
            f"Incorrect predictions, breakdown by GSE count ({len(gse_count)} unique GSE)"
        )
        display(gse_count.reset_index())
        print("Incorrect predictions, breakdown by GSE and target.")
        display(gse_target_count)

gse_count.to_csv(
    this_logdir / "gse_count_incorrect_pred_no_input_20240606_mod3.tsv", sep="\t"
)

gse_target_count = gse_target_count.to_frame(name="Nb of mismatches")
gse_target_count.to_csv(
    this_logdir / "gse_target_count_incorrect_pred_no_input_20240606_mod3.tsv",
    sep="\t",
)

### Unclassified files

In [None]:
# Files that ChIP-Atlas itself labels as "unclassified".
unclassified = ca_core_df[ca_core_df["C-A"] == "unclassified"]
# Guard the ratios: either table may be empty, and plain integer division by
# zero would raise. NaN is printed as "nan%" in that case.
unclassified_ratio = (
    len(unclassified) / len(ca_core_df) if len(ca_core_df) else float("nan")
)
print(f"Nb unclassified: {len(unclassified)} ({unclassified_ratio:.2%})")

# `max_pred_col` and `min_pred` come from the "Mislabels by GSE" configuration cell.
high_pred = unclassified[unclassified[max_pred_col] >= min_pred]
high_pred_ratio = (
    len(high_pred) / len(unclassified) if len(unclassified) else float("nan")
)
print(f"Nb high pred unclassified: {len(high_pred)} ({high_pred_ratio:.2%})")

### Accuracy Summary

-- Assay7 classifier, ENCODE effect --

Keeping only core 7 targets, excludes no consensus (since it's always going to not match)

NO ENCODE

- Removed 4526/39668 (11.41%) rows with pred score < 0.6
- Nb match assay7: 95.54% (33575/35142)
- Nb mismatch assay7: 4.46% (1567/35142)

WITH ENCODE

- Removed 4778/45615 (10.47%) rows with pred score < 0.6
- Nb match assay7: 95.92% (39172/40837)
- Nb mismatch assay7: 4.08% (1665/40837)
  - Mismatch predicted as input: 68.83%
<br><br>

-- Resolving different predictions --

Total different predictions: 706

min_predScore >= 0.6 (87.68%)
- Resolved: 612 / 619 (98.87%)
- Resolved, excluding 'input' predictions: 214 / 217 (98.62%)

### Varying consensus criterion (nb DB agreeing)

In [None]:
# Work on a copy so the added column does not end up in ca_core_df.
df = ca_core_df.copy(deep=True)

reference_column = "manual_target_consensus"
columns_to_check = DB_COLS
# Consensus size = number of database columns whose label equals the consensus label.
df["manual_target_consensus_size"] = (
    df[columns_to_check].eq(df[reference_column], axis=0)
).sum(axis=1)

for col in ["manual_target_consensus", "manual_target_consensus_size"]:
    print_column_content(df, col)

In [None]:
print("Global:")

sub_df = df[df["Max_pred_assay7"] >= 0.6]
assay_count = sub_df["manual_target_consensus"].value_counts(normalize=True).sort_index()
# `.get` reports 0% instead of raising KeyError when no sample has "input" as target.
print(f"input pred: {assay_count.get('input', 0.0):.2%}")
print_pred_within_threshold(df, min_pred=0.6)

acc_per_class = print_breakdown_predictions(df, min_pred=0.6, nb_classes=7, verbose=False)
# nanmean skips any per-class accuracy that is undefined (NaN).
avg_acc_per_class = np.nanmean(list(acc_per_class.values()))
print(f"Average acc per class: {avg_acc_per_class:.2%}")
print()

verbose = True

N_global = len(df)
# The consensus size ranges from 1 to the number of databases (4).
for i in range(1, 5):
    con_df = df[df["manual_target_consensus_size"] == i]
    print(
        f"Consensus defined with {i} DB: {len(con_df)} files. ({len(con_df)/N_global:.2%})"
    )

    # Display % assay
    if verbose:
        sub_df = con_df[con_df["Max_pred_assay7"] >= 0.6]
        assay_count = sub_df["manual_target_consensus"].value_counts(normalize=True)
        # A consensus-size subset may contain no "input" sample at all.
        print(f"input: {assay_count.get('input', 0.0):.2%}")

        print_pred_within_threshold(con_df, min_pred=0.6)

        acc_per_class = print_breakdown_predictions(
            con_df, min_pred=0.6, nb_classes=7, verbose=False
        )
        avg_acc_per_class = np.nanmean(list(acc_per_class.values()))
        print(f"Average acc per class: {avg_acc_per_class:.2%}")
        print()

#### Summary

There doesn't seem to be big differences in accuracy when looking at consensus defined by a different number of DB.  
The biggest difference is for consensus size 1 (only 1 DB source): its average accuracy per class is 3.27% lower than the global one (the global accuracy is very similar).

Core 7 files, including ENCODE

| Consensus size | Nb files | % files    | Input size     | Files with min_pred >= 0.6| Global accuracy (Nb match assay7) | Average acc per class |
|---------------|---------|--------------|----------------|--------------------------|-----------------------------------|-----------------------|
| 1             | 3320    | 7.05%        | 61.81%         | 88.80%                   | 95.45%                            | 92.87%                |
| 2             | 25248   | 53.64%       | 42.08%         | 89.17%                   | 95.80%                            | 96.93%                |
| 3             | 9976    | 21.20%       | 36.96%         | 88.99%                   | 95.31%                            | 95.28%                |
| 4             | 8118    | 17.25%       | 18.05%         | 92.21%                   | 96.05%                            | 96.14%                |
| Global        | 46018   | 100%         | 36.30%         | 89.64%                   | 96.05%                            | 96.14%                |


## Other metadata

In [None]:
ca_metadata_dir = base_data_dir / "metadata" / "chip_atlas"

In [None]:
# Extracted ChIP-Atlas metadata; missing values are replaced by "unknown".
# NOTE(review): a later cell rebinds `other_meta_df` to a table built from the
# author-submitted metadata, so this loaded table is not the one merged further
# down - confirm which one is intended.
other_meta_df = pd.read_csv(
    ca_metadata_dir / "CA_extracted_metadata_FW_20250314.tsv",
    sep="\t",
)
other_meta_df.fillna("unknown", inplace=True)
display(other_meta_df.shape)

In [None]:
general_metadata_path = ca_metadata_dir / "CA_metadata_joined_20250306.tsv"
general_metadata_df = pd.read_csv(general_metadata_path, sep="\t", low_memory=False)
print(general_metadata_df.shape)

In [None]:
col_label = "Meta_data_submitted_by_authors"
temp_df = general_metadata_df[general_metadata_df[col_label] != "unknown"]

# Parse the author-submitted metadata, stored as the text of a Python list of
# "key=value" strings, into one dict per experiment.
other_meta_vals = {}
for key, vals in temp_df[["Experimental-id", col_label]].values:
    values_list = ast.literal_eval(vals)
    # Split on the first "=" only, so values that contain "=" are kept whole;
    # an entry without "=" maps to an empty string instead of raising IndexError.
    values_dict = {
        name: value
        for name, _, value in (val.partition("=") for val in values_list)
    }
    values_dict["Experimental-id"] = key
    other_meta_vals[key] = values_dict

In [None]:
other_meta_df = pd.DataFrame.from_dict(other_meta_vals, orient="index")

In [None]:
biomaterial_related_cols = [
    "strain",
    "cell type",
    "cell_type",
    "cell line",
    "cell_line",
    "sample type",
    "sample_type",
    "tissue",
    "biomaterial_type",
    "cell",
    "cells",
    "cells/tissue",
    "cell line/type",
    "tissue source/type",
    "tissue source",
    "cell line id",
    "parental cell line",
    "tissue origin",
    "biosample type",
    "cell line source",
    "clone name",
    "cell-type/cell line",
    "cell line background",
    "cell-line",
    "histology",
    "tissue/cell type",
    "source cell type",
    "cell types",
    "cell_line/tissue",
    "cell lines",
    "cell line or tissue",
    "cell or tissue type",
    "line name",
    "tissue/cells",
    "cell line/strain",
    "cell strain",
    "biosample",
    "cell lin",
    "cell line of origin",
    "cell line name",
    "tissue/cell line",
]

In [None]:
biomaterial_df = other_meta_df[biomaterial_related_cols]

In [None]:
# Treat the usual placeholders as missing values.
biomaterial_df = biomaterial_df.replace(["n/a", "unknown", "NA"], np.nan)

# Keep the columns that hold more than 100 non-missing values.
relevant_cols = [
    col for col in biomaterial_df.columns if biomaterial_df[col].count() > 100
]

In [None]:
biomaterial_df = biomaterial_df[relevant_cols]
biomaterial_df.to_csv(ca_metadata_dir / "biomaterial_metadata_20250306.csv")

#### Performance metrics

In [None]:
df = ca_core_df.copy(deep=True)
df = pd.merge(df, other_meta_df, how="left", on="Experimental-id")

In [None]:
# Add the general metadata. A column that already exists in `df` gets the
# "_DROP" suffix on the right-hand side and is removed, so `df` keeps its own version.
df = pd.merge(
    df,
    general_metadata_df,
    how="left",
    on="Experimental-id",
    suffixes=("", "_DROP"),
)
df = df.drop([col for col in df.columns if col.endswith("_DROP")], axis=1)
# NOTE(review): this raises KeyError if no "ID" column is present after the merge.
df = df.drop("ID", axis=1)

In [None]:
df["Predicted_class_donorlife"] = df["Predicted_class_donorlife"].replace(
    {
        "newborn": "perinatal",
        "fetal": "perinatal",
        "embryonic": "perinatal",
    }
)

Creating a DF with no cell line

Life stage classifier was not trained on any cell line, and also the notion of life stage makes less sense for cell lines.

In [None]:
# na=False: rows without a description (possible after the left merges) count as
# "not a cell line", which keeps the mask strictly boolean so that `~mask` and
# boolean indexing work. regex=False: "cell line" is a plain substring.
mask_cell_line = (
    df["Cell_type_description"]
    .str.lower()
    .str.contains("cell line", na=False, regex=False)
)

no_cell_line_df = df[~mask_cell_line]
print(f"{len(df) - len(no_cell_line_df)} rows removed.")

Sanity check, are there some cell lines with life stages, if so what are the samples?

In [None]:
cell_line_df = df[mask_cell_line]
display(cell_line_df["expected_donorlife"].value_counts(dropna=False))

In [None]:
print("Cell lines with perinatal status:")
display(
    cell_line_df[cell_line_df["expected_donorlife"] == "perinatal"][
        ["Cell_type", "Cell_type_description"]
    ].drop_duplicates()
)

I think dropping "HEK293-T-REx" and "NT2-D1" from life stage predictions is justified.

Metrics

In [None]:
def print_metrics(df, min_pred: float = 0.6):
    """Print the classification metrics for sex, cancer and donorlife.

    For each category, rows whose expected label is "unknown" are dropped, then
    rows whose prediction score is below `min_pred`. The classification report
    and the confusion matrix (preceded by its label order) are printed.
    """
    for name in ("sex", "cancer", "donorlife"):
        print(f"--- {name} ---")
        true_col = f"expected_{name}"
        pred_col = f"Predicted_class_{name}"
        score_col = f"Max_pred_{name}"

        known_df = df[df[true_col] != "unknown"]
        nb_unknown = len(df) - len(known_df)
        print(
            f"Removing {nb_unknown} rows with unknown.\nLeft with {len(known_df)} rows."
        )

        confident_df = known_df[known_df[score_col] >= min_pred]
        nb_low_conf = len(known_df) - len(confident_df)
        print(
            f"Removing {nb_low_conf} rows with low confidence.\nLeft with {len(confident_df)} rows\n"
        )

        y_true = confident_df[true_col]
        y_pred = confident_df[pred_col]
        # Confusion matrix labels: every class seen in either the truth or the predictions.
        labels = sorted(set(y_true.unique()).union(y_pred.unique()))

        print(classification_report(y_true, y_pred, zero_division=0, digits=4))

        matrix = sk_cm(y_true, y_pred, labels=labels)
        print(labels)
        print(str(matrix) + "\n")

In [None]:
for dataframe in [df, no_cell_line_df]:
    print_metrics(dataframe, min_pred=0.6)

What about life stage performance for similar cell types? Difficult to know without extensive labeling, but ENCODE results suggest it has a significant effect.

## Summary metrics by assay

In [None]:
metrics_handler = MetricsPerAssay()

In [None]:
df["in_epiatlas"] = df["is_EpiAtlas_EpiRR"].astype(str) != "0"
display(df["in_epiatlas"].value_counts(dropna=False))

In [None]:
# Categories to evaluate; each "{}" placeholder below is presumably filled with
# a category name to obtain the matching column - TODO confirm in MetricsPerAssay.
categories = ["assay7", "sex", "cancer", "donorlife"]
column_templates = {
    "True": "expected_{}",
    "Predicted": "Predicted_class_{}",
    "Max pred": "Max_pred_{}",
}
# The consensus target acts as the expected label of the assay7 classifier, so
# the "expected_{}" template also resolves for "assay7".
df["expected_assay7"] = df["manual_target_consensus"]

# Keyword arguments passed to the metrics handler through `compute_fct_kwargs`.
# NOTE(review): the meaning of each key is defined by MetricsPerAssay, not shown here.
compute_fct_kwargs = {
    "no_epiatlas": True,
    "merge_assays": False,
    "categories": categories,
    "column_templates": column_templates,
    "assay_label": "manual_target_consensus",
    "core_assays": CORE_ASSAYS + ["no_consensus"],
    "non_core_assays": [],
}

In [398]:
base_filename = "C-A_acc_per_assay"

metrics_handler.compute_multiple_metric_formats(
    preds=df,
    general_filename=base_filename,
    folders_to_save=[ca_pred_path.parent.parent],
    verbose=False,
    return_df=False,
    compute_fct_kwargs=compute_fct_kwargs,
)