In [None]:
"""Workbook to analyse encode predictions.
"""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, unused-import, unused-argument, too-many-branches, pointless-statement

## SETUP

In [None]:
%load_ext autoreload
%autoreload 2

In [217]:
from __future__ import annotations

import functools
from pathlib import Path
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from IPython.display import display
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

from epi_ml.core.confusion_matrix import ConfusionMatrixWriter
from epi_ml.utils.classification_merging_utils import merge_dataframes
from epi_ml.utils.notebooks.paper.paper_utilities import (
    ASSAY,
    ASSAY_MERGE_DICT,
    ASSAY_ORDER,
    CELL_TYPE,
    LIFE_STAGE,
    SEX,
    IHECColorMap,
    MetadataHandler,
    SplitResultsHandler,
    add_second_highest_prediction,
    display_perc,
)

# from plotly.subplots import make_subplots

In [218]:
CANCER = "harmonized_sample_cancer_high"
CORE_ASSAYS = ASSAY_ORDER[0:7]

In [219]:
base_dir = Path.home() / "Projects/epiclass/output/paper"
base_data_dir = base_dir / "data"
base_fig_dir = base_dir / "figures"
paper_dir = base_dir

if not base_fig_dir.exists():
    raise FileNotFoundError(f"Directory {base_fig_dir} does not exist.")

In [220]:
metadata_handler = MetadataHandler(paper_dir)
split_results_handler = SplitResultsHandler()

In [221]:
# The imported class name is rebound to its instance (later cells use the instance under
# this name). Only instantiate while the name still refers to the class, so that
# re-running this cell does not try to call the instance created by a previous run.
if isinstance(IHECColorMap, type):
    IHECColorMap = IHECColorMap(base_fig_dir)
assay_colors = IHECColorMap.assay_color_map

In [222]:
encode_metadata_dir = base_data_dir / "metadata" / "encode"

In [223]:
encode_predictions_dir = base_data_dir / "training_results" / "predictions" / "encode"

In [224]:
for path in [encode_metadata_dir, encode_predictions_dir]:
    if not path.exists():
        raise FileNotFoundError(f"Directory {path} does not exist.")

In [225]:
accepted_ct = [
    "T cell",
    "neutrophil",
    "brain",
    "monocyte",
    "lymphocyte of B lineage",
    "myeloid cell",
    "venous blood",
    "macrophage",
    "mesoderm-derived structure",
    "endoderm-derived structure",
    "colon",
    "connective tissue cell",
    "hepatocyte",
    "mammary gland epithelial cell",
    "muscle organ",
    "extraembryonic cell",
]
accepted_ct = [ct.lower() for ct in accepted_ct]

### Merge all pred if available

In [226]:
full_metadata_path = encode_metadata_dir / "encode_metadata_2023-10-25.csv"
full_metadata_df = pd.read_csv(full_metadata_path)
full_metadata_df.set_index("md5sum", inplace=True)

In [227]:
# Load one prediction file per classifier folder, keyed by the predicted category.
pred_dfs = {}
# sorted() makes the traversal order (and therefore the dict order) deterministic.
for folder in sorted(encode_predictions_dir.glob("*")):
    if not folder.is_dir():
        continue

    pred_files = sorted(folder.rglob("complete_no_valid_oversample_*.csv"))
    if not pred_files:
        # A folder without prediction file is reported instead of failing with IndexError
        print(f"No prediction file found in {folder}")
        continue
    if len(pred_files) > 1:
        print(
            f"Found {len(pred_files)} prediction files in {folder}, using {pred_files[0].name}"
        )

    # match categories with dir names of format: [cat_name]_1l_3000n
    cat = folder.name.split("_1l_3000n")[0]
    pred_dfs[cat] = pd.read_csv(pred_files[0])

In [228]:
# Presumably the number of leading columns that are not part of the prediction
# vector, counted before "Same?" is dropped - TODO confirm against the prediction files.
same_col_names = 8
# Make all different columns have unique relevant names except for the pred vector
# NOTE: the frames are modified in place, so this cell can only run once per load:
# a second run fails when dropping the already removed "Same?" column.
for cat, df in pred_dfs.items():
    df.drop(columns=["Same?"], inplace=True)
    # Columns at positions 1 to 6 (after the drop) get the category as suffix;
    # position 0 is left untouched.
    old_names = df.columns[1 : same_col_names - 1]
    new_names = [f"{old_name} ({cat})" for old_name in old_names]
    df.rename(columns=dict(zip(old_names, new_names)), inplace=True)
    # md5sum becomes the index, used to align the frames when merging
    df.set_index("md5sum", inplace=True)
    pred_dfs[cat] = df

In [229]:
df_order = [ASSAY, CELL_TYPE, SEX, LIFE_STAGE, CANCER]
df_list = [pred_dfs[cat] for cat in df_order]
full_merged_df = functools.reduce(merge_dataframes, df_list)

In [230]:
meta_columns = full_metadata_df.columns
result_columns = full_merged_df.columns

In [231]:
full_merged_df = full_merged_df.merge(
    full_metadata_df,
    left_index=True,
    right_index=True,
    how="inner",
    suffixes=("", "_delete"),
)
full_merged_df = full_merged_df.filter(regex=r"^(?:(?!_delete).)+$")

In [232]:
meta_col_order = [col for col in meta_columns if col in full_merged_df.columns]
results_col_order = [col for col in result_columns if col in full_merged_df.columns]

new_order = results_col_order + meta_col_order
full_merged_df = full_merged_df[new_order]

In [233]:
full_merged_df["True class (harmonized_donor_sex)"] = full_merged_df["donor_sex"]
full_merged_df["True class (harmonized_donor_life_stage)"] = full_merged_df["life_stage"]
full_merged_df["True class (harmonized_sample_cancer_high)"] = full_merged_df[
    "cancer_status"
]
full_merged_df["True class (assay_epiclass)"] = full_merged_df["assay_epiclass"]

### EpiAtlas overlap

In [234]:
encode_epiatlas_mapping_path = encode_metadata_dir / "ENCODE_IHEC_keys.tsv"
encode_epiatlas_mapping_df = pd.read_csv(encode_epiatlas_mapping_path, sep="\t")

In [235]:
in_epiatlas = encode_epiatlas_mapping_df[
    encode_epiatlas_mapping_df["is_EpiAtlas_EpiRR"].notnull()
]["ENC_ID"].tolist()

In [236]:
full_merged_df["in_EpiAtlas"] = full_merged_df.index.isin(in_epiatlas)

## CELL TYPE

#### Getting ontology info

In [237]:
encode_metadata_dir = base_data_dir / "metadata" / "encode"
curie_def_df = pd.read_csv(
    encode_metadata_dir / "EpiAtlas_list-curie_term_HSOI.tsv",
    sep="\t",
    names=["code", "term", CELL_TYPE],
)
encode_ontology_df = pd.read_csv(encode_metadata_dir / "encode_ontol+assay.tsv", sep="\t")

In [None]:
encode_ontology_df.shape

In [None]:
encode_ontology_df.head()

In [None]:
with pd.option_context("display.max_rows", None):
    display(curie_def_df.groupby(CELL_TYPE).size())

In [241]:
metadata_df = encode_ontology_df.merge(
    curie_def_df, left_on="Biosample term id", right_on="code", how="left"
)
metadata_df = metadata_df.drop(columns=["code", "term"])

In [None]:
display(metadata_df.shape)

In [243]:
metadata_df[CELL_TYPE] = metadata_df[CELL_TYPE].str.lower().copy()

In [244]:
full_merged_df = full_merged_df.merge(
    metadata_df, left_index=True, right_on="ENC_ID", how="left"
)
full_merged_df["True class (harmonized_sample_ontology_intermediate)"] = full_merged_df[
    "harmonized_sample_ontology_intermediate"
]
full_merged_df.set_index("md5sum_encode", inplace=True)
full_merged_df.to_csv(encode_predictions_dir / "encode_predictions_augmented_merged.csv")

In [None]:
only_good_ct_df = metadata_df[metadata_df[CELL_TYPE].isin(accepted_ct)]
only_good_ct_count = only_good_ct_df[CELL_TYPE].value_counts()
display_perc(only_good_ct_count / only_good_ct_count.sum() * 100)

In [None]:
counts = metadata_df[CELL_TYPE].value_counts(dropna=False)
display_perc(counts / counts.sum() * 100)

In [None]:
counts_good = counts[counts.index.isin(accepted_ct)]
display_perc(counts_good / counts.sum() * 100)

#### Missing harmonized_sample_ontology_intermediate details

In [None]:
# check term on missing CELL_TYPE
missing_cell_type = metadata_df[metadata_df[CELL_TYPE].isna()]
print(missing_cell_type.shape)

biosample_cols = ["Biosample term id", "Biosample term name"]

missing_count = missing_cell_type[biosample_cols].value_counts()
display(missing_count.shape)
with pd.option_context(
    "display.float_format",
    "{:.2f}".format,  # pylint: disable=consider-using-f-string
    "display.max_rows",
    None,
):
    display(missing_count / missing_count.sum() * 100)

In [249]:
t_cell_types = [
    name for name in missing_cell_type["Biosample term name"].unique() if "T cell" in name
]
b_cell_types = [
    name for name in missing_cell_type["Biosample term name"].unique() if "B cell" in name
]

In [None]:
t_cell_count = missing_cell_type[
    missing_cell_type["Biosample term name"].isin(t_cell_types)
][biosample_cols].value_counts()
display(t_cell_count, t_cell_count.sum())

In [None]:
b_cell_count = missing_cell_type[
    missing_cell_type["Biosample term name"].isin(b_cell_types)
][biosample_cols].value_counts()
display(b_cell_count, b_cell_count.sum())

perc_missing = (
    (t_cell_count.sum() + b_cell_count.sum()) / missing_cell_type.shape[0] * 100
)
print(f"t+b cells, percentage of missing cell types: {perc_missing:.2f}%")

#### Match predictions from various trainings with ontology info

#### Computing accuracies

In [252]:
pred_folder = (
    base_data_dir
    / "training_results/dfreeze_v2/hg38_100kb_all_none/harmonized_sample_ontology_intermediate_1l_3000n/complete-no_valid-oversampling"
)

In [253]:
metadata_df["Assay"] = metadata_df["Assay"].str.lower()
metadata_df[CELL_TYPE] = metadata_df[CELL_TYPE].str.lower()

# Only rows with a missing assay are dropped.
# A former `dropna(subset=[CELL_TYPE])` was immediately overwritten by this line and
# never had any effect, so it is removed: rows with a missing cell type are still kept,
# exactly as before. The cell type analyses filter on `accepted_ct` anyway.
# NOTE(review): if missing cell types must really be excluded here, use
# `dropna(subset=[CELL_TYPE, "Assay"])`, but this also removes those samples from the
# non-core assay analysis that merges on `non_core_metadata_df`.
df = metadata_df.dropna(subset=["Assay"])

# .copy(): these subsets get columns assigned later on, which would otherwise trigger
# SettingWithCopyWarning on a slice of `metadata_df`.
non_core_metadata_df = df[~df["Assay"].isin(ASSAY_ORDER)].copy()
core_metadata_df = df[df["Assay"].isin(ASSAY_ORDER)].copy()

In [None]:
non_core_metadata_df.columns

In [255]:
# counts = metadata_df["Assay"].value_counts(dropna=False)
# print(len(counts))
# counts.to_csv(
#     path_or_buf=Path().home() / "downloads" / "encode_assay_counts.csv",
#     sep=",",
#     header=True,
# )

In [None]:
display(non_core_metadata_df[CELL_TYPE].value_counts(dropna=False))

In [None]:
non_core_metadata_df[CELL_TYPE] = non_core_metadata_df[CELL_TYPE].str.lower().copy()

In [None]:
# Only keep the predictions for the 16 cell types
# `accepted_ct` (already lower-cased) is defined once in the setup section. It is reused
# here rather than redefined with an identical copy of the list, so that the accepted
# cell types have a single source of truth.
print(non_core_metadata_df.shape)
metadata_16ct = non_core_metadata_df[non_core_metadata_df[CELL_TYPE].isin(accepted_ct)]
print(metadata_16ct.shape)

In [None]:
assay_counts = metadata_16ct["Assay"].value_counts(dropna=False)
display_perc(assay_counts / assay_counts.sum() * 100)

In [None]:
pred_folder

In [261]:
pred_dfs_dict = {}
for folder in pred_folder.glob("*"):
    if not folder.is_dir():
        print(f"Skipping {folder}")
        continue
    pred_file = list(folder.glob("predictions/*.csv"))

    if len(pred_file) > 1:
        print(f"More than one prediction file found in {folder}")
        continue

    if len(pred_file) == 0:
        print(f"No prediction file found in {folder}")
        continue

    pred_file = pred_file[0]

    pred_df = pd.read_csv(pred_file)
    name = folder.name.replace("complete_no_valid_oversample_", "")

    for col in ["True class", "Predicted class"]:
        pred_df[col] = pred_df[col].str.lower()

    # Remove epiatlas overlap
    pred_df = pred_df[~pred_df["md5sum"].isin(in_epiatlas)]

    pred_dfs_dict[name] = pred_df

In [None]:
print(pred_dfs_dict.keys())

In [263]:
def compute_cell_type_acc(
    metadata_df: pd.DataFrame,
    pred_dfs_dict: Dict[str, pd.DataFrame],
    min_pred: float = 0.6,
) -> None:
    """Print accuracy and macro F1 of the cell type predictions, per prediction set.

    Only metadata rows whose cell type is in `accepted_ct` are considered. Each
    prediction dataframe is inner-joined to them (prediction "md5sum" == metadata
    "ENC_ID"). Metrics are printed twice: for all matched samples, and for the
    samples with "Max pred" strictly above `min_pred`.

    The macro F1 is averaged over the labels present among the predicted classes only.

    Args:
        metadata_df: Metadata with an "ENC_ID" column and a CELL_TYPE column.
        pred_dfs_dict: Prediction dataframes by name, each with "md5sum",
            "Predicted class" and "Max pred" columns.
        min_pred: Prediction score threshold used for the filtered metrics.
    """
    meta_df = metadata_df[metadata_df[CELL_TYPE].isin(accepted_ct)].copy()

    for name, pred_df in sorted(pred_dfs_dict.items()):
        print(name)
        pred_w_ct = pred_df.merge(
            meta_df, left_on="md5sum", right_on="ENC_ID", how="inner"
        )
        N = pred_w_ct.shape[0]
        if N == 0:
            # Nothing to evaluate (e.g. an assay without any accepted cell type):
            # avoid dividing by zero and computing metrics on empty inputs.
            print("No prediction matches the metadata, skipping.\n")
            continue

        # Calculate results for all predictions
        true, pred = pred_w_ct[CELL_TYPE], pred_w_ct["Predicted class"]

        total_correct = (true == pred).sum()
        acc = total_correct / N
        f1 = f1_score(true, pred, labels=pred.unique(), average="macro")

        print(f"Acc (pred>0.0): {total_correct}/{N} ({acc:.2%})")
        print(f"F1 (pred>0.0): {f1:.2f}")

        # Calculate results for predictions with a score above min_pred
        pred_w_ct_filtered = pred_w_ct[pred_w_ct["Max pred"] > min_pred]
        N_filtered = pred_w_ct_filtered.shape[0]
        diff = N - N_filtered

        if N_filtered == 0:
            print(f"No prediction left with pred>{min_pred:.1f}")
            print(f"Samples ignored at {min_pred:.1f}: {diff} ({diff/N:.2%})\n")
            continue

        true, pred = pred_w_ct_filtered[CELL_TYPE], pred_w_ct_filtered["Predicted class"]

        total_correct_filtered = (true == pred).sum()
        perc_filtered = total_correct_filtered / N_filtered

        f1 = f1_score(true, pred, labels=pred.unique(), average="macro")

        print(
            f"Acc (pred>{min_pred:.1f}): {total_correct_filtered}/{N_filtered} ({perc_filtered:.2%})"
        )
        print(f"F1 (pred>{min_pred}): {f1:.2f}")
        print(f"Samples ignored at {min_pred:.1f}: {diff} ({diff/N:.2%})\n")

In [None]:
compute_cell_type_acc(non_core_metadata_df, pred_dfs_dict)

In [None]:
compute_cell_type_acc(core_metadata_df, pred_dfs_dict)

In [None]:
limited_pred_dfs_dict = {k: v for k, v in pred_dfs_dict.items() if "-ct16" in k}
for label in core_metadata_df["Assay"].unique():
    print(label)
    compute_cell_type_acc(
        core_metadata_df[core_metadata_df["Assay"] == label], limited_pred_dfs_dict
    )

#### Confusion matrices

In [267]:
this_logdir = (
    base_fig_dir
    / "encode_predictions"
    / "confusion_matrices"
    / "sample_ontology"
    / "core"
)
if not this_logdir.exists():
    this_logdir.mkdir(parents=True)

meta_df = core_metadata_df[core_metadata_df[CELL_TYPE].isin(accepted_ct)].copy()

limited_pred_dfs_dict = {k: v for k, v in pred_dfs_dict.items() if "-ct16" in k}
for name, df in limited_pred_dfs_dict.items():
    pred_w_ct = df.merge(meta_df, left_on="md5sum", right_on="ENC_ID", how="inner")
    for threshold in [0, 0.6, 0.9]:
        sub_df = pred_w_ct[pred_w_ct["Max pred"] >= threshold]

        true, pred = sub_df[CELL_TYPE], sub_df["Predicted class"]
        f1 = f1_score(true, pred, labels=pred.unique(), average="macro")
        # cm = confusion_matrix(true, pred, labels=accepted_ct)

        # writer = ConfusionMatrixWriter(labels=accepted_ct, confusion_matrix=cm)
        # writer.to_all_formats(
        #     logdir=this_logdir,
        #     name=f"{name}-core-confusion_matrix-{threshold*100}",
        # )
        # plt.close("all")

## ASSAY

Download note
~~~bash
paper_dir="/home/local/USHERBROOKE/rabj2301/Projects/epiclass/output/paper/data/training_results/dfreeze_v2/hg38_100kb_all_none/assay_epiclass_1l_3000n"
cd $paper_dir
base_path="/lustre06/project/6007515/rabyj/epiclass-project/output/epiclass-logs/epiatlas-dfreeze-v2.1/hg38_100kb_all_none/assay_epiclass_1l_3000n"
rsync -avR --exclude "*/EpiLaP/" --exclude "*.png" --exclude "*confusion*" --exclude "*.md5" narval:${base_path}/./*c/complete_no_valid_oversample .

paper_dir="/home/local/USHERBROOKE/rabj2301/Projects/epiclass/output/paper/data/training_results/dfreeze_v2"
cd $paper_dir
base_path="/lustre06/project/6007515/rabyj/epiclass-project/output/epiclass-logs/epiatlas-dfreeze-v2.1"
rsync -avR --exclude "*/EpiLaP/" --exclude "*.png" --exclude "*confusion*" --exclude "*.md5" narval:${base_path}/./hg38_100kb_all_none_w_encode_noncore/assay_epiclass_1l_3000n/complete_no_valid_oversample-0 .

find -type f -name "*.list*.csv" -print0 | xargs -0 rename 's/\.list//g'
~~~

In [268]:
data_dir = base_data_dir / "training_results" / "dfreeze_v2"
assay7_folder = (
    data_dir / f"hg38_100kb_all_none/{ASSAY}_1l_3000n/7c/complete_no_valid_oversample"
)
assay11_folder = (
    data_dir / f"hg38_100kb_all_none/{ASSAY}_1l_3000n/11c/complete_no_valid_oversample"
)
assay13_folder = (
    data_dir
    / f"hg38_100kb_all_none_w_encode_noncore/{ASSAY}_1l_3000n/complete_no_valid_oversample-0"
)

In [None]:
encode_metadata_path = encode_metadata_dir / "ENCODE_IHEC_keys.tsv"
assay_core_metadata_df = pd.read_csv(encode_metadata_path, sep="\t")
print(assay_core_metadata_df.shape)

In [None]:
assay_core_metadata_df["assay_epiclass"].value_counts(dropna=False)

In [None]:
# Load the ENCODE predictions of each assay classifier and attach the assay metadata.
pred_dfs_dict = {}
for name, folder in zip(
    ["7c", "11c", "13c"], [assay7_folder, assay11_folder, assay13_folder]
):
    if not folder.exists():
        print(f"Folder {folder} does not exist.")
        continue

    # Distinct name: `pred_folder` already designates the cell type predictions folder
    # used by an earlier cell, and must not be silently rebound here.
    assay_pred_folder = folder / "predictions" / "encode"
    if not assay_pred_folder.exists():
        print(f"Folder {assay_pred_folder} does not exist.")
        continue

    pred_files = list(assay_pred_folder.glob("*.csv"))
    if len(pred_files) != 1:
        print(f"Found {len(pred_files)} files in {assay_pred_folder}.")
        continue

    pred_df = pd.read_csv(pred_files[0], sep=",")
    # "Same?" is dropped when present; its absence is not an error
    pred_df = pred_df.drop(columns=["Same?"], errors="ignore")

    # Add assay metadata
    pred_df = pred_df.merge(
        assay_core_metadata_df, left_on="md5sum", right_on="ENC_ID", how="left"
    )

    pred_df["True class"] = pred_df["assay_epiclass"]
    pred_dfs_dict[name] = pred_df

### Core7 preds

In [None]:
# `data_dir` is deliberately not rebound here: it designates the training results folder
# used to build the classifier paths, which is unrelated to this output folder.
output_dir = base_data_dir / "training_results" / "encode_predictions"
min_pred = 0.6  # prediction score threshold for the filtered accuracies


def _format_acc(nb_correct: int, nb_total: int) -> str:
    """Format 'correct/total (xx.xx%)'; the percentage is 'n/a' when there is no sample."""
    perc = f"{nb_correct / nb_total:.2%}" if nb_total else "n/a"
    return f"{nb_correct}/{nb_total} ({perc})"


for name, df in pred_dfs_dict.items():
    print(name)
    # print(df.shape)

    # Only consider files already labeled with core7 assays
    df = df[df[ASSAY].isin(CORE_ASSAYS)]

    # Only consider non-EpiAtlas samples
    df = df[df["is_EpiAtlas_EpiRR"].isna()]

    # df.to_csv(output_dir / f"encode_only-core-{name}_predictions.csv", index=False)
    # break

    # Calculate results for all predictions
    correct_pred = df["Predicted class"] == df["True class"]
    print(f"Acc (pred>=0.0) {_format_acc(correct_pred.sum(), df.shape[0])}")

    # Per-assay results, for predictions with a score of at least min_pred
    for assay in CORE_ASSAYS:
        df_assay = df[(df[ASSAY] == assay) & (df["Max pred"] >= min_pred)]
        correct_pred = df_assay["Predicted class"] == df_assay["True class"]
        print(
            f"Acc (pred>={min_pred:.1f}) {assay} = {_format_acc(correct_pred.sum(), df_assay.shape[0])}"
        )

    # Calculate results for all assays together, with a score of at least min_pred
    df_filtered = df[df["Max pred"] >= min_pred]
    correct_pred_filtered = df_filtered["Predicted class"] == df_filtered["True class"]
    print(
        f"Acc (pred>={min_pred:.1f}): {_format_acc(correct_pred_filtered.sum(), df_filtered.shape[0])}"
    )

    # df_filtered_wrong = df_filtered[~correct_pred_filtered]
    # groupby = (
    #     df_filtered_wrong.groupby(["True class", "Predicted class"])
    #     .size()
    #     .sort_values(ascending=False)
    # )
    # display("Mislabels:", groupby)

    # df_filtered_wrong.to_csv(
    #     output_dir / f"encode_only_mislabels_minPred0.6_{name}.csv", index=False
    # )

### non-core 7c preds

In [166]:
# 7c preds on non-core assays
name = "7c"
df = pred_dfs_dict[name]

df = df.merge(non_core_metadata_df, left_on="md5sum", right_on="ENC_ID", how="left")
df = df[~df["True class"].isin(ASSAY_ORDER)]

In [None]:
print(df.columns)
display(df["Assay"].value_counts(dropna=False))

In [168]:
output_dir = data_dir = (
    base_data_dir / "training_results" / "predictions" / "encode" / "assay_epiclass"
)
for min_pred in [0, 0.6, 0.8]:
    df_filtered = df[df["Max pred"] >= min_pred]
    groupby = (
        df_filtered.groupby(["Predicted class", "Assay"])
        .size()
        .reset_index(name="Count")
        .sort_values(["Predicted class", "Count"], ascending=[True, False])
        .set_index(["Predicted class", "Assay"])["Count"]
    )
    # groupby.to_csv(
    #     output_dir / f"encode_non-core_{name}_predictions_minPred{min_pred}.csv"
    # )

In [None]:
encode_metadata_dir = base_data_dir / "metadata/encode"
non_core_categories_path = (
    encode_metadata_dir / "non-core_encode_assay_category_2024-08-29.csv"
)
if not non_core_categories_path.exists():
    raise FileNotFoundError(f"File {non_core_categories_path} does not exist.")

non_core_categories_df = pd.read_csv(non_core_categories_path, sep=",")
print(non_core_categories_df.columns)

In [170]:
non_core_categories_df.columns = ["assay", "assay_category", "note"]

In [None]:
df_w_cats = df.merge(
    non_core_categories_df[["assay", "assay_category"]],
    left_on="Assay",
    right_on="assay",
    how="left",
)
print(df_w_cats.shape)

In [None]:
if "Assay" in df_w_cats.columns:
    df_w_cats.drop(columns=["Assay"], inplace=True)
df_w_cats["assay_category"].value_counts(dropna=False)

In [None]:
# print non-core assay categories for each predicted class
min_pred = 0.6
for predicted_class, group in df_w_cats.groupby("Predicted class"):
    print(predicted_class, group.shape[0])
    group = group[group["Max pred"] >= min_pred]
    print(f"min_pred={min_pred}: {group.shape[0]} samples left")
    groupby = (
        group.groupby(["assay_category", "assay"])
        .size()
        .reset_index(name="Count")
        .sort_values(["assay_category", "Count"], ascending=[True, False])
        .set_index(["assay_category", "assay"])["Count"]
    )
    with pd.option_context(
        "display.max_rows",
        None,
    ):
        # display(groupby)
        pass

In [174]:
def create_non_core_preds_df(df: pd.DataFrame, min_pred: float = 0.6):
    """Count, for each non-core assay, how many samples were predicted as each class.

    Args:
        df: Predictions with "assay", "assay_category", "Predicted class" and
            "Max pred" columns.
        min_pred: Only rows with "Max pred" >= min_pred are counted.

    Returns:
        A dataframe with one row per assay, one integer count column per predicted
        class (0 when the class was never predicted for the assay), and an
        "assay_category" column.
    """
    category_of_assay = dict(zip(df["assay"], df["assay_category"]))

    counts_per_assay = {}
    for assay_name, assay_rows in df.groupby("assay"):
        confident_rows = assay_rows[assay_rows["Max pred"] >= min_pred]

        # Number of samples per predicted class, most frequent class first
        class_counts = confident_rows.groupby(["Predicted class"]).size()
        class_counts = class_counts.reset_index(name="Count")  # type: ignore
        class_counts = class_counts.sort_values(["Count"], ascending=False)

        counts_per_assay[assay_name] = dict(
            zip(class_counts["Predicted class"], class_counts["Count"])
        )

    # Columns are assays at first; missing (assay, class) pairs become 0 counts.
    # Transposing afterwards puts one assay per row.
    result_df = pd.DataFrame(counts_per_assay).fillna(0).astype(int).T
    result_df["assay_category"] = result_df.index.map(category_of_assay)
    return result_df

In [175]:
min_pred = 0.6
predicted_classes_df = create_non_core_preds_df(df_w_cats, min_pred=min_pred)
# predicted_classes_df.to_csv(
#     output_dir / f"encode_non-core_7c_predictions_per_assay_minPred{min_pred:.2f}.csv"
# )

In [None]:
predicted_classes_df["assay_category"].value_counts(dropna=False)

In [177]:
def create_structured_dataframe(df_w_cats):
    """Create a structured dataframe with the percentage of predictions for each assay category.

    For each predicted class and each prediction score threshold (0 to 0.95 by steps
    of 0.05, plus 0.99), the samples with "Max pred" >= threshold are counted per
    assay category (missing categories included).

    Args:
        df_w_cats: Predictions with "Predicted class", "Max pred" and
            "assay_category" columns.

    Returns:
        A dataframe indexed by ("Predicted class", "Min pred", "assay_category") with
        "Percentage", "Count" and "Total samples" columns. It is empty (with the same
        index levels and columns) when there is no prediction.
    """
    # Rounding removes the float drift of np.arange (e.g. 0.15000000000000002), which
    # would otherwise exclude samples whose score is exactly on a threshold.
    thresholds = [float(t) for t in np.round(np.arange(0, 1, 0.05), 2)] + [0.99]

    index_cols = ["Predicted class", "Min pred", "assay_category"]
    value_cols = ["Percentage", "Count", "Total samples"]

    data = []
    for predicted_class, group in df_w_cats.groupby("Predicted class"):
        for min_pred in thresholds:
            df_filtered = group[group["Max pred"] >= min_pred]
            counts = df_filtered["assay_category"].value_counts(dropna=False)
            total = counts.sum()

            # Empty when no sample is left: no row is added for this threshold
            percentages = (counts / total * 100).round(2)

            # Iterate by position so that a missing (NaN) category needs no label lookup
            for assay_category, count, percentage in zip(
                counts.index, counts.to_numpy(), percentages.to_numpy()
            ):
                data.append(
                    {
                        "Predicted class": predicted_class,
                        "Min pred": min_pred,
                        "assay_category": assay_category,
                        "Percentage": percentage,
                        "Count": count,
                        "Total samples": total,
                    }
                )

    # Explicit columns keep set_index valid even when there is no data
    df_structured = pd.DataFrame(data, columns=index_cols + value_cols)
    return df_structured.set_index(index_cols)

In [None]:
assay_category_df = create_structured_dataframe(df_w_cats)
display(assay_category_df)
# output_path = output_dir / "encode_non-core_7c_predictions_assay_category.csv"
# assay_category_df.to_csv(output_path)

In [179]:
section_fig_dir = base_fig_dir / "encode_predictions" / "assay_epiclass" / "non-core"
if not section_fig_dir.exists():
    raise FileNotFoundError(f"Directory {section_fig_dir} does not exist.")

#### X = assay_epiclass, stack = assay_category

In [None]:
fig_dir = section_fig_dir / "stacked_bar_X_assay_epiclass"
fig_dir.mkdir(parents=False, exist_ok=True)

bar_df = assay_category_df.reset_index()

predicted_class_order = [
    "h3k27ac",
    "h3k4me3",
    "h3k4me1",
    "h3k9me3",
    "h3k27me3",
    "h3k36me3",
    "input",
]
assay_category_color_map = {
    cat: px.colors.qualitative.Safe[i]
    for i, cat in enumerate(sorted(bar_df["assay_category"].unique()))
}

for min_pred in [0, 0.6, 0.9]:
    sub_df = bar_df[
        (bar_df["Min pred"] > min_pred - 0.01) & (bar_df["Min pred"] < min_pred + 0.01)
    ]
    fig = px.bar(
        sub_df,
        x="Predicted class",
        y="Percentage",
        color="assay_category",
        title=f"Assay Category Composition for Each Predicted Class at predScore >= {min_pred:.2f}",
        labels={"Percentage": "Percentage (%)", "Predicted class": "Predicted Class"},
        barmode="stack",
        category_orders={"Predicted class": predicted_class_order},
        color_discrete_map=assay_category_color_map,
    )

    figname = f"histogram_encode_non-core_assay_epiclass_minPred{min_pred:.2f}"
    fig.write_html(fig_dir / f"{figname}.html")
    fig.write_image(fig_dir / f"{figname}.png")
    fig.write_image(fig_dir / f"{figname}.svg")
    fig.show()

#### X = assay_category, stack = assay_epiclass

In [181]:
df = df_w_cats[df_w_cats["assay_category"] != "not_looked"]

In [182]:
assay_epiclass_order = [
    "h3k27ac",
    "h3k4me3",
    "h3k4me1",
    "h3k9me3",
    "h3k27me3",
    "h3k36me3",
    "input",
]
assay_epiclass_order = {assay: i for i, assay in enumerate(assay_epiclass_order)}

In [None]:
fig_dir = section_fig_dir / "stacked_bar_X_assay_category"
fig_dir.mkdir(parents=False, exist_ok=True)

assay_categories_order = [
    "trx_reg",
    "heterochrom",
    "polycomb",
    "splicing",
    "insulator",
    "other/mixed",
]

# One stacked bar per assay category, split by predicted class, for each threshold
for min_pred in [0, 0.6]:
    sub_df = df[df["Max pred"] >= min_pred]
    groupby = (
        sub_df.groupby(["assay_category", "Predicted class"])
        .size()
        .reset_index(name="Count")
        .sort_values(["assay_category", "Count"], ascending=[True, False])
    )
    # Share of each predicted class within its assay category
    groupby["Percentage"] = groupby.groupby("assay_category")["Count"].transform(
        lambda x: (x / x.sum()) * 100
    )

    # Add order for plotting
    groupby["assay_order"] = groupby["Predicted class"].map(assay_epiclass_order)
    groupby = groupby.sort_values(
        ["assay_category", "assay_order"], ascending=[False, True]
    )

    # Main plot
    fig = px.bar(
        groupby,
        x="assay_category",
        y="Percentage",
        color="Predicted class",
        barmode="stack",
        category_orders={"assay_category": assay_categories_order},
        color_discrete_map=assay_colors,
        title=f"core7 predictions for non-core assays, predScore >= {min_pred:.2f}",
        labels={"Percentage": "Percentage (%)", "assay_category": "Assay Category"},
    )

    # Modify x-axis labels
    total_counts = groupby.groupby("assay_category")["Count"].sum()

    # .get(): a category may have no sample left at this threshold, in which case it is
    # absent from `total_counts` and a direct lookup would raise KeyError.
    ticktext = [
        f"{assay_category} (N={total_counts.get(assay_category, 0)})"
        for assay_category in assay_categories_order
    ]
    fig.update_xaxes(tickvals=assay_categories_order, ticktext=ticktext)

    # Save and display
    figname = f"histogram_encode_non-core_assay_epiclass_minPred{min_pred:.2f}"
    fig.write_html(fig_dir / f"{figname}.html")
    fig.write_image(fig_dir / f"{figname}.png")
    fig.write_image(fig_dir / f"{figname}.svg")
    fig.show()

#### Assay category evolution with min_predScore

In [184]:
def create_assay_category_graphs(df: pd.DataFrame, output_dir: Path) -> None:
    """Graph assay category distribution for each predicted class.

    One figure is created per predicted class: one line per assay category showing
    its "Percentage" as a function of "Min pred", plus a dashed black line showing
    the percentage of samples conserved at each "Min pred" relative to "Min pred" 0.
    Each figure is written to `output_dir` as png, svg and html, then displayed.

    Args:
        df: Dataframe whose index has the levels "Predicted class", "Min pred" and
            "assay_category", with "Percentage" and "Total samples" columns.
        output_dir: Directory where the figures are written.
    """
    # Get unique predicted classes
    predicted_classes = df.index.get_level_values("Predicted class").unique()
    assay_categories = df.index.get_level_values("assay_category").unique()

    # One colour per category, assigned by sorted position over all predicted classes,
    # so that a category keeps the same colour in every figure.
    graph_colors = {
        cat: px.colors.qualitative.Safe[i]
        for i, cat in enumerate(sorted(assay_categories))
    }

    # Create a figure for each predicted class
    for predicted_class in predicted_classes:
        df_class = df.loc[predicted_class]

        # Get unique assay categories for this predicted class
        # (rebinds `assay_categories`; the colour map above is already built)
        assay_categories = df_class.index.get_level_values("assay_category").unique()

        # Reference sample count: total at "Min pred" 0, taken from the first row
        total_samples_at_zero = df_class.xs(0, level="Min pred")["Total samples"].iloc[0]

        # Create the figure
        fig = go.Figure()

        for assay_category in assay_categories:
            # Rows of this category; the remaining index is "Min pred"
            df_assay = df_class.xs(assay_category, level="assay_category")

            fig.add_trace(
                go.Scatter(
                    x=df_assay.index,
                    y=df_assay["Percentage"],
                    mode="lines+markers",
                    name=assay_category,
                    marker=dict(color=graph_colors[assay_category]),
                )
            )

        # Percentage of samples still present at each "Min pred", relative to "Min pred" 0
        conserved_percentages = (
            df_class.groupby("Min pred")["Total samples"].first()
            / total_samples_at_zero
            * 100
        )
        fig.add_trace(
            go.Scatter(
                x=conserved_percentages.index,
                y=conserved_percentages.values,
                mode="lines+markers",
                name="Samples Conserved",
                line=dict(dash="dash", color="black"),
            )
        )

        # Update layout
        fig.update_layout(
            title=f"Composition for Predicted Class: {predicted_class}",
            xaxis_title="Min pred",
            yaxis_title="Percentage Composition",
            legend_title="Assay Category",
            hovermode="x unified",
        )

        # Fixed axis ranges, identical for every predicted class
        fig.update_xaxes(range=[-0.01, 1.01])
        fig.update_yaxes(range=[0, 100])

        # Save
        filename = f"encode_non-core_7c_predictions_assay_category_{predicted_class}"
        fig.write_image(output_dir / f"{filename}.png")
        fig.write_image(output_dir / f"{filename}.svg")
        fig.write_html(output_dir / f"{filename}.html")
        fig.show()

In [185]:
# Assuming df_structured is your dataframe from the previous step
fig_dir = (
    base_fig_dir
    / "encode_predictions"
    / "assay_epiclass"
    / "non-core"
    / "line_graphs_over_min_pred"
)
fig_dir.mkdir(parents=False, exist_ok=True)
# create_assay_category_graphs(df=assay_category_df, output_dir=fig_dir)

## OTHER - Sex, life stage, cancer
Pooling all the predictions together to get the accuracy/F1 of each of the 5 classifiers, on core and non-core data respectively. (For assay and cell type it gets messier: non-core data cannot be evaluated directly.)

In [186]:
# metadata
full_metadata_path = encode_metadata_dir / "encode_metadata_2023-10-25.csv"
full_metadata_df = pd.read_csv(full_metadata_path)

In [None]:
full_metadata_df[ASSAY].value_counts(dropna=False)

In [189]:
# Load the predictions of the remaining classifiers, joined with the full ENCODE metadata.
pred_dfs_dict = {}
for folder in encode_predictions_dir.glob("*"):
    # Assay and cell type (ontology) classifiers are handled in their own sections
    is_excluded = "assay" in folder.name or "ontology" in folder.name
    if not folder.is_dir() or is_excluded:
        continue

    csv_files = list(folder.rglob("*.csv"))
    if len(csv_files) != 1:
        print(f"Found {len(csv_files)} files in {folder}.")
        continue

    pred_file = csv_files[0]
    pred_df = pd.read_csv(pred_file, sep=",")
    # "Same?" is dropped when present; its absence is not an error
    pred_df = pred_df.drop(columns=["Same?"], errors="ignore")
    pred_df = pred_df.merge(full_metadata_df, on="md5sum", how="left")

    classifier_name = folder.name.replace("_1l_3000n", "")
    pred_dfs_dict[classifier_name] = pred_df

In [190]:
CANCER = "harmonized_sample_cancer_high"
true_class_mapping = {
    SEX: "donor_sex",
    LIFE_STAGE: "life_stage",
    CANCER: "cancer_status",
}

for name, pred_df in sorted(pred_dfs_dict.items()):
    pred_df["True class"] = pred_df[true_class_mapping[name]]

In [None]:
for name, pred_df in sorted(pred_dfs_dict.items()):
    print(name)
    display(pred_df["True class"].value_counts(dropna=False))

In [192]:
# Remove samples whose true class is unknown or ambiguous (several comma-separated values).
for name, pred_df in sorted(pred_dfs_dict.items()):
    # na=True: a sample without true class cannot be evaluated either, and a NaN left
    # in the mask would make the `~` negation below fail.
    is_unknown = pred_df["True class"].str.contains(r"unknown|,", case=False, na=True)
    # .copy() so the filtered frame is independent from the original one
    pred_dfs_dict[name] = pred_df[~is_unknown].copy()

In [None]:
for name, pred_df in sorted(pred_dfs_dict.items()):
    print(name)
    display(pred_df["True class"].value_counts(dropna=False))

Removing EpiAtlas EpiRR overlap with ENCODE dataset.

In [None]:
for name, pred_df in sorted(pred_dfs_dict.items()):
    print(name)
    new_pred_df = pred_df[~pred_df["md5sum"].isin(in_epiatlas)]
    print(pred_df.shape, new_pred_df.shape)
    print(f"Removed {pred_df.shape[0] - new_pred_df.shape[0]} EpiAtlas samples.\n")
    pred_dfs_dict[name] = new_pred_df

### Accuracies per assay

Preparing data for plotting

In [195]:
def compute_metrics(df: pd.DataFrame) -> Tuple[float, float]:
    """Compute the accuracy and macro F1-score of the predictions.

    Args:
        df: Table with a "True class" and a "Predicted class" column.

    Returns:
        (accuracy, f1). Both values are NaN when `df` has no rows.

    Note:
        The macro average only covers labels that were predicted at least once
        (`labels=df["Predicted class"].unique()`): a class present in
        "True class" but never predicted does not enter the average.
    """
    # Nothing to score, e.g. no sample passed a prediction-score threshold.
    # Return NaN for both metrics instead of calling f1_score with no labels.
    if len(df) == 0:
        return float("nan"), float("nan")

    acc = (df["True class"] == df["Predicted class"]).mean()
    f1 = f1_score(
        df["True class"],
        df["Predicted class"],
        labels=df["Predicted class"].unique(),
        average="macro",
    )
    return acc, f1

In [196]:
assays = ASSAY_ORDER + ["CTCF", "non-core"]

# Minimum prediction-score thresholds. Kept as strings because they are stored
# as-is in the result tuples; converted with float() for the comparison.
min_pred_thresholds = ["0.0", "0.6", "0.9"]

# {task name: {assay or "avg-*" label: [(min_pred, acc, f1, nb_samples), ...]}}
all_acc_per_assay = {}
for name, df in pred_dfs_dict.items():
    if "Max pred" not in df.columns:
        raise ValueError(f"Column 'Max pred' not found in {name}.")

    # {assay: [(min_pred, acc, f1, nb_samples), ...], ...}
    acc_per_assay: Dict[str, List[Tuple[str, float, float, int]]] = {}
    present_assays = set(df[ASSAY].unique())
    for label in assays:
        # Assays absent from this task keep an empty list.
        acc_per_assay[label] = []
        if label not in present_assays:
            continue
        assay_df = df[df[ASSAY] == label]
        for min_pred in min_pred_thresholds:
            sub_df = assay_df[assay_df["Max pred"] > float(min_pred)]
            acc, f1 = compute_metrics(sub_df)
            acc_per_assay[label].append((min_pred, acc, f1, len(sub_df)))

    # Metrics over pooled samples (all / in ASSAY_ORDER / not in ASSAY_ORDER).
    # NOTE(review): "core" means ASSAY_ORDER here, whereas the plotting cells use
    # CORE_ASSAYS (ASSAY_ORDER[0:7]) - confirm this difference is intended.
    for label in ["avg-all", "avg-core", "avg-non-core"]:
        acc_per_assay[label] = []

    for min_pred in min_pred_thresholds:
        sub_df = df[df["Max pred"] > float(min_pred)]
        acc, f1 = compute_metrics(sub_df)
        acc_per_assay["avg-all"].append((min_pred, acc, f1, len(sub_df)))

        # nb_samples must be the size of the subset the metrics were computed on,
        # not the size of the whole thresholded table.
        core_df = sub_df[sub_df[ASSAY].isin(ASSAY_ORDER)]
        acc, f1 = compute_metrics(core_df)
        acc_per_assay["avg-core"].append((min_pred, acc, f1, len(core_df)))

        non_core_df = sub_df[~sub_df[ASSAY].isin(ASSAY_ORDER)]
        acc, f1 = compute_metrics(non_core_df)
        acc_per_assay["avg-non-core"].append((min_pred, acc, f1, len(non_core_df)))

    all_acc_per_assay[name] = acc_per_assay

In [197]:
# Flatten the nested per-task / per-assay results into one long-format table.
# One row per (task, assay, threshold):
# [task_name, assay, min_predScore, acc, f1-score, nb_samples]
rows = [
    [task, assay, min_pred, acc, f1, nb_samples]
    for task, per_assay in all_acc_per_assay.items()
    for assay, values in per_assay.items()
    for min_pred, acc, f1, nb_samples in values
]
column_names = ["task_name", ASSAY, "min_predScore", "acc", "f1-score", "nb_samples"]
df_acc_per_assay = pd.DataFrame(rows, columns=column_names)

In [198]:
# df_acc_per_assay.to_csv(
#     base_fig_dir
#     / "encode_predictions"
#     / "sex_cancer_life-stage_acc_per_assay_NO_EpiAtlas.tsv",
#     sep="\t",
#     index=False,
# )

In [199]:
# Marker colour used for each minimum prediction-score threshold.
min_predScore_color_map = {
    "0.0": "blue",
    "0.6": "orange",
    "0.9": "red",
}

# Add the plot label of each task (task name without "harmonized_"), then order
# the rows by assay, threshold and task label.
short_task_names = df_acc_per_assay["task_name"].replace("harmonized_", "", regex=True)
df_acc_per_assay = df_acc_per_assay.assign(scatter_name=short_task_names).sort_values(
    [ASSAY, "min_predScore", "scatter_name"]
)

#### Multiple min_predScore

In [205]:
this_fig_dir = base_fig_dir / "encode_predictions" / "acc_per_assay"
if not this_fig_dir.exists():
    raise FileNotFoundError(f"Folder {this_fig_dir} does not exist")

# One scatter figure per assay family. x = "<assay> - <task>" categories, preceded
# by an "Average" group; y = accuracy; one colour per min_predScore threshold.
for graph_type in ["core", "non-core"]:
    graph_df = df_acc_per_assay.copy()
    if graph_type == "core":
        graph_df = graph_df[graph_df[ASSAY].isin(CORE_ASSAYS)]
        minY = 0.55
        maxY = 1.001
    elif graph_type == "non-core":
        # NOTE(review): this selection keeps every row whose assay label is not in
        # CORE_ASSAYS, which includes the "avg-all"/"avg-core"/"avg-non-core"
        # rows; they are plotted as assays and enter the mean below - confirm intended.
        graph_df = graph_df[~graph_df[ASSAY].isin(CORE_ASSAYS)]
        minY = 0
        maxY = 1
    else:
        raise ValueError(f"Invalid graph type: {graph_type}")

    unique_assays = list(graph_df[ASSAY].unique())

    # Task labels, taken from the whole graph table (not from a leftover loop
    # variable) and sorted so they match the order of the plotted categories.
    task_labels = sorted(graph_df["scatter_name"].unique())
    traces_per_assay = len(task_labels)

    # Calculate average over assays
    avg_df = (
        graph_df.groupby(["min_predScore", "scatter_name"])["acc"].mean().reset_index()
    )
    avg_df[ASSAY] = "Average"

    fig = go.Figure()

    for min_pred in ["0.0", "0.6", "0.9"]:
        df_subset = graph_df[graph_df["min_predScore"] == min_pred]
        avg_subset = avg_df[avg_df["min_predScore"] == min_pred]

        # Add average over assay trace (added first so the "Average" categories
        # come first on the x axis; hidden from the legend).
        fig.add_trace(
            go.Scatter(
                x=["Average - " + name for name in avg_subset["scatter_name"]],
                y=avg_subset["acc"],
                mode="markers",
                name=f"Avg Min Pred Score: {min_pred}",
                marker=dict(
                    color=min_predScore_color_map[min_pred],
                    size=9,
                    symbol="star",
                ),
                hoverinfo="y+x",
                showlegend=False,
            )
        )

        # Add individual assay traces; hover text = (assay, "Samples: <n>").
        hovertext = list(
            zip(
                df_subset[ASSAY], df_subset["nb_samples"].apply(lambda x: f"Samples: {x}")
            )
        )
        fig.add_trace(
            go.Scatter(
                x=df_subset[ASSAY] + " - " + df_subset["scatter_name"],
                y=df_subset["acc"],
                mode="markers",
                name=f"Min Pred Score: {min_pred}",
                marker=dict(
                    color=min_predScore_color_map[min_pred],
                    size=9,
                ),
                text=hovertext,
                hoverinfo="text+y+x",
            )
        )

    # Modify x-axis tick labels: the bold task labels are repeated once for the
    # "Average" group and once per assay.
    # Assumes the x categories appear as: Average group first, then one group per
    # assay, each with all tasks in sorted order (i.e. no assay/task pair is
    # missing) - TODO confirm for new data.
    tick_group = [f"<b>{tick}</b>" for tick in task_labels]
    ticktext = tick_group * (len(unique_assays) + 1)

    fig.update_xaxes(
        tickmode="array", ticktext=ticktext, tickvals=list(range(len(ticktext)))
    )

    # Add assay labels on top + vertical lines between assay groups
    fig.add_annotation(
        x=traces_per_assay / 2 - 0.5,
        y=1.05,
        yref="paper",
        text="Average",
        showarrow=False,
        font=dict(size=14),
    )

    # Solid line separating the "Average" group from the assay groups.
    fig.add_vline(
        x=traces_per_assay - 0.5, line_width=2, line_dash="solid", line_color="black"
    )
    fig.add_hline(y=1, line_width=1, line_color="black")

    for i, label in enumerate(unique_assays):
        # Group i + 1 on the axis (group 0 is "Average"): label centred above it,
        # dashed line on its left edge.
        fig.add_annotation(
            x=(i + 1) * traces_per_assay + traces_per_assay / 2 - 0.5,
            y=1.05,
            yref="paper",
            text=label,
            showarrow=False,
            font=dict(size=14),
        )
        fig.add_vline(
            x=(i + 1) * traces_per_assay - 0.5,
            line_width=1,
            line_dash="dash",
            line_color="black",
        )

    # titles + yaxis range
    fig.update_layout(
        title="ENCODE data - Label match per Assay and Task",
        xaxis_title="Assay - Task",
        yaxis_title="Match %",
        xaxis_tickangle=-45,
        showlegend=True,
        height=600,
        width=1200,
        yaxis=dict(tickformat=".2%", range=[minY, maxY]),
    )

    # Show/Write the plot
    # print(f"Graphing {graph_type}")
    # figname = f"encode_{graph_type}_acc_per_assay_minY{minY:.2f}"
    # fig.write_html(this_fig_dir / f"{figname}.html")
    # fig.write_image(this_fig_dir / f"{figname}.png")
    # fig.write_image(this_fig_dir / f"{figname}.svg")
    # fig.show()

#### min_predScore = 0.6

In [None]:
this_fig_dir = base_fig_dir / "encode_predictions" / "acc_per_assay"
if not this_fig_dir.exists():
    raise FileNotFoundError(f"Folder {this_fig_dir} does not exist")

# Keep only the rows of the core assays at the 0.6 prediction-score threshold.
is_selected = (df_acc_per_assay["min_predScore"] == "0.6") & df_acc_per_assay[
    ASSAY
].isin(CORE_ASSAYS)
graph_df = df_acc_per_assay[is_selected].copy()

# NOTE(review): minY is defined but the y-axis range below starts at 0.65.
minY = 0.55
maxY = 1.001


fig = go.Figure()

# One box per task (first-appearance order); every point is the accuracy of one
# assay, whose name is shown on hover.
for task, task_df in graph_df.groupby("task_name", sort=False):
    box_trace = go.Box(
        y=task_df["acc"],
        name=task,
        boxpoints="all",
        boxmean=True,
        hovertext=task_df[ASSAY],
        jitter=0.1,
    )
    fig.add_trace(box_trace)

fig.update_layout(
    # title="ENCODE data - Label match per Assay and Task",
    # xaxis_title="Assay - Task",
    # yaxis_title="Match %",
    # xaxis_tickangle=-45,
    # showlegend=True,
    # height=600,
    # width=1200,
    yaxis=dict(tickformat=".2%", range=[0.65, maxY]),
)

# Show/Write the plot
# print(f"Graphing {graph_type}")
# figname = f"encode_{graph_type}_acc_per_assay_minY{minY:.2f}"
# fig.write_html(this_fig_dir / f"{figname}.html")
# fig.write_image(this_fig_dir / f"{figname}.png")
# fig.write_image(this_fig_dir / f"{figname}.svg")
fig.show()

### Confusion matrices

In [None]:
# Write one confusion matrix per (assay family, task, prediction-score threshold).
cm_logdir = base_fig_dir / "encode_predictions" / "confusion_matrices"
for graph_type in ["core", "non-core"]:
    for name, df in pred_dfs_dict.items():
        logdir = cm_logdir / name
        logdir.mkdir(parents=True, exist_ok=True)

        if "Max pred" not in df.columns:
            raise ValueError(f"Column 'Max pred' not found in {name}.")

        if graph_type == "core":
            df = df[df[ASSAY].isin(CORE_ASSAYS)].copy()
        elif graph_type == "non-core":
            df = df[~df[ASSAY].isin(CORE_ASSAYS)].copy()

        for threshold in [0, 0.6, 0.9]:
            sub_df = df[df["Max pred"] >= threshold]

            # No sample left at this threshold: there is no matrix to build
            # (confusion_matrix rejects an empty label list).
            if sub_df.empty:
                print(
                    f"No {graph_type} samples for {name} at threshold {threshold}: "
                    "confusion matrix skipped."
                )
                continue

            true, pred = sub_df["True class"], sub_df["Predicted class"]
            # Use the union of expected and predicted labels. With the expected
            # labels only, samples predicted as a class that never occurs in
            # "True class" are left out of the matrix by confusion_matrix.
            # Sorted (by string form) for a stable row/column order.
            labels = np.array(
                sorted(set(true.unique()) | set(pred.unique()), key=str), dtype=object
            )
            cm = confusion_matrix(true, pred, labels=labels)

            writer = ConfusionMatrixWriter(labels=labels, confusion_matrix=cm)
            # NOTE(review): threshold*100 gives "0", "60.0" and "90.0" in the
            # file names; kept unchanged so existing output paths stay valid.
            writer.to_all_formats(
                logdir=logdir,
                name=f"{name}-{graph_type}-confusion_matrix-{threshold*100}",
            )
            plt.close("all")