In [None]:
"""Workbook to create supplementary prediction files destined for the paper.

Includes most data predictions used to create paper figures.
"""

# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, too-many-branches

In [None]:
%load_ext autoreload
%autoreload 2

## SETUP

In [None]:
from __future__ import annotations

import functools
import gc
import json
import logging
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set, Tuple

import numpy as np
import pandas as pd
from IPython.display import display

from epi_ml.utils.classification_merging_utils import merge_dataframes
from epi_ml.utils.notebooks.paper.paper_utilities import (
    ASSAY,
    ASSAY_MERGE_DICT,
    ASSAY_ORDER,
    CELL_TYPE,
    LIFE_STAGE,
    SEX,
    MetadataHandler,
    SplitResultsHandler,
)

In [None]:
# Assay labels of interest: the first seven entries of ASSAY_ORDER plus "rna_seq" and "wgbs".
# NOTE(review): the name `assays` is rebound to a set of class labels in the
# "Merging all predictions together" section further down.
assays = ASSAY_ORDER[0:7] + ["rna_seq", "wgbs"]

In [None]:
# Metadata column names that are not provided by paper_utilities.
DISEASE = "harmonized_sample_disease_high"
# Column derived from DISEASE when the EpiATLAS training metadata is loaded.
CANCER = "harmonized_sample_cancer_high"
BIOMAT = "harmonized_biomaterial_type"

In [None]:
# Root of the paper outputs. All other paths in this notebook derive from it.
# NOTE(review): the location is tied to the user's home directory.
base_dir = Path.home() / "Projects/epiclass/output/paper"

base_data_dir = base_dir / "data"
base_fig_dir = base_dir / "figures"
base_metadata_dir = base_data_dir / "metadata"
table_dir = base_dir / "tables"

# Alias of base_dir, passed to MetadataHandler.
paper_dir = base_dir

# Input predictions (training results) and output location for the full prediction tables.
predictions_dir = base_data_dir / "training_results" / "dfreeze_v2" / "predictions"
full_preds_table_dir = table_dir / "dfreeze_v2" / "predictions"

In [None]:
# Shared helpers: split_results_handler gathers per-split results and computes metrics,
# metadata_handler loads the metadata dataframes relative to paper_dir.
split_results_handler = SplitResultsHandler()
metadata_handler = MetadataHandler(paper_dir)

In [None]:
def prepare_df_for_save(
    df: pd.DataFrame, md5sum_to_epirr: Dict[str, str], md5sum_to_uuid: Dict[str, str]
) -> pd.DataFrame:
    """Return a copy of a predictions DataFrame laid out for CSV export.

    The input is left untouched. The returned frame has:
    - md5sum as index (taken from the "md5sum" column)
    - "Expected class" (the former "True class" column) as first column
    - an "epirr_without_version" column, looked up from the md5sum
    - a "uuid" column, looked up from the md5sum
    - rows sorted by epirr_without_version, then uuid
    """
    result = df.copy(deep=True)

    # Move the ground truth to the front under its export name.
    expected = result.pop("True class")
    result.insert(0, "Expected class", expected)

    result = result.set_index("md5sum")

    # md5sums absent from a mapping end up as NaN in the matching column.
    result["epirr_without_version"] = result.index.map(md5sum_to_epirr)
    result["uuid"] = result.index.map(md5sum_to_uuid)

    return result.sort_values(by=["epirr_without_version", "uuid"])

### Official metadata

In [None]:
# Version 2 metadata, without assay merging.
meta_df = metadata_handler.load_metadata_df(version="v2", merge_assays=False)
# Lookups keyed by the index of meta_df (presumably md5sum, as the names suggest; verify).
md5sum_to_epirr = meta_df["epirr_id_without_version"].to_dict()
md5sum_to_uuid = meta_df["uuid"].to_dict()

In [None]:
# Version 2 metadata loaded with merged assay labels; used to look up the assay of each file.
meta_df_merged_assays = metadata_handler.load_metadata_df(version="v2", merge_assays=True)
# Series.rename(mapping) relabels the *index* (the md5sums), not the values, so it
# cannot merge assay names. replace() maps the assay values themselves.
# NOTE(review): assumes ASSAY_MERGE_DICT maps unmerged assay labels to merged ones,
# so this is a no-op when the loader has already merged them.
md5sum_to_assay = meta_df_merged_assays[ASSAY].replace(ASSAY_MERGE_DICT).to_dict()

In [None]:
official_metadata_dir = base_metadata_dir / "epiatlas" / "official"

# Official sample metadata, v1.1, indexed by EpiRR id (without version).
metadata_v1_1_path = official_metadata_dir.joinpath(
    "IHEC_sample_metadata_harmonization.v1.1.extended.csv"
)
metadata_v1_1 = pd.read_csv(metadata_v1_1_path, index_col=False).set_index(
    "epirr_id_without_version"
)

# Official sample metadata, v1.4 (the "post correction" version), same indexing.
metadata_post_correction_path = official_metadata_dir.joinpath(
    "IHEC_sample_metadata_harmonization.v1.4.extended.csv"
)
metadata_post_correction = pd.read_csv(
    metadata_post_correction_path, index_col=False
).set_index("epirr_id_without_version")

### Important function: metrics per assay

In [None]:
def metrics_per_assay(
    dataframe: pd.DataFrame,
    md5sum_to_assay: Dict[str, str],
    is_assay: bool = False,
    verbose: bool = False,
) -> pd.DataFrame:
    """Compute metrics for every (split, assay) combination of a predictions table.

    Metrics are computed for all samples, for each assay over all splits, for each
    split over all assays, and for each assay within each split.

    Is made to work well with the output of prepare_df_for_save.

    Args:
        dataframe: Predictions indexed by md5sum, with the columns
            True class (or Expected class), Predicted class, pred vector columns
            and split. Optional uuid, epirr and epirr_without_version columns are
            ignored. The input is not modified.
        md5sum_to_assay: md5sum -> assay, with assay expected to be one of 9
            (wgbs/rna merged). Every md5sum of the index must be present.
        is_assay: If True, F1_macro is replaced by NaN for every per-assay subset.
        verbose: If True, print each subset description and its metrics.

    Returns:
        A dataframe with the columns split, assay, Accuracy, F1_macro, AUC_micro,
        AUC_macro and count, sorted by split then assay. Aggregates over all
        splits or all assays are labelled "all".

    Raises:
        ValueError: If an md5sum has no assay in md5sum_to_assay, or if a metrics
            result lacks one of the expected metrics.
    """
    metric_names = ["Accuracy", "F1_macro", "AUC_micro", "AUC_macro", "count"]

    # Identifier columns are not part of the metric computation.
    df = dataframe.copy().drop(
        columns=["uuid", "epirr", "epirr_without_version"], errors="ignore"
    )

    # The metrics function expects the ground truth under "True class".
    if "Expected class" in df.columns:
        df = df.rename(columns={"Expected class": "True class"})

    # Columns to keep for metric calculation
    desired_columns = list(df.columns)
    desired_columns.remove("split")

    df[ASSAY] = df.index.map(md5sum_to_assay)
    # `np.nan in array` is always False (NaN != NaN), so test with isna().
    # Unmapped files would otherwise be dropped silently by the groupby calls below.
    if df[ASSAY].isna().any():
        raise ValueError("NaN found in ASSAY column")

    results = {}

    def _record(subset, key, description, logging_name, blank_f1=False):
        """Compute the metrics of one subset and store them under key."""
        if verbose:
            print(description)

        metrics = split_results_handler.calculate_metrics_for_single_df(
            df=subset[desired_columns],
            logging_name=logging_name,
        )
        if blank_f1:
            # NOTE(review): presumably macro F1 is not meaningful on a single-assay
            # subset when the assay is the predicted label - confirm.
            metrics["F1_macro"] = np.nan

        results[key] = metrics
        if verbose:
            print(metrics)

    _record(df, ("all", "all"), "All splits, all assays", "all_splits-all_assays")

    # All folds, split per assay
    for assay, assay_df in df.groupby(by=ASSAY):
        _record(
            assay_df,
            ("all", assay),
            f"All splits, {assay}",
            f"all_splits-{assay}",
            blank_f1=is_assay,
        )

    # Split per fold
    for split_idx, split_df in df.groupby(by="split"):
        _record(
            split_df,
            (split_idx, "all"),
            f"Split {split_idx}, all assays",
            f"split{split_idx}-all_assays",
        )

        # Split per fold, per assay
        for assay_label, assay_df in split_df.groupby(by=ASSAY):
            _record(
                assay_df,
                (split_idx, assay_label),
                f"Split {split_idx}, {assay_label}",
                f"split{split_idx}-{assay_label}",
                blank_f1=is_assay,
            )

    values_as_rows = []
    for (split_idx, assay_label), values in results.items():
        try:
            metric_values = [values[name] for name in metric_names]
        except KeyError as err:
            raise ValueError(
                f"Missing metric {err} for {(split_idx, assay_label)}: {values}"
            ) from err
        values_as_rows.append([split_idx, assay_label] + metric_values)

    df = pd.DataFrame(values_as_rows, columns=["split", "assay"] + metric_names)
    # NOTE(review): the "split" column mixes the "all" label with the split values;
    # sorting assumes they are mutually comparable (e.g. all strings).
    return df.sort_values(["split", "assay"])

## EpiATLAS training metadata

In [None]:
# Destination of the dataset composition tables. Created up front so that the
# to_csv calls writing into it do not fail on a fresh output tree.
output_dir = table_dir / "datasets_composition"
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Training metadata: a JSON file whose "datasets" entry holds one record per file.
path_meta = (
    base_metadata_dir
    / "epiatlas"
    / "hg38_2023-epiatlas-dfreeze-pospurge-nodup_filterCtl.json"
)
with open(path_meta, "r", encoding="utf8") as f:
    records = json.load(f)["datasets"]

epiatlas_meta_df = pd.DataFrame(records)
# The raw records are no longer needed once the dataframe exists; free them.
del records
_ = gc.collect()

In [None]:
# Derive the binary cancer label from the disease label.
# Values of DISEASE that are not keys of the mapping become NaN.
epiatlas_meta_df[CANCER] = epiatlas_meta_df[DISEASE].map(
    {"Disease": "non-cancer", "Healthy/None": "non-cancer", "Cancer": "cancer"}
)

In [None]:
# Sanity check, one epirr = one biospecimen (cell type)
# Fails on the first epirr_id associated with more than one distinct cell type.
for epirr, cell_type in epiatlas_meta_df.groupby("epirr_id")[CELL_TYPE].unique().items():
    if len(cell_type) != 1:
        raise ValueError(f"Dataset with multiple cell types ({epirr}): {cell_type}")

In [None]:
# Per cell type: number of distinct biospecimens (epirr ids), experiments (uuids)
# and files (md5sums). Missing metadata values are counted under "unknown".
biospecimen_column_names = {
    "epirr_id": "Biospecimen count",
    "uuid": "uuid/experiment count",
    "md5sum": "File Count",
}

df_biospecimens = (
    epiatlas_meta_df.fillna("unknown")
    .groupby(CELL_TYPE, dropna=False)
    .agg({"epirr_id": "nunique", "uuid": "nunique", "md5sum": "nunique"})
    .rename(columns=biospecimen_column_names)
    .sort_values("Biospecimen count", ascending=False)
)

df_biospecimens.to_csv(output_dir / "EpiATLAS_biospecimens.csv")

In [None]:
# Sanity check, one uuid = one experiment

# Group by UUID and get unique assays per UUID
groupby_uuid = epiatlas_meta_df.groupby("uuid")[ASSAY].unique()

# Report only the first uuid that carries more than one assay, then stop.
for uuid, assay in groupby_uuid.items():
    if len(assay) != 1:
        print(f"uuid with multiple assays ({uuid}): {assay}")
        break

One uuid `!=` one experiment: a uuid can cover an assay together with its input, so input files need to be handled separately.

In [None]:
# Count occurrences of each assay
experiment_counter = groupby_uuid.explode().value_counts()

# Detailed composition
N_exp = experiment_counter.sum()
N_uuid = len(groupby_uuid)
# Read the input count once, defaulting to 0, so a dataset without any input file
# does not raise a KeyError and every line below uses the same value.
N_input = experiment_counter.get("input", 0)
# NOTE(review): assumes the only way a uuid holds several assays is one assay plus its input.
N_input_alone = N_uuid + N_input - N_exp

print(f"Total experiments: {N_exp}")
print(f"Total uuids (can include assay+input): {N_uuid}")
print(f"Total input: {N_input}")
print(f"Total input alone: {N_input_alone}")
print(
    f"#exp - #uuid = #input - #input alone: {N_exp}-{N_uuid} = {N_input}-{N_input_alone} = {N_exp - N_uuid}"
)

# Holds by construction of N_input_alone; kept as a guard against future edits.
assert (N_exp - N_uuid) == (N_input - N_input_alone)

# Convert to DataFrame
df_exp = experiment_counter.rename_axis("Experiment Assay").reset_index(
    name="Experiment Count"
)
df_exp = df_exp.sort_values("Experiment Count", ascending=False).set_index(
    "Experiment Assay"
)

In [None]:
# Count the number of files per assay
# (count() tallies the non-null md5sum entries of each assay group).
unique_files = (
    epiatlas_meta_df.groupby(ASSAY)["md5sum"]
    .count()
    .rename("File Count")
    .sort_values(ascending=False)
)

# Merge file counts with experiment counts
# (left join: every assay of df_exp is kept, matched on the assay label).
df_exp = df_exp.merge(
    unique_files, left_on="Experiment Assay", right_index=True, how="left"
)

# Compute the track type average count, i.e. files per experiment for each assay
df_exp["Track type average count"] = df_exp["File Count"] / df_exp["Experiment Count"]

df_exp.to_csv(output_dir / "EpiATLAS_assays.csv")

In [None]:
# Number of files (non-null md5sum entries) per track type, largest first.
track_types = (
    epiatlas_meta_df.groupby("track_type")["md5sum"]
    .count()
    .rename("File Count")
    .sort_values(ascending=False)
)
track_types.to_csv(output_dir / "EpiATLAS_track_types.csv")

In [None]:
# The composition tables are saved; release the intermediates of this section.
del groupby_uuid, df_biospecimens, experiment_counter, df_exp, unique_files, track_types
_ = gc.collect()

## Collect experiment keys for all trained classifiers

In [None]:
def extract_experiment_info(line: str) -> Tuple[str, str] | None:
    """Extract split and experiment key from a line containing checkpoint information.

    Line should have format: .../splitX/EpiLaP/[exp_key]/checkpoints/...

    Args:
        line: One line of a log file.

    Returns:
        (split, exp_key), i.e. the path components right before and right after
        the first usable "EpiLaP" component, or None when the line holds no
        "EpiLaP" component with a neighbour on both sides.
    """
    if "EpiLaP" not in line:
        return None

    parts = line.strip().split("/")
    # Only look at positions that have both a predecessor (the split) and a
    # successor (the experiment key). A line ending in ".../EpiLaP" used to raise
    # IndexError, which made the caller give up on the rest of the log file.
    for i in range(1, len(parts) - 1):
        if parts[i] == "EpiLaP":
            return (parts[i - 1], parts[i + 1])
    return None


def process_log_file(file_path: Path) -> Set[Tuple[str, str]]:
    """Collect the (split, experiment key) pairs mentioned in one log file.

    Best effort: any error while opening or reading the file is printed, and the
    pairs gathered up to that point are returned.
    """
    found: Set[Tuple[str, str]] = set()
    try:
        with open(file_path, "r", encoding="utf8") as log_file:
            for raw_line in log_file:
                info = extract_experiment_info(raw_line)
                if info:
                    found.add(info)
    except Exception as e:  # pylint: disable=broad-exception-caught
        print(f"Error processing {file_path}: {e}")

    return found


def collect_exp_keys(folder: Path) -> Dict[str, Set[Tuple[str, str]]]:
    """Gather experiment keys from the log files (.o files) found under a folder.

    Only log files located, at any depth, below a direct child of `folder` whose
    name ends with "1l_3000n" are read. The result maps the parent directory of
    each log file to the set of (split, experiment key) pairs found in it.
    Directories whose logs yield nothing are left out.
    """
    keys_by_dir = defaultdict(set)

    for log_path in folder.glob("*1l_3000n/**/*.o"):
        infos = process_log_file(log_path)
        if not infos:
            continue
        keys_by_dir[log_path.parent].update(infos)

    return keys_by_dir

In [None]:
def format_exp_key_context(
    experiments_keys_dict: Dict[str, Set[Tuple[str, str]]],
) -> pd.DataFrame:
    """Format experiment keys context for saving to a DataFrame.

    Args:
        experiments_keys_dict: Experiment folder -> set of (split, exp_key) pairs,
            as returned by collect_exp_keys. May be empty.

    Returns:
        One row per (folder, split, exp_key), with the columns release,
        feature_set_name, metadata_category, experiment_specification, split,
        exp_key, comet-url and complete_experiment_context. Path-derived columns
        are None when the folder path has too few components. An empty input
        gives an empty dataframe with the same columns.
    """
    columns = [
        "release",
        "feature_set_name",
        "metadata_category",
        "experiment_specification",
        "split",
        "exp_key",
        "comet-url",
        "complete_experiment_context",
    ]

    # Remove useless part of paths
    to_remove_path = (
        str(Path.home() / "Projects/epiclass/output/paper/data/training_results") + "/"
    )

    rows = []
    for exp_folder, exp_keys in experiments_keys_dict.items():
        # Plain-text removal: a filesystem path must never be read as a regex.
        context = str(exp_folder).replace(to_remove_path, "")

        # Split path into named parts: the first three components are named,
        # and whatever follows is the experiment specification.
        parts = context.split("/", 3)
        parts.extend([None] * (4 - len(parts)))
        release, feature_set_name, metadata_category, specification = parts

        # Remove redundant info (all MLP exp are 1 hidden layer 3000 nodes)
        if metadata_category is not None:
            metadata_category = metadata_category.replace("_1l_3000n", "")
        if specification is not None:
            specification = specification.replace("/", ",")

        for split, exp_key in exp_keys:
            rows.append(
                (
                    release,
                    feature_set_name,
                    metadata_category,
                    specification,
                    split,
                    exp_key,
                    # comet-ml experiment url
                    "https://www.comet.com/rabyj/epiclass/" + str(exp_key),
                    context,
                )
            )

    return pd.DataFrame(rows, columns=columns)

In [None]:
# Gather the experiment keys of every training run, one dataframe per
# sub-directory of each training results folder.
all_exp_keys_dfs = []
for folder in ["dfreeze_v2", "2023-01-epiatlas-freeze", "imputation"]:
    data_dir = base_data_dir / "training_results" / folder
    for subfolder in data_dir.glob("*"):
        # Only directories can hold experiment logs.
        if subfolder.is_file():
            continue
        exp_key_dict = collect_exp_keys(subfolder)
        df = format_exp_key_context(exp_key_dict)
        all_exp_keys_dfs.append(df)

exp_keys_df = pd.concat(all_exp_keys_dfs, ignore_index=True)

exp_keys_df.to_csv(table_dir / "training_experiment_keys.csv", index=False)

See `collect_all.ipynb` for download of training results from comet-ml.  
Next, merging some comet-ml info with the chosen experiments.

In [None]:
# Table of comet-ml training results (see the note above about collect_all.ipynb).
exp_info_path = (
    base_data_dir
    / "training_results"
    / "all_results_cometml_filtered_oversampling-fixed.csv"
)
exp_info = pd.read_csv(exp_info_path)

In [None]:
# Attach the comet-ml information to each collected experiment key.
# Left join: every collected key is kept, with or without a comet-ml match.
# Columns present on both sides keep the exp_keys_df version; the comet-ml
# duplicate gets the "_DROP" suffix and is removed right after.
exp_info = pd.merge(
    exp_keys_df,
    exp_info,
    how="left",
    left_on="exp_key",
    right_on="Experience key",
    suffixes=("", "_DROP"),
)
for col in exp_info.columns:
    if col.endswith("_DROP"):
        exp_info.drop(columns=col, inplace=True)

# Remove rows and columns that hold no information at all.
exp_info = exp_info.dropna(axis=0, how="all")
exp_info = exp_info.dropna(axis=1, how="all")

exp_info = exp_info.sort_values(["SLURM_JOB_ID", "split"])

exp_info.to_csv(table_dir / "detailled_training_info.csv", index=False)

## assay_epiclass + sample ontology for all 5 model types - 100kb_all_none

In [None]:
# Only show errors from here on; lower-level log messages are silenced.
root_logger = logging.getLogger()
root_logger.setLevel(logging.ERROR)

In [None]:
# Input: training results for the hg38_100kb_all_none feature set.
# Output: one predictions CSV per (category, method) pair, written to logdir.
data_dir_100kb = base_data_dir / "training_results" / "dfreeze_v2" / "hg38_100kb_all_none"
logdir = table_dir / "dfreeze_v2" / "100kb_all_none"
if not logdir.exists():
    logdir.mkdir(parents=True)

# NOTE(review): split_md5sums and all_metrics are not filled in this cell.
split_md5sums = []
all_metrics = {}
# (method, category) -> metrics dataframe, aggregated and saved in a later cell.
all_assay_metrics = {}
for category in [ASSAY, CELL_TYPE]:
    all_split_dfs = split_results_handler.gather_split_results_across_methods(
        results_dir=data_dir_100kb,
        label_category=category,
        only_NN=False,
    )

    # Sanity check, same shape, same input files for each method
    # (the "NN" results of each split serve as reference).
    for split_dict in all_split_dfs.values():
        ref_dict = split_dict["NN"]
        ref_md5sums = sorted(ref_dict.index.values.tolist())
        ref_shape = ref_dict.shape
        for method, df in split_dict.items():
            if not ref_md5sums == sorted(df.index.values.tolist()):
                raise ValueError("MD5sums do not match")
            if ref_shape != df.shape:
                raise ValueError("Shapes do not match")

    all_split_dfs_concat: Dict = split_results_handler.concatenate_split_results(all_split_dfs)  # type: ignore

    # Save to file
    for method, df in all_split_dfs_concat.items():
        print(f"Method: {method}. Category: {category}.")
        df = prepare_df_for_save(df, md5sum_to_epirr, md5sum_to_uuid)

        # The neural network results are published under the "MLP" name.
        if method == "NN":
            method = "MLP"

        # Substring test: True when the category name contains the ASSAY name.
        is_assay = ASSAY in category

        assay_metrics = metrics_per_assay(
            df, md5sum_to_assay, is_assay=is_assay, verbose=False
        )
        # Both inserts target position 0, so the final column order is
        # "Classifier type", "Metadata category", then the metrics.
        assay_metrics.insert(0, "Metadata category", category)
        assay_metrics.insert(0, "Classifier type", method)
        all_assay_metrics[(method, category)] = assay_metrics

        filename = f"10fold_predictions_{category}_{method}.csv"
        df.to_csv(logdir / filename, index=True, sep=",", float_format="%.4f")

#### Saving aggregated metrics

In [None]:
def rename_metrics_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of a metrics table with reader-friendly labels.

    The columns count, assay and split become "Validation size", "target/assay"
    and "Fold idx". Classifier abbreviations in the "Classifier type" column are
    expanded to their full names. The input dataframe is not modified.
    """
    column_labels = {
        "count": "Validation size",
        "assay": "target/assay",
        "split": "Fold idx",
    }
    classifier_labels = {
        "RF": "Random Forest",
        "LGBM": "LightGBM",
        "LR": "Logistic Regression",
        "LinearSVC": "Linear SVM",
        "NN": "Multilayer Perceptron",
        "MLP": "Multilayer Perceptron",
    }

    renamed = df.rename(column_labels, axis=1)
    # Nested mapping: the replacement is restricted to the "Classifier type" column.
    return renamed.replace({"Classifier type": classifier_labels})

In [None]:
# Stack the per (method, category) metric tables into a single table.
# With ignore_index=True the result gets a fresh integer index.
full_assay_metrics_df = pd.concat(
    all_assay_metrics.values(), keys=all_assay_metrics.keys(), ignore_index=True
)

# Rename columns/values
full_assay_metrics_df = rename_metrics_df(full_assay_metrics_df)
# Missing values are written out as the literal string "NA".
full_assay_metrics_df.fillna("NA", inplace=True)

# Save
full_assay_metrics_df.to_csv(
    table_dir / "dfreeze_v2" / "dfreeze_v2_100kb_all_none_5algo_metrics.csv",
    index=False,
)

## Other MLP results - 100kb_all_none

In [None]:
# Same input and output locations as the previous section, redefined so that
# this section can be run on its own.
data_dir_100kb = base_data_dir / "training_results" / "dfreeze_v2" / "hg38_100kb_all_none"
logdir = table_dir / "dfreeze_v2" / "100kb_all_none"
if not logdir.exists():
    logdir.mkdir(parents=True)

In [None]:
# Metadata categories for which MLP results are collected.
categories = [
    ASSAY,
    CELL_TYPE,
    "paired_end",
    "harmonized_sample_cancer_high",
    LIFE_STAGE,
    SEX,
    "harmonized_biomaterial_type",
    "project",
]

# Select 10-fold oversampling runs
# NOTE(review): exclude_names presumably filters out runs whose name contains
# one of these tokens; confirm against SplitResultsHandler.
all_split_dfs = split_results_handler.general_split_metrics(
    results_dir=data_dir_100kb,
    merge_assays=False,
    include_categories=categories,
    exclude_names=["reg", "no-mixed", "chip", "16ct", "27ct"],
    return_type="split_results",
    oversampled_only=True,
    verbose=False,
)
# One dataframe per task; used below as a mapping of task name -> predictions.
all_split_dfs_concat: Dict = split_results_handler.concatenate_split_results(all_split_dfs, concat_first_level=True)  # type: ignore

In [None]:
# for task names that don't exactly fit metadata categories
# Any task name containing "sex" maps to SEX, any containing "assay" maps to
# ASSAY, and every other task name maps to itself.
cat_mapper = {
    key: (SEX if "sex" in key else ASSAY if "assay" in key else key)
    for key in all_split_dfs_concat
}

In [None]:
# Save to file
# new_dfs keeps the export-ready dataframe of each task for the cells below.
new_dfs = {}
for category, df in all_split_dfs_concat.items():
    new_df = prepare_df_for_save(df, md5sum_to_epirr, md5sum_to_uuid)

    # we want to write down the expected class for both metadata versions,
    # since there were changes in-between
    if category in [LIFE_STAGE, SEX]:
        # Each column is inserted at position 0, so the final order is
        # "Expected class v1.1", then "Expected class v1.4".
        for version, metadata in [
            ("v1.4", metadata_post_correction),
            ("v1.1", metadata_v1_1),
        ]:
            # The official metadata is indexed by epirr id, so go md5sum -> epirr first.
            idx = new_df.index.map(md5sum_to_epirr).values
            values = metadata.loc[idx, category].to_list()  # type: ignore
            new_df.insert(loc=0, column=f"Expected class {version}", value=values)
        new_df.drop("Expected class", axis=1, inplace=True)

    # using training metadata
    else:
        cat = cat_mapper[category]
        values = epiatlas_meta_df[["md5sum", cat]].set_index("md5sum")
        values = values.loc[new_df.index, cat]
        # Overwrites the existing "Expected class" column with the training metadata label.
        new_df["Expected class"] = values

    new_dfs[category] = new_df

    print(f"Saving predictions for {category}")
    filename = f"10fold_predictions_{category}_MLP.csv"
    new_df.to_csv(logdir / filename, index=True, sep=",", float_format="%.4f")

Computing metrics for all classification tasks. We compute metrics post-correction for sex and life stage.

In [None]:
# Task name -> per (split, assay) metrics table. Replaces the dict built in the previous section.
all_assay_metrics = {}
for category, df in new_dfs.items():
    df = df.copy(deep=True)
    # Sex and life stage are scored against the v1.4 (post-correction) labels only.
    if category in [LIFE_STAGE, SEX]:
        df = df.drop("Expected class v1.1", axis=1)
        df.rename(columns={"Expected class v1.4": "Expected class"}, inplace=True)

    # Substring test on the task name.
    is_assay = "assay" in category
    assay_metrics = metrics_per_assay(df, md5sum_to_assay, is_assay)

    assay_metrics.insert(0, "Metadata category", category)
    assay_metrics.insert(0, "Classifier type", "MLP")

    all_assay_metrics[category] = assay_metrics

In [None]:
# Stack the per-task metric tables into one table with a fresh integer index.
full_assay_metrics_df = pd.concat(
    all_assay_metrics.values(), keys=all_assay_metrics.keys(), ignore_index=True
)

full_assay_metrics_df = rename_metrics_df(full_assay_metrics_df)
# Missing values are written out as the literal string "NA".
full_assay_metrics_df.fillna("NA", inplace=True)

full_assay_metrics_df.to_csv(
    table_dir / "dfreeze_v2" / "dfreeze_v2_100kb_all_none_all_MLP_metrics.csv",
    index=False,
)

Merging all predictions together

In [None]:
# Number of leading columns that come before the prediction vector once the
# frame is laid out below (presumably md5sum, split, Expected class and
# Predicted class; verify against the prediction files).
same_col_len = 4

# Task name -> dataframe ready for the horizontal merge.
final_dfs = {}
for category, df in new_dfs.items():
    new_df = df.copy(deep=True)
    new_df.drop(["uuid", "epirr_without_version"], axis=1, inplace=True)
    new_df.insert(0, "split", new_df.pop("split"))

    # reset_index raises ValueError when a column with the index name already
    # exists; in that case the index is discarded instead.
    try:
        new_df = new_df.reset_index()
    except ValueError:
        new_df = new_df.reset_index(drop=True)

    # All dataframes need to have same shape for rest of code to work
    if category in [LIFE_STAGE, SEX]:
        new_df.drop("Expected class v1.1", axis=1, inplace=True)
        new_df.rename(columns={"Expected class v1.4": "Expected class"}, inplace=True)

    new_df.insert(loc=0, column="md5sum", value=new_df.pop("md5sum"))

    # this only works if all the columns to the right correspond to pred vector
    pred_cols = new_df.columns[same_col_len:].tolist()
    new_df.insert(
        loc=same_col_len, column="Max pred", value=new_df[pred_cols].max(axis=1)
    )

    # Tag every leading column except md5sum (plus "Max pred") with the task name,
    # so they stay distinct after the merge.
    old_names = new_df.columns[1 : same_col_len + 1]
    new_names = [f"{old_name} ({category})" for old_name in old_names]
    new_df.rename(columns=dict(zip(old_names, new_names)), inplace=True)

    final_dfs[category] = new_df

For the full prediction values to merge properly, the assay classifiers need their output columns renamed, since they share output class names.

In [None]:
# Class labels of the 11-class assay task, taken from its expected classes.
# NOTE(review): this rebinds `assays`, defined as a list near the top of the notebook.
category = f"{ASSAY}_11c"
assays = set(final_dfs[category][f"Expected class ({category})"].unique())

# Suffix the prediction vector columns of both assay classifiers with the task
# suffix, so the 7-class and 11-class outputs do not collide.
for suffix in ["_7c", "_11c"]:
    df = final_dfs[f"{ASSAY}{suffix}"]

    renamer = {assay: f"{assay}{suffix}" for assay in assays}

    df = df.rename(columns=renamer)
    final_dfs[f"{ASSAY}{suffix}"] = df

Now we can merge every df together, horizontally, with the md5sums.

In [None]:
# Fold all task dataframes into one by merging them pairwise, left to right.
final_df = functools.reduce(merge_dataframes, final_dfs.values())

# Add the identifiers back, looked up from the md5sum.
final_df["epirr"] = final_df.loc[:, "md5sum"].map(md5sum_to_epirr)  # type: ignore
final_df["uuid"] = final_df.loc[:, "md5sum"].map(md5sum_to_uuid)  # type: ignore

In [None]:
# Write the merged predictions: 4 decimals for floats, missing values as "NA".
final_df.to_csv(
    logdir / "all_10fold_predictions_MLP.csv",
    index=False,
    sep=",",
    float_format="%.4f",
    na_rep="NA",
)

### Class imbalance

In [None]:
# One row per (task, class): class index in sorted label order, label, count and share.
class_imbalance = []
for category, df in all_split_dfs_concat.items():
    # Use the first expected-class column that exists. The for/else makes a missing
    # column an explicit error instead of silently reusing the class counts of the
    # previous task (or hitting an undefined name on the first one).
    for col in ["Expected class", "True class", "Expected class v1.4"]:
        if col in df.columns:
            class_counts = df[col].value_counts()
            break
    else:
        raise ValueError(f"No expected class column found for task '{category}'")

    N = class_counts.sum()
    for i, (class_label, count) in enumerate(sorted(class_counts.items())):
        class_imbalance.append((category, i, class_label, count, count / N * 100))

class_imbalance_df = pd.DataFrame(
    class_imbalance,
    columns=[
        "Classification task",
        "Class index",
        "Class label",
        "Count",
        "Fraction (%)",
    ],
)

class_imbalance_df.to_csv(
    table_dir / "dfreeze_v2_100kb_all_none_MLP_class_imbalance.csv", index=False
)

## Results for other feature sets (MLP)

In [None]:
def verify_splits_identity(
    all_results: Dict[str, Dict[str, Dict[str, pd.DataFrame]]],
    task_names: List[str],
    verbose: bool | None = None,
    reference_feature_set: str = "hg38_100kb_all_none",
) -> None:
    """Verify that the splits are identical between feature sets for each task.

    For every task, each feature set is compared split by split against the
    reference feature set: the DataFrame shapes and the sorted index values
    (md5sums) must match. Problems are reported with printed "WARNING: ..."
    lines; nothing is returned.

    all_results: {feature_set: {task_name: {split_name: results_dataframe}}}
    task_names: list of task names to verify
    verbose: print additional information
    reference_feature_set: feature set whose splits are used as the baseline

    Raises:
        KeyError: if the reference feature set has no results for a requested task.
    """
    # Sanity check : MD5sums and shapes should match between reference and other feature sets, for each split
    for task_name in task_names:
        if verbose:
            print(f"Verifying task '{task_name}'")
        # Use the splits of the reference feature set as the baseline for comparison
        try:
            reference_splits = all_results[reference_feature_set][task_name]
        except KeyError as err:
            raise KeyError(
                f"No results for task '{task_name}' in reference feature set '{reference_feature_set}'"
            ) from err

        # Create reference MD5sums and shapes for each split in the reference feature set
        reference_md5sums = {
            split_name: sorted(df.index.tolist())
            for split_name, df in reference_splits.items()
        }
        reference_shapes = {
            split_name: df.shape for split_name, df in reference_splits.items()
        }

        # Iterate over each feature set and compare its splits against the reference
        for feature_set_name, tasks_dict in all_results.items():
            if feature_set_name == reference_feature_set:
                # Comparing the reference with itself carries no information.
                continue
            if verbose:
                print(
                    f"Verifying feature set '{feature_set_name}' against reference feature set '{reference_feature_set}'"
                )
            if task_name not in tasks_dict:
                # Report instead of crashing so the remaining feature sets still get checked.
                print(
                    f"WARNING: Task '{task_name}' missing from feature set '{feature_set_name}'"
                )
                continue

            splits = tasks_dict[task_name]

            # Splits present on only one side can never be "identical": report them explicitly.
            unmatched_splits = sorted(set(reference_splits) ^ set(splits))
            if unmatched_splits:
                print(
                    f"WARNING: Split names mismatch in task '{task_name}', splits {unmatched_splits} are not shared "
                    f"between reference feature set '{reference_feature_set}' and feature set '{feature_set_name}'",
                )

            for split_name, df in splits.items():
                if split_name not in reference_splits:
                    continue  # already reported above
                if reference_shapes[split_name] != df.shape:
                    print(
                        f"WARNING: Shape mismatch in task '{task_name}', split '{split_name}', "
                        f"between reference feature set '{reference_feature_set}' and feature set '{feature_set_name}'",
                    )
                if reference_md5sums[split_name] != sorted(df.index.tolist()):
                    print(
                        f"WARNING: MD5sums mismatch in task '{task_name}', split '{split_name}', "
                        f"between reference feature set '{reference_feature_set}' and feature set '{feature_set_name}'",
                    )

In [None]:
# Metadata categories (classification tasks) to collect results for.
categories = [ASSAY, CELL_TYPE]
# Feature sets to include; names encode the genome build, region type/resolution
# and, where present, the number of regions.
include_sets = [
    "hg38_10mb_all_none_1mb_coord",
    "hg38_100kb_random_n316_none",
    "hg38_1mb_all_none",
    "hg38_100kb_random_n3044_none",
    "hg38_100kb_all_none",
    "hg38_gene_regions_100kb_coord_n19864",
    "hg38_10kb_random_n30321_none",
    "hg38_regulatory_regions_n30321",
    "hg38_1kb_random_n30321_none",
    "hg38_cpg_topvar_200bp_10kb_coord_n30k",
    "hg38_10kb_all_none",
    "hg38_regulatory_regions_n303114",
    "hg38_1kb_random_n303114_none",
    "hg38_cpg_topvar_200bp_10kb_coord_n300k",
]
# Passed as `exclude_names` below — presumably result folders whose name contains
# one of these substrings are skipped; verify against SplitResultsHandler.
exclude_names = ["7c", "chip-seq-only", "27ct", "16ct"]

# Select 10-fold oversampling runs
# expected result shape: {feature_set: {task_name: {split_name: results_dataframe}}}
# NOTE: `data_dir_100kb` is defined in an earlier cell; its parent folder is searched.
all_results: Dict[
    str, Dict[str, Dict[str, pd.DataFrame]]
] = split_results_handler.obtain_all_feature_set_data(
    return_type="split_results",
    parent_folder=data_dir_100kb.parent,
    merge_assays=False,
    include_categories=categories,
    include_sets=include_sets,
    exclude_names=exclude_names,
    verbose=False,
)  # type: ignore

In [None]:
# Re-key the 11-class assay results under the plain assay name, wherever they exist.
assay_11c_key = f"{ASSAY}_11c"
for feature_set in all_results.values():
    if assay_11c_key in feature_set:
        feature_set[ASSAY] = feature_set.pop(assay_11c_key)

In [None]:
# Check that every feature set was evaluated on the same splits for each task;
# mismatches are printed as warnings.
verify_splits_identity(all_results, categories)

In [None]:
# For each feature set and task: concatenate the split results, save the
# predictions table, and keep the per-assay metrics for the summary table.
logdir = table_dir / "dfreeze_v2" / "other_feature_sets"
logdir.mkdir(parents=True, exist_ok=True)

all_assay_metrics = {}
for feature_set_name, tasks_dict in all_results.items():
    all_split_dfs_concat: Dict = split_results_handler.concatenate_split_results(
        tasks_dict, concat_first_level=True
    )  # type: ignore
    for task_name, df in all_split_dfs_concat.items():
        is_assay = "assay" in task_name
        print(f"Task: {task_name}. Feature set: {feature_set_name}. Assay: {is_assay}")

        df = prepare_df_for_save(df, md5sum_to_epirr, md5sum_to_uuid)

        metrics = metrics_per_assay(df, md5sum_to_assay, is_assay)
        # Identifying columns go first, in this order.
        leading_columns = (
            ("Classifier type", "MLP"),
            ("Metadata category", task_name),
            ("Feature set", feature_set_name),
        )
        for position, (column_name, column_value) in enumerate(leading_columns):
            metrics.insert(position, column_name, column_value)

        all_assay_metrics[(feature_set_name, task_name)] = metrics

        filename = f"{feature_set_name}_10fold_predictions_{task_name}.csv"
        df.to_csv(logdir / filename, index=True, sep=",", float_format="%.4f")

In [None]:
# Stack the per-(feature set, task) metrics into one table, apply the shared
# column renaming, and mark missing values as "NA".
full_assay_metrics_df = rename_metrics_df(
    pd.concat(
        all_assay_metrics.values(), keys=all_assay_metrics.keys(), ignore_index=True
    )
)
full_assay_metrics_df = full_assay_metrics_df.fillna("NA")

In [None]:
# Save the metrics of all feature sets as a single CSV (no index column).
full_assay_metrics_df.to_csv(
    table_dir / "dfreeze_v2" / "dfreeze_v2_100kb_all_none_all_feature_sets_metrics.csv",
    index=False,
)

## Winsorized files and/or blacklist zeroed

Winsorization was applied after blacklisted regions were set to zero; we did not test a winsorized-only version.

In [None]:
# Tasks and input variants compared in this section: the reference 100kb input,
# the blacklist-zeroed input, and the blacklist-zeroed + winsorized input.
categories = [ASSAY, CELL_TYPE, SEX, BIOMAT]
include_sets = [
    "hg38_100kb_all_none",
    "hg38_100kb_all_none_0blklst",
    "hg38_100kb_all_none_0blklst_winsorized",
]

results_folder = base_data_dir / "training_results" / "2023-01-epiatlas-freeze"
if not results_folder.exists():
    raise FileNotFoundError(f"Folder '{results_folder}' not found")

# Create the output folder together with any missing parents, as done for the
# other output folders of this notebook.
logdir = table_dir / "2023-01-epiatlas-freeze"
logdir.mkdir(parents=True, exist_ok=True)

In [None]:
# Collect the split results of the three input variants.
# NOTE(review): `oversampled_only=False` is passed here, so this presumably does not
# restrict the selection to oversampling runs — confirm against SplitResultsHandler.
# expected result shape: {feature_set: {task_name: {split_name: results_dataframe}}}
all_results: Dict[
    str, Dict[str, Dict[str, pd.DataFrame]]
] = split_results_handler.obtain_all_feature_set_data(
    return_type="split_results",
    parent_folder=results_folder,
    merge_assays=False,
    include_categories=categories,
    include_sets=include_sets,
    oversampled_only=False,
    verbose=False,
)  # type: ignore

# Show which feature sets were actually found.
display(all_results.keys())

In [None]:
# Verify every task found for the reference input against the other input variants.
tasks_collected = list(all_results["hg38_100kb_all_none"])
verify_splits_identity(all_results, tasks_collected, verbose=True)

In [None]:
# Save the concatenated predictions of each (input variant, task) pair and keep
# the per-assay metrics for the summary table.
all_assay_metrics = {}
for feature_set_name, tasks_dict in all_results.items():
    concatenated_dfs = split_results_handler.concatenate_split_results(
        tasks_dict, concat_first_level=True
    )
    for task_name, concatenated_df in concatenated_dfs.items():
        concatenated_df = prepare_df_for_save(concatenated_df, md5sum_to_epirr, md5sum_to_uuid)  # type: ignore

        is_assay = "assay" in task_name  # type: ignore
        metrics = metrics_per_assay(concatenated_df, md5sum_to_assay, is_assay=is_assay)
        # Identifying columns go first, in this order.
        leading_columns = (
            ("Classifier type", "MLP"),
            ("Metadata category", task_name),
            ("Input type", feature_set_name),
        )
        for position, (column_name, column_value) in enumerate(leading_columns):
            metrics.insert(position, column_name, column_value)  # type: ignore

        all_assay_metrics[(feature_set_name, task_name)] = metrics

        filename = f"{feature_set_name}_10fold_predictions_{task_name}.csv"
        print(f"Saving {filename}")
        concatenated_df.to_csv(
            logdir / filename, index=True, sep=",", float_format="%.4f"
        )

In [None]:
# Stack the per-(input variant, task) metrics into one table, apply the shared
# column renaming, and mark missing values as "NA".
full_assay_metrics_df = rename_metrics_df(
    pd.concat(
        all_assay_metrics.values(), keys=all_assay_metrics.keys(), ignore_index=True
    )
)
full_assay_metrics_df = full_assay_metrics_df.fillna("NA")

In [None]:
# Save the metrics of the three input variants as a single CSV (no index column).
full_assay_metrics_df.to_csv(
    table_dir
    / "2023-01-epiatlas-freeze"
    / "2023-01-epiatlas-freeze_100kb_all_none_2variants_MLP_metrics.csv",
    index=False,
)

### Evaluate input dataset discrepancy in assay_epiclass

In [None]:
# Load the "v2" metadata table without merging assays; the handler is created again
# here so this section does not depend on the setup cell's instance.
metadata_handler = MetadataHandler(paper_dir)
metadata_df = metadata_handler.load_metadata_df("v2", merge_assays=False)

In [None]:
# Per input variant: which files went into the assay task, and how they are
# distributed over assays according to the metadata.
values_counts = {}
for feature_set_name, tasks_dict in all_results.items():
    concatenated_dfs = split_results_handler.concatenate_split_results(
        tasks_dict, concat_first_level=True
    )
    md5sums = concatenated_dfs[ASSAY].index.tolist()
    print(f"{feature_set_name}: {len(md5sums)}")

    is_in_feature_set = metadata_df.index.isin(md5sums)
    metadata_subset = metadata_df.loc[is_in_feature_set]
    values_counts[feature_set_name] = metadata_subset[ASSAY].value_counts()

In [None]:
# Per-assay difference in file counts between the blacklist-zeroed input and the
# reference input. `fill_value=0` makes an assay that is absent from one of the two
# sets show its real count difference instead of NaN.
display(
    values_counts["hg38_100kb_all_none_0blklst"]
    .sub(values_counts["hg38_100kb_all_none"], fill_value=0)
    .astype(int)
)

## ENCODE predictions

See:
- src/python/epi_ml/utils/notebooks/paper/encode_metadata_creation.ipynb
- src/python/epi_ml/utils/notebooks/paper/encode_pred_analysis.ipynb  

In [None]:
# Output folder for the dataset composition tables; created up front so the
# `to_csv` calls that write into it cannot fail on a missing directory.
output_dir = table_dir / "datasets_composition"
output_dir.mkdir(parents=True, exist_ok=True)
metadata_dir = base_metadata_dir / "encode"

# ENCODE predictions already joined with their metadata.
encode_preds_path = (
    full_preds_table_dir / "complete_encode_predictions_augmented_2025-02_metadata.csv.gz"
)

In [None]:
# Load the gzip-compressed ENCODE predictions table and preview it.
encode_preds_df = pd.read_csv(
    encode_preds_path, encoding="utf8", low_memory=False, compression="gzip"
)
# Rich display of the preview, as done for the other tables loaded in this notebook.
display(encode_preds_df.head())
print(encode_preds_df.shape)

In [None]:
# Accession columns that get counted (and renamed to "... count") in the tables below.
encode_id_cols = ["EXPERIMENT_accession", "BIOSAMPLE_accession", "FILE_accession"]

In [None]:
# Dataset size: number of rows (files), distinct experiments and distinct biosamples.
N_files = len(encode_preds_df)
N_exp, N_biospecimen = (
    encode_preds_df[accession_col].nunique()
    for accession_col in ("EXPERIMENT_accession", "BIOSAMPLE_accession")
)
print(N_files, N_exp, N_biospecimen)

In [None]:
# Distinct experiments and files per assay (missing assay values form their own group).
# `rename` ignores labels that are not columns, so all accession labels can be
# passed in one mapping.
assays_df = (
    encode_preds_df.groupby(ASSAY, dropna=False)
    .agg({"EXPERIMENT_accession": "nunique", "FILE_accession": "nunique"})
    .rename(
        columns={label: label.replace("_accession", " count") for label in encode_id_cols}
    )
)

assays_df["file_per_experiment"] = assays_df["FILE count"] / assays_df["EXPERIMENT count"]

assays_df.to_csv(output_dir / "ENCODE_assays.csv")

In [None]:
# Distinct biosamples, experiments and files per biosample type / term name,
# with missing values labelled "unknown".
df_biospecimens = encode_preds_df.fillna("unknown").copy(deep=True)

groupby = (
    df_biospecimens.groupby(
        ["BIOSAMPLE_TYPE_name", "BIOSAMPLE_TYPE_term_name"], dropna=False
    )
    .agg({accession_col: "nunique" for accession_col in (
        "BIOSAMPLE_accession",
        "EXPERIMENT_accession",
        "FILE_accession",
    )})
    .rename(
        columns={label: label.replace("_accession", " count") for label in encode_id_cols}
    )
)

groupby.to_csv(output_dir / "ENCODE_biospecimens.csv")

## ChIP-Atlas predictions

In [None]:
# Folder holding the ChIP-Atlas metadata files read in this section.
metadata_dir = base_metadata_dir / "chip_atlas"

Using
- downloaded metadata (see `src/python/epi_ml/utils/notebooks/paper/c-a_metadata.ipynb`)
- 4DB target and ID metadata created by Gabriella
- Extracted metadata by Frede
- Pre-joined predictions from multiple classifiers

In [None]:
# Load the pre-joined ChIP-Atlas predictions; the metadata tables are merged onto
# this frame in the following cells.
pred_path = predictions_dir / "C-A" / "ChIP-Atlas_merged_predictions_20240606.csv"
full_df = pd.read_csv(pred_path, sep=",", low_memory=False)
display(full_df.head())
print(full_df.shape)

In [None]:
# Extracted metadata (tab-separated), keyed by "ID".
extracted_meta_path = metadata_dir / "CA.full_info_metadata.freeze1.tsv"
meta_df = pd.read_csv(extracted_meta_path, sep="\t", low_memory=False)
display(meta_df.head())
print(meta_df.shape)

# Left join: every prediction row is kept, matched on Experimental-id == ID.
# NOTE: running this cell twice merges the metadata a second time onto `full_df`.
full_df = full_df.merge(meta_df, how="left", left_on="Experimental-id", right_on="ID")

In [None]:
# Downloaded ChIP-Atlas metadata (tab-separated), keyed by "Experimental-id".
downloaded_meta_path = metadata_dir / "CA_metadata_downloaded_20250306.tsv"
meta_df = pd.read_csv(downloaded_meta_path, sep="\t", low_memory=False)
display(meta_df.head())
print(meta_df.shape)

# Left join: every prediction row is kept.
# NOTE: running this cell twice merges the metadata a second time onto `full_df`.
full_df = full_df.merge(meta_df, how="left", on="Experimental-id")

In [None]:
# Target / database metadata (tab-separated), keyed by "Experimental-id".
target_meta_path = metadata_dir / "CA_minimal_4DB_metadata_20240606_mod.tsv"
meta_df = pd.read_csv(target_meta_path, sep="\t", low_memory=False)
display(meta_df.head())
print(meta_df.shape, "\n")

# Flag rows whose "is_EpiAtlas_EpiRR" value, read as text, is anything other than "0".
# NOTE(review): a missing value becomes the text "nan" and is therefore flagged
# True — confirm this is intended.
meta_df["in_epiatlas"] = meta_df["is_EpiAtlas_EpiRR"].astype(str).ne("0")
print(meta_df["in_epiatlas"].value_counts(dropna=False))

# Left join: every prediction row is kept.
full_df = full_df.merge(meta_df, how="left", on="Experimental-id")
print(full_df.shape)

In [None]:
# Save predictions + all merged metadata as a gzip-compressed CSV (no index column).
full_df.to_csv(
    full_preds_table_dir
    / "ChIP-Atlas_predictions_20240606_merge_metadata_freeze1.csv.gz",
    compression="gzip",
    index=False,
)

## ChIP-Seq_imputed_with_RNA-Seq_only predictions

In [None]:
# Output folder for the imputed-data predictions; created up front so the final
# `to_csv` of this section cannot fail on a missing directory.
output_dir = table_dir / "dfreeze_v2" / "epiatlas_imputed"
output_dir.mkdir(parents=True, exist_ok=True)

Predictions are from the epiclass_11c complete-training (with oversampling) MLP classifier.  
Training details at 0f8e5eb996114868a17057bebe64f87c (comet-ml id)

In [None]:
# Load the predictions made on the imputed ChIP-Seq files.
pred_folder = base_data_dir / "training_results" / "predictions" / "epiatlas_imputed"
pred_file = "complete_no_valid_oversample_test_prediction_100kb_all_none_ChIP-Seq_imputed_with_RNA-Seq_only.csv"
pred_df = pd.read_csv(pred_folder / pred_file)
print(pred_df.shape)

In [None]:
# The unnamed first CSV column holds the file names.
pred_df.rename(columns={"Unnamed: 0": "filename"}, inplace=True)

# filename of format 'impute_[ihec-id]_[expected-class]_[resolution]_[filter_in]_[filter_out].csv'
# The expected class is taken as the third "_"-separated token, lowercased.
# NOTE(review): this assumes the ihec-id itself contains no underscore — confirm.
pred_df["True class"] = pred_df["filename"].str.split("_", expand=True)[2].str.lower()
# The value is written under "True class" first and renamed afterwards: if the file
# already has a "True class" column, it is overwritten in place and keeps its
# position — presumably intended, verify against the prediction file layout.
pred_df.rename(columns={"True class": "Expected class"}, inplace=True)

In [None]:
# Put a boolean "Same?" column (expected == predicted) right after "Predicted class".
idx_pred_col = np.flatnonzero(pred_df.columns == "Predicted class")[0]
is_correct = pred_df["Expected class"] == pred_df["Predicted class"]
pred_df.insert(loc=int(idx_pred_col + 1), column="Same?", value=is_correct)

In [None]:
# Accuracy = fraction of rows where the prediction matches the expected class.
print(f"Accuracy: {pred_df['Same?'].mean():.2%}")

In [None]:
# The first `non_pred_vector_cols` columns are not part of the prediction vector;
# the next `nb_classes` columns are. "Max pred" (row-wise maximum of that vector)
# is inserted right in front of it.
non_pred_vector_cols = 4
nb_classes = 11
pred_vector_end = non_pred_vector_cols + nb_classes
max_pred = pred_df.iloc[:, non_pred_vector_cols:pred_vector_end].max(axis=1)  # type: ignore
pred_df.insert(loc=non_pred_vector_cols, column="Max pred", value=max_pred)

In [None]:
# Save the annotated predictions (no index column).
pred_df.to_csv(output_dir / "epiatlas_imputed_w_rna_only_predictions.csv", index=False)

## recount3

In [None]:
# Fresh handler instance for this section (one was already created in the setup cells).
split_results_handler = SplitResultsHandler()

In [None]:
# Folder holding the recount3 predictions made from the 100kb input.
recount3_folder = (
    base_data_dir
    / "training_results"
    / "predictions"
    / "recount3"
    / "hg38_100kb_all_none"
)
if not recount3_folder.exists():
    # Name the missing folder, as the other existence checks of this notebook do.
    raise FileNotFoundError(f"Folder '{recount3_folder}' not found")

recount3 predictions were initially made in chunks because there are too many files.  
The code below handles both pre-concatenated and still-split prediction files; set `to_concat` accordingly.

In [None]:
# Set to True when the per-chunk prediction files still have to be concatenated,
# False when one pre-concatenated file per category is available.
to_concat = False
nb_cat = 5  # expected number of categories
categories = [ASSAY, SEX, LIFE_STAGE, CANCER, BIOMAT]

if to_concat:
    nb_split = 7  # expected number of chunk files per category

    split_pred_files = {}
    for cat in categories:
        # Sorted so chunks are always concatenated in the same order; the order
        # returned by `rglob` depends on the filesystem.
        pred_files = sorted(
            recount3_folder.rglob(f"{cat}*/**/recount3/complete_*split*.csv")
        )
        if len(pred_files) != nb_split:
            raise FileNotFoundError(
                f"Expected {nb_split} files for {cat}, found {len(pred_files)}"
            )
        split_pred_files[cat] = pred_files

    if len(split_pred_files) != nb_cat:
        raise FileNotFoundError(
            f"Expected {nb_cat} categories, found {len(split_pred_files)}"
        )

    pred_dfs = {}
    for cat, pred_files in split_pred_files.items():
        dfs = []
        for pred_file in pred_files:
            df = pd.read_csv(pred_file, low_memory=False)
            dfs.append(df)
        concat_df = pd.concat(dfs, ignore_index=True)
        pred_dfs[cat] = concat_df

else:
    pred_dfs = {}
    for cat in categories:
        # Exactly one prediction file is expected per category; leftover chunk
        # files also match this pattern and make the check fail.
        pred_files = list(recount3_folder.rglob(f"{cat}*/**/recount3/complete_*.csv"))
        if len(pred_files) != 1:
            raise FileNotFoundError(f"Expected 1 file for {cat}, found {len(pred_files)}")
        pred_df = pd.read_csv(pred_files[0], low_memory=False)
        pred_dfs[cat] = pred_df

In [None]:
# Normalise each category's prediction table: drop the unused "True class" column
# if there is one, name the file column, add "Max pred", and extract the two ids
# embedded in the file name.
for cat, pred_df in list(pred_dfs.items()):
    pred_df = pred_df.drop(columns="True class", errors="ignore").rename(
        columns={"Unnamed: 0": "filename"}
    )

    # Add max pred + move it to front
    pred_df = split_results_handler.add_max_pred(pred_df, target_label="Predicted class")
    pred_df.insert(2, "Max pred", pred_df.pop("Max pred"))
    has_prediction = pred_df["Max pred"] >= 0  # in case of empty rows
    pred_df = pred_df[has_prediction]

    # Ids are the "_"-separated parts of the third "."-separated token of the file name
    filename_token = pred_df["filename"].str.split(".", expand=True)[2]
    id_cols = filename_token.str.split("_", expand=True)

    pred_df.insert(1, "id1", id_cols[0])
    pred_df.insert(2, "id2", id_cols[1])

    pred_dfs[cat] = pred_df

In [None]:
# Exploratory checks on the extracted ids, kept for reference:
# display(pred_dfs[ASSAY]["filename"].str.split(".", expand=True)[2].str.split("_",expand=True).head())
# display(pred_dfs[ASSAY]["id1"].nunique(), pred_dfs[ASSAY]["id2"].nunique())
# display(pred_dfs[ASSAY]["id2"].str.slice(0,3).value_counts())
# "id2" is used as the join key ("ID") later on, so it must identify each row uniquely.
assert pred_dfs[ASSAY]["id2"].nunique() == pred_dfs[ASSAY].shape[0]

In [None]:
# Load the recount3 metadata (tab-separated); missing values are labelled "unknown".
meta_filename = "recount.full_info_metadata.freeze1.tsv"
metadata_file = base_metadata_dir / "recount3" / meta_filename
recount_metadata_df = pd.read_csv(metadata_file, sep="\t", low_memory=False).fillna(
    "unknown"
)

In [None]:
# Rename the "expected_*" metadata columns to the category names used by the classifiers.
expected_to_category = {
    "expected_assay": ASSAY,
    "expected_lifestage": LIFE_STAGE,
    "expected_sex": SEX,
    "expected_cancer": CANCER,
    "expected_biomat": BIOMAT,
}
recount_metadata_df = recount_metadata_df.rename(columns=expected_to_category)

In [None]:
# Dataset composition: number of metadata rows per extracted biospecimen term
# (missing values counted too).
recount_metadata_df["extracted_terms_biospecimen"].value_counts(dropna=False).to_csv(
    table_dir / "datasets_composition" / "recount3_biospecimen.csv"
)

In [None]:
# Dataset composition: number of metadata rows per expected assay (missing values counted too).
recount_metadata_df[ASSAY].value_counts(dropna=False).to_csv(
    table_dir / "datasets_composition" / "recount3_assays.csv"
)

In [None]:
def merge_all_recount3_preds(
    pred_dfs: Dict[str, pd.DataFrame], full_metadata_df: pd.DataFrame
) -> pd.DataFrame:
    """Merge the recount3 predictions of every category into one wide DataFrame.

    For each category, the expected class is looked up in the metadata (joined on
    "ID", taken from the "id2" column), and the expected class / predicted class /
    max pred columns are tagged with the category name. The per-category frames
    are then merged together and joined with the full metadata.

    pred_dfs: {category: predictions dataframe}, each with "id1" and "id2" columns
    full_metadata_df: metadata with an "ID" column and one column per category

    NOTE(review): the per-category frames are combined with
    `merge_dataframes(on="external_id")`; confirm that key is available to that helper.
    """
    same_col_len = 5
    # Tag the leading columns with the category so only the pred vector keeps generic names
    new_dfs = {}
    for cat, df in pred_dfs.items():
        df = df.copy(deep=True)
        df["ID"] = df["id2"]  # appended last, before the id columns are removed

        df = (
            df.drop(["id1", "id2"], axis=1)
            .drop(columns=["Same?"], errors="ignore")
            .merge(full_metadata_df[["ID", cat]], on="ID", how="inner")
        )
        # Move the metadata value to position 1 under the name "Expected class"
        df.insert(1, "Expected class", df.pop(cat))

        old_names = df.columns[1 : same_col_len - 1]
        df = df.rename(
            columns={old_name: f"{old_name} ({cat})" for old_name in old_names}
        )

        new_dfs[cat] = df

    df_list = [new_dfs[cat] for cat in (ASSAY, SEX, CANCER, LIFE_STAGE, BIOMAT)]

    merge_on_external_id = functools.partial(merge_dataframes, on="external_id")
    full_merged_df = functools.reduce(merge_on_external_id, df_list).reset_index(
        drop=True
    )

    # Columns already present before the metadata join win; the metadata duplicates are discarded
    full_merged_df = full_merged_df.merge(
        full_metadata_df,
        on="ID",
        how="inner",
        suffixes=("", "_DROP"),
    )
    duplicated_cols = [col for col in full_merged_df.columns if col.endswith("_DROP")]
    full_merged_df = full_merged_df.drop(columns=duplicated_cols)

    full_merged_df.insert(1, "ID", full_merged_df.pop("ID"))

    return full_merged_df

In [None]:
# This is the merging process when prediction files are all available
# final_df = merge_all_recount3_preds(pred_dfs, recount_metadata_df)
# print(f"Final df shape: {final_df.shape}")

### Manual merge

Use this section if the predictions were not merged with `merge_all_recount3_preds`.

In [None]:
# Life stage metadata already has embryo/fetal/newborn merged into perinatal
# The guard makes the cell safe to re-run: once renamed, the original column no
# longer exists and both steps are skipped.
if LIFE_STAGE in recount_metadata_df.columns:
    # Shown explicitly: a bare expression in the middle of a cell produces no output.
    display(recount_metadata_df[LIFE_STAGE].value_counts(dropna=False))
    recount_metadata_df = recount_metadata_df.rename(
        columns={LIFE_STAGE: f"{LIFE_STAGE}_merged"}
    )

In [None]:
# Load the already-merged recount3 predictions (all categories, no metadata).
preds_only_path = full_preds_table_dir / "recount3_merged_preds.csv"
preds_df = pd.read_csv(preds_only_path, low_memory=False)
display(preds_df.head())
print(preds_df.shape)

In [None]:
# Keep only the predictions that have a metadata entry (inner join on "ID").
merged_df = preds_df.merge(recount_metadata_df, on="ID", how="inner")

In [None]:
# Save predictions + metadata as a gzip-compressed CSV (no index column).
filename = "recount3_merged_preds_metadata_freeze1.csv.gz"
out_path = full_preds_table_dir / filename
merged_df.to_csv(out_path, index=False, compression="gzip")
print(f"Saved to {out_path}")