In [None]:
"""Workbook to create supplementary prediction files destined for the paper.

Includes most data predictions used to create paper figures.
"""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines

In [None]:
# Reload imported modules before each cell execution, so edits to the project
# package are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## SETUP

In [None]:
from __future__ import annotations

import functools
import gc
import json
import shutil
from collections import defaultdict
from pathlib import Path
from typing import Dict, List, Set, Tuple

import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.metrics import classification_report, confusion_matrix as sk_cm

from epi_ml.utils.classification_merging_utils import merge_dataframes
from epi_ml.utils.notebooks.paper.paper_utilities import (
    ASSAY,
    CELL_TYPE,
    LIFE_STAGE,
    SEX,
    MetadataHandler,
    SplitResultsHandler,
)

In [None]:
# Metadata column names used in this notebook, in addition to the ones imported
# from paper_utilities (ASSAY, CELL_TYPE, LIFE_STAGE, SEX).
DISEASE = "harmonized_sample_disease_high"
CANCER = "harmonized_sample_cancer_high"
BIOMAT = "harmonized_biomaterial_type"

In [None]:
# All locations are derived from a single root under the user's home directory.
base_dir = Path.home() / "Projects/epiclass/output/paper"
base_data_dir = base_dir / "data"
base_fig_dir = base_dir / "figures"
metadata_dir = base_data_dir / "metadata"
paper_dir = base_dir  # alias of base_dir
table_dir = paper_dir / "tables"  # root of every table written by this notebook
predictions_dir = base_data_dir / "training_results" / "predictions"

In [None]:
# Project helpers used throughout: one to gather/concatenate split results,
# one to load metadata (constructed from paper_dir).
split_results_handler = SplitResultsHandler()
metadata_handler = MetadataHandler(paper_dir)

In [None]:
def prepare_df_for_save(
    df: pd.DataFrame, md5sum_to_epirr: Dict[str, str], md5sum_to_uuid: Dict[str, str]
) -> pd.DataFrame:
    """Return a copy of a predictions DataFrame laid out for CSV export.

    The input frame is left untouched. In the returned frame:
    - "True class" is renamed "Expected class" and moved to the first column
    - the "md5sum" column becomes the index
    - "epirr_without_version" and "uuid" columns are appended, looked up from the
      md5sum index through the two given mappings
    - rows are sorted by "epirr_without_version", then "uuid"
    """
    prepared = df.copy(deep=True)

    # Move the label column to the front under its export name
    expected_class = prepared.pop("True class")
    prepared.insert(0, "Expected class", expected_class)

    prepared = prepared.set_index("md5sum")
    prepared["epirr_without_version"] = prepared.index.map(md5sum_to_epirr)
    prepared["uuid"] = prepared.index.map(md5sum_to_uuid)

    return prepared.sort_values(by=["epirr_without_version", "uuid"])

### Official metadata

In [None]:
# Build the md5sum -> EpiRR id / uuid lookups passed to prepare_df_for_save.
# NOTE(review): assumes the frame returned by load_metadata_df is indexed by md5sum — confirm.
meta_df = metadata_handler.load_metadata_df(version="v2", merge_assays=False)
md5sum_to_epirr = meta_df["epirr_id_without_version"].to_dict()
md5sum_to_uuid = meta_df["uuid"].to_dict()
del meta_df  # only the two lookup dicts are kept

In [None]:
official_metadata_dir = base_data_dir / "metadata" / "official"

# Official IHEC harmonized metadata, versions 1.1 and 1.2, both indexed by the
# version-less EpiRR id.
metadata_v1_1_path = (
    official_metadata_dir / "IHEC_metadata_harmonization.v1.1.extended.csv"
)
metadata_v1_1 = pd.read_csv(metadata_v1_1_path, index_col=False).set_index(
    "epirr_id_without_version"
)

metadata_v1_2_path = (
    official_metadata_dir / "IHEC_metadata_harmonization.v1.2.extended.csv"
)
metadata_v1_2 = pd.read_csv(metadata_v1_2_path, index_col=False).set_index(
    "epirr_id_without_version"
)

## EpiATLAS training metadata

In [None]:
output_dir = table_dir / "datasets_composition"
# The composition tables are written with `to_csv`, which does not create missing
# parent folders: make sure the destination exists before any table is saved.
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Training metadata: the json holds one record per file under its "datasets" key.
path_meta = metadata_dir / "hg38_2023-epiatlas-dfreeze-pospurge-nodup_filterCtl.json"
with path_meta.open("r", encoding="utf8") as f:
    records = json.load(f)["datasets"]

epiatlas_meta_df = pd.DataFrame(records)

# The raw records are no longer needed once the frame is built
del records
_ = gc.collect()

In [None]:
# Collapse the three disease labels into a binary cancer / non-cancer label.
# Series.map leaves NaN for any disease value that is not a key of the mapping.
epiatlas_meta_df[CANCER] = epiatlas_meta_df[DISEASE].map(
    {"Disease": "non-cancer", "Healthy/None": "non-cancer", "Cancer": "cancer"}
)

In [None]:
# Sanity check: each epirr must map to exactly one biospecimen (cell type)
cell_types_per_epirr = epiatlas_meta_df.groupby("epirr_id")[CELL_TYPE].unique()
for epirr, cell_type in cell_types_per_epirr.items():
    if len(cell_type) == 1:
        continue
    raise ValueError(f"Dataset with multiple cell types ({epirr}): {cell_type}")

In [None]:
# Per cell type: number of distinct biospecimens (epirr ids), uuids and files.
# Missing values are labelled "unknown" first so that they form their own group.
df_biospecimens = (
    epiatlas_meta_df.fillna("unknown")
    .groupby(CELL_TYPE, dropna=False)
    .agg({"epirr_id": "nunique", "uuid": "nunique", "md5sum": "nunique"})
    .rename(
        columns={
            "epirr_id": "Biospecimen count",
            "uuid": "uuid/experiment count",
            "md5sum": "File Count",
        }
    )
    .sort_values("Biospecimen count", ascending=False)
)

df_biospecimens.to_csv(output_dir / "EpiATLAS_biospecimens.csv")

In [None]:
# Sanity check, one uuid = one experiment

# Unique assays observed for each uuid
groupby_uuid = epiatlas_meta_df.groupby("uuid")[ASSAY].unique()

# Report only the first uuid that holds more than one assay, if any
first_multi_assay = next(
    ((uuid, assay) for uuid, assay in groupby_uuid.items() if len(assay) != 1), None
)
if first_multi_assay is not None:
    uuid, assay = first_multi_assay
    print(f"uuid with multiple assays ({uuid}): {assay}")

One uuid `!=` one experiment: a uuid can hold both an assay and its input, so input files need to be handled separately.

In [None]:
# Count occurrences of each assay (a uuid counts once per distinct assay it holds)
experiment_counter = groupby_uuid.explode().value_counts()

# Detailed composition
N_exp = experiment_counter.sum()
N_uuid = len(groupby_uuid)
# .get() everywhere: a dataset without any "input" file must not raise a KeyError
N_input = experiment_counter.get("input", 0)
# Count input-only uuids directly from the data, so that the assertion below is a
# real check. Deriving it as N_uuid + N_input - N_exp makes the assertion an
# identity that can never fail.
N_input_alone = int(groupby_uuid.apply(lambda assays: set(assays) == {"input"}).sum())

print(f"Total experiments: {N_exp}")
print(f"Total uuids (can include assay+input): {N_uuid}")
print(f"Total input: {N_input}")
print(f"Total input alone: {N_input_alone}")
print(
    f"#exp - #uuid = #input - #input alone: {N_exp}-{N_uuid} = {N_input}-{N_input_alone} = {N_exp - N_uuid}"
)

# Holds only if every uuid is one assay, one input, or one assay paired with its input
assert (N_exp - N_uuid) == (N_input - N_input_alone), (
    "Some uuids hold several assays that are not explained by a paired input"
)

# Convert to DataFrame
df_exp = experiment_counter.rename_axis("Experiment Assay").reset_index(
    name="Experiment Count"
)
df_exp = df_exp.sort_values("Experiment Count", ascending=False).set_index(
    "Experiment Assay"
)

In [None]:
# Number of files (non-null md5sum entries) per assay
files_per_assay = epiatlas_meta_df.groupby(ASSAY)["md5sum"].count()
unique_files = files_per_assay.rename("File Count").sort_values(ascending=False)

# Attach the file counts to the experiment counts
df_exp = pd.merge(
    df_exp, unique_files, left_on="Experiment Assay", right_index=True, how="left"
)

# Average number of files (track types) per experiment
df_exp["Track type average count"] = df_exp["File Count"].div(
    df_exp["Experiment Count"]
)

df_exp.to_csv(output_dir / "EpiATLAS_assays.csv")

In [None]:
# Number of files (non-null md5sum entries) per track type, largest first
files_per_track_type = epiatlas_meta_df.groupby("track_type")["md5sum"].count()
track_types = files_per_track_type.rename("File Count").sort_values(ascending=False)
track_types.to_csv(output_dir / "EpiATLAS_track_types.csv")

In [None]:
# Drop the intermediate composition objects; `_ =` keeps gc.collect()'s return
# value out of the cell output.
del groupby_uuid, df_biospecimens, experiment_counter, df_exp, unique_files, track_types
_ = gc.collect()

## Collect experiment keys for all trained classifiers

In [None]:
def extract_experiment_info(line: str) -> Tuple[str, str] | None:
    """Extract split and experiment key from a line containing checkpoint information.

    Line should have format: .../splitX/EpiLaP/[exp_key]/checkpoints/...
    """
    if "EpiLaP" not in line:
        return None

    parts = line.strip().split("/")
    for i, part in enumerate(parts):
        if part == "EpiLaP" and i > 0:
            return (parts[i - 1], parts[i + 1])
    return None


def process_log_file(file_path: Path) -> Set[Tuple[str, str]]:
    """Return every (split, experiment key) pair found in one log file.

    Best effort: if anything goes wrong while reading the file, the error is
    printed and the pairs collected so far are returned.
    """
    found: Set[Tuple[str, str]] = set()
    try:
        with open(file_path, "r", encoding="utf8") as log_handle:
            for raw_line in log_handle:
                info = extract_experiment_info(raw_line)
                if info:
                    found.add(info)
    except Exception as err:  # pylint: disable=broad-exception-caught
        print(f"Error processing {file_path}: {err}")

    return found


def collect_exp_keys(folder: Path) -> Dict[Path, Set[Tuple[str, str]]]:
    """Collect experiment keys from the log files (.o files) found under a folder.

    Only logs located, at any depth, under a direct child of `folder` whose name ends
    with "1l_3000n" are read (glob pattern "*1l_3000n/**/*.o").

    Returns:
        A mapping {log file parent folder (Path): {(split, experiment key), ...}}.
        Folders whose logs yield no key are absent from the mapping.
    """
    experiments_keys: Dict[Path, Set[Tuple[str, str]]] = defaultdict(set)

    for log_file in folder.glob("*1l_3000n/**/*.o"):
        if experiment_info := process_log_file(log_file):
            experiments_keys[log_file.parent].update(experiment_info)

    return experiments_keys

In [None]:
def format_exp_key_context(
    experiments_keys_dict: Dict[str, Set[Tuple[str, str]]]
) -> pd.DataFrame:
    """Format experiment keys context for saving to a DataFrame."""
    data = []
    for exp_folder, exp_keys in experiments_keys_dict.items():
        for split, exp_key in exp_keys:
            data.append((str(exp_folder), split, exp_key))

    df = pd.DataFrame(data, columns=["exp_folder", "split", "exp_key"])

    # comet-ml experiment url
    df["comet-url"] = "https://www.comet.com/rabyj/epiclass/" + df["exp_key"].astype(str)

    # Remove useless part of paths
    to_remove_path = (
        str(Path.home() / "Projects/epiclass/output/paper/data/training_results") + "/"
    )
    df["complete_experiment_context"] = df["exp_folder"].str.replace(to_remove_path, "")
    df.drop(columns="exp_folder", inplace=True)

    # Split path into named parts
    df[["release", "feature_set_name", "metadata_category"]] = (
        df["complete_experiment_context"].str.split("/", expand=True).loc[:, [0, 1, 2]]
    )

    df["experiment_specification"] = (
        df["complete_experiment_context"]
        .str.split("/", n=3, expand=True)[3]
        .str.replace("/", ",")
    )

    # Remove redundant info (all MLP exp are 1 hidden layer 3000 nodes)
    df["metadata_category"] = df["metadata_category"].str.replace("_1l_3000n", "")

    # Reorder columns
    df_new_col_order = df.columns.to_list()[-4:] + df.columns.to_list()[:-4]
    df = df[df_new_col_order]

    return df  # type: ignore

In [None]:
# One formatted table per feature-set folder that has experiment keys, for each release
all_exp_keys_dfs = []
for folder in ["dfreeze_v2", "2023-01-epiatlas-freeze", "imputation"]:
    data_dir = base_data_dir / "training_results" / folder
    # sorted(): the row order of the final table does not depend on the filesystem
    for subfolder in sorted(data_dir.glob("*")):
        if subfolder.is_file():
            continue
        exp_key_dict = collect_exp_keys(subfolder)
        if not exp_key_dict:
            # No log with experiment keys under this folder: nothing to format
            continue
        df = format_exp_key_context(exp_key_dict)
        all_exp_keys_dfs.append(df)

# pd.concat on an empty list raises an unhelpful "No objects to concatenate"
if not all_exp_keys_dfs:
    raise FileNotFoundError(
        f"No experiment keys found under '{base_data_dir / 'training_results'}'"
    )
exp_keys_df = pd.concat(all_exp_keys_dfs, ignore_index=True)

In [None]:
# for col in ["release", "feature_set_name", "metadata_category", "split"]:
#     display(exp_keys_df[col].value_counts(dropna=False))

# exp_keys_df.to_csv(table_dir / "training_experiment_keys.csv", index=False)

## assay_epiclass + sample ontology for all 5 model types - 100kb_all_none

In [None]:
data_dir_100kb = base_data_dir / "training_results" / "dfreeze_v2" / "hg38_100kb_all_none"
logdir = table_dir / "dfreeze_v2" / "100kb_all_none"
if not logdir.exists():
    logdir.mkdir(parents=True)

split_md5sums = []
all_metrics = {}
for category in [ASSAY, CELL_TYPE]:
    all_split_dfs = split_results_handler.gather_split_results_across_methods(
        results_dir=data_dir_100kb,
        label_category=category,
        only_NN=False,
    )

    # Sanity check: within each split, every method must have the same input files
    # (md5sums) and the same result shape as the "NN" results
    for split_dict in all_split_dfs.values():
        ref_dict = split_dict["NN"]
        ref_md5sums = sorted(ref_dict.index.values.tolist())
        ref_shape = ref_dict.shape
        for method, df in split_dict.items():
            if sorted(df.index.values.tolist()) != ref_md5sums:
                raise ValueError("MD5sums do not match")
            if df.shape != ref_shape:
                raise ValueError("Shapes do not match")

    # Metrics are computed per split, before the splits are concatenated
    split_metrics = split_results_handler.compute_split_metrics(all_split_dfs)
    all_metrics[category] = split_metrics

    all_split_dfs_concat: Dict = split_results_handler.concatenate_split_results(all_split_dfs)  # type: ignore

    # One predictions file per method
    for method, df in all_split_dfs_concat.items():
        df = prepare_df_for_save(df, md5sum_to_epirr, md5sum_to_uuid)

        # The neural network results are published under the "MLP" name
        method = "MLP" if method == "NN" else method

        filename = f"10fold_predictions_{category}_{method}.csv"
        df.to_csv(logdir / filename, index=True, sep=",", float_format="%.4f")

In [None]:
# Classifier type, Classification task, split, acc, F1, AUC-micro, AUC-macro, N
metric_names = ("Accuracy", "F1_macro", "AUC_micro", "AUC_macro", "count")

metrics_as_rows = []
for category, split_metrics in all_metrics.items():
    for split_name, metrics in split_metrics.items():
        for method, metric_dict in metrics.items():
            metric_values = [metric_dict[metric_name] for metric_name in metric_names]
            metrics_as_rows.append((method, category, split_name, *metric_values))

In [None]:
metrics_columns = [
    "Classifier type",
    "Classification task",
    "Split name",
    "Accuracy",
    "F1-score macro",
    "AUC micro",
    "AUC macro",
    "Validation size",
]
metrics_df = pd.DataFrame(metrics_as_rows, columns=metrics_columns)
print(metrics_df.shape)

# Replace the short classifier names by their full names, one after the other
classifier_full_names = {
    "NN": "MLP",
    "RF": "RandomForest",
    "LGBM": "LightGBM",
    "LR": "LogisticRegression",
    "LinearSVC": "LinearSVM",
}
for short_name, full_name in classifier_full_names.items():
    metrics_df.replace(short_name, full_name, inplace=True)

In [None]:
# metrics_df.to_csv(table_dir / "dfreeze_v2_100kb_all_none_5algo_metrics.csv", index=False)

## Other MLP results - 100kb_all_none

In [None]:
# Same input and output folders as the previous section, re-declared so that this
# section can be run on its own.
data_dir_100kb = base_data_dir / "training_results" / "dfreeze_v2" / "hg38_100kb_all_none"
logdir = table_dir / "dfreeze_v2" / "100kb_all_none"
if not logdir.exists():
    logdir.mkdir(parents=True)

In [None]:
# Metadata categories for which MLP results are collected
categories = ["paired_end", CANCER, LIFE_STAGE, SEX, BIOMAT, "project", CELL_TYPE]

# Select 10-fold oversampling runs
excluded_run_names = ["reg", "no-mixed", "chip", "16ct", "27ct"]
all_split_dfs = split_results_handler.general_split_metrics(
    results_dir=data_dir_100kb,
    merge_assays=False,
    include_categories=categories,
    exclude_names=excluded_run_names,
    return_type="split_results",
    oversampled_only=True,
    verbose=False,
)
all_split_dfs_concat: Dict = split_results_handler.concatenate_split_results(all_split_dfs, concat_first_level=True)  # type: ignore

In [None]:
# Result key -> metadata column holding its labels: any key containing "sex" maps
# to SEX, every other key maps to itself.
cat_mapper = {key: (key if "sex" not in key else SEX) for key in all_split_dfs_concat}

In [None]:
# Save to file
# One "10fold_predictions_{category}_MLP.csv" file per category; the prepared
# frames are also kept in new_dfs for the merged table built afterwards.
new_dfs = {}
for category, df in all_split_dfs_concat.items():
    new_df = prepare_df_for_save(df, md5sum_to_epirr, md5sum_to_uuid)

    # we want to write down the expected class for both metadata versions,
    # since there were changes in-between
    if category in [LIFE_STAGE, SEX]:
        # v1.2 is inserted first, so the final column order is v1.1, then v1.2
        for version, metadata in [("v1.2", metadata_v1_2), ("v1.1", metadata_v1_1)]:
            # Official metadata is indexed by epirr: go md5sum -> epirr -> label
            idx = new_df.index.map(md5sum_to_epirr).values
            values = metadata.loc[idx, category].to_list()  # type: ignore
            new_df.insert(loc=0, column=f"Expected class {version}", value=values)
        # Replaced by the two versioned columns
        new_df.drop("Expected class", axis=1, inplace=True)

    # using training metadata
    # necessary because some cell line samples got life stage labels
    # in further versions, which would make the 10fold files
    # different than what was actually used for training
    # NOTE(review): this rationale mentions life stage, but this branch handles every
    # category except LIFE_STAGE and SEX — confirm the comment is on the right branch.
    else:
        cat = cat_mapper[category]
        values = epiatlas_meta_df[["md5sum", cat]].set_index("md5sum")
        values = values.loc[new_df.index, cat]
        # Overwrites the existing "Expected class" column with the training labels
        new_df["Expected class"] = values

    new_dfs[category] = new_df

    filename = f"10fold_predictions_{category}_MLP.csv"
    new_df.to_csv(logdir / filename, index=True, sep=",", float_format="%.4f")

In [None]:
# Number of leading non-prediction columns once each frame is normalized below.
# NOTE(review): presumably md5sum, split, Expected class, Predicted class — confirm
# against the columns returned by the split results handler.
same_col_len = 4

# Same frames as new_dfs, reshaped so that they all share one layout and can be merged
final_dfs = {}
for category, df in new_dfs.items():
    new_df = df.copy(deep=True)
    new_df.drop(["uuid", "epirr_without_version"], axis=1, inplace=True)
    new_df.insert(0, "split", new_df.pop("split"))

    # Turn the index into a column; on ValueError the index is discarded instead
    try:
        new_df = new_df.reset_index()
    except ValueError:
        new_df = new_df.reset_index(drop=True)

    # All dataframes need to have same shape for rest of code to work
    # (only the v1.2 labels are kept, as "Expected class")
    if category in [LIFE_STAGE, SEX]:
        new_df.drop("Expected class v1.1", axis=1, inplace=True)
        new_df.rename(columns={"Expected class v1.2": "Expected class"}, inplace=True)

    new_df.insert(loc=0, column="md5sum", value=new_df.pop("md5sum"))

    # this only works if all the columns to the right correspond to pred vector
    pred_cols = new_df.columns[same_col_len:].tolist()
    new_df.insert(
        loc=same_col_len, column="Max pred", value=new_df[pred_cols].max(axis=1)
    )

    # Suffix every leading column except md5sum, plus "Max pred", with the category
    # so that names stay unique after the merge
    old_names = new_df.columns[1 : same_col_len + 1]
    new_names = [f"{old_name} ({category})" for old_name in old_names]
    new_df.rename(columns=dict(zip(old_names, new_names)), inplace=True)

    final_dfs[category] = new_df

In [None]:
# Fold all per-category frames into one table with the project's merge_dataframes
final_df = functools.reduce(merge_dataframes, final_dfs.values())

# Re-attach the identifiers that were dropped before the merge
final_df["epirr"] = final_df.loc[:, "md5sum"].map(md5sum_to_epirr)
final_df["uuid"] = final_df.loc[:, "md5sum"].map(md5sum_to_uuid)

In [None]:
# Merged MLP predictions for all categories; md5sum is a regular column here, so
# the index is not written.
final_df.to_csv(
    logdir / "all_10fold_predictions_MLP.csv", index=False, sep=",", float_format="%.4f"
)

## Results for other feature sets (MLP)

In [None]:
def verify_splits_identity(
    all_results: Dict[str, Dict[str, Dict[str, pd.DataFrame]]],
    task_names: List[str],
    verbose: bool | None = None,
) -> None:
    """Verify that the splits are identical between feature sets for each task.

    all_results: {feature_set: {task_name: {split_name: results_dataframe}}}
    task_names: list of task names to verify
    verbose: print additional information
    """
    # Sanity check : MD5sums and shapes should match between reference and other feature sets, for each split
    for task_name in task_names:
        if verbose:
            print(f"Verifying task '{task_name}'")
        # Select a reference feature set and use its splits as the baseline for comparison
        reference_feature_set = "hg38_100kb_all_none"
        reference_splits = all_results[reference_feature_set][task_name]

        # Create reference MD5sums and shapes for each split in the reference feature set
        reference_md5sums = {
            split_name: sorted(df.index.tolist())
            for split_name, df in reference_splits.items()
        }
        reference_shapes = {
            split_name: df.shape for split_name, df in reference_splits.items()
        }

        # Iterate over each feature set and compare its splits against the reference
        for feature_set_name, tasks_dict in all_results.items():
            if verbose:
                print(
                    f"Verifying feature set '{feature_set_name}' against reference feature set '{reference_feature_set}'"
                )
            for split_name, df in tasks_dict[task_name].items():
                if reference_shapes[split_name] != df.shape:
                    print(
                        f"WARNING: Shape mismatch in task '{task_name}', split '{split_name}', "
                        f"between reference feature set '{reference_feature_set}' and feature set '{feature_set_name}'",
                    )
                if reference_md5sums[split_name] != sorted(df.index.tolist()):
                    print(
                        f"WARNING: MD5sums mismatch in task '{task_name}', split '{split_name}', "
                        f"between reference feature set '{reference_feature_set}' and feature set '{feature_set_name}'",
                    )

In [None]:
categories = [ASSAY, CELL_TYPE]
# Feature sets requested from the results handler (includes the 100kb reference set)
include_sets = [
    "hg38_10mb_all_none_1mb_coord",
    "hg38_100kb_random_n316_none",
    "hg38_1mb_all_none",
    "hg38_100kb_random_n3044_none",
    "hg38_100kb_all_none",
    "hg38_gene_regions_100kb_coord_n19864",
    "hg38_10kb_random_n30321_none",
    "hg38_regulatory_regions_n30321",
    "hg38_1kb_random_n30321_none",
    "hg38_cpg_topvar_200bp_10kb_coord_n30k",
    "hg38_10kb_all_none",
    "hg38_regulatory_regions_n303114",
    "hg38_1kb_random_n303114_none",
    "hg38_cpg_topvar_200bp_10kb_coord_n300k",
]
# Name fragments passed to the handler as exclude_names
exclude_names = ["7c", "chip-seq-only", "27ct", "16ct"]

# Select 10-fold oversampling runs
# expected result shape: {feature_set: {task_name: {split_name: results_dataframe}}}
# The parent of the 100kb results folder is searched for the feature sets.
all_results: Dict[
    str, Dict[str, Dict[str, pd.DataFrame]]
] = split_results_handler.obtain_all_feature_set_data(
    return_type="split_results",
    parent_folder=data_dir_100kb.parent,
    merge_assays=False,
    include_categories=categories,
    include_sets=include_sets,
    exclude_names=exclude_names,
    verbose=False,
)  # type: ignore

In [None]:
# replace assay_11c with assay, for the feature sets that have an assay_11c task
assay_11c_key = f"{ASSAY}_11c"
for feature_set in all_results.values():
    if assay_11c_key in feature_set:
        feature_set[ASSAY] = feature_set.pop(assay_11c_key)

In [None]:
# Prints a warning for every split that differs from the 100kb reference feature set
verify_splits_identity(all_results, categories)

In [None]:
logdir = table_dir / "dfreeze_v2" / "other_feature_sets"
logdir.mkdir(parents=True, exist_ok=True)

# One predictions file per (feature set, task); the 100kb reference set is left out
for feature_set_name, tasks_dict in all_results.items():
    if feature_set_name != "hg38_100kb_all_none":
        all_split_dfs_concat: Dict = split_results_handler.concatenate_split_results(
            tasks_dict, concat_first_level=True
        )  # type: ignore
        for task_name, df in all_split_dfs_concat.items():
            df = prepare_df_for_save(df, md5sum_to_epirr, md5sum_to_uuid)

            filename = f"{feature_set_name}_10fold_predictions_{task_name}.csv"
            df.to_csv(logdir / filename, index=True, sep=",", float_format="%.4f")

## Winsorized files and/or blacklist zeroed

In [None]:
categories = [ASSAY, CELL_TYPE, SEX, BIOMAT]
# Reference feature set and its blacklist-zeroed / winsorized variants
include_sets = [
    "hg38_100kb_all_none",
    "hg38_100kb_all_none_0blklst",
    "hg38_100kb_all_none_0blklst_winsorized",
]

results_folder = base_data_dir / "training_results" / "2023-01-epiatlas-freeze"
if not results_folder.exists():
    raise FileNotFoundError(f"Folder '{results_folder}' not found")

# parents=True: this cell must not depend on an earlier cell having created table_dir
logdir = table_dir / "2023-01-epiatlas-freeze"
logdir.mkdir(parents=True, exist_ok=True)

In [None]:
# Select 10-fold oversampling runs
# expected result shape: {feature_set: {task_name: {split_name: results_dataframe}}}
# NOTE(review): oversampled_only=False is passed here although the comment above
# mentions oversampling runs — confirm which one is intended.
all_results: Dict[
    str, Dict[str, Dict[str, pd.DataFrame]]
] = split_results_handler.obtain_all_feature_set_data(
    return_type="split_results",
    parent_folder=results_folder,
    merge_assays=False,
    include_categories=categories,
    include_sets=include_sets,
    oversampled_only=False,
    verbose=False,
)  # type: ignore

# Show which feature sets were actually found
display(all_results.keys())

In [None]:
# Check every task found for the reference feature set
tasks_collected = list(all_results["hg38_100kb_all_none"])
verify_splits_identity(all_results, tasks_collected, verbose=True)

In [None]:
# save concatenated result
# One predictions file per (feature set, task), reference feature set included
for feature_set_name, tasks_dict in all_results.items():
    concatenated_dfs = split_results_handler.concatenate_split_results(
        tasks_dict, concat_first_level=True
    )
    for task_name, concatenated_df in concatenated_dfs.items():
        concatenated_df = prepare_df_for_save(concatenated_df, md5sum_to_epirr, md5sum_to_uuid)  # type: ignore
        filename = f"{feature_set_name}_10fold_predictions_{task_name}.csv"
        print(f"Saving {filename}")
        concatenated_df.to_csv(
            logdir / filename, index=True, sep=",", float_format="%.4f"
        )

### Evaluate input dataset discrepancy in assay_epiclass

In [None]:
# The handler is re-created here (it already exists from the setup section) and the
# v2 metadata is loaded again, since the first copy was deleted after building the lookups.
metadata_handler = MetadataHandler(paper_dir)
metadata_df = metadata_handler.load_metadata_df("v2", merge_assays=False)

In [None]:
# Assay composition of the files used by each feature set (assay task)
values_counts = {}
for feature_set_name, tasks_dict in all_results.items():
    concatenated_dfs = split_results_handler.concatenate_split_results(
        tasks_dict, concat_first_level=True
    )
    md5sums = concatenated_dfs[ASSAY].index.tolist()
    print(f"{feature_set_name}: {len(md5sums)}")

    is_used = metadata_df.index.isin(md5sums)
    metadata_subset = metadata_df[is_used]
    values_counts[feature_set_name] = metadata_subset[ASSAY].value_counts()

In [None]:
# Per-assay difference in file count between the blacklist-zeroed and the reference
# feature set. fill_value=0 keeps assays that appear in only one of the two sets;
# a plain `-` would give NaN for them and hide the discrepancy.
display(
    values_counts["hg38_100kb_all_none_0blklst"]
    .sub(values_counts["hg38_100kb_all_none"], fill_value=0)
    .astype(int)
)

## ENCODE predictions

In [None]:
other_preds_dir = table_dir / "dfreeze_v2" / "predictions"
# Files are copied and written into this folder below: it must exist beforehand.
other_preds_dir.mkdir(parents=True, exist_ok=True)

See:
- src/python/epi_ml/utils/notebooks/paper/encode_metadata_creation.ipynb
- src/python/epi_ml/utils/notebooks/paper/encode_pred_analysis.ipynb  

In [None]:
output_dir = table_dir / "datasets_composition"
# `to_csv` does not create missing parent folders
output_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# ENCODE accession columns counted in the composition tables below
encode_id_cols = ["EXPERIMENT_accession", "BIOSAMPLE_accession", "FILE_accession"]

In [None]:
encode_preds_path = (
    predictions_dir
    / "encode"
    / "complete_encode_predictions_augmented_2025-02_metadata.csv"
)
# Copy to an explicit file path inside an existing folder: given a directory path
# that does not exist yet, shutil.copy creates a *file* with that name instead of
# copying into a folder.
other_preds_dir.mkdir(parents=True, exist_ok=True)
shutil.copy(encode_preds_path, other_preds_dir / encode_preds_path.name)

In [None]:
# low_memory=False: the file is parsed in one pass rather than in chunks
encode_preds_df = pd.read_csv(encode_preds_path, encoding="utf8", low_memory=False)
print(encode_preds_df.shape)

In [None]:
# Dataset size: rows (one per file), distinct experiments, distinct biosamples
N_files = encode_preds_df.shape[0]
N_exp = encode_preds_df["EXPERIMENT_accession"].nunique()
N_biospecimen = encode_preds_df["BIOSAMPLE_accession"].nunique()
print(N_files, N_exp, N_biospecimen)

In [None]:
# Per assay: number of distinct experiments and files (rows with a missing assay
# form their own group)
assays_df = encode_preds_df.groupby(ASSAY, dropna=False).agg(
    {
        "EXPERIMENT_accession": "nunique",
        "FILE_accession": "nunique",
    }
)
# "X_accession" -> "X count". rename() ignores labels that are not columns, so the
# full id list can be given in one call and no KeyError handling is needed.
assays_df = assays_df.rename(
    columns={label: label.replace("_accession", " count") for label in encode_id_cols}
)

assays_df["file_per_experiment"] = assays_df["FILE count"] / assays_df["EXPERIMENT count"]

assays_df.to_csv(output_dir / "ENCODE_assays.csv")

In [None]:
# Missing values are labelled "unknown" so that they form explicit groups
# (fillna already returns a new frame, no extra copy is needed)
df_biospecimens = encode_preds_df.fillna("unknown")

# Per biosample type and term: number of distinct biosamples, experiments and files
groupby = df_biospecimens.groupby(
    ["BIOSAMPLE_TYPE_name", "BIOSAMPLE_TYPE_term_name"], dropna=False
).agg(
    {
        "BIOSAMPLE_accession": "nunique",
        "EXPERIMENT_accession": "nunique",
        "FILE_accession": "nunique",
    }
)

# "X_accession" -> "X count". rename() ignores labels that are not columns, so no
# KeyError handling is needed.
groupby = groupby.rename(
    columns={label: label.replace("_accession", " count") for label in encode_id_cols}
)

groupby.to_csv(output_dir / "ENCODE_biospecimens.csv")

## ChIP-Atlas predictions

See
- `src/python/epi_ml/utils/notebooks/paper/c-a_pred_analysis.ipynb`
- `src/python/epi_ml/utils/notebooks/paper/c-a_metadata.ipynb`

Predictions pre-joined and then merged with custom metadata (with info on 4 databases).

In [None]:
# ChIP-Atlas joined metadata and predictions, both tab-separated
meta_path = metadata_dir / "chip_atlas" / "CA_metadata_joined_20250306.tsv"
meta_df = pd.read_csv(meta_path, sep="\t", low_memory=False)
print(meta_df.shape)

pred_path = predictions_dir / "C-A" / "CA_only_pred_20240606.tsv"
pred_df = pd.read_csv(pred_path, sep="\t", low_memory=False)
# Also show the name of the first column, used as the merge key afterwards
print(pred_df.shape, pred_df.columns[0])

In [None]:
# The first column of each table is used as the join key
col1 = meta_df.columns[0]  # metadata key
col2 = pred_df.columns[0]  # predictions key

# pred_df is the left table: its own key goes in left_on, the metadata key in
# right_on (the reverse only works when both columns happen to share one name).
full_df = pd.merge(pred_df, meta_df, how="left", left_on=col2, right_on=col1)

# to_csv does not create missing parent folders
other_preds_dir.mkdir(parents=True, exist_ok=True)
full_df.to_csv(other_preds_dir / "ChIP-Atlas_predictions.csv.gz", compression="gzip")

## ChIP-Seq_imputed_with_RNA-Seq_only predictions

In [None]:
# Destination of the imputed-data predictions table (the save call at the end of
# this section is currently commented out)
output_dir = table_dir / "dfreeze_v2" / "epiatlas_imputed"

Predictions are from the epiclass_11c complete training (with oversampling) MLP classifier  
Training details at 0f8e5eb996114868a17057bebe64f87c (comet-ml id)

In [None]:
# Predictions on ChIP-Seq tracks imputed from RNA-Seq only
pred_folder = base_data_dir / "training_results" / "predictions" / "epiatlas_imputed"
pred_file = "complete_no_valid_oversample_test_prediction_100kb_all_none_ChIP-Seq_imputed_with_RNA-Seq_only.csv"
pred_df = pd.read_csv(pred_folder / pred_file)
print(pred_df.shape)

In [None]:
# The unnamed first csv column holds the file names
pred_df.rename(columns={"Unnamed: 0": "filename"}, inplace=True)

# filename of format 'impute_[ihec-id]_[expected-class]_[resolution]_[filter_in]_[filter_out].csv'
# The expected class is the third "_"-separated field, lower-cased. It is written
# to "True class", which is then renamed "Expected class".
pred_df["True class"] = pred_df["filename"].str.split("_", expand=True)[2].str.lower()
pred_df.rename(columns={"True class": "Expected class"}, inplace=True)

In [None]:
# Insert the correctness flag right after the (first) "Predicted class" column
idx_pred_col = np.where(pred_df.columns == "Predicted class")[0][0]
pred_df.insert(
    loc=int(idx_pred_col + 1),
    column="Same?",
    value=pred_df["Expected class"] == pred_df["Predicted class"],
)

In [None]:
# Fraction of rows where the predicted class equals the expected class
print(f"Accuracy: {pred_df['Same?'].sum() / pred_df.shape[0]:.2%}")

In [None]:
# Position-based: the row-wise max is taken over the nb_classes columns that start
# at index non_pred_vector_cols, and "Max pred" is inserted at that same index.
# NOTE(review): assumes the 4 leading columns are filename, Expected class,
# Predicted class and Same?, followed by the 11 class probabilities — confirm.
non_pred_vector_cols = 4
nb_classes = 11
pred_df.insert(
    loc=non_pred_vector_cols,
    column="Max pred",
    value=pred_df.iloc[:, non_pred_vector_cols : non_pred_vector_cols + nb_classes].max(
        axis=1  # type: ignore
    ),
)

In [None]:
# pred_df.to_csv(output_dir / "epiatlas_imputed_w_rna_only_predictions.csv", index=False)

## recount3

In [None]:
# Re-created so that this section can be run without the setup cells above
# NOTE(review): presumably for standalone runs; the same object already exists after the setup section.
split_results_handler = SplitResultsHandler()

In [None]:
# Folder holding the recount3 predictions of the 100kb classifiers
recount3_folder = (
    base_data_dir
    / "training_results"
    / "predictions"
    / "recount3"
    / "hg38_100kb_all_none"
)
if not recount3_folder.exists():
    # Name the missing folder, as done for the other folder checks of this notebook
    raise FileNotFoundError(f"Folder '{recount3_folder}' not found")

In [None]:
# Locate the prediction files of each category; exactly 7 files are expected each time
split_pred_files = {}
for cat in [ASSAY, SEX, LIFE_STAGE, CANCER]:
    pred_files = list(recount3_folder.rglob(f"{cat}*/**/recount3/complete_*.csv"))
    nb_found = len(pred_files)
    if nb_found != 7:
        raise FileNotFoundError(f"Expected 7 files for {cat}, found {nb_found}")
    split_pred_files[cat] = pred_files

nb_categories = len(split_pred_files)
if nb_categories != 4:
    raise FileNotFoundError(f"Expected 4 categories, found {nb_categories}")

In [None]:
# Read all prediction files of a category and stack them into a single frame
pred_dfs = {}
for cat, pred_files in split_pred_files.items():
    dfs = [pd.read_csv(pred_file, low_memory=False) for pred_file in pred_files]
    concat_df = pd.concat(dfs, ignore_index=True)
    pred_dfs[cat] = concat_df

In [None]:
# Normalize each category's predictions: drop the label column if present, name the
# file column, add "Max pred" and two id columns parsed from the file name.
for cat, pred_df in list(pred_dfs.items()):
    try:
        pred_df = pred_df.drop("True class", axis=1)
    except KeyError:
        # No "True class" column in this file: nothing to drop
        pass
    pred_df = pred_df.rename(columns={"Unnamed: 0": "filename"})

    # Add max pred + move it to front
    pred_df = split_results_handler.add_max_pred(pred_df, target_label="Predicted class")
    pred_df.insert(2, "Max pred", pred_df.pop("Max pred"))
    # Keeps rows whose max pred is >= 0; NaN values fail the comparison and are dropped
    pred_df = pred_df[pred_df["Max pred"] >= 0]

    # Get id columns
    # Third "."-separated field of the file name, itself split on "_"
    id_cols = (
        pred_df["filename"].str.split(".", expand=True)[2].str.split("_", expand=True)
    )

    pred_df.insert(1, "id1", id_cols.loc[:, 0])
    pred_df.insert(2, "id2", id_cols.loc[:, 1])

    pred_dfs[cat] = pred_df

In [None]:
# display(pred_dfs[ASSAY]["filename"].str.split(".", expand=True)[2].str.split("_",expand=True).head())
# display(pred_dfs[ASSAY]["id1"].nunique(), pred_dfs[ASSAY]["id2"].nunique())
# display(pred_dfs[ASSAY]["id2"].str.slice(0,3).value_counts())
# id2 must identify each row: it is used as the merge key ("ID") later on
assert pred_dfs[ASSAY]["id2"].nunique() == pred_dfs[ASSAY].shape[0]

In [None]:
# meta_name is reused below in the name of the merged predictions file
meta_name = "harmonized_metadata_20250122_leuk2"
metadata_file = metadata_dir / "recount3" / f"recount_{meta_name}.tsv"
recount_metadata_df = pd.read_csv(metadata_file, sep="\t")

In [None]:
# display(recount_metadata_df["cell_line_flag"].value_counts(dropna=False))

In [None]:
# Align the recount3 column names with the category names used for the predictions
recount_metadata_df.rename(
    mapper={
        "harmonized_assay": ASSAY,
        "harmonized_lifestage": LIFE_STAGE,
        "harmonized_sex": SEX,
        "harmonized_cancer": CANCER,
    },
    axis=1,
    inplace=True,
)
# Missing values become the explicit "unknown" label, in every column
recount_metadata_df.fillna("unknown", inplace=True)

In [None]:
# Count samples per tissue, taking the part of "tissue_keyword" before the first ":"
recount_metadata_df["tissue_keyword"].str.split(":", expand=True)[0].value_counts(
    dropna=False
).to_csv(table_dir / "datasets_composition" / "recount3_biospecimen.csv")

In [None]:
# Count samples per assay
# NOTE(review): uses the literal "assay_epiclass" where the rest of the notebook uses ASSAY — confirm they are the same column.
recount_metadata_df["assay_epiclass"].value_counts(dropna=False).to_csv(
    table_dir / "datasets_composition" / "recount3_assays.csv"
)

In [None]:
def merge_all_recount3_preds(
    pred_dfs: Dict[str, pd.DataFrame], full_metadata_df: pd.DataFrame
) -> pd.DataFrame:
    """Build one wide table from the per-category recount3 prediction tables.

    Each table in `pred_dfs` receives its expected class from the column of
    `full_metadata_df` named after its category (inner join on "ID", where "ID"
    is a copy of the table's "id2" column). The three columns at positions 1-3
    ("Expected class" and the two columns that follow it) are suffixed with
    " (<category>)". The ASSAY, SEX, CANCER and LIFE_STAGE tables are then
    combined, in that order, with `merge_dataframes`, and the full metadata is
    joined on "ID".

    Args:
        pred_dfs: Prediction table per category. Every table needs "id1" and
            "id2" columns; the ASSAY, SEX, CANCER and LIFE_STAGE keys must exist.
        full_metadata_df: Metadata with an "ID" column and one column per
            category present in `pred_dfs`.

    Returns:
        The merged table. Metadata columns whose name already exists in the
        merged predictions are not duplicated.
    """
    same_col_len = 5
    # Positions 1 .. same_col_len - 2: "Expected class" and the two columns after it.
    category_specific = slice(1, same_col_len - 1)

    # Make all different columns have unique relevant names except for the pred vector
    new_dfs = {}
    for cat, preds in pred_dfs.items():
        table = preds.copy()
        table["ID"] = table["id2"]
        table = table.drop(columns=["id1", "id2"])
        # "Same?" is optional in the input tables.
        table = table.drop(columns=["Same?"], errors="ignore")
        table = table.merge(
            full_metadata_df[["ID", cat]],
            left_on="ID",
            right_on="ID",
            how="inner",
        )
        # The metadata column of this category becomes the expected class, in second position.
        table.insert(1, "Expected class", table.pop(cat))

        suffixed = {name: f"{name} ({cat})" for name in table.columns[category_specific]}
        new_dfs[cat] = table.rename(columns=suffixed)

    ordered_tables = [new_dfs[cat] for cat in (ASSAY, SEX, CANCER, LIFE_STAGE)]

    # NOTE(review): tables are combined on "external_id" — confirm how `merge_dataframes`
    # uses that key with these tables.
    full_merged_df = functools.reduce(
        lambda left, right: merge_dataframes(left, right, on="external_id"),
        ordered_tables,
    )
    full_merged_df = full_merged_df.reset_index(drop=True)

    full_merged_df = full_merged_df.merge(
        full_metadata_df,
        on="ID",
        how="inner",
        suffixes=("", "_DROP"),
    )
    # Metadata columns that clashed with an existing name got the "_DROP" suffix: discard them.
    clashing_cols = [col for col in full_merged_df.columns if col.endswith("_DROP")]
    return full_merged_df.drop(columns=clashing_cols)

In [None]:
# Wide recount3 table: predictions of every category joined with the sample metadata.
final_df = merge_all_recount3_preds(
    pred_dfs=pred_dfs, full_metadata_df=recount_metadata_df
)
print("Final df shape:", final_df.shape)

In [None]:
# Move the "ID" column to the second position.
id_values = final_df.pop("ID")
final_df.insert(loc=1, column="ID", value=id_values)

In [None]:
# Destination of the merged recount3 table. The write itself is left disabled;
# the next section reads a file with this same name.
out_path = recount3_folder.joinpath(f"recount3_merged_preds_{meta_name}.tsv.gz")
# final_df.to_csv(out_path, sep="\t", index=False, compression="gzip")

### accuracy

In [None]:
# Reload the merged recount3 predictions (gzip-compressed, tab-separated) from disk.
preds_path = recount3_folder.joinpath(f"recount3_merged_preds_{meta_name}.tsv.gz")
full_df = pd.read_csv(preds_path, delimiter="\t")

In [None]:
# Assay predictions on samples with a known assay. A prediction is counted as correct
# when it is either "rna_seq" or "mrna_seq". Results are printed for increasing
# thresholds on the assay prediction score.
assay_df = full_df[full_df[ASSAY] != "unknown"]
N = assay_df.shape[0]

assay_pred_col = f"Predicted class ({ASSAY})"
assay_score_col = f"Max pred ({ASSAY})"


def _rna_pred_fraction(pred_fractions: pd.Series) -> float:
    """Return the summed fraction of "rna_seq" and "mrna_seq" predictions.

    A class that is absent from `pred_fractions` counts as 0 instead of raising
    KeyError (this happens when a subset contains no prediction of that class).
    """
    return pred_fractions.reindex(["rna_seq", "mrna_seq"], fill_value=0).sum()


def _grouped_pred_counts(df: pd.DataFrame) -> pd.DataFrame:
    """Count rows per (assay, predicted assay) pair, largest count first within an assay."""
    return (
        df.groupby([ASSAY, assay_pred_col])
        .size()
        .reset_index()
        .rename(columns={0: "Count"})
        .sort_values(by=[ASSAY, "Count"], ascending=[True, False])
    )


for max_pred in [0, 0.6, 0.8]:
    subset = assay_df[assay_df[assay_score_col] >= max_pred]
    counts = subset[assay_pred_col].value_counts()

    N_subset = counts.sum()
    counts_perc = counts / N_subset
    correct_perc = _rna_pred_fraction(counts_perc)
    print(f"min_PredScore >= {max_pred} ({N_subset/N:.2%} left): {correct_perc:.2%}\n")

    print("Predictions grouped, assay types left as is")
    print(_grouped_pred_counts(subset), "\n")

    print("Predictions grouped, all rna types = rna")
    tmp_df = subset.copy()
    tmp_df[ASSAY] = "rna_seq"
    # Assign the replaced column back to the frame: a chained
    # `.loc[:, col].replace(..., inplace=True)` modifies a temporary object and
    # leaves the frame untouched under pandas Copy-on-Write.
    tmp_df[assay_pred_col] = tmp_df[assay_pred_col].replace("mrna_seq", "rna_seq")
    print(_grouped_pred_counts(tmp_df), "\n")

    print("Breakdown by assay type")
    assay_breakdown = subset[ASSAY].value_counts(dropna=False)
    print(assay_breakdown / assay_breakdown.sum(), "\n")
    for assay_type in assay_breakdown.index:
        assay_type_subset = subset[subset[ASSAY] == assay_type]

        counts = assay_type_subset[assay_pred_col].value_counts()
        N_subset = counts.sum()
        counts_perc = counts / N_subset
        correct_perc = _rna_pred_fraction(counts_perc)
        print(f"{assay_type} acc: {correct_perc:.2%}\n")
        print(f"{assay_type} preds:\n{counts_perc}\n")
    print()

In [None]:
# Sex, cancer and life-stage predictions: match rate, per-class report and
# row-normalised confusion matrix, for increasing thresholds on the assay prediction score.
for max_pred in [0, 0.6, 0.8]:
    subset = full_df[full_df[f"Max pred ({ASSAY})"] >= max_pred]
    print(f"min_PredScore >= {max_pred}")

    for cat in [SEX, CANCER, LIFE_STAGE]:
        pred_label = f"Predicted class ({cat})"
        true_label = f"Expected class ({cat})"

        # Work on this category's two label columns only, so that the relabelling
        # below cannot leak into the next category of the loop.
        labels_df = subset[[true_label, pred_label]]
        if cat == CANCER:
            labels_df = labels_df.replace("healthy", "non-cancer")

        known_pred = labels_df[labels_df[true_label] != "unknown"]
        if cat == LIFE_STAGE:
            known_pred = known_pred[known_pred[true_label] != "children"]
        # print(known_pred[true_label].value_counts(dropna=False))

        N_known = known_pred.shape[0]
        if N_known == 0:
            # Nothing to score: avoids a division by zero and sklearn errors on empty input.
            print(f"{cat}: no sample with a known expected class, skipping\n")
            continue

        y_pred = known_pred[pred_label]
        y_true = known_pred[true_label]

        # Union of expected AND predicted labels, so that no class present in either
        # column is missing from the report or the confusion matrix.
        classes = sorted(set(y_true.unique()) | set(y_pred.unique()))

        N_correct = (y_pred == y_true).sum()
        print(f"{cat} prediction match (%): {N_correct/N_known*100:.2f}")

        # `labels` is passed together with `target_names` so both always have the same
        # length and order.
        report = classification_report(
            y_true, y_pred, labels=classes, target_names=classes, zero_division=0
        )  # type: ignore
        print(report + "\n")

        print(f"confusion matrix classes row order: {classes}")
        cm = sk_cm(y_true, y_pred, normalize="true", labels=classes)
        print(str(cm) + "\n")

    print()

## Sanity check for RNA Unique_raw tracks (unstranded data).

In [None]:
# For each prediction CSV found under `table_dir`, report how many of its md5sums
# belong to "Unique_raw" tracks. The report is written to `outfile`.
outfile = table_dir / "experiments_including_unique_raw_files.list"
# Start from a fresh, empty report file.
outfile.unlink(missing_ok=True)
outfile.touch()

v2_meta_df = metadata_handler.load_metadata_df("v2", merge_assays=False)
# NOTE(review): assumes the metadata index holds md5sums — confirm against MetadataHandler.
md5_unique_raw = v2_meta_df[v2_meta_df["track_type"] == "Unique_raw"].index.unique()

with outfile.open("w", encoding="utf-8") as out:
    print(f"Total Unique_raw md5sums: {len(md5_unique_raw)}", file=out)
    # Recursive search for every CSV whose name contains "pred".
    for pred_file in table_dir.rglob("*pred*.csv"):
        # Files with "recount3" or "encode" anywhere in their path are skipped.
        if any(label in str(pred_file) for label in ["recount3", "encode"]):
            continue
        df = pd.read_csv(pred_file)

        # Get md5sums
        try:
            md5sums = set(df["md5sum"])
        except KeyError:
            # Fallback: use the index as md5sums when its first label is a 32-character string.
            # NOTE(review): `pd.read_csv` is called without `index_col`, so the index is
            # presumably a default integer range and this branch may never match — verify.
            if isinstance(df.index[0], str) and len(df.index[0]) == 32:
                md5sums = set(df.index)
            else:
                print(f"Could not find md5sum column in {pred_file}", file=out)
                continue

        shared_md5sums = md5sums.intersection(md5_unique_raw)

        # One report line per file: path relative to `table_dir`, then the shared md5sum count.
        pred_file_relpath = pred_file.relative_to(table_dir)
        print(f"{pred_file_relpath}: {len(shared_md5sums)}", file=out)