In [None]:
"""Get ENCODE EpiRRs, and determine which datasets are in EpiATLAS"""
# pylint: disable=import-error

In [None]:
import json
from collections import Counter
from pathlib import Path

import pandas as pd
import requests
from IPython.display import display
from tqdm import tqdm

from epi_ml.core.metadata import Metadata

In [None]:
# Name of the assay column; used further down to index both `encode_ihec_df`
# and `encode_meta_df` when counting files per assay.
ASSAY = "assay_epiclass"

First, download summary of all EpiRR epigenomes: https://www.ebi.ac.uk/epirr/docs  
This was already done.

In [None]:
# Root of the paper outputs, resolved from the current user's home directory.
base_dir = Path.home().joinpath("Projects/epiclass/output/paper")
metadata_dir = base_dir.joinpath("data/metadata")

# Every later cell reads from this folder, so stop right away when it is missing.
if not metadata_dir.exists():
    raise ValueError(f"Path {metadata_dir} does not exist.")

### Parse EpiRR general metadata file

In [None]:
# Folder holding the ENCODE-related metadata files.
encode_metadata_dir = metadata_dir / "encode"

# Stem shared by the EpiRR summary JSON (input) and its CSV export (output).
filename = "epirr_epigenomes_2025-02"
epigenomes_summary_path = encode_metadata_dir / f"{filename}.json"
epigenomes_summary_csv_path = encode_metadata_dir / f"{filename}.csv"

# Read the EpiRR summary that was downloaded beforehand.
with epigenomes_summary_path.open("r", encoding="utf-8") as f:
    epigenomes_summary = json.load(f)

# Tabular view of the summary, also saved next to the JSON for convenience.
epigenomes_summary_df = pd.DataFrame(epigenomes_summary)
epigenomes_summary_df.to_csv(epigenomes_summary_csv_path, index=False)

In [None]:
# Number of epigenomes per project; rows without a project are counted too.
project_counts = epigenomes_summary_df["project"].value_counts(dropna=False)
display(project_counts)

In [None]:
# Accessions of the epigenomes whose project is exactly "ENCODE".
is_encode_project = epigenomes_summary_df["project"] == "ENCODE"
encode_epirrs = epigenomes_summary_df.loc[is_encode_project, "accession"].tolist()

In [None]:
# Report how many ENCODE accessions were selected from the EpiRR summary.
print(f"ENCODE EpiRRs: {len(encode_epirrs)}")

## Download specific experiments metadata

Download metadata for all encode epigenomes.

In [None]:
# On-disk cache for the per-epigenome metadata fetched from the EpiRR API.
encode_metadata_path = encode_metadata_dir.joinpath(
    "encode_epigenomes_metadata_2025-02.json"
)

In [None]:
# Fetch the detailed EpiRR record of every ENCODE epigenome, once.
# The result is cached on disk: when the cache file exists this cell does nothing.
if not encode_metadata_path.exists():
    # EpiRR REST endpoint; the placeholder receives the epigenome accession.
    base_url = "https://www.ebi.ac.uk/epirr/api/v1/epigenome?accession={}"

    # Parsed JSON records, one per successfully fetched accession.
    metadata_list = []
    # Accessions that could not be fetched (network error or non-200 status).
    failed_epirrs = []

    # Use tqdm for a progress bar
    for epirr in tqdm(encode_epirrs, desc="Fetching Metadata", unit="entry"):
        try:
            # Without a timeout a stalled connection would hang the notebook forever.
            response = requests.get(
                base_url.format(epirr),
                headers={"accept": "application/json"},
                timeout=60,
            )
        except requests.RequestException as err:
            print(f"Failed to fetch {epirr}: {err}")
            failed_epirrs.append(epirr)
            continue

        if response.status_code == 200:
            metadata_list.append(response.json())  # Append parsed JSON
        else:
            print(f"Failed to fetch {epirr}: {response.status_code}")
            failed_epirrs.append(epirr)

    # Never cache a partial download: the `exists()` guard above would skip every
    # later retry, and a later cell asserts that all ENCODE EpiRRs are present.
    if failed_epirrs:
        raise RuntimeError(
            f"Could not fetch {len(failed_epirrs)} EpiRR record(s); "
            f"nothing was written to {encode_metadata_path}. Re-run this cell to retry."
        )

    with open(encode_metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata_list, f, indent=2)

    print(f"Metadata saved to {encode_metadata_path}")

## Parse specific metadata for accessions

In [None]:
# The path is set again here so this cell can run without the download cell.
encode_metadata_path = encode_metadata_dir.joinpath(
    "encode_epigenomes_metadata_2025-02.json"
)

# Load the cached per-epigenome records.
with encode_metadata_path.open("r", encoding="utf-8") as f:
    encode_metadata = json.load(f)

In [None]:
# Flatten the records into (experiment primary id, EpiRR accession) pairs,
# one pair per entry of each epigenome's "raw_data" list.
# The same primary id may occur several times (e.g. ENCSR266XMB, an input file);
# multiple occurrences are fine.
accessions_and_epirr = [
    (raw_file["primary_id"], dset["accession"])
    for dset in encode_metadata
    for raw_file in dset["raw_data"]
]
print("ENCODE total accessions:", len(accessions_and_epirr))

In [None]:
# Occurrences of each experiment accession across all EpiRR records.
primary_ids_count = Counter(primary_id for primary_id, _ in accessions_and_epirr)

# The counter has one key per distinct accession.
print("ENCODE unique accessions:", len(primary_ids_count))
print(primary_ids_count.most_common(5))

In [None]:
# Every ENCODE EpiRR from the summary must appear in the detailed records, and vice versa.
fetched_epirrs = {epirr for _, epirr in accessions_and_epirr}
assert fetched_epirrs == set(encode_epirrs)

## Compare with EpiATLAS

In [None]:
# EpiATLAS metadata table (IHEC harmonization v1.2, extended).
epiatlas_metadata_path = metadata_dir.joinpath(
    "official", "IHEC_metadata_harmonization.v1.2.extended.csv"
)
epiatlas_df = pd.read_csv(epiatlas_metadata_path, index_col=False)

In [None]:
# Distinct EpiRR ids (without version) listed in the EpiATLAS metadata.
epiatlas_epirrs = set(epiatlas_df["epirr_id_without_version"].tolist())

# ENCODE EpiRRs found in EpiATLAS, and those that are not.
encode_epirrs_set = set(encode_epirrs)
common_epirrs = encode_epirrs_set & epiatlas_epirrs
diff_epirr = encode_epirrs_set - epiatlas_epirrs

In [None]:
# Sizes of the ENCODE EpiRR list, the EpiATLAS EpiRR set, and their intersection.
print(f"ENCODE EpiRRs: {len(encode_epirrs)}")
print(f"EpiATLAS EpiRRs: {len(epiatlas_epirrs)}")
print(f"ENCODE EpiRRs in EpiATLAS: {len(common_epirrs)}")

In [None]:
# One row per (experiment accession, EpiRR) pair.
pair_columns = ["experiment_accession", "epirr_no_version"]
encode_accessions_df = pd.DataFrame.from_records(
    accessions_and_epirr, columns=pair_columns
)
print(encode_accessions_df.shape)

# Flag the rows whose EpiRR is also present in EpiATLAS.
epirr_is_in_epiatlas = encode_accessions_df["epirr_no_version"].isin(common_epirrs)
encode_accessions_df["in_epiatlas"] = epirr_is_in_epiatlas

in_epiatlas_counts = encode_accessions_df["in_epiatlas"].value_counts(dropna=False)
display(in_epiatlas_counts)

In [None]:
# Preview of the first five rows of the accession/EpiRR table.
display(encode_accessions_df.head(n=5))

In [None]:
# Save the accession/EpiRR table, including its `in_epiatlas` flag.
encode_epirrs_csv_path = encode_metadata_dir / "encode_epirrs_2025-02.csv"
encode_accessions_df.to_csv(encode_epirrs_csv_path, index=False)

EpiRR is less useful because ENCODE only submitted complete epigenomes. EpiATLAS also includes partial ones.

## Compare with previous ENCODE metadata

In [None]:
# Previously assembled ENCODE metadata, and the tab-separated ENCODE/IHEC keys table.
encode_meta_path = encode_metadata_dir / "encode_metadata_2023-10-25_clean-v2.csv"
encode_ihec_path = encode_metadata_dir / "ENCODE_IHEC_keys.tsv"

encode_meta_df = pd.read_csv(encode_meta_path)
encode_ihec_df = pd.read_csv(encode_ihec_path, sep="\t")

# Shapes of both tables, one per line.
print(encode_meta_df.shape, encode_ihec_df.shape, sep="\n")

In [None]:
# First rows of both tables, shown one after the other.
display(encode_meta_df.head(), encode_ihec_df.head())

In [None]:
# Distinct (non-null) experiment accessions in each table.
N_accession_1 = encode_meta_df["experiment_accession"].nunique(dropna=True)
N_accession_2 = encode_ihec_df["accession"].nunique(dropna=True)

print(f"ENCODE metadata 2023-10-25 accessions: {N_accession_1}")
print(f"ENCODE-IHEC file accessions: {N_accession_2}")

In [None]:
# Files per assay in the ENCODE/IHEC keys table (missing assay counted as well).
ihec_assay_counts = encode_ihec_df[ASSAY].value_counts(dropna=False)
display(ihec_assay_counts)

In [None]:
# Files per assay in the previous ENCODE metadata.
display(encode_meta_df[ASSAY].value_counts(dropna=False))

# Same kind of count, on the "Assay" column, restricted to files whose md5sum
# is not listed under "ENC_ID" in the ENCODE/IHEC keys table.
not_in_ihec_keys = ~encode_meta_df["md5sum"].isin(encode_ihec_df["ENC_ID"])
display(encode_meta_df.loc[not_in_ihec_keys, "Assay"].value_counts(dropna=False))

Conclusion: non-core files are not included in ENCODE_IHEC_keys.tsv. That's okay because these files were only used for training assay13, and were not included in any other classifier training. We now have enough information to create an almost complete "in_epiatlas" column.

## `in_epiatlas` creation

In [None]:
# A file is flagged as being in EpiATLAS when "is_EpiAtlas_EpiRR" holds any non-null value.
has_epiatlas_epirr = encode_ihec_df["is_EpiAtlas_EpiRR"].notna()
encode_ihec_df["in_epiatlas"] = has_epiatlas_epirr

display(encode_ihec_df["in_epiatlas"].value_counts(dropna=False))

### Sanity check: accession, in_epiatlas pairs consistent (accessions are not unique)

In [None]:
# Accessions repeat across rows, so first reduce the table to its distinct
# (accession, in_epiatlas) pairs.
encode_ihec_df_pairs = encode_ihec_df[["accession", "in_epiatlas"]].drop_duplicates()

# An accession that still occurs more than once is flagged both True and False.
# (The previous check transposed the pairs with `zip(*...)` and then compared the
# number of columns with itself, so it could never fail.)
has_conflicting_flags = encode_ihec_df_pairs["accession"].duplicated(keep=False)
conflicting_accessions = (
    encode_ihec_df_pairs.loc[has_conflicting_flags, "accession"].unique().tolist()
)
if conflicting_accessions:
    raise ValueError("Inconsistent 'in_epiatlas' values:", conflicting_accessions)

In [None]:
# Two sources for the "in_epiatlas" flag: the EpiRR-derived table and the
# ENCODE/IHEC keys table.
epirr_in_epiatlas = encode_accessions_df[["in_epiatlas", "experiment_accession"]]
alt_in_epiatlas = encode_ihec_df[["in_epiatlas", "accession"]]

# Only accessions present in both sources can be compared.
common_accessions = set(epirr_in_epiatlas["experiment_accession"]).intersection(
    set(alt_in_epiatlas["accession"])
)
alt_is_common = alt_in_epiatlas["accession"].isin(common_accessions)
alt_in_epiatlas_common = alt_in_epiatlas[alt_is_common]
epirr_is_common = epirr_in_epiatlas["experiment_accession"].isin(common_accessions)
epirr_in_epiatlas_common = epirr_in_epiatlas[epirr_is_common]

# (accession, flag from EpiRR table, flag from keys table) for every disagreement.
inconsistent_accession_tuples = []
for accession in common_accessions:
    epirr_rows = epirr_in_epiatlas_common["experiment_accession"] == accession
    alt_rows = alt_in_epiatlas_common["accession"] == accession
    epirr_flags = epirr_in_epiatlas_common[epirr_rows]["in_epiatlas"].values
    alt_flags = alt_in_epiatlas_common[alt_rows]["in_epiatlas"].values

    # A single row is used as is; with several rows the accession counts as
    # "in EpiATLAS" as soon as one of them is True.
    epirr_flag = epirr_flags[0] if len(epirr_flags) == 1 else any(epirr_flags)
    alt_flag = alt_flags[0] if len(alt_flags) == 1 else any(alt_flags)

    # Disagreements are collected, not raised, so they can be inspected afterwards.
    if epirr_flag != alt_flag:
        inconsistent_accession_tuples.append((accession, epirr_flag, alt_flag))

print("Inconsistent 'in_epiatlas' values:", len(inconsistent_accession_tuples))

In [None]:
# Accessions on which the two sources disagree (first item of each tuple).
inconsistent_accession_values = [
    accession for accession, _, _ in inconsistent_accession_tuples
]

# Rows of the ENCODE/IHEC keys table that belong to those accessions.
is_suspect = encode_ihec_df["accession"].isin(inconsistent_accession_values)
suspect_df = encode_ihec_df[is_suspect]
display(suspect_df.head())

In [None]:
# Accessions of the suspect rows whose biosample accession ends with "DMP"
# (presumably a single biosample picked as an example - TODO confirm).
# na=False treats a missing biosample accession as "no match"; without it the
# mask would contain NaN and the boolean selection below would raise.
ends_with_dmp = suspect_df["biosample_accession"].str.endswith("DMP", na=False)
one_biosample_accs = suspect_df.loc[ends_with_dmp, "accession"].values.tolist()

In [None]:
# Hand-picked accessions (presumably input experiments tied to one EpiRR - TODO confirm).
one_epirr_inputs_acc = ["ENCSR000AHE", "ENCSR000DMW", "ENCSR000EWW", "ENCSR768LHG"]

# For each one, show whether it is among the inconsistent accessions.
for acc in one_epirr_inputs_acc:
    is_inconsistent = acc in inconsistent_accession_values
    print(acc, is_inconsistent)

# Matching rows in the keys table and in the previous ENCODE metadata.
ihec_input_rows = encode_ihec_df["accession"].isin(one_epirr_inputs_acc)
meta_input_rows = encode_meta_df["experiment_accession"].isin(one_epirr_inputs_acc)
display(encode_ihec_df[ihec_input_rows])
display(encode_meta_df[meta_input_rows])

In [None]:
# Rows for the single-biosample accessions in each of the three tables:
# the keys table, the previous ENCODE metadata, and the EpiRR-derived table.
ihec_biosample_rows = encode_ihec_df["accession"].isin(one_biosample_accs)
meta_biosample_rows = encode_meta_df["experiment_accession"].isin(one_biosample_accs)
epirr_biosample_rows = encode_accessions_df["experiment_accession"].isin(
    one_biosample_accs
)
display(
    encode_ihec_df[ihec_biosample_rows],
    encode_meta_df[meta_biosample_rows],
    encode_accessions_df[epirr_biosample_rows],
)

In [None]:
# EpiRR ids that the EpiRR-derived table associates with the single-biosample accessions.
is_one_biosample = encode_accessions_df["experiment_accession"].isin(
    one_biosample_accs
)
problematic_epirr_example = set(
    encode_accessions_df.loc[is_one_biosample, "epirr_no_version"].values.tolist()
)

# Exactly one EpiRR is expected. Report the empty case explicitly instead of
# letting `set.pop()` fail with a bare KeyError.
if not problematic_epirr_example:
    raise ValueError("No epirr found for accessions", one_biosample_accs)
if len(problematic_epirr_example) > 1:
    raise ValueError("One biosample with multiple epirrs", problematic_epirr_example)

# From here on the name holds the single EpiRR id itself, not a set.
problematic_epirr_example = problematic_epirr_example.pop()

## EpiClass actual training metadata

In [None]:
# Metadata file loaded here as the EpiClass training metadata.
epiclass_metadata_path = metadata_dir.joinpath(
    "hg38_2023-epiatlas-dfreeze-pospurge-nodup_filterCtl.json"
)
epiclass_metadata = Metadata(epiclass_metadata_path)

# One DataFrame row per item of `epiclass_metadata.datasets`.
epiclass_datasets = list(epiclass_metadata.datasets)
epiclass_df = pd.DataFrame.from_records(epiclass_datasets)

In [None]:
# Distinct EpiRR ids (without version) present in the EpiClass metadata.
epiclass_epirr_values = epiclass_df["epirr_id_without_version"].tolist()
epiclass_epirrs = set(epiclass_epirr_values)

In [None]:
# Is the example EpiRR part of the EpiClass metadata?
example_in_epiclass = problematic_epirr_example in epiclass_epirrs
print(example_in_epiclass)

Conclusion: Some errors were made during the creation of "ENCODE_IHEC_keys.tsv". This is demonstrated by a set of files from one biosample being marked as not having an EpiRR, even though we found the corresponding EpiRR in the training metadata. We need to recreate the metadata from scratch to guarantee the right values.