In [None]:
"""Get ENCODE EpiRRs, and determine which datasets are in EpiATLAS.

Found incoherences, so rest of script is metadata re-creation.
"""

# pylint: disable=import-error, redefined-outer-name, too-many-lines

In [None]:
from __future__ import annotations

import copy
import gc
import json
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Set, Tuple

import pandas as pd
import requests
from IPython.display import display
from tqdm import tqdm

from epiclass.core.metadata import Metadata
from epiclass.utils.notebooks.paper.paper_utilities import (
    ASSAY,
    ASSAY_ORDER,
    CELL_TYPE,
    LIFE_STAGE,
    SEX,
)

In [None]:
BIOMAT = "harmonized_biomaterial_type"

First, download summary of all EpiRR epigenomes: https://www.ebi.ac.uk/epirr/docs  
This was already done.

In [None]:
base_dir = Path.home() / "Projects/epiclass/output/paper"
metadata_dir = base_dir / "data/metadata"
if not metadata_dir.exists():
    raise ValueError(f"Path {metadata_dir} does not exist.")

In [None]:
encode_metadata_dir = metadata_dir / "encode"
if not encode_metadata_dir.exists():
    raise ValueError(f"Path {encode_metadata_dir} does not exist.")

# Attempt to establish EpiRR overlap between EpiATLAS and ENCODE, using 'old' metadata.

### Cleanup old ChIP metadata

Fix experiment accession values

In [None]:
full_metadata_path = encode_metadata_dir / "old_meta" / "encode_metadata_2023-10-25.csv"
chip_metadata_df = pd.read_csv(full_metadata_path)
print(chip_metadata_df.shape)

In [None]:
if chip_metadata_df["md5sum"].nunique() != chip_metadata_df.shape[0]:
    raise ValueError("Duplicate filenames")

In [None]:
# List every column that holds at least one ENCODE experiment accession ("ENCSR...").
# Values are cast to str first: the `.str` accessor raises AttributeError on
# non-string (e.g. numeric) columns, and missing values become "nan", which
# never matches the prefix.
for col in chip_metadata_df.columns:
    if chip_metadata_df[col].astype(str).str.startswith("ENCSR").any():
        print(col)

In [None]:
# Columns expected to hold ENCODE experiment accessions.
accession_cols = [
    "Accession",
    "accession",
    "Experiment accession",
    "Experiment_accession",
    "experiment_accession",
]  # ENCSR[VAL]
# Every value in these columns must start with "ENCSR" or "unkno" (the first
# five characters are compared); anything else aborts the clean-up.
for col in accession_cols:
    if (
        chip_metadata_df.loc[:, col].str.slice(0, 5).isin(["ENCSR", "unkno"]).sum()
        != chip_metadata_df.shape[0]
    ):
        raise ValueError(f"Column {col} is not in the correct format")

# Replace the redundant accession columns by a single "experiment_accession",
# taken as the part of "uuid" that precedes its first "-".
# NOTE(review): the drop is done in place, so re-running this cell without
# reloading `chip_metadata_df` raises KeyError on the already-dropped columns.
chip_metadata_df.drop(columns=accession_cols, inplace=True)
chip_metadata_df.loc[:, "experiment_accession"] = (
    chip_metadata_df["uuid"].str.split("-", n=1).str[0]
)

In [None]:
new_df_path = encode_metadata_dir / "old_meta" / "encode_metadata_2023-10-25_clean-v2.csv"
if not new_df_path.exists():
    chip_metadata_df.to_csv(new_df_path, index=False)

### Parse EpiRR general metadata file

In [None]:
filename = "epirr_epigenomes_2025-02"
epigenomes_summary_path = encode_metadata_dir / "new_meta" / f"{filename}.json"

with open(epigenomes_summary_path, "r", encoding="utf-8") as f:
    epigenomes_summary = json.load(f)

epigenomes_summary_df = pd.DataFrame(epigenomes_summary)
epigenomes_summary_df.to_csv(epigenomes_summary_path.with_suffix(".csv"), index=False)

In [None]:
display(epigenomes_summary_df["project"].value_counts(dropna=False))

In [None]:
encode_epirrs = epigenomes_summary_df[epigenomes_summary_df["project"] == "ENCODE"][
    "accession"
].tolist()

In [None]:
print(f"ENCODE EpiRRs: {len(encode_epirrs)}")

In [None]:
del epigenomes_summary_df

## Download specific experiments metadata

Download metadata for all encode epigenomes.

In [None]:
encode_metadata_path = (
    encode_metadata_dir / "new_meta" / "encode_epigenomes_metadata_2025-02.json"
)

In [None]:
if not encode_metadata_path.exists():
    # EpiRR REST endpoint returning the full record of a single epigenome.
    base_url = "https://www.ebi.ac.uk/epirr/api/v1/epigenome?accession={}"

    metadata_list = []
    failed_epirrs = []

    for epirr in tqdm(encode_epirrs, desc="Fetching Metadata", unit="entry"):
        try:
            # A timeout prevents one stalled connection from hanging the notebook.
            response = requests.get(
                base_url.format(epirr),
                headers={"accept": "application/json"},
                timeout=60,
            )
        except requests.RequestException as err:
            print(f"Failed to fetch {epirr}: {err}")
            failed_epirrs.append(epirr)
            continue

        if response.status_code == 200:
            metadata_list.append(response.json())
        else:
            print(f"Failed to fetch {epirr}: {response.status_code}")
            failed_epirrs.append(epirr)

    # The file acts as a cache (it is only created when absent), so a partial
    # download must never be written: it would be silently reused on every
    # later run and the EpiRR completeness assertion below would keep failing.
    if failed_epirrs:
        raise RuntimeError(
            f"Could not fetch {len(failed_epirrs)} EpiRR record(s), "
            f"nothing was saved: {failed_epirrs}"
        )

    with open(encode_metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata_list, f, indent=2)

    print(f"Metadata saved to {encode_metadata_path}")

## Parse specific metadata for accessions

In [None]:
encode_metadata_path = (
    encode_metadata_dir / "new_meta" / "encode_epigenomes_metadata_2025-02.json"
)
with open(encode_metadata_path, "r", encoding="utf-8") as f:
    encode_metadata = json.load(f)

In [None]:
accessions_and_epirr = []
for dset in encode_metadata:
    epirr = dset["accession"]
    primary_ids = [file["primary_id"] for file in dset["raw_data"]]
    for primary_id in primary_ids:
        accessions_and_epirr.append((primary_id, epirr))

    # # it's an input file, multiple occurences is fine
    # if "ENCSR266XMB" in primary_ids:
    #     print(dset["raw_data"])
print("ENCODE total accessions:", len(accessions_and_epirr))

In [None]:
primary_ids_count = Counter([primary_id for primary_id, _ in accessions_and_epirr])
print("ENCODE unique accessions:", len(set(primary_ids_count.keys())))
print(primary_ids_count.most_common(5))

In [None]:
assert set(epirr for _, epirr in accessions_and_epirr) == set(encode_epirrs)

## Compare with EpiATLAS

In [None]:
epiatlas_metadata_path = (
    metadata_dir / "official" / "IHEC_sample_metadata_harmonization.v1.2.extended.csv"
)
epiatlas_df = pd.read_csv(epiatlas_metadata_path, index_col=False)

In [None]:
epiatlas_epirrs = set(epiatlas_df["epirr_id_without_version"].tolist())
common_epirrs = set(encode_epirrs).intersection(epiatlas_epirrs)
diff_epirr = set(encode_epirrs).difference(epiatlas_epirrs)

In [None]:
print(f"ENCODE EpiRRs: {len(encode_epirrs)}")
print(f"EpiATLAS EpiRRs: {len(epiatlas_epirrs)}")
print(f"ENCODE EpiRRs in EpiATLAS: {len(common_epirrs)}")

In [None]:
encode_accessions_df = pd.DataFrame.from_records(
    accessions_and_epirr, columns=["experiment_accession", "epirr_no_version"]
)
print(encode_accessions_df.shape)

encode_accessions_df["in_epiatlas"] = encode_accessions_df["epirr_no_version"].isin(
    common_epirrs
)
display(encode_accessions_df["in_epiatlas"].value_counts(dropna=False))

In [None]:
display(encode_accessions_df.head())

In [None]:
encode_accessions_df.to_csv(
    encode_metadata_dir / "new_meta" / "encode_epirrs_2025-02.csv", index=False
)

EpiRR-based matching is less useful here because ENCODE only submitted complete epigenomes, whereas EpiATLAS also includes partial ones.

In [None]:
del epiatlas_df
gc.collect()

## Compare with previous ENCODE metadata

In [None]:
encode_meta_df = pd.read_csv(
    encode_metadata_dir / "old_meta" / "encode_metadata_2023-10-25_clean-v2.csv"
)
encode_ihec_df = pd.read_csv(
    encode_metadata_dir / "old_meta" / "ENCODE_IHEC_keys.tsv", sep="\t"
)
print(encode_meta_df.shape)
print(encode_ihec_df.shape)

In [None]:
display(encode_meta_df.head())
display(encode_ihec_df.head())

In [None]:
N_accession_1 = encode_meta_df["experiment_accession"].nunique()
N_accession_2 = encode_ihec_df["accession"].nunique()
print(f"ENCODE metadata 2023-10-25 accessions: {N_accession_1}")
print(f"ENCODE-IHEC file accessions: {N_accession_2}")

In [None]:
display(encode_ihec_df[ASSAY].value_counts(dropna=False))

In [None]:
display(encode_meta_df[ASSAY].value_counts(dropna=False))
display(
    encode_meta_df[~encode_meta_df["md5sum"].isin(encode_ihec_df["ENC_ID"])][
        "Assay"
    ].value_counts(dropna=False)
)

Conclusion: non-core files are not included in ENCODE_IHEC_keys.tsv. That's okay because these files were only used for training assay13, and were not included in any other classifier training. We now have enough information to create an almost complete "in_epiatlas" column.

## `in_epiatlas` creation

In [None]:
encode_ihec_df["in_epiatlas"] = encode_ihec_df["is_EpiAtlas_EpiRR"].notnull()
display(encode_ihec_df["in_epiatlas"].value_counts(dropna=False))

### Sanity check: accession, in_epiatlas pairs consistent (accessions are not unique)

In [None]:
# Accessions can appear on several rows; each one must map to a single
# 'in_epiatlas' value. After dropping duplicated (accession, in_epiatlas) pairs,
# any accession still present more than once carries conflicting flags.
# (The previous `tuple(zip(*pairs))` transposed the pairs into exactly two
# tuples, so the comparison of lengths could never fail.)
encode_ihec_df_pairs = encode_ihec_df[["accession", "in_epiatlas"]].drop_duplicates()
inconsistent_accessions = sorted(
    encode_ihec_df_pairs.loc[
        encode_ihec_df_pairs["accession"].duplicated(keep=False), "accession"
    ]
    .dropna()
    .unique()
)
if inconsistent_accessions:
    raise ValueError("Inconsistent 'in_epiatlas' values:", inconsistent_accessions)

In [None]:
def check_epirr_in_epiatlas(
    encode_accessions_df: pd.DataFrame, encode_ihec_df: pd.DataFrame
) -> List[Tuple[str, bool, bool]]:
    """Find accessions whose 'in_epiatlas' flag differs between the two tables.

    An accession present on several rows of a table is considered to be in
    EpiATLAS as soon as one of its rows is flagged.

    Args:
        encode_accessions_df: EpiRR-derived table with the columns
            "experiment_accession" and "in_epiatlas".
        encode_ihec_df: ENCODE-IHEC key table with the columns "accession"
            and "in_epiatlas".

    Returns:
        One `(accession, flag_from_epirr_table, flag_from_ihec_table)` tuple per
        accession found in both tables with disagreeing flags, sorted by accession.
    """
    # Collapse each table to one boolean per accession in a single pass instead
    # of re-masking both frames for every accession.
    epirr_flags = encode_accessions_df.groupby("experiment_accession")[
        "in_epiatlas"
    ].any()
    ihec_flags = encode_ihec_df.groupby("accession")["in_epiatlas"].any()

    common_accessions = epirr_flags.index.intersection(ihec_flags.index).sort_values()

    inconsistent_accession_tuples = []
    for accession in common_accessions:
        # Cast to plain `bool` so the tuples do not mix numpy and Python booleans.
        in_epiatlas_1 = bool(epirr_flags[accession])
        in_epiatlas_2 = bool(ihec_flags[accession])
        if in_epiatlas_1 != in_epiatlas_2:
            inconsistent_accession_tuples.append(
                (accession, in_epiatlas_1, in_epiatlas_2)
            )

    return inconsistent_accession_tuples

In [None]:
inconsistent_accession_tuples = check_epirr_in_epiatlas(
    encode_accessions_df, encode_ihec_df
)

In [None]:
inconsistent_accession_values = [dset[0] for dset in inconsistent_accession_tuples]
suspect_df = encode_ihec_df[
    encode_ihec_df["accession"].isin(inconsistent_accession_values)
]
display(suspect_df.head())

In [None]:
one_biosample_accs = suspect_df[suspect_df["biosample_accession"].str.endswith("DMP")][
    "accession"
].values.tolist()

In [None]:
one_epirr_inputs_acc = ["ENCSR000AHE", "ENCSR000DMW", "ENCSR000EWW", "ENCSR768LHG"]
for acc in one_epirr_inputs_acc:
    print(acc, acc in inconsistent_accession_values)

display(encode_ihec_df[encode_ihec_df["accession"].isin(one_epirr_inputs_acc)])
display(encode_meta_df[encode_meta_df["experiment_accession"].isin(one_epirr_inputs_acc)])

In [None]:
display(encode_ihec_df[encode_ihec_df["accession"].isin(one_biosample_accs)])
display(encode_meta_df[encode_meta_df["experiment_accession"].isin(one_biosample_accs)])
display(
    encode_accessions_df[
        encode_accessions_df["experiment_accession"].isin(one_biosample_accs)
    ]
)

In [None]:
problematic_epirr_example = set(
    encode_accessions_df[
        encode_accessions_df["experiment_accession"].isin(one_biosample_accs)
    ]["epirr_no_version"].values.tolist()
)
if len(problematic_epirr_example) > 1:
    raise ValueError("One biosample with multiple epirrs", problematic_epirr_example)

problematic_epirr_example = problematic_epirr_example.pop()

## EpiClass actual training metadata

In [None]:
epiclass_metadata_path = (
    metadata_dir / "hg38_2023-epiatlas-dfreeze-pospurge-nodup_filterCtl.json"
)
epiclass_metadata = Metadata(epiclass_metadata_path)
epiclass_df = pd.DataFrame.from_records(list(epiclass_metadata.datasets))
print(epiclass_df.shape)

In [None]:
epiclass_epirrs = set(epiclass_df["epirr_id_without_version"].tolist())

In [None]:
print(problematic_epirr_example in epiclass_epirrs)

Conclusion: Some errors were made during the creation of "ENCODE_IHEC_keys.tsv". This is demonstrated by a set of files from a single biosample being marked as not having an EpiRR, even though the corresponding EpiRR is present in the training metadata. We need to recreate the metadata from scratch to guarantee the right values.

# Recreating metadata from file accessions

## Collect/Combine accessions

### ChIP

In [None]:
chip_file_accessions = set(encode_meta_df["md5sum"].unique().tolist())
print("CHIP file accessions:", len(chip_file_accessions))

In [None]:
del encode_meta_df
del epiclass_df
del epiclass_metadata
del encode_accessions_df
del encode_ihec_df
gc.collect()

### RNA

In [None]:
encode_rna_file_meta_path = (
    encode_metadata_dir / "old_meta" / "metadata--ENCODE_RNA_2023mar_hg38_BW_default.tsv"
)
encode_rna_meta_df = pd.read_csv(encode_rna_file_meta_path, sep="\t")
print(encode_rna_meta_df.shape)

In [None]:
rna_file_accessions = set(encode_rna_meta_df["File accession"].unique().tolist())
print("RNA file accessions:", len(rna_file_accessions))

In [None]:
del encode_rna_meta_df
gc.collect()

### WGBS

In [None]:
wgbs_accessions_path = (
    encode_metadata_dir / "old_meta" / "ENCODE_WGBS_2023mar_hg38_BW_default.list"
)
with open(wgbs_accessions_path, "r", encoding="utf-8") as f:
    wgbs_file_accessions = f.read().splitlines()

print("WGBS accessions:", len(wgbs_file_accessions))

## Download and create new ENCODE metadata

In [None]:
# Pool the file accessions of the three sources (ChIP, RNA, WGBS) into one set.
all_file_accessions = set(wgbs_file_accessions).union(
    chip_file_accessions, rna_file_accessions
)
# "unknown" is a placeholder, not an accession; `discard` is a no-op when absent.
all_file_accessions.discard("unknown")

print("All file accessions:", len(all_file_accessions))

In [None]:
url_exp_template_object = (
    "https://www.encodeproject.org/experiments/{}/?frame=object&format=json"
)
url_exp_template_embedded = (
    "https://www.encodeproject.org/experiments/{}/?frame=embedded&format=json"
)
url_biosample_template_object = (
    "https://www.encodeproject.org/biosamples/{}/?frame=object&format=json"
)
url_file_template_object = (
    "https://www.encodeproject.org/files/{}/?frame=object&format=json"
)
url_biosample_type_template_object = (
    "https://www.encodeproject.org/biosample-types/{}/?frame=object&format=json"
)

In [None]:
def fetch_json(
    url: str, headers: Dict[str, str], timeout: float = 60.0
) -> Dict[str, Any] | None:
    """Fetch JSON data from a URL.

    Args:
        url (str): Address to query with a GET request.
        headers (Dict[str, str]): HTTP headers, such as authorization, to be passed.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON payload on HTTP 200, otherwise None (the failure is
        printed). Network errors and timeouts are reported the same way rather
        than raised, so one bad request cannot abort a batch of fetches.
    """
    try:
        # Without a timeout a stalled connection would block its worker forever.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        print(f"Failed to fetch {url}: {err}")
        return None

    if response.status_code == 200:
        return response.json()

    print(f"Failed to fetch {url}: {response.status_code}")
    return None


def fetch_experiment_metadata(
    experiment_acc: str, headers: Dict[str, str]
) -> Dict[str, Any] | None:
    """Query the ENCODE API for one experiment record (object frame).

    Returns whatever `fetch_json` returns for the experiment URL.
    """
    experiment_url = url_exp_template_object.format(experiment_acc)
    return fetch_json(experiment_url, headers)


def fetch_biosample_type_metadata(
    term_id: str, headers: Dict[str, str]
) -> Dict[str, Any] | None:
    """Query the ENCODE API for one biosample type record (object frame).

    Args:
        term_id (str): Biosample type identifier, of the form "[classification]_[ontology_id]", e.g. "cell_line_EFO_0001203".
        headers (Dict[str, str]): HTTP headers, such as authorization, to be passed.

    Returns:
        Whatever `fetch_json` returns for the biosample type URL.
    """
    biosample_type_url = url_biosample_type_template_object.format(term_id)
    return fetch_json(biosample_type_url, headers)


def fetch_file_metadata(file_acc: str, headers: Dict[str, str]) -> Dict[str, Any] | None:
    """Query the ENCODE API for one file record (object frame).

    Returns whatever `fetch_json` returns for the file URL.
    """
    file_url = url_file_template_object.format(file_acc)
    return fetch_json(file_url, headers)


def fetch_replicate_biosample(
    experiment_acc: str, headers: Dict[str, str]
) -> Dict[str, List[str]] | None:
    """List the biosample accessions behind each replicate of an experiment.

    The experiment is requested with the embedded frame so that every replicate
    carries its library and biosample records.

    Returns:
        `{experiment_acc: [biosample accession of each replicate]}`, or None
        when the experiment record could not be fetched (or is empty).
    """
    experiment = fetch_json(url_exp_template_embedded.format(experiment_acc), headers)
    if not experiment:
        return None

    biosample_accessions = []
    for replicate in experiment.get("replicates", []):
        biosample_accessions.append(replicate["library"]["biosample"]["accession"])
    return {experiment_acc: biosample_accessions}


def fetch_biosample_metadata(
    biosample_acc: str, headers: Dict[str, str]
) -> Dict[str, Any] | None:
    """Query the ENCODE API for one biosample record (object frame).

    Returns whatever `fetch_json` returns for the biosample URL.
    """
    biosample_url = url_biosample_template_object.format(biosample_acc)
    return fetch_json(biosample_url, headers)


def parallel_fetch(
    func: Callable[[str, Dict[str, str]], Any],
    identifiers: Iterable[str],
    headers: Dict[str, str],
    max_workers: int = 10,
    task_name: str = "Fetching Data",
) -> List[Any]:
    """
    Generic function to fetch data in parallel.

    A worker that raises no longer aborts the whole batch: the failing
    identifier is reported and the results of all other workers are kept.

    Args:
        func (Callable): The function to execute in parallel, called as
            `func(identifier, headers)`.
        identifiers (Iterable[str]): List of input identifiers for the function.
        headers (Dict[str, str]): HTTP headers, such as authorization, to be passed.
        max_workers (int): Number of parallel threads.
        task_name (str): Name for progress bar.

    Returns:
        List of non-None results, in completion order (not input order).
    """
    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_id = {executor.submit(func, id_, headers): id_ for id_ in identifiers}

        for future in tqdm(
            as_completed(future_to_id),
            total=len(future_to_id),
            desc=task_name,
            unit="entry",
        ):
            try:
                # `result()` re-raises whatever the worker raised.
                result = future.result()
            except Exception as err:  # pylint: disable=broad-except
                # Keep what was already fetched; only this identifier is lost.
                print(f"Failed to fetch {future_to_id[future]}: {err!r}")
                continue

            if result is not None:
                results.append(result)

    return results

In [None]:
def fetch_and_update_metadata(
    func: Callable[[str, Dict[str, str]], Dict[str, Any] | None],
    all_accession_set: Set[str],
    output_filepath: Path,
    max_workers: int = 5,
    task_name: str = "Fetching metadata",
) -> List[Dict[str, Any]]:
    """
    Incrementally complete a JSON metadata cache, fetching only missing records.

    Records already stored in `output_filepath` are identified by the first of
    the keys "accession", "obo_id", "name" that every stored record has; when
    none qualifies, the first key of each record is used as its identifier.
    Accessions not found that way are fetched in parallel and appended, then the
    JSON file and a sibling `.tsv` file are rewritten.

    Args:
        func (Callable): Function to fetch metadata for a single accession.
        all_accession_set (Set[str]): Set of all accessions to process.
        output_filepath (Path): Path to the JSON file storing metadata.
        max_workers (int): Number of parallel workers (default: 5).
        task_name (str): Progress bar name.

    Returns:
        List[Dict[str, Any]]: Stored records followed by the newly fetched ones
        (the stored records alone when nothing was missing).
    """
    request_headers = {"accept": "application/json"}

    # Records saved by a previous run, if any.
    existing_metadata = []
    if output_filepath.exists():
        with open(output_filepath, "r", encoding="utf-8") as f:
            existing_metadata = json.load(f)

    # Identifiers of the records that are already stored.
    attempt_keys = ["accession", "obo_id", "name"]
    existing_accessions = None
    for candidate_key in attempt_keys:
        try:
            existing_accessions = {record[candidate_key] for record in existing_metadata}
        except KeyError:
            continue
        break

    if existing_accessions is None:
        print(
            f"No {attempt_keys} keys in existing metadata. Using first key of each entry instead."
        )
        existing_accessions = {list(record)[0] for record in existing_metadata}

    missing_accessions = all_accession_set - existing_accessions

    # Nothing to do: hand back the cache untouched.
    if not missing_accessions:
        print("No new metadata to fetch. Data is already up-to-date.")
        return existing_metadata

    print(f"Fetching {len(missing_accessions)} new records...")
    new_metadata = parallel_fetch(
        func=func,
        identifiers=missing_accessions,
        headers=request_headers,
        max_workers=max_workers,
        task_name=task_name,
    )
    combined_metadata = existing_metadata + new_metadata

    # Persist both representations of the completed cache.
    with open(output_filepath, "w", encoding="utf-8") as f:
        json.dump(combined_metadata, f, indent=2)

    pd.DataFrame.from_records(combined_metadata).to_csv(
        output_filepath.with_suffix(".tsv"), sep="\t", index=False
    )

    return combined_metadata

In [None]:
output_file = encode_metadata_dir / "new_meta" / "encode_file_metadata_2025-02.json"
encode_file_metadata_list = fetch_and_update_metadata(
    func=fetch_file_metadata,
    all_accession_set=all_file_accessions,
    output_filepath=output_file,
    max_workers=5,
    task_name="Fetching File metadata",
)
encode_file_metadata: Dict[str, Dict[str, Any]] = {
    dset["accession"]: dset for dset in encode_file_metadata_list
}

In [None]:
assay_title_counter = Counter()
for dset in encode_file_metadata.values():
    assay_title = dset["assay_title"]
    assay_title_counter[assay_title] += 1

print(assay_title_counter)

Find experiment accessions from file metadata. Make sure each file belongs to a single experiment and that every experiment accession starts with ENCSR.

In [None]:
all_experiment_accessions = set()
for file_acc, dataset in encode_file_metadata.items():
    acc_str = dataset["dataset"]
    if acc_str.count("ENCSR") > 1:
        raise ValueError("Multiple experiments per file:", dataset)

    exp_acc = acc_str.split("/")[-2]
    all_experiment_accessions.add(exp_acc)
    encode_file_metadata[file_acc]["experiment_accession"] = exp_acc

for val in all_experiment_accessions:
    if not val.startswith("ENCSR"):
        raise ValueError("Experiment accessions do not start with ENCSR:", val)

In [None]:
output_file = encode_metadata_dir / "new_meta" / "encode_experiment_metadata_2025-02.json"
encode_exp_metadata_list = fetch_and_update_metadata(
    func=fetch_experiment_metadata,
    all_accession_set=all_experiment_accessions,
    output_filepath=output_file,
    max_workers=5,
    task_name="Fetching Experiment metadata",
)
encode_exp_metadata: Dict[str, Dict[str, Any]] = {
    dset["accession"]: dset for dset in encode_exp_metadata_list
}

In [None]:
output_file = (
    encode_metadata_dir / "new_meta" / "encode_biosample_accessions_2025-02.json"
)
encode_biosample_accessions_list = fetch_and_update_metadata(
    func=fetch_replicate_biosample,
    all_accession_set=all_experiment_accessions,
    output_filepath=output_file,
    max_workers=5,
    task_name="Fetching Experiment Biosample Accessions",
)
encode_biosample_accessions_dict = {
    list(dset.keys())[0]: list(dset.values())[0]
    for dset in encode_biosample_accessions_list
}

In [None]:
all_biosamples_accessions = set()
for biosample_accs in encode_biosample_accessions_dict.values():
    all_biosamples_accessions.update(biosample_accs)

In [None]:
output_file = encode_metadata_dir / "new_meta" / "encode_biosample_metadata_2025-02.json"
encode_biosample_metadata_list = fetch_and_update_metadata(
    func=fetch_biosample_metadata,
    all_accession_set=all_biosamples_accessions,
    output_filepath=output_file,
    max_workers=5,
    task_name="Fetching Biosample Metadata",
)
encode_biosample_metadata = {
    dset["accession"]: dset for dset in encode_biosample_metadata_list
}

In [None]:
def get_biosample_type(biosample_ontology: str) -> str:
    """Extract the biosample type name from a biosample ontology string.

    Every "/" is removed first, then the "biosample-types" prefix, so
    '/biosample-types/cell_line_EFO_0001203/' becomes 'cell_line_EFO_0001203'.
    """
    without_slashes = "".join(biosample_ontology.split("/"))
    return without_slashes.replace("biosample-types", "")


def get_biosample_term_id(biosample_ontology: str) -> str:
    """Extract the ontology term ID from a biosample ontology string.

    The last two "_"-separated fields of the biosample type name are joined
    with ":", so '/biosample-types/cell_line_EFO_0001203/' gives 'EFO:0001203'.
    """
    type_name = get_biosample_type(biosample_ontology)
    # Only the two trailing fields matter; the classification part is discarded.
    trailing_fields = type_name.rsplit("_", 2)[-2:]
    return ":".join(trailing_fields)

In [None]:
biosample_types = [
    get_biosample_type(dset["biosample_ontology"])
    for dset in encode_biosample_metadata.values()
]

In [None]:
new_file = encode_metadata_dir / "new_meta" / "biosample_types_metadata.json"

biosample_type_metadata_list = fetch_and_update_metadata(
    func=fetch_biosample_type_metadata,
    all_accession_set=set(biosample_types),
    output_filepath=new_file,
    max_workers=5,
    task_name="Fetching missing biosample type info",
)
encode_biosample_type_metadata = {
    dset["term_id"]: dset for dset in biosample_type_metadata_list
}

### Combine experiment and biosample metadata

Exclude datasets whose biosamples have inconsistent metadata; they cannot be handled on a case-by-case basis.

In [None]:
verbose = False

# N_counter: number of experiments per biosample count.
# missing_biosample_metadata: biosample accessions with no fetched metadata.
# problematic_experiments: experiments whose biosamples disagree on
#   (sex, life_stage, health_status).
N_counter = Counter()
missing_biosample_metadata = set()
problematic_experiments = set()
for exp_acc, biosample_accs in encode_biosample_accessions_dict.items():
    N = len(biosample_accs)
    N_counter[N] += 1
    biosample_meta = []
    # Inspect every experiment, including single-biosample ones: restricting the
    # loop to N > 1 left missing metadata of single-biosample experiments
    # undetected, so those experiments were never removed downstream.
    for biosample_acc in biosample_accs:
        try:
            meta = encode_biosample_metadata[biosample_acc]
        except KeyError:
            missing_biosample_metadata.add(biosample_acc)
            biosample_meta.append(("unknown", "unknown", "unknown"))
            if verbose:
                print("Missing biosample metadata:", biosample_acc)
            continue
        sex = meta.get("sex", "unknown")
        life_stage = meta.get("life_stage", "unknown")
        health_status = meta.get("health_status", "unknown")
        biosample_meta.append((sex, life_stage, health_status))

    # biosample info needs to be the same (trivially true for 0 or 1 biosample)
    if len(set(biosample_meta)) > 1:
        problematic_experiments.add(exp_acc)
        if verbose:
            print("Inconsistent biosample metadata:", exp_acc, biosample_meta)

In [None]:
print(missing_biosample_metadata)

Remove experiments with mixed biosample metadata

In [None]:
print("Nb exp to remove:", len(problematic_experiments))
print("Nb exp before:", len(encode_exp_metadata))
experiment_metadata = {
    k: v for k, v in encode_exp_metadata.items() if k not in problematic_experiments
}
print("Nb exp after:", len(experiment_metadata))

Remove experiments with no known biosamples

In [None]:
to_remove = []
for exp_acc in list(experiment_metadata.keys()):
    biosample_accs = encode_biosample_accessions_dict[exp_acc]
    # print(experiment_accession, biosample_accs)

    known_biosamples_accs = set(biosample_accs) - set(missing_biosample_metadata)
    if not known_biosamples_accs:
        to_remove.append(exp_acc)
        print(f"Experiment {exp_acc} has no known biosamples: {biosample_accs}")

print("Number of experiments to remove:", len(to_remove))
print("to_remove:", to_remove)
print("Nb exp before:", len(experiment_metadata))
for acc in to_remove:
    del experiment_metadata[acc]
print("Nb exp after:", len(experiment_metadata))

#### Prepend name to columns from file, experiment vs biosample metadata.

In [None]:
# For each metadata level, gather every key used by at least one record.
# `set().union()` with no argument yields an empty set, as the explicit loops did.
file_unique_keys = set().union(
    *(record.keys() for record in encode_file_metadata.values())
)

exp_unique_keys = set().union(
    *(record.keys() for record in encode_exp_metadata.values())
)

biosample_unique_keys = set().union(
    *(record.keys() for record in encode_biosample_metadata.values())
)

biosample_type_unique_keys = set().union(
    *(record.keys() for record in encode_biosample_type_metadata.values())
)

In [None]:
# Build one flat record per file by merging its file, experiment, biosample and
# biosample-type metadata. Keys are prefixed with their level so they never clash.
full_metadata = copy.deepcopy(encode_file_metadata)
invalid_files = set()
for file_acc, dset_metadata in list(full_metadata.items()):
    exp_acc = dset_metadata["experiment_accession"]

    # Use the *filtered* `experiment_metadata`: reading `encode_exp_metadata`
    # here merged files from experiments that were excluded earlier (mixed or
    # unknown biosamples), and raised KeyError for unfetched experiments.
    if exp_acc not in experiment_metadata:
        invalid_files.add(file_acc)
        continue
    exp_metadata = copy.deepcopy(experiment_metadata[exp_acc])

    biosample_accs = encode_biosample_accessions_dict[exp_acc]
    known_biosamples_accs = set(biosample_accs) - set(missing_biosample_metadata)
    if not known_biosamples_accs:
        print(
            f"File {file_acc} with Experiment {exp_acc} has no known biosamples: {biosample_accs}"
        )
        invalid_files.add(file_acc)
        continue

    # Biosamples of a kept experiment share the same metadata (coherence check
    # done previously), so any of them will do; taking the smallest accession
    # keeps the output reproducible, unlike `set.pop()`.
    biosample_acc = sorted(known_biosamples_accs)[0]
    try:
        biosample_metadata = copy.deepcopy(encode_biosample_metadata[biosample_acc])
    except KeyError:
        print(f"ERROR:{exp_acc}:{biosample_acc}")
        # Drop the file: leaving it in `full_metadata` would keep a record with
        # un-prefixed, file-only keys.
        invalid_files.add(file_acc)
        continue

    # NOTE(review): types are looked up by ontology term ID only. If two biosample
    # types share a term ID but differ in classification, the lookup cannot tell
    # them apart -- confirm against `encode_biosample_type_metadata`.
    biosample_term_id = get_biosample_term_id(biosample_metadata["biosample_ontology"])
    try:
        biosample_type_metadata = copy.deepcopy(
            encode_biosample_type_metadata[biosample_term_id]
        )
    except KeyError:
        print(
            f"Missing biosample type metadata: {file_acc}-{exp_acc}-{biosample_acc}-{biosample_term_id}"
        )
        biosample_type_metadata = {}

    # create unique names: "<LEVEL>_<key>" for every key of every level
    for unique_meta_labels, str_prepend, record in zip(
        [
            file_unique_keys,
            exp_unique_keys,
            biosample_unique_keys,
            biosample_type_unique_keys,
        ],
        ["FILE", "EXPERIMENT", "BIOSAMPLE", "BIOSAMPLE_TYPE"],
        [dset_metadata, exp_metadata, biosample_metadata, biosample_type_metadata],
    ):
        for key in unique_meta_labels:
            if key in record:
                record[f"{str_prepend}_{key}"] = record.pop(key)

    # `dset_metadata` is the record stored in `full_metadata`; extend it in place.
    dset_metadata.update(exp_metadata)
    dset_metadata.update(biosample_metadata)
    dset_metadata.update(biosample_type_metadata)

    dset_metadata["biosamples"] = biosample_accs

print("Nb of invalid files:", len(invalid_files))
for file_acc in invalid_files:
    del full_metadata[file_acc]

Remove line breaks from values, as they made the saved file difficult to read.

In [None]:
full_metadata_df = pd.DataFrame(list(full_metadata.values()))
cols_with_line_breaks = set()
for col in full_metadata_df.columns:
    for specific_value in full_metadata_df[col].astype(str).unique():
        if "\n" in specific_value:
            cols_with_line_breaks.add(col)
            break

In [None]:
for col in cols_with_line_breaks:
    for idx, value in full_metadata_df[col].items():
        if "\n" in str(value):
            full_metadata_df.at[idx, col] = value.replace("\n", ";")

Finally, save the combined metadata

In [None]:
rewrite_metadata = True

In [None]:
full_metadata_path = (
    encode_metadata_dir / "new_meta" / "encode_full_metadata_2025-02.json"
)
if rewrite_metadata:
    full_metadata_path.unlink(missing_ok=True)

if not full_metadata_path.exists():
    full_metadata_df.to_json(full_metadata_path, orient="records", indent=2)
    full_metadata_df.to_csv(full_metadata_path.with_suffix(".csv"), sep=",", index=False)
else:
    print(f"Loading existing full metadata: {full_metadata_path}")
    with open(full_metadata_path, "r", encoding="utf-8") as f:
        full_metadata = json.load(f)
        full_metadata_df = pd.DataFrame(full_metadata)

In [None]:
if full_metadata_df.shape[0] != full_metadata_df["FILE_accession"].nunique():
    print("MAJOR ERROR")

In [None]:
display(full_metadata_df["FILE_assay_title"].value_counts(dropna=False))

### Check relevant metadata categories

In [None]:
potential_cols = []
for col in full_metadata_df.columns:
    if any(
        label in col.lower()
        for label in [
            "cancer",
            "health",
            "status",
            "life",
            "sex",
            "biosample_ontology",
            "assay",
        ]
    ):
        print(col)
        potential_cols.append(col)

for col in potential_cols:
    display(full_metadata_df[col].value_counts(dropna=False))

Sex, life stage, and biomaterial type metadata categories already available. 

In [None]:
full_metadata_df[LIFE_STAGE] = full_metadata_df["BIOSAMPLE_life_stage"]
full_metadata_df[SEX] = full_metadata_df["BIOSAMPLE_sex"]

In [None]:
full_metadata_df[BIOMAT] = full_metadata_df["BIOSAMPLE_TYPE_classification"]
full_metadata_df[BIOMAT].value_counts(dropna=False)

In [None]:
# Map ENCODE biosample classifications onto the harmonized biomaterial labels;
# values absent from the mapping are left unchanged.
to_replace = {
    "tissue": "primary tissue",
    "in vitro differentiated cells": "other",
    "organoid": "other",
}
# Assign the result back: `df[col].replace(..., inplace=True)` acts on a
# temporary Series and leaves the DataFrame untouched under pandas Copy-on-Write.
full_metadata_df[BIOMAT] = full_metadata_df[BIOMAT].replace(to_replace)

Note: must remove revoked files/experiments.

In [None]:
new_output_name = "encode_full_metadata_2025-02_no_revoked.csv"
new_output_path = full_metadata_path.with_name(new_output_name)

if rewrite_metadata:
    new_output_path.unlink(missing_ok=True)

# Always drop revoked entries from the in-memory dataframe, whether or not the
# output file already exists: every later cell works on `full_metadata_df`, so
# skipping the filter would let revoked files leak into the downstream metadata.
N_before = full_metadata_df.shape[0]
for cat_type in ["FILE", "EXPERIMENT", "BIOSAMPLE"]:
    cat = f"{cat_type}_status"
    full_metadata_df = full_metadata_df[full_metadata_df[cat] != "revoked"]
N_after = full_metadata_df.shape[0]

print(f"Removed {N_before - N_after} revoked entries")

# Only the write to disk is conditional on the file being absent.
if not new_output_path.exists():
    full_metadata_df.to_csv(new_output_path, index=False)

In [None]:
# Columns whose lowercased name mentions the assay, target or antibody.
target_keywords = ("assay", "target", "antibody")
potential_cols = [
    column
    for column in full_metadata_df.columns
    if any(keyword in column.lower() for keyword in target_keywords)
]

# Show the value distribution of each selected column (missing values included).
for column in potential_cols:
    display(full_metadata_df[column].value_counts(dropna=False))

In [None]:
# Files with no FILE_target value; copied so columns can be added to it later.
missing_target_mask = full_metadata_df["FILE_target"].isna()
unknown_target_df = full_metadata_df.loc[missing_target_mask].copy()

# Same column overview as above, restricted to the files without a target.
for column in potential_cols:
    display(unknown_target_df[column].value_counts(dropna=False))

Note: No EXPERIMENT_target means RNA-seq or input.

### Create 'assay' and 'assay_epiclass' categories

In [None]:
# Labels counted as core assays in addition to those listed in ASSAY_ORDER.
extra_core_assays = {"mrna_seq", "wgbs_standard", "wgbs_pbat"}
all_core_assays = set(ASSAY_ORDER).union(extra_core_assays)
print(all_core_assays)

In [None]:
# EXPERIMENT_assay_title -> assay label, for files that have no target.
no_target_mapping = {
    "Control ChIP-seq": "input",
    "total RNA-seq": "rna_seq",
    "polyA plus RNA-seq": "mrna_seq",
    "WGBS": "wgbs",
}

# Titles absent from the mapping become NaN in the new "assay" column.
untargeted_titles = unknown_target_df["EXPERIMENT_assay_title"]
unknown_target_df["assay"] = untargeted_titles.map(no_target_mapping)

untargeted_assay_counts = unknown_target_df["assay"].value_counts(dropna=False)
display(untargeted_assay_counts)

In [None]:
# Files that do have a FILE_target value; copied so columns can be added to it later.
has_target_mask = full_metadata_df["FILE_target"].notnull()
known_target_df = full_metadata_df[has_target_mask].copy()

Sanity check: are all targets marked as human?

In [None]:
# Distinct FILE_target values whose text does not contain "human".
target_labels = known_target_df["FILE_target"].value_counts(dropna=False).keys()
non_human_target = [label for label in target_labels if "human" not in label]

# Report how many files carry each of those targets.
for label in non_human_target:
    n_files = known_target_df[known_target_df["FILE_target"] == label].shape[0]
    print(f"{label}: {n_files} files")

In [None]:
# Print, tab-separated, the identifiers of every file whose target mentions "cebpa".
cebpa_mask = known_target_df["FILE_target"].str.lower().str.contains("cebpa")
cebpa_columns = [
    "FILE_accession",
    "EXPERIMENT_accession",
    "BIOSAMPLE_accession",
    "FILE_target",
]
for row in known_target_df.loc[cebpa_mask, cebpa_columns].to_numpy():
    print("\t".join(row))

Note (translated from French): This is indeed unusual. It is possibly an annotation error, or the antibody used for the ChIP recognizes both the human and the rat protein, since the protein sequences are ~95% identical. The important detail is that the experiment was performed in human cells, which is the case here, so I suggest simply ignoring the rat label and combining the results.

In [None]:
# Derive the assay label from FILE_target: take the text after "/targets/",
# keep what precedes the first "-", and lowercase it.
target_suffix = known_target_df["FILE_target"].str.split(r"/targets/", expand=True)[1]
target_name = target_suffix.str.split("-", expand=True)[0]
known_target_df["assay"] = target_name.str.lower()

targeted_assay_counts = known_target_df["assay"].value_counts(dropna=False)
display(targeted_assay_counts)

In [None]:
# Stack the targeted and untargeted files back into one dataframe (row-wise),
# now that both carry an "assay" column.
frames_with_assay = [known_target_df, unknown_target_df]
full_metadata_df = pd.concat(frames_with_assay, axis=0)

In [None]:
# Labels kept as-is in the ASSAY column; every other value (including missing
# ones) is grouped under "non-core".
unique_names = [*all_core_assays, "ctcf"]
is_named_assay = full_metadata_df["assay"].isin(unique_names)
full_metadata_df[ASSAY] = full_metadata_df["assay"].where(is_named_assay, "non-core")

assay_label_counts = full_metadata_df[ASSAY].value_counts(dropna=False)
display(assay_label_counts)

### Add 'sample_ontology' category

In [None]:
# List every column name, one per line.
for column_name in full_metadata_df.columns.to_list():
    print(column_name)

In [None]:
# Tab-separated lookup table; column names are supplied here rather than
# taken from the file.
curie_def_path = encode_metadata_dir / "EpiAtlas_list-curie_term_HSOI.tsv"
curie_def_columns = [
    "biosample_term_id",
    "biosample_term_name",
    "epiclass_sample_ontology",
]
curie_def_df = pd.read_csv(curie_def_path, sep="\t", names=curie_def_columns)

In [None]:
# Attach "epiclass_sample_ontology" to each file by matching the biosample
# term ID. `how="left"` keeps every file row, matched or not.
# `validate="many_to_one"` makes pandas raise a MergeError if a term ID occurs
# more than once in the lookup table, which would otherwise silently duplicate
# file rows.
new_df = full_metadata_df.merge(
    right=curie_def_df[["biosample_term_id", "epiclass_sample_ontology"]],
    left_on="BIOSAMPLE_TYPE_term_id",
    right_on="biosample_term_id",
    how="left",
    validate="many_to_one",
)

In [None]:
# Drop the merge key brought in from the lookup table and store the ontology
# label under the column named by CELL_TYPE.
new_df = new_df.drop(columns=["biosample_term_id"]).rename(
    columns={"epiclass_sample_ontology": CELL_TYPE}
)

### Add 'in_epiatlas' category

In [None]:
# A file is flagged "in_epiatlas" when its experiment's related series mention
# "reference-epigenomes" AND its ASSAY label is one of ASSAY_ORDER.
related_series_text = new_df["EXPERIMENT_related_series"].astype(str)
is_reference_epigenome = related_series_text.str.contains("reference-epigenomes")
is_ordered_assay = new_df[ASSAY].isin(ASSAY_ORDER)
new_df["in_epiatlas"] = is_reference_epigenome & is_ordered_assay

Sanity check: do reference epigenomes always mean IHEC?

yes: https://www.encodeproject.org/profiles/reference_epigenome

In [None]:
# How many files are flagged as in / not in EpiATLAS (missing values included).
in_epiatlas_counts = new_df["in_epiatlas"].value_counts(dropna=False)
in_epiatlas_counts

Final save

In [None]:
# Write the final table as a comma-separated file next to the combined metadata JSON.
output_name = "encode_full_metadata_2025-02_no_revoked.csv"
output_path = full_metadata_path.parent / output_name
new_df.to_csv(output_path, index=False)

In [None]:
# Same table as a JSON object: one record (dict) per row under the "datasets" key,
# written beside the CSV with a ".json" suffix.
dataset_records = new_df.to_dict(orient="records")
json_obj = {"datasets": list(dataset_records)}
json_output_path = output_path.with_suffix(".json")
with json_output_path.open("w", encoding="utf-8") as f:
    json.dump(json_obj, f)