In [285]:
"""Get ENCODE EpiRRs, and determine which datasets are in EpiATLAS.

Found incoherences, so rest of script is metadata re-creation.
"""
# pylint: disable=import-error, redefined-outer-name, too-many-lines

'Get ENCODE EpiRRs, and determine which datasets are in EpiATLAS.\n\nFound incoherences, so rest of script is metadata re-creation.\n'

In [286]:
from __future__ import annotations

import copy
import gc
import json
from collections import Counter
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path
from typing import Any, Callable, Dict, Iterable, List, Set, Tuple

import pandas as pd
import requests
from IPython.display import display
from tqdm import tqdm

from epi_ml.core.metadata import Metadata
from epi_ml.utils.notebooks.paper.paper_utilities import ASSAY, ASSAY_ORDER, CELL_TYPE

First, download summary of all EpiRR epigenomes: https://www.ebi.ac.uk/epirr/docs  
This was already done.

In [287]:
# Root of the paper outputs; all metadata files read below live under it.
base_dir = Path.home() / "Projects" / "epiclass" / "output" / "paper"
metadata_dir = base_dir / "data" / "metadata"
# Fail right away instead of on the first file read further down.
if metadata_dir.exists() is False:
    raise ValueError(f"Path {metadata_dir} does not exist.")

### Parse EpiRR general metadata file

In [288]:
encode_metadata_dir = metadata_dir / "encode"
if encode_metadata_dir.exists() is False:
    raise ValueError(f"Path {encode_metadata_dir} does not exist.")

# EpiRR summary of all registered epigenomes (JSON list of records).
filename = "epirr_epigenomes_2025-02"
epigenomes_summary_path = encode_metadata_dir / "new_meta" / f"{filename}.json"

with epigenomes_summary_path.open("r", encoding="utf-8") as f:
    epigenomes_summary = json.load(f)

# Keep a tabular copy next to the JSON (same file name, .csv extension).
epigenomes_summary_df = pd.DataFrame(epigenomes_summary)
epigenomes_summary_df.to_csv(
    epigenomes_summary_path.with_name(f"{filename}.csv"), index=False
)

In [289]:
# Number of EpiRR entries per project; missing project values are counted too.
display(epigenomes_summary_df["project"].value_counts(dropna=False))

project
BLUEPRINT                         1249
CEEHRC                             486
GIS                                291
AMED-CREST                         278
ENCODE                             215
DEEP                               157
NIH Roadmap Epigenomics            147
Korea Epigenome Project (KNIH)      37
EpiHK                               19
Name: count, dtype: int64

In [290]:
# EpiRR accessions of every epigenome registered by the ENCODE project.
encode_epirrs = epigenomes_summary_df.loc[
    epigenomes_summary_df["project"] == "ENCODE", "accession"
].tolist()

In [291]:
# Number of ENCODE accessions selected from the EpiRR summary.
print(f"ENCODE EpiRRs: {len(encode_epirrs)}")

ENCODE EpiRRs: 215


In [292]:
# Drop the full summary table; `encode_epirrs` was already extracted from it.
del epigenomes_summary_df

## Download specific experiments metadata

Download metadata for all encode epigenomes.

In [293]:
# Local JSON file holding the per-epigenome EpiRR records for ENCODE.
encode_metadata_path = (
    encode_metadata_dir / "new_meta" / "encode_epigenomes_metadata_2025-02.json"
)

In [294]:
# Download the EpiRR record of every ENCODE epigenome, once. The JSON file acts
# as a cache: if it already exists, nothing is fetched.
if not encode_metadata_path.exists():
    # EpiRR REST endpoint returning the record of a single epigenome.
    base_url = "https://www.ebi.ac.uk/epirr/api/v1/epigenome?accession={}"

    # Successfully fetched records, in request order.
    metadata_list = []
    # Accessions that could not be fetched. A partial cache must never be written:
    # the `exists()` guard above would then skip the download on every later run.
    failed_epirrs = []

    for epirr in tqdm(encode_epirrs, desc="Fetching Metadata", unit="entry"):
        try:
            response = requests.get(
                base_url.format(epirr),
                headers={"accept": "application/json"},
                timeout=60,  # never let one stalled connection hang the notebook
            )
        except requests.RequestException as err:
            print(f"Failed to fetch {epirr}: {err}")
            failed_epirrs.append(epirr)
            continue

        if response.status_code == 200:
            metadata_list.append(response.json())  # Append parsed JSON
        else:
            print(f"Failed to fetch {epirr}: {response.status_code}")
            failed_epirrs.append(epirr)

    if failed_epirrs:
        raise RuntimeError(
            f"Could not fetch {len(failed_epirrs)} EpiRR record(s); "
            f"metadata file not written: {failed_epirrs}"
        )

    with open(encode_metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata_list, f, indent=2)

    print(f"Metadata saved to {encode_metadata_path}")

## Parse specific metadata for accessions

In [295]:
# Read the cached per-epigenome EpiRR records back from disk.
encode_metadata_path = (
    encode_metadata_dir / "new_meta" / "encode_epigenomes_metadata_2025-02.json"
)
with encode_metadata_path.open("r", encoding="utf-8") as f:
    encode_metadata = json.load(f)

In [296]:
# Flatten the EpiRR records into (experiment primary_id, EpiRR accession) pairs,
# one pair per entry of each record's "raw_data" list.
accessions_and_epirr = []
for dset in encode_metadata:
    epirr = dset["accession"]
    primary_ids = [file["primary_id"] for file in dset["raw_data"]]
    for primary_id in primary_ids:
        accessions_and_epirr.append((primary_id, epirr))

    # # it's an input file, multiple occurrences is fine
    # if "ENCSR266XMB" in primary_ids:
    #     print(dset["raw_data"])
print("ENCODE total accessions:", len(accessions_and_epirr))

ENCODE total accessions: 2588


In [297]:
# Occurrences of each experiment accession across all EpiRR records.
primary_ids_count = Counter(primary_id for primary_id, _ in accessions_and_epirr)
print("ENCODE unique accessions:", len(primary_ids_count))
print(primary_ids_count.most_common(5))

ENCODE unique accessions: 2587
[('ENCSR266XMB', 2), ('ENCSR876SYO', 1), ('ENCSR706SZI', 1), ('ENCSR199LBO', 1), ('ENCSR823VEE', 1)]


In [298]:
# Every ENCODE EpiRR from the summary must appear in the detailed metadata, and vice versa.
assert {epirr for _, epirr in accessions_and_epirr} == set(encode_epirrs)

## Compare with EpiATLAS

In [299]:
# Harmonized IHEC metadata table (v1.2, extended) stored under "official".
epiatlas_metadata_path = (
    metadata_dir / "official" / "IHEC_metadata_harmonization.v1.2.extended.csv"
)
# index_col=False: do not use the first column as the index.
epiatlas_df = pd.read_csv(epiatlas_metadata_path, index_col=False)

In [300]:
# Version-less EpiRR ids present in the EpiATLAS table.
epiatlas_epirrs = set(epiatlas_df["epirr_id_without_version"].tolist())
# ENCODE EpiRRs found / not found in EpiATLAS.
common_epirrs = set(encode_epirrs) & epiatlas_epirrs
diff_epirr = set(encode_epirrs) - epiatlas_epirrs

In [301]:
# Sizes of both EpiRR collections and of their overlap.
print(f"ENCODE EpiRRs: {len(encode_epirrs)}")
print(f"EpiATLAS EpiRRs: {len(epiatlas_epirrs)}")
print(f"ENCODE EpiRRs in EpiATLAS: {len(common_epirrs)}")

ENCODE EpiRRs: 215
EpiATLAS EpiRRs: 2279
ENCODE EpiRRs in EpiATLAS: 123


In [302]:
# One row per (experiment accession, EpiRR) pair collected from the EpiRR records.
encode_accessions_df = pd.DataFrame.from_records(
    accessions_and_epirr, columns=["experiment_accession", "epirr_no_version"]
)
print(encode_accessions_df.shape)

# An experiment is flagged as in EpiATLAS when its EpiRR id is shared with EpiATLAS.
encode_accessions_df["in_epiatlas"] = encode_accessions_df["epirr_no_version"].isin(
    common_epirrs
)
display(encode_accessions_df["in_epiatlas"].value_counts(dropna=False))

(2588, 2)


in_epiatlas
True     1415
False    1173
Name: count, dtype: int64

In [303]:
# Preview of the accession / EpiRR / in_epiatlas table.
display(encode_accessions_df.head())

Unnamed: 0,experiment_accession,epirr_no_version,in_epiatlas
0,ENCSR876SYO,IHECRE00001772,False
1,ENCSR706SZI,IHECRE00001772,False
2,ENCSR199LBO,IHECRE00001772,False
3,ENCSR823VEE,IHECRE00001772,False
4,ENCSR473ZCM,IHECRE00001772,False


In [304]:
# Persist the accession-to-EpiRR table (with the in_epiatlas flag) as CSV.
encode_accessions_df.to_csv(
    encode_metadata_dir / "new_meta" / "encode_epirrs_2025-02.csv", index=False
)

EpiRR is less useful because ENCODE only submitted complete epigenomes. EpiATLAS also includes partial ones.

In [305]:
# Release the EpiATLAS table; the EpiRR sets were already extracted from it.
del epiatlas_df
gc.collect()

41

## Compare with previous ENCODE metadata

In [306]:
# Previously assembled ENCODE tables stored under "old_meta":
# a cleaned metadata CSV and a tab-separated ENCODE/IHEC key table.
encode_meta_df = pd.read_csv(
    encode_metadata_dir / "old_meta" / "encode_metadata_2023-10-25_clean-v2.csv"
)
encode_ihec_df = pd.read_csv(
    encode_metadata_dir / "old_meta" / "ENCODE_IHEC_keys.tsv", sep="\t"
)
print(encode_meta_df.shape)
print(encode_ihec_df.shape)

(9619, 183)
(5552, 12)


In [307]:
# Preview both legacy tables to compare their columns.
display(encode_meta_df.head())
display(encode_ihec_df.head())

Unnamed: 0,md5sum,File format,File type,Output type,File assembly,Assay,Donor(s),Biosample term id,Biosample term name_x,Biosample organism,...,biosample_treatment_duration,biosample_treatment_duration_units,biosample_modification_site_target_organism,biosample_modification_site_introduced_gene_organism,replicates,cellular_component,files_replace,donor_sex,cancer_status,experiment_accession
0,ENCFF072UFE,bigWig,bigWig,signal p-value,GRCh38,H3K4me2,/human-donors/ENCDO000AAZ/,EFO:0001086,A549,Homo sapiens,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,male,cancer,ENCSR918VQU
1,ENCFF114UFW,bigWig,bigWig,signal p-value,GRCh38,KDM5A,/human-donors/ENCDO000AAZ/,EFO:0001086,A549,Homo sapiens,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,male,cancer,ENCSR933MHJ
2,ENCFF604SWJ,bigWig,bigWig,signal p-value,GRCh38,H3K4me3,/human-donors/ENCDO000AAB/,EFO:0002791,HeLa-S3,Homo sapiens,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,female,cancer,ENCSR000AOF
3,ENCFF562UDF,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,...,48,hour,unknown,unknown,/replicates/7cdba585-65b5-439c-bb03-463c8ae8da40/,unknown,"ENCFF044QFZ,ENCFF562UDF,ENCFF778HQR,ENCFF954SAX",unknown,unknown,ENCSR086GBI
4,ENCFF654CFQ,bigWig,bigWig,signal p-value,GRCh38,H3K27ac,/human-donors/ENCDO640RUC/,UBERON:0006483,middle frontal area 46,Homo sapiens,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,female,non-cancer,ENCSR004YQD


Unnamed: 0,ENC_ID,is_EpiAtlas_EpiRR,Trackhub_validation,assay_epiclass,assay_name,accession,biosample_accession,contributing_files,controls,derived_from,files,original_files
0,ENCFF002QSR,,,h3k27me3,Mint-ChIP-seq,ENCSR758YNR,"ENCBS958FWY,ENCBS601OTT","/files/ENCFF490YMM/,/files/ENCFF643CGH/,/files...",/experiments/ENCSR901NYG/,"/files/ENCFF453TYP/, /files/ENCFF874PSQ/, /fil...","/files/ENCFF676FLB/,/files/ENCFF793DTA/,/files...","/files/ENCFF676FLB/,/files/ENCFF793DTA/,/files..."
1,ENCFF003AUO,,,h3k4me1,ChIP-seq,ENCSR130XNJ,ENCBS700NCU,"/files/ENCFF110MCL/,/files/ENCFF702DHO/,/files...",/experiments/ENCSR696HNV/,"/files/ENCFF532RKH/, /files/ENCFF702DHO/","/files/ENCFF487GVW/,/files/ENCFF022DLD/,/files...","/files/ENCFF487GVW/,/files/ENCFF022DLD/,/files..."
2,ENCFF003JSV,IHECRE00003714.4,,h3k36me3,ChIP-seq,ENCSR581PUR,"ENCBS615MBV,ENCBS385OLC","/files/ENCFF643CGH/,/files/ENCFF042EJM/,/files...",/experiments/ENCSR196CXJ/,"/files/ENCFF425EYY/, /files/ENCFF145ZRI/, /fil...","/files/ENCFF170GQM/,/files/ENCFF438MMK/,/files...","/files/ENCFF170GQM/,/files/ENCFF438MMK/,/files..."
3,ENCFF003LZR,IHECRE00004659.3,,h3k4me1,ChIP-seq,ENCSR000FCG,"ENCBS481AAA,ENCBS482AAA","/files/ENCFF144WYV/,/files/ENCFF643CGH/,/files...",/experiments/ENCSR000EVA/,"/files/ENCFF493CYY/, /files/ENCFF400CMC/, /fil...","/files/ENCFF002AAV/,/files/ENCFF002AAW/,/files...","/files/ENCFF002AAV/,/files/ENCFF002AAW/,/files..."
4,ENCFF003MCE,,,h3k9me3,ChIP-seq,ENCSR703KLE,ENCBS212NYF,"/files/ENCFF127TJQ/,/files/ENCFF110MCL/,/files...",/experiments/ENCSR462ZQL/,"/files/ENCFF127TJQ/, /files/ENCFF711BOA/","/files/ENCFF793AZP/,/files/ENCFF413WGQ/,/files...","/files/ENCFF793AZP/,/files/ENCFF413WGQ/,/files..."


In [308]:
# Number of distinct experiment accessions in each legacy table.
N_accession_1 = encode_meta_df["experiment_accession"].nunique()
N_accession_2 = encode_ihec_df["accession"].nunique()
print(f"ENCODE metadata 2023-10-25 accessions: {N_accession_1}")
print(f"ENCODE-IHEC file accessions: {N_accession_2}")

ENCODE metadata 2023-10-25 accessions: 9082
ENCODE-IHEC file accessions: 5111


In [309]:
# Distribution of the `ASSAY` label column in the ENCODE/IHEC key table.
display(encode_ihec_df[ASSAY].value_counts(dropna=False))

assay_epiclass
input       1981
h3k4me3      706
h3k27ac      624
h3k27me3     587
h3k4me1      560
h3k36me3     556
h3k9me3      538
Name: count, dtype: int64

In [310]:
# Distribution of the `ASSAY` label column in the older metadata table...
display(encode_meta_df[ASSAY].value_counts(dropna=False))
# ...then the raw "Assay" values of the rows whose file id ("md5sum" column)
# does not appear as an "ENC_ID" in the ENCODE/IHEC key table.
display(
    encode_meta_df[~encode_meta_df["md5sum"].isin(encode_ihec_df["ENC_ID"])][
        "Assay"
    ].value_counts(dropna=False)
)

assay_epiclass
non-core    3599
input       1981
h3k4me3      706
h3k27ac      624
h3k27me3     587
h3k4me1      560
h3k36me3     556
h3k9me3      538
CTCF         468
Name: count, dtype: int64

Assay
CTCF        468
H3K9ac      117
POLR2A      115
H3K4me2      77
H4K20me1     57
           ... 
ZNF837        1
BAZ2A         1
SKI           1
HES4          1
TMF1          1
Name: count, Length: 1171, dtype: int64

Conclusion: non-core files are not included in ENCODE_IHEC_keys.tsv. That's okay because these files were only used for training assay13, and were not included in any other classifier training. We now have enough information to create an almost complete "in_epiatlas" column.

## `in_epiatlas` creation

In [311]:
# A file is considered part of EpiATLAS when the key table lists an EpiRR id for it.
encode_ihec_df["in_epiatlas"] = encode_ihec_df["is_EpiAtlas_EpiRR"].notna()
display(encode_ihec_df["in_epiatlas"].value_counts(dropna=False))

in_epiatlas
False    4367
True     1185
Name: count, dtype: int64

### Sanity check: accession, in_epiatlas pairs consistent (accessions are not unique)

In [312]:
# An accession can appear on several rows (one per file), so the check is made
# per accession: all of its rows must carry the same `in_epiatlas` value.
# Distinct (accession, in_epiatlas) pairs; a consistent accession yields one pair.
encode_ihec_df_pairs = set(
    encode_ihec_df[["accession", "in_epiatlas"]].itertuples(index=False, name=None)
)
# Accessions that show up with more than one distinct flag value.
conflicting_accessions = [
    accession
    for accession, n_values in Counter(
        accession for accession, _ in encode_ihec_df_pairs
    ).items()
    if n_values > 1
]
if conflicting_accessions:
    raise ValueError("Inconsistent 'in_epiatlas' values:", conflicting_accessions)

In [313]:
def check_epirr_in_epiatlas(
    encode_accessions_df: pd.DataFrame, encode_ihec_df: pd.DataFrame
) -> List[Tuple[str, bool, bool]]:
    """Find experiment accessions whose `in_epiatlas` flag differs between two tables.

    Only accessions present in both tables are compared. When an accession spans
    several rows of a table, its flag for that table is True if any row is True.

    Args:
        encode_accessions_df: Table with "experiment_accession" and "in_epiatlas"
            columns (flags derived from EpiRR).
        encode_ihec_df: Table with "accession" and "in_epiatlas" columns (flags
            derived from the ENCODE/IHEC key file).

    Returns:
        One `(accession, flag_in_encode_accessions_df, flag_in_encode_ihec_df)`
        tuple per disagreeing accession, sorted by accession.
    """
    # Collapse each table to a single flag per accession in one pass, instead of
    # re-filtering both DataFrames for every accession.
    epirr_flag_by_acc = (
        encode_accessions_df.groupby("experiment_accession")["in_epiatlas"]
        .any()
        .to_dict()
    )
    ihec_flag_by_acc = encode_ihec_df.groupby("accession")["in_epiatlas"].any().to_dict()

    inconsistent_accession_tuples = []
    # Sorted so the result does not depend on set iteration order.
    for accession in sorted(epirr_flag_by_acc.keys() & ihec_flag_by_acc.keys()):
        in_epiatlas_1 = bool(epirr_flag_by_acc[accession])
        in_epiatlas_2 = bool(ihec_flag_by_acc[accession])
        if in_epiatlas_1 != in_epiatlas_2:
            inconsistent_accession_tuples.append((accession, in_epiatlas_1, in_epiatlas_2))

    return inconsistent_accession_tuples

In [314]:
# Compare the EpiRR-derived flags with those from the ENCODE/IHEC key table.
inconsistent_accession_tuples = check_epirr_in_epiatlas(
    encode_accessions_df, encode_ihec_df
)

In [315]:
# Keep only the accession (first tuple element) of each inconsistent entry,
# then pull the matching rows from the ENCODE/IHEC key table for inspection.
inconsistent_accession_values = [dset[0] for dset in inconsistent_accession_tuples]
suspect_df = encode_ihec_df[
    encode_ihec_df["accession"].isin(inconsistent_accession_values)
]
display(suspect_df.head())

Unnamed: 0,ENC_ID,is_EpiAtlas_EpiRR,Trackhub_validation,assay_epiclass,assay_name,accession,biosample_accession,contributing_files,controls,derived_from,files,original_files,in_epiatlas
82,ENCFF027AKL,,,h3k9me3,ChIP-seq,ENCSR657OGA,"ENCBS324TZB,ENCBS500EWH","/files/ENCFF997KYP/,/files/ENCFF807MUK/,/files...",/experiments/ENCSR185OHN/,"/files/ENCFF845ESW/, /files/ENCFF126TEH/, /fil...","/files/ENCFF532XGG/,/files/ENCFF636RWU/,/files...","/files/ENCFF532XGG/,/files/ENCFF636RWU/,/files...",False
112,ENCFF035PZL,,,h3k9me3,ChIP-seq,ENCSR726EEM,ENCBS952IFF,"/files/ENCFF643CGH/,/files/ENCFF807MUK/,/files...",/experiments/ENCSR152HLZ/,"/files/ENCFF637ERD/, /files/ENCFF814XGL/","/files/ENCFF814XGL/,/files/ENCFF623JAN/,/files...","/files/ENCFF814XGL/,/files/ENCFF623JAN/,/files...",False
404,ENCFF111ZPB,,,h3k4me1,ChIP-seq,ENCSR061UOM,"ENCBS859QEM,ENCBS586HSP","/files/ENCFF579DGX/,/files/ENCFF110MCL/,/files...",/experiments/ENCSR269OND/,"/files/ENCFF579DGX/, /files/ENCFF469FAN/, /fil...","/files/ENCFF135XXE/,/files/ENCFF556WTJ/,/files...","/files/ENCFF135XXE/,/files/ENCFF556WTJ/,/files...",False
475,ENCFF132QIU,,,h3k9me3,ChIP-seq,ENCSR708BMV,"ENCBS677JLV,ENCBS955ZON","/files/ENCFF047RIR/,/files/ENCFF807MUK/,/files...",,"/files/ENCFF324KXJ/, /files/ENCFF047RIR/, /fil...","/files/ENCFF679MAY/,/files/ENCFF596WYF/,/files...","/files/ENCFF679MAY/,/files/ENCFF596WYF/,/files...",False
594,ENCFF168WFN,,,h3k27me3,ChIP-seq,ENCSR330MAM,ENCBS633XWN,"/files/ENCFF643CGH/,/files/ENCFF571OYN/,/files...",,"/files/ENCFF253VOU/, /files/ENCFF571OYN/, /fil...","/files/ENCFF611JBQ/,/files/ENCFF860EWD/,/files...","/files/ENCFF611JBQ/,/files/ENCFF860EWD/,/files...",False


In [316]:
# Experiment accessions of suspect rows whose biosample accession string ends with "DMP".
# NOTE(review): presumably meant to isolate a single biosample as a worked example — confirm.
one_biosample_accs = suspect_df[suspect_df["biosample_accession"].str.endswith("DMP")][
    "accession"
].values.tolist()

In [317]:
# Hand-picked experiment accessions: report whether each one was flagged as
# inconsistent, then show their rows in both legacy tables.
one_epirr_inputs_acc = ["ENCSR000AHE", "ENCSR000DMW", "ENCSR000EWW", "ENCSR768LHG"]
for acc in one_epirr_inputs_acc:
    print(acc, acc in inconsistent_accession_values)

display(encode_ihec_df[encode_ihec_df["accession"].isin(one_epirr_inputs_acc)])
display(encode_meta_df[encode_meta_df["experiment_accession"].isin(one_epirr_inputs_acc)])

ENCSR000AHE True
ENCSR000DMW True
ENCSR000EWW True
ENCSR768LHG False


Unnamed: 0,ENC_ID,is_EpiAtlas_EpiRR,Trackhub_validation,assay_epiclass,assay_name,accession,biosample_accession,contributing_files,controls,derived_from,files,original_files,in_epiatlas
3878,ENCFF153LTH,,,input,ChIP-seq,ENCSR000AHE,"ENCBS001AAA,ENCBS000AAA,ENCBS056AAA",,,"/files/ENCFF000QQG/, /files/ENCFF110MCL/","/files/ENCFF000QPY/,/files/ENCFF000QPZ/,/files...",,False
4038,ENCFF238STO,,,input,ChIP-seq,ENCSR000EWW,ENCBS105ENC,,,"/files/ENCFF110MCL/, /files/ENCFF000VHM/","/files/ENCFF000VGM/,/files/ENCFF000VGO/,/files...",,False
4176,ENCFF310KKN,,,input,ChIP-seq,ENCSR000AHE,"ENCBS001AAA,ENCBS000AAA,ENCBS056AAA",,,"/files/ENCFF000QQI/, /files/ENCFF110MCL/","/files/ENCFF000QPY/,/files/ENCFF000QPZ/,/files...",,False
4419,ENCFF433BQD,IHECRE00001853.4,,input,ChIP-seq,ENCSR768LHG,"ENCBS789UPK,ENCBS967MVZ",,,"/files/ENCFF595BTS/, /files/ENCFF110MCL/, /fil...","/files/ENCFF640MKX/,/files/ENCFF433BQD/,/files...",,True
4502,ENCFF474LAV,IHECRE00001853.4,,input,ChIP-seq,ENCSR768LHG,"ENCBS789UPK,ENCBS967MVZ",,,"/files/ENCFF640MKX/, /files/ENCFF318ZNB/, /fil...","/files/ENCFF640MKX/,/files/ENCFF433BQD/,/files...",,True
4585,ENCFF506VER,,,input,ChIP-seq,ENCSR000EWW,ENCBS105ENC,,,"/files/ENCFF110MCL/, /files/ENCFF000VHL/","/files/ENCFF000VGM/,/files/ENCFF000VGO/,/files...",,False
4908,ENCFF671SEQ,,,input,ChIP-seq,ENCSR000AHE,"ENCBS001AAA,ENCBS000AAA,ENCBS056AAA",,,"/files/ENCFF000QQK/, /files/ENCFF110MCL/","/files/ENCFF000QPY/,/files/ENCFF000QPZ/,/files...",,False
5018,ENCFF731WWK,,,input,ChIP-seq,ENCSR000AHE,"ENCBS001AAA,ENCBS000AAA,ENCBS056AAA",,,"/files/ENCFF000QQJ/, /files/ENCFF110MCL/","/files/ENCFF000QPY/,/files/ENCFF000QPZ/,/files...",,False
5146,ENCFF797MVP,,,input,ChIP-seq,ENCSR000DMW,ENCBS252AAA,,,"/files/ENCFF000SAZ/, /files/ENCFF110MCL/","/files/ENCFF000SAV/,/files/ENCFF000SAW/,/files...",,False


Unnamed: 0,md5sum,File format,File type,Output type,File assembly,Assay,Donor(s),Biosample term id,Biosample term name_x,Biosample organism,...,biosample_treatment_duration,biosample_treatment_duration_units,biosample_modification_site_target_organism,biosample_modification_site_introduced_gene_organism,replicates,cellular_component,files_replace,donor_sex,cancer_status,experiment_accession
471,ENCFF731WWK,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,...,unknown,unknown,unknown,unknown,/replicates/13167a1b-ed51-42a4-ff8b-c35755bda4...,unknown,"ENCFF000QPY,ENCFF000QPZ,ENCFF000QQB,ENCFF000QQ...",female,cancer,ENCSR000AHE
1166,ENCFF474LAV,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,...,unknown,unknown,unknown,unknown,/replicates/09526e3d-f12f-486a-a0ac-b8cf1ebda4...,unknown,"ENCFF640MKX,ENCFF433BQD,ENCFF318ZNB,ENCFF222VR...",unknown,unknown,ENCSR768LHG
1390,ENCFF506VER,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,...,unknown,unknown,unknown,unknown,/replicates/3ae0e615-943c-4ad8-9cb2-3e7d9509b8...,unknown,"ENCFF000VGM,ENCFF000VGO,ENCFF000VGP,ENCFF000VH...",female,cancer,ENCSR000EWW
4225,ENCFF433BQD,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,...,unknown,unknown,unknown,unknown,/replicates/09526e3d-f12f-486a-a0ac-b8cf1ebda4...,unknown,"ENCFF640MKX,ENCFF433BQD,ENCFF318ZNB,ENCFF222VR...",unknown,unknown,ENCSR768LHG
4767,ENCFF671SEQ,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,...,unknown,unknown,unknown,unknown,/replicates/13167a1b-ed51-42a4-ff8b-c35755bda4...,unknown,"ENCFF000QPY,ENCFF000QPZ,ENCFF000QQB,ENCFF000QQ...",female,cancer,ENCSR000AHE
5741,ENCFF797MVP,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,...,unknown,unknown,unknown,unknown,/replicates/5a207111-75e0-40ee-8a3d-453012087339/,unknown,"ENCFF000SAV,ENCFF000SAW,ENCFF000SAZ,ENCFF028DN...",female,cancer,ENCSR000DMW
6869,ENCFF310KKN,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,...,unknown,unknown,unknown,unknown,/replicates/13167a1b-ed51-42a4-ff8b-c35755bda4...,unknown,"ENCFF000QPY,ENCFF000QPZ,ENCFF000QQB,ENCFF000QQ...",female,cancer,ENCSR000AHE
8104,ENCFF238STO,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,...,unknown,unknown,unknown,unknown,/replicates/3ae0e615-943c-4ad8-9cb2-3e7d9509b8...,unknown,"ENCFF000VGM,ENCFF000VGO,ENCFF000VGP,ENCFF000VH...",female,cancer,ENCSR000EWW
9142,ENCFF153LTH,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,...,unknown,unknown,unknown,unknown,/replicates/13167a1b-ed51-42a4-ff8b-c35755bda4...,unknown,"ENCFF000QPY,ENCFF000QPZ,ENCFF000QQB,ENCFF000QQ...",female,cancer,ENCSR000AHE


In [318]:
# Show the selected accessions in all three tables: the ENCODE/IHEC key table,
# the older metadata table, and the EpiRR-derived accession table.
display(encode_ihec_df[encode_ihec_df["accession"].isin(one_biosample_accs)])
display(encode_meta_df[encode_meta_df["experiment_accession"].isin(one_biosample_accs)])
display(
    encode_accessions_df[
        encode_accessions_df["experiment_accession"].isin(one_biosample_accs)
    ]
)

Unnamed: 0,ENC_ID,is_EpiAtlas_EpiRR,Trackhub_validation,assay_epiclass,assay_name,accession,biosample_accession,contributing_files,controls,derived_from,files,original_files,in_epiatlas
1673,ENCFF462ERS,,,h3k9me3,ChIP-seq,ENCSR558AXI,ENCBS541DMP,"/files/ENCFF643CGH/,/files/ENCFF807MUK/,/files...",,"/files/ENCFF044TOH/, /files/ENCFF910RQR/","/files/ENCFF964RNY/,/files/ENCFF462ERS/,/files...","/files/ENCFF964RNY/,/files/ENCFF462ERS/,/files...",False
1945,ENCFF536JGG,,,h3k27me3,ChIP-seq,ENCSR586DVD,ENCBS541DMP,"/files/ENCFF643CGH/,/files/ENCFF807MUK/,/files...",,"/files/ENCFF044TOH/, /files/ENCFF451FTP/","/files/ENCFF284PTZ/,/files/ENCFF451FTP/,/files...","/files/ENCFF284PTZ/,/files/ENCFF451FTP/,/files...",False
2099,ENCFF578GZL,,,h3k27ac,ChIP-seq,ENCSR500YBS,ENCBS541DMP,"/files/ENCFF643CGH/,/files/ENCFF807MUK/,/files...",,"/files/ENCFF044TOH/, /files/ENCFF408NBC/","/files/ENCFF578GZL/,/files/ENCFF774RLX/,/files...","/files/ENCFF578GZL/,/files/ENCFF774RLX/,/files...",False
2485,ENCFF687HLG,,,h3k4me1,ChIP-seq,ENCSR497OVD,ENCBS541DMP,"/files/ENCFF643CGH/,/files/ENCFF807MUK/,/files...",,"/files/ENCFF401CCX/, /files/ENCFF044TOH/","/files/ENCFF959LCL/,/files/ENCFF689PZB/,/files...","/files/ENCFF959LCL/,/files/ENCFF689PZB/,/files...",False
2771,ENCFF771GOG,,,h3k4me3,ChIP-seq,ENCSR309UVT,ENCBS541DMP,"/files/ENCFF643CGH/,/files/ENCFF807MUK/,/files...",,"/files/ENCFF993OFD/, /files/ENCFF044TOH/","/files/ENCFF046KBB/,/files/ENCFF235BRW/,/files...","/files/ENCFF046KBB/,/files/ENCFF235BRW/,/files...",False
3040,ENCFF846KJP,,,h3k36me3,ChIP-seq,ENCSR942UIL,ENCBS541DMP,"/files/ENCFF643CGH/,/files/ENCFF807MUK/,/files...",,"/files/ENCFF166JAN/, /files/ENCFF044TOH/","/files/ENCFF941XLF/,/files/ENCFF377FKO/,/files...","/files/ENCFF941XLF/,/files/ENCFF377FKO/,/files...",False
3650,ENCFF044TOH,,,input,ChIP-seq,ENCSR347VUG,ENCBS541DMP,,,"/files/ENCFF116ZVM/, /files/ENCFF519MHN/, /fil...","/files/ENCFF898MQI/,/files/ENCFF759PNS/,/files...",,False


Unnamed: 0,md5sum,File format,File type,Output type,File assembly,Assay,Donor(s),Biosample term id,Biosample term name_x,Biosample organism,...,biosample_treatment_duration,biosample_treatment_duration_units,biosample_modification_site_target_organism,biosample_modification_site_introduced_gene_organism,replicates,cellular_component,files_replace,donor_sex,cancer_status,experiment_accession
150,ENCFF846KJP,bigWig,bigWig,signal p-value,GRCh38,H3K36me3,/human-donors/ENCDO271OUW/,UBERON:0002046,thyroid gland,Homo sapiens,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,female,non-cancer,ENCSR942UIL
4728,ENCFF687HLG,bigWig,bigWig,signal p-value,GRCh38,H3K4me1,/human-donors/ENCDO271OUW/,UBERON:0002046,thyroid gland,Homo sapiens,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,female,non-cancer,ENCSR497OVD
6483,ENCFF462ERS,bigWig,bigWig,signal p-value,GRCh38,H3K9me3,/human-donors/ENCDO271OUW/,UBERON:0002046,thyroid gland,Homo sapiens,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,female,non-cancer,ENCSR558AXI
7658,ENCFF578GZL,bigWig,bigWig,signal p-value,GRCh38,H3K27ac,/human-donors/ENCDO271OUW/,UBERON:0002046,thyroid gland,Homo sapiens,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,female,non-cancer,ENCSR500YBS
7867,ENCFF771GOG,bigWig,bigWig,signal p-value,GRCh38,H3K4me3,/human-donors/ENCDO271OUW/,UBERON:0002046,thyroid gland,Homo sapiens,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,female,non-cancer,ENCSR309UVT
8498,ENCFF536JGG,bigWig,bigWig,signal p-value,GRCh38,H3K27me3,/human-donors/ENCDO271OUW/,UBERON:0002046,thyroid gland,Homo sapiens,...,unknown,unknown,unknown,unknown,unknown,unknown,unknown,female,non-cancer,ENCSR586DVD
8955,ENCFF044TOH,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,unknown,...,unknown,unknown,unknown,unknown,/replicates/485ce890-862c-47ee-8e94-f21cba13f5...,unknown,"ENCFF898MQI,ENCFF759PNS,ENCFF266UXT,ENCFF237JO...",unknown,unknown,ENCSR347VUG


Unnamed: 0,experiment_accession,epirr_no_version,in_epiatlas
2550,ENCSR309UVT,IHECRE00004717,True
2551,ENCSR347VUG,IHECRE00004717,True
2553,ENCSR497OVD,IHECRE00004717,True
2554,ENCSR500YBS,IHECRE00004717,True
2555,ENCSR558AXI,IHECRE00004717,True
2556,ENCSR586DVD,IHECRE00004717,True
2559,ENCSR942UIL,IHECRE00004717,True


In [319]:
# EpiRR ids attached (per the EpiRR-derived table) to the selected accessions.
problematic_epirrs = set(
    encode_accessions_df[
        encode_accessions_df["experiment_accession"].isin(one_biosample_accs)
    ]["epirr_no_version"].values.tolist()
)
if len(problematic_epirrs) > 1:
    raise ValueError("One biosample with multiple epirrs", problematic_epirrs)
# Without this guard an empty selection would surface as a bare KeyError from `pop()`.
if not problematic_epirrs:
    raise ValueError("No epirr found for accessions", one_biosample_accs)

# Exactly one EpiRR is left: keep it as the worked example.
problematic_epirr_example = problematic_epirrs.pop()

## EpiClass actual training metadata

In [320]:
# Load the metadata JSON through the project `Metadata` class and flatten its
# `datasets` into a DataFrame.
epiclass_metadata_path = (
    metadata_dir / "hg38_2023-epiatlas-dfreeze-pospurge-nodup_filterCtl.json"
)
epiclass_metadata = Metadata(epiclass_metadata_path)
epiclass_df = pd.DataFrame.from_records(list(epiclass_metadata.datasets))
print(epiclass_df.shape)

(20922, 71)


In [321]:
# Version-less EpiRR ids present in the loaded metadata.
epiclass_epirrs = set(epiclass_df["epirr_id_without_version"].tolist())

In [322]:
# Is the example EpiRR present in the loaded metadata?
print(problematic_epirr_example in epiclass_epirrs)

True


Conclusion: Some errors were made during the creation of "ENCODE_IHEC_keys.tsv". This is demonstrated by a set of files from a single biosample being marked as not having an EpiRR, even though the corresponding EpiRR is present in the training metadata. We need to recreate the metadata from scratch to guarantee the right values.

In [323]:
# Collect the distinct experiment, file and biosample identifiers of the older
# metadata table. File identifiers are read from its "md5sum" column.
chip_experiment_accessions = set(encode_meta_df["experiment_accession"].unique().tolist())
chip_file_accessions = set(encode_meta_df["md5sum"].unique().tolist())
chip_biosample_accessions = set()
# A "biosample_accession" cell may hold several comma-separated accessions.
for vals in encode_meta_df["biosample_accession"].unique():
    biosample_accs = vals.split(",")
    chip_biosample_accessions.update(biosample_accs)

print("CHIP file accessions:", len(chip_file_accessions))
print("CHIP experiment accessions:", len(chip_experiment_accessions))
print("CHIP biosample accessions:", len(chip_biosample_accessions))

# assert encode_meta_df["experiment_accession"].str.slice(0, 5).isin(["ENCSR"]).all()

CHIP file accessions: 9619
CHIP experiment accessions: 9082
CHIP biosample accessions: 1729


In [324]:
# Report which ChIP accession collections contain the placeholder value "unknown".
for elem, name in zip(
    [chip_experiment_accessions, chip_file_accessions, chip_biosample_accessions],
    ["exp", "file", "biosample"],
):
    if "unknown" in elem:
        print(f"Unknown accession in {name} ChIP metadata.")

Unknown accession in biosample ChIP metadata.


In [325]:
# Release the tables and the metadata object loaded above, then trigger a collection.
del encode_meta_df
del epiclass_df
del epiclass_metadata
del encode_accessions_df
del encode_ihec_df
gc.collect()

55

## Collect RNA accessions

In [326]:
# Tab-separated RNA file metadata; one set entry per distinct "File accession".
encode_rna_file_meta_path = (
    encode_metadata_dir / "old_meta" / "metadata--ENCODE_RNA_2023mar_hg38_BW_default.tsv"
)
encode_rna_meta_df = pd.read_csv(encode_rna_file_meta_path, sep="\t")
print(encode_rna_meta_df.shape)
rna_file_accessions = set(encode_rna_meta_df["File accession"].unique().tolist())

(1792, 59)


In [327]:
# Tab-separated RNA experiment report.
encode_rna_exp_meta_path = (
    encode_metadata_dir / "old_meta" / "ENCODE_RNA_2023mar_hg38_BW_default_exp_report.tsv"
)
# skiprows=1: the first line of the file is skipped.
# NOTE(review): presumably a non-header preamble line — confirm against the file.
encode_rna_exp_meta_df = pd.read_csv(encode_rna_exp_meta_path, sep="\t", skiprows=1)
print(encode_rna_exp_meta_df.shape)

rna_exp_accessions = set(encode_rna_exp_meta_df["Accession"].unique().tolist())
rna_biosample_accessions = set()
# A "Biosample accession" cell may hold several comma-separated accessions.
for vals in encode_rna_exp_meta_df["Biosample accession"].unique():
    biosample_accs = vals.split(",")
    rna_biosample_accessions.update(biosample_accs)

(1015, 40)


In [328]:
# Sizes of the three RNA accession collections.
print("RNA file accessions:", len(rna_file_accessions))
print("RNA experiment accessions:", len(rna_exp_accessions))
print("RNA biosample accessions:", len(rna_biosample_accessions))

RNA file accessions: 1792
RNA experiment accessions: 1015
RNA biosample accessions: 1301


In [329]:
# Report which RNA accession collections contain the placeholder value "unknown".
for elem, name in zip(
    [rna_exp_accessions, rna_file_accessions, rna_biosample_accessions],
    ["exp", "file", "biosample"],
):
    if "unknown" in elem:
        print(f"Unknown accession in {name} RNA metadata.")

In [330]:
# Release both RNA tables; their accession sets were already extracted.
del encode_rna_meta_df
del encode_rna_exp_meta_df
gc.collect()

0

## Download and create new ENCODE metadata

In [331]:
# Merge the ChIP and RNA identifier collections (new sets; the inputs stay untouched).
all_experiment_accessions = chip_experiment_accessions.union(rna_exp_accessions)
all_file_accessions = chip_file_accessions.union(rna_file_accessions)
all_biosample_accessions = chip_biosample_accessions.union(rna_biosample_accessions)
# "unknown" is a placeholder, not an accession; drop it wherever it is present.
for elem in (all_experiment_accessions, all_file_accessions, all_biosample_accessions):
    elem.discard("unknown")

print("All file accessions:", len(all_file_accessions))
print("All experiment accessions:", len(all_experiment_accessions))
print("All biosample accessions:", len(all_biosample_accessions))

All file accessions: 11411
All experiment accessions: 10097
All biosample accessions: 2915


In [332]:
# ENCODE URL templates. `{}` is filled with an identifier via `str.format`;
# the `frame` query parameter ("object" or "embedded") differs between templates
# and every template requests `format=json`.
url_exp_template_object = (
    "https://www.encodeproject.org/experiments/{}/?frame=object&format=json"
)
url_exp_template_embedded = (
    "https://www.encodeproject.org/experiments/{}/?frame=embedded&format=json"
)
url_biosample_template_object = (
    "https://www.encodeproject.org/biosamples/{}/?frame=object&format=json"
)
url_file_template_object = (
    "https://www.encodeproject.org/files/{}/?frame=object&format=json"
)
url_biosample_type_template_object = (
    "https://www.encodeproject.org/biosample-types/{}/?frame=object&format=json"
)

In [333]:
def fetch_json(
    url: str, headers: Dict[str, str], timeout: float = 60.0
) -> Dict[str, Any] | None:
    """Fetch JSON data from a URL, returning None on any failure.

    Args:
        url (str): Address to query with a GET request.
        headers (Dict[str, str]): HTTP headers, such as authorization, to be passed.
        timeout (float): Seconds to wait for the server before giving up, so a
            stalled connection cannot block the calling thread forever.

    Returns:
        The parsed JSON body on an HTTP 200 answer. None (after printing the
        reason) on a network error, a non-200 status, or an unparsable body.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as err:
        # Connection problems and timeouts are reported like HTTP failures
        # instead of propagating to the caller.
        print(f"Failed to fetch {url}: {err}")
        return None

    if response.status_code != 200:
        print(f"Failed to fetch {url}: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError as err:
        # JSON decoding errors are ValueError subclasses.
        print(f"Failed to fetch {url}: invalid JSON ({err})")
        return None


def fetch_experiment_metadata(
    experiment_acc: str, headers: Dict[str, str]
) -> Dict[str, Any] | None:
    """Retrieve the record of one experiment from the ENCODE API.

    Args:
        experiment_acc (str): Experiment accession inserted into the URL template.
        headers (Dict[str, str]): HTTP headers, such as authorization, to be passed.

    Returns:
        Whatever `fetch_json` returns for the experiment URL (None on failure).
    """
    experiment_url = url_exp_template_object.format(experiment_acc)
    return fetch_json(experiment_url, headers)


def fetch_biosample_type_metadata(
    term_id: str, headers: Dict[str, str]
) -> Dict[str, Any] | None:
    """Retrieve the record of one biosample type from the ENCODE API.

    Args:
        term_id (str): Biosample type term ID, of the form "[classification]_[ontology_id]", e.g. "cell_line_EFO_0001203".
        headers (Dict[str, str]): HTTP headers, such as authorization, to be passed.

    Returns:
        Whatever `fetch_json` returns for the biosample-type URL (None on failure).
    """
    biosample_type_url = url_biosample_type_template_object.format(term_id)
    return fetch_json(biosample_type_url, headers)


def fetch_file_metadata(file_acc: str, headers: Dict[str, str]) -> Dict[str, Any] | None:
    """Retrieve the record of one file from the ENCODE API.

    Args:
        file_acc (str): File accession inserted into the URL template.
        headers (Dict[str, str]): HTTP headers, such as authorization, to be passed.

    Returns:
        Whatever `fetch_json` returns for the file URL (None on failure).
    """
    file_url = url_file_template_object.format(file_acc)
    return fetch_json(file_url, headers)


def fetch_replicate_biosample(
    experiment_acc: str, headers: Dict[str, str]
) -> Dict[str, List[str]] | None:
    """List the biosample accessions attached to an experiment's replicates.

    The embedded-frame experiment record is fetched, and for each entry of its
    "replicates" list the nested library -> biosample -> accession value is read.

    Args:
        experiment_acc (str): Experiment accession.
        headers (Dict[str, str]): HTTP headers forwarded to `fetch_json`.

    Returns:
        A single-entry dict {experiment_acc: [biosample accessions]}, or None
        when the fetch returned nothing.
    """
    experiment = fetch_json(url_exp_template_embedded.format(experiment_acc), headers)
    if not experiment:
        return None

    biosample_accessions = []
    for replicate in experiment.get("replicates", []):
        biosample_accessions.append(replicate["library"]["biosample"]["accession"])
    return {experiment_acc: biosample_accessions}


def fetch_biosample_metadata(
    biosample_acc: str, headers: Dict[str, str]
) -> Dict[str, Any] | None:
    """Retrieve the object-frame JSON record of one ENCODE biosample.

    Args:
        biosample_acc (str): Biosample accession inserted into the URL template.
        headers (Dict[str, str]): HTTP headers forwarded to `fetch_json`.

    Returns:
        The result of `fetch_json` for the biosample URL (None on failure).
    """
    biosample_url = url_biosample_template_object.format(biosample_acc)
    return fetch_json(biosample_url, headers)


def parallel_fetch(
    func: Callable[[str, Dict[str, str]], Any],
    identifiers: Iterable[str],
    headers: Dict[str, str],
    max_workers: int = 10,
    task_name: str = "Fetching Data",
) -> List[Any]:
    """
    Run `func(identifier, headers)` for every identifier on a thread pool.

    Args:
        func (Callable): The function to execute in parallel.
        identifiers (Iterable[str]): Input identifiers, one call per identifier.
        headers (Dict[str, str]): HTTP headers, such as authorization, to be passed.
        max_workers (int): Number of parallel threads.
        task_name (str): Name for progress bar.

    Returns:
        The non-None results, in completion order (not submission order).
    """
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(func, identifier, headers) for identifier in identifiers]
        progress = tqdm(
            as_completed(pending),
            total=len(pending),
            desc=task_name,
            unit="entry",
        )
        # `result()` re-raises any exception raised inside the worker.
        outcomes = (job.result() for job in progress)
        return [outcome for outcome in outcomes if outcome is not None]

In [334]:
def fetch_and_update_metadata(
    func: Callable[[str, Dict[str, str]], Dict[str, Any] | None],
    all_accession_set: Set[str],
    output_filepath: Path,
    max_workers: int = 5,
    task_name: str = "Fetching metadata",
) -> List[Dict[str, Any]]:
    """
    Fetch the records missing from a JSON cache and refresh the cache files.

    Records already stored in `output_filepath` are reused; only accessions
    absent from it are fetched. When something had to be fetched, the JSON file
    and a sibling ".tsv" file are rewritten with the combined records.

    Args:
        func (Callable): Function to fetch metadata for a single accession.
        all_accession_set (Set[str]): Set of all accessions to process.
        output_filepath (Path): Path to the JSON file storing metadata.
        max_workers (int): Number of parallel workers (default: 5).
        task_name (str): Progress bar name.

    Returns:
        List[Dict[str, Any]]: The cached records followed by the newly fetched ones.
    """
    # Records cached by a previous run, if any.
    cached_records = []
    if output_filepath.exists():
        with open(output_filepath, "r", encoding="utf-8") as f:
            cached_records = json.load(f)

    # Identify cached records by the first candidate key that every record has;
    # otherwise fall back to each record's first key.
    attempt_keys = ["accession", "obo_id", "name"]
    known_accessions = None
    for key in attempt_keys:
        try:
            known_accessions = {entry[key] for entry in cached_records}
        except KeyError:
            continue
        break
    if known_accessions is None:
        print(
            f"No {attempt_keys} keys in existing metadata. Using first key of each entry instead."
        )
        known_accessions = {list(entry.keys())[0] for entry in cached_records}

    to_fetch = all_accession_set - known_accessions
    if not to_fetch:
        print("No new metadata to fetch. Data is already up-to-date.")
        return cached_records

    print(f"Fetching {len(to_fetch)} new records...")
    fetched_records = parallel_fetch(
        func=func,
        identifiers=to_fetch,
        headers={"accept": "application/json"},
        max_workers=max_workers,
        task_name=task_name,
    )
    all_records = cached_records + fetched_records

    # Persist the combined records as JSON and as a tab-separated table.
    with open(output_filepath, "w", encoding="utf-8") as f:
        json.dump(all_records, f, indent=2)
    pd.DataFrame.from_records(all_records).to_csv(
        output_filepath.with_suffix(".tsv"), sep="\t", index=False
    )

    return all_records

In [335]:
# Fetch (or reuse from the JSON cache) the metadata record of every file accession.
output_file = encode_metadata_dir / "new_meta" / "encode_file_metadata_2025-02.json"
encode_file_metadata_list = fetch_and_update_metadata(
    func=fetch_file_metadata,
    all_accession_set=all_file_accessions,
    output_filepath=output_file,
    max_workers=5,
    task_name="Fetching File metadata",
)
# Index the records by file accession for direct lookup.
encode_file_metadata: Dict[str, Dict[str, Any]] = {
    dset["accession"]: dset for dset in encode_file_metadata_list
}

No new metadata to fetch. Data is already up-to-date.


Find experiment accessions from file metadata. Make sure exp accessions are unique.

In [336]:
# Rebuild the set of experiment accessions from the file records, and store the
# accession on each file record (in place) under "experiment_accession".
all_experiment_accessions = set()
for file_acc, dataset in encode_file_metadata.items():
    acc_str = dataset["dataset"]
    # A file must reference a single experiment.
    if acc_str.count("ENCSR") > 1:
        raise ValueError("Multiple experiments per file:", dataset)

    # Second-to-last "/"-separated segment of the dataset path
    # (presumably "/experiments/<accession>/" — TODO confirm).
    exp_acc = acc_str.split("/")[-2]
    all_experiment_accessions.add(exp_acc)
    encode_file_metadata[file_acc]["experiment_accession"] = exp_acc

# Sanity check on the extracted values.
for val in all_experiment_accessions:
    if not val.startswith("ENCSR"):
        raise ValueError("Experiment accessions do not start with ENCSR:", val)

In [337]:
# Fetch (or reuse from the JSON cache) the metadata record of every experiment.
output_file = encode_metadata_dir / "new_meta" / "encode_experiment_metadata_2025-02.json"
encode_exp_metadata_list = fetch_and_update_metadata(
    func=fetch_experiment_metadata,
    all_accession_set=all_experiment_accessions,
    output_filepath=output_file,
    max_workers=5,
    task_name="Fetching Experiment metadata",
)
# Index the records by experiment accession for direct lookup.
encode_exp_metadata: Dict[str, Dict[str, Any]] = {
    dset["accession"]: dset for dset in encode_exp_metadata_list
}

No new metadata to fetch. Data is already up-to-date.


In [338]:
# Fetch (or reuse from the JSON cache) the biosample accessions of every experiment.
output_file = (
    encode_metadata_dir / "new_meta" / "encode_biosample_accessions_2025-02.json"
)
encode_biosample_accessions_list = fetch_and_update_metadata(
    func=fetch_replicate_biosample,
    all_accession_set=all_experiment_accessions,
    output_filepath=output_file,
    max_workers=5,
    task_name="Fetching Experiment Biosample Accessions",
)
# `fetch_replicate_biosample` returns single-entry dicts
# {experiment accession: [biosample accessions]}; flatten them into one mapping.
encode_biosample_accessions_dict = {
    list(dset.keys())[0]: list(dset.values())[0]
    for dset in encode_biosample_accessions_list
}

No ['accession', 'obo_id', 'name'] keys in existing metadata. Using first key of each entry instead.
No new metadata to fetch. Data is already up-to-date.


In [339]:
# Union of the biosample accessions referenced by any experiment.
all_biosamples_accessions = set()
for biosample_accs in encode_biosample_accessions_dict.values():
    all_biosamples_accessions.update(biosample_accs)

In [340]:
# Fetch (or reuse from the JSON cache) the metadata record of every biosample.
# Accessions whose fetch fails are absent from the resulting mapping.
output_file = encode_metadata_dir / "new_meta" / "encode_biosample_metadata_2025-02.json"
encode_biosample_metadata_list = fetch_and_update_metadata(
    func=fetch_biosample_metadata,
    all_accession_set=all_biosamples_accessions,
    output_filepath=output_file,
    max_workers=5,
    task_name="Fetching Biosample Metadata",
)
# Index the records by biosample accession for direct lookup.
encode_biosample_metadata = {
    dset["accession"]: dset for dset in encode_biosample_metadata_list
}

Fetching 4 new records...


Fetching Biosample Metadata: 100%|██████████| 4/4 [00:00<00:00,  8.32entry/s]

Failed to fetch https://www.encodeproject.org/biosamples/ENCBS126XME/?frame=object&format=json: 403
Failed to fetch https://www.encodeproject.org/biosamples/ENCBS374HUP/?frame=object&format=json: 403
Failed to fetch https://www.encodeproject.org/biosamples/ENCBS297ZBJ/?frame=object&format=json: 403
Failed to fetch https://www.encodeproject.org/biosamples/ENCBS184ERI/?frame=object&format=json: 403





In [341]:
def get_biosample_type(biosample_ontology: str) -> str:
    """Extract the biosample type name from a biosample ontology path.

    Every "/" and every occurrence of "biosample-types" is removed, so
    '/biosample-types/cell_line_EFO_0001203/' becomes 'cell_line_EFO_0001203'.
    """
    without_slashes = biosample_ontology.replace("/", "")
    return without_slashes.replace("biosample-types", "")


def get_biosample_term_id(biosample_ontology: str) -> str:
    """Extract the ontology term ID from a biosample ontology path.

    The last two "_"-separated tokens of the biosample type name are joined
    with ":", so '/biosample-types/cell_line_EFO_0001203/' gives 'EFO:0001203'.
    The classification prefix ("cell_line" here) is dropped.
    """
    type_name = get_biosample_type(biosample_ontology)
    trailing_tokens = type_name.rsplit("_", 2)[-2:]
    return ":".join(trailing_tokens)

In [342]:
# Spot-check both helpers on one biosample record; as bare expressions, only the
# value of the last call is displayed by the cell.
get_biosample_type(encode_biosample_metadata["ENCBS000AAA"]["biosample_ontology"])
get_biosample_term_id(encode_biosample_metadata["ENCBS000AAA"]["biosample_ontology"])

'EFO:0001203'

In [343]:
# Biosample type name of every biosample record (one entry per biosample,
# so the list contains repeated names).
biosample_types = [
    get_biosample_type(dset["biosample_ontology"])
    for dset in encode_biosample_metadata.values()
]

In [344]:
# Fetch (or reuse from the JSON cache) the record of every distinct biosample type.
new_file = encode_metadata_dir / "new_meta" / "biosample_types_metadata.json"

biosample_type_metadata_list = fetch_and_update_metadata(
    func=fetch_biosample_type_metadata,
    all_accession_set=set(biosample_types),
    output_filepath=new_file,
    max_workers=5,
    task_name="Fetching missing biosample type info",
)
# NOTE(review): records are fetched per type name ("[classification]_[ontology_id]")
# but indexed here by "term_id". If two types share a term ID (same ontology term,
# different classification), the later record overwrites the earlier — confirm
# whether this is intended.
encode_biosample_type_metadata = {
    dset["term_id"]: dset for dset in biosample_type_metadata_list
}

Fetching 4 new records...


Fetching missing biosample type info:   0%|          | 0/4 [00:00<?, ?entry/s]

Fetching missing biosample type info: 100%|██████████| 4/4 [00:00<00:00,  9.32entry/s]

Failed to fetch https://www.encodeproject.org/biosample-types/primary_cell_NTR_0000661/?frame=object&format=json: 403
Failed to fetch https://www.encodeproject.org/biosample-types/primary_cell_NTR_0000667/?frame=object&format=json: 403
Failed to fetch https://www.encodeproject.org/biosample-types/primary_cell_NTR_0000662/?frame=object&format=json: 403
Failed to fetch https://www.encodeproject.org/biosample-types/primary_cell_NTR_0000668/?frame=object&format=json: 403





### Combine experiment and biosample metadata

Exclude datasets with incoherent biosamples; they cannot be resolved on a case-by-case basis.

In [345]:
verbose = False

# Histogram of the number of biosamples per experiment.
N_counter = Counter()
# Biosample accessions referenced by an experiment but absent from the fetched metadata.
missing_biosample_metadata = set()
# Experiments whose biosamples disagree on (sex, life stage, health status).
problematic_experiments = set()
for exp_acc, biosample_accs in encode_biosample_accessions_dict.items():
    N = len(biosample_accs)
    N_counter[N] += 1
    biosample_meta = []
    # Every experiment is inspected, single-biosample ones included, so that
    # `missing_biosample_metadata` lists all unavailable biosamples and not only
    # those that belong to multi-biosample experiments.
    for biosample_acc in biosample_accs:
        try:
            meta = encode_biosample_metadata[biosample_acc]
        except KeyError:
            missing_biosample_metadata.add(biosample_acc)
            # A missing record counts as all-"unknown" in the comparison below.
            biosample_meta.append(("unknown", "unknown", "unknown"))
            if verbose:
                print("Missing biosample metadata:", biosample_acc)
            continue
        sex = meta.get("sex", "unknown")
        life_stage = meta.get("life_stage", "unknown")
        health_status = meta.get("health_status", "unknown")
        biosample_meta.append((sex, life_stage, health_status))

    # Biosample info needs to be the same across the experiment; more than one
    # distinct tuple can only occur when the experiment has several biosamples.
    if len(set(biosample_meta)) > 1:
        problematic_experiments.add(exp_acc)
        if verbose:
            print("Inconsistent biosample metadata:", exp_acc, biosample_meta)

In [346]:
# Biosample accessions for which no metadata record is available.
print(missing_biosample_metadata)

{'ENCBS297ZBJ', 'ENCBS184ERI', 'ENCBS126XME', 'ENCBS374HUP'}


Remove experiments with mixed biosample metadata

In [347]:
# Keep only the experiments that were not flagged as having incoherent biosamples.
# `encode_exp_metadata` itself is left untouched.
print("Nb exp to remove:", len(problematic_experiments))
print("Nb exp before:", len(encode_exp_metadata))
experiment_metadata = {
    k: v for k, v in encode_exp_metadata.items() if k not in problematic_experiments
}
print("Nb exp after:", len(experiment_metadata))

Nb exp to remove: 76
Nb exp before: 10085
Nb exp after: 10009


Remove experiments with no known biosamples

In [348]:
# Collect the experiments whose biosamples all lack metadata, then drop them
# from `experiment_metadata` (in place).
to_remove = []
for exp_acc in list(experiment_metadata.keys()):
    biosample_accs = encode_biosample_accessions_dict[exp_acc]

    known_biosamples_accs = set(biosample_accs) - set(missing_biosample_metadata)
    if not known_biosamples_accs:
        to_remove.append(exp_acc)
        print(f"Experiment {exp_acc} has no known biosamples: {biosample_accs}")

print("Number of experiments to remove:", len(to_remove))
print("to_remove:", to_remove)
print("Nb exp before:", len(experiment_metadata))
for acc in to_remove:
    del experiment_metadata[acc]
print("Nb exp after:", len(experiment_metadata))

Known biosamples: set()
Experiment ENCSR988LZG has no known biosamples: ['ENCBS374HUP', 'ENCBS297ZBJ']
Number of experiments to remove: 1
to_remove: ['ENCSR988LZG']
Nb exp before: 10009
Nb exp after: 10008


#### Prepend name to columns from file, experiment vs biosample metadata.

In [349]:
# Every key that appears in at least one record, per metadata level.
file_unique_keys = set().union(
    *(record.keys() for record in encode_file_metadata.values())
)
exp_unique_keys = set().union(
    *(record.keys() for record in encode_exp_metadata.values())
)
biosample_unique_keys = set().union(
    *(record.keys() for record in encode_biosample_metadata.values())
)
biosample_type_unique_keys = set().union(
    *(record.keys() for record in encode_biosample_type_metadata.values())
)

# Keys shared by experiment and biosample records would clash once merged.
overlapping_keys = exp_unique_keys & biosample_unique_keys
if overlapping_keys:
    print("Overlapping keys:", overlapping_keys)

Overlapping keys: {'documents', '@id', 'internal_tags', 'accession', '@type', 'date_created', 'submitted_by', 'status', 'aliases', 'description', 'uuid', 'perturbed', 'references', 'schema_version', 'lab', 'submitter_comment', 'award', 'biosample_ontology', 'dbxrefs', 'notes', 'alternate_accessions'}


In [350]:
# Build one flat record per file: file + experiment + biosample + biosample type
# metadata, each key prefixed with the level it comes from.
full_metadata = copy.deepcopy(encode_file_metadata)
invalid_files = set()

# Biosample type records indexed by their full name
# ("[classification]_[ontology_id]"). The bare term ID drops the classification,
# so types sharing an ontology term would be confused by a term-ID lookup.
biosample_type_by_name = {
    type_record["name"]: type_record
    for type_record in biosample_type_metadata_list
    if "name" in type_record
}

# NOTE(review): experiments listed in `problematic_experiments` are not excluded
# here (the lookup uses `encode_exp_metadata`, not the filtered
# `experiment_metadata`) — confirm that they are filtered elsewhere.
for file_acc, dset_metadata in list(full_metadata.items()):
    exp_acc = dset_metadata["experiment_accession"]

    exp_metadata = copy.deepcopy(encode_exp_metadata[exp_acc])

    biosample_accs = encode_biosample_accessions_dict[exp_acc]
    # A biosample is usable only if its metadata record is actually available.
    known_biosamples_accs = {
        acc
        for acc in biosample_accs
        if acc not in missing_biosample_metadata and acc in encode_biosample_metadata
    }
    if not known_biosamples_accs:
        print(
            f"File {file_acc} with Experiment {exp_acc} has no known biosamples: {biosample_accs}"
        )
        invalid_files.add(file_acc)
        continue

    # Any biosample of the experiment may represent it; taking the smallest
    # accession keeps the choice identical from one run to the next.
    biosample_acc = min(known_biosamples_accs)
    biosample_metadata = copy.deepcopy(encode_biosample_metadata[biosample_acc])

    biosample_ontology = biosample_metadata["biosample_ontology"]
    biosample_term_id = get_biosample_term_id(biosample_ontology)
    biosample_type_name = get_biosample_type(biosample_ontology)
    try:
        biosample_type_metadata = copy.deepcopy(
            biosample_type_by_name[biosample_type_name]
        )
    except KeyError:
        print(
            f"Missing biosample type metadata: {file_acc}-{exp_acc}-{biosample_acc}-{biosample_term_id}"
        )
        biosample_type_metadata = {}

    # create unique names: "<LEVEL>_<key>" for every key of each level
    for unique_meta_labels, str_prepend, dset in zip(
        [
            file_unique_keys,
            exp_unique_keys,
            biosample_unique_keys,
            biosample_type_unique_keys,
        ],
        ["FILE", "EXPERIMENT", "BIOSAMPLE", "BIOSAMPLE_TYPE"],
        [dset_metadata, exp_metadata, biosample_metadata, biosample_type_metadata],
    ):
        for key in unique_meta_labels:
            if key in dset:
                dset[f"{str_prepend}_{key}"] = dset.pop(key)

    dset_metadata.update(exp_metadata)
    dset_metadata.update(biosample_metadata)
    dset_metadata.update(biosample_type_metadata)

    # Keep the complete biosample list of the experiment alongside the merged record.
    dset_metadata["biosamples"] = biosample_accs

# Files without any usable biosample are dropped from the final metadata.
for file_acc in invalid_files:
    del full_metadata[file_acc]

Missing biosample type metadata: ENCFF369ZTS-ENCSR207ZAR-ENCBS253TXN-NTR:0000667
Missing biosample type metadata: ENCFF730ZVW-ENCSR753YVA-ENCBS253TXN-NTR:0000667
Missing biosample type metadata: ENCFF507ALQ-ENCSR948CRE-ENCBS523QRO-NTR:0000668
Missing biosample type metadata: ENCFF781QJF-ENCSR109NVM-ENCBS218KXX-NTR:0000661
Missing biosample type metadata: ENCFF868UKZ-ENCSR053RFF-ENCBS619HZC-NTR:0000662
Missing biosample type metadata: ENCFF531QKY-ENCSR563XBT-ENCBS619HZC-NTR:0000662
Missing biosample type metadata: ENCFF922VJK-ENCSR526IXP-ENCBS261THR-NTR:0000667
Missing biosample type metadata: ENCFF196DNZ-ENCSR630FZF-ENCBS819MUY-NTR:0000667
Missing biosample type metadata: ENCFF554LYV-ENCSR439SOW-ENCBS619HZC-NTR:0000662
Missing biosample type metadata: ENCFF827TJV-ENCSR473VWE-ENCBS819MUY-NTR:0000667
Missing biosample type metadata: ENCFF118NTF-ENCSR617DUK-ENCBS261THR-NTR:0000667
Missing biosample type metadata: ENCFF953JLS-ENCSR453MVF-ENCBS261THR-NTR:0000667
Missing biosample type metad

Remove line breaks from values; they made the saved file difficult to read.

In [351]:
full_metadata_df = pd.DataFrame(list(full_metadata.values()))
# Columns in which at least one value, once converted to text, contains a line break.
cols_with_line_breaks = {
    col
    for col in full_metadata_df.columns
    if full_metadata_df[col].astype(str).str.contains("\n", regex=False).any()
}

In [352]:
# Replace each line break by ";" in the affected cells (in place).
for col in cols_with_line_breaks:
    for idx, value in full_metadata_df[col].items():
        if "\n" not in str(value):
            continue
        full_metadata_df.at[idx, col] = value.replace("\n", ";")

Finally, save the combined metadata

In [353]:
full_metadata_path = (
    encode_metadata_dir / "new_meta" / "encode_full_metadata_2025-02.json"
)

# The JSON file acts as a cache: it is written (with a CSV copy) only when absent.
# When it already exists, the DataFrame built above is discarded and replaced by
# the file content, and `full_metadata` is rebound to the loaded JSON data.
# NOTE(review): upstream changes are therefore ignored until the file is deleted.
if not full_metadata_path.exists():
    full_metadata_df.to_json(full_metadata_path, orient="records", indent=2)
    full_metadata_df.to_csv(full_metadata_path.with_suffix(".csv"), sep=",", index=False)
else:
    with open(full_metadata_path, "r", encoding="utf-8") as f:
        full_metadata = json.load(f)
        full_metadata_df = pd.DataFrame(full_metadata)

In [354]:
# (number of files, number of metadata columns)
print(full_metadata_df.shape)

(11410, 197)


In [355]:
# Each row is expected to describe a distinct file. Stop the run on a violation
# instead of only printing a message that is easy to overlook.
if full_metadata_df.shape[0] != full_metadata_df["FILE_accession"].nunique():
    raise ValueError("MAJOR ERROR: FILE_accession values are not unique (or some are missing).")

### Check relevant metadata categories

In [356]:
# Columns whose lower-cased name contains one of these fragments are inspected.
name_fragments = [
    "cancer",
    "health",
    "status",
    "life",
    "sex",
    "biosample_ontology",
    "name",
]
potential_cols = [
    col
    for col in full_metadata_df.columns
    if any(label in col.lower() for label in name_fragments)
]
for col in potential_cols:
    print(col)

for col in potential_cols:
    display(full_metadata_df[col].value_counts(dropna=False))

FILE_assay_term_name
FILE_status
FILE_biosample_ontology
FILE_submitted_file_name
EXPERIMENT_assay_term_name
EXPERIMENT_biosample_ontology
EXPERIMENT_status
EXPERIMENT_life_stage_age
EXPERIMENT_internal_status
BIOSAMPLE_life_stage
BIOSAMPLE_status
BIOSAMPLE_biosample_ontology
BIOSAMPLE_sex
BIOSAMPLE_TYPE_name
BIOSAMPLE_TYPE_status
BIOSAMPLE_TYPE_term_name
BIOSAMPLE_health_status
BIOSAMPLE_disease_term_name
BIOSAMPLE_subcellular_fraction_term_name


FILE_assay_term_name
ChIP-seq              8631
RNA-seq               1188
Mint-ChIP-seq          987
polyA plus RNA-seq     604
Name: count, dtype: int64

FILE_status
released    11353
archived       40
revoked        17
Name: count, dtype: int64

FILE_biosample_ontology
/biosample-types/cell_line_EFO_0002067/      1196
/biosample-types/cell_line_EFO_0001187/      1015
/biosample-types/cell_line_EFO_0001086/       550
/biosample-types/tissue_UBERON_0009834/       549
/biosample-types/cell_line_EFO_0002784/       293
                                             ... 
/biosample-types/primary_cell_CL_0000121/       1
/biosample-types/tissue_UBERON_0003662/         1
/biosample-types/tissue_UBERON_0004538/         1
/biosample-types/cell_line_EFO_0002324/         1
/biosample-types/tissue_UBERON_0003663/         1
Name: count, Length: 402, dtype: int64

FILE_submitted_file_name
None                                                                                                                                                                                                                                                              888
gs://encode-processing/caper_out_v04_05/chip/995c7d12-d738-4d65-b752-81b76d940336/call-macs2_signal_track_pooled/glob-7ab0340dfeb10ca109917cbdcc568548/rep.pooled_x_ctl.pooled.pval.signal.bigwig                                                                   1
gs://encode-processing/caper_out_v04_05/rna/32b8d0f6-8d39-4dff-9f47-df576faf22d8/call-bam_to_signals/shard-0/glob-b1f717d7f19cd2629e443e4a3f43f8de/rep1ENCSR995BHD_genome_minusUniq.bw                                                                              1
gs://encode-processing/caper_out_v04_05/rna/e6fbbdd2-c0fa-4e20-adc1-4f1d454b55d5/call-bam_to_signals/shard-0/glob-b1f717d7f19cd2629e443e4a3f43f8de/rep1ENCSR043RSE_genome_minusUniq.bw       

EXPERIMENT_assay_term_name
ChIP-seq              8631
RNA-seq               1188
Mint-ChIP-seq          987
polyA plus RNA-seq     604
Name: count, dtype: int64

EXPERIMENT_biosample_ontology
/biosample-types/cell_line_EFO_0002067/      1196
/biosample-types/cell_line_EFO_0001187/      1015
/biosample-types/cell_line_EFO_0001086/       550
/biosample-types/tissue_UBERON_0009834/       549
/biosample-types/cell_line_EFO_0002784/       293
                                             ... 
/biosample-types/primary_cell_CL_0000121/       1
/biosample-types/tissue_UBERON_0003662/         1
/biosample-types/tissue_UBERON_0004538/         1
/biosample-types/cell_line_EFO_0002324/         1
/biosample-types/tissue_UBERON_0003663/         1
Name: count, Length: 402, dtype: int64

EXPERIMENT_status
released    11404
revoked         6
Name: count, dtype: int64

EXPERIMENT_life_stage_age
None                  2483
adult 53 years        1655
child 15 years        1028
adult 58 years         555
adult 51 years         316
                      ... 
embryonic 112 days       1
embryonic 117 days       1
embryonic 107 days       1
child 11 years           1
embryonic 87 days        1
Name: count, Length: 121, dtype: int64

EXPERIMENT_internal_status
release ready            11241
pipeline completed         150
pipeline error              10
no available pipeline        7
unreviewed                   2
Name: count, dtype: int64

BIOSAMPLE_life_stage
adult        7058
embryonic    1612
child        1448
unknown      1076
newborn       216
Name: count, dtype: int64

BIOSAMPLE_status
released    11410
Name: count, dtype: int64

BIOSAMPLE_biosample_ontology
/biosample-types/cell_line_EFO_0002067/      1196
/biosample-types/cell_line_EFO_0001187/      1015
/biosample-types/cell_line_EFO_0001086/       550
/biosample-types/tissue_UBERON_0009834/       549
/biosample-types/cell_line_EFO_0002784/       293
                                             ... 
/biosample-types/primary_cell_CL_0000121/       1
/biosample-types/tissue_UBERON_0003662/         1
/biosample-types/tissue_UBERON_0004538/         1
/biosample-types/cell_line_EFO_0002324/         1
/biosample-types/tissue_UBERON_0003663/         1
Name: count, Length: 402, dtype: int64

BIOSAMPLE_sex
female     5353
male       5219
unknown     838
Name: count, dtype: int64

BIOSAMPLE_TYPE_name
cell_line_EFO_0002067      1196
cell_line_EFO_0001187      1015
cell_line_EFO_0001086       550
tissue_UBERON_0009834       549
cell_line_EFO_0002784       293
                           ... 
cell_line_EFO_0002324         1
cell_line_EFO_0005237         1
tissue_UBERON_0004538         1
tissue_UBERON_0003662         1
primary_cell_CL_0000121       1
Name: count, Length: 395, dtype: int64

BIOSAMPLE_TYPE_status
released    11369
None           41
Name: count, dtype: int64

BIOSAMPLE_TYPE_term_name
K562                              1196
HepG2                             1015
A549                               550
dorsolateral prefrontal cortex     549
GM12878                            293
                                  ... 
hindlimb muscle                      1
left kidney                          1
Purkinje cell                        1
forelimb muscle                      1
U-87 MG                              1
Name: count, Length: 394, dtype: int64

BIOSAMPLE_health_status
None                                                                                     4020
healthy                                                                                  1775
chronic myelogenous leukemia (CML)                                                       1196
hepatocellular carcinoma                                                                 1015
unknown                                                                                   988
                                                                                         ... 
neuroglioma                                                                                 2
clinically normal; monozygotic twin sister with Cornelia De Lange syndrome is GM13977       2
malignant glioblastoma                                                                      2
medulloblastoma                                                                             2
Burkitt's lymphoma                  

BIOSAMPLE_disease_term_name
None                                           10814
[multiple sclerosis]                             172
[Alzheimer's disease]                            161
[mild cognitive impairment]                      131
[Cognitive impairment]                            28
[nonobstructive coronary artery disease]          28
[Alzheimer's disease, Cognitive impairment]       19
[amyotrophic lateral sclerosis]                   18
[squamous cell carcinoma]                         15
[basal cell carcinoma]                            14
[Cognitive impairment, Alzheimer's disease]       10
Name: count, dtype: int64

BIOSAMPLE_subcellular_fraction_term_name
None                              11274
nucleus                              74
cytosol                              38
membrane                              8
insoluble cytoplasmic fraction        8
nucleolus                             4
chromatin                             2
nucleoplasm                           2
Name: count, dtype: int64

Sex and life stage metadata categories are already available; let's format them.

In [357]:
# Copy the biosample-level values into new donor-level columns (values unchanged).
full_metadata_df["donor_life_stage"] = full_metadata_df["BIOSAMPLE_life_stage"]
full_metadata_df["donor_sex"] = full_metadata_df["BIOSAMPLE_sex"]

Note: must remove revoked files/experiments.

In [358]:
new_output_name = "encode_full_metadata_2025-02_no_revoked.csv"
new_output_path = full_metadata_path.with_name(new_output_name)
# The previous output is deleted first, so the guard below holds and the file
# is regenerated on each run of this cell.
new_output_path.unlink(missing_ok=True)
if not new_output_path.exists():
    N_before = full_metadata_df.shape[0]
    # Keep rows whose file, experiment and biosample status are all non-revoked.
    status_cols = [f"{cat_type}_status" for cat_type in ["FILE", "EXPERIMENT", "BIOSAMPLE"]]
    not_revoked = full_metadata_df[status_cols].ne("revoked").all(axis=1)
    full_metadata_df = full_metadata_df[not_revoked]
    N_after = full_metadata_df.shape[0]

    print(f"Removed {N_before - N_after} revoked entries")

    full_metadata_df.to_csv(new_output_path, index=False)

Removed 17 revoked entries


In [359]:
# Columns whose lower-cased name mentions the assay, its target or the antibody.
potential_cols = [
    col
    for col in full_metadata_df.columns
    if any(label in col.lower() for label in ["assay", "target", "antibody"])
]

for col in potential_cols:
    display(full_metadata_df[col].value_counts(dropna=False))

FILE_assay_term_name
ChIP-seq              8614
RNA-seq               1188
Mint-ChIP-seq          987
polyA plus RNA-seq     604
Name: count, dtype: int64

FILE_target
None                        3773
/targets/H3K4me3-human/      705
/targets/H3K27ac-human/      623
/targets/H3K27me3-human/     585
/targets/H3K4me1-human/      556
                            ... 
/targets/E2F7-human/           1
/targets/GFI1-human/           1
/targets/PPARD-human/          1
/targets/NUFIP1-human/         1
/targets/ZNF205-human/         1
Name: count, Length: 1179, dtype: int64

FILE_assay_title
TF ChIP-seq           3559
Histone ChIP-seq      3074
Control ChIP-seq      1981
total RNA-seq         1188
Mint-ChIP-seq          987
polyA plus RNA-seq     604
Name: count, dtype: int64

EXPERIMENT_assay_term_id
OBI:0000716    8614
OBI:0001271    1188
OBI:0002160     987
OBI:0002571     604
Name: count, dtype: int64

EXPERIMENT_assay_slims
[DNA binding]      9601
[Transcription]    1792
Name: count, dtype: int64

EXPERIMENT_assay_term_name
ChIP-seq              8614
RNA-seq               1188
Mint-ChIP-seq          987
polyA plus RNA-seq     604
Name: count, dtype: int64

EXPERIMENT_target
None                        3773
/targets/H3K4me3-human/      705
/targets/H3K27ac-human/      623
/targets/H3K27me3-human/     585
/targets/H3K4me1-human/      556
                            ... 
/targets/E2F7-human/           1
/targets/GFI1-human/           1
/targets/PPARD-human/          1
/targets/NUFIP1-human/         1
/targets/ZNF205-human/         1
Name: count, Length: 1179, dtype: int64

EXPERIMENT_assay_title
TF ChIP-seq           3559
Histone ChIP-seq      3074
Control ChIP-seq      1981
total RNA-seq         1188
Mint-ChIP-seq          987
polyA plus RNA-seq     604
Name: count, dtype: int64

In [360]:
# Files without a target; `.copy()` makes this an independent frame so that
# columns can be added to it later.
unknown_target_df = full_metadata_df[full_metadata_df["FILE_target"].isnull()].copy()
for col in potential_cols:
    display(unknown_target_df[col].value_counts(dropna=False))

FILE_assay_term_name
ChIP-seq              1981
RNA-seq               1188
polyA plus RNA-seq     604
Name: count, dtype: int64

FILE_target
None    3773
Name: count, dtype: int64

FILE_assay_title
Control ChIP-seq      1981
total RNA-seq         1188
polyA plus RNA-seq     604
Name: count, dtype: int64

EXPERIMENT_assay_term_id
OBI:0000716    1981
OBI:0001271    1188
OBI:0002571     604
Name: count, dtype: int64

EXPERIMENT_assay_slims
[DNA binding]      1981
[Transcription]    1792
Name: count, dtype: int64

EXPERIMENT_assay_term_name
ChIP-seq              1981
RNA-seq               1188
polyA plus RNA-seq     604
Name: count, dtype: int64

EXPERIMENT_target
None    3773
Name: count, dtype: int64

EXPERIMENT_assay_title
Control ChIP-seq      1981
total RNA-seq         1188
polyA plus RNA-seq     604
Name: count, dtype: int64

Note: No EXPERIMENT_target means RNA-seq or input.

### Create 'assay' and 'assay_epiclass' categories

In [361]:
# Labels from ASSAY_ORDER plus three additional assay labels.
all_core_assays = set(ASSAY_ORDER) | {"mrna_seq", "wgbs_standard", "wgbs_pbat"}
print(all_core_assays)

{'h3k4me3', 'wgbs_pbat', 'h3k27ac', 'rna_seq', 'wgbs', 'h3k27me3', 'input', 'mrna_seq', 'h3k36me3', 'wgbs_standard', 'h3k9me3', 'h3k4me1'}


In [362]:
# Assay label of target-less files, derived from the experiment assay title.
# Titles absent from the mapping become NaN in the new "assay" column.
no_target_mapping = {
    "Control ChIP-seq": "input",
    "total RNA-seq": "rna_seq",
    "polyA plus RNA-seq": "mrna_seq",
}
unknown_target_df["assay"] = unknown_target_df["EXPERIMENT_assay_title"].map(
    no_target_mapping
)
display(unknown_target_df["assay"].value_counts(dropna=False))

assay
input       1981
rna_seq     1188
mrna_seq     604
Name: count, dtype: int64

In [363]:
# Complement of `unknown_target_df`: files that do have a target (independent copy).
known_target_df = full_metadata_df[~full_metadata_df["FILE_target"].isnull()].copy()

Sanity check: are all targets marked as human?

In [364]:
# Targets whose identifier does not mention "human", with their number of files.
non_human_target = [
    val
    for val in known_target_df["FILE_target"].value_counts(dropna=False).keys()
    if "human" not in val
]

for val in non_human_target:
    n_files = (known_target_df["FILE_target"] == val).sum()
    print(f"{val}: {n_files} files")

/targets/Cebpa-rat/: 12 files


In [365]:
# List every file whose target mentions CEBPA (any case), one tab-separated row each.
cebpa_rows = known_target_df["FILE_target"].str.lower().str.contains("cebpa")
cebpa_cols = ["FILE_accession", "EXPERIMENT_accession", "BIOSAMPLE_accession", "FILE_target"]
for vals in known_target_df.loc[cebpa_rows, cebpa_cols].values:
    print("\t".join(vals))

ENCFF381JQO	ENCSR142IGM	ENCBS448PNJ	/targets/CEBPA-human/
ENCFF556MUP	ENCSR827TOM	ENCBS696VSU	/targets/Cebpa-rat/
ENCFF077NMD	ENCSR490DXW	ENCBS554PIB	/targets/Cebpa-rat/
ENCFF493RKY	ENCSR195PUH	ENCBS768BYO	/targets/Cebpa-rat/
ENCFF270PHI	ENCSR334SSD	ENCBS550RXJ	/targets/Cebpa-rat/
ENCFF232PZE	ENCSR917FJW	ENCBS344FXJ	/targets/Cebpa-rat/
ENCFF933YBT	ENCSR400YGT	ENCBS699CMK	/targets/Cebpa-rat/
ENCFF175OUN	ENCSR867ARL	ENCBS276SDE	/targets/Cebpa-rat/
ENCFF801ICZ	ENCSR548ADW	ENCBS929MQK	/targets/Cebpa-rat/
ENCFF511QZU	ENCSR368EZK	ENCBS439JQP	/targets/Cebpa-rat/
ENCFF248HWL	ENCSR236DYB	ENCBS101RZA	/targets/Cebpa-rat/
ENCFF631BOR	ENCSR288MRA	ENCBS054NUM	/targets/Cebpa-rat/
ENCFF381MBI	ENCSR377EZA	ENCBS732KGE	/targets/Cebpa-rat/


C'est en effet inhabituel ; c'est possiblement une erreur d'annotation, ou encore l'anticorps utilisé pour faire le ChIP reconnaît à la fois la protéine humaine et celle du rat, car les séquences de protéines sont ~95 % identiques. Le détail important est que l'expérience a été faite dans des cellules humaines, ce qui est le cas ici ; je propose donc de simplement ignorer la présence de « rat » et de combiner les résultats.

In [366]:
# FILE_target has the form "/targets/<label>-<organism>/", e.g. "/targets/CEBPA-human/".
# The assay label is the lower-cased <label> part.
# The organism suffix is removed by splitting on the LAST hyphen only: splitting
# on the first hyphen would truncate any label that itself contains a hyphen
# (a target written "/targets/AB-1-human/" would become "ab" instead of "ab-1").
known_target_df["assay"] = (
    known_target_df["FILE_target"]
    .str.split(r"/targets/", expand=True)[1]  # -> "<label>-<organism>/"
    .str.rsplit("-", n=1)  # -> ["<label>", "<organism>/"]
    .str[0]
    .str.lower()
)
display(known_target_df["assay"].value_counts(dropna=False))

assay
h3k4me3     705
h3k27ac     623
h3k27me3    585
h3k4me1     556
h3k36me3    554
           ... 
atm           1
znf219        1
h4k12ac       1
znf788        1
znf621        1
Name: count, Length: 1176, dtype: int64

In [367]:
# Stack the targeted and target-less subsets back into one table, now that both
# carry an "assay" column.
labelled_subsets = [known_target_df, unknown_target_df]
full_metadata_df = pd.concat(labelled_subsets, axis="index")

In [368]:
# Labels that keep their own name in the ASSAY column; every other label
# (including missing ones) is pooled under "non-core".
unique_names = [*all_core_assays, "ctcf"]
assay_labels = full_metadata_df["assay"]
is_named_assay = assay_labels.isin(unique_names)
# Assigned as a plain list so values are placed by position, not by index alignment.
full_metadata_df[ASSAY] = assay_labels.where(is_named_assay, "non-core").to_list()

assay_epiclass_counts = full_metadata_df[ASSAY].value_counts(dropna=False)
display(assay_epiclass_counts)

assay_epiclass
non-core    3593
input       1981
rna_seq     1188
h3k4me3      705
h3k27ac      623
mrna_seq     604
h3k27me3     585
h3k4me1      556
h3k36me3     554
h3k9me3      536
ctcf         468
Name: count, dtype: int64

### Add 'sample_ontology' category

In [369]:
# Expose the biosample classification under the column name "biomaterial_type".
biosample_classification = full_metadata_df["BIOSAMPLE_TYPE_classification"]
full_metadata_df["biomaterial_type"] = biosample_classification

In [370]:
# Tab-separated table linking each biosample term id to its name and to the
# EpiClass sample ontology label. Column names are supplied here, so every
# line of the file is read as data.
curie_def_path = encode_metadata_dir / "EpiAtlas_list-curie_term_HSOI.tsv"
curie_def_columns = ["biosample_term_id", "biosample_term_name", "epiclass_sample_ontology"]
curie_def_df = pd.read_csv(curie_def_path, sep="\t", names=curie_def_columns)

In [371]:
# Attach the EpiClass sample ontology to every file through its biosample term id.
# A left join keeps all metadata rows; rows with no matching term id get NaN.
new_df = full_metadata_df.merge(
    right=curie_def_df[["biosample_term_id", "epiclass_sample_ontology"]],
    left_on="BIOSAMPLE_TYPE_term_id",
    right_on="biosample_term_id",
    how="left",
)

# Guard: a term id listed more than once in the curie table would silently
# duplicate the matching files, so the row count must be unchanged by the join.
if len(new_df) != len(full_metadata_df):
    raise ValueError(
        f"Ontology merge changed the row count ({len(full_metadata_df)} -> {len(new_df)}): "
        "duplicated 'biosample_term_id' in the curie definition table?"
    )

In [372]:
# "biosample_term_id" only served as the right-hand merge key; discard it.
new_df = new_df.drop("biosample_term_id", axis="columns")
# Mirror the ontology label under the project's CELL_TYPE column name.
new_df[CELL_TYPE] = new_df["epiclass_sample_ontology"]

### Add 'in_epiatlas' category

In [373]:
# A file is flagged as EpiATLAS when its experiment belongs to a
# "reference-epigenomes" series AND its assay label is one of ASSAY_ORDER.
is_reference_epigenome = (
    new_df["EXPERIMENT_related_series"]
    .astype(str)  # stringify so every cell can be searched as text
    .str.contains("reference-epigenomes", regex=False)
)
# NOTE(review): "mrna_seq" is added to all_core_assays separately from ASSAY_ORDER;
# if ASSAY_ORDER does not contain it, polyA plus RNA-seq files are never flagged
# here. Confirm this is intended.
has_assay_order_label = new_df[ASSAY].isin(ASSAY_ORDER)
new_df["in_epiatlas"] = is_reference_epigenome & has_assay_order_label

Sanity check: does belonging to a reference epigenome always mean the dataset is part of IHEC?

Yes, see the ENCODE reference epigenome profile: https://www.encodeproject.org/profiles/reference_epigenome

In [374]:
# Tally of files flagged as EpiATLAS versus not (missing values would be listed too).
in_epiatlas_counts = new_df["in_epiatlas"].value_counts(dropna=False)
in_epiatlas_counts

in_epiatlas
False    8679
True     2714
Name: count, dtype: int64

Final save

In [375]:
# Write the re-created metadata next to the original full metadata file,
# as a comma-separated file without the index column.
output_name = "encode_full_metadata_2025-02_no_revoked.csv"
output_path = full_metadata_path.parent / output_name
new_df.to_csv(output_path, index=False)