In [4]:
"""Workbooks to analyze metadata differences."""

'Workbooks to analyze metadata differences.'

In [5]:
from __future__ import annotations

import io
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd
import requests

from epi_ml.core.metadata import Metadata
from epi_ml.utils.notebooks.paper.paper_utilities import (
    BIOMATERIAL_TYPE,
    CELL_TYPE,
    DISEASE,
    LIFE_STAGE,
    SEX,
)

In [6]:
# Root folder of the paper outputs, located under the current user's home directory.
paper_dir = Path.home() / "Projects" / "epiclass" / "output" / "paper"

# Metadata inputs and generated tables are sub-folders of the paper root.
paper_meta_dir = paper_dir.joinpath("data", "metadata")
table_dir = paper_dir.joinpath("tables")

### Our training metadata VS official metadata

#### Metadata we use for training

In [7]:
# Load the metadata used for training. `files_df` stays untouched; the
# reduced `training_df` below is derived from a deep copy of it.
path = paper_meta_dir / "hg38_2023-epiatlas-dfreeze-pospurge-nodup_filterCtl.json"
training_metadata = Metadata(path)
files_df = training_metadata.to_df()

# Reduce to the EpiRR level (biological samples only, not track types or
# assays): keep the first row encountered for each versionless EpiRR id.
training_df = files_df.copy(deep=True).drop_duplicates(
    subset=["epirr_id_without_version"]
)
my_epirrs = set(training_df["epirr_id_without_version"].unique())

In [8]:
# Metadata categories that are compared against the official releases.
relevants_cols = [CELL_TYPE, BIOMATERIAL_TYPE, SEX, DISEASE, LIFE_STAGE]

# Index by versionless EpiRR id, then keep only the compared categories.
training_df = training_df.set_index("epirr_id_without_version")[relevants_cols]

#### Official metadata

In [9]:
# Official harmonized metadata, one DataFrame per release that was
# successfully downloaded (key: version string such as "v1.1").
dfs = {}

url_template = "https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefine/{version}/IHEC_metadata_harmonization.{version}.extended.csv"
for version in ["v1.0", "v1.1", "v1.2", "v1.3"]:
    myurl = url_template.format(version=version)
    print(f"Downloading version {version}: {myurl}")

    try:
        # A timeout keeps the cell from hanging forever on a stalled connection.
        response = requests.get(myurl, timeout=60)
        response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {myurl}: {e}")
        # Skip this version: storing anything here would register the frame
        # left over from the previous iteration under the wrong version key.
        # A later lookup of a skipped version then fails loudly instead of
        # silently reading another release's data.
        continue

    # Load file as a DataFrame
    df = pd.read_csv(io.StringIO(response.content.decode("utf-8")))
    dfs[version] = df

Downloading version v1.0: https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefine/v1.0/IHEC_metadata_harmonization.v1.0.extended.csv
Downloading version v1.1: https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefine/v1.1/IHEC_metadata_harmonization.v1.1.extended.csv
Downloading version v1.2: https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefine/v1.2/IHEC_metadata_harmonization.v1.2.extended.csv
Downloading version v1.3: https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefine/v1.3/IHEC_metadata_harmonization.v1.3.extended.csv
Error downloading https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefine/v1.3/IHEC_metadata_harmonization.v1.3.extended.csv: 404 Client Error: Not Found for url: https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefi

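Modify the official dataframes to match our training metadata: index them by EpiRR ID without version and replace missing values with "unknown".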

In [10]:
# Give every official frame the same layout as the training metadata.
for v, df in dfs.items():
    # Keep what precedes the first "." of the EpiRR value.
    epirr_without_version = df["EpiRR"].str.split(".").str.get(0)
    df["epirr_id_without_version"] = epirr_without_version

    # Replacing the value of an existing key while iterating is safe:
    # the set of keys of `dfs` does not change.
    df = df.set_index("epirr_id_without_version").fillna("unknown")
    dfs[v] = df

In [11]:
# unknown_ls_epirrs = training_df[(training_df[LIFE_STAGE] == "unknown") & (training_df[BIOMATERIAL_TYPE] == "cell line")].index.unique()
# unknown_ls_epirrs = pd.Series(unknown_ls_epirrs)
# unknown_ls_epirrs.to_csv(paper_meta_dir / "training_metadata_unknown_LS_cell_line.list", index=False, header=False)

#### Creating json of differences (VS v1.0/v1.1)

In [12]:
# category -> set of EpiRR ids whose training value differs from at least
# one of the compared official versions.
problematic_idxs = defaultdict(set)

# The official frames are restricted to our EpiRRs and put in the same row
# order as `training_df`; this does not depend on the category, so it is
# prepared once per version.
official_aligned = {}
for version in ["v1.0", "v1.1"]:
    official = dfs[version]
    official = official[official.index.isin(my_epirrs)]
    official_aligned[version] = official.loc[training_df.index]

for cat in relevants_cols:
    for version, meta in official_aligned.items():
        # Element-wise comparison; rows where the two sources disagree are True.
        diff = meta[cat] != training_df[cat]
        diff_idxs = diff[diff].index

        if len(diff_idxs) > 0:
            problematic_idxs[cat].update(diff_idxs)

In [13]:
# Official versions reported next to the training value. A version that is
# absent from `dfs` is announced and left out instead of raising a bare KeyError.
official_versions = ["v1.0", "v1.1", "v1.2", "v1.3"]
loaded_versions = [v for v in official_versions if v in dfs]
skipped_versions = [v for v in official_versions if v not in dfs]
if skipped_versions:
    print(f"Official versions not loaded, left out of the comparison: {skipped_versions}")

# category -> {EpiRR id -> {"training": value, "<version>-official": value, ...}}
# `.get` is used on purpose: indexing the `problematic_idxs` defaultdict with
# a category that has no differences would insert an empty set for it and
# make that category look like it was flagged.
all_changes = {col: {} for col in relevants_cols if problematic_idxs.get(col)}
for col, col_changes in all_changes.items():
    # Sorted so the resulting mapping (and any dump of it) has a stable order.
    for idx in sorted(problematic_idxs[col]):
        values = {"training": training_df.loc[idx, col]}
        for v in loaded_versions:
            values[f"{v}-official"] = dfs[v].loc[idx, col]
        col_changes[idx] = values

In [14]:
# Report, per category, how many EpiRRs differ from the official metadata.
for col in relevants_cols:
    # `.get` never inserts a key into the defaultdict, and a category whose
    # set is empty has no differences: report it as such rather than as
    # "Changes in ...: 0".
    n_changes = len(problematic_idxs.get(col, ()))
    if n_changes:
        print(f"Changes in {col}: {n_changes}")
    else:
        print(f"No changes in {col}")

Changes in harmonized_sample_ontology_intermediate: 29
Changes in harmonized_biomaterial_type: 6
Changes in harmonized_donor_sex: 0
Changes in harmonized_sample_disease_high: 14
Changes in harmonized_donor_life_stage: 0


In [15]:
# Persist the differences. `allow_nan=False` makes the dump fail on NaN or
# infinite floats instead of writing non-standard JSON.
filename = "training_metadata_vs_official.json"
path = paper_meta_dir.joinpath(filename)

with path.open(mode="w", encoding="utf8") as f:
    json.dump(obj=all_changes, fp=f, indent=4, allow_nan=False)

In [16]:
# EpiRRs whose training value differs from the v1.1 official value in at
# least one category.
diff_epirrs = set()
for cat_label, diff_dict in all_changes.items():
    print(cat_label)
    for epirr in sorted(diff_dict):
        values_dict = diff_dict[epirr]
        training_val, official_val, v1_0_val, v1_2_val = (
            values_dict[key]
            for key in ("training", "v1.1-official", "v1.0-official", "v1.2-official")
        )
        # v1.1 is the reference; v1.0 and v1.2 are only shown for context.
        if training_val != official_val:
            print(
                f"{epirr}: {training_val} != {official_val} (v1.0={v1_0_val}, v1.2={v1_2_val})"
            )
            diff_epirrs.add(epirr)

print(f"Unique EpiRRs with changes: {len(diff_epirrs)}")

harmonized_sample_ontology_intermediate
IHECRE00003725: neural progenitor cell != stem cell derived cell line (v1.0=neural progenitor cell, v1.2=stem cell derived cell line)
IHECRE00003726: neural cell != stem cell derived cell line (v1.0=unknown, v1.2=stem cell derived cell line)
IHECRE00003728: neural progenitor cell != stem cell derived cell line (v1.0=neural progenitor cell, v1.2=stem cell derived cell line)
IHECRE00003729: neural cell != stem cell derived cell line (v1.0=unknown, v1.2=stem cell derived cell line)
harmonized_biomaterial_type
IHECRE00003724: primary cell != cell line (v1.0=primary cell, v1.2=cell line)
IHECRE00003725: primary cell != cell line (v1.0=primary cell, v1.2=cell line)
IHECRE00003726: primary cell != cell line (v1.0=primary cell, v1.2=cell line)
IHECRE00003727: primary cell != cell line (v1.0=primary cell, v1.2=cell line)
IHECRE00003728: primary cell != cell line (v1.0=primary cell, v1.2=cell line)
IHECRE00003729: primary cell != cell line (v1.0=primary ce

### Sanity check: SEX v1.2 = SEX v1.3

In [17]:
# Local copies of the official metadata releases.
official_metadata_dir = Path.home().joinpath(
    "Projects", "epiclass", "output", "paper", "data", "metadata", "official"
)

# version -> DataFrame read from the matching local CSV.
official_metadata_dfs = {}
for version in ("v1.1", "v1.2", "v1.3"):
    csv_name = f"IHEC_metadata_harmonization.{version}.extended.csv"
    path = official_metadata_dir / csv_name
    df = pd.read_csv(path, sep=",")
    official_metadata_dfs[version] = df

In [18]:
# `SEX` is the constant imported from `paper_utilities` at the top of the
# notebook. Check its value instead of re-declaring it, so this section
# cannot silently shadow (or drift away from) the shared constant.
assert SEX == "harmonized_donor_sex"

# Table of samples flagged as sex-mislabeled.
sex_mislabels_path = (
    official_metadata_dir / "BadQual-mislabels" / "official_Sex_mislabeled.csv"
)
sex_mislabels_df = pd.read_csv(sex_mislabels_path, sep=",")

In [19]:
# NOTE(review): `sex_epirrs` is not read by any of the visible cells.
sex_epirrs = {}
# Start from the mislabel table and merge in the sex column of each official
# version, one version per iteration.
subset_df = sex_mislabels_df
for version, df in official_metadata_dfs.items():
    relevant_df = df.loc[:, ["epirr_id_without_version", SEX]]
    # Right join: every row accumulated so far in `subset_df` is kept, whether
    # or not the official frame has a matching EpiRR.
    # Suffixes: only incoming (left) columns whose name collides with a column
    # already in `subset_df` receive `_{version}`; accumulated columns keep
    # their name. Presumably the mislabel table already carries a `SEX`
    # column, so each version's value ends up in `{SEX}_{version}` — confirm
    # against the CSV.
    subset_df = relevant_df.merge(
        subset_df,
        left_on="epirr_id_without_version",
        right_on="EpiRR_no-v",
        how="right",
        suffixes=(f"_{version}", ""),
    )

In [20]:
# Remove the join-key columns brought in from the official frames, then the
# un-suffixed sex column, leaving the per-version sex columns.
epirr_key_cols = [name for name in subset_df.columns if name.startswith("epirr_id")]
subset_df = subset_df.drop(columns=epirr_key_cols).drop(columns=[SEX])

In [21]:
# No row of the mislabel subset may have a different sex in v1.3 than in v1.2.
sex_changed_in_subset = subset_df[f"{SEX}_v1.3"] != subset_df[f"{SEX}_v1.2"]
assert not sex_changed_in_subset.any()

In [22]:
# Pair every EpiRR present in both v1.2 and v1.3; columns shared by the two
# releases get a `_v1.2` / `_v1.3` suffix.
official_v1_2 = official_metadata_dfs["v1.2"]
official_v1_3 = official_metadata_dfs["v1.3"]
merged_df = pd.merge(
    official_v1_2,
    official_v1_3,
    how="inner",
    on="epirr_id_without_version",
    suffixes=("_v1.2", "_v1.3"),
)

In [23]:
# Same check on the full v1.2/v1.3 overlap: no EpiRR may change sex.
sex_changed_in_merged = merged_df[f"{SEX}_v1.3"] != merged_df[f"{SEX}_v1.2"]
assert not sex_changed_in_merged.any()

### Sanity check: How many RNA Unique_raw tracks (unstranded data) are in the training metadata

In [25]:
# Report, for every prediction table under `table_dir`, how many of its
# md5sums belong to "Unique_raw" tracks of the training metadata.
outfile = table_dir / "experiments_including_unique_raw_files.list"

v2_meta_df = files_df
# presumably `files_df` is indexed by md5sum — TODO confirm against `Metadata.to_df`
md5_unique_raw = v2_meta_df[v2_meta_df["track_type"] == "Unique_raw"].index.unique()

# Opening in "w" mode creates or truncates the report, so no prior
# unlink/touch is needed.
with outfile.open("w", encoding="utf-8") as out:
    print(f"Total Unique_raw md5sums: {len(md5_unique_raw)}", file=out)
    # Sorted so the report lists the files in the same order on every run.
    for pred_file in sorted(table_dir.rglob("*pred*.csv")):
        pred_file_relpath = pred_file.relative_to(table_dir)

        # Match the excluded datasets on the path below `table_dir` only, so
        # that a parent folder name can never exclude every file.
        if any(label in str(pred_file_relpath) for label in ["recount3", "encode"]):
            continue
        df = pd.read_csv(pred_file, sep=",", low_memory=False)

        # Get md5sums: from the dedicated column when present, otherwise from
        # the index when it holds 32-character strings.
        if "md5sum" in df.columns:
            md5sums = set(df["md5sum"])
        elif (
            len(df.index) > 0  # an empty table has no index[0] to inspect
            and isinstance(df.index[0], str)
            and len(df.index[0]) == 32
        ):
            md5sums = set(df.index)
        else:
            print(f"Could not find md5sum column in {pred_file}", file=out)
            continue

        shared_md5sums = md5sums.intersection(md5_unique_raw)

        print(f"{pred_file_relpath}: {len(shared_md5sums)}", file=out)