In [1]:
"""Workbooks to analyze metadata differences."""

'Workbooks to analyze metadata differences.'

In [2]:
from __future__ import annotations

import io
import json
from collections import defaultdict
from pathlib import Path

import pandas as pd
import requests
from IPython.display import display

from epi_ml.core.metadata import Metadata
from epi_ml.utils.notebooks.paper.paper_utilities import (
    ASSAY,
    BIOMATERIAL_TYPE,
    CELL_TYPE,
    DISEASE,
    LIFE_STAGE,
    SEX,
)

In [3]:
# Root of the paper outputs, located under the current user's home directory.
paper_dir = Path.home() / "Projects/epiclass/output/paper"

# Metadata inputs: our own files plus a local copy of the official releases.
paper_meta_dir = paper_dir.joinpath("data", "metadata")
official_metadata_dir = paper_meta_dir.joinpath("official")

# Destination of the tables generated by this notebook.
table_dir = paper_dir.joinpath("tables")

### Our training metadata VS official metadata

#### Metadata we use for training

In [4]:
# File-level training metadata (one row per file once converted to a DataFrame).
path = paper_meta_dir / "hg38_2023-epiatlas-dfreeze-pospurge-nodup_filterCtl.json"
training_metadata = Metadata(path)
files_df = training_metadata.to_df()

# keeping biological samples only, not track types or assays (EpiRR level):
# work on a deep copy so `files_df` keeps every file, then keep one row per EpiRR.
training_df = files_df.copy(deep=True).drop_duplicates(
    subset=["epirr_id_without_version"]
)
training_epirr_values = training_df["epirr_id_without_version"].unique()
my_epirrs = set(training_epirr_values)

In [5]:
# Categories compared between the training metadata and the official releases.
relevants_cols = [CELL_TYPE, BIOMATERIAL_TYPE, SEX, DISEASE, LIFE_STAGE]

# Index by version-less EpiRR and keep only the compared categories.
training_df = training_df.set_index("epirr_id_without_version")[relevants_cols]

#### Official metadata

In [6]:
# Download every official harmonized sample-metadata release into `dfs`,
# keyed by version string. A version whose download fails is reported and
# skipped, so `dfs` never holds a frame stored under the wrong version.
dfs = {}
url_template = "https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefine/{version}/IHEC_metadata_harmonization.{version}.extended.csv"
for version in ["v1.0", "v1.1", "v1.2", "v1.3", "v1.4", "v2.0"]:
    myurl = url_template.format(version=version)

    # naming convention changed starting v1.3
    if version in ["v1.3", "v1.4", "v2.0"]:
        myurl = myurl.replace(".extended", "_extended").replace(
            "IHEC_metadata", "IHEC_sample_metadata"
        )

    print(f"Downloading version {version}: {myurl}")
    try:
        # Download the file; the timeout keeps a stalled connection from hanging the cell.
        response = requests.get(myurl, timeout=60)
        response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {myurl}: {e}")
        # Do not fall through: `df` would be undefined (first version) or would
        # still hold the previous version's data.
        continue

    # Load file as a DataFrame
    df = pd.read_csv(io.StringIO(response.content.decode("utf-8")))
    dfs[version] = df

Downloading version v1.0: https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefine/v1.0/IHEC_metadata_harmonization.v1.0.extended.csv
Downloading version v1.1: https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefine/v1.1/IHEC_metadata_harmonization.v1.1.extended.csv
Downloading version v1.2: https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefine/v1.2/IHEC_metadata_harmonization.v1.2.extended.csv
Downloading version v1.3: https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefine/v1.3/IHEC_sample_metadata_harmonization.v1.3_extended.csv
Downloading version v1.4: https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/main/openrefine/v1.4/IHEC_sample_metadata_harmonization.v1.4_extended.csv
Downloading version v2.0: https://raw.githubusercontent.com/IHEC/epiATLAS-metadata-harmonization/refs/heads/mai

Modify the official dataframes so that they match the format of our training metadata.

In [7]:
# Normalise every official release so it can be compared with the training metadata:
# guarantee an `epirr_id_without_version` key, replace missing values by "unknown",
# index the frame by that key and save a local CSV copy.
official_metadata_dir.mkdir(parents=True, exist_ok=True)

# Iterate over a snapshot because the dict values are replaced inside the loop.
for v, df in list(dfs.items()):
    # Re-running the cell: the key is already the index, bring it back as a column.
    if df.index.name == "epirr_id_without_version":
        df = df.reset_index()

    if "epirr_id_without_version" in df.columns:
        print(f"version {v} already has epirr_id_without_version column")
    else:
        print(f"Adding epirr_id_without_version column to version {v}")
        # Derive the key by cutting the `EpiRR` value at its first "."
        df = df.assign(epirr_id_without_version=df["EpiRR"].str.split(".").str[0])

    # Build new frames instead of mutating the downloaded ones in place.
    df = df.fillna("unknown").set_index("epirr_id_without_version")
    df.to_csv(
        official_metadata_dir / f"IHEC_sample_metadata_harmonization.{v}.extended.csv"
    )

    dfs[v] = df

Adding epirr_id_without_version column to version v1.0
version v1.1 already has epirr_id_without_version column
version v1.2 already has epirr_id_without_version column
version v1.3 already has epirr_id_without_version column


  df.fillna("unknown", inplace=True)


version v1.4 already has epirr_id_without_version column
version v2.0 already has epirr_id_without_version column


In [8]:
# unknown_ls_epirrs = training_df[(training_df[LIFE_STAGE] == "unknown") & (training_df[BIOMATERIAL_TYPE] == "cell line")].index.unique()
# unknown_ls_epirrs = pd.Series(unknown_ls_epirrs)
# unknown_ls_epirrs.to_csv(paper_meta_dir / "training_metadata_unknown_LS_cell_line.list", index=False, header=False)

#### Creating json of differences (Our metadata VS official v1.1+)

In [9]:
# For each category, gather the EpiRRs whose training value differs from the
# official v1.0 or v1.1 value.
problematic_idxs = defaultdict(set)
for version in ("v1.0", "v1.1"):
    official_df = dfs[version]
    shared_epirrs = my_epirrs.intersection(official_df.index)

    # Order by epirr (training order) so both selections are aligned row by row.
    ordered_epirrs = [epirr for epirr in training_df.index if epirr in shared_epirrs]
    official_subset = official_df.loc[ordered_epirrs, :]
    training_subset = training_df.loc[ordered_epirrs, :]

    for cat in relevants_cols:
        # Boolean series flagging the rows where the two sources disagree.
        is_different = official_subset[cat] != training_subset[cat]
        mismatched_epirrs = is_different.loc[is_different].index

        # Only create the category entry when at least one EpiRR differs.
        if len(mismatched_epirrs) > 0:
            problematic_idxs[cat].update(mismatched_epirrs)

In [10]:
# For every category with at least one differing EpiRR, record the training value
# next to the value found in each downloaded official version.
# `.get()` is used rather than `problematic_idxs[col]`: indexing a defaultdict
# inserts an empty set for missing categories, which would make later
# `col in problematic_idxs` checks (and a re-run of this cell) misleading.
all_changes = {col: {} for col in relevants_cols if problematic_idxs.get(col)}
for col, col_changes in all_changes.items():
    # sorted(): deterministic entry order, hence a reproducible JSON file.
    for idx in sorted(problematic_idxs[col]):
        values = {"training": training_df.loc[idx, col]}
        # `dfs` keeps insertion order, so keys follow the download order of the versions.
        for official_version, official_df in dfs.items():
            values[f"{official_version}-official"] = official_df.loc[idx, col]
        col_changes[idx] = values

In [11]:
# Report how many EpiRRs differ for each category.
# The count is tested rather than key membership: `problematic_idxs` is a
# defaultdict, so a category may be present with an empty set once it has been
# indexed, and must then be reported as "No changes".
for col in relevants_cols:
    n_changes = len(problematic_idxs.get(col, ()))
    if n_changes > 0:
        print(f"Changes in {col}: {n_changes}")
    else:
        print(f"No changes in {col}")

Changes in harmonized_sample_ontology_intermediate: 29
Changes in harmonized_biomaterial_type: 6
Changes in harmonized_donor_sex: 0
Changes in harmonized_sample_disease_high: 14
Changes in harmonized_donor_life_stage: 0


In [None]:
# Persist the per-category differences as JSON next to the other metadata files.
filename = "training_metadata_vs_official.json"
path = paper_meta_dir / filename

# allow_nan=False makes the dump fail instead of writing non-standard NaN tokens.
with path.open(mode="w", encoding="utf8") as json_file:
    json.dump(all_changes, json_file, indent=4, allow_nan=False)

In [13]:
# Print, per category, the EpiRRs whose training value differs from official v1.1
# and collect them in `diff_epirrs`.
diff_epirrs = set()
for cat_label, diff_dict in all_changes.items():
    print(cat_label)
    for epirr in sorted(diff_dict):
        values_dict = diff_dict[epirr]
        training_val = values_dict["training"]
        official_val = values_dict["v1.1-official"]
        if training_val == official_val:
            # Same as v1.1: nothing to report for this EpiRR.
            continue

        v1_0_val = values_dict["v1.0-official"]
        v1_2_val = values_dict["v1.2-official"]
        print(
            f"{epirr}: {training_val} != {official_val} (v1.0={v1_0_val}, v1.2={v1_2_val})"
        )
        diff_epirrs.add(epirr)

print(f"Unique EpiRRs with changes: {len(diff_epirrs)}")

harmonized_sample_ontology_intermediate
IHECRE00003725: neural progenitor cell != stem cell derived cell line (v1.0=neural progenitor cell, v1.2=stem cell derived cell line)
IHECRE00003726: neural cell != stem cell derived cell line (v1.0=unknown, v1.2=stem cell derived cell line)
IHECRE00003728: neural progenitor cell != stem cell derived cell line (v1.0=neural progenitor cell, v1.2=stem cell derived cell line)
IHECRE00003729: neural cell != stem cell derived cell line (v1.0=unknown, v1.2=stem cell derived cell line)
harmonized_biomaterial_type
IHECRE00003724: primary cell != cell line (v1.0=primary cell, v1.2=cell line)
IHECRE00003725: primary cell != cell line (v1.0=primary cell, v1.2=cell line)
IHECRE00003726: primary cell != cell line (v1.0=primary cell, v1.2=cell line)
IHECRE00003727: primary cell != cell line (v1.0=primary cell, v1.2=cell line)
IHECRE00003728: primary cell != cell line (v1.0=primary cell, v1.2=cell line)
IHECRE00003729: primary cell != cell line (v1.0=primary ce

In [None]:
# Flatten `all_changes` into one row per (EpiRR, category), restricted to the
# EpiRRs collected in `diff_epirrs`, and save the table as CSV.
cols = [
    "epirr",
    "category",
    "training",
    "v1.0-official",
    "v1.1-official",
    "v1.2-official",
    "v1.3-official",
    "v1.4-official",
    "v2.0-official",
]
# Every column after the first two is a key of the per-EpiRR values dict.
value_keys = cols[2:]

row_vals = [
    [epirr, cat_label, *(diff_dict[epirr][key] for key in value_keys)]
    for cat_label, diff_dict in all_changes.items()
    for epirr in sorted(diff_dict)
    if epirr in diff_epirrs
]
df = pd.DataFrame(row_vals, columns=cols)
df.to_csv(table_dir / "training_metadata_vs_official_v1.1.csv", index=False)

### Sanity check: SEX v1.2 = SEX v1.3

In [15]:
# Reload the locally saved official metadata (v1.1 to v1.3), keyed by version.
# No index column is requested, so `epirr_id_without_version` stays a regular column.
official_metadata_dfs = {
    version: pd.read_csv(
        official_metadata_dir
        / f"IHEC_sample_metadata_harmonization.{version}.extended.csv",
        sep=",",
    )
    for version in ("v1.1", "v1.2", "v1.3")
}

In [16]:
# Column holding the donor sex label.
SEX = "harmonized_donor_sex"

# Table of samples flagged as sex mislabels.
sex_mislabels_path = official_metadata_dir.joinpath(
    "BadQual-mislabels", "official_Sex_mislabeled.csv"
)
sex_mislabels_df = pd.read_csv(sex_mislabels_path, sep=",")

In [17]:
# Attach the sex label of each loaded official version to the mislabel table.
sex_epirrs = {}
subset_df = sex_mislabels_df
for version, version_df in official_metadata_dfs.items():
    sex_by_epirr = version_df[["epirr_id_without_version", SEX]]
    # Right join: every row of the mislabel table is kept. Columns of the
    # official frame that clash with existing ones get a `_<version>` suffix.
    subset_df = pd.merge(
        sex_by_epirr,
        subset_df,
        how="right",
        left_on="epirr_id_without_version",
        right_on="EpiRR_no-v",
        suffixes=(f"_{version}", ""),
    )

In [18]:
# Remove the merge-key columns brought in by the joins, then the unsuffixed sex column.
epirr_key_cols = [name for name in subset_df.columns if name.startswith("epirr_id")]
subset_df = subset_df.drop(columns=epirr_key_cols).drop(columns=[SEX])

In [19]:
# On the mislabel subset, no row may have a v1.3 sex label different from v1.2.
assert not (subset_df[f"{SEX}_v1.3"] != subset_df[f"{SEX}_v1.2"]).any()

In [20]:
# Pair v1.2 and v1.3 rows that share an EpiRR; other columns common to both
# releases are suffixed with their version.
merged_df = pd.merge(
    official_metadata_dfs["v1.2"],
    official_metadata_dfs["v1.3"],
    how="inner",
    on="epirr_id_without_version",
    suffixes=("_v1.2", "_v1.3"),
)

In [21]:
# Across all EpiRRs shared by v1.2 and v1.3, no sex label may differ.
assert not (merged_df[f"{SEX}_v1.3"] != merged_df[f"{SEX}_v1.2"]).any()

### Sanity check: How many RNA Unique_raw tracks (unstranded data) are in the final training metadata

In [22]:
# Start from a fresh, empty report file.
outfile = table_dir / "experiments_including_unique_raw_files.list"
outfile.unlink(missing_ok=True)
outfile.touch()

# File-level training metadata: show its shape and the number of files per track type.
v2_meta_df = files_df
print(v2_meta_df.shape)
track_type_counts = v2_meta_df["track_type"].value_counts(dropna=False)
display(track_type_counts)

(20922, 71)


track_type
fc                 5337
pval               5337
raw                5337
Unique_minusRaw    1435
Unique_plusRaw     1435
ctl_raw             777
gembs_neg           572
gembs_pos           572
Unique_raw          120
Name: count, dtype: int64

In [23]:
# md5sums of all files whose track type is "Unique_raw".
is_unique_raw = v2_meta_df["track_type"] == "Unique_raw"
unique_raw_md5_list = v2_meta_df.loc[is_unique_raw, "md5sum"].tolist()
# The count is taken on the list, i.e. before any de-duplication by the set.
print(f"Total Unique_raw md5sums: {len(unique_raw_md5_list)}")

md5_unique_raw = set(unique_raw_md5_list)

Total Unique_raw md5sums: 120


In [24]:
# For every prediction table under `table_dir` (skipping paths that contain
# "recount3" or "encode"), count how many of its md5sums are Unique_raw files.
# Results are printed and also written to `outfile`.
with outfile.open("w", encoding="utf-8") as out:
    print(f"Total Unique_raw md5sums: {len(md5_unique_raw)}", file=out)
    # sorted(): rglob order depends on the filesystem; sorting keeps the report reproducible.
    for pred_file in sorted(table_dir.rglob("*pred*.csv")):
        if any(label in str(pred_file) for label in ["recount3", "encode"]):
            continue
        df = pd.read_csv(pred_file, sep=",", low_memory=False)

        # Get md5sums: from the dedicated column, or from the index when its
        # first label is a 32-character string.
        if "md5sum" in df.columns:
            md5sums = set(df["md5sum"])
        elif (
            len(df.index) > 0  # an empty table has no first label to inspect
            and isinstance(df.index[0], str)
            and len(df.index[0]) == 32
        ):
            md5sums = set(df.index)
        else:
            print(f"Could not find md5sum column in {pred_file}", file=out)
            continue

        shared_md5sums = md5sums.intersection(md5_unique_raw)

        pred_file_relpath = pred_file.relative_to(table_dir)
        print(f"{pred_file}: {len(shared_md5sums)}")
        print(f"{pred_file_relpath}: {len(shared_md5sums)}", file=out)

/home/rabj2301/Projects/epiclass/output/paper/tables/2023-01-epiatlas-freeze/hg38_100kb_all_none_10fold_predictions_harmonized_donor_sex_10fold-binary.csv: 0
/home/rabj2301/Projects/epiclass/output/paper/tables/2023-01-epiatlas-freeze/hg38_100kb_all_none_0blklst_winsorized_10fold_predictions_harmonized_sample_ontology_intermediate.csv: 0
/home/rabj2301/Projects/epiclass/output/paper/tables/2023-01-epiatlas-freeze/hg38_100kb_all_none_0blklst_winsorized_10fold_predictions_assay_epiclass.csv: 120
/home/rabj2301/Projects/epiclass/output/paper/tables/2023-01-epiatlas-freeze/hg38_100kb_all_none_0blklst_10fold_predictions_harmonized_sample_ontology_intermediate.csv: 0
/home/rabj2301/Projects/epiclass/output/paper/tables/2023-01-epiatlas-freeze/hg38_100kb_all_none_0blklst_winsorized_10fold_predictions_harmonized_biomaterial_type.csv: 0
/home/rabj2301/Projects/epiclass/output/paper/tables/2023-01-epiatlas-freeze/hg38_100kb_all_none_10fold_predictions_harmonized_biomaterial_type.csv: 0
/home/rab

### Diff sex/life stage mislabels

Training and v1.1 are the same for sex and life stage categories.

In [25]:
# Official metadata releases compared in this section (older one first).
version_before, version_after = "v1.1", "v1.4"

In [26]:
# Load the two compared releases from the local copies, using the first CSV
# column as index.
official_metadata_dfs = {
    version: pd.read_csv(
        official_metadata_dir
        / f"IHEC_sample_metadata_harmonization.{version}.extended.csv",
        sep=",",
        index_col=0,
    )
    for version in (version_before, version_after)
}

In [27]:
# EpiRRs present in both compared releases.
# Sorted so the order is identical on every run: iterating a set of strings
# depends on hash randomisation, and this list drives the row order of the
# selections made from it later on.
common_epirrs = sorted(
    set(official_metadata_dfs[version_before].index).intersection(
        official_metadata_dfs[version_after].index
    )
)

In [28]:
# Number of EpiRRs shared by the two compared releases.
print(f"{len(common_epirrs)}")

2216


In [29]:
# EpiRRs labelled "cell line" in at least one of the two compared releases.
df_before = official_metadata_dfs[version_before]
df_after = official_metadata_dfs[version_after]
cell_line_epirrs = set()
for release_df in (df_before, df_after):
    is_cell_line = release_df["harmonized_biomaterial_type"] == "cell line"
    cell_line_epirrs |= set(release_df.loc[is_cell_line].index)

In [34]:
# For sex and life stage, classify the shared EpiRRs between the two releases:
#   changed_<col>    - known value before, different value after
#   resolved_<col>   - "unknown" before, not "unknown" after
#   unresolved_<col> - "unknown" in both releases
info = {}

# Both releases restricted to the shared EpiRRs; independent of the category.
df_before = official_metadata_dfs[version_before].loc[common_epirrs, :]
df_after = official_metadata_dfs[version_after].loc[common_epirrs, :]

for col in [SEX, LIFE_STAGE]:
    print(f"-- Category: {col} --\n")

    ref_known_epirr = df_before[df_before[col] != "unknown"].index

    # Which samples had values changed? (mislabels)
    print("Mislabels")
    values_before = df_before.loc[ref_known_epirr, col]
    values_after = df_after.loc[ref_known_epirr, col]

    changed = values_before != values_after
    changed_epirrs = ref_known_epirr[changed]
    if col == LIFE_STAGE:
        # No cell line is expected among the life stage changes.
        assert set(changed_epirrs).isdisjoint(cell_line_epirrs)

    print(f"{col}: {len(changed_epirrs)}\n")
    info[f"changed_{col}"] = changed_epirrs

    # Which unknown samples were given a value in v1.4?
    print("Unknown to known (resolved)")
    unknown_before = df_before[df_before[col] == "unknown"].index
    unknown_after = df_after[df_after[col] == "unknown"].index
    print(f"{col}: {len(unknown_before)} -> {len(unknown_after)}")

    resolved = set(unknown_before).difference(unknown_after)
    # Lists are rebuilt from `common_epirrs` to keep its ordering.
    resolved_epirrs = [epirr for epirr in common_epirrs if epirr in resolved]
    if col == LIFE_STAGE:
        # No cell line is expected among the resolved life stages either.
        assert set(resolved_epirrs).isdisjoint(cell_line_epirrs)

    print(f"Resolved: {len(resolved_epirrs)}\n")
    info[f"resolved_{col}"] = resolved_epirrs

    unresolved = set(unknown_before).intersection(unknown_after)
    unresolved_epirrs = [epirr for epirr in common_epirrs if epirr in unresolved]
    info[f"unresolved_{col}"] = unresolved_epirrs


-- Category: harmonized_donor_sex --

Mislabels
harmonized_donor_sex: 23

Unknown to known (resolved)
harmonized_donor_sex: 314 -> 46
Resolved: 268

-- Category: harmonized_donor_life_stage --

Mislabels
harmonized_donor_life_stage: 17

Unknown to known (resolved)
harmonized_donor_life_stage: 517 -> 113
Resolved: 404



In [35]:
# Write one CSV per (category, status) holding the before/after values of the
# corresponding EpiRRs.
save_dir = table_dir / "metadata_diff"
save_dir.mkdir(exist_ok=True)

partial_name = f"{version_before}_to_{version_after}"
for col in [SEX, LIFE_STAGE]:
    vals_before = official_metadata_dfs[version_before].loc[common_epirrs, col]
    vals_after = official_metadata_dfs[version_after].loc[common_epirrs, col]
    for status_name in ("changed", "resolved", "unresolved"):
        epirrs = info[f"{status_name}_{col}"]

        # Two columns named after the releases.
        status_columns = {
            version_before: vals_before.loc[epirrs],
            version_after: vals_after.loc[epirrs],
        }
        status_df = pd.DataFrame(status_columns)

        status_path = save_dir / f"{status_name}_{partial_name}_{col}.csv"
        status_df.to_csv(status_path)



## Adding missing uuids/datasets to general cross-val results file

In [None]:
# Merged cross-validation prediction results.
all_preds_path = paper_dir.joinpath(
    "data",
    "training_results",
    "hg38_100kb_all_none",
    "merged_pred_results_all_2.1_chrY_zscores.csv",
)
all_preds_df = pd.read_csv(all_preds_path, low_memory=False)
print(all_preds_df.shape)

In [None]:
# Keep only the rows that have no value in `md5sum_encode`.
has_no_encode_md5 = all_preds_df["md5sum_encode"].isna()
all_preds_df = all_preds_df.loc[has_no_encode_md5]
print(all_preds_df.shape)

In [None]:
# Subset of the prediction rows whose assay type is ChIP-Seq.
chip_df = all_preds_df.loc[all_preds_df["assay_type"] == "ChIP-Seq"]

In [None]:
# Distinct uuids: ChIP-Seq rows only, then every prediction row.
our_chip_uuids, all_our_uuids = (
    set(frame["uuid"]) for frame in (chip_df, all_preds_df)
)

In [None]:
# Official experiment-level metadata table.
official_meta_dir = paper_meta_dir / "official"

official_exp_metadata_path = official_meta_dir.joinpath(
    "EpiATLAS_experiment_metadata_11032024.csv"
)
official_exp_metadata_df = pd.read_csv(official_exp_metadata_path, low_memory=False)
print(official_exp_metadata_df.shape)

In [None]:
# Distinct uuids listed in the official experiment metadata.
official_uuid_column = official_exp_metadata_df["uuid"]
official_all_uuids = set(official_uuid_column)

In [None]:
# uuids present in the official metadata but absent from our predictions,
# written one per line (no header, no index).
diff_uuids = sorted(official_all_uuids.difference(all_our_uuids))
print(len(diff_uuids))
filename = "uuids_diff_hg38_2023-epiatlas-dfreeze-pospurge-nodup_filterCtl_VS_EpiATLAS_experiment_metadata_11032024.list"
diff_uuids_path = official_meta_dir / filename
pd.DataFrame(diff_uuids).to_csv(diff_uuids_path, index=False, header=False)

In [None]:
# Describe the official rows whose uuid is in `diff_uuids`: counts per centre,
# experiment type and assay type.
mask = official_exp_metadata_df["uuid"].isin(diff_uuids)
diff_rows = official_exp_metadata_df.loc[mask]
for col in ("data_generating_centre", "experiment_type", "assay_type"):
    col_counts = diff_rows[col].value_counts(dropna=False)
    display(col_counts)

In [None]:
# EpiRRs listed in the pruned WGBS file, and the uuids of the official
# experiments attached to them.
pruned_epirrs_path = official_meta_dir / "pruned_wbgs_1.1_epirr.list"
pruned_epirrs = pd.read_csv(pruned_epirrs_path, header=None).iloc[:, 0].tolist()

is_pruned = official_exp_metadata_df["epirr_id_without_version"].isin(pruned_epirrs)
pruned_uuids = set(official_exp_metadata_df.loc[is_pruned, "uuid"])

# There must be as many distinct uuids as listed EpiRRs.
assert len(pruned_uuids) == len(pruned_epirrs)

In [None]:
# Same difference as before, this time also excluding the pruned uuids.
diff_uuids = sorted(official_all_uuids.difference(all_our_uuids, pruned_uuids))
filename = "uuids_diff_no_pruned_wgbs_hg38_2023-epiatlas-dfreeze-pospurge-nodup_filterCtl_VS_EpiATLAS_experiment_metadata_11032024.list"
diff_uuids_path = official_meta_dir / filename
pd.DataFrame(diff_uuids).to_csv(diff_uuids_path, index=False, header=False)

In [None]:
# Counts per centre, experiment type and assay type for the official rows whose
# uuid is in the current `diff_uuids`.
mask = official_exp_metadata_df["uuid"].isin(diff_uuids)
diff_rows = official_exp_metadata_df.loc[mask]
for col in ("data_generating_centre", "experiment_type", "assay_type"):
    col_counts = diff_rows[col].value_counts(dropna=False)
    display(col_counts)

In [None]:
# Save the full official metadata rows of the experiments that are neither in
# our predictions nor in the pruned set.
filename = "diff_no_pruned_wgbs_hg38_2023-epiatlas-dfreeze-pospurge-nodup_filterCtl_VS_EpiATLAS_experiment_metadata_11032024.csv"

diff_uuids = sorted(official_all_uuids.difference(all_our_uuids, pruned_uuids))
mask = official_exp_metadata_df["uuid"].isin(diff_uuids)

missing_files_df = official_exp_metadata_df.loc[mask]
print(missing_files_df.shape)

missing_files_path = official_meta_dir / filename
missing_files_df.to_csv(missing_files_path, header=True, index=False)

In [None]:
# Locally saved official sample metadata, release v1.4.
official_sample_meta = official_meta_dir.joinpath(
    "IHEC_sample_metadata_harmonization.v1.4.extended.csv"
)
official_sample_meta_df = pd.read_csv(official_sample_meta, low_memory=False)
print(official_sample_meta_df.shape)

In [None]:
# Add the sample-level metadata to each missing experiment; the left join keeps
# every experiment row.
missing_files_df = pd.merge(
    missing_files_df,
    official_sample_meta_df,
    how="left",
    on="epirr_id_without_version",
)
print(missing_files_df.shape)

In [None]:
# Show the experiment types of the missing files. display() is required here:
# only the last expression of a cell is rendered automatically, so a bare
# expression on the first line would be discarded.
display(missing_files_df["experiment_type"].value_counts(dropna=False))

# Fill the assay column from the experiment type, renaming the two WGBS labels;
# every other value is copied unchanged.
missing_files_df[ASSAY] = missing_files_df["experiment_type"].replace(
    {
        "standard": "wgbs-standard",
        "PBAT": "wgbs-pbat",
    }
)

In [None]:
# Columns shared by the missing-files table and the prediction table.
common_cols = set(missing_files_df.columns).intersection(all_preds_df.columns)
print(len(common_cols))

In [None]:
# Adding missing datasets info.
preds_w_missing_df = pd.concat(
    [all_preds_df, missing_files_df[list(common_cols)]],
    axis=0,
    join="outer",
    ignore_index=True,
)

if preds_w_missing_df.shape[0] != all_preds_df.shape[0] + missing_files_df.shape[0]:
    raise ValueError("Merge failed, expected concatenation of rows")
if preds_w_missing_df.shape[1] != all_preds_df.shape[1]:
    raise ValueError("Merge failed, expected no new columns")

In [None]:
# Save next to the input file as "<input stem>_with_missing.csv".
# The name is built from the file stem instead of a string replace, so a
# ".csv" occurring elsewhere in the path is left alone and the output can
# never resolve to the input file itself.
preds_v2_path = str(
    all_preds_path.with_name(f"{all_preds_path.stem}_with_missing.csv")
)
preds_w_missing_df.to_csv(preds_v2_path, index=False)