In [None]:
"""
Metadata for ChIP-Atlas datasets
"""
# pylint: disable=redefined-outer-name, import-error

In [None]:
from __future__ import annotations

import gc
import json
import os
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from IPython.display import display  # pylint: disable=unused-import

from epi_ml.utils.notebooks.paper.paper_utilities import ASSAY_ORDER

In [None]:
# Root directory of the paper outputs; every path below is derived from it.
base_dir = Path.home() / "Projects/epiclass/output/paper"
paper_dir = base_dir  # alias of base_dir

base_data_dir = base_dir / "data"
base_fig_dir = base_dir / "figures"
metadata_dir = base_data_dir / "metadata"

# Directory holding the prediction files.
predictions_dir = base_data_dir / "training_results" / "predictions"

In [None]:
# ChIP-Atlas (C-A) specific sub-directories for metadata and predictions.
ca_metadata_dir = metadata_dir / "chip_atlas"
ca_pred_dir = predictions_dir / "C-A" / "assay_epiclass"

## Detail `CA_metadata_4DB+all_pred.20240606.tsv` modifications

See `paper/data/training_results/predictions/C-A/assay_epiclass/README.txt`

Starting from `CA_metadata_4DB+all_pred.20240606_mod2.tsv`, since 1.0 -> 2.0 involved a manual modification of the file (shifting some rows that got mangled).

In [None]:
# Version 2.0 of the table: starting point for the regeneration steps below.
initial_df_path = ca_pred_dir / "CA_metadata_4DB+all_pred.20240606_mod2.tsv"
initial_df = pd.read_csv(initial_df_path, sep="\t", low_memory=False)
# The flag column is only added by the 2.0 -> 2.1 step, so it must not exist yet.
assert "is_EpiAtlas_EpiRR" not in initial_df.columns

2.0 -> 2.1: Add `is_EpiAtlas_EpiRR` column.

In [None]:
# Table providing the `is_EpiAtlas_EpiRR` flag for each ENCODE `accession`.
encode_metadata_dir = base_data_dir / "metadata" / "encode" / "old_meta"
encode_epiatlas_mapping_path = encode_metadata_dir / "ENCODE_IHEC_keys.tsv"
encode_epiatlas_mapping_df = pd.read_csv(encode_epiatlas_mapping_path, sep="\t")
print(encode_epiatlas_mapping_df.shape)

# Keep only the flag and the key used for the merge in the next cell.
enc_df = encode_epiatlas_mapping_df[["is_EpiAtlas_EpiRR", "accession"]]

In [None]:
# Left-join the EpiAtlas flag onto the predictions via the ENCODE accession.
# Exact duplicate rows produced by the join are dropped, then the merge key
# coming from the mapping table is removed.
new_pred_df = (
    initial_df.merge(enc_df, left_on="ENCODE_GSE", right_on="accession", how="left")
    .drop_duplicates()
    .drop(columns=["accession"])
)

# Rows without a match in the mapping table are flagged "0".
# Assigning the result back (instead of a chained `inplace=True` on the column)
# is required for the frame to be updated under pandas Copy-on-Write.
new_pred_df["is_EpiAtlas_EpiRR"] = new_pred_df["is_EpiAtlas_EpiRR"].fillna("0")

# The join must not add or remove rows.
assert initial_df.shape[0] == new_pred_df.shape[0]

In [None]:
# Load the previously saved 2.1 table to compare the regenerated one against it.
new_path = str(initial_df_path).replace("mod2.tsv", "mod2.1.tsv")
new_pred_df_2 = pd.read_csv(new_path, sep="\t", low_memory=False)
assert new_pred_df.shape[0] == new_pred_df_2.shape[0]

In [None]:
# Replace missing values with the same sentinel in both tables
# (presumably so that NaN cells do not register as differences in the
# element-wise comparison below, since NaN != NaN).
new_pred_df_2 = new_pred_df_2.fillna("unknown")
new_pred_df = new_pred_df.fillna("unknown")

In [None]:
# Report differences in column names between the regenerated and saved tables.
# `zip` stops at the shorter sequence, so a differing column count is reported
# explicitly instead of being silently ignored.
if len(new_pred_df.columns) != len(new_pred_df_2.columns):
    print(
        "Column count differs:",
        len(new_pred_df.columns),
        len(new_pred_df_2.columns),
    )

for col1, col2 in zip(new_pred_df.columns, new_pred_df_2.columns):
    if col1 != col2:
        print(col1, col2)

In [None]:
# Print the name of every column whose values differ from the saved 2.1 table.
for col in new_pred_df.columns:
    regenerated_values = new_pred_df[col].values
    reference_values = new_pred_df_2[col].values
    if np.equal(regenerated_values, reference_values).all():  # type: ignore
        continue
    print(col)

In [None]:
# Release the frames that are no longer needed after the 2.1 check.
del new_pred_df_2, enc_df, encode_epiatlas_mapping_df, initial_df
_ = gc.collect()

In [None]:
# Independent snapshot of the table at version 2.1, before further modifications.
pred_df_2_1 = new_pred_df.copy(deep=True)

2.1 -> 2.2:  Add `core7_DBs_consensus` column.

In [None]:
# Name of the consensus column created by `create_core_consensus_column`.
SAME_TARGET = "core7_DBs_consensus"
# Columns compared to build the consensus
# (presumably one target label per source database — verify against the table).
DB_COLS = ["GEO_mod", "C-A", "Cistrome", "NGS_mod"]
# Lower-case the labels of these columns so that comparisons ignore case.
new_pred_df.loc[:, DB_COLS] = new_pred_df[DB_COLS].apply(lambda x: x.str.lower())

# The first seven entries of ASSAY_ORDER are used as the core assays.
CORE_ASSAYS = ASSAY_ORDER[0:7]
print(CORE_ASSAYS)

# `manual_target_consensus` values that exclude a row from the core subset.
non_core_labels = ["non-core", "CTCF"]

In [None]:
def create_core_consensus_column(
    df: pd.DataFrame, verbose=False
) -> tuple[pd.DataFrame, Counter, Counter]:
    """Create (or replace) the `SAME_TARGET` consensus column for core7 assays.

    Rows whose `manual_target_consensus` is in `non_core_labels` receive the
    value "non-core/CTCF". For every other row, the labels found in `DB_COLS`
    are compared, "----" being treated as a missing label (a C-A label of
    "unclassified" is also treated as missing):

    - "Ignored - Potential non-core": at least one label is neither in
      `CORE_ASSAYS` nor "ctrl".
    - "1 source": all databases but one have a missing label.
    - "Identical": all non-missing labels are the same.
    - "Different": any other case.

    The first column is presumed to be the ID column. It is used as the merge
    key, so duplicated IDs would multiply rows in the returned dataframe.

    Args:
        df: Input table. It is not modified.
        verbose: If True, print the input/core shapes and display the counts
            of label combinations.

    Returns:
        A tuple `(new_df, unique_labels, different_labels)`. `new_df` is a new
        dataframe with the consensus column. `unique_labels` counts every
        combination of non-missing labels seen among core rows, and
        `different_labels` counts only the combinations flagged "Different".
        Counter keys are tuples of labels sorted by their string value.

    Raises:
        ValueError: If a core row has a missing `manual_target_consensus`.
    """
    id_col = df.columns[0]
    # `drop` returns a new frame, so the caller's dataframe is left untouched.
    # The column only exists when the consensus is being recomputed.
    df = df.drop(columns=[SAME_TARGET], errors="ignore")

    is_core = ~df["manual_target_consensus"].isin(non_core_labels)
    core_df = df.loc[is_core].copy(deep=True)
    if core_df["manual_target_consensus"].isna().any():
        raise ValueError("There are missing values in the target column.")

    if verbose:
        print(f"Input shape: {df.shape}. Core7 shape: {core_df.shape}.")

    tmp_df = core_df.loc[:, DB_COLS].copy(deep=True)
    # Assign the result back: a chained `inplace=True` replace on the column
    # does not update the frame under pandas Copy-on-Write.
    tmp_df["C-A"] = tmp_df["C-A"].replace("unclassified", "----")
    if verbose:
        display(tmp_df.value_counts(dropna=False))

    allowed_labels = set(CORE_ASSAYS) | {"ctrl"}
    # A row has a single source when every database but one is missing.
    single_source_missing_n = len(DB_COLS) - 1

    id_db_target = []
    unique_labels = Counter()
    different_labels = Counter()

    for labels in tmp_df.values:
        missing_N = sum(label == "----" for label in labels)
        db_labels = set(labels)
        db_labels.discard("----")

        # Sets have no stable iteration order: sort so that a given set of
        # labels always maps to the same Counter key. `key=str` keeps the sort
        # valid if a non-string value (e.g. NaN) is present.
        label_key = tuple(sorted(db_labels, key=str))

        if any(label not in allowed_labels for label in db_labels):
            id_db_target.append("Ignored - Potential non-core")
        elif missing_N == single_source_missing_n:
            id_db_target.append("1 source")
        elif len(db_labels) == 1:
            id_db_target.append("Identical")
        else:
            # NOTE: a row where every database is missing also ends up here.
            id_db_target.append("Different")
            different_labels[label_key] += 1

        unique_labels[label_key] += 1

    core_df[SAME_TARGET] = id_db_target

    # Bring the consensus back onto the full table; rows excluded from the core
    # subset have no match and are labelled explicitly.
    df = pd.merge(df, core_df[[id_col, SAME_TARGET]], on=id_col, how="left")
    df[SAME_TARGET] = df[SAME_TARGET].fillna("non-core/CTCF")

    return df, unique_labels, different_labels

In [None]:
# Compute the consensus column on the 2.1 table (before the target corrections).
new_pred_df, unique_labels, different_labels = create_core_consensus_column(new_pred_df)

In [None]:
# NOTE: the "non-core/CTCF" category never appears in these counts
# (author's observation; the cause is not investigated in this notebook).
new_pred_df[SAME_TARGET].value_counts(dropna=False)

In [None]:
# Category counts of the regenerated consensus column, kept for later comparison.
new_counts = new_pred_df[SAME_TARGET].value_counts(dropna=False)
display(new_counts)

In [None]:
# Load the previously saved 2.2 table, used as reference for the consensus counts.
pred_df_2_2_path = str(initial_df_path).replace("mod2.tsv", "mod2.2.tsv")
pred_df_2_2 = pd.read_csv(pred_df_2_2_path, sep="\t", low_memory=False)

assert new_pred_df.shape[0] == pred_df_2_2.shape[0]

In [None]:
# Reference counts from the saved 2.2 table, then the per-category difference
# with the regenerated counts (index-aligned subtraction).
counts_2_2 = pred_df_2_2[SAME_TARGET].value_counts(dropna=False)
display(counts_2_2)
display(new_counts - counts_2_2)

Now, let's see if making the 2.2 -> 3.0 target corrections takes the difference to 0.

`h3.3k27m` is not a target, it's a cell line name

In [None]:
# Fix NGS_mod annotation errors for GSE78801, based on the file titles:
# h3.3k27m was recorded as the target although it describes the cell line
# (SF8628 Human DIPG H3.3-K27M Cell Line).
to_replace = {
    "GSM2265634": "h3k27me3",
    "GSM2265635": "h3k27me3",
    "GSM2265642": "h3k4me1",
}
idx = new_pred_df["GSM"].isin(to_replace.keys())
new_pred_df.loc[idx, "NGS_mod"] = new_pred_df.loc[idx, "GSM"].map(to_replace)

# No cell of the table may still hold the erroneous label.
if new_pred_df.isin(["h3.3k27m"]).any().any():
    raise ValueError("h3.3k27m is still present in the dataframe")

Old target mislabeling error

In [None]:
# Correction table; the next cell expects it to provide `Experimental-id`,
# `manual_target_consensus2` and `GEO_mod2`.
ca_correction_path = ca_pred_dir / "CA_metadata_correction.tsv"
ca_correction_df = pd.read_csv(ca_correction_path, sep="\t", low_memory=False)

In [None]:
# Attach the corrected values, then overwrite the original columns with them.
# NOTE: with a left join, a row without a match in the correction table gets
# NaN in both overwritten columns.
new_pred_df = new_pred_df.merge(ca_correction_df, on="Experimental-id", how="left")
new_pred_df["manual_target_consensus"] = new_pred_df["manual_target_consensus2"]
new_pred_df["GEO_mod"] = new_pred_df["GEO_mod2"]
new_pred_df = new_pred_df.drop(columns=["manual_target_consensus2", "GEO_mod2"])

# Same lower-case convention as the other database columns.
new_pred_df["GEO_mod"] = new_pred_df["GEO_mod"].str.lower()

Transform all "revxlinkchromatin" targets into "input" so they're not counted as different targets.

In [None]:
# Only exact matches of the whole cell value are replaced, in the DB columns only.
new_pred_df.loc[:, DB_COLS] = new_pred_df[DB_COLS].replace("revxlinkchromatin", "input")

Redoing core7 consensus

In [None]:
# Recompute the consensus column now that the targets have been corrected
# (the function drops the existing column before recreating it).
new_pred_df, unique_labels, different_labels = create_core_consensus_column(new_pred_df)

In [None]:
new_counts = new_pred_df[SAME_TARGET].value_counts(dropna=False)

# Every category must have the same count as in the saved 2.2 table.
# Summing the differences is not a valid check: positive and negative
# differences cancel out whenever both tables have the same number of rows.
# `fill_value=0` treats a category absent from one table as a count of 0.
count_diff = new_counts.sub(counts_2_2, fill_value=0)
assert (count_diff == 0).all(), f"Consensus counts differ:\n{count_diff[count_diff != 0]}"

Success!

In [None]:
# Remove, in a single call, every column whose name contains "Same".
same_cols = [col for col in new_pred_df.columns if "Same" in col]
if same_cols:
    new_pred_df.drop(columns=same_cols, inplace=True)

In [None]:
# Load the current 3.0 table, with missing values replaced by the same
# "unknown" sentinel that was applied to `new_pred_df`.
current_3_path = str(initial_df_path).replace("mod2.tsv", "mod3.0.tsv")
df_3 = pd.read_csv(current_3_path, sep="\t", low_memory=False)
df_3 = df_3.fillna("unknown")

assert new_pred_df.shape[0] == df_3.shape[0]

In [None]:
# Print every column whose values differ from the saved 3.0 table.
# Values are compared as strings, position by position; a column absent from
# `df_3` raises a KeyError.
for col in new_pred_df.columns:
    if not np.equal(new_pred_df[col].astype(str).values, df_3[col].astype(str).values).all():  # type: ignore
        print(col)

In [None]:
# Overwrites the 3.0 file that was just loaded for comparison.
new_pred_df.to_csv(current_3_path, sep="\t", index=False)

### CTCF details

Our 13c classifier tends to classify CTCF as input, so we cannot trust it to differentiate between CTCF and input signals.  
CTCF samples could possibly be excluded from the prediction pool, since the core-assay classifiers have never seen CTCF.

In [None]:
# Core subset: rows whose manual target is not one of `non_core_labels`.
ca_core_df = new_pred_df[~new_pred_df["manual_target_consensus"].isin(non_core_labels)]

In [None]:
# Core rows for which at least one database column holds the label "ctcf",
# restricted to the identifier and target columns, sorted by series and sample.
display(
    ca_core_df[ca_core_df[DB_COLS].isin(["ctcf"]).any(axis=1)][
        ["Experimental-id", "Gse-geo", "GSM"]
        + DB_COLS
        + ["manual_target_consensus", SAME_TARGET]
    ].sort_values(["Gse-geo", "GSM"])
)

After a review of the experiment descriptions on GEO, it seems GSE102237, GSE108869 and GSE38411 samples marked as CTCF by cistrome have an uncertain target, and so could be excluded from our core samples.

As for GSE183379 samples marked as ctcf by C-A (7 samples), it seems none of them are actually CTCF, according to the original files names on GEO, so they don't need to be excluded.

For simplicity's sake, they were all left as "Ignored - Potential non-core"

In [None]:
# Release the prediction tables; the remaining sections do not use them.
del new_pred_df, ca_correction_df, ca_core_df, df_3, pred_df_2_2, pred_df_2_1
_ = gc.collect()

## ChIP-Atlas website download

How to obtain some of the metadata:

```bash
wget https://chip-atlas.dbcls.jp/data/metadata/experimentList.tab
grep -E "^[DESRX]{3}[0-9]{4,8}\shg38\s" experimentList.tab > experimentList_hg38.tab
grep -vE "^[DESRX]{3}[0-9]{4,8}\shg38\s[ATAC,DNASE,Bisulfate,RNA]" experimentList_hg38.tab > experimentList_hg38_chip.tab
cut -f1,3-7,9- experimentList_hg38_chip.tab | sponge experimentList_hg38_chip.tab # Removing col 2 and 8.
```

Following the columns given in the [wiki](https://github.com/inutano/chip-atlas/wiki#tables-summarizing-metadata-and-files), the `assembly` and `Processing_logs_of[...]` columns were removed.  
The metadata submitted by authors was merged into the last column, instead of dealing with a varying number of columns.

In [None]:
metadata_path = ca_metadata_dir / "experimentList_hg38_chip_20250306.tab"
new_file_name = ca_metadata_dir / (metadata_path.stem + "_formatted.tab")

# The formatted file is only built once; delete it to force a rebuild.
if not new_file_name.exists():
    n_core_cols = 7
    header = "\t".join(
        [
            "Experimental_ID",
            "Track_type_class",
            "Track_type",
            "Cell_type_class",
            "Cell_type",
            "Cell_type_description",
            "Title",
            "Meta_data_submitted_by_authors",
        ]
    )

    # The input file has no header row: every line is a record.
    new_file = [header]
    with open(metadata_path, "r", encoding="utf8") as f:
        for line in f:
            # Strip the line terminator first, otherwise a record with 7 fields
            # or fewer keeps it in its last field and gets broken over two lines.
            line = line.rstrip("\n")
            if not line:
                continue

            elems = line.split("\t")
            core = elems[0:n_core_cols]
            # Pad short records so the merged column stays in last position.
            core += [""] * (n_core_cols - len(core))

            # All remaining fields (author-submitted metadata, variable in
            # number) are merged into one column, empty fields being dropped.
            rest = [x.strip() for x in elems[n_core_cols:]]
            rest = [x for x in rest if x]

            new_file.append("\t".join(core) + "\t" + str(rest))

    with open(new_file_name, "w", encoding="utf8") as f:
        f.write("\n".join(new_file))

ca_metadata_df = pd.read_csv(new_file_name, sep="\t", low_memory=False)
print(ca_metadata_df.shape)

## Minimal DB matching metadata

Minimal metadata created from `CA_metadata_4DB+all_pred.20240606_mod3.0.tsv`.  

I mostly kept the IDs and targets from the different databases, and renamed/moved columns for easier understanding.

In [None]:
# Load the minimal metadata table (IDs and targets from the different databases).
minimal_metadata_path = ca_metadata_dir / "CA_minimal_metadata_20240606.tsv"
ca_minimal_metadata_df = pd.read_csv(minimal_metadata_path, sep="\t", low_memory=False)
print(ca_minimal_metadata_df.shape)

Some GSM titles were missing from the old work, so I redownloaded the metadata from GEO.

In [None]:
# "-" marks a missing value in this table. An explicit NaN is used as the
# replacement: `replace("-", None)` is version-dependent (pandas < 1.4 ignores
# the explicit None and forward-fills from the previous row instead).
ca_minimal_metadata_df = ca_minimal_metadata_df.replace("-", np.nan)
missing_mask = ca_minimal_metadata_df["GEO_gsm-title"].isna()
print("Missing GSM titles:", missing_mask.sum())

# GSM accessions to query on GEO. Rows without an accession are skipped since
# there is nothing to download for them.
missing_titles = (
    ca_minimal_metadata_df.loc[missing_mask, "GEO_GSM"].dropna().to_list()
)

In [None]:
def download_GEO_file(
    GEO: str, logdir: str | Path, amount: str = "quick", verbose: bool = True
):
    """
    Downloads a GEO (GSM) accession file and saves it to the specified log directory.

    Args:
        GEO (str): The GEO accession number (e.g., "GSM123456").
        logdir (str): Directory to save the downloaded file.
        amount (str): Level of detail for the file. Options: 'full', 'brief', 'quick', 'data'.
                      Default is 'full'.

    Returns:
        str: Path to the saved file.
    """
    # Ensure GEO is uppercase
    GEO = GEO.upper()

    # Validate the accession type
    if not GEO.startswith("GSM"):
        raise ValueError("Only GSM accession numbers are supported.")

    # Ensure logdir exists
    os.makedirs(logdir, exist_ok=True)

    # Construct the URL
    gseurl = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
    myurl = f"{gseurl}?targ=self&acc={GEO}&form=text&view={amount}"

    # Define the destination file path
    destfile = os.path.join(logdir, f"{GEO}.soft")
    if os.path.exists(destfile):
        if verbose:
            print(f"File already exists: {destfile}")
        return destfile

    try:
        # Download the file
        response = requests.get(myurl, stream=True)
        response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)

        # Save the file
        with open(destfile, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)

        if verbose:
            print(f"File saved: {destfile}")
        return destfile

    except requests.exceptions.RequestException as e:
        print(f"Error downloading {GEO}: {e}")
        return None

In [None]:
logdir = ca_metadata_dir / "GSM_metadata"
logdir.mkdir(exist_ok=True)

# Fetch (or reuse) one file per accession with a missing title; failed
# downloads return a falsy value and are left out.
download_results = (
    download_GEO_file(GEO, logdir, amount="quick", verbose=False)
    for GEO in missing_titles
)
meta_paths = [Path(result) for result in download_results if result]

In [None]:
# Extract the sample title from each downloaded file, keyed by GSM accession
# (the file stem).
missing_title_dict = {}
for filepath in meta_paths:
    gsm = filepath.stem
    with open(filepath, "r", encoding="utf-8") as f:
        # Look for the title line instead of assuming it is the second line,
        # so short or differently ordered files fail with a clear error.
        title_line = next(
            (line for line in f if line.startswith("!Sample_title")), None
        )

    if title_line is None:
        raise ValueError(f"Title not found for {gsm}")

    # Split on the first "=" only: the title itself may contain "=".
    _, separator, title = title_line.partition("=")
    if not separator:
        raise ValueError(f"Title not found for {gsm}")

    missing_title_dict[gsm] = title.strip()

with open(logdir / "GSM_title.json", "w", encoding="utf-8") as f:
    json.dump(missing_title_dict, f, indent=4)

In [None]:
# Use the downloaded title when one exists for the accession; otherwise keep
# the value already present in the table.
ca_minimal_metadata_df["GEO_gsm-title"] = (
    ca_minimal_metadata_df["GEO_GSM"]
    .map(missing_title_dict)
    .fillna(ca_minimal_metadata_df["GEO_gsm-title"])
)

In [None]:
# Save the completed table under a new name; the original file is left untouched.
ca_minimal_metadata_df.to_csv(
    ca_metadata_dir / "CA_minimal_metadata_20240606_mod.tsv", sep="\t", index=False
)

## Cancer / Sex / Age metadata categories

`CA_metadata_FW_20240917` contains new metadata categories (cancer/sex/age) created from analyzing more complete metadata.

In [None]:
# Load the table holding the cancer/sex/age metadata categories.
ca_custom_metadata_path = metadata_dir / "chip_atlas" / "CA_metadata_FW_20240917.tsv"
ca_custom_metadata_df = pd.read_csv(ca_custom_metadata_path, sep="\t", low_memory=False)
print(ca_custom_metadata_df.shape)

In [None]:
# TODO: Integrate how CA_metadata_FW_20240917 was created.

## Merge all metadata

In [None]:
# Each table is joined on its first column, presumed to hold the experiment ID.
col1 = ca_minimal_metadata_df.columns[0]
col2 = ca_metadata_df.columns[0]

# Minimal metadata + ChIP-Atlas website metadata; the right-hand key is dropped
# once the join is done.
meta_df = ca_minimal_metadata_df.merge(
    ca_metadata_df, how="left", left_on=col1, right_on=col2
).drop(col2, axis=1)

# ... + cancer/sex/age categories, with the ChIP-Atlas title renamed so its
# origin is explicit.
col2 = ca_custom_metadata_df.columns[0]
meta_df = (
    meta_df.merge(ca_custom_metadata_df, how="left", left_on=col1, right_on=col2)
    .drop(col2, axis=1)
    .rename({"Title": "C-A_title"}, axis=1)
)

In [None]:
# Use a single "unknown" label for both missing values and cells that are
# exactly "Unclassified".
meta_df = meta_df.fillna("unknown")
meta_df = meta_df.replace("Unclassified", "unknown")

In [None]:
# Save the joined metadata table.
meta_df.to_csv(ca_metadata_dir / "CA_metadata_joined_20250306.tsv", sep="\t", index=False)

### Explicit assay and biospecimen counts

In [None]:
# Keep only the rows whose EpiAtlas flag is "0".
# NOTE(review): the comparison is against the string "0"; confirm the column
# is read as text, otherwise no row matches.
df_no_epiatlas = meta_df[meta_df["is_EpiAtlas_EpiRR"] == "0"].copy()
print(meta_df.shape, df_no_epiatlas.shape)

In [None]:
df_biospecimens = df_no_epiatlas.copy(deep=True)

# Number of non-missing "Cell_type" values within each "Cell_type_class"
group_sizes = df_biospecimens.groupby("Cell_type_class")["Cell_type"].count()

# "Cell_type_class" labels ordered from the largest class to the smallest
sorted_classes = group_sizes.sort_values(ascending=False).index

# Count each (class, cell type) pair, then reorder the rows so that classes
# follow `sorted_classes`
sorted_groupby = (
    df_biospecimens.groupby(["Cell_type_class", "Cell_type"], dropna=False)
    .size()
    .reset_index(name="count")
    .set_index("Cell_type_class")
    .loc[sorted_classes]
).reset_index()

In [None]:
sorted_groupby.columns = ["Cell_type_class", "Cell_type", "count"]

# Create the output directory if needed: `to_csv` does not create missing
# parent directories and would fail on a fresh checkout.
output_dir = base_dir / "tables" / "datasets_composition"
output_dir.mkdir(parents=True, exist_ok=True)
sorted_groupby.to_csv(output_dir / "ChIP-Atlas_biospecimens.csv", index=False)

In [None]:
# Number of non-EpiAtlas datasets per manual target (missing targets are kept
# as their own group), largest first.
assays_df = (
    df_no_epiatlas.groupby("manual_target_consensus", dropna=False)
    .size()
    .sort_values(ascending=False)
    .to_frame(name="count")
)
# `output_dir` is defined in the biospecimen cell above.
assays_df.to_csv(output_dir / "ChIP-Atlas_assays.csv")