In [None]:
"""
Metadata for ChIP-Atlas datasets
"""
# pylint: disable=redefined-outer-name, import-error

In [None]:
from __future__ import annotations

import json
import os
from pathlib import Path

import pandas as pd
import requests
from IPython.display import display  # pylint: disable=unused-import

In [None]:
# Root folder of the paper outputs; every other location below derives from it.
base_dir = Path.home().joinpath("Projects", "epiclass", "output", "paper")
paper_dir = base_dir

# <base_dir>/data, <base_dir>/figures and <base_dir>/data/metadata
base_data_dir = base_dir.joinpath("data")
base_fig_dir = base_dir.joinpath("figures")
metadata_dir = base_data_dir.joinpath("metadata")

# <base_dir>/data/training_results/predictions
predictions_dir = base_data_dir.joinpath("training_results", "predictions")

In [None]:
# Folder holding every ChIP-Atlas metadata file read or written by this notebook.
ca_metadata_dir = metadata_dir.joinpath("chip_atlas")

## ChIP-Atlas website download

How to obtain some of the metadata:

```bash
wget https://chip-atlas.dbcls.jp/data/metadata/experimentList.tab
grep -E "^[DESRX]{3}[0-9]{4,8}\shg38\s" experimentList.tab > experimentList_hg38.tab
grep -vE "^[DESRX]{3}[0-9]{4,8}\shg38\s[ATAC,DNASE,Bisulfate,RNA]" experimentList_hg38.tab > experimentList_hg38_chip.tab
cut -f1,3-7,9- experimentList_hg38_chip.tab | sponge experimentList_hg38_chip.tab # Removing col 2 and 8.
```

Column names follow those given on the [wiki](https://github.com/inutano/chip-atlas/wiki#tables-summarizing-metadata-and-files). 'assembly' and 'Processing_logs_of_chip_ATAC_DNASE' were removed (columns 2 and 8 in the `cut` command above), and the column that follows is observed to hold the title. The downloaded tab file does not match the columns described on the wiki: the column 'Processing logs of Bisulfite-seq' actually contains the title for ChIP experiments, whereas I would expect it to be empty. The varying number of fields per line made the handling much more bothersome.

In [None]:
metadata_path = ca_metadata_dir / "experimentList_hg38_chip_20250306.tab"
new_file_name = ca_metadata_dir / (metadata_path.stem + "_formatted.tab")

# The raw table has a varying number of tab-separated fields per line.
# The first 7 fields are kept as-is; every remaining non-empty field is merged
# into a single list-like column so the file can be read as a regular table.
# The formatted file is only written when it does not exist yet.
if not new_file_name.exists():
    with open(metadata_path, "r", encoding="utf8") as f:
        lines = f.readlines()

    # Merging all lines past the title
    new_file = []
    for line in lines:
        # Strip the line terminator BEFORE splitting: otherwise a line with 7
        # fields or fewer keeps "\n" inside its last core field, which breaks
        # the output row in two.
        line = line.rstrip("\n")
        if not line.strip():
            # Blank lines carry no record.
            continue

        elems = line.split("\t")
        core = elems[0:7]
        rest = elems[7:]

        rest = [x.strip() for x in rest]
        rest = [x for x in rest if x]

        new_line = "\t".join(core) + "\t" + str(rest)
        new_file.append(new_line)

    new_file.insert(
        0,
        "Experimental_ID\tTrack_type_class\tTrack_type\tCell_type_class\tCell_type\tCell_type_description\tTitle\tMeta_data_submitted_by_authors",
    )

    with open(new_file_name, "w", encoding="utf8") as f:
        f.write("\n".join(new_file))

ca_metadata_df = pd.read_csv(new_file_name, sep="\t", low_memory=False)
print(ca_metadata_df.shape)

## Minimal DB matching metadata

Minimal metadata created from `CA_metadata_4DB+all_pred.20240606_mod3.0.tsv`.  
Acquired from 4 databases.  
I mostly kept the ids and targets from the different databases.  

In [None]:
# Tab-separated minimal metadata table; its shape is printed as a quick check.
minimal_metadata_path = ca_metadata_dir.joinpath("CA_minimal_metadata_20240606.tsv")
ca_minimal_metadata_df = pd.read_csv(
    minimal_metadata_path,
    sep="\t",
    low_memory=False,
)
print(ca_minimal_metadata_df.shape)

Some GSM titles were missing from the old work, so I re-downloaded the metadata from GEO.

In [None]:
# Treat "-" entries as missing values (done in place on the loaded table).
ca_minimal_metadata_df.replace("-", None, inplace=True)

# Rows without a GEO title, and how many there are.
title_is_missing = ca_minimal_metadata_df["GEO_gsm-title"].isna()
print(title_is_missing.sum())

# GSM accessions of the rows whose title is missing.
missing_titles = ca_minimal_metadata_df.loc[title_is_missing, "GEO_GSM"].to_list()

In [None]:
def download_GEO_file(
    GEO: str,
    logdir: str | Path,
    amount: str = "quick",
    verbose: bool = True,
    timeout: float = 60.0,
) -> str | None:
    """
    Downloads a GEO (GSM) accession file and saves it to the specified log directory.

    The file is first streamed to a temporary "<GEO>.soft.part" file and only
    renamed to "<GEO>.soft" once the download completed, so an interrupted
    download never leaves a truncated file that a later call would mistake for
    a finished one.

    Args:
        GEO (str): The GEO accession number (e.g., "GSM123456"). Case-insensitive.
        logdir (str | Path): Directory to save the downloaded file. Created if missing.
        amount (str): Level of detail for the file. Options: 'full', 'brief', 'quick', 'data'.
                      Default is 'quick'.
        verbose (bool): Print a message when the file already exists or was saved.
        timeout (float): Seconds to wait for the server before giving up.

    Returns:
        str | None: Path to the saved (or already existing) file, or None if the
        download failed.

    Raises:
        ValueError: If the accession does not start with "GSM".
    """
    # Ensure GEO is uppercase
    GEO = GEO.upper()

    # Validate the accession type
    if not GEO.startswith("GSM"):
        raise ValueError("Only GSM accession numbers are supported.")

    # Ensure logdir exists
    os.makedirs(logdir, exist_ok=True)

    # Construct the URL
    gseurl = "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi"
    myurl = f"{gseurl}?targ=self&acc={GEO}&form=text&view={amount}"

    # Define the destination file path
    destfile = os.path.join(logdir, f"{GEO}.soft")
    if os.path.exists(destfile):
        if verbose:
            print(f"File already exists: {destfile}")
        return destfile

    # Partial downloads go here; only a complete file gets the final name.
    partfile = destfile + ".part"

    try:
        # The context manager releases the connection even if streaming fails;
        # the timeout prevents a stalled server from blocking forever.
        with requests.get(myurl, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)

            with open(partfile, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        # Publish the file only now that it is known to be complete.
        os.replace(partfile, destfile)

    except requests.exceptions.RequestException as e:
        print(f"Error downloading {GEO}: {e}")
        return None

    finally:
        # Remove any leftover partial file (no-op after a successful rename).
        if os.path.exists(partfile):
            os.remove(partfile)

    if verbose:
        print(f"File saved: {destfile}")
    return destfile

In [None]:
# Folder receiving one "<GSM>.soft" file per accession.
logdir = ca_metadata_dir / "GSM_metadata"
logdir.mkdir(exist_ok=True)

# Fetch the "quick" record of every accession that lacks a title.
download_results = [
    download_GEO_file(gsm_id, logdir, amount="quick", verbose=False)
    for gsm_id in missing_titles
]

# Failed downloads return None and are left out.
meta_paths = [Path(result) for result in download_results if result]

In [None]:
# Map each downloaded GSM accession (taken from the file name) to its title,
# which is expected on the second line of the file as "!Sample_title = <title>".
missing_title_dict = {}
for filepath in meta_paths:
    gsm = filepath.stem
    with open(filepath, "r", encoding="utf-8") as f:
        # Only the second line is needed; readline() returns "" past the end
        # of a short file, which then fails the check below with a clear error.
        f.readline()
        title_line = f.readline()

    # Split on the FIRST "=" only, so titles that contain "=" are kept whole.
    _, separator, title = title_line.partition("=")
    if not title_line.startswith("!Sample_title") or not separator:
        raise ValueError(f"Title not found for {gsm}")

    missing_title_dict[gsm] = title.strip()

# Keep a record of the recovered titles next to the downloaded files.
with open(logdir / "GSM_title.json", "w", encoding="utf-8") as f:
    json.dump(missing_title_dict, f, indent=4)

In [None]:
# Titles recovered from GEO take precedence; accessions absent from the
# recovered mapping keep the title they already had.
recovered_titles = ca_minimal_metadata_df["GEO_GSM"].map(missing_title_dict)
previous_titles = ca_minimal_metadata_df["GEO_gsm-title"]
ca_minimal_metadata_df["GEO_gsm-title"] = recovered_titles.fillna(previous_titles)

In [None]:
# Write the minimal metadata (with the filled-in titles) as a tab-separated file.
minimal_metadata_mod_path = ca_metadata_dir / "CA_minimal_metadata_20240606_mod.tsv"
ca_minimal_metadata_df.to_csv(minimal_metadata_mod_path, sep="\t", index=False)

## Cancer / Sex / Age metadata categories

`CA_metadata_FW_20240917` contains new metadata categories (cancer/sex/age) created from analyzing more complete metadata.

In [None]:
# Tab-separated table with the custom metadata categories; print its shape as a check.
ca_custom_metadata_path = metadata_dir.joinpath("chip_atlas", "CA_metadata_FW_20240917.tsv")
ca_custom_metadata_df = pd.read_csv(
    ca_custom_metadata_path,
    sep="\t",
    low_memory=False,
)
print(ca_custom_metadata_df.shape)

## Merge all metadata

In [None]:
# Each table is joined on its first column.
# NOTE(review): presumably the first column is the experiment ID in all three
# tables — confirm against the files.
col1 = ca_minimal_metadata_df.columns[0]
col2 = ca_metadata_df.columns[0]

# Left joins: every row of the minimal metadata is kept.
meta_df = ca_minimal_metadata_df.merge(
    ca_metadata_df, how="left", left_on=col1, right_on=col2
)
# When both key columns share a name, pandas keeps a single key column;
# dropping it would remove the join key itself, so only drop a distinct one.
if col2 != col1:
    meta_df = meta_df.drop(columns=col2)

col2 = ca_custom_metadata_df.columns[0]
meta_df = meta_df.merge(ca_custom_metadata_df, how="left", left_on=col1, right_on=col2)
if col2 != col1:
    meta_df = meta_df.drop(columns=col2)

# Rename the "Title" column to "C-A_title".
meta_df = meta_df.rename(columns={"Title": "C-A_title"})

In [None]:
# Missing values and the "Unclassified" label are both mapped to "unknown".
meta_df = meta_df.fillna("unknown").replace("Unclassified", "unknown")

In [None]:
# Write the joined metadata as a tab-separated file, without the row index.
joined_metadata_path = ca_metadata_dir / "CA_metadata_joined_20250306.tsv"
meta_df.to_csv(joined_metadata_path, sep="\t", index=False)

### Explicit assay and biospecimen counts

In [None]:
df_biospecimens = meta_df.copy(deep=True)

# Non-null "Cell_type" entries per "Cell_type_class"; classes ordered from the
# largest to the smallest.
group_sizes = df_biospecimens.groupby("Cell_type_class")["Cell_type"].count()
sorted_classes = group_sizes.sort_values(ascending=False).index

# Row count of every (class, cell type) pair, as a flat table.
pair_counts = df_biospecimens.groupby(
    ["Cell_type_class", "Cell_type"], dropna=False
).size()
pair_table = pair_counts.reset_index(name="count")

# Reorder the rows so that the classes follow `sorted_classes`.
rows_by_class = pair_table.set_index("Cell_type_class")
sorted_groupby = rows_by_class.loc[sorted_classes].reset_index()

In [None]:
sorted_groupby.columns = ["Cell_type_class", "Cell_type", "count"]

output_dir = base_dir / "tables" / "datasets_composition"
# to_csv does not create missing folders: make sure the output folder exists so
# the notebook also runs on a fresh output tree.
output_dir.mkdir(parents=True, exist_ok=True)
sorted_groupby.to_csv(output_dir / "ChIP-Atlas_biospecimens.csv", index=False)

In [None]:
# Number of rows per "manual_target_consensus" value (missing values included),
# from the most to the least frequent.
target_counts = meta_df.groupby("manual_target_consensus", dropna=False).size()
assays_df = target_counts.sort_values(ascending=False).to_frame(name="count")
assays_df.to_csv(output_dir / "ChIP-Atlas_assays.csv")