In [None]:
"""Notebook for code related to importing/downloading results from remote sources."""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, too-many-branches

In [None]:
from __future__ import annotations

import shutil
from pathlib import Path
from typing import List

In [None]:
# Name of the metadata column holding the assay label; used further down as a
# groupby key when comparing our metadata with the IHEC table.
ASSAY = "assay_epiclass"

In [None]:
# Local output locations: a root folder under the user's home directory, with
# dedicated "data" and "figures" subfolders.
base_dir = Path.home() / "Projects/epiclass/output/paper"
base_data_dir = base_dir / "data"
base_fig_dir = base_dir / "figures"

### Download files from remote server, via python

In [None]:
def determine_remote_base(version: str) -> Path:
    """Return the mounted remote directory that holds the logs for `version`.

    Args:
        version: Either "v1" or "v2".

    Returns:
        Path under the "narval-mount" mount point of the user's home directory.

    Raises:
        ValueError: If `version` is neither "v1" nor "v2".
    """
    if version == "v1":
        relative_logs = "projects/rrg-jacquesp-ab/rabyj/epiclass-project/output/epiclass-logs/2023-01-epiatlas-freeze/hg38_100kb_all_none"
    elif version == "v2":
        relative_logs = "logs-dfreeze-2.1/hg38_100kb_all_none"
    else:
        raise ValueError(f"Unknown version '{version}'. Use 'v1' or 'v2'.")

    mount_point = Path.home() / "mounts" / "narval-mount"
    return mount_point / relative_logs

In [None]:
def _copy_or_report(files, destination: Path, dry_run: bool) -> None:
    """Copy each file into `destination`, or only print the planned copy on a dry run."""
    for file in files:
        if dry_run:
            print(f"Copying {file} to {destination}.")
        else:
            shutil.copy(file, destination)


def copy_dfreeze_NN_results(version: str = "v1", dry_run: bool = True):
    """Copy the results of the dfreeze NN runs to the local machine.

    Files are written under `base_data_dir / "dfreeze_<version>"`, grouped by
    category (parent folder name without its "_1l_3000n" suffix) and by run.

    Args:
        version: The version of the dfreeze NN results to copy. v1 or v2.
        dry_run: If True, only print what would be copied; nothing is written
            to disk (no files copied, no directories created).

    Raises:
        FileNotFoundError: If the remote directory for `version` does not exist.
        ValueError: If `version` is unknown (raised by `determine_remote_base`).
    """
    if dry_run:
        print("Dry run, no files will be copied.")

    remote_data_dir = determine_remote_base(version)

    if not remote_data_dir.exists():
        raise FileNotFoundError(f"Directory {remote_data_dir} does not exist.")

    # Find general subfolders (one subfolder = one 10-fold training run),
    # ignoring non-NN results and more.
    exclude_dir = ["epirr_correlation", "hdf5_stats", "no_input"]
    exclude_substrings = [
        "l1",
        "no-valid",
        "no_valid",
        "w-unknown",
        "predict",
    ]
    subfolders = sorted(
        subfolder
        for folder in remote_data_dir.glob("*")
        if folder.is_dir() and folder.name not in exclude_dir
        for subfolder in folder.glob("*")
        if subfolder.is_dir()
        and not any(substring in subfolder.name for substring in exclude_substrings)
    )

    # Desired files.
    # Note: failed-job files and overwritten results can leave several matches
    # per pattern in a subfolder. This was checked manually with
    # "cat output_*.o | grep -E "Split . training size" | wc  -l".
    desired_files = [
        "full-10fold-validation.*",
        "full-10fold-validation_prediction.csv",
        "output_job*.*",
        "launch_script_*.sh",
    ]

    output_dir = base_data_dir / f"dfreeze_{version}"
    category_suffix = "_1l_3000n"

    # Copying items to local
    for subfolder in subfolders:
        # Remove the exact suffix only: str.rstrip() would strip any trailing
        # run of the characters "_", "1", "l", "3", "0", "n" and could eat the
        # end of the category name itself.
        category_label = subfolder.parent.name
        if category_label.endswith(category_suffix):
            category_label = category_label[: -len(category_suffix)]

        run_output_dir = output_dir / category_label / subfolder.name
        if not dry_run:
            run_output_dir.mkdir(exist_ok=True, parents=True)

        for pattern in desired_files:
            _copy_or_report(subfolder.glob(pattern), run_output_dir, dry_run)

        # Get individual split results
        for split_result in subfolder.glob("split*/validation_prediction.csv"):
            split_output_dir = run_output_dir / split_result.parent.name
            if not dry_run:
                split_output_dir.mkdir(exist_ok=True, parents=True)
            _copy_or_report([split_result], split_output_dir, dry_run)

Note: harmonized_sample_disease_high_1l_3000n had overwritten results for split7

In [None]:
# copy_dfreeze_NN_results("v1", dry_run=False)

In [None]:
def copy_dfreeze_nn_results(version: str = "v1", dry_run: bool = True) -> None:
    """
    Initiates file copy once "splitX" directories are found, preserving intermediate directory structure.

    Args:
        version: Version of the dfreeze NN results to copy. Options are "v1" or "v2".
        dry_run: If True, simulates the copy operation without performing it.

    Raises:
        FileNotFoundError: If the remote directory for `version` does not exist.
        ValueError: If `version` is unknown (raised by `determine_remote_base`).
    """
    remote_base_dir = determine_remote_base(version)

    # Globbing a missing directory yields nothing, so without this check an
    # unmounted remote would look like a successful run that copied no file.
    if not remote_base_dir.exists():
        raise FileNotFoundError(f"Directory {remote_base_dir} does not exist.")

    desired_files_patterns = [
        "full-10fold-validation.*",
        "full-10fold-validation_prediction.csv",
        "output_job*.*",
        "launch_script_*.sh",
    ]
    search_and_copy_with_split_detection(
        remote_base_dir, desired_files_patterns, version, dry_run
    )


def contains_split_folders(dir_path: Path) -> bool:
    """
    Tells whether `dir_path` has at least one direct subdirectory named "split<digits>".

    Args:
        dir_path: The directory whose direct children are inspected.

    Returns:
        True as soon as one matching subdirectory is found, False otherwise.
    """
    prefix = "split"
    return any(
        entry.is_dir()
        and entry.name.startswith(prefix)
        and entry.name[len(prefix) :].isdigit()
        for entry in dir_path.iterdir()
    )


def copy_files_from_dir(
    dir_path: Path, patterns: List[str], version: str, dry_run: bool, base_dir: Path
) -> None:
    """
    Copies files matching patterns from the specified directory, preserving the directory structure in the output.

    The destination is `base_data_dir / "dfreeze_<version>" / <dir_path relative to base_dir>`.

    Args:
        dir_path: The directory from which to copy files.
        patterns: Glob patterns for the files to copy.
        version: The version, for output directory organization.
        dry_run: If True, simulates the file copying; nothing is written to
            disk (no files copied, no directories created).
        base_dir: The base directory of the search, used to preserve relative paths.

    Raises:
        ValueError: If `dir_path` is not located under `base_dir`.
    """
    # The destination only depends on the directory, not on each matched file.
    relative_path = dir_path.relative_to(base_dir)
    output_dir = Path(base_data_dir) / f"dfreeze_{version}" / relative_path

    for pattern in patterns:
        for file in dir_path.glob(pattern):
            output_file_path = output_dir / file.name

            if dry_run:
                print(f"Would copy {file} to {output_file_path}.")
                continue

            # Created lazily so that directories without matching files (and
            # dry runs) leave no empty folders behind.
            output_dir.mkdir(parents=True, exist_ok=True)
            shutil.copy(file, output_file_path)
            print(f"Copied {file} to {output_file_path}.")


def search_and_copy_with_split_detection(
    base_dir: Path, patterns: List[str], version: str, dry_run: bool
) -> None:
    """
    Walks every directory under `base_dir`; for each one holding 'split*' subfolders,
    copies the requested files from it and the prediction file from each split.

    Args:
        base_dir: Root directory of the recursive search.
        patterns: Glob patterns of the files to copy from a run directory.
        version: Version label, forwarded to organize the output directory.
        dry_run: If True, the copy is only simulated.
    """
    for candidate_dir in base_dir.glob("**/"):  # every directory, recursively
        split_children = []
        for entry in candidate_dir.glob("split*"):
            if entry.is_dir():
                split_children.append(entry)

        # Directories without any 'split*' subfolder are not training runs.
        if not split_children:
            continue

        # Files of the run directory itself
        copy_files_from_dir(candidate_dir, patterns, version, dry_run, base_dir)

        # Per-split prediction files
        for split_child in split_children:
            copy_files_from_dir(
                split_child, ["validation_prediction.csv"], version, dry_run, base_dir
            )

In [None]:
# copy_dfreeze_nn_results("v2", dry_run=False)

### Compare our metadata with IHEC official

In [None]:
import numpy as np
import pandas as pd

from epi_ml.core.metadata import Metadata

# Folder holding the metadata files used by the comparison cells.
metadata_basedir = base_data_dir / "metadata"

# Metadata table stored under the "official" subfolder, read with its first row as header.
ihec_metadata_file = metadata_basedir.joinpath(
    "official", "EpiATLAS_experiment_metadata_11032024.csv"
)
ihec_meta_df = pd.read_csv(ihec_metadata_file, header=0)

# Our own metadata: one dataframe row per entry of `my_meta.datasets`.
my_meta_path = metadata_basedir.joinpath(
    "hg38_2023-epiatlas-dfreeze-pospurge-nodup_filterCtl.json"
)
my_meta = Metadata(my_meta_path)
my_meta_df = pd.DataFrame.from_records(list(my_meta.datasets))

In [None]:
# Same epirrs?
# Each candidate column name is checked in turn. `epirr_label` is set explicitly
# (instead of relying on the leaked loop variable) to the last examined
# candidate that exists in BOTH frames, because later cells use it as a key.
epirr_label = None
for candidate_label in ["EpiRR", "epirr_id"]:
    if (
        candidate_label not in ihec_meta_df.columns
        or candidate_label not in my_meta_df.columns
    ):
        print(f"warning: No {candidate_label} column in one of the metadata files.")
        continue

    epirr_label = candidate_label
    if set(ihec_meta_df[candidate_label].unique()) != set(
        my_meta_df[candidate_label].unique()
    ):
        print(f"warning: Not the same {candidate_label} in each df.")
        break

# Fail here with a clear message rather than later on an obscure KeyError.
if epirr_label is None:
    raise KeyError(
        "No common EpiRR column ('EpiRR' or 'epirr_id') found in both metadata files."
    )

In [None]:
def _as_lowercase_text(frame: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `frame` with every value as a lowercase string and missing values as "".

    The missing-value mask is taken from the original column: once a column has
    been cast with `astype(str)`, NaN/None are already the strings "nan"/"None"
    and can no longer be detected or replaced.
    """
    return frame.apply(
        lambda column: column.astype(str).str.lower().where(column.notna(), "")
    )


# Sort by EpiRR, then normalise all values to lowercase text so that both
# frames can be compared value by value.
ihec_meta_df = _as_lowercase_text(ihec_meta_df.sort_values(by=epirr_label))
my_meta_df = _as_lowercase_text(my_meta_df.sort_values(by=epirr_label))

In [None]:
# Compare content by EpiRR+experiment_type
# Maps "<assay_type>-<experiment_type>" (lowercased) to the label used in the
# ASSAY column; combinations absent from the mapping become NaN and are thus
# left out of the groupby below.
assay_epiclass_dict = {
    "chip-seq-h3k27ac": "h3k27ac",
    "chip-seq-h3k27me3": "h3k27me3",
    "chip-seq-h3k36me3": "h3k36me3",
    "chip-seq-h3k4me1": "h3k4me1",
    "chip-seq-h3k4me3": "h3k4me3",
    "chip-seq-h3k9me3": "h3k9me3",
    "rna-seq-mrna-seq": "mrna_seq",
    "rna-seq-total-rna-seq": "rna_seq",
    "wgbs-standard": "wgbs-standard",
    "wgbs-pbat": "wgbs-pbat",
}
ihec_meta_df[ASSAY] = (
    ihec_meta_df["assay_type"] + "-" + ihec_meta_df["experiment_type"]
).map(assay_epiclass_dict)

# One array of unique uuids per (EpiRR, assay) pair, for each source.
ihec_all_uuids = ihec_meta_df.groupby([epirr_label, ASSAY])["uuid"].unique()
my_meta_all_uuids = my_meta_df.groupby([epirr_label, ASSAY])["uuid"].unique()

# Rows are (epirr, assay, ihec uuid, our uuid); "" marks a pair missing from
# one source. Only the first uuid of each pair is compared.
discrepancies = []

# Pass 1: every pair known to IHEC -> missing on our side, or different uuid.
for pair in ihec_all_uuids.index:
    epirr_id, experiment_type = pair
    ihec_uuid = ihec_all_uuids[pair][0]
    if pair not in my_meta_all_uuids.index:
        discrepancies.append((epirr_id, experiment_type, ihec_uuid, ""))
        continue
    my_uuid = my_meta_all_uuids[pair][0]
    if ihec_uuid != my_uuid:
        discrepancies.append((epirr_id, experiment_type, ihec_uuid, my_uuid))

# Pass 2: only pairs that IHEC does not have. Pairs present in both sources
# were already compared in pass 1; comparing them again would report every
# uuid mismatch twice.
for pair in my_meta_all_uuids.index:
    if pair in ihec_all_uuids.index:
        continue
    epirr_id, experiment_type = pair
    discrepancies.append((epirr_id, experiment_type, "", my_meta_all_uuids[pair][0]))

In [None]:
# The "uuid" prefix keeps this report separate from the per-EpiRR column
# comparison written further down, which uses the plain "discrepancies_" name;
# with identical names the later write overwrote this file.
output_path = (
    metadata_basedir
    / f"discrepancies_uuid_{my_meta_path.stem}_VS_{ihec_metadata_file.stem}.csv"
)
pd.DataFrame(
    discrepancies,
    columns=[
        epirr_label,
        "experiment_type",
        f"uuid:{ihec_metadata_file.stem}",
        f"uuid:{my_meta_path.stem}",
    ],
).to_csv(output_path, index=False)

In [None]:
# Compare content by EpiRR
# Only works if IHEC metadata contains one line per epirr
# Index our metadata by EpiRR. The membership test keeps the cell re-runnable:
# once the column has been moved into the index there is nothing left to do.
if epirr_label in my_meta_df.columns:
    my_meta_df = my_meta_df.set_index(epirr_label)

ihec_epirrs = ihec_meta_df[epirr_label]
# Each EpiRR is examined once per column, even when the IHEC table has several
# rows for it; iterating over every row would repeat the same discrepancy.
unique_epirrs = ihec_epirrs.unique()

# Rows are (epirr, column name, our value, IHEC value).
discrepancies = []
for column_name in ihec_meta_df.columns:
    # Columns absent from our metadata cannot be compared.
    if column_name not in my_meta_df.columns:
        print(f"Skipping {column_name} column")
        continue
    my_column = my_meta_df[column_name]
    ihec_column = ihec_meta_df[column_name]

    for epirr in unique_epirrs:
        try:
            my_vals = my_column[epirr]
        except KeyError:
            # EpiRR absent from our metadata: nothing to compare.
            continue

        # A label lookup returns a Series when several of our rows share the
        # EpiRR, and a plain string otherwise.
        if isinstance(my_vals, pd.Series):
            my_vals = set(my_vals.unique())
        elif isinstance(my_vals, str):
            my_vals = {my_vals}
        else:
            raise ValueError(
                f"Unknown type for '{column_name}' for '{epirr}': {type(my_vals)}"
            )

        ihec_vals = set(ihec_column[ihec_epirrs == epirr].unique())

        if my_vals != ihec_vals:
            if len(my_vals) != 1:
                raise ValueError(f"Multiple {column_name} values for {epirr}: {my_vals}")
            discrepancies.append(
                (epirr, column_name, sorted(my_vals)[0], sorted(ihec_vals)[0])
            )

In [None]:
# Save the per-EpiRR column discrepancies next to the metadata files.
output_path = metadata_basedir.joinpath(
    f"discrepancies_{my_meta_path.stem}_VS_{ihec_metadata_file.stem}.csv"
)
pd.DataFrame(
    data=discrepancies,
    columns=[epirr_label, "Category", "Our value", "IHEC value"],
).to_csv(path_or_buf=output_path)