In [None]:
"""Notebook for code related to importing/downloading results from remote sources."""
# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, too-many-branches

In [None]:
from __future__ import annotations

import shutil
from pathlib import Path

In [None]:
# Local output tree: data and figures are both subfolders of base_dir.
base_dir = Path.home().joinpath("Projects", "epiclass", "output", "paper")
base_data_dir = base_dir.joinpath("data")
base_fig_dir = base_dir.joinpath("figures")

In [None]:
def copy_dfreeze_v1_NN_results():
    """Copy the results of the dfreeze-v1 NN runs to the local machine.

    Source: the ``hg38_100kb_all_none`` log directory under
    ``~/mounts/narval-mount``. Each first-level folder is treated as a category
    and each of its subfolders as one 10-fold training run.

    Destination: ``base_data_dir / "dfreeze_v1" / <category> / <run>``, where
    ``<category>`` is the category folder name without its ``_1l_3000n`` suffix.

    For every retained run, the following are copied:
      - ``full-10fold-validation.csv``, ``output_job*.*``, ``launch_script_*.sh``
      - ``split*/validation_prediction.csv`` (into a matching ``split*`` subfolder)

    Raises:
        FileNotFoundError: If the remote results directory does not exist
            (presumably because the remote filesystem is not mounted).
    """
    remote_base = Path.home() / "mounts" / "narval-mount"
    remote_data_dir = (
        remote_base
        / "projects/rrg-jacquesp-ab/rabyj/epiclass-project/output/epiclass-logs/2023-01-epiatlas-freeze/hg38_100kb_all_none"
    )

    if not remote_data_dir.exists():
        raise FileNotFoundError(f"Directory {remote_data_dir} does not exist.")

    # Category folders that never contain results of interest
    exclude_dir = ["epirr_correlation", "hdf5_stats", "no_input"]

    # Ignore non-NN results and more
    exclude_substrings = [
        "l1",
        "no-valid",
        "no_valid",
        "dfreeze-v2",
        "w-unknown",
        "predict",
    ]

    # Find general subfolders (one subfolder = one 10-fold training run)
    subfolders = set()
    for folder in sorted(remote_data_dir.glob("*")):
        if not folder.is_dir() or folder.name in exclude_dir:
            continue
        for subfolder in folder.glob("*"):
            if not subfolder.is_dir():
                continue
            if any(substring in subfolder.name for substring in exclude_substrings):
                continue
            subfolders.add(subfolder)

    desired_files = ["full-10fold-validation.csv", "output_job*.*", "launch_script_*.sh"]

    # NOTE: run folders may contain files from failed jobs or overwritten results.
    # Using "cat output_*.o | grep -E "Split . training size" | wc  -l" in a run
    # folder confirms whether its splits are unique.

    output_dir = base_data_dir / "dfreeze_v1"

    # Remove the suffix as a whole: str.rstrip() would treat it as a set of
    # characters and also eat the end of names such as "..._column".
    category_suffix = "_1l_3000n"

    # Copying items to local
    for subfolder in sorted(subfolders):
        category_label = subfolder.parent.name
        if category_label.endswith(category_suffix):
            category_label = category_label[: -len(category_suffix)]

        # Created even when nothing matches, so every retained run has a folder.
        run_output_dir = output_dir / category_label / subfolder.name
        run_output_dir.mkdir(exist_ok=True, parents=True)

        for pattern in desired_files:
            for source_file in subfolder.glob(pattern):
                shutil.copy(source_file, run_output_dir)

        # Get individual split results
        for split_result in subfolder.glob("split*/validation_prediction.csv"):
            split_output_dir = run_output_dir / split_result.parent.name
            split_output_dir.mkdir(exist_ok=True, parents=True)
            shutil.copy(split_result, split_output_dir)

Note: in `harmonized_sample_disease_high_1l_3000n`, the results for split7 were overwritten.