In [None]:
"""Some results from two different groups_second_level_name runs were mixed up, so I need to unmix them."""
# pylint: disable=import-error,redefined-outer-name,consider-using-f-string

In [None]:
from __future__ import annotations

import os
import re
import shlex
import shutil
import urllib.request
from datetime import datetime
from pathlib import Path
from typing import Dict, List

from comet_ml.api import API

In [None]:
def find_smallest_md5_files(
    root_dir: str | Path, n_splits: int = 10
) -> List[Dict[str, int | str | Path]]:
    """
    Traverse the directory structure and identify the smallest .md5 files for each split and set pair.

    Only files named `split<N>_<training|validation>_*.md5` are considered, where
    `<N>` is the full (possibly multi-digit) split number. Files whose split
    number is outside `range(n_splits)` are ignored instead of being filed
    under the wrong split.

    Args:
        root_dir (str | Path): The root directory to start the search.
        n_splits (int): Number of splits to report (indices 0..n_splits-1). Defaults to 10.

    Returns:
        List[Dict[str, int|str|Path]]: One dictionary per split, with keys
            "split" (the split index), "training" and "validation". The last two
            hold a `(size_in_bytes, path)` tuple for the smallest matching file,
            or `(float("inf"), None)` when no file was found.
    """
    # The set type must be the second "_"-separated field, right after "split<N>".
    name_pattern = re.compile(r"split([0-9]+)_(training|validation)_")
    smallest_files = [
        {"split": i, "training": (float("inf"), None), "validation": (float("inf"), None)}
        for i in range(n_splits)
    ]
    for file_path in Path(root_dir).rglob("*.md5"):
        match = name_pattern.match(file_path.name)
        if match is None:
            continue
        split_index = int(match.group(1))
        if split_index >= n_splits:
            continue
        set_type = match.group(2)
        file_size = file_path.stat().st_size
        # Strict comparison: on equal sizes the first file encountered is kept.
        if file_size < smallest_files[split_index][set_type][0]:
            smallest_files[split_index][set_type] = (file_size, file_path)
    return smallest_files

In [None]:
def find_recent_md5_files(
    root_dir: str | Path, n_splits: int = 10
) -> List[Dict[str, int | str | Path]]:
    """
    Traverse the directory structure and identify the most recent .md5 files for each split and set pair.

    Only files named `split<N>_<training|validation>_*.md5` are considered, where
    `<N>` is the full (possibly multi-digit) split number. Files whose split
    number is outside `range(n_splits)` are ignored instead of being filed
    under the wrong split.

    Args:
        root_dir (str | Path): The root directory to start the search.
        n_splits (int): Number of splits to report (indices 0..n_splits-1). Defaults to 10.

    Returns:
        List[Dict[str, int|str|Path]]: One dictionary per split, with keys
            "split" (the split index), "training" and "validation". The last two
            hold a `(mtime_in_whole_seconds, path)` tuple for the most recently
            modified matching file, or `(0, None)` when no file was found.
    """
    # The set type must be the second "_"-separated field, right after "split<N>".
    name_pattern = re.compile(r"split([0-9]+)_(training|validation)_")
    recent_files = [
        {"split": i, "training": (0, None), "validation": (0, None)}
        for i in range(n_splits)
    ]
    for file_path in Path(root_dir).rglob("*.md5"):
        match = name_pattern.match(file_path.name)
        if match is None:
            continue
        split_index = int(match.group(1))
        if split_index >= n_splits:
            continue
        set_type = match.group(2)
        # Modification time truncated to whole seconds.
        file_time = int(file_path.stat().st_mtime)
        # Strict comparison: on equal timestamps the first file encountered is kept.
        if file_time > recent_files[split_index][set_type][0]:
            recent_files[split_index][set_type] = (file_time, file_path)
    return recent_files

In [None]:
# Source: the 10-fold log directory of the run whose results got mixed up.
root_directory = Path.home().joinpath(
    "mounts/narval-mount/project-rabyj/epilap/output/logs/epiatlas-dfreeze-v2.1/hg38_100kb_all_none/assay_epiclass_1l_3000n/10fold"
)
# Destination: same layout, three levels up, under the "w_encode_noncore" variant.
destination_root_directory = root_directory.parents[2].joinpath(
    "hg38_100kb_all_none_w_encode_noncore",
    "assay_epiclass_1l_3000n",
    "10fold",
)
print(destination_root_directory, destination_root_directory.exists())

In [None]:
# smallest_md5_files = find_smallest_md5_files(root_directory)
# print(smallest_md5_files)

In [None]:
# For each split and set type, pick the most recently modified .md5 file under root_directory.
recent_md5_files = find_recent_md5_files(root_directory)
# print(recent_md5_files)

In [None]:
# # print all paths
# for file_dict in recent_md5_files:
#     print(file_dict["training"][1])  # type: ignore
#     print(file_dict["validation"][1])  # type: ignore

In [None]:
def transfer_files(
    files: List[Dict[str, int | str | Path]],
    destination_root: str | Path,
    delete_original: bool = False,
) -> None:
    """
    Copy the selected .md5 files into `<destination_root>/split<N>/`, keeping their file names.

    Args:
        files (List[Dict[str, int|str|Path]]): One dictionary per split, with a
            "split" key (the split index) and optional "training" / "validation"
            keys, each holding a `(value, path)` tuple. Entries that are missing
            or whose path is empty/None are skipped.
        destination_root (str | Path): The root directory where the files will be transferred to.
        delete_original (bool): If True, remove each source file once it has been copied.
    """
    destination_root = Path(destination_root)
    for file_info in files:
        split_dir = destination_root / "split{}".format(file_info["split"])
        for set_type in ("training", "validation"):
            entry = file_info.get(set_type)
            if not entry:
                continue
            _, file_path = entry  # type: ignore
            if not file_path:  # No file was found for this split/set pair
                continue
            source_path = Path(file_path)
            destination_path = split_dir / source_path.name
            destination_path.parent.mkdir(parents=True, exist_ok=True)

            # Copy first (with metadata) and only delete after the copy succeeded.
            shutil.copy2(source_path, destination_path)
            if delete_original:
                source_path.unlink()

In [None]:
def transer_results(root: Path, new_root: Path, delete_original: bool = False) -> None:
    """Transfer png, csv, tsv and list files to a new directory structure, preserving the original structure.

    Args:
        root (Path): Directory that is searched recursively for result files.
        new_root (Path): Directory receiving the files, each at the same path
            relative to `new_root` as it had relative to `root`.
        delete_original (bool): If True, remove each source file once it has been copied.

    The list of files is collected before anything is copied, and files that
    already live under `new_root` are skipped. This keeps the function safe when
    `new_root` is located inside `root` (otherwise freshly copied files could be
    found again by the directory walk and copied or moved one level deeper).
    """
    root = Path(root)
    new_root = Path(new_root)
    wanted_suffixes = {".png", ".csv", ".tsv", ".list"}

    resolved_root = root.resolve()
    resolved_new_root = new_root.resolve()
    destination_is_nested = (
        resolved_root == resolved_new_root or resolved_root in resolved_new_root.parents
    )

    # Snapshot first: never walk a tree that is being modified.
    # The suffix test comes first because it needs no filesystem access.
    candidates = [
        path
        for path in root.rglob("*")
        if path.suffix in wanted_suffixes and path.is_file()
    ]

    for file_path in candidates:
        if destination_is_nested:
            parent = file_path.parent.resolve()
            if parent == resolved_new_root or resolved_new_root in parent.parents:
                continue  # already inside the destination tree
        destination_path = new_root / file_path.relative_to(root)
        destination_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(file_path, destination_path)
        if delete_original:
            file_path.unlink()

In [None]:
# transfer_files(recent_md5_files, destination_root_directory, delete_original=True)
# NOTE: destructive. With delete_original=True each png/csv/tsv/list file found
# under root_directory is removed after it has been copied to the destination.
transer_results(root_directory, destination_root_directory, delete_original=True)

In [None]:
# Substrings used to select experiment directories; passed to copy_files_with_strings.
# Left empty here, in which case nothing is selected.
experiments_to_move = []

In [None]:
def copy_files_with_strings(
    root_dir: str | Path, destination_root: str | Path, strings_list: list
):
    """
    Print the shell commands that move the selected directories to a mirrored directory structure.

    A directory is selected when its name is exactly 32 characters long and its
    full path contains at least one of the given strings. Nothing is moved by
    this function: it only creates the destination parent directories and
    prints one `\\mv <source> <destination>` command per selected directory.
    Paths are shell-quoted so the printed commands stay valid when a path
    contains spaces or other special characters.

    Args:
        root_dir (str | Path): The root directory to start the search.
        destination_root (str | Path): The root directory the printed commands move the directories to.
        strings_list (list): List of strings to look for in the directory paths.
    """
    root_path = Path(root_dir)
    destination_root = Path(destination_root)
    for dir_path in root_path.rglob("*"):
        # Cheap string checks first; is_dir() needs a filesystem call.
        if len(dir_path.name) != 32:
            continue
        path_text = str(dir_path)
        if not any(s in path_text for s in strings_list):
            continue
        if not dir_path.is_dir():
            continue

        destination_path = destination_root / dir_path.relative_to(root_path)
        destination_path.parent.mkdir(parents=True, exist_ok=True)

        # Leading backslash bypasses any shell alias defined for `mv`.
        print(
            r"\mv {} {}".format(
                shlex.quote(path_text), shlex.quote(str(destination_path))
            )
        )

In [None]:
# The printed commands use paths under the local mount: change the base of the
# paths to the non-mounted one before running them and it will all be good.
copy_files_with_strings(root_directory, destination_root_directory, experiments_to_move)

## Dealing with the cometML API to retrieve saved result files.

In [None]:
# Comet ML API client used by the cells below.
# NOTE(review): presumably picks up the API key from the local Comet configuration — confirm.
api = API()

In [None]:
# Exploration only: show the help of the first experiment object of the
# project, then stop, to see which methods are available.
for experiment in api.get("rabyj/epilap"):
    help(experiment)
    break

In the future, use "SLURM_JOB_ID" to select experiments.

In [None]:
# Keep the keys of the "assay_epiclass" experiments that started strictly
# inside the date window and whose name contains the expected run prefix.
window_start = datetime.fromisoformat("2023-08-20")
window_end = datetime.fromisoformat("2023-08-25")
wanted_name_part = "hg38_100kb_all_none-assay_epiclass_1l_3000n-10fold-split"

correct_list = []
for experiment in api.get("rabyj/epilap"):
    if "assay_epiclass" not in experiment.get_tags():
        continue
    meta = experiment.get_metadata()
    # startTimeMillis is in milliseconds; convert to a naive UTC datetime.
    start_time = datetime.utcfromtimestamp(int(meta["startTimeMillis"]) / 1000)
    if start_time <= window_start or start_time >= window_end:
        continue
    if wanted_name_part in meta["experimentName"]:
        correct_list.append(experiment.key)

In [None]:
# correct_list

In [None]:
# Print the SLURM job id recorded in the "others" summary of each selected experiment.
selected_experiments = [api.get(f"rabyj/epilap/{key}") for key in correct_list]
for experiment in selected_experiments:
    job_ids = [
        info["valueCurrent"]
        for info in experiment.get_others_summary()
        if info["name"] == "SLURM_JOB_ID"
    ]
    for job_id in job_ids:
        print(job_id)

In [None]:
# Download the csv/png/tsv assets of every selected experiment into the
# destination folder of its split (the split name is the last "-"-separated
# field of the experiment name).
wanted_extensions = (".csv", ".png", ".tsv")
for experiment in [api.get(f"rabyj/epilap/{key}") for key in correct_list]:
    exp_name = experiment.get_name()
    split_name = exp_name.split("-")[-1]
    for asset_dict in experiment.get_asset_list(asset_type="all"):
        filename = asset_dict["fileName"]
        if not filename.endswith(wanted_extensions):
            continue
        url = asset_dict["link"]
        new_path = destination_root_directory / split_name / filename
        # The split folder may not exist yet; moving into a missing folder fails.
        new_path.parent.mkdir(parents=True, exist_ok=True)
        # Download to a temporary file instead of the current working directory,
        # so the notebook folder is not polluted and a failed download never
        # leaves a partial file at the destination.
        temp_filename, _ = urllib.request.urlretrieve(url)
        print(filename, new_path)
        shutil.move(temp_filename, new_path)

In [None]:
# Exploration: print the first entry of the parameters summary of each selected
# experiment. The disabled code below was meant to skip experiments based on a
# "mapping" parameter before downloading their assets.
for experiment in [api.get(f"rabyj/epilap/{key}") for key in correct_list]:
    # print(experiment.get_metadata())
    # print(experiment.get_others_summary())
    # Only read by the disabled code below; currently has no effect.
    wanted_experiment = True
    for param_dict in experiment.get_parameters_summary():
        print(param_dict)
        break  # only the first parameter entry is shown
    #     if "mixed.mixed" in param_dict.values() and "mapping" in param_dict["name"]:
    #         wanted_experiment = False

    # if wanted_experiment:
    #     exp_name = experiment.get_name()
    #     split_name = exp_name.split("-")[-1]
    #     for asset_dict in experiment.get_asset_list(asset_type="all"):
    #         filename = asset_dict["fileName"]
    #         if (
    #             filename.endswith(".csv")
    #             or filename.endswith(".png")
    #             or filename.endswith(".tsv")
    #         ):
    #             url = asset_dict["link"]
    #             local_filename, _ = urllib.request.urlretrieve(url, filename)
    #             new_path = destination_root_directory / split_name / filename
    #             # shutil.move(local_filename, new_path)

## Debugging last epoch not showing up in dashboard

In [None]:
api = API()

# Collect the distinct values of the first "Last epoch" metric entry over all
# experiments of the project.
last_epochs = set()
correct_list = []
for experiment in api.get("rabyj/epilap"):
    last_epoch = experiment.get_metrics("Last epoch")
    if not last_epoch:
        # Nothing logged under that metric name for this experiment.
        continue
    first_entry = last_epoch[0]
    last_epochs.add(first_entry["metricValue"])