In [None]:
"""Notebook to work on proper way to merge a lot of already augmented output files."""
# pylint: disable=line-too-long, redefined-outer-name, import-error, unused-import

In [None]:
import collections
import functools
import re
import subprocess
from pathlib import Path
from typing import Dict, List

import numpy as np
import pandas as pd
from IPython.display import display

## Collect relevant files

In [None]:
# Base directory that is searched for prediction files.
gen_base_dir = (Path.home() / "mounts/narval-mount/logs-dfreeze-2.1").resolve()

# `find` is limited to depths 3 through 5 below the base directory and only reports
# regular files carrying the augmented 10-fold validation prediction filename.
find_command = (
    "find",
    f"{gen_base_dir}",
    "-mindepth",
    "3",
    "-maxdepth",
    "5",
    "-type",
    "f",
    "-name",
    "full-10fold-validation_prediction_augmented-all.csv",
)
find_output = subprocess.check_output(find_command)

# One path (as a string) per line of output.
valid_pred_files = find_output.decode("utf-8").splitlines()

In [None]:
# Destination of the merged table; used by the final `df_final.to_csv(...)` call.
OUTPUT_PATH = Path.home() / "downloads" / "merged_pred_results.csv"

In [None]:
# Substrings that mark runs to leave out: a file is discarded as soon as its full
# path contains any one of them.
invalid_dirs = [
    "noFC",
    "raw",
    "pval",
    "l1",
    "harmonized_donor_sex_1l_3000n/no-mixed",
    "groups_second_level_name_1l_3000n/w-mix",
    "w-unknown",
    "10fold-2",
    "10fold-oversampling2",
    "10fold-oversample2",
    "random_1l_3000n/10fold-11c",
]

# Convert the raw strings to Path objects and filter in a single pass.
valid_pred_files = [
    pred_path
    for pred_path in map(Path, valid_pred_files)
    if not any(name in str(pred_path) for name in invalid_dirs)
]

In [None]:
# Map each run directory (two levels above a prediction file) to the names of the
# result sub-directories that directly contain a prediction file.
categories = collections.defaultdict(list)
for pred_path in valid_pred_files:
    result_dir = pred_path.parent
    categories[result_dir.parent].append(result_dir.name)

In [None]:
# Run directories that own at least one oversampling result sub-directory.
oversampling_names = ("10fold-oversampling", "10fold-oversample")
oversampling_dirs = [
    run_dir
    for run_dir, result_names in categories.items()
    if any(result_name in oversampling_names for result_name in result_names)
]

In [None]:
# Inside a run directory that has an oversampling result, discard the files whose
# result directory name ends with "10fold".
# A new list is built instead of calling `remove()` while looping over the same list:
# removing during iteration shifts the remaining items, so the element right after
# each removed one is never examined and could wrongly survive the filter.
valid_pred_files = [
    file
    for file in valid_pred_files
    if not (
        file.parent.parent in oversampling_dirs
        and file.parent.stem.endswith("10fold")
    )
]

In [None]:
# for file in valid_pred_files:
#     print(file)

## Order paths in desired order

In [None]:
from fuzzywuzzy import process


def parse_instructions(instructions: str) -> Dict[str, int]:
    """
    Parse the instructions from A and return a dictionary with keys and their orders.

    Args:
        instructions (str): The instructions from A.

    Returns:
        Dict[str, int]: Dictionary containing the keys and their orders.
    """
    order_dict = {}
    for line in instructions.strip().split("\n"):
        if line.startswith("#"):
            match = re.match(r"#(\d+)", line)
            if match:
                order = int(match.group(1))
                key = re.search(r"[* ]([a-zA-Z_]+)", line[match.end() :]).group(1)
                order_dict[key] = order
    return order_dict


def fuzzy_sort_paths(paths: List[Path], order_dict: Dict[str, int]) -> List[Path]:
    """
    Sort paths by the order of the key that best fuzzy-matches their parent folders.

    Each path is summarised by the names of its three closest parent directories,
    joined outermost-first with "/". That string is fuzzy-matched against the keys of
    `order_dict` and the path is ranked with the order of the best-matching key.
    `sorted` is stable, so paths sharing an order keep their input order.

    Args:
        paths (List[Path]): The list of paths to sort.
        order_dict (Dict[str, int]): The dictionary containing keys and their orders.
            When it is empty, every path gets the same fallback order and the input
            order is returned unchanged.

    Returns:
        List[Path]: New list of the same Path objects, sorted according to their
            best fuzzy-matched keys.
    """
    fallback_order = 9999
    candidate_keys = list(order_dict)

    def get_order(path: Path) -> int:
        if not candidate_keys:
            # Nothing to match against: `extractOne` would return None and the tuple
            # unpacking below would fail.
            return fallback_order
        parent_names = [parent.name for parent in path.parents]
        key = "/".join(parent_names[0:3][::-1])
        best_match, _ = process.extractOne(key, candidate_keys)
        return order_dict.get(best_match, fallback_order)

    return sorted(paths, key=get_order)

In [None]:
# Desired ordering of the categories, one "#<order> <name>" entry per line; this text
# is passed to `parse_instructions`. That parser keeps only the first run of letters
# and underscores after the order, so trailing notes such as "(trinary)" are not part
# of the resulting key.
instructions = """
#1 assay_epiclass
#2 assay_epiclass_encode
#9 harmonized_biomaterial_type
#3 harmonized_donor_sex (trinary)
#6 harmonized_sample_disease_high
#6 harmonized_sample_cancer_high
#10 paired_end
#5 groups_second_level_name, no “mixed.mixed”
#4 harmonized_sample_ontology_intermediate
#12 random_16c
#8 project
#11 track_type
#7 harmonized_donor_life_stage
"""

In [None]:
# Turn the instruction text into a {key: order} mapping, then sort the prediction
# files with it.
order_dict = parse_instructions(instructions)
sorted_paths = fuzzy_sort_paths(valid_pred_files, order_dict)

In [None]:
# for elem in sorted(order_dict.items(), key=lambda x: x[1]):
#     print(elem)

# for i, path in enumerate(sorted_paths):
#     print(i, str(path).split("/")[-4:-1])

## Merge files

In [None]:
def print_cols(dfs: Dict):
    """Print the shape of the first df in the dict, then each of its column names."""
    first_key = list(dfs)[0]
    first_df = dfs[first_key]
    print(first_df.shape)
    for column_name in first_df.columns:
        print(column_name)

In [None]:
def create_filename(path: Path) -> str:
    """Create filename from important path information.

    The names of the three parent directories closest to `path` are taken
    outermost-first, every name containing "hg38_100kb_all_none" is left out, and the
    remaining names are joined with "_".

    Args:
        path (Path): Path of a file.

    Returns:
        str: The joined name (empty string when no directory name is kept).
    """
    closest_parents = list(path.parents)[0:3][::-1]
    # Filter while building the list: removing items from a list that is being
    # iterated skips the element following each removal.
    important_names = [
        parent.name
        for parent in closest_parents
        if "hg38_100kb_all_none" not in parent.name
    ]
    return "_".join(important_names)

In [None]:
# Load every prediction file into a DataFrame indexed by "md5sum", stored under a
# short name derived from the file's parent directories.
dfs = {}
for input_file in sorted_paths:
    input_file = Path(input_file)

    df_name = create_filename(input_file)
    # Detect a name clash before reading, so a conflict does not cost a full CSV parse.
    if df_name in dfs:
        raise ValueError(
            f"Conflicting names from {input_file}: {df_name} file already exists."
        )

    df = pd.read_csv(input_file, index_col="md5sum", low_memory=False)
    df.name = df_name

    # All-NaN columns are not dropped here: a bare `df.dropna(axis=1, how="all")`
    # returns a new frame and changes nothing unless its result is kept. The drop is
    # done in the later "Drop useless columns" cell, after placeholder values have
    # been converted to NaN.
    dfs[df_name] = df

In [None]:
# Remove detail of prediction probabilities
# For every DataFrame, the columns located strictly between `col1` and `col2` are
# dropped, along with the "EpiRR" and "md5sum.1" columns.
col1 = "1rst/2nd prob ratio"
col2 = "files/epiRR"
for cat, df in dfs.items():
    column_names = df.columns
    try:
        cut_pos_1 = column_names.get_loc(col1)
        cut_pos_2 = column_names.get_loc(col2)
        # Positional slice: everything after `col1` up to, but excluding, `col2`.
        df = df.drop(df.columns[cut_pos_1 + 1 : cut_pos_2], axis=1)
        df = df.drop(columns=["EpiRR", "md5sum.1"])
    except KeyError:
        # NOTE(review): besides a missing `col1`/`col2`, this also catches a KeyError
        # raised by the second drop when "EpiRR" or "md5sum.1" is absent; in that case
        # the first drop has already been assigned to `df` and is kept, yet the same
        # message is printed. Confirm this is intended.
        print("df seems already reduced")

    # Only an existing key is reassigned, so the dict size does not change while
    # iterating over `dfs.items()`.
    dfs[cat] = df

In [None]:
# Drop useless columns
# Placeholder values are converted to NaN first, so that a column holding nothing
# else counts as entirely empty and gets removed.
empty_markers = ["--empty--", "", "NA", None]
for cat in list(dfs):
    dfs[cat] = (
        dfs[cat]
        .replace(to_replace=empty_markers, value=np.nan)
        .dropna(axis=1, how="all")
    )

In [None]:
# Make all different columns have unique relevant names
# https://stackoverflow.com/questions/38101009/changing-multiple-column-names-but-not-all-of-them-pandas-python
# The last `nb_diff_columns` columns of the first DataFrame are taken as the columns
# to rename; each one gets the DataFrame's category appended after a space, except
# names ending with "n", which are left untouched.
nb_diff_columns = 13
old_names = list(dfs.values())[0].columns[-nb_diff_columns:]
for cat, df in dfs.items():
    # Build the old -> new mapping one name at a time. Filtering the new names and
    # then zipping them with the unfiltered `old_names` would shift every pair that
    # follows a skipped name, renaming the wrong columns.
    renamed_columns = {
        name: f"{name} {cat}" for name in old_names if name[-1] != "n"
    }
    df = df.rename(columns=renamed_columns)
    df.name = cat
    dfs[cat] = df
    # print(df.columns)

In [None]:
# Merge all dataframes
# Successive outer joins on "md5sum". Whenever both sides share a column name, the
# left column keeps its name and the right one receives the "_delete" suffix.
df_list = list(dfs.values())
merge_on_md5sum = functools.partial(
    pd.merge, on="md5sum", how="outer", suffixes=("", "_delete")
)
df_final = functools.reduce(merge_on_md5sum, df_list)

In [None]:
# Remove duplicate metadata columns (those tagged with "_delete" by the merge).
# The regex keeps only the columns whose name does not contain "_delete" anywhere.
df_final = df_final.filter(regex=r"^(?:(?!_delete).)+$")

In [None]:
# Re-arrange columns
# Metadata columns are placed first and result columns last; the relative order
# inside each group is unchanged.
all_columns = df_final.columns.tolist()

# Separate metadata and result columns
# A column counts as a result column when its name, cut at its last space, is one of
# `old_names`.
meta_columns, result_columns = [], []
for col in all_columns:
    is_result = col.rsplit(" ", 1)[0] in old_names
    (result_columns if is_result else meta_columns).append(col)

new_order = meta_columns + result_columns
df_final = df_final[new_order]

In [None]:
# Write the merged table to OUTPUT_PATH; the DataFrame index is written too, since
# `to_csv` is called with its default arguments.
df_final.to_csv(OUTPUT_PATH)