In [1]:
"""Notebook to work on proper way to merge a lot of already augmented output files."""
# pylint: disable=line-too-long, redefined-outer-name, import-error, unused-import, pointless-statement, unreachable,unnecessary-lambda

In [2]:
from __future__ import annotations

import collections
import functools
import re
from pathlib import Path
from typing import Dict, List

import numpy as np
import pandas as pd
from fuzzywuzzy import process
from IPython.display import display

from epi_ml.utils.ssh_utils import createSCPClient, createSSHClient, run_commands_via_ssh

## Collect relevant files

In [3]:
gen_base_dir = (Path.home() / "mounts/narval-mount/logs-dfreeze-2.1").resolve()

In [4]:
valid_pred_files = []

In [5]:
narval_base_dir = "~/logs-dfreeze-2.1"

In [6]:
# Both searches run from inside the remote log directory, so `find` prints paths
# relative to it (starting with "./").
cmd1 = f"cd {narval_base_dir} && find . -mindepth 3 -maxdepth 5 -type f -name full-10fold-validation_prediction_augmented-all.csv"
# The wildcard pattern is quoted so the remote shell hands it to `find` untouched
# instead of expanding it against files of the current directory first.
cmd2 = f"cd {narval_base_dir} && find . -mindepth 5 -maxdepth 6 -type f -name '*test*augmented*.csv'"
cmd_results = run_commands_via_ssh(
    cmds=[cmd1, cmd2],
    username="rabyj",
    hostname="narval.computecanada.ca",
    port=22,
)

In [7]:
# One list of output lines per executed command; only the first two are kept, concatenated in order.
find_outputs = [cmd_result.splitlines() for cmd_result in cmd_results]
valid_pred_files = [*find_outputs[0], *find_outputs[1]]

In [8]:
for file in valid_pred_files:
    print(file)

In [9]:
OUTPUT_PATH = Path.home() / "downloads" / "merged_pred_results_blblbllblb.csv"

In [10]:
# Any path whose string form contains one of these fragments is discarded.
invalid_dirs = [
    "noFC",
    "raw",
    "pval",
    "l1",
    "harmonized_donor_sex_1l_3000n/no-mixed",
    "groups_second_level_name_1l_3000n/w-mix",
    "w-unknown",
    "10fold-2",
    "10fold-oversampling2",
    "10fold-oversample2",
    "random_1l_3000n/10fold-11c",
]
valid_pred_files = [
    candidate
    for candidate in map(Path, valid_pred_files)
    if not any(fragment in str(candidate) for fragment in invalid_dirs)
]

In [20]:
# Map each grandparent folder of a prediction file to the names of the
# result folders (direct parents of the files) found under it.
categories = collections.defaultdict(list)
for file in valid_pred_files:
    result_dir = file.parent
    categories[result_dir.parent].append(result_dir.name)

In [21]:
categories

In [12]:
# Folders for which at least one oversampling result folder was found.
oversampling_names = {"10fold-oversampling", "10fold-oversample"}
oversampling_dirs = [
    folder
    for folder, result_list in categories.items()
    if not oversampling_names.isdisjoint(result_list)
]

In [19]:
oversampling_dirs

In [39]:
# Remove non-oversampling results when an oversampling result also exists.
# Iterate over a copy because entries are removed from valid_pred_files inside the loop.
for file in list(valid_pred_files):
    # sanity check: the exact-name test and the "stem ends with 10fold" test must agree
    if (file.parent.name == "10fold") != file.parent.stem.endswith("10fold"):
        raise ValueError(f"wat: {str(file)}")

    # Drop the plain "10fold" result when the folder containing it is listed in oversampling_dirs
    if file.parent.parent in oversampling_dirs and file.parent.name == "10fold":
        print(f"Removing {file}")
        valid_pred_files.remove(file)

In [40]:
print(len(valid_pred_files))
for file in valid_pred_files:
    print(file)

## Order paths in desired order

In [41]:
def parse_instructions(instructions: str) -> Dict[str, int]:
    """
    Parse ordering instructions into a mapping of key -> order.

    Only lines starting with "#<number>" are considered. The key is the first run of
    letters/underscores that directly follows a space or "*" after the number, so
    digits and trailing text are not part of the key (e.g. "#12 random_16c" gives the
    key "random_"). Lines with an order but no such key are skipped. If a key appears
    several times, the last order wins.

    Args:
        instructions (str): Multi-line text with one "#<order> <key> ..." entry per line.

    Returns:
        Dict[str, int]: Dictionary containing the keys and their orders.
    """
    order_dict: Dict[str, int] = {}
    for line in instructions.strip().split("\n"):
        order_match = re.match(r"#(\d+)", line)
        if order_match is None:
            continue

        key_match = re.search(r"[* ]([a-zA-Z_]+)", line[order_match.end() :])
        if key_match is None:
            # An order without a usable key (e.g. "#5") used to raise AttributeError.
            continue

        order_dict[key_match.group(1)] = int(order_match.group(1))
    return order_dict


def fuzzy_sort_paths(paths: List[Path], order_dict: Dict[str, int]) -> List[Path]:
    """
    Sort paths by the order of the key that best fuzzy-matches their parent folders.

    Each path is described by the names of its (up to) three closest parent folders,
    joined with "/" from outermost to innermost. That description is fuzzy-matched
    against the keys of `order_dict`, and the order of the best match is the sort key.

    Args:
        paths (List[Path]): The list of paths to sort.
        order_dict (Dict[str, int]): The dictionary containing keys and their orders.

    Returns:
        List[Path]: New list of paths sorted according to their best fuzzy-matched
            keys. The sort is stable: paths with the same order keep their input order.
    """
    if not order_dict:
        # Nothing to match against: every path would get the same rank, so the input
        # order is kept (previously this case crashed while unpacking the match).
        return list(paths)

    candidate_keys = list(order_dict)

    def get_order(path: Path) -> int:
        parent_names = [parent.name for parent in path.parents]
        key = "/".join(parent_names[0:3][::-1])
        match = process.extractOne(key, candidate_keys)
        if match is None:
            return 9999
        return order_dict[match[0]]

    return sorted(paths, key=get_order)

In [42]:
instructions = """
#1 assay_epiclass
#2 assay_epiclass_encode
#9 harmonized_biomaterial_type
#3 harmonized_donor_sex (trinary)
#6 harmonized_sample_disease_high
#6 harmonized_sample_cancer_high
#10 paired_end
#5 groups_second_level_name, no “mixed.mixed”
#4 harmonized_sample_ontology_intermediate
#12 random_16c
#8 project
#11 track_type
#7 harmonized_donor_life_stage
#13 complete_no_valid_oversample/predictions
"""

In [43]:
order_dict = parse_instructions(instructions)
sorted_paths = fuzzy_sort_paths(valid_pred_files, order_dict)

In [44]:
# for elem in sorted(order_dict.items(), key=lambda x: x[1]):
#     print(elem)

# for i, path in enumerate(sorted_paths):
#     print(i, str(path).split("/")[-4:-1])

## Merge files

In [45]:
def create_filename(path: Path) -> str:
    """Create a file name (without extension) from the parent folder names of `path`.

    The three closest parent folders are used (four when "predictions" appears
    anywhere in the path), ordered from outermost to innermost, then:
    - every folder name containing "encode" is replaced by a leading "encode" entry;
    - every folder name containing "hg38_100kb_all_none" is dropped.
    The remaining names are joined with "_".
    """
    depth = 4 if "predictions" in str(path) else 3
    folder_names = [parent.name for parent in list(path.parents)[0:depth][::-1]]

    # Build a new list instead of editing the list being looped over: removing an item
    # during iteration made the loop skip the element that followed it.
    important_names = []
    for folder_name in folder_names:
        if "encode" in folder_name:
            important_names.insert(0, "encode")
        elif "hg38_100kb_all_none" in folder_name:
            continue
        else:
            important_names.append(folder_name)

    return "_".join(important_names)

In [46]:
# for path in sorted_paths:
#     print(create_filename(path))

In [50]:
scp_client = None

results_base_dir = (
    Path.home()
    / "Projects/epilap/output/logs/epiatlas-dfreeze-v2.1/merged_results/input/"
)
new_sorted_paths = []
for input_file in sorted_paths:
    input_file = Path(input_file)
    new_filename = f"{create_filename(input_file)}.csv"
    new_path = results_base_dir / new_filename
    if not new_path.is_file():
        # Open the connection lazily, only once a file actually needs downloading,
        # and then fetch that file. Previously the first missing file only triggered
        # the connection (through the AttributeError branch) and was never copied.
        if scp_client is None:
            scp_client = createSCPClient(
                createSSHClient("narval.computecanada.ca", 22, "rabyj")
            )
        scp_client.get(
            f"{narval_base_dir}/{input_file}", f"{results_base_dir}/{new_filename}"
        )
    new_sorted_paths.append(new_path)

In [51]:
for input_file in new_sorted_paths:
    print(input_file)

In [156]:
# Load every downloaded prediction file, keyed by its file name up to the first ".".
dfs = {}
for input_file in new_sorted_paths:
    df_name = input_file.name.split(".")[0]
    # Detect name clashes before paying for the CSV read.
    if df_name in dfs:
        raise ValueError(
            f"Conflicting names from {input_file}: {df_name} file already exists."
        )

    df = pd.read_csv(input_file, index_col="md5sum", low_memory=False)
    df.name = df_name
    # NOTE: a `df.dropna(axis=1, how="all")` call used to sit here, but its result was
    # discarded, so it had no effect. All-empty columns are dropped in a later cell,
    # once placeholder values have been converted to NaN.

    dfs[df_name] = df

In [157]:
# Remove the per-class prediction probabilities: every column located strictly
# between `col1` and `col2` is dropped, then two redundant identifier columns.
col1 = "1rst/2nd prob ratio"
col2 = "files/epiRR"
for cat, df in dfs.items():
    try:
        first_dropped = df.columns.get_loc(col1) + 1
        end_dropped = df.columns.get_loc(col2)
        detail_columns = df.columns[first_dropped:end_dropped]
        df = df.drop(detail_columns, axis=1)
        df = df.drop(columns=["EpiRR", "md5sum.1"])
    except KeyError:
        # A missing marker or identifier column is taken to mean the cut was already applied.
        print("df seems already reduced")

    dfs[cat] = df

In [158]:
# Drop useless columns: placeholder values count as missing, then columns
# without any remaining value are removed.
placeholders = ["--empty--", "", "NA", None]
for cat, df in dfs.items():
    df = df.replace(to_replace=placeholders, value=np.nan).dropna(axis=1, how="all")
    dfs[cat] = df

In [159]:
# Make all different columns have unique relevant names
# https://stackoverflow.com/questions/38101009/changing-multiple-column-names-but-not-all-of-them-pandas-python
nb_diff_columns = 13
# The last `nb_diff_columns` columns of the first dataframe are used as the reference names.
old_names = list(dfs.values())[0].columns[-nb_diff_columns:]
for cat, df in dfs.items():
    # Build the mapping one name at a time so each old name is always paired with its
    # own new name. Names ending in "n" are left unchanged, as before; previously the
    # filter was applied to the new names only, so zipping them with the unfiltered
    # old names shifted every pair following the first skipped name.
    renaming = {
        name: f"{name} {cat}" for name in old_names if not name.endswith("n")
    }
    df = df.rename(columns=renaming)
    df.name = cat
    dfs[cat] = df
    # print(df.columns)

In [180]:
def merge_dataframes(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """
    Merge two DataFrames along the index, aligning common columns and appending
    non-common columns.

    Rows and columns of the result are the union of both inputs: df1's labels come
    first, followed by the labels only present in df2. A column present in both
    frames stays a single column (no "_x"/"_y" duplicates). When an index label
    exists in both frames, df1's values are kept and df2 only fills the cells that
    are missing in df1. Index labels are assumed to be unique within each frame.

    Parameters:
    df1 (pd.DataFrame): The first DataFrame (takes priority on overlapping cells)
    df2 (pd.DataFrame): The second DataFrame

    Returns:
    pd.DataFrame: Merged DataFrame with the index name preserved.

    Raises:
    ValueError: If the two index names differ.
    """
    print(f"Entering shapes: {df1.shape}, {df2.shape}")
    if df1.index.name != df2.index.name:
        raise ValueError(
            f"Index names are different: {df1.index.name} != {df2.index.name}"
        )

    # An index-on-index pd.merge would split every shared column into "_x"/"_y"
    # copies; combine_first aligns on both axes and keeps one column per name.
    result = df1.combine_first(df2)

    # combine_first may reorder the labels, so restore "df1 first, then new from df2".
    row_order = df1.index.union(df2.index, sort=False)
    column_order = df1.columns.union(df2.columns, sort=False)
    result = result.reindex(index=row_order, columns=column_order)

    result.index.name = df1.index.name
    print(f"Output shape: {result.shape}")
    return result

Merge the ENCODE and EpiATLAS dataframes first: the ENCODE metadata is not redundant with the EpiATLAS metadata.

In [181]:
# Keys of the two dataframes to combine, and the key under which the result is stored.
df_key1, df_key2, df_key3 = (
    "assay_epiclass_1l_3000n_11c_10fold-oversampling",
    "encode_assay_epiclass_1l_3000n_10fold-oversampling",
    "partial_merge",
)

partial_merge = merge_dataframes(dfs[df_key1], dfs[df_key2])
dfs[df_key3] = partial_merge

In [184]:
for val in partial_merge.columns:
    print(val)

In [182]:
for name in [df_key1, df_key2, df_key3]:
    df = dfs[name]
    display(name, df.shape)
    print(df.index.name)
    display(df["assay_epiclass"].value_counts(dropna=False))

In [163]:
raise ValueError("stop here")

In [None]:
# Discard the two inputs of the partial merge; keys that are already gone are ignored.
for df_name in (df_key1, df_key2):
    dfs.pop(df_name, None)

In [None]:
# Report every dataframe that has at least one missing assay_epiclass value.
for df_name, df in dfs.items():
    has_missing_assay = df["assay_epiclass"].isnull().any()
    if has_missing_assay:
        print(f"assay_epiclass is null in {df_name}")

Merge all dataframes

starting with biggest dataframes first

In [None]:
# Largest dataframes first, then fold them into one with successive outer merges.
# Columns coming from the right-hand frame that clash with existing ones get "_delete".
df_list = sorted(dfs.values(), key=len, reverse=True)
outer_merge_on_md5sum = functools.partial(
    pd.merge, on="md5sum", how="outer", suffixes=("", "_delete")
)
df_final = functools.reduce(outer_merge_on_md5sum, df_list)

In [None]:
# for column in df_final.columns:
#     print(column)

In [None]:
# Remove duplicate metadata columns (those marked with the _delete suffix).
# The negative lookahead rejects any column name that contains "_delete" anywhere,
# not only at its end.
df_final = df_final.filter(regex=r"^(?:(?!_delete).)+$")

In [None]:
# Re-arrange columns: metadata first, per-task result columns last.
all_columns = df_final.columns.tolist()

# Separate metadata and result columns: a result column is an original result name
# followed by a space and one more space-free token.
result_columns = []
meta_columns = []
for col in all_columns:
    base_name = col.rsplit(" ", 1)[0]
    if base_name in old_names:
        result_columns.append(col)
    else:
        meta_columns.append(col)

new_order = meta_columns + result_columns
df_final = df_final[new_order]

In [None]:
df_final.to_csv(OUTPUT_PATH)

In [None]:
raise ValueError("STOP HERE")

### Add ChrY/X coverage

In [None]:
df_final = pd.read_csv(OUTPUT_PATH, index_col="md5sum", low_memory=False)
df_chrY = pd.read_csv(
    Path.home() / "downloads" / "temp" / "coverage_combined.csv", index_col="filename"
)

In [None]:
# print(df_final.shape, df_chrY.shape)

In [None]:
new_final = df_final.join(df_chrY, how="left")

# same number as og samples, but more columns
expected_shape = (df_final.shape[0], df_final.shape[1] + df_chrY.shape[1])
assert new_final.shape == expected_shape

new_final.to_csv(OUTPUT_PATH.with_name("merged_pred_results_chrY.csv"))