In [None]:
"""
Analyze non-core predictions from the 9c-nc classifier, all within epiatlas
"""

# pylint: disable=import-error, redefined-outer-name, use-dict-literal, too-many-lines, unused-import, unused-argument, too-many-branches, duplicate-code

In [None]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from __future__ import annotations

from pathlib import Path
from typing import List

import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.metrics import confusion_matrix as sk_cm

from epiclass.core.confusion_matrix import ConfusionMatrixWriter
from epiclass.utils.notebooks.paper.paper_utilities import (
    ASSAY,
    IHECColorMap,
    MetadataHandler,
    SplitResultsHandler,
    add_second_highest_prediction,
    display_perc,
)

# import plotly.graph_objects as go
# from plotly.subplots import make_subplots

In [None]:
# Root of the paper outputs; data and figures live in sibling sub-directories.
base_dir = Path.home() / "Projects/epiclass/output/paper"
paper_dir = base_dir
base_data_dir = base_dir / "data"
base_fig_dir = base_dir / "figures"

# Fail early when the figure directory is missing, before any heavy loading.
fig_dir_found = base_fig_dir.exists()
if not fig_dir_found:
    raise FileNotFoundError(f"Directory {base_fig_dir} does not exist.")

In [None]:
# Keep the instance under its own name instead of rebinding `IHECColorMap`.
# Shadowing the imported class made this cell non-idempotent: on a second run
# the name referred to the instance, so `IHECColorMap(base_fig_dir)` would try
# to call the instance rather than construct a new colour map.
ihec_color_map = IHECColorMap(base_fig_dir)
assay_colors = ihec_color_map.assay_color_map

In [None]:
# Helper used below to read per-split results, concatenate them and add the
# "Max pred" column.
split_results_handler = SplitResultsHandler()
# Helper used below to load the metadata dataframe; initialised with the paper directory.
metadata_handler = MetadataHandler(paper_dir)

In [None]:
# Load the "v2-encode" metadata as a dataframe.
# NOTE(review): presumably `merge_assays=False` keeps the original, unmerged
# assay labels - confirm against MetadataHandler.load_metadata_df.
metadata_v2_df = metadata_handler.load_metadata_df("v2-encode", merge_assays=False)

## assay epiclass 9c-nc

### Create informative dataframe

In [None]:
# Training output that includes the ENCODE non-core samples.
noncore_training_dir = (
    base_data_dir / "training_results/dfreeze_v2/hg38_100kb_all_none_w_encode_noncore"
)
# Results of the 9c-nc assay classifier, 10-fold run with oversampling.
results_dir = noncore_training_dir / f"{ASSAY}_1l_3000n" / "9c-nc" / "10fold-oversampling"

results_dir_found = results_dir.exists()
if not results_dir_found:
    raise FileNotFoundError(f"Directory {results_dir} does not exist.")

In [None]:
# Read every split's results, then concatenate them into one dataframe.
results = split_results_handler.read_split_results(results_dir)
concat_by_task = split_results_handler.concatenate_split_results(
    {"9c-nc": results}, concat_first_level=True
)
concat_results = concat_by_task["9c-nc"]

In [None]:
# Prediction columns: every column whose name does not contain "class"
# (i.e. everything except "True class" / "Predicted class").
pred_cols = [name for name in concat_results.columns if "class" not in name]

In [None]:
# Add the max-prediction information first, then the second-highest
# prediction computed over the prediction columns.
augmented_results = add_second_highest_prediction(
    split_results_handler.add_max_pred(concat_results),
    pred_cols,
)

In [None]:
# Expose the index as an explicit "md5sum" column.
# NOTE(review): assumes the results index holds file md5sums - the same index
# is merged against the metadata "md5sum" column further down; confirm.
augmented_results["md5sum"] = augmented_results.index

### Analyze non-core pred that are "mislabels"

In [None]:
# Minimum "Max pred" score for a prediction to be counted.
min_pred = 0.6
pred_mask = augmented_results["Max pred"] >= min_pred

# Report how many samples pass the threshold, as a percentage and a fraction.
nb_pred = pred_mask.sum()
nb_total = len(augmented_results)
perc_pred = nb_pred / nb_total * 100
print(f"Nb pred (pred score >= {min_pred:.02f}): {perc_pred:.02f}% ({nb_pred}/{nb_total})")

In [None]:
# # save a confusion matrix
# df = augmented_results[pred_mask]
# cm = sk_cm(df["True class"], df["Predicted class"])
# cm_writer = ConfusionMatrixWriter(labels=pred_cols, confusion_matrix=cm)

# name = f"full-10fold-validation_prediction-confusion-matrix-threshold-{min_pred:.02f}"
# cm_writer.to_all_formats(logdir=results_dir, name=name)

In [None]:
# Number of samples per true class.
true_class_counts = augmented_results["True class"].value_counts()
display(true_class_counts)

In [None]:
# Samples predicted as "non-core" although their true class is something else.
predicted_non_core = augmented_results["Predicted class"] == "non-core"
misclassified = augmented_results["Predicted class"] != augmented_results["True class"]
nc_pred_df = augmented_results[predicted_non_core & misclassified]
print(nc_pred_df.shape)

In [None]:
# For each non-core mislabel, check whether the runner-up class is the true class.
second_pred_ok_mask = nc_pred_df["True class"] == nc_pred_df["2nd pred class"]
nb_second_ok = second_pred_ok_mask.sum()
nb_nc_mislabels = nc_pred_df.shape[0]
print(
    f"Number of non-core predictions mislabels where the second highest prediction is correct: {nb_second_ok}/{nb_nc_mislabels}"
)

In [None]:
# Everything that is not a prediction column (classes, max/2nd pred info, md5sum, ...).
non_pred_cols = [name for name in augmented_results.columns if name not in pred_cols]

In [None]:
# pylint: disable=consider-using-f-string
# Non-core mislabels whose second-highest prediction is also wrong,
# restricted to the non-prediction columns and shown with 3 decimals.
wrong_second_pred_df = nc_pred_df.loc[~second_pred_ok_mask, non_pred_cols]
with pd.option_context("display.float_format", "{:.3f}".format):
    display(wrong_second_pred_df)

#### Summary
- Nb pred (pred score >= 0.60): 99.17% (20682/20855)
- Number of non-core predictions mislabels where the second highest prediction is correct: 24/29
- Incorrect 2nd_pred + min_pred >= 0.6: 2/5 (both ctcf)

If we also require a difference > 0.3 between the 1st and 2nd prediction probabilities, then among the non-core mislabels only one CTCF -> h3k4me3 remains. It could be worth examining this specific file for mislabeling.

### Analyze non-core files predicted as other

In [None]:
# Table with the "assay" and "assay_category" columns used in the merge below.
encode_metadata_dir = base_data_dir / "metadata/encode"
non_core_categories_path = encode_metadata_dir / "non-core_encode_assay_counts_v1.tsv"

categories_file_found = non_core_categories_path.exists()
if not categories_file_found:
    raise FileNotFoundError(f"File {non_core_categories_path} does not exist.")

non_core_categories_df = pd.read_csv(non_core_categories_path, sep="\t")
print(non_core_categories_df.columns)

In [None]:
# Lower-case the metadata assay names; "Assay" is later used as the join key
# against the "assay" column of the non-core categories table.
lowercase_assays = metadata_v2_df["Assay"].str.lower()
metadata_v2_df.loc[:, "Assay"] = lowercase_assays

In [None]:
# concat_results.columns

In [None]:
# Non-core files (true class) that the classifier assigned to another class.
# Both conditions are evaluated on `concat_results`, the frame being filtered,
# so the boolean mask is guaranteed to share its index (the original mixed in
# a condition computed on `augmented_results`).
nc_as_other_mask = (concat_results["Predicted class"] != "non-core") & (
    concat_results["True class"] == "non-core"
)

# Take an explicit copy: the lower-casing below must write to an independent
# frame, not to a slice of `concat_results` (avoids SettingWithCopyWarning and
# any accidental write-through to the source frame).
nc_pred_df = concat_results[nc_as_other_mask].copy()
for col in ["True class", "Predicted class"]:
    nc_pred_df[col] = nc_pred_df[col].str.lower()

# Attach file metadata: the results index is matched to the metadata "md5sum" column.
nc_pred_df = nc_pred_df.merge(metadata_v2_df, left_index=True, right_on="md5sum")

# Attach the assay category. Left join: files whose assay is absent from the
# categories table are kept, with a missing "assay_category".
nc_pred_df = nc_pred_df.merge(
    non_core_categories_df[["assay", "assay_category"]],
    left_on="Assay",
    right_on="assay",
    how="left",
)

In [None]:
# How many non-core files were assigned to each (wrong) predicted class.
predicted_class_counts = nc_pred_df["Predicted class"].value_counts()
display(predicted_class_counts)

In [None]:
# For each predicted class, show the assays of the misclassified non-core files
# and how they are distributed across assay categories.
for predicted_class, group in nc_pred_df.groupby("Predicted class"):
    print(f"\nPredicted class: {predicted_class}")
    assay_counts = group["Assay"].value_counts()
    print(assay_counts)

    # Keep NaN categories so unmatched assays stay visible in the counts.
    category_counts = group["assay_category"].value_counts(dropna=False)
    print("\nAssay categories:")
    print(category_counts)

    category_fractions = category_counts / category_counts.sum()
    display_perc(category_fractions.sort_values(ascending=False))

In [None]:
# List, in sorted order, the assay of every row whose category is "not_looked".
not_looked_mask = nc_pred_df["assay_category"] == "not_looked"
for assay_name in sorted(nc_pred_df.loc[not_looked_mask, "assay"]):
    print(assay_name)