In [None]:
"""Create plots of some results."""
# pylint: disable=import-error,redefined-outer-name

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.ticker import PercentFormatter

## All classifiers performance comparison

In [None]:
# CSV export located under the user's home directory (downloads/temp).
file_path = Path.home().joinpath(
    "downloads", "temp", "all_metrics - Pivot Table 1.csv"
)

In [None]:
def create_metrics_df(file_path: Path) -> pd.DataFrame:
    """Load the metrics CSV into a dataframe ready for plotting.

    Parameters:
    file_path (Path): CSV file to read. It must contain a "classifier" column;
        the "metric", "AVERAGE of value" and "STDEV of value" columns are
        renamed when present.

    Returns:
    pd.DataFrame: The file content with empty "classifier" cells forward-filled
        and the columns renamed to "Classifier", "Metric", "Average" and "Std".
    """
    metrics_df = pd.read_csv(file_path)

    # Empty "classifier" cells inherit the last value seen above them.
    # `Series.ffill()` replaces the deprecated `fillna(method="ffill")`.
    metrics_df["classifier"] = metrics_df["classifier"].ffill()

    # Rename columns to the names used by the plotting code, and hand the
    # result back to the caller (the function is annotated as returning it).
    return metrics_df.rename(
        columns={
            "classifier": "Classifier",
            "metric": "Metric",
            "AVERAGE of value": "Average",
            "STDEV of value": "Std",
        }
    )

In [None]:
def plot_classifiers_performance(metrics_df: pd.DataFrame) -> None:
    """Plot the average performance of several classifiers as grouped bars.

    One group of bars is drawn per classifier and one bar per metric. Each bar
    receives a black error bar of +/- the "Std" value of its own
    (classifier, metric) row.

    Parameters:
    metrics_df (pd.DataFrame): Dataframe with "Classifier", "Metric", "Average"
        and "Std" columns. If a (classifier, metric) pair has several rows,
        the first non-null "Std" of that pair is used for the error bar.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    # Pin the category orders (order of first appearance) so that each drawn
    # bar can be traced back to its classifier and metric.
    classifiers = metrics_df["Classifier"].dropna().unique().tolist()
    metrics = metrics_df["Metric"].dropna().unique().tolist()

    # Create a bar plot without seaborn's own error bars
    sns.barplot(
        data=metrics_df,
        x="Classifier",
        y="Average",
        hue="Metric",
        order=classifiers,
        hue_order=metrics,
        errorbar=None,
        ax=ax,
    )

    # Snapshot the bar containers now: every errorbar() call below appends a
    # container of its own to ax.containers. Containers are used instead of
    # ax.patches because they hold only the bars (ax.patches may also contain
    # other rectangles depending on the seaborn version).
    bar_containers = list(ax.containers)

    # Std looked up by (classifier, metric) instead of by flat row position:
    # seaborn draws the bars one metric at a time across all classifiers, so
    # the bar order does not follow the dataframe row order.
    std_by_bar = metrics_df.groupby(["Classifier", "Metric"])["Std"].first().to_dict()

    heights = []
    for metric, container in zip(metrics, bar_containers):
        for rect in container:
            height = rect.get_height()
            if not np.isfinite(height):
                # Placeholder bar for a missing (classifier, metric) pair
                continue

            center = rect.get_x() + rect.get_width() / 2
            # Groups sit on integer positions and the per-metric offsets stay
            # inside the group, so rounding the centre gives the group index.
            classifier = classifiers[int(round(center))]
            heights.append(height)

            std = std_by_bar.get((classifier, metric))
            if std is None or pd.isna(std):
                continue
            ax.errorbar(
                center,
                height,
                yerr=std,
                color="black",
                capsize=3,
                fmt="none",
            )

    # Zoom the y-axis on the range covered by the bars, capped at 100 %
    if heights:
        ax.set_ylim(min(heights) - 0.025, min(max(heights) + 0.025, 1))

    # Scale the y-axis to percentage (values are fractions of 1)
    ax.yaxis.set_major_formatter(PercentFormatter(1))

    ax.set_title("Classifier Performance")
    ax.set_ylabel("Performance")

    # Move the legend outside the plot
    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

    plt.show()

## Confidence threshold impact on accuracy

In [None]:
def evaluate_thresholds(df: pd.DataFrame, thresholds: list) -> pd.DataFrame:
    """
    Evaluate the accuracy and subset size for different probability thresholds.

    The label columns are detected automatically: the first column whose name
    contains "true" holds the true class and the first column whose name
    contains "pred" holds the predicted class (both case-insensitive). Every
    other numeric column is treated as a predicted probability.

    Parameters:
    df (pd.DataFrame): The dataframe containing true labels and predicted probabilities.
    thresholds (list): List of probability thresholds to evaluate.

    Returns:
    pd.DataFrame: One row per threshold that keeps at least one sample, with the
        columns "Threshold", "Accuracy (%)" and "Subset Size (%)". Thresholds
        that leave no sample are omitted.

    Raises:
    ValueError: If no "true" or no "pred" column can be found.
    """
    likely_true_class_cols = [col for col in df.columns if "true" in col.lower()]
    likely_pred_class_cols = [col for col in df.columns if "pred" in col.lower()]

    if not likely_true_class_cols or not likely_pred_class_cols:
        raise ValueError(
            "Could not automatically detect 'True class' or 'Predicted class' columns."
        )

    true_col = likely_true_class_cols[0]
    pred_col = likely_pred_class_cols[0]

    # The remaining numeric columns hold the predicted probabilities. The label
    # columns are excluded explicitly: with numeric class labels they would
    # otherwise enter the row-wise max and defeat the threshold filter.
    label_cols = {true_col, pred_col}
    pred_prob_cols = [
        col
        for col in df.select_dtypes(include=[np.number]).columns
        if col not in label_cols
    ]

    # Both quantities are independent of the threshold: compute them once.
    max_prob = df[pred_prob_cols].max(axis=1)
    is_correct = df[true_col] == df[pred_col]
    total_size = len(df)

    results = []
    for thresh in thresholds:
        # Keep rows whose highest predicted probability reaches the threshold
        selected = max_prob >= thresh
        subset_size = int(selected.sum())

        if subset_size == 0:
            continue

        # Accuracy within the subset, and subset size relative to the whole
        # dataset, both as percentages
        accuracy = (is_correct[selected].sum() / subset_size) * 100
        subset_size_percent = (subset_size / total_size) * 100

        results.append([thresh, accuracy, subset_size_percent])

    return pd.DataFrame(
        results, columns=["Threshold", "Accuracy (%)", "Subset Size (%)"]
    )


def create_thresholds_graph(threshold_df: pd.DataFrame, name: str):
    """Build a dual-axis line chart of accuracy and subset size versus threshold.

    Parameters:
    threshold_df (pd.DataFrame): Dataframe with "Threshold", "Accuracy (%)" and
        "Subset Size (%)" columns.
    name (str): Text appended as a second line of the figure title.

    Returns:
    The matplotlib figure holding both y-axes.
    """
    figure, accuracy_ax = plt.subplots(figsize=(10, 6))
    threshold_values = threshold_df["Threshold"]

    def draw_series(axis, column: str, marker: str, colour: str) -> None:
        """Plot one column against the thresholds and colour its y-axis to match."""
        axis.plot(
            threshold_values,
            threshold_df[column],
            label=column,
            marker=marker,
            color=colour,
        )
        axis.set_ylabel(column, color=colour)
        axis.tick_params(axis="y", labelcolor=colour)

    # Left y-axis: accuracy, in blue with round markers
    draw_series(accuracy_ax, "Accuracy (%)", "o", "b")
    accuracy_ax.set_xlabel("Probability Threshold")

    # Right y-axis (sharing the x-axis): subset size, in red with cross markers
    size_ax = accuracy_ax.twinx()
    draw_series(size_ax, "Subset Size (%)", "x", "r")

    # 11 evenly spaced x ticks from 0 to 1, plus a grid on the primary axes
    accuracy_ax.set_xticks(np.linspace(0, 1, 11))
    accuracy_ax.grid(True)

    # The title goes through pyplot, i.e. onto the current axes
    plt.title("Accuracy and Subset Size at Different Probability Thresholds\n" + name)
    return figure

In [None]:
# Root directory that is searched for prediction files.
source_base = Path.home().joinpath(
    "mounts/narval-mount/project-rabyj/epilap/output/logs/epiatlas-dfreeze-v2.1/hg38_100kb_all_none"
)

# Prediction files located exactly two directory levels below the root,
# leaving out every path that mentions "oversampling".
files = [
    prediction_csv
    for prediction_csv in source_base.glob(
        "*/*/full-10fold-validation_prediction.csv"
    )
    if "oversampling" not in str(prediction_csv)
]

In [None]:
# for file in files:
#     print(file)

In [None]:
# Thresholds from 0 up to (but excluding) 1 in steps of 0.05, plus 0.99
thresholds = list(np.arange(0, 1, 1 / 20)) + [0.99]
for file in files:
    df = pd.read_csv(file, header=0)
    nb_samples = df.shape[0]
    # Every numeric column of the file is counted as one class
    nb_classes = len(df.select_dtypes(include=[np.number]).columns.tolist())
    threshold_df = evaluate_thresholds(df, thresholds)

    # Label built from the grandparent and parent directory names of the file
    name = f"{file.parents[1].name} - {file.parents[0].name} - {nb_classes} classes"
    graph = create_thresholds_graph(threshold_df, f"{name} - n={nb_samples}")

    # Save through the figure object itself rather than pyplot's implicit
    # "current figure": first next to the prediction file ...
    graph.savefig(file.parent / "threshold_impact_graph.png")
    # ... then in the working directory, under a name derived from the label
    new_filename = (
        name.replace(" ", "_").replace("\n", "_").replace("-", "").replace("__", "_")
    )
    graph.savefig(f"threshold_impact_graph_{new_filename}.png")

    # Display the figure, then release it: one figure is created per file and
    # they would otherwise all stay open until the end of the loop.
    plt.show()
    plt.close(graph)