In [None]:
"""See markdown"""
# pylint: disable=line-too-long, redefined-outer-name, import-error, pointless-statement, use-dict-literal

Analyze the prediction scores of correct vs. incorrect predictions. Can we find a prediction score threshold that lets us eliminate important errors?

## Prediction distributions (per cell of confusion matrix)

In [None]:
from pathlib import Path
from typing import Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from epi_ml.utils.general_utility import get_valid_filename

In [None]:
# Working directory: input CSVs are read from here and figures are written back to it.
logdir = Path.home() / "downloads" / "temp"

# Prediction table; the first CSV column becomes the index and the first row the header.
path = logdir / "sex3_oversample_full-10fold-validation_prediction_augmented-all.csv"
df = pd.read_csv(path, index_col=0, header=0)

In [None]:
# Distinct values of the "True class" column; the plotting cells below iterate over them.
classes = df["True class"].unique()

In [None]:
# Number of rows per value of the "harmonized_donor_sex" column (shown as cell output).
df["harmonized_donor_sex"].value_counts()

In [None]:
# for label in classes:
#     df_label = df[df["True class"] == label]
#     fig = go.Figure()

#     # Iterate classes each target and add a violin plot for it
#     for target in classes:
#         vals = df_label[df_label["Predicted class"] == target]["Max pred"]
#         print(df_label["assay_epiclass"].value_counts())

#         fig.add_trace(
#             go.Violin(
#                 y=vals,
#                 name=f"{target} ({len(vals)})",
#                 box_visible=True,
#                 meanline_visible=True,
#                 points="all",
#             )
#         )

#     fig.update_layout(
#         title_text=f"Predicted value distribution for {label} ({df_label.shape[0]})",
#         yaxis_title="Prediction score",
#         xaxis_title="Target",
#     )
#     fig.update_yaxes(range=[1 / len(classes), 1.01])

#     fig.show()

Combine chrY coverage information

In [None]:
# Coverage table; the first CSV column becomes the index, which is later used as the merge key.
# NOTE(review): presumably provides the "chrY", "chrX" and "chrY/chrX" columns plotted below - confirm.
coverage_path = logdir / "coverage_combined.csv"
coverage_df = pd.read_csv(coverage_path, index_col=0, header=0)

In [None]:
# Preview the first rows of the coverage table (shown as cell output).
coverage_df.head()

In [None]:
# Inner join on the index: only rows whose index value exists in both frames are kept.
merged_df = df.merge(coverage_df, left_index=True, right_index=True, how="inner")

In [None]:
# Print the shape of each frame (one per line) to see how many rows survive the merge.
# A generator expression is used instead of `for df in [...]` so that the
# notebook-level name `df` is not rebound: the loop form left `df` pointing at
# `merged_df` once it finished, silently changing what earlier cells see on re-run.
print(*(frame.shape for frame in (df, coverage_df, merged_df)), sep="\n")

In [None]:
# List the columns available after the merge (shown as cell output).
merged_df.columns

In [None]:
# Minimum prediction score kept for the plots. The figure title is built from this
# value so that the title cannot drift away from the filter actually applied.
min_pred = 0.9
coverage_labels = ["chrY", "chrX", "chrY/chrX"]

# `merged_df` is narrowed in place: rows at or below the score threshold are dropped,
# then rows whose "assay_epiclass" matches "input" or "wgb" (case-insensitive).
merged_df = merged_df[merged_df["Max pred"] > min_pred]
merged_df = merged_df[
    ~merged_df["assay_epiclass"].str.contains(case=False, regex=True, pat="input|wgb")
]

# One figure per predicted class; inside it, one violin per (true class, coverage column).
for label in classes:
    df_label = merged_df[merged_df["Predicted class"] == label]
    fig = go.Figure()

    for target in classes:
        # The row selection does not depend on the coverage column: compute it once.
        sub_df = df_label[df_label["True class"] == target]

        for coverage_label in coverage_labels:
            fig.add_trace(
                go.Violin(
                    y=sub_df[coverage_label],
                    name=f"{target}: {coverage_label} ({sub_df.shape[0]})",
                    box_visible=True,
                    meanline_visible=True,
                    points="all",
                    text=sub_df.index,
                )
            )

    title = f"Coverage distribution for prediction {label}, max_pred > {min_pred}"
    fig.update_layout(
        title_text=f"{title} ({df_label.shape[0]})",
        yaxis_title="Mean coverage",
        xaxis_title="True class",
    )
    fig.update_yaxes(range=[-0.001, 2])

    fig.show()

    # Save under a filesystem-safe version of the title, as HTML and as PNG.
    title = get_valid_filename(title)
    fig.write_html(logdir / f"{title}.html")
    fig.write_image(logdir / f"{title}.png", scale=2)

In [None]:
# The titles below claim `max_pred > 0.9`, so the filters are applied in this cell
# instead of relying on `merged_df` having been narrowed by another cell. Both
# filters are no-ops when `merged_df` was already filtered the same way.
label_min_pred = 0.9
confident_df = merged_df[merged_df["Max pred"] > label_min_pred]
confident_df = confident_df[
    ~confident_df["assay_epiclass"].str.contains(
        case=False, regex=True, pat="input|wgb"
    )
]

# One figure per true class; inside it, one violin per (predicted class, coverage column).
for label in classes:
    df_label = confident_df[confident_df["True class"] == label]
    fig = go.Figure()

    for target in classes:
        # The row selection does not depend on the coverage column: compute it once.
        sub_df = df_label[df_label["Predicted class"] == target]

        for coverage_label in ["chrY", "chrX", "chrY/chrX"]:
            fig.add_trace(
                go.Violin(
                    y=sub_df[coverage_label],
                    name=f"{target}: {coverage_label} ({sub_df.shape[0]})",
                    box_visible=True,
                    meanline_visible=True,
                    points="all",
                    text=sub_df.index,
                )
            )

    title = f"Coverage distribution for label {label}, max_pred > {label_min_pred}"
    fig.update_layout(
        title_text=f"{title} ({df_label.shape[0]})",
        yaxis_title="Mean coverage",
        xaxis_title="Predicted class",
    )
    fig.update_yaxes(range=[-0.001, 2])

    fig.show()

    # Save under a filesystem-safe version of the title, as HTML and as PNG.
    title = get_valid_filename(title)
    fig.write_html(logdir / f"{title}.html")
    fig.write_image(logdir / f"{title}.png", scale=2)

## Unknown samples

In [None]:
# Second prediction table, read from the same working directory as the first one;
# the first CSV column becomes the index and the first row the header.
unknown_predict_path = (
    logdir
    / "sex3_complete_no_valid_oversample_test_prediction_100kb_all_none_dfreeze_v2.1_sex_mixed_unknown_augmented-all.csv"
)
unknown_predict_df = pd.read_csv(unknown_predict_path, index_col=0, header=0)

In [None]:
# Keep only the rows whose true class is "unknown", then attach the coverage
# columns with an inner join on the index (rows without coverage are dropped).
# `label` is reused by the next cell to build the figure title.
# NOTE(review): `unknown_predict_df` is overwritten in place, so running this cell a
# second time merges `coverage_df` into an already merged frame - presumably pandas
# then suffixes the duplicated coverage columns; re-run the loading cell first.
label = "unknown"
unknown_predict_df = unknown_predict_df[unknown_predict_df["True class"] == label]
unknown_predict_df = unknown_predict_df.merge(
    coverage_df, left_index=True, right_index=True, how="inner"
)

In [None]:
fig = go.Figure()

# Minimum prediction score kept for this plot. The title (and therefore the output
# file name) is derived from it: it used to be hard-coded as "max_pred > 0.9"
# while the filter actually applied was 0.7.
unknown_min_pred = 0.7

# NOTE: this rebinds the notebook-level `classes` to the predicted classes of the
# unknown samples, taken before filtering (so a class may end up with zero rows).
classes = unknown_predict_df["Predicted class"].unique()
unknown_predict_df = unknown_predict_df[
    unknown_predict_df["Max pred"] > unknown_min_pred
]
# Drop rows whose "assay_epiclass" matches "input" or "wgb" (case-insensitive).
unknown_predict_df = unknown_predict_df[
    ~unknown_predict_df["assay_epiclass"].str.contains(
        case=False, regex=True, pat="input|wgb"
    )
]


# One violin per (predicted class, coverage column).
for target in classes:
    sub_df = unknown_predict_df[unknown_predict_df["Predicted class"] == target]
    for coverage_label in ["chrY", "chrX", "chrY/chrX"]:
        fig.add_trace(
            go.Violin(
                y=sub_df[coverage_label],
                name=f"{target}: {coverage_label} ({sub_df.shape[0]})",
                box_visible=True,
                meanline_visible=True,
                points="all",
                text=sub_df.index,
            )
        )

# `label` comes from the cell that selected the samples ("unknown").
title = f"Coverage distribution for label {label}, max_pred > {unknown_min_pred}"
fig.update_layout(
    title_text=f"{title} ({unknown_predict_df.shape[0]})",
    yaxis_title="Mean coverage",
    xaxis_title="Predicted class",
)
fig.update_yaxes(range=[-0.001, 2])
fig.show()

# Save under a filesystem-safe version of the title, as HTML and as PNG.
title = get_valid_filename(title)
fig.write_html(logdir / f"{title}.html")
fig.write_image(logdir / f"{title}.png", scale=2)

In [None]:
# NOTE(review): only prints a fixed string and sets no state - looks like a leftover
# marker/debug cell; confirm and delete.
print("Miaw")

## Confidence threshold impact on accuracy

In [None]:
def compute_accuracy(
    df: pd.DataFrame,
    threshold: float,
    true_col: str,
    pred_col: str,
    pred_prob_cols: List[str],
) -> Tuple[float, float, float]:
    """
    Compute the accuracy and subset size for a given probability threshold.

    A row belongs to the subset when its highest value among `pred_prob_cols`
    is greater than or equal to `threshold`.

    Parameters:
    df (pd.DataFrame): The input DataFrame containing the true labels, predicted labels, and predicted probabilities.
    threshold (float): The probability threshold for filtering the DataFrame.
    true_col (str): The column name containing the true labels.
    pred_col (str): The column name containing the predicted labels.
    pred_prob_cols (List[str]): List of column names containing the predicted probabilities.

    Returns:
    Tuple[float, float, float]: The threshold (always returned as given), the accuracy (%)
    within the subset, and the subset size (%) relative to `df`.
    When no row reaches the threshold, accuracy is NaN and subset size is 0.
    When `df` itself is empty, both accuracy and subset size are NaN.
    """
    total_size = len(df)
    if total_size == 0:
        # Nothing to measure: neither ratio is defined.
        return threshold, np.nan, np.nan

    # Filter rows where the max predicted probability reaches the threshold
    max_prob = df[pred_prob_cols].max(axis=1)
    subset_df = df[max_prob >= threshold]
    subset_size = len(subset_df)

    # Size of the subset as a percentage of the total dataset
    subset_size_percent = (subset_size / total_size) * 100

    if subset_size == 0:
        # Keep the threshold and the (zero) subset size so the row stays usable
        # in the results table; only the accuracy is undefined.
        return threshold, np.nan, subset_size_percent

    # Accuracy within the subset
    correct_preds = np.sum(subset_df[true_col] == subset_df[pred_col])
    accuracy = (correct_preds / subset_size) * 100

    return threshold, accuracy, subset_size_percent


def evaluate_thresholds(
    df: pd.DataFrame, thresholds: List[float]
) -> Dict[str, pd.DataFrame]:
    """
    Evaluate the accuracy and subset size for different probability thresholds,
    per true class and for the whole dataset, with automatic column detection.

    The true-label column is the first column whose name contains "true", and the
    predicted-label column the first one whose name contains "pred"; in both cases
    a column whose name also contains "class" is preferred, so that a score column
    such as "Max pred" is not mistaken for "Predicted class".
    The probability columns are expected to be named after the true class labels.

    Parameters:
    df (pd.DataFrame): The dataframe containing true labels, predicted labels and predicted probabilities.
    thresholds (list): List of probability thresholds to evaluate.

    Returns:
    Dict[str, pd.DataFrame]: One dataframe per true class label, plus one under the key "all"
    for the whole dataset. Each has the columns "Threshold", "Accuracy_<label> (%)" and
    "Subset_Size_<label> (%) (<number of rows>)", where <label> is cut to 10 characters.

    Raises:
    ValueError: If no true-label or predicted-label column can be detected.
    KeyError: If a true class label has no probability column of the same name.
    """

    def _detect_column(keyword: str):
        """Return the first column containing `keyword`, preferring '...class...' names."""
        candidates = [col for col in df.columns if keyword in str(col).lower()]
        preferred = [col for col in candidates if "class" in str(col).lower()]
        ranked = preferred or candidates
        return ranked[0] if ranked else None

    # Automatic column detection
    true_col = _detect_column("true")
    pred_col = _detect_column("pred")

    if true_col is None or pred_col is None:
        raise ValueError(
            "Could not automatically detect 'True class' or 'Predicted class' columns."
        )

    if df[true_col].dtype != object or df[pred_col].dtype != object:
        print(f"{true_col} and {pred_col} are not string columns. Could cause issues.")

    # Probability columns are looked up by true class label.
    class_labels = df[true_col].unique().tolist()
    pred_prob_cols = class_labels
    missing_cols = [label for label in class_labels if label not in df.columns]
    if missing_cols:
        raise KeyError(
            f"No probability column found for class label(s): {missing_cols}"
        )

    # Evaluate each threshold over each class, then over the whole dataset ("all")
    results_dfs = {}
    for class_label in class_labels + ["all"]:
        filtered_df = df if class_label == "all" else df[df[true_col] == class_label]

        results = [
            compute_accuracy(filtered_df, thresh, true_col, pred_col, pred_prob_cols)
            for thresh in thresholds
        ]

        # str() keeps the slicing valid for non-string labels (e.g. integers).
        short_class_label = str(class_label)[0:10]
        results_dfs[class_label] = pd.DataFrame(
            results,
            columns=[
                "Threshold",
                f"Accuracy_{short_class_label} (%)",
                f"Subset_Size_{short_class_label} (%) ({filtered_df.shape[0]})",
            ],
        )

    return results_dfs

In [None]:
def create_thresholds_graph(threshold_dfs: Dict[str, pd.DataFrame], name: str):
    """
    Draw, with matplotlib, accuracy and subset size against the probability threshold.

    Each dataframe contributes two curves in the same colour: a solid one for its
    "Accuracy..." column on the left axis and a dashed one for its "Subset..."
    column on the right axis.

    Parameters:
    threshold_dfs (Dict[str, pd.DataFrame]): A dictionary containing dataframes for each class label and the general case.
    name (str): Graph title suffix.

    Returns:
    plt.Figure: The figure object of the plotted graph.
    """
    palette = ["b", "g", "r", "c", "m", "y", "k"]
    symbols = ["o"]
    symbol_size = 3

    fig, accuracy_ax = plt.subplots(figsize=(10, 6))
    size_ax = accuracy_ax.twinx()

    for position, results in enumerate(threshold_dfs.values()):
        # Colour and marker cycle through the palettes; both curves share them.
        shared_style = {
            "marker": symbols[position % len(symbols)],
            "color": palette[position % len(palette)],
            "markersize": symbol_size,
        }

        accuracy_column = results.filter(like="Accuracy").columns[0]
        size_column = results.filter(like="Subset").columns[0]
        x_values = results["Threshold"]

        # Accuracy curve (left axis, solid line)
        accuracy_ax.plot(
            x_values,
            results[accuracy_column],
            label=accuracy_column,
            **shared_style,
        )

        # Subset size curve (right axis, dashed line for better distinction)
        size_ax.plot(
            x_values,
            results[size_column],
            label=size_column,
            linestyle="--",
            **shared_style,
        )

    accuracy_ax.set_xlabel("Probability Threshold")
    accuracy_ax.set_ylabel("Accuracy (%)")
    accuracy_ax.tick_params(axis="y", labelcolor="b")

    size_ax.set_ylabel("Subset Size (%)")
    size_ax.tick_params(axis="y", labelcolor="r")

    # 11 evenly spaced ticks between 0 and 1 on the x-axis
    accuracy_ax.set_xticks(np.linspace(0, 1, 11))

    # Grid, one legend per axis (placed to the right of the plot), and title
    accuracy_ax.grid(True)
    accuracy_ax.legend(loc="upper right", bbox_to_anchor=(1.3, 0.5), title="Accuracy")
    size_ax.legend(loc="lower right", bbox_to_anchor=(1.3, 0.5), title="Subset Size")
    plt.title("Accuracy and Subset Size at Different Probability Thresholds\n" + name)

    return fig

In [None]:
def create_thresholds_graph_plotly(threshold_dfs: Dict[str, pd.DataFrame], name: str):
    """
    Draw, with plotly, accuracy and subset size against the probability threshold.

    Each dataframe contributes two traces in the same colour: a solid one for its
    "Accuracy..." column on the primary y-axis and a dashed one for its "Subset..."
    column on the secondary (right) y-axis.

    Parameters:
    threshold_dfs (Dict[str, pd.DataFrame]): A dictionary containing dataframes for each class label and the general case.
    name (str): Graph title.

    Returns:
    go.Figure: Plotly figure object with the plotted graph.
    """
    palette = px.colors.qualitative.Dark24
    symbol = "circle"
    tick_positions = np.linspace(0, 1, 11)

    fig = go.Figure()

    for position, results in enumerate(threshold_dfs.values()):
        trace_color = palette[position % len(palette)]

        accuracy_column = results.filter(like="Accuracy").columns[0]
        size_column = results.filter(like="Subset").columns[0]
        x_values = results["Threshold"]

        # Accuracy trace (primary y-axis)
        accuracy_trace = go.Scatter(
            x=x_values,
            y=results[accuracy_column],
            name=accuracy_column,
            line={"color": trace_color},
            marker_symbol=symbol,
            mode="lines+markers",
        )
        fig.add_trace(accuracy_trace)

        # Subset size trace (secondary y-axis, dashed line)
        size_trace = go.Scatter(
            x=x_values,
            y=results[size_column],
            name=size_column,
            line={"color": trace_color, "dash": "dash"},
            marker_symbol=symbol,
            yaxis="y2",
            mode="lines+markers",
        )
        fig.add_trace(size_trace)

    # Layout: titles, 11 x ticks with one decimal, right-hand second axis, legend outside
    fig.update_layout(
        title=f"Accuracy and Subset Size at Different Probability Thresholds<br>{name}",
        xaxis_title="Probability Threshold",
        xaxis={
            "tickvals": tick_positions,
            "ticktext": [f"{tick:.1f}" for tick in tick_positions],
        },
        yaxis_title="Accuracy (%)",
        yaxis2={"title": "Subset Size (%)", "overlaying": "y", "side": "right"},
        legend={"orientation": "v", "x": 1.05, "y": 1},
        height=1000,
        width=1600,
    )
    fig.update_xaxes(range=[-0.001, 1.001])
    fig.update_traces(line={"width": 1})

    return fig

In [None]:
# Select files for analysis
list_path = (
    Path.home()
    / "projects"
    / "epilap"
    / "output"
    / "dfreeze_results"
    / "10fold_results.list"
)
with open(list_path, "r", encoding="utf8") as f:
    files = [Path(line.strip()) for line in f.readlines()]

In [None]:
# Thresholds 0.00, 0.05, ..., 0.95, plus 0.99
thresholds = [*np.arange(0, 1, 1 / 20), 0.99]

# For every listed result file: evaluate the thresholds, draw the plotly graph and
# save it (HTML + PNG) next to the result file.
for file in files:
    print(file)

    # compute
    df = pd.read_csv(file, header=0, dtype={"True class": str, "Predicted class": str})
    nb_samples = len(df)
    # The number of classes is taken as the number of numeric columns in the file.
    nb_classes = df.select_dtypes(include=[np.number]).shape[1]

    threshold_df = evaluate_thresholds(df, thresholds)

    # plot: the name is built from the two directories above the file
    name = " - ".join(
        [file.parents[1].name, file.parents[0].name, f"{nb_classes} classes"]
    )
    graph = create_thresholds_graph_plotly(threshold_df, f"{name} - n={nb_samples}")

    # save
    filename = f"threshold_impact_graph_full_{get_valid_filename(name)}"
    filename = filename.replace("_-_", "-")
    print(filename)

    output_dir = file.parent
    graph.write_html(output_dir / f"{filename}.html")
    graph.write_image(output_dir / f"{filename}.png", scale=3)

In [None]:
# thresholds = list(np.arange(0, 1, 1 / 20)) + [0.99]
# nb_samples = df.shape[0]
# nb_classes = len(df.select_dtypes(include=[np.number]).columns.tolist())

# results_dfs = evaluate_thresholds(df, thresholds)

In [None]:
# graph = create_thresholds_graph_plotly(results_dfs, f"sex3 - n={nb_samples}")
# graph.show()
# plt.savefig(file.parent / "threshold_impact_graph.png")
# new_filename = (
#     name.replace(" ", "_").replace("\n", "_").replace("-", "").replace("__", "_")
# )
# graph.savefig(f"threshold_impact_graph_{new_filename}.png")