In [None]:
"""Create plots of some results."""
# pylint: disable=import-error,redefined-outer-name, singleton-comparison

In [None]:
import re
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import seaborn as sns
from matplotlib.ticker import PercentFormatter

## All classifiers performance comparison

In [None]:
# CSV file under ~/downloads/temp, resolved relative to the current user's home.
# NOTE(review): presumably the input meant for create_metrics_df — no call is visible here.
file_path = Path.home() / "downloads" / "temp" / "all_metrics - Pivot Table 1.csv"

In [None]:
def create_metrics_df(file_path: Path) -> pd.DataFrame:
    """Load the metrics CSV and return it with normalized column names.

    The CSV is expected to contain the columns "classifier", "metric",
    "AVERAGE of value" and "STDEV of value". Blank cells in "classifier" are
    forward-filled so that every row carries the classifier name of the closest
    non-empty row above it.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The dataframe with the four columns renamed to "Classifier", "Metric",
        "Average" and "Std". Any other column is kept unchanged.
    """
    metrics_df = pd.read_csv(file_path)

    # Forward-fill the classifier name; `.ffill()` replaces the deprecated
    # `fillna(method="ffill")` spelling.
    metrics_df["classifier"] = metrics_df["classifier"].ffill()

    # Rename columns to match the desired format and hand the result back to the
    # caller (the previous version renamed in place and returned nothing).
    return metrics_df.rename(
        columns={
            "classifier": "Classifier",
            "metric": "Metric",
            "AVERAGE of value": "Average",
            "STDEV of value": "Std",
        }
    )

In [None]:
def plot_classifiers_performance(metrics_df: pd.DataFrame) -> None:
    """Plot the performance of multiple classifiers as grouped bars with error bars.

    Draws one bar per (Classifier, Metric) pair at height "Average", grouped by
    classifier and colored by metric, then overlays a black error bar of size
    "Std" on each bar and shows the figure.

    Each error bar is matched to its bar through the (Classifier, Metric) pair
    rather than through a flat index: seaborn draws grouped bars one hue level
    at a time, so the order of the bar patches is not the row order of
    `metrics_df`.

    Args:
        metrics_df: Dataframe with "Classifier", "Metric", "Average" and "Std"
            columns.
            NOTE(review): assumes one row per (Classifier, Metric) pair; if a
            pair is duplicated the "Std" of its last row is used — confirm.
    """
    # Pin both category orders (order of first appearance) so that bars can be
    # mapped back to their classifier and metric without guessing.
    classifier_order = list(metrics_df["Classifier"].dropna().unique())
    metric_order = list(metrics_df["Metric"].dropna().unique())

    # Standard deviation of every (classifier, metric) pair, looked up by key so
    # the result does not depend on the dataframe index.
    std_by_pair = dict(
        zip(
            zip(metrics_df["Classifier"], metrics_df["Metric"]),
            metrics_df["Std"],
        )
    )

    # Set the figure size
    plt.figure(figsize=(10, 6))

    # Create a bar plot without error bars
    barplot = sns.barplot(
        data=metrics_df,
        x="Classifier",
        y="Average",
        hue="Metric",
        order=classifier_order,
        hue_order=metric_order,
        errorbar=None,
    )

    # One bar container per hue level, in `hue_order`. Snapshot them before
    # drawing error bars, because errorbar() registers containers of its own.
    bar_containers = list(barplot.containers)[: len(metric_order)]

    bar_heights = []
    for metric, container in zip(metric_order, bar_containers):
        for rect in container.patches:
            height = rect.get_height()
            if pd.isna(height):
                # No value for this pair: nothing to annotate.
                continue

            center = rect.get_x() + rect.get_width() / 2
            # Categories sit at integer positions 0..n-1 and the hue offset
            # stays inside the category slot, so rounding recovers the index.
            position = int(round(center))
            if not 0 <= position < len(classifier_order):
                continue

            bar_heights.append(height)

            std = std_by_pair.get((classifier_order[position], metric))
            if std is None or pd.isna(std):
                continue

            # Add the error bar
            barplot.errorbar(
                center,
                height,
                yerr=std,
                color="black",
                capsize=3,
                fmt="none",
            )

    # Set the y-axis limits to center the value distribution (capped at 100%)
    if bar_heights:
        plt.ylim(min(bar_heights) - 0.025, min(max(bar_heights) + 0.025, 1))  # type: ignore

    # Scale the y-axis to percentage
    plt.gca().yaxis.set_major_formatter(PercentFormatter(1))

    plt.title("Classifier Performance")
    plt.ylabel("Performance")

    # Move the legend outside the plot
    plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0)

    plt.show()

## Multiple HDF5 versions on a given classification task - using 10-split data

In [None]:
# Read the log CSV from ~/downloads/temp into `df`; the cells below filter `df`
# by category, output size and oversampling before plotting.
input_dir = Path.home() / "downloads" / "temp"
input_data_path = input_dir / "cometml_logs_recent.csv"
df = pd.read_csv(input_data_path)

In [None]:
# Preview the first rows of the loaded CSV.
df.head()

In [None]:
# Sorted distinct categories among the rows whose "Server start time" exceeds 1.7e12.
desired_categories = sorted(
    df.loc[df["Server start time"] > 1.7e12, "category"].unique()
)

In [None]:
# For every selected category, list the distinct output sizes found in the logs.
for category in desired_categories:
    print(f"Category: {category}")
    print(df.loc[df["category"] == category, "output_size"].unique())

In [None]:
# Output size to keep for each category. The plotting loop pairs this list with
# `desired_categories` positionally via zip(), so both must be in the same order.
OUTPUT_SIZES = [11, 5, 3, 2, 16]

In [None]:
# Keep the rows that belong to a selected category and have oversampling set to True.
pertinent_df = df[
    df["category"].isin(desired_categories) & (df["oversampling"] == True)
]
# pertinent_df = pertinent_df.iloc[:, :-4]

Notes:
- Add the tasks equivalent to 100kb_all_none.
- Find a way to include both the filter information and the input size.
- To get a first idea, just throw everything in, using input size + filter_name + resolution as the label.
- Then consider making manual groups using filter names, colored by resolution.

In [None]:
def graph_task_metrics(df: pd.DataFrame, category: str, output_dir: Path) -> None:
    """Write one box plot per validation metric for a task.

    For "val_Accuracy" and "val_F1Score" in turn, builds a plotly box plot of
    the metric against the "HDF5 filter" column, colored by "HDF5 Resolution",
    and saves it in `output_dir` as "<category>_<metric>.html" and
    "<category>_<metric>.png".

    Args:
        df: Dataframe holding the "HDF5 filter" and "HDF5 Resolution" columns
            as well as both metric columns.
        category: Task name, used in the figure title and in the file names.
        output_dir: Directory that receives the html and png files.
    """
    filter_order = (
        "all",
        "global_tasks_union",
        "random_n4510",
        "global_tasks_intersection",
        "random_n118",
    )
    resolution_order = ("1.0kb", "10.0kb", "100.0kb")

    for metric_name in ("val_Accuracy", "val_F1Score"):
        # Fresh list objects on every iteration, as in a literal written inline.
        axis_orders = {
            "HDF5 filter": list(filter_order),
            "HDF5 Resolution": list(resolution_order),
        }
        box_fig = px.box(
            df,
            x="HDF5 filter",
            y=metric_name,
            title=f"{category}: {metric_name}",
            points="all",
            category_orders=axis_orders,
            color="HDF5 Resolution",
            color_discrete_sequence=px.colors.qualitative.Safe,
            width=800,
            height=800,
        )
        # Also display the mean in each box.
        box_fig.update_traces(boxmean=True)

        html_path = output_dir / f"{category}_{metric_name}.html"
        png_path = output_dir / f"{category}_{metric_name}.png"
        box_fig.write_html(html_path)
        box_fig.write_image(png_path)

In [None]:
def re_desired_base_name(x: str, category: str) -> str:
    """Return a representative base name from a full classification task name.

    Two patterns are tried in order:

    1. ``hg38_<digits>kb_<base>_none-<category>_1l_3000n-10fold-oversampl<...>-split<digit>``,
       in which case ``<base>`` is returned;
    2. the bare category anywhere in the name, in which case the category
       itself is returned.

    Args:
        x: Full task name to parse.
        category: Category name expected inside the task name. It is matched
            literally, so regex metacharacters in it have no special meaning.

    Returns:
        The captured base name, or `category` when only the fallback matches.

    Raises:
        ValueError: If `x` matches neither pattern.
    """
    # Escape the category so that characters such as "+" or "." are matched
    # literally instead of being interpreted as regex syntax.
    category_re = re.escape(category)

    full_pattern = (
        r"hg38_\d+kb_(.*)_none-"
        + f"{category_re}_1l_3000n"
        + r"-10fold-oversampl\w+-split\d{1}"
    )
    m = re.search(full_pattern, x)
    if m is None:
        # Fallback: the name only needs to contain the category.
        m = re.search(f"({category_re}).*", x)
    if m is None:
        raise ValueError(
            f"Cannot extract a base name from {x!r} for category {category!r}"
        )
    return m.group(1)

In [None]:
def graph_task_metrics_naive(df: pd.DataFrame, category: str, output_dir: Path) -> None:
    """Write one box plot per validation metric, one box per "<input_size>|<base_name>".

    Each row gets a label made of its "input_size" and of the base name
    extracted from its "Name" by `re_desired_base_name`. For "val_Accuracy" and
    "val_F1Score" in turn, a plotly box plot of the metric against that label,
    colored by "HDF5 Resolution", is saved in `output_dir` as
    "<category>_<metric>.html" and "<category>_<metric>.png".

    The input dataframe is left untouched: the helper columns are added to a
    copy.

    Args:
        df: Dataframe with the "Name", "input_size" and "HDF5 Resolution"
            columns as well as both metric columns.
        category: Task name, used to parse the names, in the figure title and
            in the file names.
        output_dir: Directory that receives the html and png files.
    """
    # Copy first: callers typically pass a filtered slice, and adding columns to
    # it would both modify their data and trigger SettingWithCopyWarning.
    plot_df = df.copy()
    plot_df["base_name"] = plot_df["Name"].apply(
        lambda name: re_desired_base_name(name, category)
    )
    plot_df["graph_label"] = (
        plot_df["input_size"].astype(str).str.cat(plot_df["base_name"], sep="|")
    )

    def _label_sort_key(label: str):
        """Sort labels by numeric input size, then by base name."""
        size, _, base_name = label.partition("|")
        # float() also accepts sizes rendered as "3000.0" (float column).
        return (float(size), base_name)

    # One entry per distinct label; the axis order needs no duplicates.
    label_order = sorted(plot_df["graph_label"].unique(), key=_label_sort_key)

    for metric in ["val_Accuracy", "val_F1Score"]:
        fig = px.box(
            plot_df,
            x="graph_label",
            y=metric,
            title=f"{category}: {metric}",
            points="all",
            color="HDF5 Resolution",
            color_discrete_sequence=px.colors.qualitative.Safe,
            category_orders={
                "graph_label": label_order,
                "HDF5 Resolution": ["1.0kb", "10.0kb", "100.0kb"],
            },
            width=800,
            height=800,
        )
        # Also display the mean in each box.
        fig.update_traces(boxmean=True)
        fig.write_html(output_dir / f"{category}_{metric}.html")
        fig.write_image(output_dir / f"{category}_{metric}.png")

In [None]:
output_dir = Path.home() / "downloads" / "temp" / "output" / "naive"
# parents=True: the intermediate "output" directory may not exist yet.
output_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): zip() pairs the two lists positionally and silently stops at the
# shorter one — confirm OUTPUT_SIZES still lines up with desired_categories.
for category, output_size in zip(desired_categories, OUTPUT_SIZES):
    # Explicit copy: the plotting function derives extra columns from this frame.
    cat_df = pertinent_df[
        (pertinent_df["category"] == category)
        & (pertinent_df["output_size"].astype(int) == output_size)
    ].copy()

    graph_task_metrics_naive(cat_df, category, output_dir)