In [None]:
"""Workbook to analyse classifier predictions on recount3 data."""

# pylint: disable=duplicate-code

In [None]:
# Enable IPython's autoreload extension in mode 2, so that imported modules are
# reloaded before each cell execution (edits to epi_ml are picked up without a
# kernel restart).
%load_ext autoreload
%autoreload 2

## SETUP

In [None]:
from __future__ import annotations

from pathlib import Path

import pandas as pd
from IPython.display import display  # pylint: disable=unused-import

from epi_ml.utils.notebooks.paper.metrics_per_assay import MetricsPerAssay
from epi_ml.utils.notebooks.paper.paper_utilities import (
    ASSAY,
    BIOMATERIAL_TYPE,
    CANCER,
    LIFE_STAGE,
    SEX,
    check_label_coherence,
    merge_life_stages,
)

In [None]:
# Root of the paper outputs; every other location below is derived from it.
base_dir = Path.home().joinpath("Projects", "epiclass", "output", "paper")
paper_dir = base_dir  # alias bound to the same Path object

# First-level folders under the root.
base_fig_dir = base_dir.joinpath("figures")
table_dir = base_dir.joinpath("tables")
base_data_dir = base_dir.joinpath("data")

# recount3 metadata folder, and the folder holding the prediction tables.
metadata_dir = base_data_dir.joinpath("metadata", "recount3")
preds_dir = table_dir.joinpath("dfreeze_v2", "predictions")

In [None]:
full_preds_path = preds_dir / "recount3_merged_preds_metadata_freeze1.csv.gz"

# Load the merged predictions + metadata table, then normalise missing labels:
# every missing value becomes "unknown", and so does the "indeterminate" label.
# NOTE(review): fillna applies to every column, so a missing value in a numeric
# column would also become the string "unknown" - confirm the score columns
# have no missing values.
full_df = (
    pd.read_csv(full_preds_path, sep=",", compression="gzip", low_memory=False)
    .fillna("unknown")
    .replace("indeterminate", "unknown")
)

In [None]:
# Uniformize biomat labels: predicted biomaterial labels are lower-cased and
# their spaces replaced by underscores.
col = f"Predicted class ({BIOMATERIAL_TYPE})"
full_df[col] = full_df[col].str.lower().str.replace(" ", "_")

### Check that expected/predicted labels are coherent

In [None]:
# Categories for which expected and predicted labels are checked.
all_categories = [SEX, BIOMATERIAL_TYPE, CANCER, LIFE_STAGE]

# Column-name templates; "{}" is presumably filled with each category name by
# check_label_coherence (e.g. "Predicted class (<category>)").
column_templates = {"True": "{}", "Predicted": "Predicted class ({})"}

check_label_coherence(full_df, all_categories, column_templates)

In [None]:
# Biomaterial-type values that are filtered out in the "no cell line" analysis:
# both spellings of the cell line label, plus "unknown".
cell_line_vals = ["cell_line", "cell line", "unknown"]

## Computing metrics

### Assay predictions details

In [None]:
# Keep only the samples whose expected assay label is known.
assay_df = full_df[full_df[ASSAY] != "unknown"].copy()
N = assay_df.shape[0]

# Predicted classes that count as a correct assay prediction.
rna_classes = ["rna_seq", "mrna_seq"]
pred_col = f"Predicted class ({ASSAY})"

# NOTE: despite its name, `max_pred` is the *minimum* prediction score a sample
# needs in order to be kept in the subset.
for max_pred in [0, 0.6, 0.8]:
    subset = assay_df[assay_df[f"Max pred ({ASSAY})"] >= max_pred]
    counts = subset[pred_col].value_counts()

    N_subset = counts.sum()
    counts_perc = counts / N_subset
    # reindex(..., fill_value=0) tolerates a class that was never predicted in
    # this subset; direct label indexing would raise a KeyError instead.
    correct_perc = counts_perc.reindex(rna_classes, fill_value=0).sum()
    print(f"min_PredScore >= {max_pred} ({N_subset/N:.2%} left): {correct_perc:.2%}\n")

    print("Predictions grouped, assay types left as is")
    groupby = (
        subset.groupby([ASSAY, pred_col])
        .size()
        .reset_index(name="Count")
        .sort_values(by=[ASSAY, "Count"], ascending=[True, False])
    )
    print(groupby, "\n")

    print("Predictions grouped, all rna types = rna")
    tmp_df = subset.copy()
    tmp_df[ASSAY] = "rna_seq"
    # Assign the replaced column back: calling replace(inplace=True) on a Series
    # selected from the frame is chained assignment and is not guaranteed to
    # update the frame (it never does under pandas Copy-on-Write).
    tmp_df[pred_col] = tmp_df[pred_col].replace("mrna_seq", "rna_seq")
    groupby = (
        tmp_df.groupby([ASSAY, pred_col])
        .size()
        .reset_index(name="Count")
        .sort_values(by=[ASSAY, "Count"], ascending=[True, False])
    )
    print(groupby, "\n")

    print("Breakdown by assay type")
    assay_breakdown = subset[ASSAY].value_counts(dropna=False)
    print(assay_breakdown / assay_breakdown.sum(), "\n")
    for assay_type in assay_breakdown.index:
        # Read-only view of one expected assay type; no copy is needed.
        assay_type_subset = subset[subset[ASSAY] == assay_type]

        counts = assay_type_subset[pred_col].value_counts()
        N_subset = counts.sum()
        counts_perc = counts / N_subset
        correct_perc = counts_perc.reindex(rna_classes, fill_value=0).sum()
        print(f"{assay_type} acc: {correct_perc:.2%}\n")
        print(f"{assay_type} preds:\n{counts_perc}\n")
    print()

### Accuracy and F1-score summary.

In [None]:
# Work on a copy so that `full_df` itself stays untouched.
df = full_df.copy()
print(df.shape)  # (number of rows, number of columns)

In [None]:
# Handler whose compute_multiple_metric_formats method is called in the cells
# below to compute the metrics and save them (see `folders_to_save`).
metrics_handler = MetricsPerAssay()

In [None]:
# Folder passed as `folders_to_save` to every metrics computation below.
output_dir = table_dir.joinpath("dfreeze_v2", "predictions", "metrics")

Metrics computed on all files (no filtering on assay predictions).

In [None]:
# Categories for which metrics are computed.
categories = [CANCER, SEX, BIOMATERIAL_TYPE]

# Column-name templates for the expected label, predicted label and prediction
# score of a category. This rebinds `column_templates` from the label coherence
# check, adding the "Max pred" template.
column_templates = {
    "True": "{}",
    "Predicted": "Predicted class ({})",
    "Max pred": "Max pred ({})",
}

# Keyword arguments shared by all metrics computations below.
compute_fct_kwargs = {
    "no_epiatlas": False,
    "merge_assays": False,
    "categories": categories,
    "column_templates": column_templates,
    # Read the assay labels from `full_df` rather than from the `df` global:
    # later cells rebind `df` to filtered frames, so re-running this cell after
    # them would otherwise silently change the assay list.
    "core_assays": full_df[ASSAY].unique().tolist(),
    "non_core_assays": [],  # no "non-core" assays
}

In [None]:
# Metrics over the whole predictions table.
base_filename = "recount3_metrics_per_assay"

metrics_handler.compute_multiple_metric_formats(
    preds=full_df.copy(),  # a copy is passed, so `full_df` cannot be altered
    general_filename=base_filename,
    folders_to_save=[output_dir],
    compute_fct_kwargs=compute_fct_kwargs,
    return_df=False,
    verbose=False,
)

Only files where Assay predictions are (m)rna-seq and predScore >= 0.6

In [None]:
base_filename = "recount3_metrics_per_assay_assay11c-filtered"

print(full_df.shape)

# Keep the samples whose assay prediction is an RNA class with a score >= 0.6.
is_confident = full_df[f"Max pred ({ASSAY})"] >= 0.6
is_rna_pred = full_df[f"Predicted class ({ASSAY})"].isin(["rna_seq", "mrna_seq"])
filtered_df = full_df.loc[is_confident & is_rna_pred].copy()

print(filtered_df.shape)

metrics_handler.compute_multiple_metric_formats(
    preds=filtered_df.copy(),  # type: ignore
    general_filename=base_filename,
    folders_to_save=[output_dir],
    compute_fct_kwargs=compute_fct_kwargs,
    return_df=False,
    verbose=False,
)

Merging the messenger RNA (`mrna_seq`) and total RNA (`rna_seq`) labels into a single `messenger_or_total_rna` value for the assay_epiclass label.

In [None]:
# Same metrics as above, on the filtered and on the full table, after collapsing
# both expected RNA assay labels into a single "messenger_or_total_rna" label.
for df, filename in (
    (
        filtered_df.copy(),
        "recount3_metrics_per_assay_merge_total_mrna_assay11c-filtered",
    ),
    (
        full_df.copy(),
        "recount3_metrics_per_assay_merge_total_mrna",
    ),
):
    print(filename)
    df[ASSAY] = df[ASSAY].replace(
        dict.fromkeys(["mrna_seq", "rna_seq"], "messenger_or_total_rna")
    )

    # Shallow copy of the shared kwargs with the assay list recomputed from the
    # relabelled frame.
    new_compute_fct_kwargs = {
        **compute_fct_kwargs,
        "core_assays": df[ASSAY].unique().tolist(),
    }

    metrics_handler.compute_multiple_metric_formats(
        preds=df,  # type: ignore
        general_filename=filename,
        folders_to_save=[output_dir],
        compute_fct_kwargs=new_compute_fct_kwargs,
        return_df=False,
        verbose=False,
    )

Life stage metrics, excluding samples whose biomaterial type is cell line or unknown.

In [None]:
# Only the merged life-stage category is evaluated here; the other shared
# keyword arguments are reused as is (shallow copy, one key replaced).
new_compute_fct_kwargs = compute_fct_kwargs.copy()
new_compute_fct_kwargs["categories"] = [f"{LIFE_STAGE}_merged"]

for df, filename in zip(
    [filtered_df, full_df],
    [
        "recount3_metrics_per_assay_assay11c-filtered_no_cell_line",
        "recount3_metrics_per_assay_no_cell_line",
    ],
):
    print(filename)
    # Filter first, then copy: the source frames are left untouched, only the
    # kept rows are duplicated, and the result is an independent frame (as for
    # the other filtered frames in this notebook), so adding columns to it does
    # not trigger SettingWithCopyWarning.
    df = df[~df[BIOMATERIAL_TYPE].isin(cell_line_vals)].copy()

    print(df.shape)
    # NOTE(review): the `<LIFE_STAGE>_merged` column is read here before
    # merge_life_stages is called; presumably it already exists in the loaded
    # table - confirm.
    for cat in [ASSAY, f"{LIFE_STAGE}_merged", BIOMATERIAL_TYPE]:
        print(df[cat].value_counts(dropna=False), "\n")

    # Making a version of predicted class + max pred that have correct names + merged
    df = merge_life_stages(
        df=df,
        lifestage_column_name=LIFE_STAGE,
        column_name_templates=["Max pred ({})", "Predicted class ({})"],
    )

    metrics_handler.compute_multiple_metric_formats(
        preds=df,  # type: ignore
        folders_to_save=[output_dir],
        general_filename=filename,
        verbose=False,
        return_df=False,
        compute_fct_kwargs=new_compute_fct_kwargs,
    )