In [17]:
"""Workbook to analyse classifier predictions on recount3 data.
"""
# pylint: disable=duplicate-code

'Workbook to analyse classifier predictions on recount3 data.\n'

In [18]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### SETUP

In [19]:
from __future__ import annotations

from pathlib import Path

import numpy as np
import pandas as pd
from IPython.display import display
from sklearn.metrics import classification_report, confusion_matrix as sk_cm

from epi_ml.utils.notebooks.paper.metrics_per_assay import MetricsPerAssay
from epi_ml.utils.notebooks.paper.paper_utilities import (
    ASSAY,
    BIOMATERIAL_TYPE,
    CANCER,
    LIFE_STAGE,
    SEX,
)

In [20]:
# Root of the paper outputs; every other location is derived from it.
base_dir = Path.home().joinpath("Projects/epiclass/output/paper")
paper_dir = base_dir

# Figure and table output locations.
base_fig_dir = base_dir.joinpath("figures")
table_dir = base_dir.joinpath("tables")

# Input data locations.
base_data_dir = base_dir.joinpath("data")
metadata_dir = base_data_dir.joinpath("metadata")
predictions_dir = base_data_dir.joinpath("training_results", "predictions")

# Folder holding the recount3 prediction files.
recount3_folder = predictions_dir.joinpath("recount3", "hg38_100kb_all_none")

In [21]:
# Metadata version tag embedded in the predictions file name.
meta_name = "harmonized_metadata_20250122_leuk2"
preds_path = recount3_folder.joinpath(f"recount3_merged_preds_{meta_name}.tsv.gz")

# Gzipped, tab-separated predictions table.
full_df = pd.read_csv(
    preds_path,
    compression="gzip",
    sep="\t",
)

### Assay prediction details

In [22]:
# Keep only samples whose expected assay is known; N is the denominator of the
# "left" percentage printed for each score threshold.
assay_df = full_df[full_df[ASSAY] != "unknown"]
N = assay_df.shape[0]

assay_pred_col = f"Predicted class ({ASSAY})"
assay_score_col = f"Max pred ({ASSAY})"


def _rna_fraction(pred_counts: pd.Series) -> float:
    """Return the share of predictions that are `rna_seq` or `mrna_seq`.

    Args:
        pred_counts: Prediction counts indexed by predicted class
            (the result of `value_counts()`).

    Returns:
        The fraction of counts in the two RNA classes, or NaN when
        `pred_counts` sums to zero.
    """
    total = pred_counts.sum()
    if total == 0:
        return float("nan")
    # reindex fills absent classes with 0, so a subset in which one of the RNA
    # classes was never predicted no longer raises KeyError.
    return pred_counts.reindex(["rna_seq", "mrna_seq"], fill_value=0).sum() / total


def _grouped_counts(frame: pd.DataFrame) -> pd.DataFrame:
    """Count rows per (expected assay, predicted assay), largest first within each assay."""
    return (
        frame.groupby([ASSAY, assay_pred_col])
        .size()
        .reset_index()
        .rename(columns={0: "Count"})
        .sort_values(by=[ASSAY, "Count"], ascending=[True, False])
    )


for max_pred in [0, 0.6, 0.8]:
    subset = assay_df[assay_df[assay_score_col] >= max_pred]
    counts = subset[assay_pred_col].value_counts()

    N_subset = counts.sum()
    correct_perc = _rna_fraction(counts)
    print(f"min_PredScore >= {max_pred} ({N_subset/N:.2%} left): {correct_perc:.2%}\n")

    print("Predictions grouped, assay types left as is")
    groupby = _grouped_counts(subset)
    print(groupby, "\n")

    print("Predictions grouped, all rna types = rna")
    # assign() returns a new frame. The previous
    # `.loc[:, col].replace(..., inplace=True)` modified a temporary Series and
    # is a no-op under pandas Copy-on-Write.
    tmp_df = subset.assign(
        **{
            ASSAY: "rna_seq",
            assay_pred_col: subset[assay_pred_col].replace("mrna_seq", "rna_seq"),
        }
    )
    groupby = _grouped_counts(tmp_df)
    print(groupby, "\n")

    print("Breakdown by assay type")
    assay_breakdown = subset[ASSAY].value_counts(dropna=False)
    print(assay_breakdown / assay_breakdown.sum(), "\n")
    for assay_type in assay_breakdown.index:
        # Read-only view is enough here; nothing below mutates it.
        assay_type_subset = subset[subset[ASSAY] == assay_type]

        counts = assay_type_subset[assay_pred_col].value_counts()
        N_subset = counts.sum()
        counts_perc = counts / N_subset
        correct_perc = _rna_fraction(counts)
        print(f"{assay_type} acc: {correct_perc:.2%}\n")
        print(f"{assay_type} preds:\n{counts_perc}\n")
    print()

min_PredScore >= 0 (100.00% left): 95.95%

Predictions grouped, assay types left as is
   assay_epiclass Predicted class (assay_epiclass)  Count
7        mrna_seq                         mrna_seq   9441
8        mrna_seq                          rna_seq   1981
9        mrna_seq                        wgbs-pbat    104
6        mrna_seq                            input     75
10       mrna_seq                    wgbs-standard     61
0        mrna_seq                          h3k27ac     42
4        mrna_seq                          h3k4me3     28
3        mrna_seq                          h3k4me1     27
1        mrna_seq                         h3k27me3     13
2        mrna_seq                         h3k36me3     12
5        mrna_seq                          h3k9me3      4
17          other                         mrna_seq  10401
18          other                          rna_seq   1078
13          other                         h3k36me3     97
14          other                          

### Other metadata categories

In [23]:
# Show the label distribution (missing values included) of each metadata category.
for cat in (SEX, CANCER, LIFE_STAGE, BIOMATERIAL_TYPE):
    cat_counts = full_df[cat].value_counts(dropna=False)
    display(cat_counts)

harmonized_donor_sex
unknown    228747
female      48115
male        39366
Name: count, dtype: int64

harmonized_sample_cancer_high
unknown       290082
cancer         21934
non-cancer      4212
Name: count, dtype: int64

harmonized_donor_life_stage
unknown      254852
adult         52461
child          4900
newborn        1925
fetal          1514
embryonic       576
Name: count, dtype: int64

harmonized_biomaterial_type
unknown           261308
cell_line          53989
primary_cell         885
primary_tissue        46
Name: count, dtype: int64

In [24]:
# Working copy; later cells read `df`, so `full_df` itself stays untouched.
df = full_df.copy(deep=True)


def _normalize_labels(labels: pd.Series) -> pd.Series:
    """Lower-case labels and replace spaces with underscores.

    Applied to both expected and predicted labels so that spelling variants
    such as "cell line" and "cell_line" compare equal.
    """
    return labels.str.lower().str.replace(" ", "_", regex=False)


for max_pred in [0, 0.6, 0.8]:
    # Note: the threshold is applied on the assay prediction score.
    subset = df[df[f"Max pred ({ASSAY})"] >= max_pred]
    print(f"min_PredScore >= {max_pred}\n")

    for cat in [SEX, CANCER, LIFE_STAGE, BIOMATERIAL_TYPE]:
        pred_label = f"Predicted class ({cat})"
        true_label = f"Expected class ({cat})"

        # Only samples with a usable expected label can be scored.
        known_pred = subset[~subset[true_label].isin(["unknown", "other"])]
        if cat == LIFE_STAGE:
            diff = len(known_pred)
            # The biomaterial column stores "cell_line" (underscore), so the
            # former comparison against "cell line" never excluded anything.
            # Normalising first accepts either spelling.
            is_cell_line = (
                _normalize_labels(known_pred[BIOMATERIAL_TYPE].astype(str)) == "cell_line"
            )
            known_pred = known_pred[~is_cell_line]
            diff -= len(known_pred)
            print(f"Excluded cell lines for {cat} predictions: {diff}")

        N_known = known_pred.shape[0]
        if N_known == 0:
            # Nothing to score: avoid a division by zero and an sklearn error.
            print(f"{cat}: no samples with a known expected class, skipping.\n")
            continue

        y_pred = known_pred[pred_label]
        y_true = known_pred[true_label]
        if cat == CANCER:
            # Harmonise the label on the two compared columns only, instead of
            # rewriting every column of `subset` for all following categories.
            y_pred = y_pred.replace("healthy", "non-cancer")
            y_true = y_true.replace("healthy", "non-cancer")
        y_pred = _normalize_labels(y_pred)
        y_true = _normalize_labels(y_true)

        classes = sorted(set(y_pred.unique()) | set(y_true.unique()))

        N_correct = (y_pred == y_true).sum()
        print(f"{cat} prediction match (%): {N_correct/N_known*100:.2f}\n")
        print(classes)
        print(y_pred.value_counts(dropna=False), "\n")
        print(y_true.value_counts(dropna=False), "\n")

        # `labels` is passed explicitly so that each entry of `target_names`
        # is guaranteed to describe the matching class.
        report = classification_report(
            y_true, y_pred, labels=classes, target_names=classes, zero_division=0
        )
        print(report + "\n")  # type: ignore

        print(f"confusion matrix classes row order: {classes}")
        # Rows are normalised: each row gives the prediction split of one true class.
        cm = sk_cm(y_true, y_pred, normalize="true", labels=classes)
        with np.printoptions(precision=3):
            print(str(cm) + "\n\n")

    print("-----")

min_PredScore >= 0

harmonized_donor_sex prediction match (%): 66.12

['female', 'male', 'mixed']
Predicted class (harmonized_donor_sex)
female    52566
male      33266
mixed      1649
Name: count, dtype: int64 

Expected class (harmonized_donor_sex)
female    48115
male      39366
Name: count, dtype: int64 

              precision    recall  f1-score   support

      female       0.68      0.75      0.71     48115
        male       0.66      0.56      0.60     39366
       mixed       0.00      0.00      0.00         0

    accuracy                           0.66     87481
   macro avg       0.45      0.43      0.44     87481
weighted avg       0.67      0.66      0.66     87481


confusion matrix classes row order: ['female', 'male', 'mixed']
[[0.746 0.235 0.019]
 [0.424 0.558 0.018]
 [0.    0.    0.   ]]


harmonized_sample_cancer_high prediction match (%): 63.54

['cancer', 'non-cancer']
Predicted class (harmonized_sample_cancer_high)
cancer        13142
non-cancer    13004
Name:

### Accuracy and F1-score summary.

In [25]:
# Handler whose `compute_multiple_metric_formats` method is called in the cells below.
metrics_handler = MetricsPerAssay()

In [26]:
# Folder passed as `folders_to_save` to the metrics computations below.
output_dir = table_dir.joinpath("dfreeze_v2", "predictions", "metrics")

All files

In [27]:
# Metadata categories for which metrics are computed.
categories = [CANCER, LIFE_STAGE, SEX, BIOMATERIAL_TYPE]

# Column-name templates containing a "{}" placeholder
# (presumably filled with a category name by the metrics handler; confirm there).
column_templates = dict(
    zip(
        ("True", "Predicted", "Max pred"),
        ("Expected class ({})", "Predicted class ({})", "Max pred ({})"),
    )
)

# Keyword arguments shared by both metrics computations below.
# Every assay value present in `df` is passed as a core assay.
compute_fct_kwargs = dict(
    no_epiatlas=False,
    merge_assays=False,
    categories=categories,
    column_templates=column_templates,
    core_assays=df[ASSAY].unique().tolist(),
    non_core_assays=[],
)

In [28]:
# Metrics over every prediction in `df` (no filtering).
base_filename = "recount3_metrics_per_assay"

metrics_handler.compute_multiple_metric_formats(
    preds=df,
    general_filename=base_filename,
    folders_to_save=[output_dir],
    compute_fct_kwargs=compute_fct_kwargs,
    return_df=False,
    verbose=False,
)

Only files where the assay prediction is (m)rna-seq with predScore >= 0.6

In [29]:
base_filename = "recount3_metrics_per_assay_assay11c-filtered"

# Keep rows whose assay prediction scores at least 0.6 ...
confident_assay = df[f"Max pred ({ASSAY})"] >= 0.6
# ... and whose predicted assay is one of the two RNA classes.
predicted_rna = df[f"Predicted class ({ASSAY})"].isin(["rna_seq", "mrna_seq"])
filtered_df = df[confident_assay & predicted_rna]

metrics_handler.compute_multiple_metric_formats(
    preds=filtered_df,  # type: ignore
    general_filename=base_filename,
    folders_to_save=[output_dir],
    compute_fct_kwargs=compute_fct_kwargs,
    return_df=False,
    verbose=False,
)

In [30]:
# Per category: number of assay-filtered rows whose category score is at least 0.6.
for cat in categories:
    confident_rows = filtered_df[f"Max pred ({cat})"] >= 0.6
    n_confident = filtered_df[confident_rows].shape[0]
    print(cat, n_confident)

harmonized_sample_cancer_high 221969
harmonized_donor_life_stage 185303
harmonized_donor_sex 209964
harmonized_biomaterial_type 188927
