In [1]:
# this notebook calculates performance metrics on external MSK imaging test data
# (MSK ran the DFCI-student-imaging model on its own data, then shared deidentified outputs for evaluation)

In [2]:
import numpy as np
import pandas as pd
import torch
from torch.utils import data
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
output_dataset = pd.read_csv('/data/clin_notes_outcomes/impression_bert/imaging/data/msk_predictions_radiology_noPHI.csv')

In [4]:
output_dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24472 entries, 0 to 24471
Data columns (total 25 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   DMP_ID               24472 non-null  object 
 1   cancer_type          24472 non-null  object 
 2   any_cancer_label     24472 non-null  bool   
 3   response_label       24472 non-null  bool   
 4   progression_label    24472 non-null  bool   
 5   Bone                 24472 non-null  int64  
 6   CNS_Brain            24472 non-null  int64  
 7   Intra-Abdominal      24472 non-null  int64  
 8   Liver                24472 non-null  int64  
 9   Lung                 24472 non-null  int64  
 10  Other                24472 non-null  int64  
 11  Pleura               24472 non-null  int64  
 12  Reproductive_Organs  24472 non-null  int64  
 13  adrenal_glands       24472 non-null  int64  
 14  lymph_nodes          24472 non-null  int64  
 15  prob_any_cancer      24472 non-null 

In [5]:
output_dataset.DMP_ID.nunique()

2672

In [6]:
output_dataset = output_dataset.rename(columns={'any_cancer_label':'any_cancer', 'response_label':'response', 'progression_label':'progression',
                                  'Bone':'bone_met', 'CNS_Brain':'brain_met', 'Intra-Abdominal':'peritoneal_met', 'Liver':'liver_met',
                                 'Lung':'lung_met', 'adrenal_glands':'adrenal_met', 'lymph_nodes':'node_met', 'prob_lung':'prob_lung_met',
                                               'prob_brain':'prob_brain_met','prob_bone':'prob_bone_met','prob_adrenal':'prob_adrenal_met',
                                               'prob_liver':'prob_liver_met','prob_node':'prob_node_met','prob_peritoneum':'prob_peritoneal_met'})

In [7]:
#output_dataset

In [8]:
from utils_102023 import eval_model

In [9]:
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve
from confidenceinterval.bootstrap import bootstrap_ci

def best_f1(y_true, y_score):
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_score)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
    f1_scores = f1_scores[~np.isnan(f1_scores)]
    thresholds = thresholds[:-1]
    best_threshold = thresholds[np.argmax(f1_scores)]
    best_f1 = np.max(f1_scores)
    return(best_f1)

def eval_with_cis(y_true, y_score):
    random_generator = np.random.default_rng()
    print('auroc')
    print(bootstrap_ci(y_true=y_true,
                 y_pred=y_score,
                 metric=roc_auc_score,
                 confidence_level=0.95,
                 n_resamples=1000,
                 method='bootstrap_bca',
                 random_state=random_generator))
    
    print('auprc')
    print(bootstrap_ci(y_true=y_true,
                 y_pred=y_score,
                 metric=average_precision_score,
                 confidence_level=0.95,
                 n_resamples=1000,
                 method='bootstrap_bca',
                 random_state=random_generator))
    
    print('best f1')
    print(bootstrap_ci(y_true=y_true,
                 y_pred=y_score,
                 metric=best_f1,
                 confidence_level=0.95,
                 n_resamples=1000,
                 method='bootstrap_bca',
                 random_state=random_generator))


In [10]:
for outcome in ['any_cancer','progression','response','brain_met','bone_met','adrenal_met','liver_met','lung_met','node_met','peritoneal_met']:
    print('all cancers')
    print(outcome)
    print(eval_model(output_dataset['prob_' + outcome], output_dataset[outcome], graph=False))
    print(eval_with_cis(output_dataset[outcome], output_dataset['prob_' + outcome]))
    print("\n")

all cancers
any_cancer
AUC 0.9872538656321208
Outcome probability: 0.7824043805165086
Average precision score: 1.00
Best F1: 0.9721997441854395
Best F1 threshold: 0.2352055013179779
0.2352055013179779
auroc
(0.9872538656321208, (0.9860769855164279, 0.9885076110328755))
auprc
(0.9964621242717719, (0.9960512883668053, 0.9967995882677827))
best f1
(0.9721997441854395, (0.9706601767881301, 0.9736425916178655))
None


all cancers
progression
AUC 0.9704799735551629
Outcome probability: 0.3877492644655116
Average precision score: 0.96
Best F1: 0.8912105618795156
Best F1 threshold: 0.3986702859401703
0.3986702859401703
auroc
(0.9704799735551629, (0.9684558035741562, 0.9724283869726554))
auprc
(0.9559506653705692, (0.9523271991161238, 0.9587974935993682))
best f1
(0.8912105618795156, (0.8864717873343079, 0.8956241947909784))
None


all cancers
response
AUC 0.9738039935985858
Outcome probability: 0.12732919254658384
Average precision score: 0.88
Best F1: 0.8054941585096305
Best F1 threshold: 0.4

In [11]:
for cancer in output_dataset.cancer_type.unique():
    subset = output_dataset[output_dataset.cancer_type == cancer]
    for outcome in ['any_cancer','progression','response','brain_met','bone_met','adrenal_met','liver_met','lung_met','node_met','peritoneal_met']:
        print(cancer)
        print(outcome)
        print(eval_model(subset['prob_' + outcome], subset[outcome], graph=False))
        print(eval_with_cis(output_dataset[outcome], output_dataset['prob_' + outcome]))

        print("\n")
    

brca
any_cancer
AUC 0.9865254799540846
Outcome probability: 0.8551816958277254
Average precision score: 1.00
Best F1: 0.9768025078369905
Best F1 threshold: 0.1746265441179275
0.1746265441179275
auroc
(0.9872538656321208, (0.9860016427540038, 0.9883523853388244))
auprc
(0.9964621242717719, (0.9960611902389844, 0.9968036659474394))
best f1
(0.9721997441854395, (0.9704962895565394, 0.9735605198892044))
None


brca
progression
AUC 0.9602610362535804
Outcome probability: 0.41938088829071335
Average precision score: 0.95
Best F1: 0.8807631160572338
Best F1 threshold: 0.4114409685134887
0.4114409685134887
auroc
(0.9704799735551629, (0.9686447330377765, 0.9725293124823426))
auprc
(0.9559506653705692, (0.952522845663171, 0.9590803793684293))
best f1
(0.8912105618795156, (0.8864258902583018, 0.8954890246971363))
None


brca
response
AUC 0.960192271178574
Outcome probability: 0.12570659488559893
Average precision score: 0.77
Best F1: 0.7344064386317909
Best F1 threshold: 0.4896712899208069
0.4896