In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils import data
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

In [None]:
# This notebook evaluates the performance of the DFCI-medonc-student model on MSK notes.
# MSK ran the models on their own data and then exported the de-identified dataset used here.

In [2]:
# Location of the de-identified MSK predictions export (ground-truth labels plus model logits).
msk_predictions_csv = '/mnt/d/Dropbox (Partners HealthCare)/dfci_mimic_note_bert/msk_note_data/msk_predictions_medonc_noPHI.csv'
output_dataset = pd.read_csv(msk_predictions_csv)

In [3]:
# Print a summary of the loaded frame: column names, non-null counts, dtypes, memory usage.
output_dataset.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40701 entries, 0 to 40700
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   DMP_ID             40701 non-null  object 
 1   any_cancer_label   40701 non-null  bool   
 2   response_label     40701 non-null  bool   
 3   progression_label  40701 non-null  bool   
 4   cancer_type        40701 non-null  object 
 5   any_cancer_logit   40701 non-null  float64
 6   response_logit     40701 non-null  float64
 7   progression_logit  40701 non-null  float64
dtypes: bool(3), float64(3), object(2)
memory usage: 1.7+ MB


In [9]:
# Number of rows per cancer type, most frequent first.
cancer_type_column = output_dataset['cancer_type']
cancer_type_column.value_counts()

nsclc       14026
crc         11900
brca         5370
prostate     4998
pancreas     4407
Name: cancer_type, dtype: int64

In [4]:
# Drop the "_label" suffix from the ground-truth columns so that each outcome
# name can be paired with its "<outcome>_logit" column by simple concatenation.
label_column_renames = {
    'any_cancer_label': 'any_cancer',
    'response_label': 'response',
    'progression_label': 'progression',
}
output_dataset = output_dataset.rename(columns=label_column_renames)

In [6]:
from utils_102023 import eval_model

In [7]:
# Evaluate each outcome on the full dataset (all cancer types pooled).
# eval_model is called with graph=False and whatever it returns is printed.
pooled_outcomes = ('any_cancer', 'progression', 'response')
for outcome in pooled_outcomes:
    print('all cancers')
    print(outcome)
    outcome_logits = output_dataset[outcome + '_logit']
    outcome_labels = output_dataset[outcome]
    print(eval_model(outcome_logits, outcome_labels, graph=False))
    print("\n")

all cancers
any_cancer
AUC 0.9556629006880287
Outcome probability: 0.8200535613375592
Average precision score: 0.99
Best F1: 0.9573217783581647
Best F1 threshold: -1.0427246
-1.0427246


all cancers
progression
AUC 0.9102195303905977
Outcome probability: 0.158448195376035
Average precision score: 0.71
Best F1: 0.672753834915997
Best F1 threshold: -1.243615
-1.243615


all cancers
response
AUC 0.9400174814769453
Outcome probability: 0.14233065526645536
Average precision score: 0.76
Best F1: 0.7171417101565637
Best F1 threshold: -0.21693532
-0.21693532




In [8]:
# Repeat the evaluation separately within each cancer type, visiting the
# types in the order they first appear in the data.
per_cancer_outcomes = ('any_cancer', 'progression', 'response')
for cancer in output_dataset['cancer_type'].unique():
    is_this_cancer = output_dataset['cancer_type'] == cancer
    subset = output_dataset[is_this_cancer]
    for outcome in per_cancer_outcomes:
        print(cancer)
        print(outcome)
        subset_logits = subset[outcome + '_logit']
        subset_labels = subset[outcome]
        print(eval_model(subset_logits, subset_labels, graph=False))
        print("\n")
    

pancreas
any_cancer
AUC 0.9842499158818416
Outcome probability: 0.8089403222146585
Average precision score: 1.00
Best F1: 0.9763537148453897
Best F1 threshold: -1.0610669
-1.0610669


pancreas
progression
AUC 0.9401933363687329
Outcome probability: 0.15793056501021102
Average precision score: 0.79
Best F1: 0.7344398340248962
Best F1 threshold: -1.1829004
-1.1829004


pancreas
response
AUC 0.9605520401712682
Outcome probability: 0.13637395053324256
Average precision score: 0.82
Best F1: 0.7589833479404031
Best F1 threshold: 0.8911807
0.8911807


brca
any_cancer
AUC 0.9570214499163029
Outcome probability: 0.7748603351955308
Average precision score: 0.98
Best F1: 0.9496764917325664
Best F1 threshold: -0.47979498
-0.47979498


brca
progression
AUC 0.8847066020551947
Outcome probability: 0.14804469273743018
Average precision score: 0.64
Best F1: 0.5983658076681333
Best F1 threshold: -1.7155408
-1.7155408


brca
response
AUC 0.9142421975065904
Outcome probability: 0.12327746741154562
Average

  F1 = 2*((precision*recall)/(precision+recall))
