In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils import data
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# pull in your dataset here. It should have a column labeled 'text' containing the full radiology report text (not just impression).
# if you have narrative reports separate from the impressions, would concatenate the impressions at the end of the narratives.
reports = pd.read_csv('/data/clin_notes_outcomes/profile_3-2023/derived_data/labeled_imaging_prissmm.csv')
reports['progression'] = np.where(reports.class_status==3,1,reports.progression)
reports = reports[reports.split=='test']
inference_input = reports
inference_input['text'] = inference_input['text'].str.lower().str.replace("\n", " ")
inference_input.drop(inference_input.filter(regex='Unnamed|outcome').columns, axis=1, inplace=True)


In [3]:
class UnLabeledDataset(data.Dataset):
    def __init__(self, pandas_dataset):
        self.data = pandas_dataset.copy()
        self.indices = self.data.index.unique()
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', truncation_side='left')        
        
        
    def __len__(self):
        # how many notes in the dataset
        return len(self.indices)
    
    def __getitem__(self, index):
        # get data for notes corresponding to indices passed
        this_index = self.indices[index]
        pand = self.data.loc[this_index, :]
    
        encoded = self.tokenizer(pand['text'], padding='max_length', truncation=True)

        x_text_tensor = torch.tensor(encoded.input_ids, dtype=torch.long)
        x_attention_mask = torch.tensor(encoded.attention_mask, dtype=torch.long)
        return x_text_tensor, x_attention_mask
        

In [4]:
from transformers import AutoModel
from torch.nn import functional as F
import torch.nn as nn
from torch.nn import Linear, Sequential, ReLU

   
class LabeledModel(nn.Module):

    def __init__(self):
        super(LabeledModel, self).__init__()
        
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        
        self.any_cancer_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.response_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.progression_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.brain_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.bone_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.adrenal_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.liver_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.lung_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.node_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.peritoneal_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))

        
    def forward(self, x_text_tensor, x_attention_mask):
        main = self.bert(x_text_tensor, x_attention_mask)
        main = main.last_hidden_state[:,0,:].squeeze(1)

                                          
        any_cancer_out = self.any_cancer_head(main)
        response_out = self.response_head(main)
        progression_out = self.progression_head(main)
        brain_out = self.brain_head(main)
        bone_out = self.bone_head(main)
        adrenal_out = self.adrenal_head(main)
        liver_out = self.liver_head(main)
        lung_out = self.lung_head(main)
        node_out = self.node_head(main)
        peritoneum_out = self.peritoneal_head(main)

        return any_cancer_out, response_out, progression_out, brain_out, bone_out, adrenal_out, liver_out, lung_out, node_out, peritoneum_out
        




In [5]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [8]:
# write out the inference dataset
themodel = LabeledModel()
themodel.load_state_dict(torch.load('./dfci_mimic_imaging_bert.pt'))
themodel.to(device)

themodel.eval()

dataset = data.DataLoader(UnLabeledDataset(inference_input), batch_size=16, shuffle=False, num_workers=0)

output_true_lists = [[] for x in range(10)]
output_prediction_lists = [[] for x in range(10)]
for batch in dataset:

    x_text_ids = batch[0].to(device)
    x_attention_mask = batch[1].to(device)
    with torch.no_grad():
        predictions = themodel(x_text_ids, x_attention_mask)
    for j in range(10):
        output_prediction_lists[j].append(predictions[j].detach().cpu().numpy())

output_prediction_lists = [np.concatenate(x) for x in output_prediction_lists]


output_dataset = inference_input.copy()
for x in range(10):
    output_dataset['outcome_' + str(x) + '_logit'] = output_prediction_lists[x]


In [9]:
output_dataset=output_dataset.rename(columns={'outcome_0_logit' : 'any_cancer_logit',
                                              'outcome_1_logit' : 'response_logit',
                                              'outcome_2_logit' : 'progression_logit',
                                              'outcome_3_logit' : 'brain_met_logit',
                                              'outcome_4_logit' : 'bone_met_logit',
                                              'outcome_5_logit' : 'adrenal_met_logit',
                                              'outcome_6_logit' : 'liver_met_logit',
                                              'outcome_7_logit' : 'lung_met_logit',
                                              'outcome_8_logit' : 'node_met_logit',
                                              'outcome_9_logit' : 'peritoneal_met_logit'})

In [10]:

output_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3460 entries, 103 to 37183
Data columns (total 43 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   dfci_mrn                  3460 non-null   float64
 1   cancer_type               3460 non-null   object 
 2   image_scan_type           3460 non-null   float64
 3   date                      3460 non-null   object 
 4   head_imaged               3460 non-null   float64
 5   neck_imaged               3460 non-null   float64
 6   spine_imaged              3460 non-null   float64
 7   chest_imaged              3460 non-null   float64
 8   abdomen_imaged            3460 non-null   float64
 9   pelvis_imaged             3460 non-null   float64
 10  any_cancer                3460 non-null   int64  
 11  progression               3460 non-null   int64  
 12  response                  3460 non-null   int64  
 13  class_status              3459 non-null   float64
 14  brain_met 

In [11]:
from utils_102023 import eval_model

In [12]:
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve
from confidenceinterval.bootstrap import bootstrap_ci

def best_f1(y_true, y_score):
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_score)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
    f1_scores = f1_scores[~np.isnan(f1_scores)]
    thresholds = thresholds[:-1]
    best_threshold = thresholds[np.argmax(f1_scores)]
    best_f1 = np.max(f1_scores)
    return(best_f1)

def eval_with_cis(y_true, y_score):
    random_generator = np.random.default_rng()
    print('auroc')
    print(bootstrap_ci(y_true=y_true,
                 y_pred=y_score,
                 metric=roc_auc_score,
                 confidence_level=0.95,
                 n_resamples=1000,
                 method='bootstrap_bca',
                 random_state=random_generator))
    
    print('auprc')
    print(bootstrap_ci(y_true=y_true,
                 y_pred=y_score,
                 metric=average_precision_score,
                 confidence_level=0.95,
                 n_resamples=1000,
                 method='bootstrap_bca',
                 random_state=random_generator))
    
    print('best f1')
    print(bootstrap_ci(y_true=y_true,
                 y_pred=y_score,
                 metric=best_f1,
                 confidence_level=0.95,
                 n_resamples=1000,
                 method='bootstrap_bca',
                 random_state=random_generator))


In [13]:
# run metrics
for outcome in ['any_cancer','progression','response','brain_met','bone_met','adrenal_met','liver_met','lung_met','node_met','peritoneal_met']:
    print('all cancers')
    print(outcome)
    print(eval_model(output_dataset[outcome + '_logit'], output_dataset[outcome], graph=False))
    print(eval_with_cis(output_dataset[outcome], output_dataset[outcome + '_logit']))
    print("\n")

all cancers
any_cancer
AUC 0.9715749410958404
Outcome probability: 0.5531791907514451
Average precision score: 0.98
Best F1: 0.9213541666666667
Best F1 threshold: 0.2041204
0.2041204
auroc
(0.9715749410958404, (0.966049640166352, 0.9759640959034103))
auprc
(0.9760852014365636, (0.970818444361765, 0.9802685201680549))
best f1
(0.9213541666666667, (0.9116106328909672, 0.9288789180484102))
None


all cancers
progression
AUC 0.9541434770911559
Outcome probability: 0.23265895953757226
Average precision score: 0.88
Best F1: 0.8026960784313726
Best F1 threshold: -0.23884423
-0.23884423
auroc
(0.9541434770911559, (0.9466796847625337, 0.9607384344158143))
auprc
(0.8793761019747219, (0.8560946162117015, 0.8969086616645315))
best f1
(0.8026960784313726, (0.781394835320766, 0.8193296681951774))
None


all cancers
response
AUC 0.9767829141183383
Outcome probability: 0.06329479768786127
Average precision score: 0.79
Best F1: 0.7249466950959489
Best F1 threshold: -0.6282928
-0.6282928
auroc
(0.976782

In [14]:
output_dataset['cancer_type'] = np.where(output_dataset.cancer_type.str.contains('nsclc'), 'nsclc', output_dataset.cancer_type)

In [15]:
for cancer in output_dataset.cancer_type.unique():
    subset = output_dataset[output_dataset.cancer_type == cancer]
    for outcome in ['any_cancer','progression','response','brain_met','bone_met','adrenal_met','liver_met','lung_met','node_met','peritoneal_met']:
        print(cancer)
        print(outcome)
        print(eval_model(subset[outcome + '_logit'], subset[outcome], graph=False))
        print(eval_with_cis(output_dataset[outcome], output_dataset[outcome + '_logit']))

        print("\n")
    

prostate
any_cancer
AUC 0.9832997282370475
Outcome probability: 0.5661252900232019
Average precision score: 0.99
Best F1: 0.9520000000000001
Best F1 threshold: 0.40061882
0.40061882
auroc
(0.9715749410958404, (0.9664402822337345, 0.976141423198454))
auprc
(0.9760852014365636, (0.970764876063303, 0.9801311297466812))
best f1
(0.9213541666666667, (0.9110039536810254, 0.9287551633316574))
None


prostate
progression
AUC 0.9671017757752451
Outcome probability: 0.20417633410672853
Average precision score: 0.88
Best F1: 0.8372093023255814
Best F1 threshold: 4.463323
4.463323
auroc


  F1 = 2*((precision*recall)/(precision+recall))


(0.9541434770911559, (0.9463831085248481, 0.9605113112732933))
auprc
(0.8793761019747219, (0.8521771462836377, 0.8966049024453241))
best f1
(0.8026960784313726, (0.7810282743788413, 0.8216533102303476))
None


prostate
response
AUC 0.9663882259347653
Outcome probability: 0.027842227378190254
Average precision score: 0.57
Best F1: 0.7142857142857143
Best F1 threshold: -0.5827851
-0.5827851
auroc
(0.9767829141183383, (0.9678285415356462, 0.9825547297393491))
auprc
(0.7945995122277495, (0.7345442225591874, 0.8411014648196297))
best f1
(0.7249466950959489, (0.6706280157877891, 0.7611650485436894))
None


prostate
brain_met
AUC 0.9867601246105919
Outcome probability: 0.0069605568445475635
Average precision score: 0.39
Best F1: 0.5714285714285715
Best F1 threshold: 2.1962397
2.1962397
auroc


  F1 = 2*((precision*recall)/(precision+recall))


(0.9959463767513302, (0.9937209136317434, 0.9973255506488673))
auprc
(0.9526222800509534, (0.9256472873130978, 0.9677393461736611))
best f1
(0.890295358649789, (0.8594121487075511, 0.9112732175134532))
None


prostate
bone_met
AUC 0.9833613517824044
Outcome probability: 0.48491879350348027
Average precision score: 0.97
Best F1: 0.9534883720930233
Best F1 threshold: 0.30309966
0.30309966
auroc
(0.9890043290043289, (0.9858409268534458, 0.9914778730653597))
auprc
(0.9505568693131786, (0.9326343743543115, 0.9625373049234156))
best f1
(0.9054652880354505, (0.8856740563554939, 0.9182360439338629))
None


prostate
adrenal_met
no outcome variation to calculate metrics
None
auroc
(0.9866940722353883, (0.9669611794309769, 0.9928013822752204))
auprc
(0.7222536344422418, (0.6202121964364976, 0.8047247134287501))
best f1
(0.6923076923076923, (0.6021709235792817, 0.7672955974842767))
None


prostate
liver_met
AUC 0.9849358974358975
Outcome probability: 0.03480278422273782
Average precision score: 0.

  F1 = 2*((precision*recall)/(precision+recall))


(0.9866940722353883, (0.9659696919273552, 0.9926056412361562))
auprc
(0.7222536344422418, (0.6107184113857754, 0.7989906675780238))
best f1
(0.6923076923076923, (0.5966583525552785, 0.7634470880208662))
None


breast
liver_met
AUC 0.9959808075772683
Outcome probability: 0.1259259259259259
Average precision score: 0.97
Best F1: 0.9275362318840579
Best F1 threshold: 0.9297301
0.9297301
auroc
(0.98626172863482, (0.9817974107784807, 0.9896185496176333))
auprc
(0.9029998128334735, (0.8691089988830186, 0.9259992086823768))
best f1
(0.854922279792746, (0.8213285436064407, 0.8769984332609068))
None


breast
lung_met
AUC 0.9926775405834721
Outcome probability: 0.09814814814814815
Average precision score: 0.96
Best F1: 0.8909090909090909
Best F1 threshold: -0.070430666
-0.070430666
auroc
(0.9746927513455769, (0.9691791157804711, 0.980054184305362))
auprc
(0.8932785357784605, (0.8704835834260702, 0.9129210673346418))
best f1
(0.8045112781954887, (0.7766827500545422, 0.8256225696294877))
None


br

  F1 = 2*((precision*recall)/(precision+recall))


(0.98626172863482, (0.9821078722547595, 0.989787252904441))
auprc
(0.9029998128334735, (0.8689637450776381, 0.9283848451926267))
best f1
(0.854922279792746, (0.8270056486335039, 0.8786943633070143))
None


rcc_barkouny
lung_met
AUC 0.967948717948718
Outcome probability: 0.10743801652892562
Average precision score: 0.78
Best F1: 0.7462686567164178
Best F1 threshold: -0.8671813
-0.8671813
auroc
(0.9746927513455769, (0.9685101802021663, 0.979732240424797))
auprc
(0.8932785357784605, (0.866267467804413, 0.9136866151967796))
best f1
(0.8045112781954887, (0.7754605773133455, 0.8269357316586432))
None


rcc_barkouny
node_met
AUC 0.9865841073271413
Outcome probability: 0.15702479338842976
Average precision score: 0.96
Best F1: 0.9014084507042254
Best F1 threshold: -0.49244568
-0.49244568
auroc
(0.9710726300995633, (0.9651106690983732, 0.9758139462258119))
auprc
(0.8007983670281836, (0.7517445487567737, 0.8370819503206587))
best f1
(0.7600000000000001, (0.7269626214321283, 0.7838886939018309))


  F1 = 2*((precision*recall)/(precision+recall))


(0.9866940722353883, (0.9640036375574206, 0.9924029608075586))
auprc
(0.7222536344422418, (0.6098439380508076, 0.8005196026110224))
best f1
(0.6923076923076923, (0.5972205911890724, 0.7585587461742271))
None


pancreas
liver_met
AUC 0.9492054707882046
Outcome probability: 0.24661246612466126
Average precision score: 0.83
Best F1: 0.7999999999999999
Best F1 threshold: -0.8434922
-0.8434922
auroc
(0.98626172863482, (0.9820778554873532, 0.9898394932679775))
auprc
(0.9029998128334735, (0.8655758383287843, 0.9250446902285516))
best f1
(0.854922279792746, (0.824364821925933, 0.8778479174500948))
None


pancreas
lung_met
AUC 0.9602952602952602
Outcome probability: 0.10569105691056911
Average precision score: 0.85
Best F1: 0.7812500000000001
Best F1 threshold: 1.6110288
1.6110288
auroc
(0.9746927513455769, (0.9679377218285281, 0.9796109109825405))
auprc
(0.8932785357784605, (0.8659704291243165, 0.9127853879027066))
best f1
(0.8045112781954887, (0.7761051360712609, 0.8249310705086296))
None


p

  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)


(0.7340425531914894, (0.6449261932892737, 0.7870581669661806))
None


