In [2]:
# this notebook runs inference for evaluation using the dfci teacher model on the dfci test set

In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import torch
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
reports = pd.read_csv('/data/clin_notes_outcomes/profile_3-2023/derived_data/labeled_medonc_prissmm_mixedisprog.csv')
reports = reports[reports.split=='test']
inference_input = reports
inference_input['text'] = inference_input['text'].str.lower().str.replace("\n", " ")
inference_input.drop(inference_input.filter(regex='Unnamed|outcome').columns, axis=1, inplace=True)


In [4]:

from transformers import AutoModel



from torch.nn import functional as F
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn import LSTM, Linear, Embedding, Conv1d, MaxPool1d, GRU, LSTMCell, Dropout, Module, Sequential, ReLU

   
class LabeledModel(nn.Module):

    def __init__(self):
        super(LabeledModel, self).__init__()
        
        self.longformer = AutoModel.from_pretrained('yikuan8/Clinical-Longformer')
        
        self.any_cancer_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.response_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.progression_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))


        
    def forward(self, x_text_tensor, x_attention_mask):
        # x should be tuple of input IDs, then attention mask
        global_attention_mask = torch.zeros_like(x_text_tensor).to('cuda')
        # global attention on cls token
        global_attention_mask[:, 0] = 1
        main = self.longformer(x_text_tensor, x_attention_mask, global_attention_mask)
        main = main.last_hidden_state[:,0,:].squeeze(1)

                                          
        any_cancer_out = self.any_cancer_head(main)
        response_out = self.response_head(main)
        progression_out = self.progression_head(main)



        
        return any_cancer_out, response_out, progression_out
        




In [5]:
from torch.utils import data

class UnLabeledDataset(data.Dataset):
    def __init__(self, pandas_dataset):
        self.data = pandas_dataset.copy()
        self.indices = self.data.index.unique()
        self.tokenizer = AutoTokenizer.from_pretrained("yikuan8/Clinical-Longformer", truncation_side='left')        
        
        
    def __len__(self):
        # how many notes in the dataset
        return len(self.indices)
    
    def __getitem__(self, index):
        # get data for notes corresponding to indices passed
        this_index = self.indices[index]
        pand = self.data.loc[this_index, :]
    
        encoded = self.tokenizer(pand['text'], padding='max_length', truncation=True)

        x_text_tensor = torch.tensor(encoded.input_ids, dtype=torch.long)
        x_attention_mask = torch.tensor(encoded.attention_mask, dtype=torch.long)
        

        return x_text_tensor, x_attention_mask
        
        

In [6]:
# write out inference dataset
themodel = LabeledModel()
themodel.load_state_dict(torch.load('dfci_phi_note_longformer.pt'))
themodel.to('cuda')

themodel.eval()

no_shuffle_valid_dataset = data.DataLoader(UnLabeledDataset(inference_input), batch_size=2, shuffle=False, num_workers=0)

output_prediction_lists = [[] for x in range(3)]
for batch in no_shuffle_valid_dataset:
    x_text_ids = batch[0].to('cuda')
    x_attention_mask = batch[1].to('cuda')
    with torch.no_grad():
        predictions = themodel(x_text_ids, x_attention_mask)
    for j in range(3):
        output_prediction_lists[j].append(predictions[j].detach().cpu().numpy())

output_prediction_lists = [np.concatenate(x) for x in output_prediction_lists]

Some weights of LongformerModel were not initialized from the model checkpoint at yikuan8/Clinical-Longformer and are newly initialized: ['longformer.pooler.dense.bias', 'longformer.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
output_dataset = inference_input.copy()
for x in range(3):
    output_dataset['outcome_' + str(x) + '_logit'] = output_prediction_lists[x]

In [8]:
output_dataset = output_dataset.rename(columns={'outcome_0_logit':'any_cancer_logit',
                                                  'outcome_1_logit':'response_logit',
                                                  'outcome_2_logit':'progression_logit'})


In [9]:
from utils_102023 import eval_model

In [10]:
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve
from confidenceinterval.bootstrap import bootstrap_ci

def best_f1(y_true, y_score):
    precisions, recalls, thresholds = precision_recall_curve(y_true, y_score)
    f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
    f1_scores = f1_scores[~np.isnan(f1_scores)]
    thresholds = thresholds[:-1]
    best_threshold = thresholds[np.argmax(f1_scores)]
    best_f1 = np.max(f1_scores)
    return(best_f1)

def eval_with_cis(y_true, y_score):
    random_generator = np.random.default_rng()
    print('auroc')
    print(bootstrap_ci(y_true=y_true,
                 y_pred=y_score,
                 metric=roc_auc_score,
                 confidence_level=0.95,
                 n_resamples=1000,
                 method='bootstrap_bca',
                 random_state=random_generator))
    
    print('auprc')
    print(bootstrap_ci(y_true=y_true,
                 y_pred=y_score,
                 metric=average_precision_score,
                 confidence_level=0.95,
                 n_resamples=1000,
                 method='bootstrap_bca',
                 random_state=random_generator))
    
    print('best f1')
    print(bootstrap_ci(y_true=y_true,
                 y_pred=y_score,
                 metric=best_f1,
                 confidence_level=0.95,
                 n_resamples=1000,
                 method='bootstrap_bca',
                 random_state=random_generator))


In [11]:
for outcome in ['any_cancer','progression','response']:
    print('all cancers')
    print(outcome)
    print(eval_model(output_dataset[outcome + '_logit'], output_dataset[outcome], graph=False))
    print(eval_with_cis(output_dataset[outcome], output_dataset[outcome + '_logit']))
    print("\n")

all cancers
any_cancer
AUC 0.9531543030372565
Outcome probability: 0.8024837374334713
Average precision score: 0.98
Best F1: 0.9624197983501376
Best F1 threshold: -0.0491971
-0.0491971
auroc
(0.9531543030372565, (0.9407736278862161, 0.9636279289509344))
auprc
(0.980340054195423, (0.9723963496948828, 0.9857445216056678))
best f1
(0.9624197983501376, (0.956696756426761, 0.9662232230931317))
None


all cancers
progression
AUC 0.9614293614089721
Outcome probability: 0.17060910703725607
Average precision score: 0.85
Best F1: 0.7908937605396289
Best F1 threshold: -0.30774117
-0.30774117
auroc
(0.9614293614089721, (0.9525456535114389, 0.9678925101451509))
auprc
(0.8518891902135873, (0.8215275775725889, 0.8766249766506559))
best f1
(0.7908937605396289, (0.7630510507237318, 0.8131313131313131))
None


all cancers
response
AUC 0.9714942128333687
Outcome probability: 0.11975162625665287
Average precision score: 0.85
Best F1: 0.7995169082125605
Best F1 threshold: 0.34003878
0.34003878
auroc
(0.971

In [12]:
# fix nsclc below (has both phase 2s)

In [13]:
output_dataset['cancer_type'] = np.where(output_dataset.cancer_type.str.contains('nsclc'), 'nsclc', output_dataset.cancer_type)

In [14]:
for cancer in output_dataset.cancer_type.unique():
    subset = output_dataset[output_dataset.cancer_type == cancer]
    for outcome in ['any_cancer','progression','response']:
        print(cancer)
        print(outcome)
        print(eval_model(subset[outcome + '_logit'], subset[outcome], graph=False))
        print(eval_with_cis(subset[outcome], subset[outcome + '_logit']))
        print("\n")
    

prostate
any_cancer
AUC 0.9451417156641037
Outcome probability: 0.7528089887640449
Average precision score: 0.98
Best F1: 0.927070457354759
Best F1 threshold: 0.22991496
0.22991496
auroc
(0.9451417156641037, (0.9255439473117967, 0.9614637367436826))
auprc
(0.9819719738106394, (0.9731728086034388, 0.9878635045835051))
best f1
(0.927070457354759, (0.9029954518167946, 0.9401444061336784))
None


prostate
progression
AUC 0.9551820728291317
Outcome probability: 0.09550561797752809
Average precision score: 0.73
Best F1: 0.7090909090909091
Best F1 threshold: -0.5175506
-0.5175506
auroc
(0.9551820728291317, (0.9293514481645978, 0.9729606798227706))
auprc
(0.732826337109419, (0.6018090445537317, 0.8327964452667892))
best f1


  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)


(0.7090909090909091, (0.5769890848789732, 0.7797023300275119))
None


prostate
response
AUC 0.9509147701918785
Outcome probability: 0.06741573033707865
Average precision score: 0.59
Best F1: 0.676056338028169
Best F1 threshold: 0.41871384
0.41871384
auroc
(0.9509147701918785, (0.8628342081222738, 0.9736991738729822))
auprc
(0.5939773401304989, (0.40915416586236414, 0.7279080724562681))
best f1


  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_score

(0.676056338028169, (0.5168114598350129, 0.7640449438202247))
None


breast
any_cancer
AUC 0.9928493203633318
Outcome probability: 0.8935643564356436
Average precision score: 1.00
Best F1: 0.9862637362637363
Best F1 threshold: -3.6670334
-3.6670334
auroc
(0.9928493203633318, (0.981766436964594, 0.997212116742551))
auprc
(0.9991329910813748, (0.997469855079798, 0.9996873600596671))
best f1
(0.9862637362637363, (0.9753350678934195, 0.9917782244908517))
None


breast
progression
AUC 0.9825996353216984
Outcome probability: 0.13613861386138615
Average precision score: 0.93
Best F1: 0.8571428571428571
Best F1 threshold: -1.1806909
-1.1806909
auroc
(0.9825996353216984, (0.9545953603092829, 0.9924067137682173))
auprc
(0.9282694574974761, (0.8562879353790332, 0.9662119970938532))
best f1
(0.8571428571428571, (0.767048659426345, 0.8979591836734694))
None


breast
response
AUC 0.9709653647752394
Outcome probability: 0.14603960396039603
Average precision score: 0.89
Best F1: 0.8640000000000001
Bes

  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)


(0.784, (0.6730769230769231, 0.8448275862068966))
None


pancreas
response
AUC 0.978232362968791
Outcome probability: 0.09927360774818401
Average precision score: 0.79
Best F1: 0.8
Best F1 threshold: 1.0509262
1.0509262
auroc
(0.978232362968791, (0.9599939992935402, 0.9885170937974312))
auprc
(0.7925347733151192, (0.5887961679549147, 0.8873937577233941))
best f1


  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_score

(0.8, (0.6872898649235208, 0.8695652173913043))
None


rcc_barkouny
any_cancer
AUC 0.7914049586776859
Outcome probability: 0.9063670411985019
Average precision score: 0.96
Best F1: 0.983739837398374
Best F1 threshold: 0.9678126
0.9678126
auroc
(0.7914049586776859, (0.6502485104773778, 0.9061182765807348))
auprc
(0.9618282036878976, (0.919297050955848, 0.9832394389536809))
best f1
(0.983739837398374, (0.9685084448988945, 0.9919308018866021))
None


rcc_barkouny
progression
AUC 0.8897761194029851
Outcome probability: 0.250936329588015
Average precision score: 0.69
Best F1: 0.7297297297297297
Best F1 threshold: 3.4486113
3.4486113
auroc


  F1 = 2*((precision*recall)/(precision+recall))


(0.8897761194029851, (0.8311494033646112, 0.9292334542611972))
auprc
(0.6850952135976632, (0.5299556094013042, 0.7861049629217318))
best f1


  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_score

(0.7297297297297297, (0.6326298587259132, 0.7886373388729422))
None


rcc_barkouny
response
AUC 0.9491690049643858
Outcome probability: 0.15355805243445692
Average precision score: 0.82
Best F1: 0.7741935483870966
Best F1 threshold: -0.5209102
-0.5209102
auroc
(0.9491690049643858, (0.8645986750105994, 0.9769677356927223))
auprc
(0.8207037263086624, (0.6925725208052578, 0.9015361128660059))
best f1
(0.7741935483870966, (0.6651414200331897, 0.8387096774193548))
None


crc
any_cancer
AUC 0.9920109045473477
Outcome probability: 0.6901408450704225
Average precision score: 1.00
Best F1: 0.9770114942528736
Best F1 threshold: -3.287401
-3.287401
auroc
(0.9920109045473477, (0.9855603404971249, 0.9960874467566928))
auprc
(0.9963060525101161, (0.9927666947585828, 0.9981838957606504))
best f1
(0.9770114942528736, (0.9625520749708569, 0.9857142857142858))
None


crc
progression
AUC 0.9580739457586748
Outcome probability: 0.18309859154929578
Average precision score: 0.86
Best F1: 0.8
Best F1 thresho