In [2]:
# This notebook runs inference with the DFCI teacher model on the DFCI test set and evaluates the predictions.

In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer
import torch
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the labeled notes and keep only the held-out test split.
reports = pd.read_csv('/data/clin_notes_outcomes/profile_3-2023/derived_data/labeled_medonc_prissmm_mixedisprog.csv')
reports = reports.loc[reports['split'] == 'test']

# Build the model input on an explicit copy so `reports` is left untouched and
# pandas never has to guess whether we are writing into a view of the CSV frame.
inference_input = (
    reports
    .copy()
    # lower-case the note text and flatten newlines to spaces
    .assign(text=lambda frame: frame['text'].str.lower().str.replace("\n", " ", regex=False))
    # drop every column whose name contains 'Unnamed' or 'outcome'
    .pipe(lambda frame: frame.drop(columns=frame.filter(regex='Unnamed|outcome').columns))
)


In [4]:

from transformers import AutoModel



from torch.nn import functional as F
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn import LSTM, Linear, Embedding, Conv1d, MaxPool1d, GRU, LSTMCell, Dropout, Module, Sequential, ReLU

   
class LabeledModel(nn.Module):
    """Clinical-Longformer encoder with three independent single-logit heads.

    The encoder's hidden state at the first token position is fed to three
    separate ``Linear(768, 128) -> ReLU -> Linear(128, 1)`` heads: any-cancer,
    response and progression.
    """

    def __init__(self):
        super(LabeledModel, self).__init__()

        self.longformer = AutoModel.from_pretrained('yikuan8/Clinical-Longformer')

        self.any_cancer_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.response_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.progression_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))

    def forward(self, x_text_tensor, x_attention_mask):
        """Run the encoder and all three heads.

        Args:
            x_text_tensor: token ids, indexed as (batch, position).
            x_attention_mask: attention mask matching ``x_text_tensor``.

        Returns:
            Tuple ``(any_cancer_out, response_out, progression_out)`` of raw
            head outputs (logits), one row per input sequence.
        """
        # zeros_like already allocates on the same device as the token ids, so
        # the model runs wherever its inputs live instead of requiring CUDA.
        global_attention_mask = torch.zeros_like(x_text_tensor)
        # global attention on cls token
        global_attention_mask[:, 0] = 1

        encoded = self.longformer(
            input_ids=x_text_tensor,
            attention_mask=x_attention_mask,
            global_attention_mask=global_attention_mask,
        )
        # Hidden state of the first (cls) token summarises the whole note.
        main = encoded.last_hidden_state[:, 0, :]

        any_cancer_out = self.any_cancer_head(main)
        response_out = self.response_head(main)
        progression_out = self.progression_head(main)

        return any_cancer_out, response_out, progression_out
        




In [5]:
from torch.utils import data

class UnLabeledDataset(data.Dataset):
    """Torch dataset that tokenizes the ``text`` column of a DataFrame.

    Each row of the frame is one item. Rows are addressed by position, so the
    dataset yields exactly ``len(frame)`` items in frame order even when the
    frame's index labels are non-contiguous or repeated.
    """

    def __init__(self, pandas_dataset):
        self.data = pandas_dataset.copy()
        # Row labels are kept for callers that inspect them; item lookup below
        # is positional and does not depend on them.
        self.indices = self.data.index
        # Over-long notes are truncated from the left (truncation_side='left').
        self.tokenizer = AutoTokenizer.from_pretrained("yikuan8/Clinical-Longformer", truncation_side='left')

    def __len__(self):
        # how many notes in the dataset
        return len(self.data)

    def __getitem__(self, index):
        """Return ``(input_ids, attention_mask)`` long tensors for row ``index``."""
        note_text = self.data['text'].iloc[index]

        encoded = self.tokenizer(note_text, padding='max_length', truncation=True)

        x_text_tensor = torch.tensor(encoded.input_ids, dtype=torch.long)
        x_attention_mask = torch.tensor(encoded.attention_mask, dtype=torch.long)

        return x_text_tensor, x_attention_mask
        
        

In [6]:
# Run the teacher model over the test notes and collect one logit array per head.
# Choose the device once instead of hard-coding 'cuda' at every call site.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

themodel = LabeledModel()
# map_location lets the checkpoint load regardless of the device it was saved from.
themodel.load_state_dict(torch.load('dfci_mimic_note_longformer.pt', map_location=device))
themodel.to(device)

# eval mode: disable dropout etc. for inference
themodel.eval()

# shuffle=False keeps predictions in the same row order as inference_input
no_shuffle_valid_dataset = data.DataLoader(UnLabeledDataset(inference_input), batch_size=2, shuffle=False, num_workers=0)

# one list of per-batch arrays for each of the three heads
output_prediction_lists = [[] for _ in range(3)]
with torch.no_grad():
    for batch in no_shuffle_valid_dataset:
        x_text_ids = batch[0].to(device)
        x_attention_mask = batch[1].to(device)
        predictions = themodel(x_text_ids, x_attention_mask)
        for j in range(3):
            output_prediction_lists[j].append(predictions[j].cpu().numpy())

# stitch the per-batch arrays into one array per head
output_prediction_lists = [np.concatenate(x) for x in output_prediction_lists]

Some weights of LongformerModel were not initialized from the model checkpoint at yikuan8/Clinical-Longformer and are newly initialized: ['longformer.pooler.dense.bias', 'longformer.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Attach each head's logits to a copy of the inference frame as outcome_<k>_logit.
output_dataset = inference_input.copy()
for x in range(3):
    logit_column = 'outcome_' + str(x) + '_logit'
    output_dataset[logit_column] = output_prediction_lists[x]

In [8]:
# Head order matches the tuple returned by the model: any_cancer, response, progression.
logit_column_names = {
    'outcome_0_logit': 'any_cancer_logit',
    'outcome_1_logit': 'response_logit',
    'outcome_2_logit': 'progression_logit',
}
output_dataset = output_dataset.rename(columns=logit_column_names)


In [9]:
from utils_102023 import eval_model

In [10]:
from sklearn.metrics import roc_auc_score, average_precision_score, precision_recall_curve
from confidenceinterval.bootstrap import bootstrap_ci

def best_f1(y_true, y_score):
    """Return the highest F1 reachable by thresholding ``y_score``.

    F1 is evaluated at every point of the precision-recall curve. Points where
    precision + recall == 0 are scored as F1 = 0 rather than dividing 0 by 0,
    which avoids the RuntimeWarning and the NaN filtering it used to require.

    Args:
        y_true: binary ground-truth labels.
        y_score: scores (e.g. logits); higher means more likely positive.

    Returns:
        The maximum F1 over all curve points.
    """
    precisions, recalls, _ = precision_recall_curve(y_true, y_score)
    denominator = precisions + recalls
    f1_scores = np.divide(
        2 * precisions * recalls,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator > 0,
    )
    return f1_scores.max()

def eval_with_cis(y_true, y_score, seed=None):
    """Print AUROC, AUPRC and best-F1 with 95% bootstrap confidence intervals.

    For each metric the label is printed, followed by whatever ``bootstrap_ci``
    returns for it (1000 resamples, ``bootstrap_bca`` method).

    Args:
        y_true: binary ground-truth labels.
        y_score: model scores for the positive class.
        seed: optional seed for the bootstrap random generator. The default
            (None) draws fresh entropy, so intervals vary from run to run;
            pass an int to make them reproducible.

    Returns:
        None. Results are printed only.
    """
    # A single generator is shared by all three bootstraps.
    random_generator = np.random.default_rng(seed)

    labelled_metrics = (
        ('auroc', roc_auc_score),
        ('auprc', average_precision_score),
        ('best f1', best_f1),
    )
    for label, metric in labelled_metrics:
        print(label)
        print(bootstrap_ci(y_true=y_true,
                           y_pred=y_score,
                           metric=metric,
                           confidence_level=0.95,
                           n_resamples=1000,
                           method='bootstrap_bca',
                           random_state=random_generator))


In [11]:
# Evaluate every outcome head on the full test set (all cancer types pooled).
for outcome in ['any_cancer','progression','response']:
    print('all cancers')
    print(outcome)
    # eval_model comes from utils_102023; its return value is printed as before.
    print(eval_model(output_dataset[outcome + '_logit'], output_dataset[outcome], graph=False))
    # eval_with_cis prints its own results and returns None, so it is called
    # directly rather than wrapped in print() (which only emitted a stray "None").
    eval_with_cis(output_dataset[outcome], output_dataset[outcome + '_logit'])
    print("\n")

all cancers
any_cancer
AUC 0.9486290867049982
Outcome probability: 0.8024837374334713
Average precision score: 0.98
Best F1: 0.9603636363636364
Best F1 threshold: -0.09133511
-0.09133511
auroc
(0.9486290867049982, (0.9352079637842997, 0.958542838881347))
auprc
(0.9786868932247385, (0.9703595309233324, 0.9841879719282604))
best f1
(0.9603636363636364, (0.9545235540357411, 0.9647641236773737))
None


all cancers
progression
AUC 0.953051773726664
Outcome probability: 0.17060910703725607
Average precision score: 0.82
Best F1: 0.7788378143972245
Best F1 threshold: -0.005477473
-0.005477473
auroc
(0.953051773726664, (0.9435767272641983, 0.9611189896904294))
auprc
(0.8217513696460943, (0.783038151226199, 0.8505826563450631))
best f1
(0.7788378143972245, (0.7508947516773108, 0.8027710245684336))
None


all cancers
response
AUC 0.9542077740039895
Outcome probability: 0.11975162625665287
Average precision score: 0.79
Best F1: 0.7561576354679802
Best F1 threshold: 1.1216687
1.1216687
auroc
(0.954

In [12]:
# Collapse every cancer_type label containing 'nsclc' into a single 'nsclc' group below (the data has both phase 2s).

In [13]:
# Collapse every label containing 'nsclc' into the single value 'nsclc'.
# na=False matters: without it a missing cancer_type yields NaN from str.contains,
# np.where treats NaN as truthy, and the row would be silently relabelled 'nsclc'.
output_dataset['cancer_type'] = np.where(output_dataset.cancer_type.str.contains('nsclc', na=False), 'nsclc', output_dataset.cancer_type)

In [14]:
# Repeat the evaluation separately for each cancer type.
for cancer in output_dataset.cancer_type.unique():
    subset = output_dataset[output_dataset.cancer_type == cancer]
    for outcome in ['any_cancer','progression','response']:
        print(cancer)
        print(outcome)
        # eval_model comes from utils_102023; its return value is printed as before.
        print(eval_model(subset[outcome + '_logit'], subset[outcome], graph=False))
        # eval_with_cis prints its own results and returns None, so it is called
        # directly rather than wrapped in print() (which only emitted a stray "None").
        eval_with_cis(subset[outcome], subset[outcome + '_logit'])
        print("\n")
    

prostate
any_cancer
AUC 0.9350595507311925
Outcome probability: 0.7528089887640449
Average precision score: 0.98
Best F1: 0.9272943980929679
Best F1 threshold: -1.2145292
-1.2145292
auroc
(0.9350595507311925, (0.911688945253477, 0.953395203554699))
auprc
(0.9777245324073371, (0.9674973853326982, 0.9851777085320758))
best f1
(0.9272943980929679, (0.9064721695461595, 0.9417172853846444))
None


prostate
progression
AUC 0.9348840985669631
Outcome probability: 0.09550561797752809
Average precision score: 0.66
Best F1: 0.7021276595744682
Best F1 threshold: 0.16917488
0.16917488
auroc
(0.9348840985669631, (0.8949073511279628, 0.9604404126420331))
auprc
(0.6636740178079068, (0.5126390987552445, 0.7752187391916537))
best f1


  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)


(0.7021276595744682, (0.5885633368777468, 0.8))
None


prostate
response
AUC 0.9442213297634985
Outcome probability: 0.06741573033707865
Average precision score: 0.45
Best F1: 0.619047619047619
Best F1 threshold: 0.67461646
0.67461646
auroc
(0.9442213297634985, (0.903432168887869, 0.963604487411706))
auprc
(0.44706777246434676, (0.3077777556049518, 0.5784157365999004))
best f1


  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_score

(0.619047619047619, (0.4810343925274924, 0.7158351918782045))
None


breast
any_cancer
AUC 0.9900792372608387
Outcome probability: 0.8935643564356436
Average precision score: 1.00
Best F1: 0.9862637362637363
Best F1 threshold: -1.4136286
-1.4136286
auroc
(0.9900792372608387, (0.9782234271184362, 0.9955932649302816))
auprc
(0.9987993119617472, (0.9968764510253727, 0.9995186732059954))
best f1
(0.9862637362637363, (0.9746478873239437, 0.991869918699187))
None


breast
progression
AUC 0.9762438134930972
Outcome probability: 0.13613861386138615
Average precision score: 0.90
Best F1: 0.8495575221238938
Best F1 threshold: -1.0933986
-1.0933986
auroc
(0.9762438134930972, (0.954566622111426, 0.9886654555365687))
auprc
(0.901519390475449, (0.8116111732076106, 0.9467848082410757))
best f1
(0.8495575221238938, (0.7614322211913132, 0.9008017161609054))
None


breast
response
AUC 0.9389830508474577
Outcome probability: 0.14603960396039603
Average precision score: 0.83
Best F1: 0.8067226890756303
Be

  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)


(0.8067226890756303, (0.7038140653001981, 0.8679747102118931))
None


nsclc
any_cancer
AUC 0.9805130893413071
Outcome probability: 0.8076572470373746
Average precision score: 0.99
Best F1: 0.9693934335002782
Best F1 threshold: -0.09133511
-0.09133511
auroc
(0.9805130893413071, (0.9650378898036133, 0.9875839644560391))
auprc
(0.9941494421265779, (0.9857538073879951, 0.9968795540685512))
best f1
(0.9693934335002782, (0.960511894421534, 0.9756447112258028))
None


nsclc
progression
AUC 0.9690818963244212
Outcome probability: 0.1959890610756609
Average precision score: 0.90
Best F1: 0.8497652582159626
Best F1 threshold: 0.2740089
0.2740089
auroc
(0.9690818963244212, (0.951391388006024, 0.9788707849711368))
auprc
(0.9036742388907794, (0.8655852610248852, 0.9325326473759323))
best f1
(0.8497652582159626, (0.8031857352332735, 0.8791828013809737))
None


nsclc
response
AUC 0.9699678637012853
Outcome probability: 0.15405651777575205
Average precision score: 0.88
Best F1: 0.8289855072463768
Best

  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_score

(0.7884615384615384, (0.6741347095584825, 0.8527055585158118))
None


pancreas
response
AUC 0.9822974036191976
Outcome probability: 0.09927360774818401
Average precision score: 0.86
Best F1: 0.8409090909090909
Best F1 threshold: 0.8285024
0.8285024
auroc
(0.9822974036191976, (0.9650696405292057, 0.9913354176807342))
auprc
(0.8590087134997738, (0.7097974444435676, 0.9315017780734571))
best f1
(0.8409090909090909, (0.7389316437500403, 0.898876404494382))
None


rcc_barkouny
any_cancer
AUC 0.8064462809917355
Outcome probability: 0.9063670411985019
Average precision score: 0.96
Best F1: 0.9797570850202428
Best F1 threshold: 0.08257564
0.08257564
auroc
(0.8064462809917355, (0.6719671837565334, 0.9121755262992436))
auprc
(0.9628833669077995, (0.9154550077280516, 0.9846966036440191))
best f1


  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)


(0.9797570850202428, (0.9634282855791025, 0.9898167006109979))
None


rcc_barkouny
progression
AUC 0.8879104477611941
Outcome probability: 0.250936329588015
Average precision score: 0.66
Best F1: 0.7037037037037036
Best F1 threshold: 3.131426
3.131426
auroc


  F1 = 2*((precision*recall)/(precision+recall))


(0.8879104477611941, (0.8429369494000395, 0.921845576387716))
auprc
(0.6633457041336291, (0.515506462033252, 0.7622887413168136))
best f1


  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_score

(0.7037037037037036, (0.601728460545308, 0.7692307692307692))
None


rcc_barkouny
response
AUC 0.8492337578243039
Outcome probability: 0.15355805243445692
Average precision score: 0.58
Best F1: 0.6352941176470588
Best F1 threshold: 1.0662756
1.0662756
auroc
(0.8492337578243039, (0.7686309072374226, 0.9083579454876874))
auprc
(0.581968463611317, (0.39390389057721725, 0.7077752000317289))
best f1


  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_score

(0.6352941176470588, (0.5127888918216778, 0.7253669600409599))
None


crc
any_cancer
AUC 0.9876945212222179
Outcome probability: 0.6901408450704225
Average precision score: 0.99
Best F1: 0.96875
Best F1 threshold: -1.4841523
-1.4841523
auroc
(0.9876945212222179, (0.9744967124472363, 0.9938451196726344))
auprc
(0.9937821007550192, (0.9854454418192674, 0.9970187842642121))
best f1
(0.96875, (0.9536815875210576, 0.9779742499864315))
None


crc
progression
AUC 0.9442970822281167
Outcome probability: 0.18309859154929578
Average precision score: 0.82
Best F1: 0.7741935483870969
Best F1 threshold: 0.029643282
0.029643282
auroc
(0.9442970822281167, (0.9101812177472985, 0.9646818809150984))
auprc
(0.8233688899318561, (0.7222323773444544, 0.883994496976519))
best f1
(0.7741935483870969, (0.6968010086167196, 0.8272430278058007))
None


crc
response
AUC 0.9787484424754257
Outcome probability: 0.06237424547283702
Average precision score: 0.79
Best F1: 0.7719298245614036
Best F1 threshold: 1.3949373

  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)
  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)


(0.7719298245614036, (0.6285714285714286, 0.8782462959431341))
None


bladder
any_cancer
AUC 0.9680952380952381
Outcome probability: 0.8235294117647058
Average precision score: 0.99
Best F1: 0.963768115942029
Best F1 threshold: 2.1990752
2.1990752
auroc
(0.9680952380952381, (0.9185084203968955, 0.9884751117036619))
auprc
(0.992495081887719, (0.9778848803168737, 0.9977945910881445))
best f1
(0.963768115942029, (0.9314389074088951, 0.9781021897810219))
None


bladder
progression
AUC 0.8982683982683983
Outcome probability: 0.25882352941176473
Average precision score: 0.76
Best F1: 0.7234042553191491
Best F1 threshold: -0.15636161
-0.15636161
auroc
(0.8982683982683983, (0.8401024489614647, 0.9348171551200131))
auprc
(0.7607948520550937, (0.6116580480249599, 0.8561633136001432))
best f1
(0.7234042553191491, (0.6100656348672678, 0.8037383177570094))
None


bladder
response
AUC 0.9300804828973843
Outcome probability: 0.16470588235294117
Average precision score: 0.75
Best F1: 0.779661016949152

  f1_scores = 2 * (precisions * recalls) / (precisions + recalls)


(0.7796610169491526, (0.6300243848614419, 0.8750000000000001))
None


