In [1]:
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# this notebook runs inference with the DFCI-medonc-teacher model on MIMIC discharge summaries that mention cancer
# (runs on half the discharge summaries at a time given the volume)

In [2]:
discharges = pd.read_csv('../data/discharge.csv', low_memory=False)

In [3]:
discharges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331794 entries, 0 to 331793
Data columns (total 8 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   note_id     331794 non-null  object
 1   subject_id  331794 non-null  int64 
 2   hadm_id     331794 non-null  int64 
 3   note_type   331794 non-null  object
 4   note_seq    331794 non-null  int64 
 5   charttime   331794 non-null  object
 6   storetime   331777 non-null  object
 7   text        331794 non-null  object
dtypes: int64(3), object(5)
memory usage: 20.3+ MB


In [4]:
cancer_discharges = discharges[discharges.text.str.lower().str.contains('cancer|malignan')]
cancer_discharges['text'] = cancer_discharges['text'].str.lower().str.replace('\n', ' ')
cancer_discharges['text'] = cancer_discharges['text'].str.replace(r'\s+', ' ', regex=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cancer_discharges['text'] = cancer_discharges['text'].str.lower().str.replace('\n', ' ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cancer_discharges['text'] = cancer_discharges['text'].str.replace(r'\s+', ' ', regex=True)


In [5]:
first_half = cancer_discharges.head(cancer_discharges.shape[0]//2)
second_half = cancer_discharges.tail(cancer_discharges.shape[0]//2+1)

In [6]:
first_half.shape, second_half.shape

((70688, 8), (70689, 8))

In [7]:
from torch.utils import data

class UnLabeledDataset(data.Dataset):
    def __init__(self, pandas_dataset):
        self.data = pandas_dataset.copy()
        self.indices = self.data.index.unique()
        self.tokenizer = AutoTokenizer.from_pretrained("yikuan8/Clinical-Longformer", truncation_side='right')        
        
        
    def __len__(self):
        # how many notes in the dataset
        return len(self.indices)
    
    def __getitem__(self, index):
        # get data for notes corresponding to indices passed
        this_index = self.indices[index]
        pand = self.data.loc[this_index, :]
        #label = torch.tensor(pand.progression, dtype=torch.float32)
    
        encoded = self.tokenizer(pand['text'], padding='max_length', truncation=True)

        x_text_tensor = torch.tensor(encoded.input_ids, dtype=torch.long)
        x_attention_mask = torch.tensor(encoded.attention_mask, dtype=torch.long)
        

        
        return x_text_tensor, x_attention_mask
        
        

In [8]:

from transformers import AutoModel



from torch.nn import functional as F
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn import LSTM, Linear, Embedding, Conv1d, MaxPool1d, GRU, LSTMCell, Dropout, Module, Sequential, ReLU

   
class LabeledModel(nn.Module):

    def __init__(self, device='cuda:1'):
        super(LabeledModel, self).__init__()
        
        self.longformer = AutoModel.from_pretrained('yikuan8/Clinical-Longformer')
        self.any_cancer_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.response_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))
        self.progression_head = Sequential(Linear(768, 128), ReLU(), Linear(128,1))


        
    def forward(self, x_text_tensor, x_attention_mask):
        # x should be tuple of input IDs, then attention mask
        global_attention_mask = torch.zeros_like(x_text_tensor, device='cuda:1')
        # global attention on cls token
        global_attention_mask[:, 0] = 1
        main = self.longformer(x_text_tensor, x_attention_mask, global_attention_mask)
        main = main.last_hidden_state[:,0,:].squeeze(1)

                                          
        any_cancer_out = self.any_cancer_head(main)
        response_out = self.response_head(main)
        progression_out = self.progression_head(main)



        
        return any_cancer_out, response_out, progression_out
        




In [9]:
# write out actual validation dataset
themodel = LabeledModel()
themodel.load_state_dict(torch.load('dfci_phi_note_longformer.pt'))
themodel.to('cuda:1')

themodel.eval()

no_shuffle_valid_dataset = data.DataLoader(UnLabeledDataset(second_half), batch_size=8, shuffle=False, num_workers=0)

output_prediction_lists = [[] for x in range(3)]
for i, batch in enumerate(no_shuffle_valid_dataset):
    #thisframe = pd.DataFrame()
    x_text_ids = batch[0].to('cuda:1')
    x_attention_mask = batch[1].to('cuda:1')
    with torch.no_grad():
        predictions = themodel(x_text_ids, x_attention_mask)
    for j in range(3):
        output_prediction_lists[j].append(predictions[j].detach().cpu().numpy())
    if i % 1000 == 0:
        print(i)

output_prediction_lists = [np.concatenate(x) for x in output_prediction_lists]


output = second_half.copy()
for x in range(3):
    output['outcome_' + str(x) + '_logit'] = output_prediction_lists[x]



Some weights of the model checkpoint at yikuan8/Clinical-Longformer were not used when initializing LongformerModel: ['lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerModel were not initialized from the model checkpoint at yikuan8/Clinical-Longformer and are newly initialized: ['longformer.pooler.dense.weight', 'longformer.pooler.dense.bias']
You should probably TRAIN this model on a dow

0
1000
2000
3000
4000
5000
6000
7000
8000


In [10]:
output.to_csv('../data/second_half_discharges.csv')