# RadBert Embeddings #

In [1]:
from itertools import product

import numpy as np
import torch
import torch.nn.functional as F

torch.set_printoptions(linewidth=200)

import transformers
from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM, AutoModelForCausalLM
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


## Model and Tokenizer ##
Using model and weights from https://huggingface.co/StanfordAIMI/RadBERT

In [2]:
#checkpoint1 = "UCSD-VA-health/RadBERT-RoBERTa-4m"
checkpoint1 = "zzxslp/RadBERT-RoBERTa-4m"
tokenizer1 = AutoTokenizer.from_pretrained(checkpoint1)
full_model1 = AutoModelForMaskedLM.from_pretrained(checkpoint1)
base_model1 = AutoModel.from_pretrained(checkpoint1)

Some weights of RobertaModel were not initialized from the model checkpoint at zzxslp/RadBERT-RoBERTa-4m and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Sentences ##
Checking the embeddings obtained from reports.
Consider the 9 Sentences:
 1. "The report shows small right-sided pleural effusion"
 2. "The report shows small left-sided pleural effusion"
 3. "The report shows large right-sided pleural effusion"
 4. "The report shows large left-sided pleural effusion"
 5. "There are no abnormalities in the report"
 6. "There is severe consolidation in the left side"
 7. "There is severe consolidation in the right side"
 8. "There is mild consolidation in the right side"
 9. "There is mild consolidation in the left side"
 
Checking the cosine similarities Matrix between the embeddings of above 9 sentences

In [3]:
sentence_list = ["The report shows small right-sided pleural effusion", "The report shows small left-sided pleural effusion",\
    "The report shows large right-sided pleural effusion", "The report shows large left-sided pleural effusion",\
    "There are no abnormalities in the report",\
    "There is severe consolidation in the left side","There is severe consolidation in the right side",\
    "There is mild consolidation in the right side", "There is mild consolidation in the left side"
]

## Sanity checks ##
Testing the RadBert model on masked language modelling, to test if weights are loaded correctly

In [7]:
#masked_sentence = sentence_list[0].replace('report', '[MASK]')
masked_sentence = "there is diffuse consolidation in the right lung, indicative of <mask>"
print(masked_sentence)
tokens = tokenizer1(masked_sentence, padding=True, truncation=True, return_tensors='pt')
print(tokens)
with torch.no_grad():
    output_full = full_model1(**tokens, output_hidden_states=True)
    output_base = base_model1(**tokens, output_hidden_states=True)
print(output_full)
print(output_base)

there is diffuse consolidation in the right lung, indicative of <mask>
{'input_ids': tensor([[    0,  8585,    16, 41118, 13581,    11,     5,   235, 10665,     6, 22206,     9, 50264,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
MaskedLMOutput(loss=None, logits=tensor([[[ 0.5278, -3.7901, -0.3916,  ..., -5.2278, -4.5390, -3.9142],
         [ 0.4206, -2.5342,  0.7946,  ..., -2.0640, -0.8946, -2.7140],
         [ 0.4204, -3.2287,  0.0934,  ..., -0.0249, -1.7986, -2.1001],
         ...,
         [-0.1570, -4.3633, -0.6012,  ..., -3.5548, -3.3828,  0.3929],
         [ 2.9478, -3.4692,  5.5250,  ..., -4.2230, -5.3847,  1.3726],
         [ 0.4920, -3.7886, -0.3946,  ..., -5.2342, -4.5446, -3.9283]]]), hidden_states=(tensor([[[ 0.1158, -0.0210, -0.0104,  ..., -0.0483,  0.0827,  0.0521],
         [-0.4306, -0.0558,  0.1098,  ..., -0.1587, -0.4074,  0.0425],
         [ 0.0046,  0.2573,  0.0362,  ...,  0.0495, -0.2659, -0.1285],
         ...,
         [-0.23

In [8]:
logits_before_softmax = output_full.logits
print(logits_before_softmax.size())
prediction = logits_before_softmax[0].argmax(axis=-1)
print(prediction)
print(tokenizer1.decode(prediction))

torch.Size([1, 14, 50265])
tensor([    4,  8585,    16, 41118, 13581,    11,     5,   235, 10665,     6, 22206,     9,  1437,     4])
.there is diffuse consolidation in the right lung, indicative of.


In [9]:
print(len(output_full.hidden_states))
last_hidden_state = output_full.hidden_states[-1]
print(last_hidden_state)
print(last_hidden_state.mean(dim=1).size())
print(last_hidden_state.squeeze()[0, :])

13
tensor([[[-0.1256,  0.0642, -0.0674,  ..., -0.2878, -0.0828, -0.0428],
         [ 0.2194, -0.1838,  0.1381,  ..., -0.0063, -0.1952,  0.0487],
         [ 0.0841,  0.3627,  0.2097,  ..., -0.4123, -0.0723, -0.0095],
         ...,
         [ 0.1294, -0.0459,  0.1020,  ..., -0.1478,  0.1431,  0.0026],
         [ 0.2263, -0.1205,  0.0422,  ..., -0.4867, -0.1622,  0.1561],
         [-0.1260,  0.0641, -0.0675,  ..., -0.2884, -0.0831, -0.0429]]])
torch.Size([1, 768])
tensor([-1.2561e-01,  6.4220e-02, -6.7388e-02, -4.8061e-02,  2.7138e-01, -1.4991e-01,  7.8765e-03, -1.2044e-02,  4.6624e-02, -1.3837e-01,  2.4973e-02, -2.1923e-02, -3.7160e-02, -1.3059e-01,
        -7.0208e-02, -9.2608e-03,  4.4490e-02,  3.9585e-02, -1.6303e-02, -2.8625e-02, -1.0248e-01,  1.2490e-01,  1.0356e-01,  9.4474e-02,  9.8920e-02,  3.5897e-02,  1.1930e-01,  3.1564e-02,
         1.5390e-01, -3.1594e-02, -1.0037e-01, -2.5524e-02,  7.7781e-02, -5.3618e-02, -2.0454e-02, -5.1781e-02,  7.0981e-02, -1.2935e-02, -1.3886e-02,  2.

In [10]:
embeddings_pipeline = pipeline('feature-extraction', tokenizer=tokenizer1, model=base_model1)
embedding_output = embeddings_pipeline(masked_sentence)

In [11]:
print(embedding_output[0][0])
print(last_hidden_state.squeeze()[0, :])
print(len(embedding_output[0][0]))
print(len(last_hidden_state.squeeze()[0, :]))
print(last_hidden_state.squeeze()[0, :] == embedding_output[0][0])

[-0.12561452388763428, 0.06421953439712524, -0.06738809496164322, -0.04806055873632431, 0.2713826298713684, -0.14991183578968048, 0.007876463234424591, -0.01204390823841095, 0.046623922884464264, -0.13837450742721558, 0.024972612038254738, -0.021923096850514412, -0.0371602401137352, -0.13058960437774658, -0.0702078714966774, -0.00926084816455841, 0.044490233063697815, 0.03958526998758316, -0.016303004696965218, -0.02862510457634926, -0.10248009860515594, 0.12489702552556992, 0.10355852544307709, 0.09447358548641205, 0.09892034530639648, 0.03589686378836632, 0.11930345743894577, 0.03156440705060959, 0.15389728546142578, -0.03159384801983833, -0.1003681942820549, -0.025524117052555084, 0.07778114080429077, -0.053618062287569046, -0.02045433409512043, -0.05178115889430046, 0.0709814578294754, -0.012935377657413483, -0.01388633158057928, 0.026556264609098434, -0.10963022708892822, 0.010161401703953743, 0.1659633368253708, -0.042969826608896255, 0.0366642102599144, -0.026049578562378883, 0.

## Cosine matrix ##
Checking the cosine matrix between embeddings

In [25]:
def last_hidden_layers(tokenizer, model, sentence_list):
    last_hidden_states, sentence_embeddings = list(), list()
    with torch.no_grad():
        for s in sentence_list:
            tokens = tokenizer(s, return_tensors='pt', padding=True, truncation=True)
            output = model(**tokens, output_hidden_states=True)
            last_hidden_state = output.hidden_states[-1]
            last_hidden_states.append(last_hidden_state)
            sentence_embeddings.append(last_hidden_state.squeeze()[0, :])
    return last_hidden_states, sentence_embeddings

def calc_cosine_sim_matrix(sentence_embeddings):
    stacked_sentence_embeddings = torch.stack(sentence_embeddings)
    # Calculate the cosine similarity matrix
    cosine_sim_matrix = F.cosine_similarity(stacked_sentence_embeddings.unsqueeze(1), stacked_sentence_embeddings.unsqueeze(0), dim=2)
    return stacked_sentence_embeddings, cosine_sim_matrix

In [5]:
last_hidden_states, sentence_embeddings = last_hidden_layers(tokenizer1, full_model1, sentence_list)
stacked_sentence_embeddings, cosine_sim_matrix = calc_cosine_sim_matrix(sentence_embeddings)
print(stacked_sentence_embeddings.size())
print(cosine_sim_matrix)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


torch.Size([9, 768])
tensor([[1.0000, 0.9866, 0.9625, 0.9575, 0.3582, 0.5197, 0.5173, 0.5045, 0.4987],
        [0.9866, 1.0000, 0.9430, 0.9640, 0.3649, 0.5329, 0.5076, 0.4981, 0.5171],
        [0.9625, 0.9430, 1.0000, 0.9868, 0.3074, 0.5465, 0.5548, 0.4700, 0.4553],
        [0.9575, 0.9640, 0.9868, 1.0000, 0.3147, 0.5630, 0.5479, 0.4679, 0.4772],
        [0.3582, 0.3649, 0.3074, 0.3147, 1.0000, 0.2991, 0.2910, 0.3350, 0.3374],
        [0.5197, 0.5329, 0.5465, 0.5630, 0.2991, 1.0000, 0.9799, 0.8998, 0.9070],
        [0.5173, 0.5076, 0.5548, 0.5479, 0.2910, 0.9799, 1.0000, 0.9021, 0.8684],
        [0.5045, 0.4981, 0.4700, 0.4679, 0.3350, 0.8998, 0.9021, 1.0000, 0.9764],
        [0.4987, 0.5171, 0.4553, 0.4772, 0.3374, 0.9070, 0.8684, 0.9764, 1.0000]])


## More tests ##

In [8]:
sentence1_base = "A <SizeModifier> <AbnormalReport> can be seen in the report in the <LocationModifier> part"
sentence2_base = "The report shows a <SizeModifier> <LocationModifier> <AbnormalReport>"
size_modifiers = ['small', 'large']
loc_modifiers = ['upper-left', 'lower-left', 'right-sided', 'left-sided']
abnormal_report = ['pleural effusion']

In [9]:
l1 = [sentence1_base.replace('<SizeModifier>', size_mod).replace('<LocationModifier>', loc_mod).replace('<AbnormalReport>', ab_rep) for size_mod, loc_mod, ab_rep in product(size_modifiers, loc_modifiers, abnormal_report)]
l2 = [sentence2_base.replace('<SizeModifier>', size_mod).replace('<LocationModifier>', loc_mod).replace('<AbnormalReport>', ab_rep) for size_mod, loc_mod, ab_rep in product(size_modifiers, loc_modifiers, abnormal_report)]

In [10]:
print('\n'.join(l1))
print('\n'.join(l2))

A small pleural effusion can be seen in the report in the upper-left part
A small pleural effusion can be seen in the report in the lower-left part
A small pleural effusion can be seen in the report in the right-sided part
A small pleural effusion can be seen in the report in the left-sided part
A large pleural effusion can be seen in the report in the upper-left part
A large pleural effusion can be seen in the report in the lower-left part
A large pleural effusion can be seen in the report in the right-sided part
A large pleural effusion can be seen in the report in the left-sided part
The report shows a small upper-left pleural effusion
The report shows a small lower-left pleural effusion
The report shows a small right-sided pleural effusion
The report shows a small left-sided pleural effusion
The report shows a large upper-left pleural effusion
The report shows a large lower-left pleural effusion
The report shows a large right-sided pleural effusion
The report shows a large left-sid

In [11]:
last_hidden_states2, sentence_embeddings2 = last_hidden_layers(tokenizer1, full_model1, l1 + l2)
stacked_sentence_embeddings2, cosine_sim_matrix2 = calc_cosine_sim_matrix(sentence_embeddings2)
print(stacked_sentence_embeddings2.size())
print(cosine_sim_matrix2)

torch.Size([16, 768])
tensor([[1.0000, 0.9939, 0.9738, 0.9727, 0.9814, 0.9744, 0.9425, 0.9432, 0.8992, 0.8797, 0.8318, 0.8322, 0.8439, 0.8331, 0.7636, 0.7699],
        [0.9939, 1.0000, 0.9759, 0.9767, 0.9770, 0.9818, 0.9459, 0.9482, 0.8996, 0.8961, 0.8391, 0.8415, 0.8439, 0.8480, 0.7703, 0.7784],
        [0.9738, 0.9759, 1.0000, 0.9968, 0.9687, 0.9695, 0.9813, 0.9795, 0.8847, 0.8746, 0.8635, 0.8601, 0.8470, 0.8439, 0.8092, 0.8116],
        [0.9727, 0.9767, 0.9968, 1.0000, 0.9668, 0.9695, 0.9773, 0.9818, 0.8878, 0.8799, 0.8624, 0.8694, 0.8481, 0.8477, 0.8057, 0.8180],
        [0.9814, 0.9770, 0.9687, 0.9668, 1.0000, 0.9940, 0.9737, 0.9731, 0.8973, 0.8792, 0.8337, 0.8317, 0.8862, 0.8733, 0.8094, 0.8125],
        [0.9744, 0.9818, 0.9695, 0.9695, 0.9940, 1.0000, 0.9754, 0.9765, 0.8964, 0.8942, 0.8399, 0.8401, 0.8845, 0.8870, 0.8148, 0.8199],
        [0.9425, 0.9459, 0.9813, 0.9773, 0.9737, 0.9754, 1.0000, 0.9968, 0.8718, 0.8631, 0.8545, 0.8488, 0.8784, 0.8734, 0.8460, 0.8449],
        [0.9

In [12]:
negative_sentences = ['The report shows no pleural effusion', 'The report shows no consolidation on any side']
all_sentence_list = l1 + l2 + negative_sentences + sentence_list[4:]
print('\n'.join(all_sentence_list))

A small pleural effusion can be seen in the report in the upper-left part
A small pleural effusion can be seen in the report in the lower-left part
A small pleural effusion can be seen in the report in the right-sided part
A small pleural effusion can be seen in the report in the left-sided part
A large pleural effusion can be seen in the report in the upper-left part
A large pleural effusion can be seen in the report in the lower-left part
A large pleural effusion can be seen in the report in the right-sided part
A large pleural effusion can be seen in the report in the left-sided part
The report shows a small upper-left pleural effusion
The report shows a small lower-left pleural effusion
The report shows a small right-sided pleural effusion
The report shows a small left-sided pleural effusion
The report shows a large upper-left pleural effusion
The report shows a large lower-left pleural effusion
The report shows a large right-sided pleural effusion
The report shows a large left-sid

In [13]:
last_hidden_states3, sentence_embeddings3 = last_hidden_layers(tokenizer1, full_model1, all_sentence_list)
stacked_sentence_embeddings3, cosine_sim_matrix3 = calc_cosine_sim_matrix(sentence_embeddings3)
print(stacked_sentence_embeddings3.size())
print(cosine_sim_matrix3)

torch.Size([23, 768])
tensor([[1.0000, 0.9939, 0.9738, 0.9727, 0.9814, 0.9744, 0.9425, 0.9432, 0.8992, 0.8797, 0.8318, 0.8322, 0.8439, 0.8331, 0.7636, 0.7699, 0.8136, 0.6586, 0.4369, 0.5004, 0.5003, 0.5375, 0.5245],
        [0.9939, 1.0000, 0.9759, 0.9767, 0.9770, 0.9818, 0.9459, 0.9482, 0.8996, 0.8961, 0.8391, 0.8415, 0.8439, 0.8480, 0.7703, 0.7784, 0.8132, 0.6683, 0.4246, 0.5137, 0.5105, 0.5444, 0.5356],
        [0.9738, 0.9759, 1.0000, 0.9968, 0.9687, 0.9695, 0.9813, 0.9795, 0.8847, 0.8746, 0.8635, 0.8601, 0.8470, 0.8439, 0.8092, 0.8116, 0.8138, 0.6488, 0.4411, 0.5290, 0.5307, 0.5475, 0.5339],
        [0.9727, 0.9767, 0.9968, 1.0000, 0.9668, 0.9695, 0.9773, 0.9818, 0.8878, 0.8799, 0.8624, 0.8694, 0.8481, 0.8477, 0.8057, 0.8180, 0.8114, 0.6476, 0.4408, 0.5408, 0.5315, 0.5489, 0.5474],
        [0.9814, 0.9770, 0.9687, 0.9668, 1.0000, 0.9940, 0.9737, 0.9731, 0.8973, 0.8792, 0.8337, 0.8317, 0.8862, 0.8733, 0.8094, 0.8125, 0.7998, 0.6732, 0.4113, 0.5479, 0.5546, 0.5371, 0.5188],
        

## Analysing Cleaned Report Dataset ##

The cleaned data is stored in /models_common_e2e/data/cxr_data/text_reports_cleaned. A sample data is as follows:

### File: ca.phase2.unit1.f1.ff5fe73b057f0093a1f5d32a281dd63c2255eaa60aa518c3b909e1a5.txt ###

```
 xr- chest pa  view
 findings
 lungs: normal.
 trachea: normal.
 carina: normal.
 right and left main bronchi: normal.
 pleura: normal.
 heart: normal.
 right heart border: normal.
 left heart border: normal.
 pulmonary bay: normal.
 pulmonary hila: normal.
 aorta: normal.
 thoracic spine: normal.
 other visualized bones: normal.
 visualized soft tissues: normal.
 diaphragm: normal.
 visualized abdomen:  normal.
 visualized neck: normal.
```

### File max.dev3.106027018.txt ###

```
6191206|3862169|x-ray chest pa/ap view of 09-feb-2018:
results:
post cabg status.
no focal lesion seen in the lung parenchyma.
cp angles and domes of the diaphragm are normal.
both hila are normal. pulmonary vasculature is normal.
cardiac size and configuration is normal.
trachea is central; no mediastinal shift is seen.
bony thorax and soft tissues of the chest wall are normal.
impression: no abnormality detected in the view obtained.
```

### File medall.1.2.826.0.1.3680043.8.437.1.2.2.0.7744.1446378402.18241.txt###
```
x-ray chest (pa view)
the cardio thoracic ratio is normal.
the heart size and configuration are within normal limits.
the aortic arch is normal.
the lung fields show normal broncho-vascular markings.
both the pulmonary hila are normal in size.
the costophrenic and cardiophrenic recesses and the domes of
diaphragm are normal.
the bones and soft tissues of the chest wall show no abnormality.
impression : normal study.
dr.shakthi kumar
radiologist
ss
________________________________________________________
```

In [5]:
dataset1_template = """xr- chest pa  view
findings
lungs: normal.
trachea: normal.
carina: normal.
right and left main bronchi: normal.
pleura: normal.
heart: normal.
right heart border: normal.
left heart border: normal.
pulmonary bay: normal.
pulmonary hila: normal.
aorta: normal.
thoracic spine: normal.
other visualized bones: normal.
visualized soft tissues: normal.
diaphragm: normal.
visualized abdomen:  normal.
visualized neck: normal."""

In [6]:
dataset1_pleural_issue="""
xr- chest pa view
findings
lungs: normal.
trachea: normal.
carina: normal.
right and left main bronchi: normal.
pleura: left costophrenic angle is blunted with thin stripe of homogenous opacity along left lateral chest wall.
heart: normal.
right heart border: normal.
left heart border: normal.
pulmonary bay: normal.
pulmonary hila: normal.
aorta: normal.
thoracic spine: normal.
other visualized bones: normal.
visualized soft tissues: normal.
diaphragm: normal.
visualized abdomen:  normal.
visualized neck: normal."""

In [7]:
dataset2_template="""6191206|3862169|x-ray chest pa/ap view of 09-feb-2018:
results:
post cabg status.
no focal lesion seen in the lung parenchyma.
cp angles and domes of the diaphragm are normal.
both hila are normal. pulmonary vasculature is normal.
cardiac size and configuration is normal.
trachea is central; no mediastinal shift is seen.
bony thorax and soft tissues of the chest wall are normal.
impression: no abnormality detected in the view obtained.
"""

In [8]:
dataset3_template="""
x-ray chest (pa view)
the cardio thoracic ratio is normal.
the heart size and configuration are within normal limits.
the aortic arch is normal.
the lung fields show normal broncho-vascular markings.
both the pulmonary hila are normal in size.
the costophrenic and cardiophrenic recesses and the domes of
diaphragm are normal.
the bones and soft tissues of the chest wall show no abnormality.
impression : normal study.
dr.shakthi kumar
radiologist
ss
________________________________________________________
"""

In [18]:
last_hidden_states4, sentence_embeddings4 = last_hidden_layers(tokenizer1, full_model1, [dataset1_template, dataset1_pleural_issue, dataset2_template, dataset3_template])
stacked_sentence_embeddings4, cosine_sim_matrix4 = calc_cosine_sim_matrix(sentence_embeddings4)
print(stacked_sentence_embeddings4.size())
print(cosine_sim_matrix4)

torch.Size([4, 768])
tensor([[1.0000, 0.9347, 0.5985, 0.5178],
        [0.9347, 1.0000, 0.6101, 0.5002],
        [0.5985, 0.6101, 1.0000, 0.5870],
        [0.5178, 0.5002, 0.5870, 1.0000]])


## New Embeddings ##
Instead of taking sentence embedding to be the vector in final hidden layer corresponding to SOS or EOS, the embedding can be taken for MASK token in the promt 
```
Given the following report: <REPORT>, the final diagnoisis of the patient is <MASK>
```

In [22]:
summarization_prompt = "Report: <REPORT>. The report shows " + "[MASK] " + "on the right side"

def last_hidden_layers2(tokenizer, model, sentence_list):
    last_hidden_states, sentence_embeddings = list(), list()
    with torch.no_grad():
        for s in sentence_list:
            s = summarization_prompt.replace("<REPORT>", s)
            tokens = tokenizer(s, return_tensors='pt', padding=True, truncation=True)
            output = model(**tokens, output_hidden_states=True)
            last_hidden_state = output.hidden_states[-1]
            last_hidden_states.append(last_hidden_state)
            sentence_embeddings.append(last_hidden_state.squeeze()[-2, :])
    return last_hidden_states, sentence_embeddings


In [23]:
masked_prompt = summarization_prompt.replace("<REPORT>", "The report shows small right-sided pleural effusion")
print(masked_prompt)
tokens = tokenizer1(masked_prompt, return_tensors='pt', padding=True, truncation=True)
with torch.no_grad():
    output = full_model1(**tokens)
print(output)

logits_before_softmax = output.logits
prediction = logits_before_softmax[0].argmax(axis=-1)
print(tokenizer1.decode(prediction))


Report: The report shows small right-sided pleural effusion. The report shows [MASK] on the right side
MaskedLMOutput(loss=None, logits=tensor([[[ -9.5234,  -7.7479,  -8.1387,  ...,  -8.9454,  -8.1242,  -8.0690],
         [-10.7803,  -8.8624,  -9.1088,  ...,  -9.6556,  -9.7797,  -9.9205],
         [-12.2636, -10.6406, -11.3034,  ..., -11.3265, -10.7345, -11.0552],
         ...,
         [-10.4796, -12.3347, -12.8499,  ..., -12.3471, -11.9310, -11.8114],
         [-10.7115, -10.3418,  -9.7652,  ..., -10.3161, -10.3574, -10.0931],
         [ -9.5234,  -7.7479,  -8.1387,  ...,  -8.9454,  -8.1242,  -8.0690]]]), hidden_states=None, attentions=None)
p report : the report shows small right - sided pleural effusion. the report shows changes on the right side p


In [26]:
last_hidden_states5, sentence_embeddings5 = last_hidden_layers2(tokenizer1, full_model1, [dataset1_template, dataset1_pleural_issue, dataset2_template, dataset3_template])
stacked_sentence_embeddings5, cosine_sim_matrix5 = calc_cosine_sim_matrix(sentence_embeddings5)
print(stacked_sentence_embeddings5.size())
print(cosine_sim_matrix5)

torch.Size([4, 768])
tensor([[1.0000, 0.9428, 0.8330, 0.8657],
        [0.9428, 1.0000, 0.8673, 0.8997],
        [0.8330, 0.8673, 1.0000, 0.9056],
        [0.8657, 0.8997, 0.9056, 1.0000]])


## Using BioMedLM ##
Using BioMedLM instead of RadBert model for report embeddings

https://huggingface.co/stanford-crfm/BioMedLM

In [32]:
checkpoint2 = "stanford-crfm/BioMedLM"

tokenizer2 = AutoTokenizer.from_pretrained(checkpoint2)
full_model2 = AutoModelForCausalLM.from_pretrained(checkpoint2)
base_model2 = AutoModel.from_pretrained(checkpoint2)

## Sanity Checks for BioMedLM ##

In [35]:
summarization_prompt_gpt = "Report: \n<REPORT>\n Impression: "


masked_sentence = summarization_prompt_gpt.replace('<REPORT>', sentence_list[0])
print(masked_sentence)
tokens = tokenizer2(masked_sentence, return_tensors='pt')
print(tokens)
with torch.no_grad():
    output_full = full_model2(**tokens, output_hidden_states=True)
    output_base = base_model2(**tokens, output_hidden_states=True)

Report: 
The report shows small right-sided pleural effusion
 Impression: 
{'input_ids': tensor([[ 4848,   607,    25,   220,   198,   714,  1492,  3410,  1409,  2497,    12, 12712,  9438,  9814,   198,  2472,   680,    25,   220]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [38]:
print(output_full)
print(output_full.logits.size())
print(len(output_full.hidden_states))
print(output_full.hidden_states[-1].size())

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[  3.0114,   4.2856,   2.6358,  ...,  -1.1197,   0.0818,  -7.1363],
         [ -5.8972,  -4.7492,  -6.2327,  ..., -18.1685, -13.5248, -19.9139],
         [-11.4287,  -8.4990, -11.2120,  ..., -20.2264, -11.8382, -23.7235],
         ...,
         [  9.9027,   6.5636,   5.6208,  ...,  -0.8483,   0.8190,  -6.8225],
         [ -1.3679,  -2.2083,  -5.4045,  ..., -10.0803,  -7.4748, -15.2467],
         [  8.0426,   2.1764,   1.2141,  ...,  -6.7583,  -3.9359, -11.8822]]]), past_key_values=None, hidden_states=(tensor([[[ 0.0443,  0.0031, -0.0161,  ..., -0.0163,  0.0185, -0.0165],
         [-0.0081,  0.0122,  0.0081,  ...,  0.0247, -0.0038,  0.0047],
         [-0.0087,  0.0375, -0.0105,  ...,  0.0159, -0.0046,  0.0343],
         ...,
         [-0.0233,  0.0070,  0.0253,  ...,  0.0025, -0.0032, -0.0032],
         [-0.0114,  0.0267, -0.0103,  ...,  0.0066,  0.0012,  0.0394],
         [-0.0088,  0.0064,  0.0056,  ..., -0.0088,  0.0268,  0

In [40]:
print(output_base)
print(output_base.last_hidden_state.size())
print(len(output_base.hidden_states))
print(output_base.hidden_states[-1].size())

BaseModelOutputWithPastAndCrossAttentions(last_hidden_state=tensor([[[-0.7844,  2.4187, -0.7710,  ...,  0.5041,  0.4723, -0.2954],
         [ 0.6280,  0.0158, -0.7458,  ...,  1.3180, -0.4291, -0.7334],
         [ 0.1725, -0.6851,  0.2948,  ...,  1.0713, -3.1548, -1.3083],
         ...,
         [ 0.5096, -0.4049,  2.1805,  ...,  0.6665, -0.5261,  0.9300],
         [-1.3047,  1.9672,  2.3080,  ..., -0.9352, -2.3571,  1.1515],
         [-0.3793,  2.5821,  0.4518,  ..., -1.0776, -1.4675, -0.2161]]]), past_key_values=None, hidden_states=(tensor([[[ 0.0443,  0.0031, -0.0161,  ..., -0.0163,  0.0185, -0.0165],
         [-0.0081,  0.0122,  0.0081,  ...,  0.0247, -0.0038,  0.0047],
         [-0.0087,  0.0375, -0.0105,  ...,  0.0159, -0.0046,  0.0343],
         ...,
         [-0.0233,  0.0070,  0.0253,  ...,  0.0025, -0.0032, -0.0032],
         [-0.0114,  0.0267, -0.0103,  ...,  0.0066,  0.0012,  0.0394],
         [-0.0088,  0.0064,  0.0056,  ..., -0.0088,  0.0268,  0.0490]]]), tensor([[[ 0.1545

In [42]:
logits_before_softmax = output_full.logits
#print(logits_before_softmax)
print(logits_before_softmax.size())
prediction = logits_before_softmax[0].argmax(axis=-1)
print(prediction)
print(tokenizer2.decode(prediction))

torch.Size([1, 19, 28896])
tensor([12555,   293,     9,    59,   198,  1512,   273,   380,   678,    12,  1761,  9438,  9814,   319,   198,  3117,    25, 20933,   198])
presenting*\
 aim of that but-to pleural effusion with
mediate: Ple



## Using pipeline for Text Generation ##
Using pipeline to generate impressions

In [43]:
generation_pipeline = pipeline('text-generation', model=full_model2, tokenizer=tokenizer2)

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [44]:
output_pipeline = generation_pipeline(masked_sentence)

Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


In [45]:
print(output_pipeline)

[{'generated_text': 'Report: \nThe report shows small right-sided pleural effusion\n Impression: \nCT scans of the patient revealed small right-sided pleural effusion and atelectasis of the right lower lobe. There was small nodule in the right'}]


In [48]:
batched_reports = [summarization_prompt_gpt.replace('<REPORT>', report) for report in [dataset1_template, dataset1_pleural_issue, dataset2_template, dataset3_template]]
output_pipeline2 = generation_pipeline(batched_reports, max_new_tokens=30)

Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:28895 for open-end generation.


In [49]:
print(output_pipeline2)

[[{'generated_text': 'Report: \nxr- chest pa  view\nfindings\nlungs: normal.\ntrachea: normal.\ncarina: normal.\nright and left main bronchi: normal.\npleura: normal.\nheart: normal.\nright heart border: normal.\nleft heart border: normal.\npulmonary bay: normal.\npulmonary hila: normal.\naorta: normal.\nthoracic spine: normal.\nother visualized bones: normal.\nvisualized soft tissues: normal.\ndiaphragm: normal.\nvisualized abdomen:  normal.\nvisualized neck: normal.\n Impression: \\`This patient has a 3rd degree tear, but no loss of substance is seen to the right eye. The conjunctiva is intact and red.'}], [{'generated_text': 'Report: \n\nxr- chest pa view\nfindings\nlungs: normal.\ntrachea: normal.\ncarina: normal.\nright and left main bronchi: normal.\npleura: left costophrenic angle is blunted with thin stripe of homogenous opacity along left lateral chest wall.\nheart: normal.\nright heart border: normal.\nleft heart border: normal.\npulmonary bay: normal.\npulmonary hila: normal