# Fine-tuning RadBert #
Full-parameter fine-tuning of the RadBERT model (the RadBERT-RoBERTa-4m model from the RadBERT paper). Fine-tuning is carried out on multi-label, multi-class classification of reports, where each report can have multiple labels (for example, a report can have both the label consolidation-right and the label consolidation-2).

In [None]:
import os
import datetime

import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Widen pandas and torch console output so wide frames/tensors wrap less.
for option_name, option_value in (
    ('display.max_rows', 50),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

torch.set_printoptions(linewidth=200)

In [3]:
# Choose the compute backend: CUDA if present, otherwise MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


## Forward pass ##
Implementing the RadBERTMultiClassMultiLabel PyTorch model

In [4]:
class RadBERTMultiClassMultiLabel(nn.Module):
    """
    RadBERTMultiClassMultiLabel: Model expects batches of natural language sentences, will
    classify reports with multiple label

    The module bundles a pretrained tokenizer and transformer encoder (both
    loaded from ``checkpoint``) with a single linear classification layer.

    Args:
        num_classes: Number of output logits produced per input text.
        checkpoint: Identifier or path handed to ``from_pretrained`` for both
            the tokenizer and the encoder.
    """
    def __init__(self, num_classes, checkpoint):
        super().__init__()
        self.num_classes = num_classes
        self.checkpoint = checkpoint
        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.transformer_encoder = AutoModel.from_pretrained(self.checkpoint)
        self.transformer_encoder_hidden_size = self.transformer_encoder.config.hidden_size
        self.linear_classifier = nn.Linear(self.transformer_encoder_hidden_size, self.num_classes)

    def forward(self, x):
        """
        Tokenize raw text, encode it and return one row of logits per input.

        Args:
            x: A string or a batch (sequence) of strings.

        Returns:
            Tensor of raw (un-normalised) logits with ``num_classes`` columns.
        """
        tokenized_inp = self.tokenizer(x, padding=True, truncation=True, return_tensors='pt')
        # The tokenizer always creates CPU tensors. Move them next to the encoder
        # weights so the module keeps working after `.to("cuda")` / `.to("mps")`.
        model_device = next(self.transformer_encoder.parameters()).device
        tokenized_inp = {name: tensor.to(model_device) for name, tensor in tokenized_inp.items()}
        encoder_out = self.transformer_encoder(**tokenized_inp)
        # Classify from the hidden state at the first token position of each sequence.
        logits = self.linear_classifier(encoder_out.last_hidden_state[:, 0, :])
        return logits


In [5]:
# Checkpoint identifier passed to `from_pretrained` for the tokenizer and the encoder.
checkpoint = 'UCSD-VA-health/RadBERT-RoBERTa-4m'
#radbert_multi_model = RadBERTMultiClassMultiLabel(321, checkpoint).to(device)
# NOTE(review): the model is built with 322 output classes and is NOT moved to
# `device` here, so it stays on the CPU — confirm both are intended.
radbert_multi_model = RadBERTMultiClassMultiLabel(322, checkpoint)

Some weights of RobertaModel were not initialized from the model checkpoint at UCSD-VA-health/RadBERT-RoBERTa-4m and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Show the module tree, followed by the shape of every parameter tensor.
print(radbert_multi_model)
print([param.shape for param in radbert_multi_model.parameters()])

RadBERTMultiClassMultiLabel(
  (transformer_encoder): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
      

## Custom Loss function ##
Custom loss function for multi class multi label classification to handle uncertain tags (tags have value 0 -> absent, 1 -> present, -100 -> uncertain)

In [7]:
class MultiClassMultiLabel(nn.Module):
    """
    Multi-label soft-margin loss that gives zero weight to uncertain targets.

    Args:
        uncertain_label: Target value marking an entry whose loss term is
            multiplied by zero.
    """

    def __init__(self, uncertain_label):
        super().__init__()
        self.uncertain_label = uncertain_label

    def forward(self, output, target):
        """Return the soft-margin loss of ``output`` against ``target`` with uncertain entries weighted 0."""
        # 1.0 where the target is a real label, 0.0 where it equals the uncertain marker.
        keep_weights = target.ne(self.uncertain_label).to(torch.float)
        criterion = nn.MultiLabelSoftMarginLoss(weight=keep_weights)
        return criterion(output, target)

In [8]:
# Loss instance used throughout the notebook; -100 is the "uncertain" tag value.
multiclass_multilabel_loss = MultiClassMultiLabel(uncertain_label=-100)

### Testing MultiClassMultiLabel loss function ###

In [9]:
# Two samples x five classes of hand-picked logits.
logit_tensor = torch.tensor(
    [
        [-1.0, 2.0, 1.0, 5.0, -3.0],
        [4.0, -2.0, 1.0, -1.0, 2.5],
    ],
    dtype=torch.float32,
)
# Matching targets: 0 = absent, 1 = present, -100 = uncertain.
target_tensor_act = torch.tensor(
    [
        [0, 1, -100, 0, 0],
        [1, -100, 0, 0, 1],
    ],
    dtype=torch.float32,
)

In [10]:
# Inline version of the masked loss, kept for reference:
#certain_mask = (target_tensor_act != -100)
#print(certain_mask.type(torch.float))
#loss_func = nn.MultiLabelSoftMarginLoss(weight=(certain_mask).type(torch.float))
#print(loss_func(logit_tensor, target_tensor_act))
sample_loss = multiclass_multilabel_loss(logit_tensor, target_tensor_act)
print(sample_loss)

tensor(0.7219)


In [11]:
import math
from math import log, exp

def log_sigmoid(x):
    """
    Return log(sigmoid(x)) = -log(1 + exp(-x)) for a real scalar ``x``.

    The branch on the sign keeps the argument of ``exp`` non-positive, so the
    call cannot overflow for large negative ``x`` (the naive
    ``log(1 / (1 + exp(-x)))`` raises ``OverflowError`` there).
    """
    if x >= 0:
        return -math.log1p(math.exp(-x))
    # For x < 0: log(sigmoid(x)) = x - log(1 + exp(x)).
    return x - math.log1p(math.exp(x))

In [12]:
# Hand-computed reference for the masked loss on the two sample rows above.
# Each term is weight * log_sigmoid(z): z is the logit when the target is 1 and
# the negated logit when the target is 0; the weight is 0.0 for the uncertain
# (-100) entry and 1.0 otherwise. Each row is averaged over all 5 classes.
loss1 = -1 * (1.0 * log_sigmoid(1.0) + 1.0 * log_sigmoid(2.0) + 0.0 * log_sigmoid(-1.0) + 1.0 * log_sigmoid(-5.0) + 1.0 * log_sigmoid(3.0)) / 5.0
loss2 = -1 * (1.0 * log_sigmoid(4.0) + 0.0 * log_sigmoid(2.0) + 1.0 * log_sigmoid(-1.0) + 1.0 * log_sigmoid(1.0) + 1.0 * log_sigmoid(2.5)) / 5.0

print(loss1)
print(loss2)
# Mean over the two rows; this is the number to compare with the loss module's output.
print((loss1 + loss2)/2.0)

1.0990984797248111
0.34471260744936094
0.721905543587086


## Custom DataLoader and Dataset ##
Read csv file, get the report from path and prepare the data

In [13]:
class ReportTagsDataset(Dataset):
    """
    Dataset pairing the text of a report with its row of tag values.

    The CSV's first column names the report (only the part after the last
    ``/`` is used, with ``.txt`` appended); every remaining column is a tag
    value. The first column is renamed to ``filename``.

    Args:
        tags_csv_file: Path of the CSV holding report names and tag columns.
        report_base_path: Directory that contains the ``<name>.txt`` reports.
        text_transform: Optional callable applied to the report text.
        target_transform: Optional callable applied to the target tensor.
    """
    def __init__(self, tags_csv_file, report_base_path, text_transform=None, target_transform=None):
        self.report_base_path = report_base_path
        self.tags_csv_file = tags_csv_file

        self.tags_df = pd.read_csv(self.tags_csv_file)
        self.column_names = list(self.tags_df.columns.values)
        self.column_names[0] = 'filename'
        self.tags_df.columns = self.column_names

        self.text_transform = text_transform
        self.target_transform = target_transform

    def __len__(self):
        """Return the number of rows (reports) in the tags CSV."""
        return self.tags_df.shape[0]

    def __getitem__(self, index):
        """
        Return ``(report_text, target)`` for the row at position ``index``.

        Raises:
            FileNotFoundError: If the report's ``.txt`` file is missing from
                ``report_base_path``.
        """
        row = self.tags_df.iloc[index]
        report_name = row.iloc[0].split('/')[-1] + '.txt'
        report_path = os.path.join(self.report_base_path, report_name)
        # Context manager so the handle is closed promptly instead of leaking
        # one open file per sample.
        with open(report_path) as report_file:
            report_text = report_file.read()
        target_list = torch.Tensor([float(value) for value in row.iloc[1:]])

        # Honour the optional transforms that the constructor accepts.
        if self.text_transform is not None:
            report_text = self.text_transform(report_text)
        if self.target_transform is not None:
            target_list = self.target_transform(target_list)
        return report_text, target_list

In [14]:
#report_base_path = "/models_common_e2e/cxr_data/reports/training"
# Directories holding the report .txt files for each split.
train_reports_base_path = '/home/users/pranav.rao/MiniTasks/Radbert/data/train'
test_reports_base_path = '/home/users/pranav.rao/MiniTasks/Radbert/data/test'
# CSV files listing report names with their tag columns.
train_tags_file = '/home/users/pranav.rao/Downloads/report_tags_25k_train.csv'
# NOTE(review): this is the same CSV as train_tags_file — confirm whether a
# separate test-split CSV was intended here.
test_tags_file = '/home/users/pranav.rao/Downloads/report_tags_25k_train.csv'

In [15]:
# Build one dataset per split from its tags CSV and reports directory.
train_data = ReportTagsDataset(
    tags_csv_file=train_tags_file,
    report_base_path=train_reports_base_path,
)
test_data = ReportTagsDataset(
    tags_csv_file=test_tags_file,
    report_base_path=test_reports_base_path,
)

In [16]:
# Shuffled loaders yielding batches of 32 (report text, target) pairs.
train_dataloader = DataLoader(dataset=train_data, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=test_data, batch_size=32, shuffle=True)

### Testing Dataset and DataLoader ###

In [17]:
# Pull a single batch from the training loader: report texts and their label rows.
first_batch = next(iter(train_dataloader))
train_features, train_labels = first_batch

FileNotFoundError: [Errno 2] No such file or directory: '/models_common_e2e/cxr_data/reports/training/ca.phaseii.unit2.1937e108a204179f11fd8e8bf31dd7ec60ced1ea73b3b588270f2573.txt'

In [34]:
# Inspect the sampled batch: the whole batch first, then its first report and label row.
report_text, label = train_features[0], train_labels[0]
print(train_features)
print(train_labels)
print(f"Feature batch shape: {len(train_features)}")
print(f"Labels batch shape: {len(train_labels)}")
print(report_text)
print(f"Label: {label}")

('6434328|5725519|X-Ray Chest PA/AP View of 06-MAY-2018:\nResults:\nNo focal lesion seen in the lung parenchyma. \nCP angles and domes of the diaphragm are normal.\nBoth hila are normal. Pulmonary vasculature is normal.\nCardiac size and configuration is normal.\nTrachea is central; no mediastinal shift is seen.\nBony thorax and soft tissues of the chest wall are normal.\nIMPRESSION: No abnormality detected in the view obtained.\n6434328|5725519|X-Ray Chest PA/AP View of 06-MAY-2018:\nResults:\nNo focal lesion seen in the lung parenchyma. \nCP angles and domes of the diaphragm are normal.\nBoth hila are normal. Pulmonary vasculature is normal.\nCardiac size and configuration is normal.\nTrachea is central; no mediastinal shift is seen.\nBony thorax and soft tissues of the chest wall are normal.\nIMPRESSION: No abnormality detected in the view obtained.', '6915839|6146524|X-Ray Chest PA/AP View of 27-OCT-2018:\nBoth lung fields are clear except small nodular opacity in left parahilar re

In [45]:
# Forward pass on the sampled batch of report texts; gives one row of logits per report.
logit_tensor = radbert_multi_model(train_features)

In [47]:
# Print the logits and, on the next line, their shape.
print(logit_tensor, logit_tensor.shape, sep='\n')

tensor([[-0.1485, -0.3681,  0.0774,  ..., -0.2707, -0.0061, -0.1722],
        [-0.1415, -0.3708,  0.1239,  ..., -0.2729, -0.0105, -0.1739],
        [-0.1658, -0.3611,  0.0246,  ..., -0.2912,  0.0304, -0.1404],
        ...,
        [-0.1485, -0.3477,  0.0417,  ..., -0.2952,  0.0045, -0.1395],
        [-0.1423, -0.3476,  0.0403,  ..., -0.3019, -0.0042, -0.1491],
        [-0.1514, -0.3525,  0.0412,  ..., -0.3050,  0.0014, -0.1509]], grad_fn=<AddmmBackward0>)
torch.Size([32, 322])


## Fine Tuning ##
Fine tuning the RadBERT model for tags prediction task

### Adam Optimizer ###
Using Adam optimizer with learning rate 3e-5, beta1 = 0.9, beta2 = 0.99, l2 weight decay of 0.01

In [19]:
# Adam hyperparameters: learning rate, the two moment-decay betas, and L2 weight decay.
lr, l2_weight_decay = 3e-5, 0.01
beta1, beta2 = 0.9, 0.99

In [20]:
# Adam over every model parameter (encoder and classifier head alike).
optimizer = torch.optim.Adam(
    params=radbert_multi_model.parameters(),
    lr=lr,
    betas=(beta1, beta2),
    weight_decay=l2_weight_decay,
)

In [21]:
def train_one_epoch(epoch_index, tb_writer, report_every=100):
    """
    Run one optimisation pass over ``train_dataloader``.

    Uses the module-level ``radbert_multi_model``, ``optimizer`` and
    ``multiclass_multilabel_loss``.

    Args:
        epoch_index: Zero-based epoch number; only used to compute the global
            step for TensorBoard.
        tb_writer: Object with an ``add_scalar(tag, value, step)`` method.
        report_every: Number of batches per reporting window.

    Returns:
        Mean per-batch loss of the last completed reporting window. If the
        epoch had fewer batches than one window, the mean over the batches
        that did run (0.0 when the loader is empty).
    """
    running_loss = 0.
    last_loss = 0.
    batches_seen = 0
    reported = False

    for i, (inputs, labels) in enumerate(train_dataloader):
        optimizer.zero_grad()
        outputs = radbert_multi_model(inputs)

        # Targets come from the DataLoader on the CPU; keep them on the same
        # device as the logits so the loss also works with a GPU model.
        labels = labels.to(outputs.device)

        # Compute the loss and its gradients
        loss = multiclass_multilabel_loss(outputs, labels)
        loss.backward()
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        batches_seen = i + 1
        # Fire on the last batch of every window. The previous test
        # (`i % 100 == 999`) could never be true, so nothing was ever logged
        # and the function always returned 0.
        if batches_seen % report_every == 0:
            last_loss = running_loss / report_every # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_dataloader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.
            reported = True

    # An epoch shorter than one window never reaches the branch above; fall
    # back to the mean over what was seen so the caller gets a real value.
    if not reported and batches_seen:
        last_loss = running_loss / batches_seen

    return last_loss

### The full fine-tuning loop ###

In [22]:
# Number of epochs run by the fine-tuning loop.
total_epochs = 5

In [24]:
# Initializing in a separate cell so we can easily add more epochs to the same run
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))
epoch_number = 0

for epoch in range(total_epochs):
    # Make sure gradient tracking is on, and do a pass over the data
    print('EPOCH {}:'.format(epoch_number + 1))
    radbert_multi_model.train(True)
    avg_loss = train_one_epoch(epoch_number, writer)

    # Switch to evaluation mode (disables dropout) for the validation pass
    running_vloss = 0.0
    num_vbatches = 0
    radbert_multi_model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        # Validate on the held-out loader; iterating the training loader here
        # would just report the training loss a second time.
        for vinputs, vlabels in test_dataloader:
            voutputs = radbert_multi_model(vinputs)
            # Keep targets on the same device as the logits.
            vloss = multiclass_multilabel_loss(voutputs, vlabels.to(voutputs.device))
            # Accumulate a plain float so the average is a number, not a tensor.
            running_vloss += vloss.item()
            num_vbatches += 1

    # Guard against an empty validation loader (no batches -> report 0.0).
    avg_vloss = running_vloss / max(num_vbatches, 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch_number + 1)
    writer.flush()

    # Save a checkpoint of the weights after every epoch.
    model_path = '/home/users/pranav.rao/MiniTasks/Radbert/model_{}_{}'.format(timestamp, epoch_number)
    torch.save(radbert_multi_model.state_dict(), model_path)
    epoch_number += 1

EPOCH 1:
