# Fine-tuning RadBert #
Full-parameter fine-tuning of the RadBERT model (the RadBERT-RoBERTa-4m model from the RadBERT paper). Fine-tuning is carried out on multi-label, multi-class classification of reports, where each report can have multiple labels (for example, a report can have both the label consolidation-right and the label consolidation-2).

In [None]:
import os
import time, datetime
import codecs
from itertools import product

import numpy as np
import pandas as pd

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

import transformers
from transformers import AutoTokenizer, AutoModel
from transformers import pipeline

In [None]:
# Widen pandas and torch console output so large frames and tensors print without wrapping.
pd.options.display.max_rows = 50
pd.options.display.max_columns = 500
pd.options.display.width = 1000

torch.set_printoptions(linewidth=200)

In [None]:
# Choose the training device: CUDA if present, otherwise Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

In [None]:
# Pin the run to the second GPU only when the machine actually has one; otherwise
# keep the device detected above instead of failing later on a missing 'cuda:1'.
if torch.cuda.is_available() and torch.cuda.device_count() > 1:
    device = 'cuda:1'

## Forward pass ##
Implementing the RadBERTMultiClassMultiLabel PyTorch model

In [None]:
class RadBERTMultiClassMultiLabel(nn.Module):
    """
    Multi-label report classifier on top of a pretrained transformer encoder.

    The module owns its tokenizer, so ``forward`` takes raw report strings. Each
    report is encoded, the hidden state at sequence position 0 is used as the
    report representation, and one linear layer maps it to one logit per label.

    Args:
        num_classes: number of output logits (one per label).
        checkpoint: name or path handed to ``AutoTokenizer.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        device: optional. Stored on ``self.device`` for callers that read it.
            Tokenized inputs are placed on whichever device the weights live on,
            so the model can be built without this argument and keeps working
            after ``.to(...)``.
    """
    def __init__(self, num_classes, checkpoint, device=None):
        super().__init__()
        self.num_classes = num_classes
        self.checkpoint = checkpoint
        self.device = device

        self.tokenizer = AutoTokenizer.from_pretrained(self.checkpoint)
        self.transformer_encoder = AutoModel.from_pretrained(self.checkpoint)
        self.transformer_encoder_hidden_size = self.transformer_encoder.config.hidden_size
        self.linear_classifier = nn.Linear(self.transformer_encoder_hidden_size, self.num_classes)

    def forward(self, x):
        """Return raw logits of shape (len(x), num_classes) for a batch of report strings."""
        # Follow the weights rather than the constructor argument: the two can
        # disagree if the module was moved (or never moved) after construction.
        weights_device = self.linear_classifier.weight.device
        tokenized_inp = self.tokenizer(x, padding=True, truncation=True, return_tensors='pt').to(weights_device)
        encoder_out = self.transformer_encoder(**tokenized_inp)
        # Position 0 of the last hidden state serves as the whole-report embedding.
        logits = self.linear_classifier(encoder_out.last_hidden_state[:, 0, :])
        return logits


In [None]:
# Checkpoint identifier handed to from_pretrained, and the tag columns the classifier predicts.
checkpoint = 'UCSD-VA-health/RadBERT-RoBERTa-4m'
labels_subset = (
    "normal tuberculosis opacity bronchialdilation density parenchymalopacity ett "
    "aorticenlargement mediastinalwidening mediastinalmass "
    "copd prominentbronchovascularmarkings bronchitis markings vascularprominence "
    "interval interstitiallungdisease bluntedcp effusion cardiomegaly "
    "consolidation subtle_normal peffusion lineandtube thickening haziness "
    "hilarprominence hilar inhomogenousopacity rotation "
    "calcification unfoldedaorta bandlikeopacity aorticcalcification "
    "aorticknucklecalcification fibrosis suture cardiacshift degenspine nodule "
    "pneumonia inspiration fracture pneumonitis justfibrosis lesion "
    "nonaorticcalcification tuberculosispure pleuralthickening feedingtube"
).split()
num_classes = len(labels_subset)

In [None]:
# Build the classifier and move its weights to the selected device.
radbert_multi_model = RadBERTMultiClassMultiLabel(num_classes, checkpoint, device).to(device)

In [None]:
# Show the module tree, then the shape of every parameter tensor.
print(radbert_multi_model)
print([param.shape for param in radbert_multi_model.parameters()])

## Custom Loss function ##
Custom loss function for multi class multi label classification to handle uncertain tags (tags have value 0 -> absent, 1 -> present, -100 -> uncertain)

In [None]:
class MultiClassMultiLabel(nn.Module):
    """
    Multi-label soft-margin loss that ignores entries marked as uncertain.

    Target entries equal to ``uncertain_label`` get weight 0 and all others get
    weight 1; the weighted loss is then reduced exactly as
    ``multilabel_soft_margin_loss`` does by default.
    """
    def __init__(self, uncertain_label):
        super().__init__()
        self.uncertain_label = uncertain_label

    def forward(self, output, target):
        # 1.0 where the label is known, 0.0 where it carries the uncertain marker.
        known_weights = (target != self.uncertain_label).type(torch.float)
        return F.multilabel_soft_margin_loss(output, target, weight=known_weights)

In [None]:
# Loss instance that treats -100 as the "uncertain" target value to be ignored.
multiclass_multilabel_loss = MultiClassMultiLabel(-100).to(device)

### Testing MultiClassMultiLabel loss function ###

In [None]:
# Toy batch: 2 samples x 5 labels, with one uncertain (-100) target in each row.
logit_tensor = torch.tensor(
    [[-1.0, 2.0, 1.0, 5.0, -3.0],
     [4.0, -2.0, 1.0, -1.0, 2.5]]
).to(device)
target_tensor_act = torch.tensor(
    [[0.0, 1.0, -100.0, 0.0, 0.0],
     [1.0, -100.0, 0.0, 0.0, 1.0]]
).to(device)

In [None]:
# Loss of the custom module on the toy batch. The commented-out lines are the same
# computation written out step by step, kept for manual debugging.
#certain_mask = (target_tensor_act != -100)
#print(certain_mask.type(torch.float))
#loss_func = nn.MultiLabelSoftMarginLoss(weight=(certain_mask).type(torch.float))
#print(loss_func(logit_tensor, target_tensor_act))
print(multiclass_multilabel_loss(logit_tensor, target_tensor_act))

In [None]:
import math
from math import log, exp

def log_sigmoid(x):
    """Return log(sigmoid(x)) for a real number ``x``.

    Uses the identity log(sigmoid(x)) = -log(1 + exp(-x)) = x - log(1 + exp(x))
    and picks the branch whose exponent is non-positive, so ``exp`` cannot
    overflow for large negative ``x`` and ``log1p`` keeps precision when the
    result is close to zero.
    """
    if x >= 0:
        return -math.log1p(exp(-x))
    return x - math.log1p(exp(x))

In [None]:
# Hand-computed reference for the two-row toy batch defined earlier in this notebook.
# Each row sums weight * log_sigmoid(+logit) for target 1 and weight * log_sigmoid(-logit)
# for target 0, with weight 0.0 on the uncertain (-100) entry, then divides by the
# 5 labels in the row.
loss1 = -1 * (1.0 * log_sigmoid(1.0) + 1.0 * log_sigmoid(2.0) + 0.0 * log_sigmoid(-1.0) + 1.0 * log_sigmoid(-5.0) + 1.0 * log_sigmoid(3.0)) / 5.0
loss2 = -1 * (1.0 * log_sigmoid(4.0) + 0.0 * log_sigmoid(2.0) + 1.0 * log_sigmoid(-1.0) + 1.0 * log_sigmoid(1.0) + 1.0 * log_sigmoid(2.5)) / 5.0

print(loss1)
print(loss2)
# Mean over the two rows; intended to be compared against the value printed by the loss module.
print((loss1 + loss2)/2.0)

## Custom DataLoader and Dataset ##
Read csv file, get the report from path and prepare the data

In [None]:
class ReportTagsDataset(Dataset):
    """
    Dataset of (report text, tag vector) pairs driven by a tags CSV.

    The first CSV column identifies the report; only its last ``/``-separated
    component is used, and the text is read from
    ``<report_base_path>/<that component>.txt``. The remaining columns hold the
    tag values.

    Args:
        tags_csv_file: path of the CSV with one row per report.
        report_base_path: directory containing the ``.txt`` report files.
        labels_subset: optional list of column names to use as targets, in that
            order; when ``None`` every column after the first is used.
        text_transform: optional callable applied to the report text.
        target_transform: optional callable applied to the target tensor.
    """
    def __init__(self, tags_csv_file, report_base_path, labels_subset=None, text_transform=None, target_transform=None):
        self.report_base_path = report_base_path
        self.tags_csv_file = tags_csv_file

        self.tags_df = pd.read_csv(self.tags_csv_file)
        # Give the first column a fixed name regardless of the CSV header.
        self.column_names = list(self.tags_df.columns.values)
        self.column_names[0] = 'filename'
        self.tags_df.columns = self.column_names

        self.labels_subset = labels_subset
        self.text_transform = text_transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.tags_df)

    def __getitem__(self, index):
        """Return ``(report_text, target)`` where ``target`` is a float32 tensor of tag values."""
        row = self.tags_df.iloc[index]
        report_name = row.iloc[0].split('/')[-1] + '.txt'
        report_path = os.path.join(self.report_base_path, report_name)
        # Undecodable bytes are dropped rather than raising; the context manager
        # closes the handle so long runs do not leak file descriptors.
        with codecs.open(report_path, 'r', encoding='utf-8', errors='ignore') as report_file:
            report_text = report_file.read()

        # Select from the single row instead of slicing the whole frame per item.
        if self.labels_subset is None:
            label_values = row.iloc[1:]
        else:
            label_values = row[list(self.labels_subset)]
        target_list = torch.tensor(label_values.to_numpy(dtype=np.float32))

        if self.text_transform is not None:
            report_text = self.text_transform(report_text)
        if self.target_transform is not None:
            target_list = self.target_transform(target_list)
        return report_text, target_list

In [None]:
# Directories holding the report .txt files and the tag CSVs for the train/test splits.
#report_base_path = "/models_common_e2e/cxr_data/reports/training"
train_reports_base_path = '/home/users/pranav.rao/MiniTasks/Radbert/data/train'
test_reports_base_path = '/home/users/pranav.rao/MiniTasks/Radbert/data/test'
train_tags_file = '/home/users/pranav.rao/Downloads/report_tags_25k_train.csv'
test_tags_file = '/home/users/pranav.rao/Downloads/report_tags_25k_test.csv'

In [None]:
# Both splits use the same label subset, so target columns line up with the classifier outputs.
train_data = ReportTagsDataset(train_tags_file, train_reports_base_path, labels_subset=labels_subset)
test_data = ReportTagsDataset(test_tags_file, test_reports_base_path, labels_subset=labels_subset)

In [None]:
# Batches of 32 reports, reshuffled on every pass and loaded by 2 worker processes.
# NOTE(review): the test loader is shuffled as well; this does not affect the averaged loss.
train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True, num_workers=2)
test_dataloader = DataLoader(test_data, batch_size=32, shuffle=True, num_workers=2)
# Single-process variants, kept for debugging.
#train_dataloader = DataLoader(train_data, batch_size=32, shuffle=True)
#test_dataloader = DataLoader(test_data, batch_size=32, shuffle=True)

### Testing Dataset and DataLoader ###

In [None]:
# Pull one batch of report texts and their label tensors from the training loader.
train_features, train_labels = next(iter(train_dataloader))

In [None]:
# Dump the whole batch, then its size, then the first report with its label vector.
print(train_features)
print(train_labels)
# len() gives the number of items in the batch, not a tensor shape.
print(f"Feature batch shape: {len(train_features)}")
print(f"Labels batch shape: {len(train_labels)}")
report_text = train_features[0]
print(report_text)
label = train_labels[0]
print(f"Label: {label}")

In [None]:
# Smoke-test the forward pass on the sampled batch without tracking gradients.
with torch.no_grad():
    logit_tensor = radbert_multi_model(train_features)

In [None]:
# Logits from the smoke-test forward pass and their shape.
print(logit_tensor)
print(logit_tensor.shape)

## Fine Tuning ##
Fine tuning the RadBERT model for tags prediction task

### Adam Optimizer ###
Using Adam optimizer with learning rate 3e-5, beta1 = 0.9, beta2 = 0.99, l2 weight decay of 0.01

In [None]:
# Adam hyperparameters used for fine-tuning.
lr = 3e-5
beta1 = 0.9
beta2 = 0.99
l2_weight_decay = 0.01

In [None]:
# Optimize every parameter of the model (encoder and classifier head), i.e. full fine-tuning.
optimizer = torch.optim.Adam(radbert_multi_model.parameters(), lr=lr, betas=(beta1, beta2), weight_decay=l2_weight_decay)

In [None]:
def train_one_epoch(epoch_index, tb_writer, log_every=50):
    """
    Run one pass over ``train_dataloader`` and return a recent average batch loss.

    Uses the notebook-level ``train_dataloader``, ``optimizer``,
    ``radbert_multi_model``, ``multiclass_multilabel_loss`` and ``device``.

    Args:
        epoch_index: zero-based epoch number, used to compute the TensorBoard step.
        tb_writer: writer receiving the ``Loss/train`` scalar.
        log_every: number of batches averaged per report (default 50).

    Returns:
        The average loss of the last completed ``log_every``-batch window. If the
        epoch is too short to complete a window, the average over the batches
        that did run is returned instead of 0.0.
    """
    running_loss = 0.
    last_loss = 0.
    batches_in_window = 0
    window_reported = False

    for i, data in enumerate(train_dataloader):
        inputs, labels = data
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = radbert_multi_model(inputs)

        # Compute the loss and its gradients
        loss = multiclass_multilabel_loss(outputs, labels)
        loss.backward()
        optimizer.step()

        # Gather data and report
        running_loss += loss.item()
        batches_in_window += 1
        if batches_in_window == log_every:
            last_loss = running_loss / log_every  # loss per batch
            print('  batch {} loss: {}'.format(i + 1, last_loss))
            tb_x = epoch_index * len(train_dataloader) + i + 1
            tb_writer.add_scalar('Loss/train', last_loss, tb_x)
            running_loss = 0.
            batches_in_window = 0
            window_reported = True

    # Short epoch: no window completed, so report what was actually seen.
    if not window_reported and batches_in_window:
        last_loss = running_loss / batches_in_window

    return last_loss

### The full fine-tuning loop ###

In [None]:
# Number of passes over the training data in the loop below.
total_epochs = 5

In [None]:
# The timestamp tags both the TensorBoard run directory and the saved checkpoints of this run.
# NOTE(review): the 'fashion_trainer' run name looks inherited from a tutorial — rename if desired.
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
writer = SummaryWriter('runs/fashion_trainer_{}'.format(timestamp))

for epoch in range(total_epochs):
    # Make sure gradient tracking is on, and do a pass over the data
    print('EPOCH {}:'.format(epoch + 1))
    radbert_multi_model.train(True)
    avg_loss = train_one_epoch(epoch, writer)
    # Save a checkpoint after every epoch, before validation.
    model_path = '/home/users/pranav.rao/MiniTasks/Radbert/ModelPool/model_{}_{}'.format(timestamp, epoch)
    torch.save(radbert_multi_model.state_dict(), model_path)

    # Evaluation mode: disables dropout (and would use running statistics for batch normalization).
    running_vloss = 0.0
    num_val_batches = 0
    radbert_multi_model.eval()

    # Disable gradient computation and reduce memory consumption.
    with torch.no_grad():
        for vinputs, vlabels in test_dataloader:
            vlabels = vlabels.to(device)
            voutputs = radbert_multi_model(vinputs)
            vloss = multiclass_multilabel_loss(voutputs, vlabels)
            # Accumulate a Python float so the average is not a device tensor.
            running_vloss += vloss.item()
            num_val_batches += 1

    # max(..., 1) keeps an empty validation loader from raising.
    avg_vloss = running_vloss / max(num_val_batches, 1)
    print('LOSS train {} valid {}'.format(avg_loss, avg_vloss))

    # Log the running loss averaged per batch for both training and validation
    writer.add_scalars('Training vs. Validation Loss',
                    { 'Training' : avg_loss, 'Validation' : avg_vloss },
                    epoch + 1)
    writer.flush()

## Analyzing fine-tuned models ##

In [None]:
# Hand-written probe sentences; only the entries from index 4 onwards are reused below.
sentence_list = [
    "The report shows small right-sided pleural effusion",
    "The report shows small left-sided pleural effusion",
    "The report shows large right-sided pleural effusion",
    "The report shows large left-sided pleural effusion",
    "There are no abnormalities in the report",
    "There is severe consolidation in the left side",
    "There is severe consolidation in the right side",
    "There is mild consolidation in the right side",
    "There is mild consolidation in the left side",
]

# Two templates filled with every (size, location, finding) combination.
sentence1_base = "A <SizeModifier> <AbnormalReport> can be seen in the report in the <LocationModifier> part"
sentence2_base = "The report shows a <SizeModifier> <LocationModifier> <AbnormalReport>"
size_modifiers = ['small', 'large']
loc_modifiers = ['upper-left', 'lower-left', 'right-sided', 'left-sided']
abnormal_report = ['pleural effusion']

l1 = [
    sentence1_base
    .replace('<SizeModifier>', size)
    .replace('<LocationModifier>', location)
    .replace('<AbnormalReport>', finding)
    for size, location, finding in product(size_modifiers, loc_modifiers, abnormal_report)
]
l2 = [
    sentence2_base
    .replace('<SizeModifier>', size)
    .replace('<LocationModifier>', location)
    .replace('<AbnormalReport>', finding)
    for size, location, finding in product(size_modifiers, loc_modifiers, abnormal_report)
]

negative_sentences = ['The report shows no pleural effusion', 'The report shows no consolidation on any side']
# Templated sentences first, then the negations, then the hand-written ones from index 4.
all_sentence_list = l1 + l2 + negative_sentences + sentence_list[4:]

In [None]:
# One probe sentence per line; the order matches the rows of the embeddings computed later.
print('\n'.join(all_sentence_list))

In [None]:
def calc_cosine_sim_matrix(sentence_embeddings):
    """
    Return ``(sentence_embeddings, similarity)`` where ``similarity[i, j]`` is the
    cosine similarity between embedding rows ``i`` and ``j``.

    The input tensor is returned unchanged as the first element.
    """
    # Broadcast rows against each other: the left operand gains a size-1 axis at
    # position 1, the right operand a size-1 axis at position 0.
    as_rows = sentence_embeddings[:, None]
    as_cols = sentence_embeddings[None]
    pairwise_similarity = F.cosine_similarity(as_rows, as_cols, dim=2)
    return sentence_embeddings, pairwise_similarity

In [None]:
def get_sentence_embeddings(model, input):
    """
    Encode sentences with ``model`` and return one embedding per sentence.

    Args:
        model: object exposing ``tokenizer`` and ``transformer_encoder``.
        input: a string or list of strings to encode.

    Returns:
        The encoder's last hidden state at sequence position 0 for each input.

    Gradients are disabled here, but the train/eval mode of ``model`` is not
    changed; call ``model.eval()`` beforehand for deterministic embeddings.
    """
    # Place the tokenized batch on the encoder's device so this works for CPU and GPU models.
    encoder_device = next(model.transformer_encoder.parameters()).device
    with torch.no_grad():
        tokenized_inp = model.tokenizer(input, padding=True, truncation=True, return_tensors='pt').to(encoder_device)
        encoder_out = model.transformer_encoder(**tokenized_inp)
    return encoder_out.last_hidden_state[:, 0, :]

In [None]:
# Reload a fine-tuned checkpoint for analysis on the CPU.
model_path = '/home/users/pranav.rao/MiniTasks/Radbert/ModelPool/model_20230912_143355_0'
checkpoint = 'UCSD-VA-health/RadBERT-RoBERTa-4m'
# The device is passed explicitly so the call matches the model's constructor signature.
radbert_multi_model = RadBERTMultiClassMultiLabel(num_classes, checkpoint, 'cpu')
# map_location remaps tensors saved from a GPU so the file also loads on hosts without that GPU.
radbert_multi_model.load_state_dict(torch.load(model_path, map_location='cpu'))
# Modules start in training mode; switch to eval so dropout does not perturb the analysis.
radbert_multi_model.eval()

In [None]:
# Embed every probe sentence with the reloaded model.
all_embeddings = get_sentence_embeddings(radbert_multi_model, all_sentence_list)

In [None]:
# Embedding values and their shape.
print(all_embeddings)
print(all_embeddings.shape)

In [None]:
# Pairwise cosine similarity between all probe-sentence embeddings.
_, cosine_sim = calc_cosine_sim_matrix(all_embeddings)

In [None]:
# Rows and columns follow the order of all_sentence_list.
print(cosine_sim)

## Analysing the predictions ##

In [None]:
def get_predictions(model, input):
    """
    Return the classifier logits of ``model`` for the given sentences.

    Args:
        model: object exposing ``tokenizer``, ``transformer_encoder`` and
            ``linear_classifier``.
        input: a string or list of strings to classify.

    Returns:
        Raw (un-normalized) logits, one row per input.

    Gradients are disabled here, but the train/eval mode of ``model`` is not
    changed; call ``model.eval()`` beforehand for deterministic outputs.
    """
    # Place the tokenized batch on the encoder's device so this works for CPU and GPU models.
    encoder_device = next(model.transformer_encoder.parameters()).device
    with torch.no_grad():
        tokenized_inp = model.tokenizer(input, padding=True, truncation=True, return_tensors='pt').to(encoder_device)
        encoder_out = model.transformer_encoder(**tokenized_inp)
        logits = model.linear_classifier(encoder_out.last_hidden_state[:, 0, :])
        return logits

In [None]:
# Logits for every probe sentence.
predictions = get_predictions(radbert_multi_model, all_sentence_list)

In [None]:
# Logits, their shape, and the index of the highest-scoring label for each sentence.
print(predictions)
print(predictions.shape)
print(predictions.argmax(dim=-1))
# NOTE(review): the classifier is built with num_classes = len(labels_subset) = 50 outputs,
# so column 224 is out of range and this line raises IndexError. Presumably left over from
# a run that used the full tag set — confirm which label was meant and index it by name.
print(predictions[:, 224])

In [None]:
# Load the training tags CSV to inspect its columns.
df_tags = pd.read_csv('/home/users/pranav.rao/MiniTasks/Radbert/report_tags_25k_train.csv')

In [None]:
# Summary statistics for the numeric tag columns.
df_tags.describe()

In [None]:
# Name of the CSV column at position 224.
# NOTE(review): positions here include the first (report identifier) column, whereas the
# dataset drops that column when building targets, so this may be off by one relative to
# a classifier output index — confirm.
columns = list(df_tags.columns.values)
print(columns[224])