#SOAP Summarization with LongBART with Fine-Tuning

##1. Setup

###1.1 Installing libraries, importing packages and mounting google drive

In [1]:
%%capture
!pip install -q transformers
!pip install -q datasets
!pip install -q sentencepiece
!pip install -q datasets
!pip install rouge
!pip install bert_score
!pip install rouge-score
!pip install sacrebleu
!pip install --upgrade datasets
!pip install evaluate

In [23]:
import tensorflow as tf
import numpy as np
import pandas as pd
import torch
import transformers
import os
import re
import openai
import json



import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset
from datasets import DatasetDict


# from transformers import pipeline, T5Tokenizer, TFT5Model, T5ForConditionalGeneration, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import LongformerTokenizerFast, BartForConditionalGeneration, Trainer, TrainingArguments
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq

#evaluation packages
#rogue score
from rouge import Rouge
from evaluate import load
# BERTScore leverages the pre-trained contextual embeddings from BERT and matches words in candidate and reference sentences by cosine similarity.
from bert_score import BERTScorer
#bleu score
import sacrebleu



In [3]:
#mounting Google Drive to save model
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


###1.2 Importing dataset from HuggingFace (save HG token in secrets)

In [4]:
ds = load_dataset("Bilal-Mamji/Medical-summary")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/567 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.9M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/615k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9250 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

In [16]:
#validating dataset
print(f"{len(ds['train'])} training pairs")
print(f"{len(ds['validation'])} validation pairs")
print(f"{len(ds['test'])} test pairs")

9250 training pairs
500 validation pairs
250 test pairs


In [6]:
#renaming headers and deleting instructions column
ds = ds.remove_columns(['instruction']) #not relevant to the model baseline
ds = ds.rename_column('input', 'input_text')
ds = ds.rename_column('output', 'target_text')


In [7]:
#saving data to a dataset DatasetDict to work with LongBART tokenization
dataset = DatasetDict({
    'train': ds['train'],
    'validation': ds['validation'],
    'test': ds['test']
})

In [8]:
#initialize tokenizer and model
tokenizer = LongformerTokenizerFast.from_pretrained("allenai/longformer-base-4096")
model = BartForConditionalGeneration.from_pretrained("allenai/longformer-base-4096")

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

You are using a model of type longformer to instantiate a model of type bart. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/597M [00:00<?, ?B/s]

Some weights of BartForConditionalGeneration were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['decoder.embed_positions.weight', 'decoder.embed_tokens.weight', 'decoder.layernorm_embedding.bias', 'decoder.layernorm_embedding.weight', 'decoder.layers.0.encoder_attn.k_proj.bias', 'decoder.layers.0.encoder_attn.k_proj.weight', 'decoder.layers.0.encoder_attn.out_proj.bias', 'decoder.layers.0.encoder_attn.out_proj.weight', 'decoder.layers.0.encoder_attn.q_proj.bias', 'decoder.layers.0.encoder_attn.q_proj.weight', 'decoder.layers.0.encoder_attn.v_proj.bias', 'decoder.layers.0.encoder_attn.v_proj.weight', 'decoder.layers.0.encoder_attn_layer_norm.bias', 'decoder.layers.0.encoder_attn_layer_norm.weight', 'decoder.layers.0.fc1.bias', 'decoder.layers.0.fc1.weight', 'decoder.layers.0.fc2.bias', 'decoder.layers.0.fc2.weight', 'decoder.layers.0.final_layer_norm.bias', 'decoder.layers.0.final_layer_norm.weight', 'decoder.layers.0.self_attn.k_p

In [9]:
#tokenize dataset
def preprocess_data(batch):
    #tokenize input_text and target_text with truncating and padding - need max token lengths to capture all context in input and target
    inputs = tokenizer(batch["input_text"], max_length=900, truncation=True, padding="max_length") #ensure you have correct max token length from input_text
    targets = tokenizer(batch["target_text"], max_length=600, truncation=True, padding="max_length") #ensure you have correct max token length from target_text
    inputs["labels"] = targets["input_ids"]
    return inputs

tokenized_datasets = dataset.map(preprocess_data, batched=True, remove_columns=dataset["train"].column_names)


Map:   0%|          | 0/9250 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

##2. LongBART Training

###2.1 Training arguments - need to research a few arguments and logic

In [10]:
#training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  #saving model and logs to google colab directory (will save to google drive down below)
    per_device_train_batch_size=2,  #examples per GPU during training
    per_device_eval_batch_size=2,  #examples per GPU during evaluation
    gradient_accumulation_steps=8,  #accumulate gradients to simulate a larger batch size - research more
    eval_steps=500,  #evaluate every 500 steps
    save_steps=500,  #save model every 500 steps
    save_total_limit=2,  #keep only the 2 most recent checkpoints
    num_train_epochs=3,  #number of epochs - keep reasonably low due to time constraint
    learning_rate=5e-5,  #learning rate - adjust for later experiments
    fp16=True,  #mixed-precision training - research more, but helps with efficiency during training
    eval_strategy="steps",  #evaluation at specified steps - research
    logging_dir="./logs",  #model logs to google colab directory (will save to google drive below)
    logging_steps=100,  #log every 100 steps
    predict_with_generate=True,  #generate summaries during evaluation
)


###2.2 Training model - needed to use T4 GPU and High-RAM => cost 5 compute units with Google Colab Pro - about 1 hour to train on T4 GPU

In [11]:
#oass necessary arguments to Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

#start training
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
500,2.6736,2.56238
1000,2.3242,2.280866
1500,2.2173,2.180705


TrainOutput(global_step=1734, training_loss=2.932330152430199, metrics={'train_runtime': 4081.2207, 'train_samples_per_second': 6.799, 'train_steps_per_second': 0.425, 'total_flos': 3.5394731606016e+16, 'train_loss': 2.932330152430199, 'epoch': 2.9993513513513514})

In [12]:
#saving pretrained model locally google colab
model.save_pretrained("./fine_tuned_longbart")
tokenizer.save_pretrained("./fine_tuned_longbart")

#training evaluation
results = trainer.evaluate()
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 2.1673495769500732, 'eval_runtime': 23.1921, 'eval_samples_per_second': 21.559, 'eval_steps_per_second': 10.78, 'epoch': 2.9993513513513514}


##3. Model Evaluation with Test Data

###3.1 Generating Predictions with Test Data



In [13]:
def generate_predictions(test_dataset, model, tokenizer, device): # Add device parameter to swtich from T4 to local device
    '''function to tokenize the test data input & ground truth and generate predictions'''
    predictions = []
    references = []

    for example in test_dataset:
        #tokenize inputs
        inputs = tokenizer(
            example["input_text"], return_tensors="pt", max_length=900, truncation=True, padding="max_length"
        )
        input_ids = inputs["input_ids"].to(device) #moves inputs tto local machine

        #generate predictions
        output_ids = model.generate(input_ids, max_length=600, num_beams=4) #num_beams can be adjusted for later experiments
        prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predictions.append(prediction)

        #reference text (ground truth)
        references.append(example["target_text"])

    return predictions, references

##3.2 ROUGE Evaluation of the Model

*   ROUGE-1 measures th overlap of unigrams/single words between pred and gt (content overlap)
*   ROUGE-2 measures the overlap of bigrams/two consecutive terms between perd and gt. (phrase level fidelity?)
- ROUGE-L measures the longest common subsequence (LCS) of tokens between pred and gt. (structure overlap)
- ROUGE-Lsum same as ROUGE-L but optimized for summarization task by focusing on sentence level LCS (structure overlap)



In [14]:
#load rouge metric
rouge = load("rouge") #lrouge metric using load function
#gpu to local device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#move model to local
model.to(device)

#generate predictions and references
predictions, references = generate_predictions(dataset["test"], model, tokenizer, device)

#calcuating rouge score
rouge_results = rouge.compute(predictions=predictions, references=references)
print("ROUGE Results:", rouge_results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE Results: {'rouge1': 0.3373889758024833, 'rouge2': 0.11417081028935536, 'rougeL': 0.2110423844635518, 'rougeLsum': 0.2942903415381652}


###3.3 BLEU Evaluation of the Model

*   Need to decide if the target_text is considered abstract vs. extractive (mix of both?)
*   Benchmark for abstract summarization is between 10% and 20%
- Need to review other bleu scores.   

| Metric   | Good Score Range | Why                                   |
|----------|------------------|---------------------------------------|
| BLEU-1   | 30–50            | Indicates coverage of key medical terms. |
| BLEU-2   | 15–30            | Captures short medical phrases accurately. |
| BLEU-4   | 10–20            | Suggests logical and contextual alignment. |




In [15]:
#use generate_prediction function
predictions, references = generate_predictions(dataset["test"], model, tokenizer, device)

#BLEU expects references as a list of lists
references = [[ref] for ref in references]

#calculating BLEU score
bleu_score = sacrebleu.corpus_bleu(predictions, references)
print("BLEU Score:", bleu_score.score)

BLEU Score: 73.25683430162678


##4. Saving Model and Logs to Google Drive

In [18]:
#saving model to Google drive
save_directory = "/content/drive/My Drive/SOAP_model_baseline"

import os
if not os.path.exists(save_directory):
    os.makedirs(save_directory)

#save the model
model.save_pretrained(save_directory)

#save the tokenizer
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")



Model and tokenizer saved to /content/drive/My Drive/SOAP_model_baseline


##5. Loading Model from Google Drive

In [15]:
#load the model
loaded_model = AutoModelForSeq2SeqLM.from_pretrained(save_directory)

#load the tokenizer
loaded_tokenizer = AutoTokenizer.from_pretrained(save_directory)

print("Model and tokenizer loaded from Google Drive.")