# SOAP Summarization with Fine-Tuned DistilBART

## 1. Setup

### 1.1 Installing libraries, importing packages, and mounting Google Drive

In [None]:
# Mount Google Drive at /content/drive; later cells save the fine-tuned model
# and the evaluation exports under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
!pip install -q transformers
!pip install -q datasets
!pip install -q sentencepiece
!pip install rouge
!pip install bert_score
!pip install rouge-score
!pip install sacrebleu
!pip install --upgrade datasets
!pip install evaluate

In [None]:
import tensorflow as tf
import numpy as np
import pandas as pd
import torch
import transformers
import os
import re
import json
import bert_score
import rouge
import sacrebleu
import evaluate


import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset
from datasets import DatasetDict


# from transformers import pipeline, T5Tokenizer, TFT5Model, T5ForConditionalGeneration, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import LongformerTokenizerFast, BartForConditionalGeneration, Trainer, TrainingArguments, BartTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq

#evaluation packages
#rogue score
from rouge import Rouge
from evaluate import load
# BERTScore leverages the pre-trained contextual embeddings from BERT and matches words in candidate and reference sentences by cosine similarity.
from bert_score import BERTScorer

import shutil


### 1.2 Importing the dataset from Hugging Face (save the HF token in Colab secrets)

In [None]:
# Fetch the dataset from the Hugging Face Hub; the cells below read its
# 'train', 'validation' and 'test' splits.
ds = load_dataset("Bilal-Mamji/Medical-summary")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/567 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.9M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/615k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9250 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
# Sanity check: report how many input/target pairs each split contains.
for split_key, split_label in (("train", "training"), ("validation", "validation"), ("test", "test")):
    print(f"{len(ds[split_key])} {split_label} pairs")

9250 training pairs
500 validation pairs
250 test pairs


In [None]:
# Drop the 'instruction' column (not relevant to the model baseline) and rename
# the remaining text columns to input_text / target_text.
ds = (
    ds.remove_columns(['instruction'])
      .rename_column('input', 'input_text')
      .rename_column('output', 'target_text')
)


In [None]:
# Collect the three splits in a DatasetDict to work with DistilBART tokenization.
dataset = DatasetDict(
    {split_name: ds[split_name] for split_name in ('train', 'validation', 'test')}
)

## 2. DistilBART Training

In [None]:
# Initialize the tokenizer and the model from the same DistilBART checkpoint.
# The Longformer tokenizer/model variant that was tried before is intentionally
# not used (DNU).
from transformers import BartForConditionalGeneration, BartTokenizer
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")
model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [None]:
#tokenize dataset
def preprocess_data(batch):
    """Tokenize source and target text and build loss-masked labels.

    Args:
        batch: mapping with "input_text" and "target_text" entries. With
            ``Dataset.map(..., batched=True)`` each entry is a list of strings;
            a single string per entry is also accepted.

    Returns:
        The tokenizer output for ``input_text`` (truncated/padded to 900 tokens)
        with an added "labels" entry holding the target token ids
        (truncated/padded to 600 tokens). Padding positions in the labels are
        set to -100 so the cross-entropy loss ignores them.
    """
    #ensure you have correct max token length from input_text
    inputs = tokenizer(batch["input_text"], max_length=900, truncation=True, padding="max_length")
    #ensure you have correct max token length from target_text
    targets = tokenizer(batch["target_text"], max_length=600, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index the loss ignores; leaving the pad id in place would
        # train the model to predict padding and deflate the reported loss.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # batched call: one id sequence per example
        inputs["labels"] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # single-example call: one flat id sequence
        inputs["labels"] = _mask_padding(label_ids)
    return inputs

# Tokenize every split in batches; the original text columns are removed so
# only the tokenizer outputs and labels remain.
tokenized_datasets = dataset.map(
    preprocess_data,
    batched=True,
    remove_columns=dataset["train"].column_names,
)


Map:   0%|          | 0/9250 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
from transformers import Seq2SeqTrainingArguments


training_args = Seq2SeqTrainingArguments(
    # --- output locations ---
    output_dir="./results_distilbart",  # checkpoints are written here
    logging_dir="./logs_distilbart",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=5e-5,
    per_device_train_batch_size=4,      # kept small for memory
    gradient_accumulation_steps=8,      # 4 x 8 = effective batch of 32 per device
    fp16=True,                          # mixed precision to save memory
    # --- evaluation / checkpoint / logging cadence ---
    eval_strategy="steps",              # evaluate on a step schedule
    eval_steps=500,                     # evaluate every 500 steps
    save_steps=500,                     # checkpoint every 500 steps
    save_total_limit=2,                 # keep only the 2 most recent checkpoints
    logging_steps=100,                  # log every 100 steps
    predict_with_generate=True,         # request generation during evaluation
)

In [None]:
%%time
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
)

trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
500,0.5467,0.537981




CPU times: user 49min 2s, sys: 9min 51s, total: 58min 53s
Wall time: 59min 3s


TrainOutput(global_step=867, training_loss=0.6884004005511327, metrics={'train_runtime': 3540.6856, 'train_samples_per_second': 7.837, 'train_steps_per_second': 0.245, 'total_flos': 3.7739426881536e+16, 'train_loss': 0.6884004005511327, 'epoch': 2.998702983138781})

In [None]:
#saving model to Google Drive
import shutil

# Single source of truth for the local export directory, so the directory that
# is written is always the directory that is copied (previously the save used a
# path relative to the working directory while the copy used an absolute one).
local_model_path = "/content/fine_tuned_distilbart"

# Save the model and tokenizer locally
model.save_pretrained(local_model_path, safe_serialization=True)
tokenizer.save_pretrained(local_model_path)

# Path in Google Drive where you want to save the model
drive_model_path = "/content/drive/My Drive/DistilBARTFolder"

# Copy the entire directory to Google Drive. dirs_exist_ok=True lets the cell be
# re-run: without it copytree raises FileExistsError once the folder exists.
shutil.copytree(local_model_path, drive_model_path, dirs_exist_ok=True)

print("Model directory uploaded to Google Drive!")

Model directory uploaded to Google Drive!


## 3. Evaluation of the Fine-Tuned Model

In [None]:
#load model from drive
from transformers import BartForConditionalGeneration, BartTokenizer
model = BartForConditionalGeneration.from_pretrained("/content/drive/My Drive/DistilBARTFolder")
tokenizer = BartTokenizer.from_pretrained("/content/drive/My Drive/DistilBARTFolder")

# generate_prediction sends its inputs to model.device, so place the model on
# the GPU (when one is available) right after loading; otherwise every
# evaluation cell that runs before an explicit model.to(...) generates on CPU.
model = model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))




In [None]:
def preprocess_data(batch):
    """Tokenize source and target text and build loss-masked labels.

    Same contract as the definition used for training; redefined here so the
    evaluation section can run after the model is reloaded from Drive.

    Args:
        batch: mapping with "input_text" and "target_text" entries. With
            ``Dataset.map(..., batched=True)`` each entry is a list of strings;
            a single string per entry is also accepted.

    Returns:
        The tokenizer output for ``input_text`` (truncated/padded to 900 tokens)
        with an added "labels" entry holding the target token ids
        (truncated/padded to 600 tokens). Padding positions in the labels are
        set to -100 so the cross-entropy loss ignores them.
    """
    #ensure you have correct max token length from input_text
    inputs = tokenizer(batch["input_text"], max_length=900, truncation=True, padding="max_length")
    #ensure you have correct max token length from target_text
    targets = tokenizer(batch["target_text"], max_length=600, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index the loss ignores; the pad id itself would be scored.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = targets["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # batched call: one id sequence per example
        inputs["labels"] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # single-example call: one flat id sequence
        inputs["labels"] = _mask_padding(label_ids)
    return inputs

# Re-tokenize all splits with the reloaded tokenizer, dropping the raw text columns.
tokenized_datasets = dataset.map(
    preprocess_data,
    remove_columns=dataset["train"].column_names,
    batched=True,
)

Map:   0%|          | 0/9250 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [None]:
# def generate_predictions(test_dataset, model, tokenizer, device): # Add device parameter to switch from T4 to local device
#     '''function to tokenize the test data input & ground truth and generate predictions'''
#     predictions = []
#     references = []

#     for example in test_dataset:
#         #tokenize inputs
#         inputs = tokenizer(
#             example["input_text"], return_tensors="pt", max_length=900, truncation=True, padding="max_length"
#         )
#         input_ids = inputs["input_ids"].to(device) #moves inputs to the given device

#         #generate predictions
#         output_ids = model.generate(input_ids, max_length=600, num_beams=4) #num_beams can be adjusted for later experiments
#         prediction = tokenizer.decode(output_ids[0], skip_special_tokens=True)
#         predictions.append(prediction)

#         #reference text (ground truth)
#         references.append(example["target_text"])

#     return predictions, references

In [None]:
def generate_prediction(input_text, **generate_kwargs):
    '''Generate a summary for ``input_text`` with the global model/tokenizer.

    Used for the human-evaluation export and for the automatic metrics.

    Args:
        input_text: source text to summarize; truncated to 900 tokens.
        **generate_kwargs: optional overrides for the decoding settings passed
            to ``model.generate`` (e.g. ``do_sample=False`` for deterministic
            beam search). With no overrides the defaults below are used.

    Returns:
        The decoded text of the first generated sequence, without special tokens.
    '''
    inputs = tokenizer(input_text, return_tensors="pt", max_length=900, truncation=True, padding=True)

    #move input tensors to the same device as the model
    input_ids = inputs["input_ids"].to(model.device)
    # pass the mask explicitly instead of letting generate() infer it from pad tokens
    attention_mask = inputs["attention_mask"].to(model.device)

    generation_options = {
        "max_length": 512,
        "do_sample": True,      #sampling for diversity (makes outputs non-deterministic)
        "top_k": 50,
        "top_p": 0.95,
        "temperature": 1.0,     #controls randomness
        "num_beams": 4,         #better predictions
        "length_penalty": 2.0,
        "early_stopping": True,
    }
    generation_options.update(generate_kwargs)

    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        **generation_options
    )
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction

### 3.1 Human Evaluation Test Set

In [None]:
#measure entire cell process time
%%time
import random


# random.seed(42)

num_samples = 10

# random_samples = dataset["test"].shuffle(seed=42).select(range(num_samples))
subset_samples = dataset["test"].select(range(num_samples))

#empty list to save outputs, will be used to export to Google Drive below
results = []


for idx, row in enumerate(subset_samples):
    input_text = row["input_text"]
    target_text = row["target_text"]
    prediction = generate_prediction(input_text)

    # print(f"\nSample {idx + 1}:")
    # print(f"Input Text:\n{input_text}\n")
    # print(f"Target Text (Ground Truth):\n{target_text}\n")
    # print(f"Model Prediction:\n{prediction}\n")
    # print("-" * 80)

    # Append to results
    results.append({
        "Input Text": input_text,
        "Target Text": target_text,
        "Model Prediction": prediction
    })


    print(f"Processed Sample {idx + 1}/{len(subset_samples)}")


Processed Sample 1/10
Processed Sample 2/10
Processed Sample 3/10
Processed Sample 4/10
Processed Sample 5/10
Processed Sample 6/10
Processed Sample 7/10
Processed Sample 8/10
Processed Sample 9/10
Processed Sample 10/10
CPU times: user 17min 3s, sys: 4.67 s, total: 17min 7s
Wall time: 2min 51s


In [None]:
#save the human-evaluation subset (input, target, prediction) to Drive as a CSV


# one row per entry in `results`; the dict keys become the column names
df = pd.DataFrame(results)

#define the output path in Google Drive
output_path = "/content/drive/My Drive/DistilBARTbaseHEval.csv"

#save to CSV without the DataFrame index column
df.to_csv(output_path, index=False)

print(f"Results saved to {output_path}")

Results saved to /content/drive/My Drive/DistilBARTbaseHEval.csv


In [None]:

output_path = "/content/drive/My Drive/DistilBARTbaseHEval.txt"

#save to a text file; the target text is deliberately left out of this export
# encoding is pinned so non-ASCII characters in the notes are written the same
# way regardless of the runtime's default locale
with open(output_path, "w", encoding="utf-8") as f:
    for result in results:
        f.write(f"Input Text:\n{result['Input Text']}\n\n")
        # f.write(f"Target Text:\n{result['Target Text']}\n\n")
        f.write(f"Model Prediction:\n{result['Model Prediction']}\n")
        f.write("-" * 80 + "\n")

print(f"Results saved to {output_path}")

Results saved to /content/drive/My Drive/DistilBARTbaseHEval.txt


### 3.2 ROUGE Evaluation

In [None]:
# Load the ROUGE metric through evaluate.load.
# Note: this rebinds the name `rouge`, which the import cell bound to the rouge module.
rouge = load("rouge")

# Use the GPU when one is available, otherwise fall back to the CPU,
# and make sure the model lives on that device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Generate one prediction per test example and keep the matching reference.
predictions, references = [], []
for row in dataset["test"]:
    input_text, target_text = row["input_text"], row["target_text"]
    prediction = generate_prediction(input_text)
    predictions.append(prediction)
    references.append(target_text)

# Corpus-level ROUGE over all prediction/reference pairs.
rouge_results = rouge.compute(predictions=predictions, references=references)
print("ROUGE Results:", rouge_results)



ROUGE Results: {'rouge1': 0.6741573622855186, 'rouge2': 0.4303047754125429, 'rougeL': 0.511797296304096, 'rougeLsum': 0.6055654452561663}


### 3.3 BLEU Evaluation of the Model

*   Need to decide whether the target_text should be considered abstractive or extractive (or a mix of both).
*   The benchmark BLEU range for abstractive summarization is between 10% and 20%.
*   Need to review other BLEU scores.

| Metric   | Good Score Range | Why                                   |
|----------|------------------|---------------------------------------|
| BLEU-1   | 30–50            | Indicates coverage of key medical terms. |
| BLEU-2   | 15–30            | Captures short medical phrases accurately. |
| BLEU-4   | 10–20            | Suggests logical and contextual alignment. |




In [None]:
# ipython-input-16-341afa8f0746
#use generate_prediction function
# predictions, references = generate_predictions(dataset["test"], model, tokenizer, device)
predictions = []
references = []
for row in dataset["test"]:
    prediction = generate_prediction(row['input_text'])
    predictions.append(prediction)
    references.append(row['target_text'])


# sacreBLEU takes references as a list of reference *streams*: each stream is a
# full list with one reference per hypothesis. With a single reference per
# example that is `[references]`. The previous `[[ref] for ref in references]`
# shape is read as N one-item streams, so only the first hypothesis is scored
# (against every reference at once), which inflates the score.
# `references` is left as a flat list of strings for the cells that follow.
bleu_score = sacrebleu.corpus_bleu(predictions, [references])
print("BLEU Score:", bleu_score.score)

BLEU Score: 72.97596922693418


In [None]:
#BERT Score function
def evaluate_bertscore(predictions, references, lang="en"):
    """Return corpus-mean BERTScore precision, recall and F1.

    Args:
        predictions: candidate texts passed to ``bert_score.score``.
        references: reference texts passed to ``bert_score.score``.
        lang: language code forwarded to ``bert_score.score``.

    Returns:
        Dict with the keys "BERTScore Precision", "BERTScore Recall" and
        "BERTScore F1", each the mean of the corresponding score tensor.
    """
    precision, recall, f1 = bert_score.score(predictions, references, lang=lang, verbose=True)

    labelled_scores = (
        ("BERTScore Precision", precision),
        ("BERTScore Recall", recall),
        ("BERTScore F1", f1),
    )
    return {label: values.mean().item() for label, values in labelled_scores}

# Corpus-level BERTScore for the predictions/references currently in the notebook namespace.
print(evaluate_bertscore(predictions, references))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 31.21 seconds, 8.01 sentences/sec
{'BERTScore Precision': 0.9195564985275269, 'BERTScore Recall': 0.9212003350257874, 'BERTScore F1': 0.920314610004425}


In [None]:

%%time
from bert_score import score
from sklearn.metrics import accuracy_score
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
import re
from collections import defaultdict

def split_soap_text(document):
    """
    Splits a SOAP note document into its four components: Subjective, Objective, Assessment, and Plan.

    Section headers are matched case-insensitively in either long form
    ("Subjective:") or single-letter form ("S:"). Headers must start at a word
    boundary, so a colon-terminated word that merely ends in one of the header
    letters (e.g. "BP:", "Temp:", "Symptoms:") is not mistaken for a section
    header.

    Args:
        document: full note text; ``None`` or an empty string yields empty sections.

    Returns:
        Dict with the keys "Subjective", "Objective", "Assessment" and "Plan";
        each value is the stripped section text, or "" when the section is absent.
    """
    # Each section runs from its own header up to the next later-section header
    # (or the end of the text); Plan runs to the end of the text.
    patterns = {
        "Subjective": r"\b(?:Subjective|S):(.*?)(?:\b(?:Objective|O|Assessment|A|Plan|P):|$)",
        "Objective": r"\b(?:Objective|O):(.*?)(?:\b(?:Assessment|A|Plan|P):|$)",
        "Assessment": r"\b(?:Assessment|A):(.*?)(?:\b(?:Plan|P):|$)",
        "Plan": r"\b(?:Plan|P):(.*)",
    }

    text = document or ""
    soap_parts = {}
    for part, pattern in patterns.items():
        # DOTALL lets a section span multiple lines
        match = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        soap_parts[part] = match.group(1).strip() if match else ""
    return soap_parts

def evaluate_bleu_corpus(references, predictions):
    """
    Corpus-level BLEU (NLTK) over whitespace-tokenized texts.

    Each prediction is scored against exactly one reference, taken from the
    same position in ``references``.
    """
    from nltk.translate.bleu_score import corpus_bleu

    # NLTK expects, per hypothesis, a list of tokenized references.
    tokenized_references = []
    for reference in references:
        tokenized_references.append([reference.split()])

    tokenized_predictions = []
    for prediction in predictions:
        tokenized_predictions.append(prediction.split())

    return corpus_bleu(tokenized_references, tokenized_predictions)

def evaluate_rouge_corpus(references, predictions):
    """
    Mean ROUGE-1 / ROUGE-2 / ROUGE-L F-measure over reference/prediction pairs.

    Returns a dict keyed by 'rouge1', 'rouge2' and 'rougeL'; the dict is empty
    when there are no pairs to score.
    """
    from rouge_score import rouge_scorer
    from collections import defaultdict

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    # metric name -> F-measure of every scored pair
    f_measures = defaultdict(list)
    for reference, prediction in zip(references, predictions):
        pair_scores = scorer.score(reference, prediction)
        for metric_name in pair_scores:
            f_measures[metric_name].append(pair_scores[metric_name].fmeasure)

    averages = {}
    for metric_name, values in f_measures.items():
        averages[metric_name] = sum(values) / len(values)
    return averages

def evaluate_bert_score_corpus(references, predictions):
    """
    Mean BERTScore precision, recall and F1 for the corpus (English).

    Note the argument order: references first here, while the underlying
    ``score`` call takes the predictions first.
    """
    precision, recall, f1 = score(predictions, references, lang="en")

    summary = {}
    summary["BERT_Precision"] = precision.mean().item()
    summary["BERT_Recall"] = recall.mean().item()
    summary["BERT_F1"] = f1.mean().item()
    return summary


# Generate a prediction for every test example and keep the matching reference.
predictions = []
references = []
for row in dataset["test"]:
    prediction = generate_prediction(row['input_text'])
    predictions.append(prediction)
    references.append(row['target_text'])

#split SOAP elements
all_predictions_split = [split_soap_text(doc) for doc in predictions]
all_references_split = [split_soap_text(doc) for doc in references]

#evaluate each SOAP element
evaluation_results = {}

for key in ["Subjective", "Objective", "Assessment", "Plan"]:
    pred_parts = [pred[key] for pred in all_predictions_split]
    ref_parts = [ref[key] for ref in all_references_split]

    bleu_score = evaluate_bleu_corpus(ref_parts, pred_parts)
    rouge_scores = evaluate_rouge_corpus(ref_parts, pred_parts)
    bert_scores = evaluate_bert_score_corpus(ref_parts, pred_parts)

    # one flat dict per section: BLEU plus the ROUGE and BERTScore entries
    evaluation_results[key] = {
        "BLEU": bleu_score,
        **rouge_scores,
        **bert_scores
    }

#display results
# The loop variable is deliberately not called `score`: that name is the
# function imported from bert_score, and rebinding it would break any later
# call to evaluate_bert_score_corpus.
for key, metrics in evaluation_results.items():
    print(f"{key} Evaluation:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")
    print()



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['ro

Subjective Evaluation:
  BLEU: 0.3801
  rouge1: 0.6868
  rouge2: 0.4983
  rougeL: 0.5890
  BERT_Precision: 0.9393
  BERT_Recall: 0.9374
  BERT_F1: 0.9381

Objective Evaluation:
  BLEU: 0.3723
  rouge1: 0.6920
  rouge2: 0.5077
  rougeL: 0.6066
  BERT_Precision: 0.9286
  BERT_Recall: 0.9359
  BERT_F1: 0.9320

Assessment Evaluation:
  BLEU: 0.1861
  rouge1: 0.4821
  rouge2: 0.2606
  rougeL: 0.3946
  BERT_Precision: 0.8999
  BERT_Recall: 0.8982
  BERT_F1: 0.8989

Plan Evaluation:
  BLEU: 0.1768
  rouge1: 0.5349
  rouge2: 0.2631
  rougeL: 0.3805
  BERT_Precision: 0.9037
  BERT_Recall: 0.8996
  BERT_F1: 0.9015

CPU times: user 5h 38min 5s, sys: 33.3 s, total: 5h 38min 39s
Wall time: 1h 25min 16s


