#SOAP Summarization with DistilBART with No Training

##1. Setup

###1.1 Installing libraries, importing packages and mounting google drive

In [2]:
%%capture
!pip install -q transformers
!pip install -q datasets
!pip install -q sentencepiece
!pip install rouge
!pip install bert_score
!pip install rouge-score
!pip install sacrebleu
!pip install --upgrade datasets
!pip install evaluate

In [3]:
import tensorflow as tf
import numpy as np
import pandas as pd
import torch
import transformers
import os
import re
import json
import bert_score
import rouge
import sacrebleu
import evaluate


import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset
from datasets import DatasetDict


# from transformers import pipeline, T5Tokenizer, TFT5Model, T5ForConditionalGeneration, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import LongformerTokenizerFast, BartForConditionalGeneration, Trainer, TrainingArguments, BartTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq

#evaluation packages
#rogue score
from rouge import Rouge
from evaluate import load
# BERTScore leverages the pre-trained contextual embeddings from BERT and matches words in candidate and reference sentences by cosine similarity.
from bert_score import BERTScorer

import shutil


In [4]:
#mounting Google Drive to save model
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


###1.2 Importing dataset from HuggingFace (save HG token in secrets)

In [5]:
ds = load_dataset("Bilal-Mamji/Medical-summary")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/567 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.9M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/615k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9250 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

In [6]:
#validating dataset
print(f"{len(ds['train'])} training pairs")
print(f"{len(ds['validation'])} validation pairs")
print(f"{len(ds['test'])} test pairs")

9250 training pairs
500 validation pairs
250 test pairs


In [7]:
#renaming headers and deleting instructions column
ds = ds.remove_columns(['instruction']) #not relevant to the model baseline
ds = ds.rename_column('input', 'input_text')
ds = ds.rename_column('output', 'target_text')


In [8]:
#saving data to a dataset DatasetDict to work with LongBART tokenization
dataset = DatasetDict({
    'train': ds['train'],
    'validation': ds['validation'],
    'test': ds['test']
})

##2. DistilBART Evaluation

In [9]:
#initialize model and tokenizer from Hugging Face

model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

In [10]:
#define prompt - create multiple if needed
prompt = "Generate a SOAP note based on the following dialogue between doctor and patient: "

In [11]:
basic_instruct = "Summarize: "
SOAP_instruct = "Create a medical SOAP summary of this dialogue.: "
SOAP_instruct_full = "Create a Medical SOAP note summary from the dialogue, following these guidelines: S (Subjective): Summarize the patient's reported symptoms, including chief complaint and relevant history. Rely on the patient's statements as the primary source and ensure standardized terminology. O (Objective): Highlight critical findings such as vital signs, lab results, and imaging, emphasizing important details like the side of the body affected and specific dosages. Include normal ranges where relevant. A (Assessment): Offer a concise assessment combining subjective and objective data. State the primary diagnosis and any differential diagnoses, noting potential complications and the prognostic outlook. P (Plan): Outline the management plan, covering medication, diet, consultations, and education. Ensure to mention necessary referrals to other specialties and address compliance challenges. Considerations: Compile the report based solely on the transcript provided. Maintain confidentiality and document sensitively. Use concise medical jargon and abbreviations for effective doctor communication. Please format the summary in a clean, simple list format without using markdown or bullet points. Use 'S:', 'O:', 'A:', 'P:' directly followed by the text. Avoid any styling or special characters."

In [17]:
#defining evaluation functions (bert and bleu)

#BERT Score function
def evaluate_bertscore(predictions, references, lang="en"):
    # Calculate BERTScore
    P, R, F1 = bert_score.score(predictions, references, lang=lang, verbose=True)

    # Return precision, recall, and F1 as average scores
    return {
        "BERTScore Precision": P.mean().item(),
        "BERTScore Recall": R.mean().item(),
        "BERTScore F1": F1.mean().item()
    }

#BLEU Score function
bleu = evaluate.load("bleu")

def evaluate_bleu(predictions, references):
    # Prepare BLEU inputs
    predictions_split = [pred.split() for pred in predictions]
    references_split = [[ref.split()] for ref in references]  # Wrap reference in a list for multiple references

    # Compute BLEU score
    bleu_score = bleu.compute(predictions=predictions_split, references=references_split)
    return bleu_score

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

In [41]:
#generate predictions and begin evaluation process with rouge

# Load ROUGE metric
rouge = evaluate.load("rouge")

def evaluate_model_on_test_data(model, tokenizer, test_data, max_input_length=900, max_output_length=600):
    predictions = []
    references = []

    for sample in test_data:
        # Add prompt to input text
        basic_instruct = "Summarize: "
        SOAP_instruct = "Create a medical SOAP summary of this dialogue.: "
        SOAP_instruct_full = "Create a Medical SOAP note summary from the dialogue, following these guidelines: S (Subjective): Summarize the patient's reported symptoms, including chief complaint and relevant history. Rely on the patient's statements as the primary source and ensure standardized terminology. O (Objective): Highlight critical findings such as vital signs, lab results, and imaging, emphasizing important details like the side of the body affected and specific dosages. Include normal ranges where relevant. A (Assessment): Offer a concise assessment combining subjective and objective data. State the primary diagnosis and any differential diagnoses, noting potential complications and the prognostic outlook. P (Plan): Outline the management plan, covering medication, diet, consultations, and education. Ensure to mention necessary referrals to other specialties and address compliance challenges. Considerations: Compile the report based solely on the transcript provided. Maintain confidentiality and document sensitively. Use concise medical jargon and abbreviations for effective doctor communication. Please format the summary in a clean, simple list format without using markdown or bullet points. Use 'S:', 'O:', 'A:', 'P:' directly followed by the text. Avoid any styling or special characters: "

        # prompt = SOAP_instruct
        prompt = SOAP_instruct
        input_text = prompt + sample["input_text"]

        # Tokenize input
        inputs = tokenizer(
            input_text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=max_input_length
        )

        # Generate prediction
        outputs = model.generate(
            inputs["input_ids"],
            max_length=max_output_length,
            num_beams=4,
            early_stopping=True
        )

        # Decode prediction and reference
        prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
        reference = sample["target_text"]

        # Append to lists
        predictions.append(prediction)
        references.append(reference)

    # Calculate ROUGE and BERT scores
    rouge_scores = rouge.compute(predictions=predictions, references=references)
    bertscore = evaluate_bertscore(predictions, references)
    return rouge_scores, bertscore, predictions, references

In [43]:
%%time
# Select a subset of the test data for evaluation (e.g., first 100 samples) - use .select(range(100))
test_data = dataset["test"].select(range(10)) #total test set of 250 examples

# Evaluate the model
rouge_scores, bertscore, predictions, references = evaluate_model_on_test_data(model, tokenizer, test_data)

#BLEU expects references as a list of lists
references = [[ref] for ref in references]

#calculating BLEU score
bleu_score = sacrebleu.corpus_bleu(predictions, references)

# Print ROUGE scores
print("ROUGE Scores:", json.dumps(rouge_scores, indent=2))
print("-"*80)
print("BLEU Score:", bleu_score.score)
print("-"*80)
print("BERTScore Scores:", json.dumps(bertscore, indent=2))
print("-"*80)

# print("ROUGE Scores:", rouge_scores)
# print("-"*80)
# print("Bleu Scores:", bleu_score)
# print("-"*80)
# print("BERTScore Scores:", bertscore)
print("-"*80)

# Display a few predictions and their references
for i in range(5):  # Show first 5 samples
    print(f"Input: {test_data[i]['input_text']}")
    print(f"Generated SOAP Note: {predictions[i]}")
    print(f"Reference SOAP Note: {references[i]}")
    print("-" * 80)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 1.30 seconds, 7.72 sentences/sec
ROUGE Scores: {
  "rouge1": 0.21530296284002617,
  "rouge2": 0.10751781423802227,
  "rougeL": 0.16605863654917058,
  "rougeLsum": 0.18664613242876393
}
--------------------------------------------------------------------------------
BLEU Score: 3.0959535111579233
--------------------------------------------------------------------------------
BERTScore Scores: {
  "BERTScore Precision": 0.8851026296615601,
  "BERTScore Recall": 0.8105083703994751,
  "BERTScore F1": 0.8460744619369507
}
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
Input: Doctor: Hello, can you please tell me about your past medical history?
Patient: Hi, I don't have any past medical history.
Doctor: Okay. What brings you in today?
Patient: I've been experiencing painless blurry vision in my right eye for a week now. I've also had intermittent fevers, headache, body

##3. ROUGE Results with 10 samples



1.   Using SOAP_Instruct Prompt:  

  ROUGE Scores: {'rouge1': 0.2239990018423448, 'rouge2': 0.10995432529502855, 'rougeL': 0.162565966372689, 'rougeLsum': 0.18941799841109633}.  

  CPU times: user 3min 4s, sys: 755 ms, total: 3min 5s
Wall time: 46.6 s

2.   Using SOAP_Instruct_Long:  
dfds

