In [1]:
#mounting Google Drive to save model
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
# !pip install scispacy
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz
# !pip install --upgrade scispacy
!pip install -q torch_geometric
!pip install -q transformers
!pip install -q datasets
# !pip install -q sentencepiece
!pip install rouge
!pip install bert_score
!pip install rouge-score
!pip install sacrebleu
# !pip install --upgrade datasets
!pip install evaluate

In [3]:
%%capture
# import spacy
# import scispacy
# from scispacy.linking import EntityLinker

from torch_geometric.nn import GCNConv
from torch_geometric.data import Data
import torch


import tensorflow as tf
import numpy as np
import pandas as pd
import torch
import transformers
import os
import re
import json
import bert_score
import rouge
import sacrebleu
import evaluate

import matplotlib.pyplot as plt
import seaborn as sns

from datasets import load_dataset
from datasets import DatasetDict


# from transformers import pipeline, T5Tokenizer, TFT5Model, T5ForConditionalGeneration, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import LongformerTokenizerFast, BartForConditionalGeneration, Trainer, TrainingArguments, BartTokenizer
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq

#evaluation packages
#rogue score
from rouge import Rouge
from evaluate import load
# BERTScore leverages the pre-trained contextual embeddings from BERT and matches words in candidate and reference sentences by cosine similarity.
from bert_score import BERTScorer

import shutil

In [4]:
ds = load_dataset("Bilal-Mamji/Medical-summary")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/567 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/21.9M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/615k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9250 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/500 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/250 [00:00<?, ? examples/s]

In [5]:
#validating dataset
print(f"{len(ds['train'])} training pairs")
print(f"{len(ds['validation'])} validation pairs")
print(f"{len(ds['test'])} test pairs")

9250 training pairs
500 validation pairs
250 test pairs


In [6]:
#renaming headers and deleting instructions column
ds = ds.remove_columns(['instruction']) #not relevant to the model baseline
ds = ds.rename_column('input', 'input_text')
ds = ds.rename_column('output', 'target_text')


In [7]:
#saving data to a dataset DatasetDict to work with DistilBART tokenization
dataset = DatasetDict({
    'train': ds['train'],
    'validation': ds['validation'],
    'test': ds['test']
})

In [8]:
from transformers import BartForConditionalGeneration, BartTokenizer
model = BartForConditionalGeneration.from_pretrained("sshleifer/distilbart-cnn-12-6")
tokenizer = BartTokenizer.from_pretrained("sshleifer/distilbart-cnn-12-6")

config.json:   0%|          | 0.00/1.80k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

##2. Graph Convoluted Network (GNC) Setup

In [9]:

# Define GCN Model
class GCN(torch.nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super(GCN, self).__init__()
        self.conv1 = GCNConv(input_dim, hidden_dim)
        self.conv2 = GCNConv(hidden_dim, hidden_dim)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index)
        return x

# Example Graph Construction (Update for dynamic graphs)
def construct_graph():
    edge_index = torch.tensor([[0, 1], [1, 0]], dtype=torch.long)  # Example: Symptom -> Disease
    node_features = torch.eye(2)  # One-hot encoding for 2 nodes
    return Data(x=node_features, edge_index=edge_index)

# Instantiate GCN Model
gcn_model = GCN(input_dim=2, hidden_dim=8)

In [10]:
def sliding_window_chunking(text, tokenizer, max_length=900, stride=256):
    """
    Split text into overlapping chunks using a sliding window.
    """
    tokens = tokenizer(text, truncation=False, return_tensors="pt")["input_ids"][0]
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + max_length, len(tokens))
        chunk = tokens[start:end]
        chunk_text = tokenizer.decode(chunk, skip_special_tokens=True)
        chunks.append(chunk_text)
        start += max_length - stride
    return chunks

##3. Preprocessing Function

In [11]:
def preprocess_data(batch):
    """
    Preprocess the dataset by generating GCN embeddings, applying sliding window, and tokenizing input and target texts.
    """
    input_ids = []
    attention_masks = []
    labels = []

    for input_text, target_text in zip(batch["input_text"], batch["target_text"]):
        # Construct graph and generate GCN embeddings
        graph = construct_graph()
        gcn_embeddings = gcn_model(graph).detach().numpy()
        gcn_info = " ".join([f"Node_{i}_embedding: {emb.tolist()}" for i, emb in enumerate(gcn_embeddings)])

        # Combine GCN info with input text
        enriched_text = input_text + " " + gcn_info

        # Apply sliding window if text is too long
        chunks = sliding_window_chunking(enriched_text, tokenizer)

        # Tokenize each chunk and target text
        for chunk in chunks:
            tokenized_input = tokenizer(chunk, max_length=900, truncation=True, padding="max_length")
            tokenized_target = tokenizer(target_text, max_length=600, truncation=True, padding="max_length")

            input_ids.append(tokenized_input["input_ids"])
            attention_masks.append(tokenized_input["attention_mask"])
            labels.append(tokenized_target["input_ids"])

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "labels": labels,
    }

##4. Tokenization and Training with GNC

In [12]:
from datasets import DatasetDict

# Tokenize the dataset
tokenized_datasets = dataset.map(preprocess_data, batched=True, remove_columns=["input_text", "target_text"])

Map:   0%|          | 0/9250 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/250 [00:00<?, ? examples/s]

In [13]:
tokenized_datasets.save_to_disk("/content/drive/My Drive/GNC_DistilBART_Tokenized_Dataset")

Saving the dataset (0/1 shards):   0%|          | 0/18022 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/980 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/484 [00:00<?, ? examples/s]

In [14]:
# Check if the validation set exists and is tokenized
print(tokenized_datasets["validation"])

# Check a few samples
print(tokenized_datasets["validation"][0])

Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 980
})
{'input_ids': [0, 41152, 35, 20920, 328, 38, 192, 14, 47, 58, 4997, 7, 84, 1098, 13, 10, 10665, 2329, 225, 1975, 9636, 179, 4982, 14978, 971, 15408, 11, 110, 235, 2853, 45776, 4, 1534, 14, 4577, 116, 50118, 18276, 4843, 35, 3216, 6, 14, 18, 235, 4, 50118, 41152, 35, 166, 33, 1904, 10, 22575, 26063, 7089, 7, 25806, 3894, 5, 16570, 4, 3945, 47, 2950, 19, 42, 7089, 116, 50118, 18276, 4843, 35, 1491, 269, 6, 64, 47, 3922, 24, 7, 162, 116, 50118, 41152, 35, 1525, 768, 4, 1590, 5, 7089, 6, 47, 40, 28, 2325, 11, 5, 314, 30972, 5044, 1792, 20818, 737, 223, 937, 40687, 4, 166, 40, 172, 146, 10, 204, 12, 13753, 3024, 5853, 10699, 13, 5, 1049, 4103, 11, 5, 2958, 3222, 10111, 337, 980, 23, 5, 34988, 18884, 31867, 516, 4, 50118, 18276, 4843, 35, 8487, 6, 38, 192, 4, 50118, 41152, 35, 83, 7725, 31914, 368, 40, 28, 341, 7, 1157, 5, 43576, 9, 10, 9538, 43500, 1043, 31468, 6, 253, 40229, 32226, 1182, 5206, 2187, 6, 

In [15]:
# Training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results_distilbart_gcn",  # Directory to save checkpoints
    num_train_epochs=3,                    # Number of training epochs
    per_device_train_batch_size=4,         # Batch size
    gradient_accumulation_steps=2,         # Accumulate gradients for larger effective batch size
    eval_steps=500,                        # Perform evaluation every 500 steps
    save_steps=500,                        # Save model checkpoint every 500 steps
    learning_rate=5e-5,                    # Learning rate
    predict_with_generate=True,            # Generate text during evaluation
    logging_dir="./logs",                  # Directory for logs
    logging_steps=100,                     # Log every 100 steps
)


In [16]:
from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer, EarlyStoppingCallback

training_args = Seq2SeqTrainingArguments(
    output_dir="./results_distilbart_gcn",  # Directory to save checkpoints
    num_train_epochs=3,                    # Number of training epochs
    per_device_train_batch_size=4,         # Batch size
    gradient_accumulation_steps=2,         # Accumulate gradients for larger effective batch size
    eval_steps=500,                        # Perform evaluation every 500 steps
    save_steps=500,                        # Save model checkpoint every 500 steps
    save_total_limit=2,                    # Keep only the 2 most recent checkpoints
    learning_rate=5e-5,                    # Learning rate
    predict_with_generate=True,            # Generate text during evaluation
    logging_dir="./logs",                  # Directory for logs
    logging_steps=100,                     # Log every 100 steps
    eval_strategy="steps",                 # Enable evaluation during training
    fp16=True,                             # Enable mixed precision training
    load_best_model_at_end=True            # Load the best model at the end
)

early_stopping_callback = EarlyStoppingCallback(
    early_stopping_patience=3  # Stop training if no improvement after 3 evaluations
)

In [17]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    callbacks=[early_stopping_callback]
)

# Train the model
trainer.train()

  trainer = Seq2SeqTrainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Step,Training Loss,Validation Loss
500,1.21,1.091402
1000,1.045,1.007082
1500,1.0084,0.95812
2000,0.9686,0.931377
2500,0.8854,0.909308
3000,0.8721,0.893841
3500,0.8517,0.888223
4000,0.8374,0.871458
4500,0.8367,0.864955
5000,0.7799,0.871898




TrainOutput(global_step=6759, training_loss=0.9163678097714332, metrics={'train_runtime': 1677.3051, 'train_samples_per_second': 32.234, 'train_steps_per_second': 4.03, 'total_flos': 7.35551497396224e+16, 'train_loss': 0.9163678097714332, 'epoch': 3.0})

In [18]:
# Evaluate the model on the validation set
results = trainer.evaluate(eval_dataset=tokenized_datasets["validation"])
print("Evaluation Results:", results)

Evaluation Results: {'eval_loss': 0.8569899201393127, 'eval_runtime': 7.5418, 'eval_samples_per_second': 129.943, 'eval_steps_per_second': 16.309, 'epoch': 3.0}


In [19]:

import shutil

# Save the model and tokenizer locally
model.save_pretrained("./distilbart_gcn", safe_serialization=True)
tokenizer.save_pretrained("./distilbart_gcn")

# Path where the model is saved locally
local_model_path = "/content/distilbart_gcn"  # Corrected this path to match the save_pretrained directory

# Path in Google Drive where you want to save the model
drive_model_path = "/content/drive/My Drive/DistilBARTFolder_GCN"

# Copy the entire directory to Google Drive
shutil.copytree(local_model_path, drive_model_path)

print("Model directory uploaded to Google Drive!")

Model directory uploaded to Google Drive!


In [20]:
def generate_prediction(input_text):
    '''function to generate predictions from input_text, this case text dataset.
    This will be used for human evaluation'''
    inputs = tokenizer(input_text, return_tensors="pt", max_length=900, truncation=True, padding=True)

    # Move input tensors to the same device as the model
    input_ids = inputs["input_ids"].to(model.device)  # Assuming model is on GPU

    outputs = model.generate(
        input_ids,
        max_length=512,  # Adjust based on your expected output length
        do_sample=True,  # Enable sampling
        top_k=50,        # Top-k sampling for diversity
        top_p=0.95,      # Nucleus sampling
        temperature=1.0, # Controls randomness
        num_beams=4,     # Beam search for better predictions
        length_penalty=2.0,
        early_stopping=True
    )
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction

In [21]:
#measure entire cell process time
%%time
import random

# Set a random seed for reproducibility
random.seed(42)
# Number of samples to display
num_samples = 10

# Select random samples from the Hugging Face Dataset
# random_samples = dataset["test"].shuffle(seed=42).select(range(num_samples))
subset_samples = dataset["test"].select(range(num_samples))

#empty list to save outputs, will be used to export to Google Drive below
results = []

# Generate predictions and print results
for idx, row in enumerate(subset_samples):
    input_text = row["input_text"]
    target_text = row["target_text"]
    prediction = generate_prediction(input_text)

    # print(f"\nSample {idx + 1}:")
    # print(f"Input Text:\n{input_text}\n")
    # print(f"Target Text (Ground Truth):\n{target_text}\n")
    # print(f"Model Prediction:\n{prediction}\n")
    # print("-" * 80)

    # Append to results
    results.append({
        "Input Text": input_text,
        "Target Text": target_text,
        "Model Prediction": prediction
    })

    # Optional: Print progress
    print(f"Processed Sample {idx + 1}/{len(subset_samples)}")


Processed Sample 1/10
Processed Sample 2/10
Processed Sample 3/10
Processed Sample 4/10
Processed Sample 5/10
Processed Sample 6/10
Processed Sample 7/10
Processed Sample 8/10
Processed Sample 9/10
Processed Sample 10/10
CPU times: user 48.7 s, sys: 140 ms, total: 48.9 s
Wall time: 49.1 s


##4.1 Human Evaluation

In [22]:
# Convert results to a DataFrame
df = pd.DataFrame(results)

# Define the output path in Google Drive
output_path = "/content/drive/My Drive/DistilBART_GNC_HE.csv"

# Save to CSV
df.to_csv(output_path, index=False)

print(f"Results saved to {output_path}")

Results saved to /content/drive/My Drive/DistilBART_GNC_HE.csv


##4.2 ROUGE Evaluation

In [23]:
#load rouge metric
rouge = load("rouge") #lrouge metric using load function
#gpu to local device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

#move model to local
model.to(device)

#generate predictions and references
predictions = []
references = []
for row in dataset["test"]:  # Iterate through the dataset
    input_text = row["input_text"]
    target_text = row["target_text"]  # Extract the target text
    prediction = generate_prediction(input_text)
    predictions.append(prediction)
    references.append(target_text)

#calcuating rouge score
rouge_results = rouge.compute(predictions=predictions, references=references)
print("ROUGE Results:", rouge_results)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

ROUGE Results: {'rouge1': 0.662220052098518, 'rouge2': 0.4037440578537126, 'rougeL': 0.49209870881728723, 'rougeLsum': 0.5877912404693673}


##4.3 BLEU Evaluation

In [24]:
# ipython-input-16-341afa8f0746
#use generate_prediction function
# predictions, references = generate_predictions(dataset["test"], model, tokenizer, device)
predictions = []
references = []
for row in dataset["test"]:
  prediction = generate_prediction(row['input_text'])
  predictions.append(prediction)
  references.append(row['target_text'])


#BLEU expects references as a list of lists
references = [[ref] for ref in references]

#calculating BLEU score
bleu_score = sacrebleu.corpus_bleu(predictions, references)
print("BLEU Score:", bleu_score.score)

BLEU Score: 71.85994740624416


##4.4 BERT Score Evaluation

In [25]:
#BERT Score function
def evaluate_bertscore(predictions, references, lang="en"):
    # Calculate BERTScore
    P, R, F1 = bert_score.score(predictions, references, lang=lang, verbose=True)

    # Return precision, recall, and F1 as average scores
    return {
        "BERTScore Precision": P.mean().item(),
        "BERTScore Recall": R.mean().item(),
        "BERTScore F1": F1.mean().item()
    }

print(evaluate_bertscore(predictions, references))

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/8 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/4 [00:00<?, ?it/s]

done in 8.41 seconds, 29.74 sentences/sec
{'BERTScore Precision': 0.9162518382072449, 'BERTScore Recall': 0.920674741268158, 'BERTScore F1': 0.91841721534729}
