#Dataset

In [None]:
!pip install --upgrade datasets

from datasets import load_dataset
dataset = load_dataset("midas/inspec", "raw")



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
train_sample = dataset["train"][0]
print("Fields in the sample: ", [key for key in train_sample.keys()])
print("Tokenized Document: ", train_sample["document"])
print("Document BIO Tags: ", train_sample["doc_bio_tags"])
print("Extractive/present Keyphrases: ", train_sample["extractive_keyphrases"])
print("Abstractive/absent Keyphrases: ", train_sample["abstractive_keyphrases"])
print("\n-----------\n")

# sample from the validation split
print("Sample from validation dataset split")
validation_sample = dataset["validation"][0]
print("Fields in the sample: ", [key for key in validation_sample.keys()])
print("Tokenized Document: ", validation_sample["document"])
print("Document BIO Tags: ", validation_sample["doc_bio_tags"])
print("Extractive/present Keyphrases: ", validation_sample["extractive_keyphrases"])
print("Abstractive/absent Keyphrases: ", validation_sample["abstractive_keyphrases"])
print("\n-----------\n")

# sample from the test split
print("Sample from test dataset split")
test_sample = dataset["test"][0]
print("Fields in the sample: ", [key for key in test_sample.keys()])
print("Tokenized Document: ", test_sample["document"])
print("Document BIO Tags: ", test_sample["doc_bio_tags"])
print("Extractive/present Keyphrases: ", test_sample["extractive_keyphrases"])
print("Abstractive/absent Keyphrases: ", test_sample["abstractive_keyphrases"])
print("\n-----------\n")

Fields in the sample:  ['id', 'document', 'doc_bio_tags', 'extractive_keyphrases', 'abstractive_keyphrases', 'other_metadata']
Tokenized Document:  ['A', 'conflict', 'between', 'language', 'and', 'atomistic', 'information', 'Fred', 'Dretske', 'and', 'Jerry', 'Fodor', 'are', 'responsible', 'for', 'popularizing', 'three', 'well-known', 'theses', 'in', 'contemporary', 'philosophy', 'of', 'mind', ':', 'the', 'thesis', 'of', 'Information-Based', 'Semantics', '-LRB-', 'IBS', '-RRB-', ',', 'the', 'thesis', 'of', 'Content', 'Atomism', '-LRB-', 'Atomism', '-RRB-', 'and', 'the', 'thesis', 'of', 'the', 'Language', 'of', 'Thought', '-LRB-', 'LOT', '-RRB-', '.', 'LOT', 'concerns', 'the', 'semantically', 'relevant', 'structure', 'of', 'representations', 'involved', 'in', 'cognitive', 'states', 'such', 'as', 'beliefs', 'and', 'desires', '.', 'It', 'maintains', 'that', 'all', 'such', 'representations', 'must', 'have', 'syntactic', 'structures', 'mirroring', 'the', 'structure', 'of', 'their', 'contents

# Before Fine tuning

In [None]:
!pip install --upgrade evaluate rouge_score

from datasets import load_dataset
import evaluate
from transformers import PegasusForConditionalGeneration, PegasusTokenizer
import torch
!pip install evaluate rouge_score
from evaluate import load
import numpy as np



In [None]:
rouge = evaluate.load("rouge")

# Load Pegasus model and tokenizer
model_name = "google/pegasus-xsum"
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(device)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/87.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.52M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-xsum and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/259 [00:00<?, ?B/s]

In [None]:
def generate_texts(documents, prefix="", max_length=60, num_beams=5):
    generated_texts = []
    for doc_tokens in documents:
        # Join tokens into a string
        doc_text = " ".join(doc_tokens) if isinstance(doc_tokens, list) else doc_tokens
        input_text = prefix + doc_text

        inputs = tokenizer(input_text, return_tensors="pt", truncation=True, padding="longest").to(device)
        outputs = model.generate(**inputs, max_length=max_length, num_beams=num_beams)
        decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_texts.append(decoded)
    return generated_texts


In [None]:
# Prepare documents and references for testing
test_documents = dataset['test']['document']  # list of token lists
reference_keyphrases = dataset['test']['abstractive_keyphrases']  # list of lists of phrases

# Join reference keyphrases as a single string (ROUGE expects string references)
reference_keyphrases_joined = [" ".join(kps) for kps in reference_keyphrases]

# Generate keyphrases with prefix
generated_keyphrases = generate_texts(test_documents, prefix="keyphrases: ")

# Generate summaries without prefix
generated_summaries = generate_texts(test_documents, prefix=":")

In [None]:
print("Sample document tokens:")
print(test_documents[0])

print("\nGenerated Summary:")
print(generated_summaries[0])

print("\nGenerated Keyphrases:")
print(generated_keyphrases[0])

print("\nReference Keyphrases:")
print(reference_keyphrases_joined[0])

# Compute ROUGE for keyphrases
rouge_scores = rouge.compute(predictions=generated_keyphrases, references=reference_keyphrases_joined)

print("\nROUGE Scores for Keyphrases:")
for k, v in rouge_scores.items():
    print(f"{k}: {v:.4f}")

Sample document tokens:
['A', 'new', 'graphical', 'user', 'interface', 'for', 'fast', 'construction', 'of', 'computation', 'phantoms', 'and', 'MCNP', 'calculations', ':', 'application', 'to', 'calibration', 'of', 'in', 'vivo', 'measurement', 'systems', 'Reports', 'on', 'a', 'new', 'utility', 'for', 'development', 'of', 'computational', 'phantoms', 'for', 'Monte', 'Carlo', 'calculations', 'and', 'data', 'analysis', 'for', 'in', 'vivo', 'measurements', 'of', 'radionuclides', 'deposited', 'in', 'tissues', '.', 'The', 'individual', 'properties', 'of', 'each', 'worker', 'can', 'be', 'acquired', 'for', 'a', 'rather', 'precise', 'geometric', 'representation', 'of', 'his', '-LRB-', 'her', '-RRB-', 'anatomy', ',', 'which', 'is', 'particularly', 'important', 'for', 'low', 'energy', 'gamma', 'ray', 'emitting', 'sources', 'such', 'as', 'thorium', ',', 'uranium', ',', 'plutonium', 'and', 'other', 'actinides', '.', 'The', 'software', 'enables', 'automatic', 'creation', 'of', 'an', 'MCNP', 'input', '

# Fine tuned

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

model_name = "google/pegasus-large"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

In [None]:
from transformers import AutoTokenizer

def tokenize_function(examples):
    # Join the token list into plain string
    inputs = [" ".join(doc) for doc in examples["document"]]
    # Join target keyphrases into a single string (separated by '; ')
    targets = ["; ".join(keys) for keys in examples["abstractive_keyphrases"]]

    # Tokenize the inputs and targets
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=64, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Tokenize all splits
tokenized_datasets = dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
from peft import get_peft_model, LoraConfig, TaskType

# Create LoRA configuration
lora_config = LoraConfig(
    r=8,                         # LoRA rank
    lora_alpha=32,               # LoRA scaling factor
    target_modules=["q_proj", "v_proj"]  , # LoRA target modules; common for Pegasus
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM  # For Pegasus-style encoder-decoder
)

# Wrap model with LoRA
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 1,572,864 || all params: 572,369,920 || trainable%: 0.2748


In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-6)

scaler = torch.cuda.amp.GradScaler(enabled=True)

  scaler = torch.cuda.amp.GradScaler(enabled=True)


In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",  # Output directory for logs and checkpoints
    eval_strategy="epoch",
    # learning_rate=1e-4,
    # Reduce batch size
    per_device_train_batch_size=4, # Reduced from 16
    per_device_eval_batch_size=4,
    num_train_epochs=16,
    weight_decay=0.01,
    save_total_limit=3,
    predict_with_generate=True,
    fp16=True,  # Set to True if your GPU supports FP16 (recommended for memory saving)
    # Add gradient accumulation steps
    gradient_accumulation_steps=8, # Accumulate gradients over 2 steps
    logging_steps=100,
    #lr_scheduler_type='linear', # Learning rate scheduler
    #warmup_steps=100,
)

# Initialize the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    optimizers=(optimizer, None),
)

# Continue with trainer.train()

  trainer = Seq2SeqTrainer(
No label_names provided for model class `PeftModelForSeq2SeqLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,12.097081
2,No log,12.093914
3,No log,12.090529
4,11.795400,12.086926
5,11.795400,12.08281
6,11.795400,12.07901
7,11.525800,12.075549
8,11.525800,12.072231
9,11.525800,12.069525
10,11.558200,12.067349


TrainOutput(global_step=496, training_loss=11.636766003024194, metrics={'train_runtime': 1632.1052, 'train_samples_per_second': 9.803, 'train_steps_per_second': 0.304, 'total_flos': 2.5094852752441344e+16, 'train_loss': 11.636766003024194, 'epoch': 15.512})

In [None]:
trainer.save_model("./results/final_checkpoint")

In [None]:
!ls ./results

checkpoint-496	final_checkpoint  runs


In [None]:
# Save the fine-tuned model with the PEFT adapter
trainer.save_model("./results/final_checkpoint")

In [None]:
from peft import PeftModel
# Load the base model
base_model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")

# Load the LoRA adapter on top of the base model
finetuned_model = PeftModel.from_pretrained(base_model, "./results/final_checkpoint")

# Load the tokenizer
finetuned_tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")

# Move the model to the appropriate device
device = "cuda" if torch.cuda.is_available() else "cpu"
finetuned_model.to(device)

def generate_texts_finetuned(documents, prefix="", max_length=60, num_beams=5):
    generated_texts = []
    for doc_tokens in documents:
        # Join tokens into a string
        doc_text = " ".join(doc_tokens) if isinstance(doc_tokens, list) else doc_tokens
        input_text = prefix + doc_text

        inputs = finetuned_tokenizer(input_text, return_tensors="pt", truncation=True, padding="longest").to(device)
        outputs = finetuned_model.generate(**inputs, max_length=max_length, num_beams=num_beams)
        decoded = finetuned_tokenizer.decode(outputs[0], skip_special_tokens=True)
        generated_texts.append(decoded)
    return generated_texts

generated_keyphrases_finetuned = generate_texts_finetuned(test_documents, prefix="keyphrases: ")


generated_summaries_finetuned = generate_texts_finetuned(test_documents, prefix=":") # Using ":" based on your original code

rouge_scores_keyphrases_finetuned = rouge.compute(predictions=generated_keyphrases_finetuned, references=reference_keyphrases_joined)

print("\nROUGE Scores for Fine-tuned Keyphrases:")
for k, v in rouge_scores_keyphrases_finetuned.items():
    print(f"{k}: {v:.4f}")

print("\nSample Generated Summary (Fine-tuned):")
print(generated_summaries_finetuned[0])

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



ROUGE Scores for Fine-tuned Keyphrases:
rouge1: 0.1506
rouge2: 0.0737
rougeL: 0.1296
rougeLsum: 0.1293

Sample Generated Summary (Fine-tuned):
The utility was tested for low energy emitters in plastic and biological tissues as well as for computed tomography and magnetic resonance imaging scanning information


In [None]:
print("\nGenerated Keyphrases:")
print(generated_keyphrases_finetuned[0])


Generated Keyphrases:
The utility was tested for low energy emitters in plastic and biological tissues as well as for computed tomography and magnetic resonance imaging scanning information
