In [None]:
!pip install datasets
!pip install transformers
!pip install py7zr
!pip install sentencepiece
!pip install rouge-score
!pip install bert-score
!pip install git+https://github.com/google-research/bleurt.git
!pip install nltk

!wget https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
!unzip bleurt-base-128.zip

Collecting datasets
  Downloading datasets-2.14.1-py3-none-any.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.4/492.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0.0,>=0.14.0 (from datasets)
  Downloading huggingface_hub-0.16.4-py3-none-a

In [None]:
import torch
import numpy as np
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from bleurt import score as bleurt_score
from nltk.translate.bleu_score import sentence_bleu

In [None]:
from datasets import load_metric
rouge = load_metric('rouge')
bleurt_metric = load_metric('bleurt')

from nltk.translate.bleu_score import sentence_bleu

import bert_score


def calculate_scores(model, dataloader, tokenizer, device):
    model.eval()

    rouge_scores_list = []
    total_bert_score = 0
    total_bleurt_score = 0
    total_bleu_score = 0
    num_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, target_ids = [b.to(device) for b in batch]

            summary_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)
            pred_summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
            real_summaries = tokenizer.batch_decode(target_ids, skip_special_tokens=True)

            # ROUGE Scores
            rouge_scores = rouge.compute(predictions=pred_summaries, references=real_summaries)
            rouge_scores_list.append(rouge_scores)

            # BERTScore
            P, R, bert_scores = bert_score.score(pred_summaries, real_summaries, lang="en", model_type='bert-base-uncased', rescale_with_baseline=True)
            total_bert_score += torch.mean(bert_scores).item()

            # BLEURT Scores
            bleurt_scores = bleurt_metric.compute(predictions=pred_summaries, references=real_summaries)['scores']
            total_bleurt_score += np.mean(bleurt_scores)

            # BLEU Scores
            bleu_scores = [sentence_bleu([ref], pred, weights=(1.0, 0, 0, 0)) for ref, pred in zip(real_summaries, pred_summaries)]
            total_bleu_score += np.mean(bleu_scores)

            num_batches += 1

    average_bert_score = total_bert_score / num_batches
    average_bleurt_score = total_bleurt_score / num_batches
    average_bleu_score = total_bleu_score / num_batches

    # Calculating average ROUGE scores
    rouge1 = np.mean([score['rouge1'].mid.fmeasure for score in rouge_scores_list])
    rouge2 = np.mean([score['rouge2'].mid.fmeasure for score in rouge_scores_list])
    rougeL = np.mean([score['rougeL'].mid.fmeasure for score in rouge_scores_list])

    overall_rouge = np.mean([rouge1, rouge2, rougeL])


    average_scores = {'rouge1': rouge1, 'rouge2': rouge2, 'rougeL': rougeL, 'overall_rouge': overall_rouge, 'bert': average_bert_score, 'bleurt': average_bleurt_score, 'bleu': average_bleu_score}

    return average_scores



In [None]:
from datasets import load_metric
rouge = load_metric('rouge')
from sklearn.model_selection import train_test_split
bleurt_metric = load_metric('bleurt')

def fine_tune(model, tokenizer, dataset):
    num_epochs = 5
    batch_size = 3
    learning_rate = 1e-5

    train_dataset = dataset['train']
    validation_dataset = dataset['validation']
    test_dataset = dataset['test']

    tokenized_inputs_train = tokenizer(train_dataset['dialogue'], truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    tokenized_targets_train = tokenizer(train_dataset['summary'], truncation=True, padding='max_length', max_length=64, return_tensors='pt')

    tokenized_inputs_val = tokenizer(validation_dataset['dialogue'], truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    tokenized_targets_val = tokenizer(validation_dataset['summary'], truncation=True, padding='max_length', max_length=64, return_tensors='pt')

    tokenized_inputs_test = tokenizer(test_dataset['dialogue'], truncation=True, padding='max_length', max_length=128, return_tensors='pt')
    tokenized_targets_test = tokenizer(test_dataset['summary'], truncation=True, padding='max_length', max_length=64, return_tensors='pt')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    train_dataset = torch.utils.data.TensorDataset(tokenized_inputs_train['input_ids'], tokenized_inputs_train['attention_mask'], tokenized_targets_train['input_ids'])
    val_dataset = torch.utils.data.TensorDataset(tokenized_inputs_val['input_ids'], tokenized_inputs_val['attention_mask'], tokenized_targets_val['input_ids'])
    test_dataset = torch.utils.data.TensorDataset(tokenized_inputs_test['input_ids'], tokenized_inputs_test['attention_mask'], tokenized_targets_test['input_ids'])

    train_loader = DataLoader(train_dataset, batch_size=batch_size)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        for batch in train_loader:
            input_ids, attention_mask, target_ids = [b.to(device) for b in batch]

            decoder_input_ids = target_ids[:, :-1].contiguous()
            labels = target_ids[:, 1:].clone().detach()
            labels[target_ids[:, 1:] == tokenizer.pad_token_id] = -100

            optimizer.zero_grad()

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, labels=labels)
            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()
            optimizer.step()

        average_train_loss = total_train_loss / len(train_loader)

        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attention_mask, target_ids = [b.to(device) for b in batch]

                decoder_input_ids = target_ids[:, :-1].contiguous()
                labels = target_ids[:, 1:].clone().detach()
                labels[target_ids[:, 1:] == tokenizer.pad_token_id] = -100

                outputs = model(input_ids=input_ids, attention_mask=attention_mask, decoder_input_ids=decoder_input_ids, labels=labels)
                loss = outputs.loss
                total_val_loss += loss.item()

        average_val_loss = total_val_loss / len(val_loader)

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Average Training Loss: {average_train_loss}')
        print(f'Average Validation Loss: {average_val_loss}')

    return model, val_loader,test_loader





In [None]:
# Load the dataset
dataset = load_dataset('samsum')

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
model_name = 'google/pegasus-large'
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/3.09k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-large and are newly initialized: ['model.encoder.embed_positions.weight', 'model.decoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading (…)neration_config.json:   0%|          | 0.00/260 [00:00<?, ?B/s]

In [None]:
# Call the function to fine-tune the model
fine_tuned_model, val_loader,test_loader = fine_tune(model, tokenizer, dataset)




Epoch 1/5
Average Training Loss: 1.3474608975912252
Average Validation Loss: 1.5883903453201602
Epoch 2/5
Average Training Loss: 1.317011929965024
Average Validation Loss: 1.5964932323812129
Epoch 3/5
Average Training Loss: 1.287001117302332
Average Validation Loss: 1.6058561651261298
Epoch 4/5
Average Training Loss: 1.2588802863468442
Average Validation Loss: 1.6083259263754763
Epoch 5/5
Average Training Loss: 1.2287679203802835
Average Validation Loss: 1.618514169703473


In [None]:
fine_tuned_model.save_pretrained('fine_tuned_pegasus_large')

In [None]:
# Calculate scores on the test set
scores = calculate_scores(fine_tuned_model, test_loader, tokenizer,device)




The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [None]:
# Print out the scores
print(f'ROUGE-1 Score: {scores["rouge1"]}')
print(f'ROUGE-2 Score: {scores["rouge2"]}')
print(f'ROUGE-L Score: {scores["rougeL"]}')
print(f'Overall ROUGE Score: {scores["overall_rouge"]}')
print(f'BERTScore: {scores["bert"]}')
print(f'BLEURT Score: {scores["bleurt"]}')
print(f'BLEU Score: {scores["bleu"]}')


ROUGE-1 Score: 0.46610802080105634
ROUGE-2 Score: 0.24992436560626163
ROUGE-L Score: 0.3874400535216279
Overall ROUGE Score: 0.3678241466429819
BERTScore: 0.4788511668915277
BLEURT Score: -0.6239330146975712
BLEU Score: 0.5282755938961953


In [None]:
def summarize_text(text, model, tokenizer,device):
    # Input Tokenization
    input_ids = tokenizer.encode(text, truncation=True, max_length=512, return_tensors='pt')

    # Move input_ids to device
    input_ids = input_ids.to(device)


    # Generating summary
    summary_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)

    # Decoding the summary tokens back to text
    summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)

    return summary

In [None]:
# Select a sample from the dataset
sample = dataset['test'][0]
input_text = sample['dialogue']

In [None]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary = summarize_text(input_text, fine_tuned_model, tokenizer,device)


In [None]:
  print("Summary:", summary)

Summary: can't find Betty's number. Larry called her last time they were at the park together. Hannah would rather Amanda texted Larry.


In [None]:
pip install rouge_score



In [None]:
from datasets import load_metric


In [None]:
# ROUGE metric
rouge_metric = load_metric("rouge")


In [None]:
# Calculating ROUGE scores
references = [sample['summary']]
predictions = [summary]
rouge_scores = rouge_metric.compute(predictions=predictions, references=references)

In [None]:
# Print ROUGE scores
print("ROUGE scores:", rouge_scores)

ROUGE scores: {'rouge1': AggregateScore(low=Score(precision=0.7142857142857143, recall=0.3125, fmeasure=0.43478260869565216), mid=Score(precision=0.7142857142857143, recall=0.3125, fmeasure=0.43478260869565216), high=Score(precision=0.7142857142857143, recall=0.3125, fmeasure=0.43478260869565216)), 'rouge2': AggregateScore(low=Score(precision=0.3333333333333333, recall=0.13333333333333333, fmeasure=0.19047619047619044), mid=Score(precision=0.3333333333333333, recall=0.13333333333333333, fmeasure=0.19047619047619044), high=Score(precision=0.3333333333333333, recall=0.13333333333333333, fmeasure=0.19047619047619044)), 'rougeL': AggregateScore(low=Score(precision=0.7142857142857143, recall=0.3125, fmeasure=0.43478260869565216), mid=Score(precision=0.7142857142857143, recall=0.3125, fmeasure=0.43478260869565216), high=Score(precision=0.7142857142857143, recall=0.3125, fmeasure=0.43478260869565216)), 'rougeLsum': AggregateScore(low=Score(precision=0.7142857142857143, recall=0.3125, fmeasure