In [2]:
!pip install datasets
!pip install transformers
!pip install py7zr
!pip install sentencepiece
!pip install rouge-score
!pip install bert-score
!pip install git+https://github.com/google-research/bleurt.git
!pip install nltk
!pip install rouge
!pip install bleurt
!pip install sentencepiece

!wget https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
!unzip bleurt-base-128.zip

Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /tmp/pip-req-build-0woomhr_
  Running command git clone --filter=blob:none --quiet https://github.com/google-research/bleurt.git /tmp/pip-req-build-0woomhr_
  Resolved https://github.com/google-research/bleurt.git to commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
  Preparing metadata (setup.py) ... [?25l[?25hdone
--2023-07-31 08:56:21--  https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 74.125.68.128, 64.233.170.128, 74.125.24.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|74.125.68.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405489453 (387M) [application/zip]
Saving to: ‘bleurt-base-128.zip.3’


2023-07-31 08:56:40 (21.3 MB/s) - ‘bleurt-base-128.zip.3’ saved [405489453/405489453]

Archive:  bleurt-base-128.zip
replace bleurt-base-128/v

In [1]:
import torch
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW
from datasets import load_dataset
from nltk.translate.bleu_score import corpus_bleu
from rouge_score import rouge_scorer
from bleurt import score
from bert_score import BERTScorer
import numpy as np
from nltk.translate.bleu_score import sentence_bleu


from transformers import T5Tokenizer


In [2]:
# Load the dataset
dataset = load_dataset("samsum")
train_data = dataset["train"]
val_data = dataset["validation"]

In [3]:
# Load the pre-trained T5-base model and tokenizer
model = T5ForConditionalGeneration.from_pretrained("t5-base")
tokenizer = T5Tokenizer.from_pretrained('t5-base')

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This means that tokens that come after special tokens will not be properly handled. We recommend you to read the related pull request available at https://github.com/huggingface/transformers/pull/24565


In [4]:
def preprocess_function(examples):
    inputs = [f'dialogue: {d}' for d in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=256, truncation=True, padding="max_length")
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["summary"], max_length=128, truncation=True, padding="max_length")

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [5]:
# Apply the preprocessing function to the train and validation data
train_data = train_data.map(preprocess_function, batched=True)
val_data = val_data.map(preprocess_function, batched=True)



# Define the optimizer
optimizer = AdamW(model.parameters(), lr=1e-4)

# Fine-tune the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



T5ForConditionalGeneration(
  (shared): Embedding(32128, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32128, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [6]:
def collate_fn(batch):
    input_ids = []
    attention_mask = []
    labels = []

    for item in batch:
        input_ids.append(torch.tensor(item["input_ids"]))
        attention_mask.append(torch.tensor(item["attention_mask"]))
        labels.append(torch.tensor(item["labels"]))

    return {
        "input_ids": torch.stack(input_ids, dim=0),
        "attention_mask": torch.stack(attention_mask, dim=0),
        "labels": torch.stack(labels, dim=0)
    }

# Create data loaders
batch_size = 8  # Define the batch size as desired
train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_data, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

num_epochs = 5
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode for each epoch
    total_loss = 0.0
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_train_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Training Loss: {avg_train_loss}")

    model.eval()
    total_val_loss = 0.0
    for batch in val_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        with torch.no_grad():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Validation Loss: {avg_val_loss}")


Epoch 1/5, Training Loss: 0.45939115569982414
Epoch 1/5, Validation Loss: 0.33070452352171964
Epoch 2/5, Training Loss: 0.3410264914501438
Epoch 2/5, Validation Loss: 0.3271441296177003
Epoch 3/5, Training Loss: 0.30942126234890194
Epoch 3/5, Validation Loss: 0.3210155185565208
Epoch 4/5, Training Loss: 0.2826859116651336
Epoch 4/5, Validation Loss: 0.32964357892865115
Epoch 5/5, Training Loss: 0.2586086437098043
Epoch 5/5, Validation Loss: 0.3296921192442329


In [33]:
test_data = dataset["test"]
test_data = test_data.map(preprocess_function, batched=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)


In [19]:
# Save the fine-tuned model after all epochs
model.save_pretrained('t5-base')

In [20]:
# Function to generate summaries using the fine-tuned model
def summarize_text(text, model, tokenizer, device):
    # Input Tokenization
    input_ids = tokenizer.encode(text, truncation=True, max_length=256, return_tensors='pt')

    # Move input_ids to device
    input_ids = input_ids.to(device)

    # Generating summary
    summary_ids = model.generate(input_ids, max_length=128, num_beams=4, early_stopping=True)

    # Decoding the summary tokens back to text
    summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)

    return summary


In [21]:
# Select a sample from the dataset
sample = dataset['test'][0]
input_text = sample['dialogue']

In [22]:

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary = summarize_text(input_text, fine_tuned_model, tokenizer,device)


In [23]:
  print("Summary:", summary)

Summary: Amanda is looking for Betty's number. Larry called her last time they were at the park together.


In [29]:
# Calculate the performance measures
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
bleurt_scorer = score.BleurtScorer("bleurt-base-128")
bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)

references = []  # List to store the ground truth summaries
predictions = []  # List to store the predicted summaries

for batch in test_dataloader:
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)

    # Convert the tensors to text using the tokenizer
    inputs = tokenizer.batch_decode(input_ids, skip_special_tokens=True)
    targets = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Add the texts to the corresponding lists
    references.extend(targets)

    # Generate the summaries and add them to the list
    with torch.no_grad():
        output_ids = fine_tuned_model.generate(input_ids=input_ids, attention_mask=attention_mask)
        outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
        predictions.extend(outputs)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
print(f"Number of predictions: {len(predictions)}")
print(f"Number of references: {len(references)}")


Number of predictions: 819
Number of references: 819


In [31]:
import numpy as np
from rouge import Rouge
from bleurt import score
from bert_score import BERTScorer
# Assume that we have our predicted summaries and reference summaries
#predictions = []  # Your list of predicted summaries
#references = []  # Your list of reference summaries



# Instantiate the metric calculators
rouge = Rouge()
#bleurt_scorer = score.BleurtScorer('bleurt/test_checkpoint')
bert_scorer = BERTScorer(lang="en", rescale_with_baseline=True)

# Calculate ROUGE
rouge_scores = rouge.get_scores(predictions, references, avg=True)
rouge1_f = rouge_scores['rouge-1']['f']
rouge2_f = rouge_scores['rouge-2']['f']
rougeL_f = rouge_scores['rouge-l']['f']
overall_rouge = (rouge1_f + rouge2_f + rougeL_f) / 3

# Calculate BLEURT
bleurt_scores = bleurt_scorer.score(references=references, candidates=predictions)
mean_bleurt_score = np.mean(bleurt_scores)

# Calculate BERTScore
P, R, F1 = bert_scorer.score(predictions, references)
mean_bert_score = F1.mean().item()

# Calculate BLEU Score
bleu_scores = [sentence_bleu([ref], pred, weights=(1.0, 0, 0, 0)) for ref, pred in zip(references, predictions)]
mean_bleu_score = np.mean(bleu_scores)






Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [32]:
print("ROUGE-1 F-Measure:", rouge1_f)
print("ROUGE-2 F-Measure:", rouge2_f)
print("ROUGE-L F-Measure:", rougeL_f)
print("Overall ROUGE Score:", overall_rouge)
print("BLEURT Score:", mean_bleurt_score)
print('BERTScore: {:.4f}'.format(mean_bert_score))
print('BLEU Score: {:.4f}'.format(mean_bleu_score))

ROUGE-1 F-Measure: 0.4489576388620427
ROUGE-2 F-Measure: 0.20606424087671885
ROUGE-L F-Measure: 0.41766134455839976
Overall ROUGE Score: 0.35756107476572047
BLEURT Score: -0.4335427177894829
BERTScore: 0.4668
BLEU Score: 0.5047
