In [None]:
!pip install datasets transformers py7zr sentencepiece rouge-score bert-score nltk

!pip install git+https://github.com/google-research/bleurt.git

!wget https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
!unzip bleurt-base-128.zip

Collecting git+https://github.com/google-research/bleurt.git
  Cloning https://github.com/google-research/bleurt.git to /tmp/pip-req-build-u7v_m3_0
  Running command git clone --filter=blob:none --quiet https://github.com/google-research/bleurt.git /tmp/pip-req-build-u7v_m3_0
  Resolved https://github.com/google-research/bleurt.git to commit cebe7e6f996b40910cfaa520a63db47807e3bf5c
  Preparing metadata (setup.py) ... [?25l[?25hdone
--2023-07-31 14:43:24--  https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.31.128, 142.251.18.128, 142.250.153.128, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.31.128|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 405489453 (387M) [application/zip]
Saving to: ‘bleurt-base-128.zip.1’


2023-07-31 14:43:35 (36.3 MB/s) - ‘bleurt-base-128.zip.1’ saved [405489453/405489453]

Archive:  bleurt-base-128.zip
replace bleurt-base-1

In [None]:
import torch
import numpy as np
from transformers import AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_metric

In [None]:
# Load the SAMSum dataset; the result is bound to `dataset` and is indexed by
# split name ('train', 'validation', 'test') further down in the notebook.
dataset = load_dataset('samsum')

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
# Display the DatasetDict to inspect the available splits, columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [None]:
# Load the pretrained GPT-2 tokenizer and language-model head.
from transformers import GPT2Tokenizer, GPT2LMHeadModel
# padding_side='left' puts padding before the text, so the real tokens sit at the
# end of each row (presumably chosen for batched generation - confirm).
tokenizer = GPT2Tokenizer.from_pretrained('gpt2', padding_side='left')
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [None]:
# Register '</s>' as the end-of-sequence token and '[PAD]' as the padding token.
# NOTE(review): nothing in this cell updates model.config.eos_token_id /
# pad_token_id to the new ids - confirm whether generation relies on them.
tokenizer.add_special_tokens({'eos_token': '</s>', 'pad_token': '[PAD]'})
# Resize the embedding matrix to len(tokenizer) so the newly added tokens have rows.
model.resize_token_embeddings(len(tokenizer))

Embedding(50259, 768)

In [None]:
# Sanity check: report the vocabulary size recorded in the model config after resizing.
print(f'Size of model vocabulary: {model.config.vocab_size}')

Size of model vocabulary: 50259


In [None]:
def process_data(dialogue, summary, tokenizer, max_length):
    """Encode one (dialogue, summary) pair as a single language-model example.

    The text is laid out as ``dialogue + tokenizer.eos_token + summary`` and
    encoded in one call. Truncation to ``max_length`` is applied to the
    concatenated text as a whole.

    Args:
        dialogue: Dialogue text.
        summary: Reference summary text.
        tokenizer: Tokenizer exposing ``eos_token`` and ``encode_plus``.
        max_length: Maximum number of tokens kept by the tokenizer.

    Returns:
        A tuple ``(input_ids, attention_mask, labels)`` of 1-D tensors, where
        ``labels`` is an independent copy of ``input_ids``.
    """
    joined_text = "".join([dialogue, tokenizer.eos_token, summary])
    encoding = tokenizer.encode_plus(
        joined_text,
        return_tensors='pt',
        truncation=True,
        max_length=max_length,
    )

    # Drop the leading batch dimension of size one that return_tensors='pt' adds.
    input_ids = encoding['input_ids'][0]
    attention_mask = encoding['attention_mask'][0]
    labels = encoding['input_ids'].clone()[0]
    return input_ids, attention_mask, labels

In [None]:
def train(model, optimizer, train_loader, device, tokenizer_pad_token_id):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Model whose forward accepts ``input_ids``, ``attention_mask`` and
            ``labels`` and returns an object with a ``loss`` attribute.
        optimizer: Optimizer updating ``model``'s parameters.
        train_loader: Sized iterable yielding ``(input_ids, attention_mask,
            labels)`` tensor triples.
        device: Device the batch tensors are moved to.
        tokenizer_pad_token_id: Label value that marks padding; those positions
            are excluded from the loss.

    Returns:
        The sum of the batch losses divided by ``len(train_loader)``.
    """
    model.train()
    total_train_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        # Hugging Face causal-LM heads such as GPT2LMHeadModel shift the labels
        # internally (logits at position i are scored against labels at i + 1).
        # Slicing inputs/labels here as well would shift twice and train the model
        # to predict two tokens ahead, so the tensors are passed position-aligned.
        # masked_fill returns a new tensor; the loader's batch is left untouched.
        labels = labels.masked_fill(labels == tokenizer_pad_token_id, -100)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_train_loss = total_train_loss / len(train_loader)
    return average_train_loss


In [None]:
def validate(model, val_loader, device, tokenizer_pad_token_id):
    """Evaluate the model on a validation loader and return the mean per-batch loss.

    Args:
        model: Model whose forward accepts ``input_ids``, ``attention_mask`` and
            ``labels`` and returns an object with a ``loss`` attribute.
        val_loader: Sized iterable yielding ``(input_ids, attention_mask,
            labels)`` tensor triples.
        device: Device the batch tensors are moved to.
        tokenizer_pad_token_id: Label value that marks padding; those positions
            are excluded from the loss.

    Returns:
        The sum of the batch losses divided by ``len(val_loader)``.
    """
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            # Hugging Face causal-LM heads such as GPT2LMHeadModel shift the labels
            # internally, so inputs and labels are passed position-aligned; slicing
            # them here too would score each position against the token two steps
            # ahead. masked_fill returns a new tensor and leaves the batch untouched.
            labels = labels.masked_fill(labels == tokenizer_pad_token_id, -100)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

    average_val_loss = total_val_loss / len(val_loader)
    return average_val_loss

In [None]:
def _build_lm_tensors(tokenizer, dialogues, summaries, max_length):
    """Encode (dialogue, summary) pairs as aligned causal-LM training tensors.

    Every example is laid out as ``dialogue <eos> summary <eos>`` (the
    ``dialogue + eos + summary`` layout of ``process_data`` plus a closing
    ``<eos>`` so the model can learn where a summary ends) and right-padded to
    ``max_length``. Labels are a copy of the inputs in which the dialogue, the
    first separator and the padding are set to -100, so only the summary and
    its closing ``<eos>`` contribute to the loss.

    Returns:
        ``(input_ids, attention_mask, labels)`` LongTensors, one row per example.
    """
    sep_id = tokenizer.eos_token_id
    pad_id = tokenizer.pad_token_id
    # Cap the summary at roughly half the window so the dialogue always keeps room.
    summary_budget = max(max_length // 2 - 2, 0)

    dialogue_ids = tokenizer(list(dialogues), add_special_tokens=False,
                             truncation=True, max_length=max_length)['input_ids']
    summary_ids = tokenizer(list(summaries), add_special_tokens=False,
                            truncation=True, max_length=max_length)['input_ids']

    input_rows, mask_rows, label_rows = [], [], []
    for prompt, target in zip(dialogue_ids, summary_ids):
        tail = [sep_id] + list(target)[:summary_budget] + [sep_id]
        # Truncate the dialogue, never the summary, when the pair is too long.
        prompt = list(prompt)[:max(max_length - len(tail), 0)]
        tokens = (prompt + tail)[:max_length]
        n_pad = max_length - len(tokens)

        scored_from = min(len(prompt) + 1, len(tokens))
        labels = [-100] * scored_from + tokens[scored_from:]

        input_rows.append(tokens + [pad_id] * n_pad)
        mask_rows.append([1] * len(tokens) + [0] * n_pad)
        label_rows.append(labels + [-100] * n_pad)

    return (torch.tensor(input_rows, dtype=torch.long),
            torch.tensor(mask_rows, dtype=torch.long),
            torch.tensor(label_rows, dtype=torch.long))


def fine_tune(model, tokenizer, dataset, max_length=256, num_epochs=5, batch_size=3, learning_rate=1e-5):
    """Fine-tune ``model`` for dialogue summarisation and return it.

    Args:
        model: Causal language model to fine-tune (moved to GPU when available).
        tokenizer: Tokenizer with ``eos_token_id`` and ``pad_token_id`` set.
        dataset: Mapping with 'train' and 'validation' splits, each exposing
            'dialogue' and 'summary' columns.
        max_length: Length every training sequence is truncated/padded to.
        num_epochs: Number of passes over the training split.
        batch_size: Batch size for both loaders.
        learning_rate: AdamW learning rate.

    Returns:
        The same ``model`` object after training.
    """
    train_split = dataset['train']
    val_split = dataset['validation']

    # Dialogue and summary must live in ONE sequence: a causal LM learns to
    # continue its own input, so separately padded input/label tensors would pair
    # unrelated positions with each other.
    train_tensors = _build_lm_tensors(tokenizer, train_split['dialogue'], train_split['summary'], max_length)
    val_tensors = _build_lm_tensors(tokenizer, val_split['dialogue'], val_split['summary'], max_length)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # TensorDataset is reached through torch.utils.data because the notebook's
    # import cell only imports DataLoader from that module.
    train_dataset = torch.utils.data.TensorDataset(*train_tensors)
    val_dataset = torch.utils.data.TensorDataset(*val_tensors)

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # transformers.AdamW is deprecated in favour of the PyTorch implementation;
    # weight_decay=0.0 keeps the default the transformers version used.
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.0)

    for epoch in range(num_epochs):
        average_train_loss = train(model, optimizer, train_loader, device, tokenizer.pad_token_id)
        average_val_loss = validate(model, val_loader, device, tokenizer.pad_token_id)

        print(f'Epoch {epoch + 1}/{num_epochs}')
        print(f'Average Training Loss: {average_train_loss}')
        print(f'Average Validation Loss: {average_val_loss}')

    return model


In [None]:
# Call the function to fine-tune the model
# fine_tune returns the model object it was given, so `fine_tuned_model` and
# `model` refer to the same (now trained) network.
fine_tuned_model = fine_tune(model, tokenizer, dataset, max_length=256)

Epoch 1/5
Average Training Loss: 6.480368547783294
Average Validation Loss: 5.735387786404117
Epoch 2/5
Average Training Loss: 5.637575908630595
Average Validation Loss: 5.450006142640725
Epoch 3/5
Average Training Loss: 5.3782674130245764
Average Validation Loss: 5.298047046521645
Epoch 4/5
Average Training Loss: 5.188557880518514
Average Validation Loss: 5.24362513695881
Epoch 5/5
Average Training Loss: 5.076605138456329
Average Validation Loss: 5.201459838357164


In [None]:
# Persist the weights AND the tokenizer: the tokenizer was extended with '</s>'
# and '[PAD]', and a later cell loads it from this same directory, which only
# works if its files were written here.
fine_tuned_model.save_pretrained('fine_tuned_gpt2')
tokenizer.save_pretrained('fine_tuned_gpt2')

In [None]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Reload the fine-tuned weights and tokenizer from the local 'fine_tuned_gpt2'
# directory. The tokenizer files must already exist in that directory.
fine_tuned_model = GPT2LMHeadModel.from_pretrained('fine_tuned_gpt2')
# Restate padding_side='left' so the reloaded tokenizer pads exactly like the one
# created at the top of the notebook, regardless of what the saved config holds.
tokenizer = GPT2Tokenizer.from_pretrained('fine_tuned_gpt2', padding_side='left')

In [None]:
import torch
import numpy as np
from nltk.translate.bleu_score import sentence_bleu
import bert_score
from datasets import load_metric
# Metric objects used as module-level globals by update_scores.
# NOTE(review): the captured cell output shows a warning pointing at the
# load_metric('rouge') line - check whether load_metric is deprecated in the
# installed `datasets` version.
rouge = load_metric('rouge')
bleurt_metric = load_metric('bleurt')

  rouge = load_metric('rouge')


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

In [None]:
def generate_summaries(model, input_ids, attention_mask, target_ids, tokenizer, max_new_tokens=128, num_beams=4):
    """Generate summaries for a batch of dialogues and decode the references.

    Args:
        model: Model exposing ``generate``.
        input_ids: ``(batch, prompt_len)`` dialogue token ids. Prompts are assumed
            to be left-padded (the notebook's tokenizer uses
            ``padding_side='left'``) so that the real text ends at the last column.
        attention_mask: Mask matching ``input_ids``.
        target_ids: ``(batch, target_len)`` reference summary token ids.
        tokenizer: Tokenizer providing ``pad_token_id``, ``eos_token_id`` and
            ``batch_decode``.
        max_new_tokens: Maximum number of tokens generated after the prompt. The
            default of 128 matches the previous budget (``max_length=256`` with
            the notebook's 128-token prompts).
        num_beams: Beam width for beam search.

    Returns:
        ``(pred_summaries, real_summaries)``: two lists of decoded strings.
    """
    sep_id = tokenizer.eos_token_id

    # Close the prompt with the separator so it follows the
    # ``dialogue + eos + summary`` layout defined by process_data.
    batch_size = input_ids.size(0)
    prompt_ids = torch.cat([input_ids, input_ids.new_full((batch_size, 1), sep_id)], dim=1)
    prompt_mask = torch.cat([attention_mask, attention_mask.new_ones((batch_size, 1))], dim=1)

    generated = model.generate(prompt_ids,
                               attention_mask=prompt_mask,
                               pad_token_id=tokenizer.pad_token_id,
                               # Stop at the tokenizer's EOS, which differs from the
                               # id stored in the pretrained GPT-2 config.
                               eos_token_id=sep_id,
                               # max_new_tokens budgets only the continuation;
                               # max_length would also count the prompt.
                               max_new_tokens=max_new_tokens,
                               num_beams=num_beams,
                               early_stopping=True)

    # Decoder-only models return prompt + continuation; keep only the continuation,
    # otherwise the whole dialogue would be scored as part of the "summary".
    continuation = generated[:, prompt_ids.size(1):]

    pred_summaries = tokenizer.batch_decode(continuation, skip_special_tokens=True)
    real_summaries = tokenizer.batch_decode(target_ids, skip_special_tokens=True)

    return pred_summaries, real_summaries

In [None]:
def calculate_scores(model, dataloader, tokenizer, device):
    """Generate summaries for every batch and return the averaged metric scores.

    Args:
        model: Model passed on to ``generate_summaries``.
        dataloader: Iterable yielding ``(input_ids, attention_mask, target_ids)``
            tensor triples.
        tokenizer: Tokenizer passed on to ``generate_summaries``.
        device: Device the model and every batch are moved to.

    Returns:
        Dict with the keys "bert", "bleurt", "bleu", "rouge1", "rouge2" and
        "rougeL" (each the mean of the per-batch values) plus "overall_rouge",
        the mean of the three ROUGE averages.
    """
    # The batches are moved to `device` below, so the model has to live there too;
    # a freshly reloaded model sits on the CPU and would otherwise fail on GPU.
    model.to(device)
    model.eval()

    total_scores = {
        "bert": 0,
        "bleurt": 0,
        "bleu": 0,
        "rouge1": 0,
        "rouge2": 0,
        "rougeL": 0
    }
    num_batches = 0

    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, target_ids = [b.to(device) for b in batch]
            pred_summaries, real_summaries = generate_summaries(model, input_ids, attention_mask, target_ids, tokenizer)

            # Accumulate this batch's metric values into the running totals.
            total_scores = update_scores(total_scores, pred_summaries, real_summaries)
            num_batches += 1

    # Turn the per-batch sums into per-batch means.
    average_scores = {metric: total / num_batches for metric, total in total_scores.items()}

    average_scores["overall_rouge"] = np.mean([average_scores["rouge1"], average_scores["rouge2"], average_scores["rougeL"]])

    return average_scores



In [None]:
def update_scores(total_scores, pred_summaries, real_summaries):
    """Add one batch's metric values to the running totals.

    Args:
        total_scores: Dict with the keys "rouge1", "rouge2", "rougeL", "bert",
            "bleurt" and "bleu"; updated in place.
        pred_summaries: List of generated summary strings.
        real_summaries: List of reference summary strings, aligned with
            ``pred_summaries``.

    Returns:
        The same ``total_scores`` dict after the update.
    """
    # ROUGE: with the default aggregation the metric returns ONE dict mapping each
    # ROUGE type to an aggregate (low/mid/high) for the whole batch, so it is
    # indexed by name rather than iterated.
    rouge_scores = rouge.compute(predictions=pred_summaries, references=real_summaries)
    for rouge_type in ("rouge1", "rouge2", "rougeL"):
        total_scores[rouge_type] += rouge_scores[rouge_type].mid.fmeasure

    # BERTScore: the third returned tensor holds the per-example F1 values.
    _, _, bert_f1 = bert_score.score(pred_summaries, real_summaries, lang="en", model_type='bert-base-uncased', rescale_with_baseline=True)
    total_scores["bert"] += torch.mean(bert_f1).item()

    # BLEURT Scores
    bleurt_scores = bleurt_metric.compute(predictions=pred_summaries, references=real_summaries)['scores']
    total_scores["bleurt"] += np.mean(bleurt_scores)

    # BLEU (unigram): sentence_bleu expects token lists. Passing raw strings makes
    # it iterate over characters and report a character-level score instead.
    bleu_scores = [
        sentence_bleu([ref.split()], pred.split(), weights=(1.0, 0, 0, 0))
        for ref, pred in zip(real_summaries, pred_summaries)
    ]
    total_scores["bleu"] += np.mean(bleu_scores)

    return total_scores


In [None]:
# Tokenise the test split: dialogues become 128-token prompts and reference
# summaries become 64-token targets, both padded to a fixed length.
test_dataset = dataset['test']
tokenized_inputs_test = tokenizer(test_dataset['dialogue'], truncation=True, padding='max_length', max_length=128, return_tensors='pt')
tokenized_targets_test = tokenizer(test_dataset['summary'], truncation=True, padding='max_length', max_length=64, return_tensors='pt')
test_dataset = torch.utils.data.TensorDataset(tokenized_inputs_test['input_ids'], tokenized_inputs_test['attention_mask'], tokenized_targets_test['input_ids'])
# Evaluation gains nothing from shuffling; a fixed order keeps the per-batch
# metric computation reproducible from run to run.
test_loader = DataLoader(test_dataset, batch_size=3, shuffle=False)

In [None]:
# Calculate scores on the test set
# Use the GPU when one is available; calculate_scores moves every batch to this device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scores = calculate_scores(fine_tuned_model, test_loader, tokenizer, device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Report every averaged metric, one per line, in a single write.
report_lines = [
    f'ROUGE-1 Score: {scores["rouge1"]}',
    f'ROUGE-2 Score: {scores["rouge2"]}',
    f'ROUGE-L Score: {scores["rougeL"]}',
    f'Overall ROUGE Score: {scores["overall_rouge"]}',
    f'BERTScore: {scores["bert"]}',
    f'BLEURT Score: {scores["bleurt"]}',
    f'BLEU Score: {scores["bleu"]}',
]
print('\n'.join(report_lines))


ROUGE-1 Score: 0.1563830560597174
ROUGE-2 Score: 0.04906724213410677
ROUGE-L Score: 0.11723127430619057
Overall ROUGE Score: 0.10756052416667157
BERTScore: 0.1321133449920982
BLEURT Score: -1.2442681577252235
BLEU Score: 0.13287259176879135
