In [None]:
# Mount Google Drive into the Colab VM and switch the working directory to the
# project folder, so relative paths used later resolve inside that folder.
from google.colab import drive
drive.mount('/content/drive')
%cd '/content/drive/My Drive/project_PR'

Mounted at /content/drive
/content/drive/My Drive/project_PR


In [None]:
!pip install datasets transformers py7zr sentencepiece rouge-score bert-score nltk

!pip install git+https://github.com/google-research/bleurt.git

!wget https://storage.googleapis.com/bleurt-oss/bleurt-base-128.zip
!unzip bleurt-base-128.zip

Collecting datasets
  Downloading datasets-2.14.1-py3-none-any.whl (492 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/492.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.7/492.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.4/492.4 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.3.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-

In [None]:
import torch
import numpy as np
from transformers import AdamW
from datasets import load_dataset
from torch.utils.data import DataLoader
from rouge_score import rouge_scorer
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from datasets import load_metric

In [None]:
# Load the SAMSum dataset; it provides train / test / validation splits, each
# with 'id', 'dialogue' and 'summary' columns.
dataset = load_dataset('samsum')

Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

In [None]:
# Display the available splits, their columns and their row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [None]:
from transformers import ProphetNetTokenizer, ProphetNetForConditionalGeneration

# Pretrained checkpoint used as the starting point for fine-tuning.
model_name = 'microsoft/prophetnet-large-uncased'
# Tokenizer and model are both loaded from the same checkpoint name.
tokenizer = ProphetNetTokenizer.from_pretrained(model_name)
model = ProphetNetForConditionalGeneration.from_pretrained(model_name)

Downloading (…)prophetnet.tokenizer:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/90.0 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]



Downloading model.safetensors:   0%|          | 0.00/1.57G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [None]:
def train(model, optimizer, train_loader, device, pad_token_id):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: Encoder-decoder model. It is called with ``input_ids``,
            ``attention_mask`` and ``labels`` keyword arguments and must
            return an object exposing a ``.loss`` tensor.
        optimizer: Optimizer providing ``zero_grad()`` and ``step()``.
        train_loader: Iterable of ``(input_ids, attention_mask, labels)``
            tensor batches that also supports ``len()``.
        device: Device every batch tensor is moved to.
        pad_token_id: Token id marking padding positions in ``labels``.

    Returns:
        float: Sum of the batch losses divided by ``len(train_loader)``.
    """
    model.train()
    total_train_loss = 0

    for batch in train_loader:
        input_ids, attention_mask, labels = [b.to(device) for b in batch]

        # Source and target are passed through unsliced.
        # ProphetNetForConditionalGeneration builds its decoder inputs by
        # shifting ``labels`` right internally, so a causal-LM style shift
        # here would drop the first target token from the loss and the last
        # position from the encoder input.
        # Padding positions are excluded from the loss with -100;
        # masked_fill returns a new tensor, so the batch is not mutated.
        labels = labels.masked_fill(labels == pad_token_id, -100)

        optimizer.zero_grad()

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_train_loss += loss.item()

        loss.backward()
        optimizer.step()

    average_train_loss = total_train_loss / len(train_loader)
    return average_train_loss

In [None]:
def validate(model, val_loader, device, pad_token_id):
    """Evaluate the model on a validation loader and return the mean batch loss.

    Args:
        model: Encoder-decoder model. It is called with ``input_ids``,
            ``attention_mask`` and ``labels`` keyword arguments and must
            return an object exposing a ``.loss`` tensor.
        val_loader: Iterable of ``(input_ids, attention_mask, labels)``
            tensor batches that also supports ``len()``.
        device: Device every batch tensor is moved to.
        pad_token_id: Token id marking padding positions in ``labels``.

    Returns:
        float: Sum of the batch losses divided by ``len(val_loader)``.
    """
    model.eval()
    total_val_loss = 0

    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = [b.to(device) for b in batch]

            # Source and target are passed through unsliced.
            # ProphetNetForConditionalGeneration shifts ``labels`` right
            # internally to build its decoder inputs, so slicing here would
            # drop the first target token from the loss and the last
            # position from the encoder input.
            # Padding positions are excluded from the loss with -100;
            # masked_fill returns a new tensor, so the batch is not mutated.
            labels = labels.masked_fill(labels == pad_token_id, -100)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_val_loss += loss.item()

    average_val_loss = total_val_loss / len(val_loader)
    return average_val_loss

In [None]:
def fine_tune(model, tokenizer, dataset, max_length=256, num_epochs=5, batch_size=3, learning_rate=1e-5):
    """Fine-tune ``model`` on the 'train' split and report loss on 'validation'.

    Both the dialogues and the summaries are tokenized with truncation and
    padding to ``max_length``. After every epoch the average training and
    validation losses are printed.

    Args:
        model: Model to fine-tune; it is moved to CUDA when available,
            otherwise to the CPU.
        tokenizer: Tokenizer callable on a list of strings; its
            ``pad_token_id`` marks the label positions ignored by the loss.
        dataset: Mapping with 'train' and 'validation' splits, each exposing
            'dialogue' and 'summary' columns.
        max_length: Length every source and target sequence is padded or
            truncated to.
        num_epochs: Number of passes over the training split.
        batch_size: Batch size of the training and validation loaders.
        learning_rate: Learning rate handed to AdamW.

    Returns:
        The ``model`` object that was passed in, after training.
    """
    # Imported locally: the notebook's import cell brings in DataLoader but
    # not TensorDataset, so the bare name would otherwise be undefined.
    from torch.utils.data import TensorDataset

    def _encode(texts):
        # Same tokenizer settings for sources and targets of both splits.
        return tokenizer(texts, truncation=True, padding='max_length', max_length=max_length, return_tensors='pt')

    def _build_tensor_dataset(split):
        # One (input_ids, attention_mask, target_ids) triple per example.
        sources = _encode(split['dialogue'])
        targets = _encode(split['summary'])
        return TensorDataset(sources['input_ids'], sources['attention_mask'], targets['input_ids'])

    train_dataset = _build_tensor_dataset(dataset['train'])
    val_dataset = _build_tensor_dataset(dataset['validation'])

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Only the training data is shuffled; validation keeps a fixed order.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    optimizer = AdamW(model.parameters(), lr=learning_rate)

    for epoch in range(num_epochs):
        train_loss = train(model, optimizer, train_loader, device, tokenizer.pad_token_id)
        val_loss = validate(model, val_loader, device, tokenizer.pad_token_id)

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Average Training Loss: {train_loss:.4f}')
        print(f'Average Validation Loss: {val_loss:.4f}')

    return model

In [None]:
# Fine-tune the model. fine_tune() returns only the trained model, so the
# result is bound to a single name; unpacking it into two names would raise.
fine_tuned_model = fine_tune(model, tokenizer, dataset)

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch 1/5
Average Training Loss: 2.5118642381169214
Average Validation Loss: 2.124574656888242
Epoch 2/5
Average Training Loss: 2.094359465157323
Average Validation Loss: 2.0236440290024866
Epoch 3/5
Average Training Loss: 1.8156157259639134
Average Validation Loss: 1.9793188593763136
Epoch 4/5
Average Training Loss: 1.5879341540074936
Average Validation Loss: 2.000244283195817
Epoch 5/5
Average Training Loss: 1.4029129356073038
Average Validation Loss: 2.0391941422071214


In [None]:
# Save the fine-tuned model under the relative directory 'fine_tuned_prophetnet'.
fine_tuned_model.save_pretrained('fine_tuned_prophetnet')

In [None]:
from datasets import load_metric
# ROUGE metric object; update_scores() reads it as the global `rouge`.
rouge = load_metric('rouge')
# NOTE(review): train_test_split is not referenced anywhere in the visible notebook.
from sklearn.model_selection import train_test_split
# BLEURT metric object; update_scores() reads it as the global `bleurt_metric`.
bleurt_metric = load_metric('bleurt')

  rouge = load_metric('rouge')


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/1.97k [00:00<?, ?B/s]



Downloading data:   0%|          | 0.00/405M [00:00<?, ?B/s]

In [None]:
# NOTE(review): `rouge` and `bleurt_metric` are loaded here a second time with
# the same arguments as in the previous cell.
from datasets import load_metric
rouge = load_metric('rouge')
bleurt_metric = load_metric('bleurt')
from nltk.translate.bleu_score import sentence_bleu
# Rebinds the name `bert_score` from the function imported at the top of the
# notebook (`from bert_score import score as bert_score`) to the package
# itself, so `bert_score.score(...)` resolves once this cell has run.
import bert_score



In [None]:
def generate_summaries_without_attention_mask(model, input_ids, target_ids, tokenizer,
                                              attention_mask=None, max_length=128, num_beams=4):
    """Generate summaries with beam search and decode them with the references.

    By default no attention mask is handed to ``model.generate``, exactly as
    the function name says. Callers that do have a mask can pass it through
    the optional ``attention_mask`` argument.

    Args:
        model: Model exposing ``generate``.
        input_ids: Batch of source token ids passed to ``model.generate``.
        target_ids: Batch of reference token ids; only decoded, never fed to
            the model.
        tokenizer: Tokenizer exposing ``batch_decode``.
        attention_mask: Optional mask forwarded to ``model.generate``; omitted
            from the call when ``None``.
        max_length: ``max_length`` forwarded to ``model.generate``.
        num_beams: ``num_beams`` forwarded to ``model.generate``.

    Returns:
        tuple: ``(pred_summaries, real_summaries)`` as returned by
        ``tokenizer.batch_decode`` with special tokens skipped.
    """
    generate_kwargs = {"max_length": max_length, "num_beams": num_beams, "early_stopping": True}
    # Only add the mask when one was supplied, so the default call is
    # identical to generating without an attention mask.
    if attention_mask is not None:
        generate_kwargs["attention_mask"] = attention_mask

    summary_ids = model.generate(input_ids, **generate_kwargs)
    pred_summaries = tokenizer.batch_decode(summary_ids, skip_special_tokens=True)
    real_summaries = tokenizer.batch_decode(target_ids, skip_special_tokens=True)

    return pred_summaries, real_summaries

In [None]:
def update_scores(total_scores, pred_summaries, real_summaries):
    """Add one batch's metric values to the running totals.

    Uses the module-level ``rouge`` and ``bleurt_metric`` metric objects.

    Args:
        total_scores: Dict with the keys "rouge1", "rouge2", "rougeL", "bert",
            "bleurt" and "bleu"; it is updated in place.
        pred_summaries: List of generated summary strings.
        real_summaries: List of reference summary strings, parallel to
            ``pred_summaries``.

    Returns:
        dict: The same ``total_scores`` object, after the update.
    """
    # Imported locally under its own name: the notebook binds `bert_score` to
    # the scoring function in one cell and to the package in another, so
    # relying on the global would depend on which cell ran last.
    from bert_score import score as compute_bert_score

    # ROUGE: compute() returns a dict keyed by ROUGE variant whose values are
    # aggregate scores, so each variant is looked up by key. Iterating the
    # dict directly would yield only the key strings.
    rouge_scores = rouge.compute(predictions=pred_summaries, references=real_summaries)
    for variant in ("rouge1", "rouge2", "rougeL"):
        total_scores[variant] += rouge_scores[variant].mid.fmeasure

    # BERTScore: the third returned tensor holds the per-pair F1 values.
    _, _, bert_f1 = compute_bert_score(pred_summaries, real_summaries, lang="en", model_type='bert-base-uncased', rescale_with_baseline=True)
    total_scores["bert"] += torch.mean(bert_f1).item()

    # BLEURT Scores
    bleurt_scores = bleurt_metric.compute(predictions=pred_summaries, references=real_summaries)['scores']
    total_scores["bleurt"] += np.mean(bleurt_scores)

    # BLEU (unigram weights only): sentence_bleu expects token lists. Passing
    # raw strings would make it compare individual characters, so both sides
    # are split on whitespace first.
    bleu_scores = [
        sentence_bleu([ref.split()], pred.split(), weights=(1.0, 0, 0, 0))
        for ref, pred in zip(real_summaries, pred_summaries)
    ]
    total_scores["bleu"] += np.mean(bleu_scores)

    return total_scores

In [None]:
def calculate_scores(model, dataloader, tokenizer, device):
    """Average every evaluation metric over all batches of ``dataloader``.

    For each batch, summaries are generated from the source ids and scored
    against the decoded references; the per-batch values are summed and then
    divided by the number of batches.

    Args:
        model: Model used for generation; switched to eval mode.
        dataloader: Iterable of ``(input_ids, attention_mask, target_ids)``
            tensor batches. The attention mask is unpacked but not used.
        tokenizer: Tokenizer used to decode predictions and references.
        device: Device every batch tensor is moved to.

    Returns:
        dict: Per-metric averages plus "overall_rouge", the mean of the
        averaged "rouge1", "rouge2" and "rougeL" values.
    """
    metric_names = ("bert", "bleurt", "bleu", "rouge1", "rouge2", "rougeL")
    running_totals = dict.fromkeys(metric_names, 0)
    batches_seen = 0

    model.eval()

    with torch.no_grad():
        for batch in dataloader:
            source_ids, _unused_mask, reference_ids = (tensor.to(device) for tensor in batch)
            predicted, expected = generate_summaries_without_attention_mask(
                model, source_ids, reference_ids, tokenizer
            )

            # Fold this batch's metric values into the running sums.
            running_totals = update_scores(running_totals, predicted, expected)
            batches_seen += 1

    # Turn the sums into per-batch means.
    averages = {}
    for name, summed in running_totals.items():
        averages[name] = summed / batches_seen

    rouge_parts = [averages[key] for key in ("rouge1", "rouge2", "rougeL")]
    averages["overall_rouge"] = np.mean(rouge_parts)

    return averages

In [None]:
# Calculate scores on the test set
test_dataset = dataset['test']
# Dialogues are padded/truncated to 128 tokens and reference summaries to 64.
# NOTE(review): fine_tune() defaults to max_length=256 for both - confirm the
# shorter evaluation lengths are intentional.
tokenized_inputs_test = tokenizer(test_dataset['dialogue'], truncation=True, padding='max_length', max_length=128, return_tensors='pt')
tokenized_targets_test = tokenizer(test_dataset['summary'], truncation=True, padding='max_length', max_length=64, return_tensors='pt')
test_dataset = torch.utils.data.TensorDataset(tokenized_inputs_test['input_ids'], tokenized_inputs_test['attention_mask'], tokenized_targets_test['input_ids'])
# Evaluation data is read in a fixed order (no shuffling), matching the
# validation loader built in fine_tune().
test_loader = DataLoader(test_dataset, batch_size=3, shuffle=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
scores = calculate_scores(fine_tuned_model, test_loader, tokenizer, device)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# Print each averaged metric from `scores` on its own line. "Overall ROUGE"
# is the mean of the ROUGE-1, ROUGE-2 and ROUGE-L averages.
print(f'ROUGE-1 Score: {scores["rouge1"]}')
print(f'ROUGE-2 Score: {scores["rouge2"]}')
print(f'ROUGE-L Score: {scores["rougeL"]}')
print(f'Overall ROUGE Score: {scores["overall_rouge"]}')
print(f'BERTScore: {scores["bert"]}')
print(f'BLEURT Score: {scores["bleurt"]}')
print(f'BLEU Score: {scores["bleu"]}')


ROUGE-1 Score: 0.4437227790826602
ROUGE-2 Score: 0.2103031329661813
ROUGE-L Score: 0.3469829986034725
Overall ROUGE Score: 0.3336696368841047
BERTScore: 0.49538950627540057
BLEURT Score: -0.41521845807085983
BLEU Score: 0.5948179293693816


In [None]:
def summarize_text(text, model, tokenizer, device, max_input_length=512, max_summary_length=128, num_beams=4):
    """Summarize a single text with beam search.

    Args:
        text: Input text to summarize.
        model: Model exposing ``generate``.
        tokenizer: Tokenizer exposing ``encode`` and ``decode``.
        device: Device the encoded input is moved to before generation.
        max_input_length: Truncation length used when encoding ``text``.
        max_summary_length: ``max_length`` forwarded to ``model.generate``.
        num_beams: ``num_beams`` forwarded to ``model.generate``.

    Returns:
        The decoded summary, with special tokens skipped.
    """
    # Encode the text as a single-row batch and move it to the target device.
    input_ids = tokenizer.encode(text, truncation=True, max_length=max_input_length, return_tensors='pt')
    input_ids = input_ids.to(device)

    # Generate the summary token ids.
    summary_ids = model.generate(input_ids, max_length=max_summary_length, num_beams=num_beams, early_stopping=True)

    # Exactly one sequence was generated; take row 0 instead of squeeze() so
    # the ids remain a 1-D sequence even if only a single token was produced.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [None]:
# Take the first example of the test split and keep its dialogue as the text
# to summarize.
sample = dataset['test'][0]
input_text = sample['dialogue']

In [None]:

# Use CUDA when available, otherwise the CPU, then summarize the selected
# dialogue with the fine-tuned model.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
summary = summarize_text(input_text, fine_tuned_model, tokenizer,device)


In [None]:
  # Show the summary generated for the selected sample.
  print("Summary:", summary)

Summary: hannah is looking for betty's number, because larry called her last time they were at the park together. amanda doesn't know him very well, so she will text him.


In [None]:
pip install rouge_score



In [None]:
from datasets import load_metric
# ROUGE metric instance used to score the single sample summarized above.
rouge_metric = load_metric("rouge")
# compute() is given parallel lists, so the one reference / prediction pair is
# wrapped in single-element lists.
references = [sample['summary']]
predictions = [summary]
rouge_scores = rouge_metric.compute(predictions=predictions, references=references)

In [None]:
# Print the ROUGE result for the single sample: a dict keyed by ROUGE variant
# whose values are aggregate (low / mid / high) scores.
print("ROUGE scores:", rouge_scores)

ROUGE scores: {'rouge1': AggregateScore(low=Score(precision=0.2903225806451613, recall=0.5625, fmeasure=0.3829787234042554), mid=Score(precision=0.2903225806451613, recall=0.5625, fmeasure=0.3829787234042554), high=Score(precision=0.2903225806451613, recall=0.5625, fmeasure=0.3829787234042554)), 'rouge2': AggregateScore(low=Score(precision=0.13333333333333333, recall=0.26666666666666666, fmeasure=0.17777777777777776), mid=Score(precision=0.13333333333333333, recall=0.26666666666666666, fmeasure=0.17777777777777776), high=Score(precision=0.13333333333333333, recall=0.26666666666666666, fmeasure=0.17777777777777776)), 'rougeL': AggregateScore(low=Score(precision=0.25806451612903225, recall=0.5, fmeasure=0.3404255319148936), mid=Score(precision=0.25806451612903225, recall=0.5, fmeasure=0.3404255319148936), high=Score(precision=0.25806451612903225, recall=0.5, fmeasure=0.3404255319148936)), 'rougeLsum': AggregateScore(low=Score(precision=0.25806451612903225, recall=0.5, fmeasure=0.34042553