In [1]:
import pandas as pd
import numpy as np
import torch 
import torch.nn as nn
import torch.utils.data as data
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
!pip install evaluate
!pip install rouge-score
import evaluate
from nltk.translate.bleu_score import corpus_bleu
import glob

Collecting evaluate
  Obtaining dependency information for evaluate from https://files.pythonhosted.org/packages/70/63/7644a1eb7b0297e585a6adec98ed9e575309bb973c33b394dae66bc35c69/evaluate-0.4.1-py3-none-any.whl.metadata
  Downloading evaluate-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.1
Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=239d223c0e67a03a2d7c39828036d7862f83bfedd0eb18f229d044ccdee644e1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be



In [2]:
texts = []
for path in glob.glob('/kaggle/input/bbc-news-summary/BBC News Summary/News Articles/*/*', recursive=True):
    with open(path, mode='r', encoding='ISO-8859-1') as file:
        text = file.read()
        file.close()
        texts.append(text)

summaries = []
for path in glob.glob('/kaggle/input/bbc-news-summary/BBC News Summary/Summaries/*/*', recursive=True):
    with open(path, mode='r', encoding='ISO-8859-1') as file:
        summary = file.read()
        file.close()
        summaries.append(summary)

In [3]:
df = pd.DataFrame(data={'text': texts, 'summary': summaries})
df = df.sample(frac=1)
n = df.shape[0]
df_train = df.iloc[0: int(n * 0.9)]
df_test = df.iloc[int(n * 0.9): ]
x_train = df_train['text'].to_list()
y_train = df_train['summary'].to_list()
x_test = df_test['text'].to_list()
y_test = df_test['summary'].to_list()

Unnamed: 0,text,summary
33,Blair backs 'pre-election budget'\n\nTony Blai...,Mr Blair praised his chancellor for his role i...
2031,Yukos unit buyer faces loan claim\n\nThe owner...,Yukos' owner Menatep Group says it will ask Ro...
252,UK helps raped Rwandan women\n\nBritain is to ...,The plight of the infected women was overshado...
1668,Stars shine on Bafta red carpet\n\nHollywood s...,"Keanu Reeves, who presented the best actress a..."
2152,Israeli economy picking up pace\n\nIsrael's ec...,The main driver of the faster-than-expected ex...
...,...,...
1687,Levy takes Whitbread novel prize\n\nOrange Pri...,Orange Prize winner Andrea Levy has seen her b...
818,Celtic unhappy over Bulgaria date\n\nMartin O'...,"""When we were out playing Barcelona, I spoke w..."
1509,No jail for singer Courtney Love\n\nSinger Cou...,In a separate case relating to the same incide...
481,England coach faces rap after row\n\nEngland c...,"Robinson had said he was ""livid"" about Kaplan'..."


In [4]:
class MyDataset(data.Dataset):
    def __init__(self, text, summary, tokenizer):
        super().__init__()
        self.x = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt")
        self.y = tokenizer(summary, padding=True, truncation=True, max_length=512, return_tensors="pt")
        self.text = text
        self.summary = summary
            
    def __getitem__(self, index):
        text_dict = {k: v[index] for k, v in self.x.items()}
        summary_dict = {k: v[index] for k, v in self.y.items()}
        return (text_dict, summary_dict, self.text[index], self.summary[index])

    def __len__(self):
        return self.x['input_ids'].shape[0]

In [5]:
batch_size = 4
learning_rate = 1e-4
num_epochs = 5
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
model = AutoModelForSeq2SeqLM.from_pretrained('facebook/bart-base')    
model = model.to(device)
optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=0.1)
rouge = evaluate.load('rouge')
train_dataset = MyDataset(x_train, y_train, tokenizer)
train_loader = data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)
test_dataset = MyDataset(x_test, y_test, tokenizer)
test_loader = data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=2, pin_memory=True)

Downloading config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
for epoch in range(1, num_epochs + 1):  
    model.train()
    print('epoch:', epoch)
    train_epoch_loss, train_n = 0, 0
    for text_dict, summary_dict, text, summary in train_loader:
        text_dict = {k:v.long().to(device) for k,v in text_dict.items()}
        summary_dict = {k:v.long().to(device) for k,v in summary_dict.items()}
        loss = model(**text_dict, labels=summary_dict['input_ids']).loss
        optimizer.zero_grad()
        loss.backward()
        optimizer.step() 
        
    model.eval() 
    with torch.no_grad():
        for mode, loader in zip(['train', 'test'], [train_loader, test_loader]):
            epoch_loss, epoch_bleu_score, epoch_rouge1_score, epoch_rouge2_score, epoch_rougeL_score, num_samples = 0, 0, 0, 0, 0, 0
            for text_dict, summary_dict, text, summary in loader:
                text_dict = {k: v.long().to(device) for k,v in text_dict.items()}
                summary_dict = {k: v.long().to(device) for k,v in summary_dict.items()}
                loss = model(**text_dict, labels=summary_dict['input_ids']).loss
                outputs = model.generate(**text_dict, max_length=50)
                predicted_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
                bleu_score = corpus_bleu(summary, predicted_text)
                rouge_score = rouge.compute(predictions=predicted_text, references=summary)
                epoch_bleu_score += bleu_score * len(summary)
                epoch_rouge1_score += rouge_score['rouge1'] * len(summary) 
                epoch_rouge2_score += rouge_score['rouge2'] * len(summary)
                epoch_rougeL_score += rouge_score['rougeL'] * len(summary)
                epoch_loss += loss.item() * len(summary)
                num_samples += len(summary)

            epoch_loss = epoch_loss / num_samples
            epoch_bleu_score = epoch_bleu_score / num_samples
            epoch_rouge1_score = epoch_rouge1_score / num_samples
            epoch_rouge2_score = epoch_rouge2_score / num_samples
            epoch_rougeL_score = epoch_rougeL_score / num_samples
            print(mode, '- loss:', f'{epoch_loss:.2}')    
            print('bleu score:', f'{epoch_bleu_score:.4}') 
            print('rouge1 score:', f'{epoch_rouge1_score:.4}') 
            print('rouge2 score:', f'{epoch_rouge2_score:.4}') 
            print('rougeL score:', f'{epoch_rougeL_score:.4}') 

epoch: 1


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

train - loss: 0.22
bleu score: 0.6231
rouge1 score: 0.3733
rouge2 score: 0.338
rougeL score: 0.3301


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


test - loss: 0.23
bleu score: 0.6266
rouge1 score: 0.3693
rouge2 score: 0.3228
rougeL score: 0.3158
epoch: 2


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

train - loss: 0.19
bleu score: 0.6245
rouge1 score: 0.3935
rouge2 score: 0.3685
rougeL score: 0.3519


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


test - loss: 0.23
bleu score: 0.6254
rouge1 score: 0.3824
rouge2 score: 0.3423
rougeL score: 0.3319
epoch: 3


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

train - loss: 0.15
bleu score: 0.6243
rouge1 score: 0.4023
rouge2 score: 0.3816
rougeL score: 0.3718


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


test - loss: 0.23
bleu score: 0.6262
rouge1 score: 0.379
rouge2 score: 0.3395
rougeL score: 0.3326
epoch: 4


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

train - loss: 0.12
bleu score: 0.6233
rouge1 score: 0.4075
rouge2 score: 0.3901
rougeL score: 0.3818


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


test - loss: 0.24
bleu score: 0.6261
rouge1 score: 0.392
rouge2 score: 0.36
rougeL score: 0.3503
epoch: 5


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

train - loss: 0.094
bleu score: 0.6227
rouge1 score: 0.409
rouge2 score: 0.3913
rougeL score: 0.3828


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


test - loss: 0.25
bleu score: 0.626
rouge1 score: 0.3952
rouge2 score: 0.3612
rougeL score: 0.3496
epoch: 6


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
