In [2]:
import time
import pandas as pd


import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader


from transformers import BartTokenizer, BartForConditionalGeneration

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = 'cuda' if cuda.is_available() else 'cpu'
print(f"Using device: {device}")

Using device: cpu


In [4]:

torch.backends.cudnn.deterministic = True


new_repo = "text_summarizer"
repo_name = "EducativeCS2023/bart-base-summarization"

In [5]:

df = pd.read_csv('BBCarticles.csv', encoding='latin-1')


df = df[['Text', 'Summary']]
df.Text = 'summarize: ' + df.Text

split_ratio = 0.025

train_dataset = df.sample(frac=split_ratio).reset_index(drop=True)

eval_dataset = df.drop(train_dataset.index).sample(frac=split_ratio).reset_index(drop=True)


print("Training Dataset Size:", train_dataset.shape)
print("Evaluation Dataset Size:", eval_dataset.shape)


df.head(3)

Training Dataset Size: (56, 2)
Evaluation Dataset Size: (54, 2)


Unnamed: 0,Text,Summary
0,summarize: Ad sales boost Time Warner profit\n...,TimeWarner said fourth quarter sales rose 2% t...
1,summarize: Dollar gains on Greenspan speech\n\...,The dollar has hit its highest level against t...
2,summarize: Yukos unit buyer faces loan claim\n...,Yukos' owner Menatep Group says it will ask Ro...


In [6]:
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.Summary = self.data.Summary
        self.Text = self.data.Text

    def __len__(self):
        return len(self.Summary)

    def __getitem__(self, index):
        Text = str(self.Text[index])
        Text = ' '.join(Text.split())

        Summary = str(self.Summary[index])
        Summary = ' '.join(Summary.split())
        source_encoded = self.tokenizer(Text, max_length=self.source_len, padding='max_length', truncation=True, return_tensors='pt')
        target_encoded = self.tokenizer(Summary, max_length=self.summ_len, padding='max_length', truncation=True, return_tensors='pt')

        source_ids = source_encoded['input_ids'].squeeze()
        source_mask = source_encoded['attention_mask'].squeeze()
        target_ids = target_encoded['input_ids'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long)
        }

In [7]:

tokenizer = BartTokenizer.from_pretrained(repo_name)


training_set = CustomDataset(train_dataset, tokenizer, 512, 150)
eval_set = CustomDataset(eval_dataset, tokenizer, 512, 150)


training_loader = DataLoader(
    training_set,
    batch_size=2,
    shuffle=True,
    num_workers=0
)
eval_loader = DataLoader(
    eval_set,
    batch_size=2,
    shuffle=False,
    num_workers=0
)

In [8]:

model = BartForConditionalGeneration.from_pretrained(repo_name)


model = model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)


In [9]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    model.train()
    for batch_index, data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype=torch.long)
        y_ids = y[:, :-1].contiguous()
        labels = y[:, 1:].clone().detach()
        labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype=torch.long)
        mask = data['source_mask'].to(device, dtype=torch.long)
        outputs = model(input_ids=ids, attention_mask=mask, decoder_input_ids=y_ids, labels=labels)
        loss = outputs[0]

        if batch_index % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [10]:

for epoch in range(2):
    print(f"Training epoch: {epoch+1}/{2}")
    train(epoch, tokenizer, model, device, training_loader, optimizer)

Training epoch: 1/2
Epoch: 0, Loss:  0.12322605401277542
Training epoch: 2/2
Epoch: 1, Loss:  0.2622039318084717


In [11]:
model.save_pretrained("./model")
tokenizer.save_pretrained("./model")



('./model\\tokenizer_config.json',
 './model\\special_tokens_map.json',
 './model\\vocab.json',
 './model\\merges.txt',
 './model\\added_tokens.json')

In [12]:
def predict(tokenizer, model, device, loader):
    model.eval()
    predictions = []
    actuals = []
    with torch.no_grad():
        for batch_index, data in enumerate(loader, 0):
            ids = data['source_ids'].to(device, dtype=torch.long)
            mask = data['source_mask'].to(device, dtype=torch.long)
            y = data['target_ids'].to(device, dtype=torch.long)
            generated_ids = model.generate(
                input_ids=ids,
                attention_mask=mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True
            )
            preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in generated_ids]
            target = [tokenizer.decode(t, skip_special_tokens=True, clean_up_tokenization_spaces=True) for t in y]

            if batch_index % 100 == 0:
                print(f'Completed {batch_index} batches')

            predictions.extend(preds)
            actuals.extend(target)

    return predictions, actuals

In [16]:

start_time = time.time()

model = BartForConditionalGeneration.from_pretrained('./model')
tokenizer = BartTokenizer.from_pretrained('./model')


predictions, actuals = predict(tokenizer, model, device, eval_loader)

results = pd.DataFrame({'predictions': predictions, 'actuals': actuals})

results.to_csv('results.csv')

end_time = time.time()
time_taken = end_time - start_time
print(f"Time taken for predictions: {time_taken:.2f} seconds")

results.head()

Completed 0 batches
Time taken for predictions: 503.16 seconds


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Unnamed: 0,predictions,actuals
0,said he was determined his current veto on wh...,"""As Lord Woolf also acknowledged, it is highly..."
1,"fight continued outside, where ""one of the of...","The fight continued outside, where ""one of the..."
2,when Tony Blackburn won the first show in 200...,"Pasquale follows Kerry McFadden, Phil Tufnell ..."
3,ia Mirza continued her remarkable rise with vi...,"""It was such a tough first-round match and I a..."
4,"Valentine, who was born in the United States ...","Mr Valentine, who was born in the United State..."


In [17]:
import evaluate

rouge_score = evaluate.load("rouge")


scores = rouge_score.compute(
    predictions=results['predictions'],
    references=results['actuals']
)


rouge_scores_df = pd.DataFrame([scores]).transpose()

rouge_scores_df.head()

Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<00:00, 2.37MB/s]


Unnamed: 0,0
rouge1,0.722372
rouge2,0.642142
rougeL,0.555983
rougeLsum,0.557192
