In [13]:
import time
import numpy as np
import pandas as pd


import torch
import torch.nn.functional as F
from torch import cuda
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler


from transformers import BartTokenizer, BartForConditionalGeneration

from huggingface_hub import interpreter_login

from transformers import __version__ as transformers_version

In [14]:
# Authenticate against the Hugging Face Hub. This prompts for an access token
# on stdin; the tokenizer is pushed to the Hub later in the notebook.
interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): ··········
Add token as git credential? (Y/n) y
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (store).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [15]:
# Prefer the GPU whenever torch can see one; otherwise run everything on the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: cpu


In [11]:

# Reproducibility: seed the random number generators and request deterministic
# cuDNN kernels. Without a seed, the random train/eval sampling and the
# shuffled training batches differ on every run.
SEED = 42
np.random.seed(SEED)     # DataFrame.sample draws from NumPy's global RNG when no random_state is given
torch.manual_seed(SEED)  # seeds torch's default generator
torch.backends.cudnn.deterministic = True

# Hub repository the tokenizer is pushed to.
new_repo = "text_summarizer"
# Checkpoint the tokenizer and model are loaded from.
repo_name = "EducativeCS2023/bart-base-summarization"

In [6]:

# Load the articles (the file is read with latin-1 encoding).
df = pd.read_csv('/content/BBCarticles.csv', encoding='latin-1')

# Keep only the article text and its reference summary, and prepend the task
# prefix to every article. `.assign` builds a new frame instead of writing
# into a column selection of the raw frame.
df = df[['Text', 'Summary']].assign(Text=lambda frame: 'summarize: ' + frame.Text)

# Fraction of rows drawn for each split.
split_ratio = 0.025

# Sample the training rows and keep their ORIGINAL index labels for now: those
# labels are what lets us exclude exactly these rows from the evaluation pool.
# (Resetting the index first would make `drop` remove rows 0..N-1 instead,
# letting training rows leak into the evaluation set.)
train_rows = df.sample(frac=split_ratio)
eval_rows = df.drop(train_rows.index).sample(frac=split_ratio)

# Guard against train/eval leakage.
assert train_rows.index.intersection(eval_rows.index).empty, "train/eval splits overlap"

# Renumber only after the split is fixed, so positional lookups start at 0.
train_dataset = train_rows.reset_index(drop=True)
eval_dataset = eval_rows.reset_index(drop=True)


print("Training Dataset Size:", train_dataset.shape)
print("Evaluation Dataset Size:", eval_dataset.shape)


df.head(3)

Training Dataset Size: (56, 2)
Evaluation Dataset Size: (54, 2)


Unnamed: 0,Text,Summary
0,summarize: Ad sales boost Time Warner profit\n...,TimeWarner said fourth quarter sales rose 2% t...
1,summarize: Dollar gains on Greenspan speech\n\...,The dollar has hit its highest level against t...
2,summarize: Yukos unit buyer faces loan claim\n...,Yukos' owner Menatep Group says it will ask Ro...


In [7]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes (Text, Summary) pairs on access.

    Args:
        dataframe: frame with a ``Text`` column (model input) and a
            ``Summary`` column (target).
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length',
            truncation=True, return_tensors='pt')`` whose result exposes
            ``'input_ids'`` and ``'attention_mask'``.
        source_len: ``max_length`` passed to the tokenizer for the text.
        summ_len: ``max_length`` passed to the tokenizer for the summary.
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.Summary = self.data.Summary
        self.Text = self.data.Text

    def __len__(self):
        """Return the number of (Text, Summary) pairs."""
        return len(self.Summary)

    def __getitem__(self, index):
        """Tokenize and return the pair at position ``index``.

        Returns:
            dict with ``'source_ids'``, ``'source_mask'`` and ``'target_ids'``,
            each a ``torch.long`` tensor with the batch dimension removed.
        """
        # Positional lookup: samplers hand out 0..len-1, which only matches
        # label-based lookup when the frame has a default index.
        Text = str(self.Text.iloc[index])
        # Collapse newlines and repeated whitespace into single spaces.
        Text = ' '.join(Text.split())

        Summary = str(self.Summary.iloc[index])
        Summary = ' '.join(Summary.split())

        source_encoded = self.tokenizer(Text, max_length=self.source_len, padding='max_length', truncation=True, return_tensors='pt')
        target_encoded = self.tokenizer(Summary, max_length=self.summ_len, padding='max_length', truncation=True, return_tensors='pt')

        # Drop only the leading batch dimension added by return_tensors='pt';
        # a bare squeeze() would also collapse a length-1 sequence dimension.
        source_ids = source_encoded['input_ids'].squeeze(0)
        source_mask = source_encoded['attention_mask'].squeeze(0)
        target_ids = target_encoded['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long)
        }

In [17]:

# Tokenizer that belongs to the pretrained checkpoint.
tokenizer = BartTokenizer.from_pretrained(repo_name)

# Publish the tokenizer files to the target Hub repository.
tokenizer.push_to_hub(new_repo)

# Wrap both splits with the same limits: 512 tokens for the source text and
# 150 tokens for the summary.
training_set, eval_set = (
    CustomDataset(split, tokenizer, 512, 150)
    for split in (train_dataset, eval_dataset)
)

# Training batches are reshuffled every epoch; evaluation keeps dataset order.
training_loader = DataLoader(training_set, batch_size=2, shuffle=True, num_workers=0)
eval_loader = DataLoader(eval_set, batch_size=2, shuffle=False, num_workers=0)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

No files have been modified since last commit. Skipping to prevent empty commit.


In [19]:

# Load the pretrained checkpoint and place it on the selected device in one step.
model = BartForConditionalGeneration.from_pretrained(repo_name).to(device)

# Adam over every model parameter, learning rate 1e-4.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)


config.json:   0%|          | 0.00/1.75k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

In [22]:
def train(epoch, tokenizer, model, device, loader, optimizer):
    """Run one training pass over ``loader``, updating ``model`` in place.

    Args:
        epoch: epoch number, used only in the progress message.
        tokenizer: object exposing ``pad_token_id``.
        model: called with ``input_ids``, ``attention_mask``,
            ``decoder_input_ids`` and ``labels``; element 0 of its output is
            used as the loss.
        device: device the batch tensors are moved to.
        loader: iterable of dicts with ``'source_ids'``, ``'source_mask'``
            and ``'target_ids'``.
        optimizer: optimizer stepped once per batch.
    """
    model.train()
    for step, batch in enumerate(loader):
        targets = batch['target_ids'].to(device, dtype=torch.long)

        # Shift by one position: the decoder is fed tokens [0, n-1) and is
        # asked to predict tokens [1, n).
        decoder_inputs = targets[:, :-1].contiguous()
        next_tokens = targets[:, 1:]
        # Padding positions get the label -100 (the conventional "ignore"
        # value for the loss -- confirm against the model's loss function).
        labels = next_tokens.masked_fill(next_tokens == tokenizer.pad_token_id, -100)

        source_ids = batch['source_ids'].to(device, dtype=torch.long)
        source_mask = batch['source_mask'].to(device, dtype=torch.long)

        outputs = model(
            input_ids=source_ids,
            attention_mask=source_mask,
            decoder_input_ids=decoder_inputs,
            labels=labels,
        )
        loss = outputs[0]

        # Report the loss on the first batch and every 500th batch after it.
        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [23]:

# Number of full passes over the training loader. Kept in one place so the
# loop bound and the progress message cannot drift apart.
num_epochs = 2
for epoch in range(num_epochs):
    print(f"Training epoch: {epoch+1}/{num_epochs}")
    train(epoch, tokenizer, model, device, training_loader, optimizer)

Training epoch: 1/2
Epoch: 0, Loss:  0.46102428436279297
Training epoch: 2/2
Epoch: 1, Loss:  0.5255913734436035


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

Model successfully pushed to Hugging Face Hub


In [32]:
# Write the current model and tokenizer to local directories (paths are
# relative to the working directory); the prediction cell reloads them from
# these same paths.
model.save_pretrained("content/model")
tokenizer.save_pretrained("content/tokenizer")

Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_bos_token_id': 0, 'forced_eos_token_id': 2}


('content/tokenizer/tokenizer_config.json',
 'content/tokenizer/special_tokens_map.json',
 'content/tokenizer/vocab.json',
 'content/tokenizer/merges.txt',
 'content/tokenizer/added_tokens.json')

In [25]:
def predict(tokenizer, model, device, loader):
    """Generate summaries for every batch in ``loader``.

    Args:
        tokenizer: object whose ``decode`` turns a row of token ids into text.
        model: object exposing ``eval()`` and ``generate(...)``.
        device: device the batch tensors are moved to.
        loader: iterable of dicts with ``'source_ids'``, ``'source_mask'``
            and ``'target_ids'``.

    Returns:
        ``(predictions, actuals)``: two lists of strings in loader order,
        holding the decoded generations and the decoded reference targets.
    """
    def _to_text(token_rows):
        # Decode each row of ids, dropping special tokens.
        return [
            tokenizer.decode(row, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            for row in token_rows
        ]

    model.eval()
    predictions, actuals = [], []
    with torch.no_grad():
        for batch_index, batch in enumerate(loader):
            source_ids = batch['source_ids'].to(device, dtype=torch.long)
            source_mask = batch['source_mask'].to(device, dtype=torch.long)
            reference_ids = batch['target_ids'].to(device, dtype=torch.long)

            generated_ids = model.generate(
                input_ids=source_ids,
                attention_mask=source_mask,
                max_length=150,
                num_beams=2,
                repetition_penalty=2.5,
                length_penalty=1.0,
                early_stopping=True,
            )

            # Progress message on the first batch and every 100th after it.
            if batch_index % 100 == 0:
                print(f'Completed {batch_index} batches')

            predictions += _to_text(generated_ids)
            actuals += _to_text(reference_ids)

    return predictions, actuals

In [35]:

start_time = time.time()

# Reload the saved artifacts from disk. The reloaded model must be moved to
# `device` explicitly: `predict` sends every batch there, so a model left on
# the CPU would hit a device mismatch as soon as a GPU is in use.
model = BartForConditionalGeneration.from_pretrained('content/model').to(device)
tokenizer = BartTokenizer.from_pretrained('content/tokenizer')


predictions, actuals = predict(tokenizer, model, device, eval_loader)

# One row per evaluation example: generated summary next to its reference.
results = pd.DataFrame({'predictions': predictions, 'actuals': actuals})

results.to_csv('/content/results.csv')

# Note: the measured span also covers reloading the model and writing the CSV.
end_time = time.time()
time_taken = end_time - start_time
print(f"Time taken for predictions: {time_taken:.2f} seconds")

results.head()

Completed 0 batches
Time taken for predictions: 536.51 seconds


Unnamed: 0,predictions,actuals
0,"Keanu Reeves, best known for his role in the ...","""When I was 15 years old I asked my mom if it ..."
1,"usingly for consumers, the technology that com...","Instead, said Mr Doctorow, DRM systems were in..."
2,Blair is likely to name 5 May as election day...,Tony Blair is likely to name 5 May as election...
3,of public bodies are ill-prepared for the Fre...,"Committee chairman Alan Beith said: ""The DCA h..."
4,Incredibles movie has beaten Shrek 2 to the m...,The Incredibles movie has beaten Shrek 2 to th...


In [28]:
pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from evaluate)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [

In [30]:
pip install rouge_score

Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=5cba388776ea2a54ef21c4de6291a7e9f37b6e4177fa47a22ec45e0f8ca94e7e
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [31]:
import evaluate

# Load the ROUGE metric.
rouge_score = evaluate.load("rouge")

# Score every generated summary against its reference.
scores = rouge_score.compute(predictions=results['predictions'], references=results['actuals'])

# One metric per row for display.
rouge_scores_df = pd.DataFrame([scores]).T

rouge_scores_df.head()

Unnamed: 0,0
rouge1,0.751096
rouge2,0.667168
rougeL,0.560821
rougeLsum,0.561324
