In [7]:
!pip install datasets rouge transformers

[0m

In [2]:
pip install nltk 

Collecting nltk
  Downloading nltk-3.9.1-py3-none-any.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting regex>=2021.8.3
  Downloading regex-2024.9.11-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (782 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m782.7/782.7 KB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hCollecting joblib
  Downloading joblib-1.4.2-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.8/301.8 KB[0m [31m21.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting click
  Downloading click-8.1.7-py3-none-any.whl (97 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 KB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: regex, joblib, click, nltk
Successfully installed click-8.1.7 joblib-1.4.2 nltk-3.9.1 regex-2024.9.11
[0mNote:

In [2]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting threadpoolctl>=3.1.0
  Downloading threadpoolctl-3.5.0-py3-none-any.whl (18 kB)
Collecting scipy>=1.6.0
  Downloading scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.5.2 scipy-1.14.1 threadpoolctl-3.5.0
[0mNote: you may need to restart the kernel to use updated packages.


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import load_dataset
from tqdm import tqdm
import random
import numpy as np
from rouge import Rouge

In [None]:
# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

In [None]:
# Load pre-trained BART model and tokenizer
model_name = "facebook/bart-large-cnn"
tokenizer = BartTokenizer.from_pretrained(model_name)
model = BartForConditionalGeneration.from_pretrained(model_name)

In [None]:
# Load dataset from Hugging Face
dataset = load_dataset("cnn_dailymail", "3.0.0")

In [None]:
# Take only 10% of the training and test data
train_val_size = int(len(dataset['train']) * 0.1)
test_size = int(len(dataset['test']) * 0.1)

train_val_data = dataset['train'].select(range(train_val_size))
test_data = dataset['test'].select(range(test_size))

In [None]:
# Manually split train and validation
val_size = int(train_val_size * 0.1)
train_size = train_val_size - val_size

train_data = train_val_data.select(range(train_size))
val_data = train_val_data.select(range(train_size, train_val_size))

In [None]:
MAX_LENGTH = 1024
BATCH_SIZE = 4
LEARNING_RATE = 2e-5

In [None]:
# Dataset class
class SummarizationDataset(Dataset):
    def __init__(self, data, tokenizer, max_length):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        inputs = self.tokenizer(item['article'], max_length=self.max_length, truncation=True, padding='max_length', return_tensors='pt')
        with self.tokenizer.as_target_tokenizer():
            labels = self.tokenizer(item['highlights'], max_length=self.max_length, truncation=True, padding='max_length', return_tensors='pt')
        
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
            'labels': labels['input_ids'].squeeze()
        }

In [None]:
# Create datasets
train_dataset = SummarizationDataset(train_data, tokenizer, MAX_LENGTH)
val_dataset = SummarizationDataset(val_data, tokenizer, MAX_LENGTH)
test_dataset = SummarizationDataset(test_data, tokenizer, MAX_LENGTH)

In [None]:
# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

In [None]:
# optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [None]:
# Training function
def train(model, dataloader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in tqdm(dataloader, desc="Training"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(dataloader)

In [None]:
# Evaluation function
def evaluate(model, dataloader, device):
    model.eval()
    total_loss = 0
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            
            total_loss += loss.item()
    
    return total_loss / len(dataloader)

In [None]:
# Training loop with early stopping
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

N_EPOCHS = 3
PATIENCE = 2

best_valid_loss = float('inf')
epochs_without_improvement = 0

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, device)
    valid_loss = evaluate(model, val_loader, device)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bart_summarizer.pt')
        epochs_without_improvement = 0
    else:
        epochs_without_improvement += 1
    
    print(f'Epoch: {epoch+1:02}')
    print(f'\tTrain Loss: {train_loss:.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f}')
    
    if epochs_without_improvement == PATIENCE:
        print(f'Early stopping after {epoch+1} epochs without improvement.')
        break

# Load the best model
model.load_state_dict(torch.load('bart_summarizer.pt'))

In [None]:
# Generate summary
def generate_summary(model, tokenizer, article, max_length=150):
    inputs = tokenizer(article, max_length=1024, truncation=True, return_tensors='pt').to(device)
    summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=max_length, early_stopping=True)
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [5]:
# Evaluation metrics
rouge = Rouge()

def calculate_rouge(hypotheses, references):
    return rouge.get_scores(hypotheses, references, avg=True)

# Test the model
test_articles = []
test_summaries = []
generated_summaries = []

for batch in tqdm(test_loader, desc="Testing"):
    articles = tokenizer.batch_decode(batch['input_ids'], skip_special_tokens=True)
    summaries = tokenizer.batch_decode(batch['labels'], skip_special_tokens=True)
    
    for article in articles:
        generated = generate_summary(model, tokenizer, article)
        
        test_articles.append(article)
        test_summaries.append(summaries[articles.index(article)])
        generated_summaries.append(generated)

# Calculate ROUGE scores
rouge_scores = calculate_rouge(generated_summaries, test_summaries)

print("ROUGE-1 F1 Score:", rouge_scores['rouge-1']['f'])
print("ROUGE-2 F1 Score:", rouge_scores['rouge-2']['f'])
print("ROUGE-L F1 Score:", rouge_scores['rouge-l']['f'])

# Print a sample summary
sample_idx = random.randint(0, len(test_articles) - 1)
print("\nSample Article:")
print(test_articles[sample_idx][:500] + "...")  
print("\nActual Summary:")
print(test_summaries[sample_idx])
print("\nGenerated Summary:")
print(generated_summaries[sample_idx])

# Created/Modified files during execution:
print("bart_summarizer.pt")

Training: 100%|████████████████████████████████████████████████████████| 6460/6460 [33:21<00:00,  3.23it/s]
Evaluating: 100%|████████████████████████████████████████████████████████| 718/718 [01:16<00:00,  9.35it/s]


Epoch: 01
	Train Loss: 0.093
	 Val. Loss: 0.077


Training: 100%|████████████████████████████████████████████████████████| 6460/6460 [33:14<00:00,  3.24it/s]
Evaluating: 100%|████████████████████████████████████████████████████████| 718/718 [01:14<00:00,  9.64it/s]


Epoch: 02
	Train Loss: 0.060
	 Val. Loss: 0.080


Training:  10%|█████▌                                                   | 624/6460 [03:13<29:41,  3.28it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Training: 100%|████████████████████████████████████████████████████████| 6460/6460 [33:19<00:00,  3.23it/s]
Evaluating: 100%|████████████████████████████████████████████████████████| 718/718 [01:16<00:00,  9.39it/s]
  model.load_state_dict(torch.load('bart_summarizer.pt'))


Epoch: 03
	Train Loss: 0.046
	 Val. Loss: 0.083
Early stopping after 3 epochs without improvement.


Testing: 100%|███████████████████████████████████████████████████████████| 288/288 [08:35<00:00,  1.79s/it]


ROUGE-1 F1 Score: 0.30388266783945317
ROUGE-2 F1 Score: 0.11718122237973874
ROUGE-L F1 Score: 0.28229055415944043

Sample Article:
(CNN)Lady Antebellum singer Hillary Scott's tour bus caught fire on a Texas freeway Thursday morning, but everyone on board was safely evacuated. Michael Barnett captured dramatic video of the fire, on Interstate 30 just northeast of Dallas, and uploaded it to CNN iReport. Smoke and flames poured from the rear of the bus as traffic slowed to a crawl and Barnett slowly approached in his vehicle. As he drew closer to the bus, Barnett decided to stop filming because he didn't know what to expect. "...

Actual Summary:
Country band Lady Antebellum's bus caught fire Thursday on a Texas freeway.
A CNN iReporter captured the dramatic scene on video.
Singer Hillary Scott shared a pic of the charred bus on Instagram.

Generated Summary:
Hillary Scott's tour bus caught fire on a Texas freeway Thursday morning.
Michael Barnett captured dramatic video of the fire, on I

SAMPLE CODE

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import random
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from collections import Counter
import math

# Download NLTK data
nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

# Set random seed for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

# Sample dataset
sample_data = [
    {
        "article": "The quick brown fox jumps over the lazy dog. It was a beautiful day in the forest. The birds were singing and the trees were swaying in the breeze.",
        "summary": "Fox jumps over dog on a nice day in the forest."
    },
    {
        "article": "Scientists have discovered a new species of dinosaur in Argentina. The fossils suggest it was one of the largest animals to ever walk the Earth.",
        "summary": "New giant dinosaur species found in Argentina."
    },
    {
        "article": "A new study shows that drinking coffee may have health benefits. Researchers found that moderate coffee consumption is associated with a lower risk of heart disease.",
        "summary": "Coffee drinking linked to lower heart disease risk."
    }
]

# Tokenizer
def tokenize(text):
    return word_tokenize(text.lower())

# Vocabulary building
def build_vocab(data, min_freq=1):
    counter = Counter()
    for item in data:
        counter.update(tokenize(item['article']))
        counter.update(tokenize(item['summary']))

    vocab = {'<unk>': 0, '<pad>': 1, '<sos>': 2, '<eos>': 3}
    for word, count in counter.items():
        if count >= min_freq:
            vocab[word] = len(vocab)
    return vocab

# Build vocabulary
vocab = build_vocab(sample_data)

# Constants
MAX_LENGTH = 100
BATCH_SIZE = 2
LEARNING_RATE = 0.001

# Dataset class
class SummarizationDataset(Dataset):
    def __init__(self, data, vocab, max_length):
        self.data = data
        self.vocab = vocab
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        article = tokenize(item['article'])[:self.max_length]
        summary = tokenize(item['summary'])[:self.max_length]

        article_ids = [self.vocab['<sos>']] + [self.vocab.get(token, self.vocab['<unk>']) for token in article] + [self.vocab['<eos>']]
        summary_ids = [self.vocab['<sos>']] + [self.vocab.get(token, self.vocab['<unk>']) for token in summary] + [self.vocab['<eos>']]

        # Pad sequences to ensure equal length in each batch
        article_ids = article_ids + [self.vocab['<pad>']] * (self.max_length - len(article_ids))
        summary_ids = summary_ids + [self.vocab['<pad>']] * (self.max_length - len(summary_ids))


        return torch.tensor(article_ids), torch.tensor(summary_ids)

# Create datasets
dataset = SummarizationDataset(sample_data, vocab, MAX_LENGTH)
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Transformer model
class TransformerSummarizer(nn.Module):
    def __init__(self, vocab_size, d_model, nhead, num_encoder_layers, num_decoder_layers, dim_feedforward, dropout):
        super(TransformerSummarizer, self).__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout)

        self.transformer = nn.Transformer(
            d_model=d_model,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout
        )

        self.fc_out = nn.Linear(d_model, vocab_size)

    def forward(self, src, trg):
        src = self.embedding(src) * math.sqrt(self.d_model)
        trg = self.embedding(trg) * math.sqrt(self.d_model)

        src = self.pos_encoder(src)
        trg = self.pos_encoder(trg)

        src_mask = self.transformer.generate_square_subsequent_mask(src.size(1)).to(src.device)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg.size(1)).to(trg.device)

        output = self.transformer(src.transpose(0, 1), trg.transpose(0, 1), src_mask, trg_mask)
        output = self.fc_out(output.transpose(0, 1))

        return output

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return self.dropout(x)

# Hyperparameters
VOCAB_SIZE = len(vocab)
D_MODEL = 128
NHEAD = 4
NUM_ENCODER_LAYERS = 2
NUM_DECODER_LAYERS = 2
DIM_FEEDFORWARD = 256
DROPOUT = 0.1

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = TransformerSummarizer(VOCAB_SIZE, D_MODEL, NHEAD, NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, DIM_FEEDFORWARD, DROPOUT).to(device)

# Optimizer and loss function
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss(ignore_index=vocab['<pad>'])

# Training function
def train(model, iterator, optimizer, criterion, clip):
    model.train()
    epoch_loss = 0

    for batch in iterator:
        src, trg = batch
        src, trg = src.to(device), trg.to(device)

        optimizer.zero_grad()
        output = model(src, trg[:, :-1])
        output_dim = output.shape[-1]
        output = output.contiguous().view(-1, output_dim)
        trg = trg[:, 1:].contiguous().view(-1)

        loss = criterion(output, trg)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

# Training loop
N_EPOCHS = 100
CLIP = 1

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_loader, optimizer, criterion, CLIP)
    print(f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}')

# Generate summary
def generate_summary(model, src, max_len=50):
    model.eval()

    src_tensor = src.unsqueeze(0).to(device)
    trg_tensor = torch.tensor([[vocab['<sos>']]], dtype=torch.long).to(device)

    for _ in range(max_len):
        with torch.no_grad():
            output = model(src_tensor, trg_tensor)

        pred_token = output.argmax(2)[:, -1].item()
        trg_tensor = torch.cat([trg_tensor, torch.tensor([[pred_token]], dtype=torch.long).to(device)], dim=1)

        if pred_token == vocab['<eos>']:
            break

    return trg_tensor.squeeze(0)

# Test the model
for item in sample_data:
    article = item['article']
    actual_summary = item['summary']

    src_tensor = torch.tensor([vocab.get(token, vocab['<unk>']) for token in tokenize(article)], dtype=torch.long).to(device)
    generated_ids = generate_summary(model, src_tensor)
    generated_summary = ' '.join([list(vocab.keys())[list(vocab.values()).index(idx)] for idx in generated_ids if idx not in [vocab['<sos>'], vocab['<eos>'], vocab['<pad>']]])

    print("\nArticle:", article)
    print("Actual Summary:", actual_summary)
    print("Generated Summary:", generated_summary)




Epoch: 01, Train Loss: 4.433
Epoch: 02, Train Loss: 3.720
Epoch: 03, Train Loss: 3.211
Epoch: 04, Train Loss: 2.941
Epoch: 05, Train Loss: 2.592
Epoch: 06, Train Loss: 2.249
Epoch: 07, Train Loss: 2.055
Epoch: 08, Train Loss: 1.701
Epoch: 09, Train Loss: 1.443
Epoch: 10, Train Loss: 1.232
Epoch: 11, Train Loss: 1.061
Epoch: 12, Train Loss: 0.907
Epoch: 13, Train Loss: 0.767
Epoch: 14, Train Loss: 0.634
Epoch: 15, Train Loss: 0.507
Epoch: 16, Train Loss: 0.536
Epoch: 17, Train Loss: 0.371
Epoch: 18, Train Loss: 0.311
Epoch: 19, Train Loss: 0.280
Epoch: 20, Train Loss: 0.242
Epoch: 21, Train Loss: 0.178
Epoch: 22, Train Loss: 0.155
Epoch: 23, Train Loss: 0.137
Epoch: 24, Train Loss: 0.107
Epoch: 25, Train Loss: 0.096
Epoch: 26, Train Loss: 0.091
Epoch: 27, Train Loss: 0.069
Epoch: 28, Train Loss: 0.056
Epoch: 29, Train Loss: 0.052
Epoch: 30, Train Loss: 0.042
Epoch: 31, Train Loss: 0.041
Epoch: 32, Train Loss: 0.037
Epoch: 33, Train Loss: 0.032
Epoch: 34, Train Loss: 0.032
Epoch: 35, Tra