Importing the required libraries

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from tqdm import tqdm

import copy

  from .autonotebook import tqdm as notebook_tqdm


setting device to 'cuda' if available else 'cpu' for training the model

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


downloading dataset and the tokenizer to tokenize the data

In [3]:
dataset = load_dataset("cnn_dailymail", "3.0.0")

tokenizer = AutoTokenizer.from_pretrained("t5-small")

extracting the data from the downloaded dataset

In [4]:
new_dataset = copy.deepcopy(dataset)

train_subset = dataset["train"].select(range(int(len(dataset["train"]) * 0.1)))

print(len(train_subset))

new_dataset["train"] = train_subset

28711


preprocessing the data before feeding it to the model

In [5]:
def preprocess_function(examples):
    # Truncate or pad inputs and outputs
    inputs = ["summarize: " + article for article in examples["article"]]
    targets = examples["highlights"]
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding="max_length")
    labels = tokenizer(targets, max_length=128, truncation=True, padding="max_length")
    
    # Adjust labels to ignore padding during loss computation
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_datasets = new_dataset.map(preprocess_function, batched=True)

preparing a dataloader to load data in batches in the model

In [6]:
train_dataset = tokenized_datasets["train"]
val_dataset = tokenized_datasets["validation"]
test_dataset = tokenized_datasets["test"]

train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
val_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

train_loader = DataLoader(train_dataset, batch_size=12, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)
test_loader = DataLoader(test_dataset, batch_size=8)

encoder part of the seq2seq model

In [7]:
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hidden_dim, n_layers, dropout):
        super(Encoder, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        # Embedding layer
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        # LSTM
        self.rnn = nn.LSTM(emb_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
    
    def forward(self, src):
        embedded = self.embedding(src)  # (batch_size, seq_len)
        outputs, (hidden, cell) = self.rnn(embedded)  # outputs: (batch_size, seq_len, hidden_dim)
        return outputs, (hidden, cell)

attentioni mechanism to focus on the important parts of the input sequence

In [8]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super(Attention, self).__init__()
        self.attn = nn.Linear(hidden_dim * 2, hidden_dim)
        self.v = nn.Parameter(torch.rand(hidden_dim))

    def forward(self, decoder_hidden, encoder_outputs):
        batch_size = encoder_outputs.size(0)
        seq_len = encoder_outputs.size(1)
        
        # Repeat decoder hidden state for each time step
        decoder_hidden = decoder_hidden[0]
        decoder_hidden = decoder_hidden.unsqueeze(1).repeat(1, seq_len, 1)
        
        # Calculate attention scores
        energy = torch.tanh(self.attn(torch.cat((decoder_hidden, encoder_outputs), dim=2)))  # (batch_size, seq_len, hidden_dim)
        attention = torch.sum(self.v * energy, dim=2)  # (batch_size, seq_len)
        
        # Compute attention weights
        attention_weights = torch.softmax(attention, dim=1)  # (batch_size, seq_len)
        
        # Compute context vector
        context_vector = torch.bmm(attention_weights.unsqueeze(1), encoder_outputs)  # (batch_size, 1, hidden_dim)
        context_vector = context_vector.squeeze(1)  # (batch_size, hidden_dim)
        
        return context_vector, attention_weights


decoder part of the seq2seq model

In [9]:
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hidden_dim, n_layers, dropout, attention):
        super(Decoder, self).__init__()
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.dropout = dropout
        self.attention = attention
        
        # Embedding layer
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        # LSTM
        self.rnn = nn.LSTM(emb_dim + hidden_dim, hidden_dim, n_layers, dropout=dropout, batch_first=True)
        
        # Linear layer to output probabilities over the vocabulary
        self.fc_out = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, input, decoder_hidden, encoder_outputs):
        embedded = self.embedding(input).unsqueeze(1)  # (batch_size, 1, emb_dim)
        
        # Get context vector from attention
        context_vector, _ = self.attention(decoder_hidden[0], encoder_outputs)  # (batch_size, hidden_dim)
        
        # Concatenate context vector with embedded input token
        rnn_input = torch.cat((embedded, context_vector.unsqueeze(1)), dim=2)  # (batch_size, 1, emb_dim + hidden_dim)
        
        # Pass through LSTM
        output, (hidden, cell) = self.rnn(rnn_input, decoder_hidden)
        
        # Predict the next word
        prediction = self.fc_out(output.squeeze(1))  # (batch_size, output_dim)
        
        return prediction, (hidden, cell)


getting all together to form the seq2seq model

In [10]:
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device, pad_idx):
        super(Seq2Seq, self).__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.pad_idx = pad_idx
    
    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        batch_size = src.size(0)
        trg_len = trg.size(1)
        batch_size = src.size(0)
        
        # Initialize tensors to hold the outputs and hidden states
        outputs = torch.zeros(batch_size, trg_len, self.decoder.output_dim).to(self.device)
        
        # Encoder outputs and hidden state
        encoder_outputs, (hidden, cell) = self.encoder(src)
        
        # First input to the decoder is the <sos> token (start of sequence)
        input = trg[:, 0]
        
        for t in range(1, trg_len):
            # Forward pass through the decoder
            output, (hidden, cell) = self.decoder(input, (hidden, cell), encoder_outputs)
            outputs[:, t] = output
            
            # Get the highest probability token and decide whether to use teacher forcing
            top1 = output.argmax(1)  
            input = trg[:, t] if torch.rand(1).item() < teacher_forcing_ratio else top1
        
        return outputs


some parameters for training the model

In [11]:
INPUT_DIM = len(tokenizer.vocab)
OUTPUT_DIM = len(tokenizer.vocab)
EMB_DIM = 256
HIDDEN_DIM = 512
N_LAYERS = 2
DROPOUT = 0.5
PAD_IDX = tokenizer.pad_token_id

initializing the model and optimizer

In [12]:
encoder = Encoder(INPUT_DIM, EMB_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT)
attention = Attention(HIDDEN_DIM)
decoder = Decoder(OUTPUT_DIM, EMB_DIM, HIDDEN_DIM, N_LAYERS, DROPOUT, attention)
model = Seq2Seq(encoder, decoder, device=device, pad_idx=PAD_IDX).to(device)

optimizer = optim.Adam(model.parameters())

def calculate_loss(pred, target):
    # Compute the loss ignoring the padding token
    loss = F.cross_entropy(pred.view(-1, pred.shape[-1]), target.view(-1), ignore_index=PAD_IDX)
    return loss

traing loop for the model

In [13]:
def train(model, train_loader, val_loader, optimizer, device, num_epochs=5):
    for epoch in range(num_epochs):  # Iterate over epochs
        model.train()
        total_train_loss = 0
        
        # Training phase with progress bar
        with tqdm(train_loader, unit="batch", desc=f"Epoch {epoch+1}/{num_epochs}") as tepoch:
            for batch in tepoch:
                src = batch["input_ids"].to(device)
                trg = batch["labels"].to(device)
                
                optimizer.zero_grad()
                output = model(src, trg)
                
                loss = calculate_loss(output, trg)
                loss.backward()
                optimizer.step()
                
                total_train_loss += loss.item()
                tepoch.set_postfix(train_loss=total_train_loss / (tepoch.n + 1))  # Display loss in progress bar
        
        # Training loss for the epoch
        avg_train_loss = total_train_loss / len(train_loader)
        print(f"Epoch {epoch+1} - Training Loss: {avg_train_loss:.4f}")

        # Validation phase
        model.eval()
        total_val_loss = 0
        
        with torch.no_grad():  # Disable gradient calculation for validation
            with tqdm(val_loader, unit="batch", desc=f"Validation {epoch+1}/{num_epochs}") as vepoch:
                for batch in vepoch:
                    src = batch["input_ids"].to(device)
                    trg = batch["labels"].to(device)

                    output = model(src, trg)
                    
                    loss = calculate_loss(output, trg)
                    total_val_loss += loss.item()
                    vepoch.set_postfix(val_loss=total_val_loss / (vepoch.n + 1))  # Display loss in progress bar
        
        # Validation loss for the epoch
        avg_val_loss = total_val_loss / len(val_loader)
        print(f"Epoch {epoch+1} - Validation Loss: {avg_val_loss:.4f}")

training the model

In [14]:
train(model, train_loader, val_loader, optimizer, device, num_epochs=5)

Epoch 1/5: 100%|██████████| 2393/2393 [48:05<00:00,  1.21s/batch, train_loss=6.72]


Epoch 1 - Training Loss: 6.7205


Validation 1/5: 100%|██████████| 1671/1671 [06:30<00:00,  4.28batch/s, val_loss=6.68]


Epoch 1 - Validation Loss: 6.6767


Epoch 2/5: 100%|██████████| 2393/2393 [48:09<00:00,  1.21s/batch, train_loss=6.25]


Epoch 2 - Training Loss: 6.2505


Validation 2/5: 100%|██████████| 1671/1671 [06:29<00:00,  4.29batch/s, val_loss=6.48]


Epoch 2 - Validation Loss: 6.4814


Epoch 3/5: 100%|██████████| 2393/2393 [48:06<00:00,  1.21s/batch, train_loss=6]   


Epoch 3 - Training Loss: 6.0006


Validation 3/5: 100%|██████████| 1671/1671 [06:28<00:00,  4.30batch/s, val_loss=6.38]


Epoch 3 - Validation Loss: 6.3830


Epoch 4/5: 100%|██████████| 2393/2393 [47:52<00:00,  1.20s/batch, train_loss=5.81]


Epoch 4 - Training Loss: 5.8096


Validation 4/5: 100%|██████████| 1671/1671 [06:28<00:00,  4.30batch/s, val_loss=6.33]


Epoch 4 - Validation Loss: 6.3317


Epoch 5/5: 100%|██████████| 2393/2393 [47:53<00:00,  1.20s/batch, train_loss=5.67]


Epoch 5 - Training Loss: 5.6658


Validation 5/5: 100%|██████████| 1671/1671 [06:28<00:00,  4.30batch/s, val_loss=6.3] 

Epoch 5 - Validation Loss: 6.2993





preparing data to test the model

In [15]:
test_data = next(iter(test_loader))

input_data = test_data["input_ids"].to(device)
input_labels = test_data["labels"].to(device)

# The special tokens for T5
sos_token = tokenizer.cls_token  # T5 does not use <sos>; instead, <pad> is used to start decoding
eos_token = tokenizer.eos_token  # End of sequence token for T5

# Retrieve the indices
sos_idx = tokenizer.pad_token_id  # T5 uses <pad> (padding token) as the starting token
eos_idx = tokenizer.eos_token_id  # Index of the <eos> token

print(f"<sos> Token: {sos_token}, Index: {sos_idx}")
print(f"<eos> Token: {eos_token}, Index: {eos_idx}")


<sos> Token: None, Index: 0
<eos> Token: </s>, Index: 1


making prediction on the test data

In [16]:
def predict(model, src_tensor, max_len=50):
    model.eval()
    with torch.no_grad():
        # Encode the input sentence
        encoder_outputs, (hidden, cell) = model.encoder(src_tensor)

        # Start with the <sos> token
        trg_indexes = [sos_idx]  # Replace with your <sos> token index

        for _ in range(max_len):
            trg_tensor = torch.tensor([trg_indexes[-1]], dtype=torch.long).to(device)

            # Decode the next token
            output, (hidden, cell) = model.decoder(trg_tensor, (hidden, cell), encoder_outputs)

            # Get the token with the highest probability
            pred_token = output.argmax(1).item()
            trg_indexes.append(pred_token)

            # Stop if the <eos> token is generated
            if pred_token == eos_idx:  # Replace with your <eos> token index
                break

        return trg_indexes


predicting from the input test data

In [17]:
prediction = predict(model, input_data[6].unsqueeze(0))

input_sentence = tokenizer.decode(input_data[6])
predicted_sentence = tokenizer.decode(prediction)
print("Input Sentence:", input_sentence)
print("Predicted Sentence:", predicted_sentence)


Input Sentence: summarize: (CNN)Governments around the world are using the threat of terrorism -- real or perceived -- to advance executions, Amnesty International alleges in its annual report on the death penalty. "The dark trend of governments using the death penalty in a futile attempt to tackle real or imaginary threats to state security and public safety was stark last year," said Salil Shetty, Amnesty's Secretary General in a release. "It is shameful that so many states around the world are essentially playing with people's lives -- putting people to death for 'terrorism' or to quell internal instability on the ill-conceived premise of deterrence." The report, "Death Sentences and Executions 2014," cites the example of Pakistan lifting a six-year moratorium on the execution of civilians following the horrific attack on a school in Peshawar in December. China is also mentioned, as having used the death penalty as a tool in its "Strike Hard" campaign against terrorism in the restiv