In [None]:
!pip install datasets

Preprocess the dataset

In [None]:
import torch
import torch.nn as nn
import math
from datasets import load_dataset
model_checkpoint="Helsinki-NLP/opus-mt-en-fr"
from transformers import AutoTokenizer


# Tokenizer that belongs to the pretrained checkpoint named above.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# `kde4` dataset for the en/fr language pair.
raw_data = load_dataset('kde4', lang1='en', lang2='fr')

# Fixed-seed 80/20 split of the `train` portion into train and test subsets.
split_dataset = raw_data['train'].train_test_split(train_size=0.8, seed=14)

def preprocess(examples):
  """Tokenize a batch of English/French translation pairs.

  Args:
    examples: batch whose ``'translation'`` entry is a sequence of mappings
      with ``'en'`` (source) and ``'fr'`` (target) strings.

  Returns:
    The tokenizer output for the batch; the English text is passed as the
    input and the French text as ``text_target``, both padded/truncated to
    64 tokens.
  """
  sources, targets = [], []
  for pair in examples['translation']:
    sources.append(pair['en'])
    targets.append(pair['fr'])

  return tokenizer(
    sources,
    text_target=targets,
    max_length=64,
    padding='max_length',
    truncation=True,
  )

# Tokenize both splits in batches and drop the original (text) columns so only
# the tokenizer's outputs remain.
tokenized_datasets = split_dataset.map(
    preprocess,
    batched=True,
    remove_columns=split_dataset["train"].column_names,
)

train_data, test_data = tokenized_datasets['train'], tokenized_datasets['test']



from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding

# Collator that turns a list of tokenized examples into a batch of tensors.
data_collator = DataCollatorWithPadding(tokenizer)

# Training batches are reshuffled every epoch; test batches keep their order.
train_loader = DataLoader(
    train_data,
    batch_size=32,
    shuffle=True,
    collate_fn=data_collator,
    num_workers=2,
)
test_loader = DataLoader(
    test_data,
    batch_size=32,
    shuffle=False,
    collate_fn=data_collator,
    num_workers=2,
)

Use the GPU

In [None]:
# Report which accelerator is in use. `get_device_name(0)` raises when no CUDA
# device is present, so only query it after checking availability.
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
else:
    print("CUDA is not available; falling back to CPU")


Tesla T4


Define the components of the model (positional encoding, encoder, decoder)

In [None]:
import torch
import torch.nn as nn
import math
import matplotlib.pyplot as plt

class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to a batch of embeddings.

    The table follows the standard formulation
    ``PE[pos, 2i] = sin(pos / 10000^(2i/d_model))`` and
    ``PE[pos, 2i+1] = cos(pos / 10000^(2i/d_model))``.

    Args:
        max_len: number of positions to precompute.
        d_model: embedding width (odd widths are supported).
    """

    def __init__(self, max_len, d_model):
        super().__init__()
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # One frequency per sin/cos pair; the base is 10000 (not 1000).
        div_term = torch.exp(
            torch.arange(0, d_model, 2, dtype=torch.float) * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one fewer cosine column than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # A non-persistent buffer follows the module across `.to(device)` calls
        # without adding an entry to the state dict.
        self.register_buffer('pe', pe.unsqueeze(0), persistent=False)

    def forward(self, x):
        """Return ``x`` plus the encodings for its first ``x.size(1)`` positions.

        ``x`` is indexed as (batch, position, feature); the table is broadcast
        over the batch dimension.
        """
        return x + self.pe[:, :x.size(1)]

class transformer_model(nn.Module):
    """Encoder-decoder Transformer with a shared source/target embedding table.

    Args:
        vocab_size: number of token ids; also the width of the output logits.
        d_model: embedding / hidden width.
        max_len: number of positions covered by the positional encoding.
        nhead: attention heads per layer.
        feedforward_dim: width of each layer's feed-forward sub-block.
        dropout: dropout probability inside the encoder/decoder layers.
        num_layers: number of layers in the encoder and in the decoder.
    """

    def __init__(self, vocab_size, d_model, max_len, nhead, feedforward_dim, dropout, num_layers):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(max_len, d_model)

        # The stacks deep-copy the layer they are given, so the prototype layers
        # are kept as locals: registering them on `self` would only add unused
        # parameters to `model.parameters()` and the state dict.
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=feedforward_dim,
            dropout=dropout, batch_first=True,
        )
        self.encoder_stack = nn.TransformerEncoder(encoder_layer, num_layers=num_layers)

        decoder_layer = nn.TransformerDecoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=feedforward_dim,
            dropout=dropout, batch_first=True,
        )
        self.decoder_stack = nn.TransformerDecoder(decoder_layer, num_layers=num_layers)

        self.linear_layer = nn.Linear(d_model, vocab_size)

    def forward(self, x, tgt, src_mask=None, tgt_mask=None, src_key_padding_mask=None, tgt_key_padding_mask=None):
        """Compute vocabulary logits for every target position.

        Args:
            x: source token ids, batch-first.
            tgt: decoder input token ids, batch-first.
            src_mask: optional mask forwarded to the decoder as ``memory_mask``.
            tgt_mask: optional attention mask for decoder self-attention
                (e.g. a causal mask).
            src_key_padding_mask: optional mask of source padding positions;
                applied in the encoder and as the decoder's
                ``memory_key_padding_mask``.
            tgt_key_padding_mask: optional mask of target padding positions.

        Returns:
            Logits whose last dimension has size ``vocab_size``.
        """
        # Source and target share one embedding table and one position table.
        input_embeddings = self.positional_encoding(self.embeddings(x))
        tgt_embeddings = self.positional_encoding(self.embeddings(tgt))

        encoder_output = self.encoder_stack(input_embeddings, src_key_padding_mask=src_key_padding_mask)
        decoder_output = self.decoder_stack(
            tgt_embeddings, encoder_output, tgt_mask=tgt_mask, memory_mask=src_mask,
            tgt_key_padding_mask=tgt_key_padding_mask, memory_key_padding_mask=src_key_padding_mask
        )

        return self.linear_layer(decoder_output)

def generate_square_subsequent_mask(sz):
    """Build a ``sz`` x ``sz`` additive causal attention mask.

    Entries strictly above the main diagonal are ``-inf`` (future positions are
    blocked); the diagonal and everything below it are ``0``.
    """
    future = torch.ones(sz, sz, dtype=torch.bool).triu(diagonal=1)
    return torch.zeros(sz, sz).masked_fill(future, float('-inf'))





Train the model

In [None]:
# Model hyperparameters
vocab_size = len(tokenizer.get_vocab())
d_model = 512
max_len = 64
nhead = 8
feedforward_dim = 2048
dropout = 0.1
num_layers = 12
num_epochs = 3

# Build the model and move it to the GPU when one is available
model = transformer_model(vocab_size, d_model, max_len, nhead, feedforward_dim, dropout, num_layers)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
loss_fn = nn.CrossEntropyLoss()

pad_token_id = tokenizer.pad_token_id
# The tokenized labels carry no start-of-sequence token, so one has to be
# prepended for the decoder. Marian checkpoints use the pad token for that.
decoder_start_token_id = pad_token_id

# Per-epoch metrics, kept for plotting
loss_values = []
accuracy_values = []

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_tokens = 0

    for batch in train_loader:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        # Teacher forcing: the decoder sees [start, y_1, ..., y_{n-1}] and must
        # predict [y_1, ..., y_n]. Without the start token the first target
        # token is never predicted and decoding has nothing valid to start from.
        start_tokens = torch.full(
            (labels.size(0), 1), decoder_start_token_id, dtype=labels.dtype, device=device
        )
        target_inputs = torch.cat([start_tokens, labels[:, :-1]], dim=1)
        target_labels = labels.reshape(-1)

        tgt_mask = generate_square_subsequent_mask(target_inputs.size(1)).to(device)
        src_key_padding_mask = input_ids == pad_token_id
        tgt_key_padding_mask = target_inputs == pad_token_id
        # The start token shares the pad id but is a real input: it must stay
        # visible, otherwise position 0 would have nothing to attend to.
        tgt_key_padding_mask[:, 0] = False

        outputs = model(input_ids, target_inputs, tgt_mask=tgt_mask, src_key_padding_mask=src_key_padding_mask, tgt_key_padding_mask=tgt_key_padding_mask)
        logits = outputs.reshape(-1, vocab_size)

        # Only real (non-padding) target tokens contribute to loss and accuracy
        target_labels_mask = target_labels != pad_token_id
        loss = loss_fn(logits[target_labels_mask], target_labels[target_labels_mask])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Token-level accuracy of the greedy prediction
        preds = logits.argmax(dim=-1)
        correct_predictions += (preds[target_labels_mask] == target_labels[target_labels_mask]).sum().item()
        total_tokens += target_labels_mask.sum().item()

    avg_loss = total_loss / len(train_loader)
    accuracy = correct_predictions / total_tokens if total_tokens > 0 else 0

    # Store values for plotting
    loss_values.append(avg_loss)
    accuracy_values.append(accuracy)

    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {avg_loss:.4f}, Accuracy: {accuracy:.4f}")

Plot the results

In [None]:
# Side-by-side curves of the per-epoch training loss and token accuracy
epochs = range(1, num_epochs + 1)
fig, (loss_ax, accuracy_ax) = plt.subplots(1, 2, figsize=(12, 5))

# Left panel: loss
loss_ax.plot(epochs, loss_values, marker='o', color='blue')
loss_ax.set_title('Loss over Epochs')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Loss')

# Right panel: accuracy
accuracy_ax.plot(epochs, accuracy_values, marker='o', color='green')
accuracy_ax.set_title('Accuracy over Epochs')
accuracy_ax.set_xlabel('Epoch')
accuracy_ax.set_ylabel('Accuracy')

plt.show()

Function for translating a sentence

In [None]:
def translate(sentence):
    """Greedily decode a translation of ``sentence`` with the trained model.

    Args:
        sentence: source-language text.

    Returns:
        The decoded target-language string, with special tokens removed.
    """
    max_length = 64
    model.eval()
    with torch.no_grad():
        # A single sentence needs no padding, so the encoder never sees pad
        # tokens and no source padding mask is required.
        input_ids = tokenizer(sentence, return_tensors="pt", max_length=max_length, truncation=True).input_ids.to(device)

        # The tokenizer's vocabulary has no "<sos>"/"<eos>" entries (looking them
        # up yields the unknown-token id), so use its real special-token ids.
        # Marian checkpoints start decoding from the pad token.
        # NOTE: the first prediction is only meaningful if the decoder was
        # trained with this same start token.
        start_token_id = tokenizer.pad_token_id
        eos_token_id = tokenizer.eos_token_id

        generated = torch.full((1, 1), start_token_id, dtype=torch.long, device=device)

        for _ in range(max_length - 1):
            # Same causal mask as in training, so earlier positions are encoded
            # the way the model learned them.
            tgt_mask = generate_square_subsequent_mask(generated.size(1)).to(device)
            outputs = model(input_ids, generated, tgt_mask=tgt_mask)

            # Greedy choice from the logits of the last decoded position
            next_token = outputs[0, -1, :].argmax().view(1, 1)
            generated = torch.cat([generated, next_token], dim=1)

            if next_token.item() == eos_token_id:  # stop at end of sentence
                break

        # Drop the start token; only the generated tokens are decoded
        translated_sentence = tokenizer.decode(generated[0, 1:], skip_special_tokens=True)
        return translated_sentence


Calculate the BLEU score of the translated sentence

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize

def calculate_bleu(reference, translated):
    """Return the sentence-level BLEU score of ``translated`` against ``reference``.

    Both strings are split with ``word_tokenize``; the reference is supplied to
    ``sentence_bleu`` as the single entry of the reference list.
    """
    references = [word_tokenize(reference)]
    hypothesis = word_tokenize(translated)
    return sentence_bleu(references, hypothesis)


# One source sentence and the reference translation it is scored against
english_sentence = "hello everyone how are you all?"
reference_sentence = "bonjour à tous, comment allez-vous?"

# Translate with the trained model, then compare against the reference
translated_sentence = translate(english_sentence)
bleu_score = calculate_bleu(reference_sentence, translated_sentence)

print(f"Translated: {translated_sentence}")
print(f"BLEU Score: {bleu_score:.4f}")