In [1]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

## 1. Get the Data

In [2]:
!pip install transformers



In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[

In [4]:
from datasets import load_dataset

# Load the opus_books dataset for English-Hindi
raw_dataset = load_dataset("kde4", lang1="en", lang2="fr")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/4.25k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/5.10k [00:00<?, ?B/s]

The repository for kde4 contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/kde4.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/7.05M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/210173 [00:00<?, ? examples/s]

In [5]:
raw_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 210173
    })
})

In [6]:
raw_dataset["train"]["translation"][0]

{'en': 'Lauri Watts', 'fr': 'Lauri Watts'}

## 2. Process the Data

In [7]:
split_dataset = raw_dataset["train"].train_test_split(test_size=0.5, seed=42)

In [8]:
split_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'translation'],
        num_rows: 105086
    })
    test: Dataset({
        features: ['id', 'translation'],
        num_rows: 105087
    })
})

In [9]:
train_data = split_dataset["train"]
test_data = split_dataset["test"]

train_data, test_data

(Dataset({
     features: ['id', 'translation'],
     num_rows: 105086
 }),
 Dataset({
     features: ['id', 'translation'],
     num_rows: 105087
 }))

In [10]:
# Minimizing our Dataset size for testing!!
# Hugging Face datasets' train_test_split and scikit-learn's train_test_split have different behaviors.
# Hugging Face's train_test_split returns a dictionary with keys 'train' and 'test',
# whereas scikit-learn's train_test_split returns a tuple, which can be unpacked.

train = train_data.train_test_split(train_size=1_000, seed=42)
test = test_data.train_test_split(train_size=1_000, seed=42)

In [11]:
train_dataset =  train["train"]
test_dataset = test["train"]

In [12]:
# we must use tokenizer for our language if not try building own tokenizer

In [13]:
from transformers import AutoTokenizer

model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, return_tensors="pt")

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]



In [14]:
split_dataset["train"]["translation"][1000]

{'en': 'Auto-Add to Collection',
 'fr': 'Ajouter automatiquement à la collection'}

In [15]:
eng_sentence = split_dataset["train"][10]["translation"]["en"]
fr_sentence = split_dataset["train"][10]["translation"]["fr"]

In [16]:
inputs = tokenizer(eng_sentence, text_target=fr_sentence)
inputs

{'input_ids': [3937, 16292, 18318, 26226, 6886, 7, 15, 2589, 7, 2758, 4004, 24, 2212, 26, 471, 18318, 18, 4, 2321, 5369, 2, 65, 271, 5935, 18318, 9, 28, 10, 77, 6740, 1437, 3, 616, 52, 2259, 18, 15, 13878, 3674, 15675, 1192, 33, 3707, 147, 232, 151, 5369, 12, 45, 11316, 30, 15, 1690, 9012, 14249, 3, 84, 1581, 943, 32, 3150, 1557, 2, 11367, 7, 271, 14250, 3, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [3556, 46990, 34749, 51, 4270, 23, 38, 2016, 5, 4027, 2603, 24, 112, 27, 521, 46990, 31, 193, 2988, 13650, 2, 65, 203, 46990, 9, 5, 654, 28, 11, 34, 2428, 6740, 3, 793, 70, 9338, 31, 38, 1192, 5, 4821, 3544, 6091, 2147, 5, 28632, 652, 15480, 36, 38, 1845, 2407, 6844, 3, 375, 722, 1581, 95, 1790, 777, 21, 3188, 2, 12057, 23, 203, 15480, 3, 0]}

In [17]:
eng_sentence, fr_sentence

('Each exported slideshow consists of a series of image files (one for each slide in the original presentation, plus two title slides) and an index file. They are created in a fairly complex folder structure that allows more than one presentation to be stored on a single memory stick. A simple example is shown below, consisting of two presentations.',
 'Chaque diapositive exportée consiste en une série de fichiers images (une pour chaque diapositive dans votre présentation originale, plus deux diapositives de titre) et un fichier index. Ils sont créés dans une structure de dossiers relativement complexe permettant de stocker plusieurs présentations sur une seule carte mémoire. Un exemple simple se trouve ci-dessous, consistant en deux présentations.')

In [18]:
print(tokenizer.encode("& Abaisser le cadre"))
tokenizer.encode("Formula")

[402, 5999, 1794, 7036, 19, 361, 0]


[29359, 0]

In [19]:
wrong_targets = tokenizer(fr_sentence)
print(tokenizer.convert_ids_to_tokens(wrong_targets["input_ids"]))
print(tokenizer.convert_ids_to_tokens(inputs["labels"]))

['▁Cha', 'que', '▁dia', 'positive', '▁export', 'ée', '▁consist', 'e', '▁en', '▁une', '▁s', 'érie', '▁de', '▁fi', 'chi', 'ers', '▁images', '▁(', 'une', '▁pour', '▁cha', 'que', '▁dia', 'positive', '▁dans', '▁vo', 't', 're', '▁pré', 'sent', 'ation', '▁original', 'e', ',', '▁plus', '▁de', 'ux', '▁dia', 'positive', 's', '▁de', '▁tit', 're', ')', '▁et', '▁un', '▁fi', 'chi', 'er', '▁index', '.', '▁Il', 's', '▁son', 't', '▁c', 'ré', 'és', '▁dans', '▁une', '▁structure', '▁de', '▁dossier', 's', '▁relative', 'ment', '▁complex', 'e', '▁per', 'met', 'tant', '▁de', '▁stock', 'er', '▁plus', 'i', 'eurs', '▁pré', 'sent', 'ations', '▁sur', '▁une', '▁se', 'ule', '▁cart', 'e', '▁m', 'é', 'm', 'oire', '.', '▁Un', '▁ex', 'e', 'mple', '▁simple', '▁se', '▁tro', 'u', 've', '▁ci', '-', 'des', 's', 'ous', ',', '▁consist', 'ant', '▁en', '▁de', 'ux', '▁pré', 'sent', 'ations', '.', '</s>']
['▁Chaque', '▁diapositive', '▁exporté', 'e', '▁consiste', '▁en', '▁une', '▁série', '▁de', '▁fichiers', '▁images', '▁(', 'une', 

In [20]:
max_length = 128

def preprocess_function(examples):
    inputs = [ex["en"] for ex in examples["translation"]]
    targets = [ex["fr"] for ex in examples["translation"]]
    model_inputs = tokenizer(
        inputs, text_target=targets, padding="max_length", max_length=128, truncation=True
    )
    return {"input_ids": model_inputs["input_ids"], "labels": model_inputs["labels"]}

In [21]:
train_tokenized = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=split_dataset["train"].column_names,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [22]:
test_tokenized = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=split_dataset["train"].column_names,
)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [23]:

train_tokenized, test_tokenized

(Dataset({
     features: ['input_ids', 'labels'],
     num_rows: 1000
 }),
 Dataset({
     features: ['input_ids', 'labels'],
     num_rows: 1000
 }))

In [24]:
tokenizer.decode(train_tokenized["input_ids"][0])

'Acknowledge Alarm</s> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad>'

In [25]:
tokenizer.decode(595)

'members'

In [26]:
# train_tokenized[0]

## 4. Data Collation

In [27]:
import torch

In [28]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [29]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer

class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads):
        super(SelfAttention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads

        assert (self.head_dim * heads == embed_size), "Embed size needs to be divisible by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads*self.head_dim, embed_size)

    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, query_len, self.heads, self.head_dim)

        values = self.values(values)
        keys = self.keys(keys)
        queries = self.queries(queries)

        energy = torch.einsum("nqhd, nkhd->nhqk", queries, keys)

        if mask is not None:
            energy = energy.masked_fill(mask == 0, float("-1e20"))

        attention = torch.softmax(energy / (self.embed_size ** (1/2)), dim=3)

        out = torch.einsum("nhql, nlhd->nqhd", attention, values).reshape(
            N, query_len, self.heads*self.head_dim
        )

        out = self.fc_out(out)
        return out

class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)
        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion*embed_size),
            nn.ReLU(),
            nn.Linear(forward_expansion*embed_size, embed_size)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out

class Encoder(nn.Module):
    def __init__(self, src_vocab_size, embed_size, num_layers, heads, device, forward_expansion, dropout, max_length):
        super(Encoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)
        self.layers = nn.ModuleList(
            [TransformerBlock(embed_size, heads, dropout, forward_expansion) for _ in range(num_layers)]
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        out = self.dropout(self.word_embedding(x) + self.position_embedding(positions))
        for layer in self.layers:
            out = layer(out, out, out, mask)
        return out

class DecoderBlock(nn.Module):
    def __init__(self, embed_size, heads, forward_expansion, dropout, device):
        super(DecoderBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm = nn.LayerNorm(embed_size)
        self.transformer_block = TransformerBlock(embed_size, heads, dropout, forward_expansion)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, value, key, src_mask, trg_mask):
        attention = self.attention(x, x, x, trg_mask)
        query = self.dropout(self.norm(attention + x))
        out = self.transformer_block(value, key, query, src_mask)
        return out

class Decoder(nn.Module):
    def __init__(self, trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length):
        super(Decoder, self).__init__()
        self.device = device
        self.word_embedding = nn.Embedding(trg_vocab_size, embed_size)
        self.position_embedding = nn.Embedding(max_length, embed_size)
        self.layers = nn.ModuleList(
            [DecoderBlock(embed_size, heads, forward_expansion, dropout, device) for _ in range(num_layers)]
        )
        self.fc_out = nn.Linear(embed_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, enc_out, src_mask, trg_mask):
        N, seq_length = x.shape
        positions = torch.arange(0, seq_length).expand(N, seq_length).to(self.device)
        x = self.dropout(self.word_embedding(x) + self.position_embedding(positions))
        for layer in self.layers:
            x = layer(x, enc_out, enc_out, src_mask, trg_mask)
        out = self.fc_out(x)
        return out

class Transformer(nn.Module):
    def __init__(self, src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, embed_size=256, num_layers=6, forward_expansion=4, heads=8, dropout=0, device="cuda", max_length=256):
        super(Transformer, self).__init__()
        self.encoder = Encoder(src_vocab_size, embed_size, num_layers, heads, device, forward_expansion, dropout, max_length)
        self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, heads, forward_expansion, dropout, device, max_length)
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx
        self.device = device

    def make_src_mask(self, src):
        src_mask = (src != self.src_pad_idx).unsqueeze(1).unsqueeze(2)
        return src_mask.to(self.device)

    def make_trg_mask(self, trg):
        N, trg_len = trg.shape
        trg_mask = torch.tril(torch.ones((trg_len, trg_len))).expand(N, 1, trg_len, trg_len)
        return trg_mask.to(self.device)

    def forward(self, src, trg):
        src_mask = self.make_src_mask(src)
        trg_mask = self.make_trg_mask(trg)
        enc_src = self.encoder(src, src_mask)
        out = self.decoder(trg, enc_src, src_mask, trg_mask)
        return out


# Example tensors for demonstration (replace with your actual data)
src_data = torch.tensor([[1, 5, 6, 4, 3, 9, 5, 2, 0], [1, 8, 7, 3, 4, 5, 6, 7, 2]]).to(device)
trg_data = torch.tensor([[1, 7, 4, 3, 5, 9, 2, 0], [1, 5, 6, 2, 4, 7, 6, 2]]).to(device)

# Vocabulary sizes and padding indices
src_pad_idx = 0
trg_pad_idx = 0
src_vocab_size = 59514  # Set appropriately
trg_vocab_size = 59514 # Set appropriately

# Initialize model
model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, device=device, max_length=128).to(device)

try:
    # Ensure the target batch does not have the last token (truncated for decoding)
    out = model(src_data, trg_data[:, :-1])
    print(out.shape)
except Exception as e:
    print(f"Error: {e}")


torch.Size([2, 7, 59514])


In [30]:
tokenizer.vocab_size

59514

In [31]:
from transformers import DataCollatorForSeq2Seq

data_collator = DataCollatorForSeq2Seq(tokenizer)

In [32]:
data_collator

DataCollatorForSeq2Seq(tokenizer=MarianTokenizer(name_or_path='Helsinki-NLP/opus-mt-en-fr', vocab_size=59514, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	59513: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, model=None, padding=True, max_length=None, pad_to_multiple_of=None, label_pad_token_id=-100, return_tensors='pt')

In [33]:
batch = data_collator([train_tokenized[i] for i in range(1, 3)])
batch.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [34]:
batch["labels"]

tensor([[ 8399,    24,   222,    28,  1000,  6149,  5646,  1660,   337,    93,
             0, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 5

## 5. Metrics

In [35]:
!pip install sacrebleu

Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/58.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting portalocker (from sacrebleu)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting colorama (from sacrebleu)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/106.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.7/106.7 kB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)
Downloading portalocker-2.10.1-py3-none-any.whl (18 kB)
Installing collected packages: portalocker, colorama, sac

In [36]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.2


In [37]:
# We will Use BLEU metric

import evaluate

metric = evaluate.load("sacrebleu")

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

## 6. Geting DataLoader

In [38]:
from torch.utils.data import dataloader

train_dataloader = dataloader.DataLoader(train_tokenized,
                                         shuffle=True,
                                         collate_fn=data_collator,
                                         batch_size=8,
                                         drop_last=True)

test_dataloader = dataloader.DataLoader(test_tokenized,
                                        shuffle=True,
                                        collate_fn=data_collator,
                                        batch_size=8,
                                        drop_last=True)

In [39]:
len(train_dataloader), len(test_dataloader)

(125, 125)

## 7. Building our model

## 8. Training

In [40]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss(ignore_index=trg_pad_idx)


In [41]:
for batch in train_dataloader:
  print(batch["input_ids"].shape)
  print(batch["labels"].shape)
  break

torch.Size([8, 128])
torch.Size([8, 128])


In [42]:
from tqdm.auto import tqdm

epoch = 5

for epoch in tqdm(range(epoch)):
  model.train()
  train_loss = 0
  for batch in train_dataloader:
    input_ids = batch["input_ids"].to(device)
    labels = batch["labels"].to(device)

    y_pred = model(input_ids, labels[:, :-1])
    loss = loss_fn(y_pred.view(-1, y_pred.shape[-1]), labels[:, 1:].contiguous().view(-1))
    train_loss += loss.item()

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  ### Testing!!!
  model.eval()
  test_loss = 0
  with torch.inference_mode():
    for batch in test_dataloader:
      input_ids = batch["input_ids"].to(device)
      labels = batch["labels"].to(device)

      y_pred = model(input_ids, labels[:, :-1])
      loss = loss_fn(y_pred.view(-1, y_pred.shape[-1]), labels[:, 1:].contiguous().view(-1))
      test_loss += loss.item()

  print(f"Epoch: {epoch+1}, Train Loss: {train_loss/len(train_dataloader)}, Test Loss: {test_loss/len(test_dataloader)}")

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch: 1, Train Loss: 1.277537220478058, Test Loss: 1.0734237544536591
Epoch: 2, Train Loss: 0.9968394196033478, Test Loss: 1.1280162023305893
Epoch: 3, Train Loss: 0.98270579123497, Test Loss: 1.0892259256839751
Epoch: 4, Train Loss: 0.9805507326126098, Test Loss: 1.1112357342243195
Epoch: 5, Train Loss: 0.9744996843338013, Test Loss: 1.0886931793689727


In [64]:
import torch
from transformers import AutoTokenizer

# Define device
device = torch.device("cpu")

# Initialize tokenizer
model_checkpoint = "Helsinki-NLP/opus-mt-en-fr"  # This is just for the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Example input sentence
input_sentence = "Hello i am priyanshu chaudhary"

# Tokenize input
inputs = tokenizer(input_sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=128)
input_ids = inputs["input_ids"].to(device)
print(f"Input IDs: {input_ids}\n and shape is {input_ids.shape}\n")

src_vocab_size = 59514
trg_vocab_size = 59514
src_pad_idx = tokenizer.pad_token_id
trg_pad_idx = tokenizer.pad_token_id
model = Transformer(src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, device=device, max_length=128).to(device)


trg_data = torch.tensor([[tokenizer.pad_token_id]]).to(device)
print(f"Target data is:\n {trg_data} \nTarget Data Shape: {trg_data.shape}\n")
# Run inference
model.eval()
with torch.no_grad():
    output = model(input_ids, trg_data)
    print(output)


if isinstance(output, torch.Tensor):
    output_probs = torch.softmax(output, dim=-1)
    print(f'Output probs is:\n {output_probs} and shape is \n{output_probs.shape}')
    output_ids = torch.argmax(output_probs, dim=-1)
else:
    output_ids = output

print(f"Output IDs: {output_ids}")

# Decode token IDs to text
translated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

print(f"Translated Text: {translated_text}")


Input IDs: tensor([[10537,   986,  1010, 14934, 10515,     9,  5752,  8450,  4170,   400,
          1835,     0, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513, 59513,
         59513, 59513, 59513, 59513, 5951

## Thats it!!!