In [None]:
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import dataset

import numpy as np
import matplotlib.pyplot as plt

In [None]:
class MultiHeadAttention(nn.Module):
  """Multi-head scaled dot-product attention with optional causal masking.

  Every head uses the same width `d_k` for queries, keys and values. All heads
  are projected with a single linear layer of width `d_k * n_heads` and split
  apart afterwards.

  Args:
    d_k: per-head width of queries, keys and values.
    d_model: width of the inputs and of the returned tensor.
    n_heads: number of attention heads.
    max_len: side length of the pre-computed causal mask (only used when
      `causal` is True).
    causal: when True, each query position may only attend to key positions
      at or before itself.
  """

  def __init__(self, d_k, d_model, n_heads, max_len, causal=False):
    super().__init__()

    self.d_k = d_k
    self.n_heads = n_heads

    # Width of the concatenated heads. The linear layers keep their bias term.
    heads_width = d_k * n_heads
    self.key = nn.Linear(d_model, heads_width)
    self.query = nn.Linear(d_model, heads_width)
    self.value = nn.Linear(d_model, heads_width)

    # Output projection back to d_model, the width every block consumes/produces.
    self.fc = nn.Linear(heads_width, d_model)

    self.causal = causal
    if causal:
      # Lower-triangular matrix of ones (the diagonal is kept, so a position
      # can attend to itself). Stored as a buffer so it is saved with the
      # module and follows it across devices.
      lower_triangle = torch.ones(max_len, max_len).tril()
      self.register_buffer("causal_mask", lower_triangle[None, None, :, :])

  def forward(self, q, v, k, pad_mask=None):
    """Attend from `q` over `k`/`v`.

    Note the positional order of the arguments: query, value, key.

    Args:
      q: queries, (N, T_output, d_model).
      v: values, (N, T_input, d_model).
      k: keys, (N, T_input, d_model).
      pad_mask: optional (N, T_input) tensor; key positions where it equals 0
        are excluded from attention.

    Returns:
      Tensor of shape (N, T_output, d_model).
    """
    n_batch = q.shape[0]
    len_out = q.shape[1]
    len_in = k.shape[1]

    def split_heads(projected, length):
      # (N, T, h*d_k) -> (N, h, T, d_k)
      return projected.view(n_batch, length, self.n_heads, self.d_k).transpose(1, 2)

    q = split_heads(self.query(q), len_out)
    k = split_heads(self.key(k), len_in)
    v = split_heads(self.value(v), len_in)

    # (N, h, T_output, d_k) x (N, h, d_k, T_input) -> (N, h, T_output, T_input)
    scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_k)

    minus_inf = float('-inf')
    if pad_mask is not None:
      # (N, T_input) -> (N, 1, 1, T_input) so it broadcasts over heads and queries.
      padded_keys = pad_mask[:, None, None, :].eq(0)
      scores = scores.masked_fill(padded_keys, minus_inf)
    if self.causal:
      blocked = self.causal_mask[:, :, :len_out, :len_in].eq(0)
      scores = scores.masked_fill(blocked, minus_inf)

    # Positions filled with -inf receive a weight of exactly zero.
    weights = scores.softmax(dim=-1)

    # (N, h, T_output, T_input) x (N, h, T_input, d_k) -> (N, h, T_output, d_k)
    context = torch.matmul(weights, v)

    # Merge the heads back: (N, h, T, d_k) -> (N, T, h*d_k). `contiguous` is
    # required because `view` cannot operate on the transposed layout.
    merged = context.transpose(1, 2).contiguous()
    merged = merged.view(n_batch, len_out, self.n_heads * self.d_k)

    return self.fc(merged)

# Each attention head takes inputs of size d_model and produces queries/keys/values of size d_k.
# The projection layers are written with d_k * n_heads outputs because all heads are computed at once.
# pad_mask is a tensor of 0s and 1s of size N x T:
# for each of the N samples it identifies which positions are actual tokens (1) and which are pad tokens (0).
# It is 2-dimensional while the attention scores are 4-dimensional, so forward() inserts two singleton
# dimensions (N x 1 x 1 x T) and uses masked_fill to set the scores of pad positions to -inf before the softmax.
#

In [None]:
class EncoderBlock(nn.Module):
  """One encoder layer: self-attention then a feed-forward network.

  Each sub-layer is wrapped in a residual connection followed by LayerNorm,
  and dropout is applied to the block output.

  Args:
    d_k: per-head width passed to the attention layer.
    d_model: width of the block input and output.
    n_heads: number of attention heads.
    max_len: forwarded to MultiHeadAttention.
    dropout_prob: dropout probability used inside the feed-forward network
      and on the block output.
  """

  def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob=0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = MultiHeadAttention(d_k, d_model, n_heads, max_len, causal=False)

    # Position-wise feed-forward network with a 4x wider hidden layer.
    hidden_width = 4 * d_model
    self.ann = nn.Sequential(
        nn.Linear(d_model, hidden_width),
        nn.GELU(),
        nn.Linear(hidden_width, d_model),
        nn.Dropout(dropout_prob),
    )

    # Regularization applied to the output of the whole block.
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self, x, pad_mask=None):
    """Run the block on `x`; `pad_mask` is handed to the attention layer unchanged."""
    attended = self.mha(x, x, x, pad_mask)
    x = self.ln1(x + attended)
    x = self.ln2(x + self.ann(x))
    return self.dropout(x)


In [None]:
class DecoderBlock(nn.Module):
  """One decoder layer: causal self-attention, cross-attention, feed-forward.

  Each of the three sub-layers is wrapped in a residual connection followed by
  LayerNorm, and dropout is applied to the block output.

  Args:
    d_k: per-head width passed to both attention layers.
    d_model: width of the block input and output.
    n_heads: number of attention heads.
    max_len: forwarded to both MultiHeadAttention layers.
    dropout_prob: dropout probability used inside the feed-forward network
      and on the block output.
  """

  def __init__(self, d_k, d_model, n_heads, max_len, dropout_prob=0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.ln3 = nn.LayerNorm(d_model)

    # mha1: causal self-attention over the decoder sequence.
    self.mha1 = MultiHeadAttention(d_k, d_model, n_heads, max_len, causal=True)
    # mha2: non-causal attention whose keys/values come from the encoder output.
    self.mha2 = MultiHeadAttention(d_k, d_model, n_heads, max_len, causal=False)

    # Position-wise feed-forward network with a 4x wider hidden layer.
    hidden_width = 4 * d_model
    self.ann = nn.Sequential(
        nn.Linear(d_model, hidden_width),
        nn.GELU(),
        nn.Linear(hidden_width, d_model),
        nn.Dropout(dropout_prob),
    )

    # Regularization applied to the output of the whole block.
    self.dropout = nn.Dropout(p=dropout_prob)

  def forward(self, enc_output, dec_input, enc_mask=None, dec_mask=None):
    """Run the block.

    `dec_mask` is passed to the self-attention layer and `enc_mask` to the
    cross-attention layer, whose queries are the decoder states.
    """
    self_attended = self.mha1(dec_input, dec_input, dec_input, dec_mask)
    x = self.ln1(dec_input + self_attended)

    cross_attended = self.mha2(x, enc_output, enc_output, enc_mask)
    x = self.ln2(x + cross_attended)

    x = self.ln3(x + self.ann(x))
    return self.dropout(x)


In [None]:
class PositionalEncoding(nn.Module):
  """Adds fixed sinusoidal position encodings to a sequence, then applies dropout.

  Even feature indices hold sin(position * frequency) and odd indices hold
  cos(position * frequency), with frequencies 10000^(-2i / d_model).

  Args:
    d_model: feature width of the inputs (may be even or odd).
    max_len: number of positions for which encodings are pre-computed.
    dropout_prob: dropout probability applied after adding the encodings.
  """

  def __init__(self, d_model, max_len=2048, dropout_prob=0.1):
    super().__init__()
    self.dropout = nn.Dropout(p=dropout_prob)

    position = torch.arange(max_len).unsqueeze(1)  # (max_len, 1)
    exp_term = torch.arange(0, d_model, 2)
    div_term = torch.exp(exp_term * (-math.log(10000.0)/d_model))
    angles = position * div_term  # (max_len, ceil(d_model / 2))

    pe = torch.zeros(1, max_len, d_model)
    pe[0, :, 0::2] = torch.sin(angles)
    # For odd d_model there is one fewer odd column than even columns, so the
    # cosine part is trimmed to d_model // 2 frequencies. For even d_model the
    # slice is a no-op.
    pe[0, :, 1::2] = torch.cos(angles[:, :d_model // 2])

    # Buffer: saved with the module and moved across devices, but not trained.
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Add the encodings of the first T positions to `x` of shape (N, T, D)."""
    x = x + self.pe[:, :x.size(1), :]
    return self.dropout(x)



In [None]:
class Encoder(nn.Module):
  """Transformer encoder: embedding + positional encoding + stacked blocks.

  Returns one d_model-sized vector per input token (no classification head).

  Args:
    vocab_size: number of rows in the token embedding table.
    max_len: maximum sequence length, used for the positional encodings and
      forwarded to every EncoderBlock.
    d_k: per-head attention width.
    d_model: embedding / hidden width.
    n_heads: number of attention heads per block.
    n_layers: number of EncoderBlocks.
    dropout_prob: dropout probability used by the positional encoding and
      by every block.
  """

  def __init__(self,
               vocab_size,
               max_len,
               d_k,
               d_model,
               n_heads,
               n_layers,
               # n_classes,
               dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
    # max_len must be passed explicitly: EncoderBlock's fourth positional
    # parameter is max_len, so omitting it would put dropout_prob in that slot
    # and leave the block's dropout at its default value.
    transformer_blocks = [
        EncoderBlock(
            d_k,
            d_model,
            n_heads,
            max_len,
            dropout_prob) for _ in range(n_layers)]

    # Stored in a Sequential container for registration only; forward() calls
    # each block itself because blocks take the padding mask as a second argument.
    self.transformer_blocks = nn.Sequential(*transformer_blocks)
    self.ln = nn.LayerNorm(d_model)
    # self.fc = nn.Linear(d_model, n_classes)

  def forward(self, x, pad_mask=None):
    """Encode token ids.

    Args:
      x: integer token ids, (N, T).
      pad_mask: optional (N, T) mask handed to every block; 0 marks padding.

    Returns:
      Tensor of shape (N, T, d_model).
    """
    x = self.embedding(x)
    x = self.pos_encoding(x)
    for block in self.transformer_blocks:
      x = block(x, pad_mask)

    # All positions are kept (many-to-many); no pooling or classifier here.
    x = self.ln(x)

    return x


In [None]:
class Decoder(nn.Module):
  """Transformer decoder producing per-position vocabulary logits.

  No separate n_classes argument is needed: the output layer has one unit per
  vocabulary entry.

  Args:
    vocab_size: size of the embedding table and of the output layer.
    max_len: maximum sequence length, used for the positional encodings and
      forwarded to every DecoderBlock.
    d_k: per-head attention width.
    d_model: embedding / hidden width.
    n_heads: number of attention heads per block.
    n_layers: number of DecoderBlocks.
    dropout_prob: dropout probability used by the positional encoding and
      by every block.
  """

  def __init__(self,
               vocab_size,
               max_len,
               d_k,
               d_model,
               n_heads,
               n_layers,
               dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)

    layers = []
    for _ in range(n_layers):
      layers.append(DecoderBlock(d_k, d_model, n_heads, max_len, dropout_prob))

    # Sequential is used as a container only; forward() calls each block
    # itself because blocks take several arguments.
    self.transformer_blocks = nn.Sequential(*layers)
    self.ln = nn.LayerNorm(d_model)
    self.fc = nn.Linear(d_model, vocab_size)

  def forward(self, enc_output, dec_input, enc_mask=None, dec_mask=None):
    """Decode `dec_input` token ids while attending to `enc_output`.

    Returns:
      Logits with one vocab_size-sized vector per decoder position
      (many-to-many).
    """
    hidden = self.pos_encoding(self.embedding(dec_input))
    for layer in self.transformer_blocks:
      hidden = layer(enc_output, hidden, enc_mask, dec_mask)

    return self.fc(self.ln(hidden))


In [None]:
class Transformer(nn.Module):
  """Encoder-decoder wrapper: encodes the source, then decodes against it."""

  def __init__(self, encoder, decoder):
    super().__init__()
    # Registered as sub-modules, so their parameters belong to this module too.
    self.encoder = encoder
    self.decoder = decoder

  def forward(self, enc_input, dec_input, enc_mask, dec_mask):
    """Return the decoder output for `dec_input` given the encoded `enc_input`.

    `enc_mask` is used twice: by the encoder, and by the decoder when it
    attends over the encoder output.
    """
    memory = self.encoder(enc_input, enc_mask)
    return self.decoder(memory, dec_input, enc_mask, dec_mask)

In [None]:
#test it
# Smoke test: build a small, untrained encoder/decoder pair with different
# source (20k) and target (10k) vocabulary sizes and wrap them in a Transformer.
# d_k * n_heads equals d_model here (16 * 4 = 64).
encoder = Encoder(
    vocab_size = 20_000,
    max_len = 512,
    d_k = 16,
    d_model = 64,
    n_heads = 4,
    n_layers = 2,
    #n_classes = 2,
    dropout_prob = 0.1,
)

decoder = Decoder(
    vocab_size = 10_000,
    max_len = 512,
    d_k = 16,
    d_model = 64,
    n_heads = 4,
    n_layers = 2,
    dropout_prob = 0.1,
)

transformer = Transformer(encoder, decoder)

In [None]:
# Use the first GPU when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# `transformer` holds these same two module objects, so moving them also moves
# the wrapped model. The last call's return value is what the cell displays.
encoder.to(device)
decoder.to(device)

cuda:0


Decoder(
  (embedding): Embedding(10000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): DecoderBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln3): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha1): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (mha2): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
 

In [None]:
# Random source token ids: batch of 8 sequences of length 512 (encoder vocab is 20k).
xe = np.random.randint(0, 20_000, size=(8, 512))
xe_t = torch.tensor(xe).to(device)

# Random target token ids: batch of 8 sequences of length 256 (decoder vocab is 10k).
xd = np.random.randint(0, 10_000, size=(8, 256))
xd_t = torch.tensor(xd).to(device)

# Source padding mask: first 256 positions are real tokens (1), the rest padding (0).
maske = np.ones((8, 512))
maske[:, 256:] = 0
maske_t = torch.tensor(maske).to(device)

# Target padding mask: first 128 positions are real tokens (1), the rest padding (0).
maskd = np.ones((8, 256))
maskd[:, 128:] = 0
maskd_t = torch.tensor(maskd).to(device)

# Expect one logit vector over the decoder vocabulary per target position.
out = transformer(xe_t, xd_t, maske_t, maskd_t)
out.shape

torch.Size([8, 256, 10000])

In [None]:
out

tensor([[[ 1.7294e-01,  1.0266e+00, -9.5789e-01,  ..., -5.4065e-02,
          -1.1790e+00, -3.7562e-01],
         [ 1.4389e-01,  1.5544e-01,  3.3485e-01,  ..., -1.1006e-01,
          -1.1071e+00,  3.0485e-01],
         [-3.7213e-01, -2.4268e-01, -8.5624e-01,  ..., -2.4043e-01,
          -7.7796e-02,  3.3449e-01],
         ...,
         [-7.6101e-01, -9.6441e-01,  8.3264e-01,  ...,  3.8358e-01,
           6.1155e-01, -9.0906e-01],
         [-3.2154e-01,  5.1323e-01, -8.1105e-01,  ...,  8.6930e-01,
           1.4224e-01, -1.3936e-01],
         [ 2.8389e-01,  6.1276e-01, -9.6883e-01,  ..., -5.0727e-01,
           6.7051e-01, -6.5662e-02]],

        [[-1.2852e-01,  5.1480e-01, -5.0375e-01,  ..., -8.3874e-01,
          -1.1787e+00,  1.2252e+00],
         [ 6.7235e-01,  1.6840e-01, -6.0791e-01,  ..., -2.7121e-02,
          -8.4289e-01,  2.4928e-01],
         [ 4.9989e-01,  2.3139e-01,  4.9666e-03,  ..., -9.8509e-03,
          -5.0474e-01,  5.5504e-01],
         ...,
         [-5.0004e-01, -1

In [None]:
!wget -nc https://lazyprogrammer.me/course_files/nlp3/spa.txt

--2024-02-02 17:16:22--  https://lazyprogrammer.me/course_files/nlp3/spa.txt
Resolving lazyprogrammer.me (lazyprogrammer.me)... 172.67.213.166, 104.21.23.210, 2606:4700:3031::6815:17d2, ...
Connecting to lazyprogrammer.me (lazyprogrammer.me)|172.67.213.166|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/plain]
Saving to: ‘spa.txt’

spa.txt                 [   <=>              ]   7.45M  13.1MB/s    in 0.6s    

2024-02-02 17:16:23 (13.1 MB/s) - ‘spa.txt’ saved [7817148]



In [None]:
!head spa.txt

Go.	Ve.
Go.	Vete.
Go.	Vaya.
Hi.	Hola.
Run!	¡Corre!
Who?	¿Quién?
Wow!	¡Órale!
Fire!	¡Fuego!
Fire!	¡Incendio!
Fire!	¡Disparad!


In [None]:
import pandas as pd
# The file is tab-separated with no header row: column 0 is the English
# sentence and column 1 the Spanish one (see the `!head spa.txt` output).
df = pd.read_csv('spa.txt', sep="\t", header=None)
df.head()

Unnamed: 0,0,1
0,Go.,Ve.
1,Go.,Vete.
2,Go.,Vaya.
3,Hi.,Hola.
4,Run!,¡Corre!


In [None]:
df.shape

(115245, 2)

In [None]:
# Keep only the first 30,000 sentence pairs. Note this rebinds `df`, so the
# full frame is no longer available afterwards.
df = df.iloc[:30_000]

In [None]:
# Name the two columns and write a headered CSV for `datasets.load_dataset`;
# index=None leaves the row index out of the file.
df.columns = ['en', 'es']
df.to_csv('spa.csv', index=None)

In [None]:
!head spa.csv

en,es
Go.,Ve.
Go.,Vete.
Go.,Vaya.
Hi.,Hola.
Run!,¡Corre!
Who?,¿Quién?
Wow!,¡Órale!
Fire!,¡Fuego!
Fire!,¡Incendio!


In [None]:
!pip install transformers datasets sentencepiece sacremoses

Collecting datasets
  Downloading datasets-2.16.1-py3-none-any.whl (507 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m18.4 MB/s[0m eta [36m0:00:00[0m
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Dow

In [None]:
!pip install transformers datasets



In [None]:
from datasets import load_dataset
# Load the CSV written earlier; the result is a DatasetDict with a single
# 'train' split (see the displayed value in the next cell).
raw_datasets = load_dataset("csv", data_files='spa.csv')

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['en', 'es'],
        num_rows: 30000
    })
})

In [None]:
# Hold out 30% of the pairs as a test split; the fixed seed keeps the split reproducible.
split = raw_datasets['train'].train_test_split(test_size=0.3, seed=42)
split

DatasetDict({
    train: Dataset({
        features: ['en', 'es'],
        num_rows: 21000
    })
    test: Dataset({
        features: ['en', 'es'],
        num_rows: 9000
    })
})

In [None]:
from transformers import AutoTokenizer

# Only the tokenizer of this pretrained English->Spanish checkpoint is used;
# the translation model itself is the Transformer defined in this notebook.
model_checkpoint = 'Helsinki-NLP/opus-mt-en-es'
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

In [None]:
# Tokenize one training pair to inspect the tokenizer's behaviour.
en_sentence = split['train'][0]['en']
es_sentence = split['train'][0]['es']

# Source text is passed positionally; target text goes through `text_target`.
inputs = tokenizer(en_sentence)
targets = tokenizer(text_target = es_sentence)

# Show the target sub-word tokens; the output ends with the '</s>' token.
tokenizer.convert_ids_to_tokens(targets['input_ids'])

['▁Yo', '▁puedo', '▁arreglarlo', '.', '</s>']

In [None]:
es_sentence

'Yo puedo arreglarlo.'

In [None]:
max_input_length = 128
max_target_length = 128

def preprocess_function(batch):
  """Tokenize a batch of sentence pairs for sequence-to-sequence training.

  English sentences (`batch['en']`) become the model inputs and Spanish
  sentences (`batch['es']`) become the labels. Both sides are truncated to
  their respective maximum lengths.

  Returns:
    The tokenizer output for the English side with an extra 'labels' entry
    holding the token ids of the Spanish side.
  """
  encoded = tokenizer(
      batch['en'],
      truncation=True,
      max_length=max_input_length,
  )

  target_ids = tokenizer(
      text_target=batch['es'],
      truncation=True,
      max_length=max_target_length,
  )['input_ids']

  encoded['labels'] = target_ids
  return encoded

In [None]:
# Tokenize both splits in batches and drop the raw text columns, leaving only
# input_ids, attention_mask and labels.
tokenized_datasets = split.map(preprocess_function, batched=True, remove_columns=split['train'].column_names)

Map:   0%|          | 0/21000 [00:00<?, ? examples/s]

Map:   0%|          | 0/9000 [00:00<?, ? examples/s]

In [None]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 21000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 9000
    })
})

In [None]:
from transformers import DataCollatorForSeq2Seq

In [None]:
# Pads each batch to a common length. As the demo output below shows, inputs
# are padded with the tokenizer's pad id (65000) and labels with -100.
data_collator = DataCollatorForSeq2Seq(tokenizer)

In [None]:
# Collate the first five training examples into one padded batch.
batch = data_collator([tokenized_datasets['train'][i] for i in range(0,5)])
# Call keys() so the cell shows the key names rather than the bound-method repr.
batch.keys()

<bound method BatchEncoding.keys of {'input_ids': tensor([[   33,    88,  9222,    48,     3,     0, 65000, 65000],
        [  552, 11490,     9,   310,   255,     3,     0, 65000],
        [  143,    31,   125,  1208,     3,     0, 65000, 65000],
        [ 1093,   220,  1890,    23,    48,     3,     0, 65000],
        [  124,    20,   100, 18422,    48,   141,     3,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[  711,  1039, 44159,     3,     0,  -100,  -100,  -100],
        [ 2722, 18663,   239,   212,     3,     0,  -100,  -100],
        [  539,    43,   155,   960,     3,     0,  -100,  -100],
        [15165,  1250,   380,  3564,    36,  1016,     3,     0],
        [  350,     8, 19153,    29, 31326,     3,     0,  -100]])}>

In [None]:
batch['input_ids']

tensor([[   33,    88,  9222,    48,     3,     0, 65000, 65000],
        [  552, 11490,     9,   310,   255,     3,     0, 65000],
        [  143,    31,   125,  1208,     3,     0, 65000, 65000],
        [ 1093,   220,  1890,    23,    48,     3,     0, 65000],
        [  124,    20,   100, 18422,    48,   141,     3,     0]])

In [None]:
batch['attention_mask']

tensor([[1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]])

In [None]:
batch['labels']

tensor([[  711,  1039, 44159,     3,     0,  -100,  -100,  -100],
        [ 2722, 18663,   239,   212,     3,     0,  -100,  -100],
        [  539,    43,   155,   960,     3,     0,  -100,  -100],
        [15165,  1250,   380,  3564,    36,  1016,     3,     0],
        [  350,     8, 19153,    29, 31326,     3,     0,  -100]])

In [None]:
tokenizer.all_special_ids

[0, 1, 65000]

In [None]:
tokenizer.all_special_tokens

['</s>', '<unk>', '<pad>']

In [None]:
tokenizer('<pad>')

{'input_ids': [65000, 0], 'attention_mask': [1, 1]}

In [None]:
tokenizer('</s>')
# tokenizer always adds end of sentence tokens

{'input_ids': [0, 0], 'attention_mask': [1, 1]}

In [None]:
from torch.utils.data import DataLoader

# Training batches are reshuffled every epoch; the collator pads each batch
# to the length of its longest example.
train_loader = DataLoader(
    tokenized_datasets['train'],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator
)

# Validation batches keep the dataset order (no shuffling).
valid_loader = DataLoader(
    tokenized_datasets['test'],
    batch_size=32,
    collate_fn=data_collator
)

In [None]:
# Print the tensor shapes of the first training batch only. Inputs and labels
# are padded independently, so their sequence lengths can differ.
for batch in train_loader:
  for k,v in batch.items():
    print("k:", k, "v.shape", v.shape)
  break

k: input_ids v.shape torch.Size([32, 9])
k: attention_mask v.shape torch.Size([32, 9])
k: labels v.shape torch.Size([32, 10])


In [None]:
tokenizer.vocab_size

65001

In [None]:
tokenizer.decode([60000])

'ѕэр'

In [None]:
# Register '<s>' as an extra special token, used later as the decoder start
# token. The call returns how many tokens were newly added. The new token
# gets id 65001, as shown later when '<s>' is tokenized.
tokenizer.add_special_tokens({'cls_token': '<s>'})

1

In [None]:
tokenizer('</s>')

{'input_ids': [0, 0], 'attention_mask': [1, 1]}

In [None]:
tokenizer.vocab_size

65001

In [None]:
#test it
# Build the real model. `tokenizer.vocab_size` still reports 65001 after
# adding '<s>', so +1 makes room for the added token's id (65001).
# NOTE(review): len(tokenizer) may be a more robust way to size the tables - confirm.
encoder = Encoder(
    vocab_size = tokenizer.vocab_size+1,
    max_len = 512,
    d_k = 16,
    d_model = 64,
    n_heads = 4,
    n_layers = 2,
    #n_classes = 2,
    dropout_prob = 0.1,
)

decoder = Decoder(
    vocab_size = tokenizer.vocab_size+1,
    max_len = 512,
    d_k = 16,
    d_model = 64,
    n_heads = 4,
    n_layers = 2,
    dropout_prob = 0.1,
)

transformer = Transformer(encoder, decoder)

In [None]:
# Use the first GPU when available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# `transformer` wraps these same two module objects, so moving them also
# moves the model that is trained below.
encoder.to(device)
decoder.to(device)

cuda:0


Decoder(
  (embedding): Embedding(65002, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): DecoderBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln3): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha1): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
      )
      (mha2): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=True)
        (query): Linear(in_features=64, out_features=64, bias=True)
        (value): Linear(in_features=64, out_features=64, bias=True)
        (fc): Linear(in_features=64, out_features=64, bias=True)
 

In [None]:
# -100 is the value the data collator uses to pad labels, so padded target
# positions are excluded from the loss.
criterion = nn.CrossEntropyLoss(ignore_index=-100)
# Adam with its default hyper-parameters over all encoder and decoder parameters.
optimizer = torch.optim.Adam(transformer.parameters())

In [None]:
from datetime import datetime


# A function to encapsulate the training loop
def _make_decoder_batch(targets, pad_token_id, start_token_id=65_001):
  """Build teacher-forcing decoder inputs and their padding mask from labels.

  The labels are shifted one step to the right, the first position becomes the
  start token, and label padding (-100) is replaced by the pad token id.

  Args:
    targets: (N, T) label ids, padded with -100.
    pad_token_id: id used for padding in the decoder input.
    start_token_id: id placed at position 0. The default 65_001 is the id of
      the '<s>' token added to the tokenizer in this notebook.

  Returns:
    (dec_input, dec_mask): both (N, T); dec_mask is 0 at pad positions, 1 elsewhere.
  """
  # torch.roll returns a new tensor, so `targets` itself is left untouched.
  dec_input = torch.roll(targets, shifts=1, dims=1)
  dec_input[:, 0] = start_token_id
  dec_input = dec_input.masked_fill(dec_input == -100, pad_token_id)

  dec_mask = torch.ones_like(dec_input)
  dec_mask = dec_mask.masked_fill(dec_input == pad_token_id, 0)
  return dec_input, dec_mask


def _batch_loss(model, criterion, batch):
  """Move one collated batch to `device`, run the model and return the loss."""
  batch = {k: v.to(device) for k, v in batch.items()}

  enc_input = batch['input_ids']
  enc_mask = batch['attention_mask']
  targets = batch['labels']

  dec_input, dec_mask = _make_decoder_batch(targets, tokenizer.pad_token_id)

  outputs = model(enc_input, dec_input, enc_mask, dec_mask)

  # outputs are N x T x V, but CrossEntropyLoss expects N x V x T.
  return criterion(outputs.transpose(2, 1), targets)


def train(model, criterion, optimizer, train_loader, valid_loader, epochs):
  """Train `model` and evaluate it on `valid_loader` after every epoch.

  Relies on the notebook-level `device` and `tokenizer`.

  Args:
    model: module called as model(enc_input, dec_input, enc_mask, dec_mask).
    criterion: loss taking (N x V x T logits, N x T targets).
    optimizer: optimizer over the model's parameters.
    train_loader: iterable of collated training batches.
    valid_loader: iterable of collated validation batches.
    epochs: number of passes over `train_loader`.

  Returns:
    (train_losses, test_losses): arrays of length `epochs` holding the mean
    per-batch loss of each epoch.
  """
  train_losses = np.zeros(epochs)
  test_losses = np.zeros(epochs)

  for it in range(epochs):
    t0 = datetime.now()

    # ---- training pass ----
    model.train()
    train_loss = []
    for batch in train_loader:
      optimizer.zero_grad()
      loss = _batch_loss(model, criterion, batch)
      loss.backward()
      optimizer.step()
      train_loss.append(loss.item())

    train_loss = np.mean(train_loss)

    # ---- evaluation pass ----
    # Evaluation must not update the weights: no backward pass, no optimizer
    # step, and gradient tracking is disabled.
    model.eval()
    test_loss = []
    with torch.no_grad():
      for batch in valid_loader:
        loss = _batch_loss(model, criterion, batch)
        test_loss.append(loss.item())

    test_loss = np.mean(test_loss)

    # save losses
    train_losses[it] = train_loss
    test_losses[it] = test_loss

    dt = datetime.now() - t0
    print(f'Epoch {it + 1}/{epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}, Duration : {dt}')

  return train_losses, test_losses

In [None]:
# Train for 5 epochs; each returned array holds one mean loss per epoch.
train_losses, test_losses = train(
    transformer, criterion, optimizer, train_loader, valid_loader, epochs=5
)

Epoch 1/5, Train Loss: 4.8714, Test Loss: 6.9790, Duration : 0:00:23.238464
Epoch 2/5, Train Loss: 6.3284, Test Loss: 7.2790, Duration : 0:00:24.745706
Epoch 3/5, Train Loss: 7.1512, Test Loss: 6.5936, Duration : 0:00:19.366279
Epoch 4/5, Train Loss: 6.4871, Test Loss: 6.3752, Duration : 0:00:19.005736
Epoch 5/5, Train Loss: 6.3559, Test Loss: 6.3651, Duration : 0:00:19.215458


In [None]:
# Pick one held-out English sentence to translate by hand.
input_sentence = split['test'][10]['en']
input_sentence

'Can I take a day off?'

In [None]:
# return_tensors='pt' yields PyTorch tensors with a leading batch dimension of 1.
enc_input = tokenizer(input_sentence, return_tensors='pt')
enc_input

{'input_ids': tensor([[1283,   33,  273,    8,  502,  843,   21,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Start decoding from the '<s>' token added to the tokenizer earlier.
dec_input_str = '<s>'

# The tokenizer appends '</s>' (id 0) after '<s>'; later cells slice it off with [:, :-1].
dec_input = tokenizer(text_target = dec_input_str, return_tensors='pt')
dec_input

{'input_ids': tensor([[65001,     0]]), 'attention_mask': tensor([[1, 1]])}

In [None]:
# The return values are discarded, so this relies on .to() moving the
# encodings' tensors in place.
enc_input.to(device)
dec_input.to(device)
# One decoding step through the full model. [:, :-1] drops the trailing
# '</s>' so the decoder sees only the start token.
output = transformer(
    enc_input['input_ids'],
    dec_input['input_ids'][:, :-1],
    enc_input['attention_mask'],
    dec_input['attention_mask'][:, :-1]
)

output

tensor([[[  8.7641, -13.4706,   3.4891,  ..., -12.9830, -14.3199, -13.3733]]],
       device='cuda:0', grad_fn=<ViewBackward0>)

In [None]:
output.shape # NxTxV

torch.Size([1, 1, 65002])

In [None]:
# Run the encoder on its own so its output can be reused for every decoding step.
enc_output = encoder(enc_input['input_ids'], enc_input['attention_mask'])
enc_output.shape

torch.Size([1, 8, 64])

In [None]:
# Call the decoder directly with the pre-computed encoder output; the next
# cell checks that this matches the full-model result.
dec_output = decoder(
    enc_output,
    dec_input['input_ids'][:, :-1],
    enc_input['attention_mask'],
    dec_input['attention_mask'][:, :-1])

dec_output.shape

torch.Size([1, 1, 65002])

In [None]:
torch.allclose(output, dec_output)

True

In [None]:
# Greedy decoding: start from '<s>' only (trailing '</s>' sliced off).
dec_input_ids = dec_input['input_ids'][:,:-1]
dec_attn_mask = dec_input['attention_mask'][:, :-1]

# Generate at most 32 tokens.
for _ in range(32):
  dec_output = decoder(
      enc_output,
      dec_input_ids,
      enc_input['attention_mask'],
      dec_attn_mask,
  )

  # Most likely next token, taken from the logits at the last position.
  prediction_id = torch.argmax(dec_output[:, -1, :], axis=-1)

  # Append the prediction to the decoder input for the next step.
  dec_input_ids = torch.hstack((dec_input_ids, prediction_id.view(1,1)))

  # Nothing is padding during generation, so the mask is all ones.
  dec_attn_mask = torch.ones_like(dec_input_ids)

  # Id 0 is the '</s>' end-of-sentence token.
  if prediction_id ==0:
    break

In [None]:
tokenizer.decode(dec_input_ids[0])

'<s> </s>'

In [None]:
split['test'][10]['es']

'¿Puedo tomarme un día libre?'

In [None]:
def translate(input_sentence):
  """Greedily translate one sentence and print the result.

  Relies on the notebook-level `tokenizer`, `encoder`, `decoder` and `device`.
  Decoding starts from the '<s>' token and stops when the end-of-sentence
  token is produced or after 32 generated tokens.

  Args:
    input_sentence: source-language text to translate.
  """
  start_token_id = 65_001  # id of the '<s>' token added to the tokenizer
  max_new_tokens = 32

  enc_input = tokenizer(input_sentence, return_tensors='pt').to(device)

  # Inference must run with dropout disabled and without autograd bookkeeping.
  # The previous train/eval modes are restored afterwards so this function
  # has no lasting effect on the modules.
  encoder_was_training = encoder.training
  decoder_was_training = decoder.training
  encoder.eval()
  decoder.eval()
  try:
    with torch.no_grad():
      # The encoder output does not change between steps, so compute it once.
      enc_output = encoder(enc_input['input_ids'], enc_input['attention_mask'])

      dec_input_ids = torch.tensor([[start_token_id]], device=device)

      for _ in range(max_new_tokens):
        # Nothing is padding during generation, so the mask is all ones.
        dec_attn_mask = torch.ones_like(dec_input_ids)

        dec_output = decoder(
            enc_output,
            dec_input_ids,
            enc_input['attention_mask'],
            dec_attn_mask,
        )

        # Most likely next token, taken from the logits at the last position.
        prediction_id = torch.argmax(dec_output[:, -1, :], dim=-1)

        dec_input_ids = torch.hstack((dec_input_ids, prediction_id.view(1, 1)))

        if prediction_id.item() == tokenizer.eos_token_id:
          break
  finally:
    encoder.train(encoder_was_training)
    decoder.train(decoder_was_training)

  # Skip the leading start token when decoding back to text.
  translation = tokenizer.decode(dec_input_ids[0, 1:])
  print(translation)


In [None]:
translate('I am good, how are you?')

</s>
