<a href="https://colab.research.google.com/github/somewhereovertherainbo/TRANSFORMERS/blob/main/TRANSFORMERS_SCRATCH.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [85]:
import torch
from torch import nn

In [86]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# PLAN:
1. Have a class self.SelfAttention
2. Use it to make self.MultiHeadAttention
3. Use it to make transformer block.
4. Use it to make a Decoder.


# Top - down approach

In [87]:
class SelfAttention(nn.Module):

  def __init__(self, d_model, h):
    super().__init__()

    self.d_model = d_model
    self.h = h
    self.d_key = int(d_model/h)
    self.key_generator = nn.Linear(in_features = self.d_model, out_features = self.d_key, bias = False, device = device)
    # self.register_buffer('tril', torch.tril(torch.ones(self.d_model, self.d_model)))



  def forward(self,x):

    B, T = x.shape[0], x.shape[1]

    key = self.key_generator(x) # (B, T, d_model) @ (d_model, d_key) --> (B, T, d_key)
    query = self.key_generator(x) # (B, T, d_model) @ (d_model, d_key) --> (B, T, d_key)
    value = self.key_generator(x) # (B, T, d_model) @ (d_model, d_key) --> (B, T, d_key)

    att_mat = query @ key.transpose(-1,-2)
    # (B, T, d_key) @ (B, d_key, T) --> (B, T, T) --> (T,T) set of attention weights for each batch
    with torch.no_grad():
      tril = torch.tril(torch.ones((T, T), device = device))
    att_mat = att_mat.masked_fill(tril[:T, :T]==0, float('-inf'))/torch.sqrt(torch.tensor(self.d_key))
    att_mat = att_mat.softmax(dim = -1)

    sa_out = att_mat @ value # (B, T, T) @ (B, T, d_key) --> (B, T, d_key)

    return sa_out

In [88]:
class MultiHeadAttention(nn.Module):

  def __init__(self, d_model, h):
    super().__init__()
    self.h = h
    self.d_model = d_model
    self.SelfAttention = SelfAttention(self.d_model, self.h)
    self.W_O = nn.Linear(in_features = self.d_model, out_features = self.d_model, bias = False, device = device)


  def forward(self, x):
    multiple_attentions = [self.SelfAttention(x) for _ in range(0,self.h)]
    return self.W_O(torch.cat([a for a in multiple_attentions], dim = -1)) # (B, T, d_model) @ ( d_model, d_model) --> (B, T, d_model)



In [89]:
class Decoder(nn.Module):

  def __init__(self, d_model, h):
    super().__init__()
    self.d_model = d_model
    self.h = h
    self.LayerNorm = nn.LayerNorm(normalized_shape = self.d_model)
    self.ffwd = nn.Sequential(
        nn.Linear(in_features = self.d_model, out_features = self.d_model, device = device, bias = False),
        nn.ReLU(),
        nn.Linear(in_features = self.d_model, out_features = self.d_model, device = device, bias = False),
        nn.ReLU()
    )
    self.MultiHeadAttention = MultiHeadAttention(self.d_model, self.h)

  def forward(self, x):

    x = self.LayerNorm(x + self.MultiHeadAttention(x))
    x = self.LayerNorm(x + self.ffwd(x))

    return x

In [90]:
model = Decoder(d_model = 512, h = 8)
model.to(device)

Decoder(
  (LayerNorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
  (ffwd): Sequential(
    (0): Linear(in_features=512, out_features=512, bias=False)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=False)
    (3): ReLU()
  )
  (MultiHeadAttention): MultiHeadAttention(
    (SelfAttention): SelfAttention(
      (key_generator): Linear(in_features=512, out_features=64, bias=False)
    )
    (W_O): Linear(in_features=512, out_features=512, bias=False)
  )
)

In [7]:
B = 32
T = 10
d_model = 512
x_in = torch.randn((B, T, d_model), device = device)
print(x_in, '\n')
model(x_in)

tensor([[[ 0.3978, -0.1715, -0.7632,  ..., -0.2781, -0.3335, -0.2877],
         [ 0.9610, -0.2238,  0.6684,  ...,  0.4746,  1.1762, -0.9584],
         [ 0.1333, -0.3168,  1.2518,  ..., -1.4129,  1.1859, -0.4054],
         ...,
         [-0.5218, -1.7336,  0.0662,  ..., -0.0301,  0.1997,  0.3252],
         [ 0.2124, -2.3994, -1.1342,  ..., -1.7019,  0.4679,  0.8065],
         [-0.7692,  0.0809, -0.0102,  ..., -1.6648,  0.4375, -0.7785]],

        [[-0.9461, -1.0831, -0.7414,  ..., -0.5850,  1.1312,  0.1279],
         [ 0.8455, -0.2454,  0.2489,  ..., -0.4623,  2.0664,  1.2222],
         [-1.8552, -1.2974,  0.5128,  ..., -0.0138,  0.0789, -0.8528],
         ...,
         [ 0.3453, -0.5063,  0.4596,  ...,  0.9015,  0.4180, -0.7012],
         [ 1.4490, -0.4330, -0.3136,  ...,  1.9739, -0.2834,  0.4641],
         [-1.0407,  0.3429, -0.6570,  ...,  0.0928,  1.3705,  0.6940]],

        [[ 0.0085,  0.7256,  1.5053,  ..., -0.2927, -0.0703, -1.9302],
         [-0.2683,  0.5019,  0.0421,  ..., -0

tensor([[[ 0.8993, -0.2458, -1.2354,  ..., -0.5251, -0.7246, -0.3110],
         [ 0.6529, -0.1599,  0.9861,  ...,  0.2737,  0.3686, -0.7026],
         [ 0.1748, -0.5909,  0.8729,  ..., -0.9560,  1.5817,  0.1667],
         ...,
         [-1.0215, -1.6702, -0.4560,  ..., -0.3510,  0.2844,  0.5134],
         [ 0.1502, -2.3886, -1.2583,  ..., -1.6345,  0.6141,  0.6543],
         [-0.7052,  0.1779, -0.2589,  ..., -1.5519,  0.2758, -0.9221]],

        [[-0.5355, -0.5395, -1.2438,  ..., -1.2856,  0.7219, -0.2545],
         [ 0.7495, -0.4230, -0.2731,  ..., -0.5044,  1.6286,  1.1072],
         [-1.5544, -1.2682,  0.2969,  ..., -0.2065, -0.2462, -1.0216],
         ...,
         [-0.2288, -0.9991,  0.2297,  ...,  0.0175, -0.2838, -0.4741],
         [ 1.5123, -0.3719, -0.5805,  ...,  1.9474, -0.1998,  0.1808],
         [-1.0304,  0.4464, -0.8933,  ...,  0.2162,  1.2167,  0.2341]],

        [[-0.2458,  0.7389,  1.1954,  ..., -0.5741, -0.4773, -2.1133],
         [-0.3448,  0.6783, -0.0585,  ..., -0

# Much Effiecent implementation without the use of for loops

In [8]:
import math

import torch
from torch import nn
from torch.utils.data import dataset

import numpy as np
import matplotlib as plt

In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [10]:
class MultiHeadAttention(nn.Module):

  """
  Implements MultiHeadAttention Mechanism.

  Args:
    d_key: Dimension of keys to be produced
    d_model: Dimension of Model/ input token dimension
    h: Number of Attention heads
  """

  def __init__(self, d_key, d_model, h, mask = False):
    super().__init__()

    # Assumption: d_value = d_key
    self.d_key = d_key
    self.h = h

    # Compute all of the 'h' keys and queries and values at once instead of using a for loop for each head.
    self.key = nn.Linear(d_model, d_key * h, bias = False, device = device)
    self.query = nn.Linear(d_model, d_key * h, bias = False, device = device)
    self.value = nn.Linear(d_model, d_key * h, bias = False, device = device)

    # Final Linear Layer to be applied after attention blocks
    self.W_O = nn.Linear(d_key * h, d_model)

  def forward(self, q, k, v, mask = None):

    B, T = q.shape[0], q.shape[1]

    q = self.query(q)  # (B, T, d_key*h)
    k = self.key(k)  # (B, T, d_key*h)
    v = self.value(v)  # (B, T, d_key*h)

    # Change shape from (B, T, d_key*h) to (B, h, T, d_key)
    q = q.view(B, T, self.h, self.d_key).transpose(1,2)
    k = k.view(B, T, self.h, self.d_key).transpose(1,2)
    v = v.view(B, T, self.h, self.d_key).transpose(1,2)

    # Compute Attention Weights (B, h, T, d_key) @ (B, h, d_key, T) --> (B, h, T, T). (T, T) weights for each head.
    attention_mat = q @ k.transpose(-2,-1)/ math.sqrt(self.d_key)

    if mask is not None:
      attention_mat = attention_mat.masked_fill(mask[:, None, None, :] == 0, float('-inf'))

    attention_weights = attention_mat.softmax(dim = -1)

    # Attention weighted values for each input token (B, h, T, T) @ (B, h, T, d_key) --> (B, h, T, d_key)
    A = attention_weights @ v

    # Reshape it for the final output
    A = A.transpose(1,2)
    A = A.contiguous().view(B, T, self.d_key * self.h)

    return self.W_O(A)

In [11]:
class TransformerBlock(nn.Module):

  def __init__(self, d_key, d_model, h, dropout_prob = 0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = MultiHeadAttention(d_key, d_model, h)
    self.ann = nn.Sequential(
        nn.Linear(d_model, d_model * 4),
        nn.GELU(),
        nn.Linear(4*d_model, d_model),
        nn.Dropout(dropout_prob)
    )

    self.dropout = nn.Dropout(dropout_prob)

  def forward(self, x, mask = None):
    x = self.ln1(x + self.mha(x, x, x, mask))
    x = self.ln2(x + self.ann(x))
    x = self.dropout(x)

    return x

In [12]:
class PositionalEncoding(nn.Module):

  def __init__(self, d_model, max_len = 2048, dropout_prob = 0.1):
    super().__init__()

    self.dropout = nn.Dropout(dropout_prob)

    position = torch.arange(max_len).unsqueeze(1)
    exp_term = torch.arange(0, d_model, 2)
    div_term = torch.exp(exp_term * (-math.log(10000.0)/ d_model))
    pe = torch.zeros(1, max_len, d_model)
    pe[0, :, 0::2] = torch.sin(position * div_term)
    pe[0, :, 1::2] = torch.cos(position * div_term)
    self.register_buffer('pe', pe)

  def forward(self, x):
    x = x + self.pe[:, :x.size(1), :]
    return self.dropout(x)

In [13]:
class Encoder(nn.Module):

  def __init__(self, vocab_size, max_len, d_key, d_model, n_heads, n_layers, n_classes, dropout_prob):
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
    transformer_blocks = [TransformerBlock(d_key, d_model, n_heads, dropout_prob) for _ in range(n_layers)]
    self.transformer_blocks = nn.Sequential(*transformer_blocks)
    self.ln = nn.LayerNorm(d_model)
    self.ff = nn.Linear(d_model, n_classes)

  def forward(self, x, mask = None):
    x = self.embedding(x)
    x = self.pos_encoding(x)
    # Later on we need to pass the mask which is not None. So, we need a for loop to pass mask to each layer
    for block in self.transformer_blocks:
      x = block(x, mask)

    # Depends on the task need to be performed, we can choose just the output vector
    x = x[:, 0, :]  # Just picking out a single timestep output, timestep_0

    x = self.ln(x)
    x = self.ff(x)

    return x




In [14]:
# Test with dummy values
model = Encoder(vocab_size = 20000,
                max_len =1024,
                d_key = 16,
                d_model = 64,
                n_heads = 4,
                n_layers = 2,
                n_classes = 5,
                dropout_prob = 0.1)
model.to(device)

Encoder(
  (embedding): Embedding(20000, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=False)
        (query): Linear(in_features=64, out_features=64, bias=False)
        (value): Linear(in_features=64, out_features=64, bias=False)
        (W_O): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-

In [15]:
x_sample = torch.randint(0, 20000, size = (8,512)).to(device)

In [16]:
mask = torch.ones((8,512))
mask[:,256:] = 0
mask = mask.to(device)

In [17]:
y = model(x_sample, mask)

In [18]:
y.shape

torch.Size([8, 5])

In [19]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [20]:
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [21]:
!pip install datasets
from datasets import load_dataset



In [22]:
# !pip install pyarrow==11.0.0

In [23]:
# from datasets import load_dataset

In [24]:
raw_datasets = load_dataset('glue', 'sst2')

Downloading readme:   0%|          | 0.00/35.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/3.11M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/72.8k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/148k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/67349 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/872 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [25]:
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [26]:
def tokenize_fn(batch):
  return tokenizer(batch['sentence'], truncation = True)

In [27]:
tokenized_datasets = raw_datasets.map(tokenize_fn, batched = True)
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

Map:   0%|          | 0/67349 [00:00<?, ? examples/s]

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

In [28]:
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [29]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [30]:
tokenized_datasets['train'][0]

{'sentence': 'hide new secretions from the parental units ',
 'label': 0,
 'idx': 0,
 'input_ids': [101, 4750, 1207, 3318, 5266, 1121, 1103, 22467, 2338, 102],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [31]:
tokenized_datasets = tokenized_datasets.remove_columns(['sentence', 'idx'])
tokenized_datasets = tokenized_datasets.rename_column('label', 'labels')

In [32]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [33]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(
    tokenized_datasets['train'],
    shuffle = True,
    batch_size = 32,
    collate_fn = data_collator
)

valid_dataloader = DataLoader(
    tokenized_datasets['validation'],
    batch_size = 32,
    collate_fn = data_collator
)

In [34]:
next(iter(train_dataloader))

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': tensor([1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 0, 0, 1, 0]), 'input_ids': tensor([[  101,  1472,  1190,  ...,     0,     0,     0],
        [  101,  7310,  1193,  ...,     0,     0,     0],
        [  101, 14827,  2212,  ...,     0,     0,     0],
        ...,
        [  101,  7011,   102,  ...,     0,     0,     0],
        [  101, 11567,  2981,  ...,     0,     0,     0],
        [  101,  1129,  7109,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [35]:
len(train_dataloader)

2105

In [36]:
2105*32

67360

In [37]:
for batch in train_dataloader:
  print(batch)
  break

{'labels': tensor([0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0,
        1, 1, 0, 1, 1, 0, 0, 1]), 'input_ids': tensor([[  101,  1205,  1103,  ...,     0,     0,     0],
        [  101,  9328,  2254,  ...,     0,     0,     0],
        [  101,  1138,  2320,  ...,     0,     0,     0],
        ...,
        [  101, 23481,   102,  ...,     0,     0,     0],
        [  101,  1547,  1112,  ...,     0,     0,     0],
        [  101,  1144,  1694,  ...,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}


In [38]:
for batch in train_dataloader:
  for k,v in batch.items():
    print('k: ', k, 'v.shape: ', v.shape)
  break

k:  labels v.shape:  torch.Size([32])
k:  input_ids v.shape:  torch.Size([32, 45])
k:  attention_mask v.shape:  torch.Size([32, 45])


In [39]:
set(tokenized_datasets['train']['labels'])

{0, 1}

In [40]:
tokenizer.vocab_size

28996

In [41]:
tokenizer.max_model_input_sizes

{'distilbert-base-uncased': 512,
 'distilbert-base-uncased-distilled-squad': 512,
 'distilbert-base-cased': 512,
 'distilbert-base-cased-distilled-squad': 512,
 'distilbert-base-german-cased': 512,
 'distilbert-base-multilingual-cased': 512}

In [42]:
model = Encoder(
    vocab_size = tokenizer.vocab_size,
    max_len = tokenizer.max_model_input_sizes[checkpoint],
    d_key = 16,
    d_model = 64,
    n_heads = 4,
    n_layers = 2,
    n_classes = 2,
    dropout_prob = 0.1,
)
model.to(device)

Encoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): MultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=False)
        (query): Linear(in_features=64, out_features=64, bias=False)
        (value): Linear(in_features=64, out_features=64, bias=False)
        (W_O): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-

In [43]:
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters())

In [44]:
def train(model = model,
          loss_fn = loss_fn,
          optimizer = optimizer,
          train_dataloader = train_dataloader,
          valid_dataloader = valid_dataloader,
          epochs = 5):

  train_losses = torch.zeros(epochs)
  test_losses = torch.zeros(epochs)

  for epoch in range(epochs):

    # setting model to training mode
    model.train()

    #initializing loss and number of training loops
    train_loss = 0
    n_train = 0

    for batch in train_dataloader:
      # Move input data to the right device
      batch = {k: v.to(device) for k,v in batch.items()}

      # Optimizer zero grad
      optimizer.zero_grad()

      #Forward pass
      outputs = model(batch['input_ids'], batch['attention_mask'])
      loss = loss_fn(outputs, batch['labels'])

      # Backward and optimize
      loss.backward()
      optimizer.step()

      train_loss += loss.item()
      n_train += 1

    train_loss = train_loss/n_train

    # Evaluation part
    model.eval()
    test_loss = 0
    n_test = 0
    for batch in valid_dataloader:
      batch = {k: v.to(device) for k,v in batch.items()}
      outputs = model(batch['input_ids'], batch['attention_mask'])
      loss = loss_fn(outputs, batch['labels'])
      test_loss += loss.item()
      n_test += 1
    test_loss = test_loss/n_test

    # Save losses
    train_losses[epoch] = train_loss
    test_losses[epoch] = test_loss
    print(f'Epoch: {epoch} | train_loss: {train_loss} | test_loss: {test_loss}')

  return train_losses, test_losses

In [45]:
train_losses, test_losses = train()

Epoch: 0 | train_loss: 0.5405567969321639 | test_loss: 0.4768559081213815
Epoch: 1 | train_loss: 0.3739163496148841 | test_loss: 0.47912435818995747
Epoch: 2 | train_loss: 0.30022076624943356 | test_loss: 0.4817569889128208
Epoch: 3 | train_loss: 0.25770155973201553 | test_loss: 0.5302381637905326
Epoch: 4 | train_loss: 0.228460176492601 | test_loss: 0.5164805054664612


In [46]:
# Measuring the accuracy

#Train accuracy
model.eval()
n_correct = 0
n_total = 0
for batch in train_dataloader:
  batch = {k: v.to(device) for k,v in batch.items()}
  outputs = model(batch['input_ids'], batch['attention_mask'])
  # print(outputs)
  #torch.max returns both max and argmax
  _, predictions = torch.max(outputs,1)
  n_correct += (predictions == batch['labels']).sum().item()
  n_total += batch['input_ids'].shape[0]

train_acc = n_correct/n_total

#Test accuracy
n_correct = 0
n_total = 0
for batch in valid_dataloader:
  batch = {k: v.to(device) for k,v in batch.items()}
  outputs = model(batch['input_ids'], batch['attention_mask'])
  #torch.max returns both max and argmax
  _, predictions = torch.max(outputs,1)
  n_correct += (predictions == batch['labels']).sum().item()
  n_total += batch['input_ids'].shape[0]

test_acc = n_correct/n_total
print(f'Train_acc: {train_acc} | Test_acc: {test_acc}')

Train_acc: 0.9447207827881632 | Test_acc: 0.7912844036697247


# Decoder Implementation

In [47]:
class CausalMultiHeadAttention(nn.Module):

  """
  Implements MultiHeadAttention Mechanism.

  Args:
    d_key: Dimension of keys to be produced
    d_model: Dimension of Model/ input token dimension
    h: Number of Attention heads
  """

  def __init__(self, d_key, d_model, h, max_len):
    super().__init__()

    # Assumption: d_value = d_key
    self.d_key = d_key
    self.h = h

    # Compute all of the 'h' keys and queries and values at once instead of using a for loop for each head.
    self.key = nn.Linear(d_model, d_key * h, bias = False, device = device)
    self.query = nn.Linear(d_model, d_key * h, bias = False, device = device)
    self.value = nn.Linear(d_model, d_key * h, bias = False, device = device)

    # Final Linear Layer to be applied after attention blocks
    self.W_O = nn.Linear(d_key * h, d_model)

    cm = torch.tril(torch.ones(max_len,max_len))
    self.register_buffer(
        'causal_mask',
        cm.view(1,1,max_len,max_len)
    )

  def forward(self, q, k, v, pad_mask = None):

    B, T = q.shape[0], q.shape[1]

    q = self.query(q)  # (B, T, d_key*h)
    k = self.key(k)  # (B, T, d_key*h)
    v = self.value(v)  # (B, T, d_key*h)

    # Change shape from (B, T, d_key*h) to (B, h, T, d_key)
    q = q.view(B, T, self.h, self.d_key).transpose(1,2)
    k = k.view(B, T, self.h, self.d_key).transpose(1,2)
    v = v.view(B, T, self.h, self.d_key).transpose(1,2)

    # Compute Attention Weights (B, h, T, d_key) @ (B, h, d_key, T) --> (B, h, T, T). (T, T) weights for each head.
    attention_mat = q @ k.transpose(-2,-1)/ math.sqrt(self.d_key)

    if pad_mask is not None:
      attention_mat = attention_mat.masked_fill(pad_mask[:, None, None, :] == 0, float('-inf'))

    attention_mat = attention_mat.masked_fill(self.causal_mask[:,:,:T,:T] == 0, float('-inf'))

    attention_weights = attention_mat.softmax(dim = -1)

    # Attention weighted values for each input token (B, h, T, T) @ (B, h, T, d_key) --> (B, h, T, d_key)
    A = attention_weights @ v

    # Reshape it for the final output
    A = A.transpose(1,2)
    A = A.contiguous().view(B, T, self.d_key * self.h)

    return self.W_O(A)

In [48]:
class DecoderTransformerBlock(nn.Module):

  def __init__(self, d_key, d_model, h, max_len, dropout_prob = 0.1):
    super().__init__()

    self.ln1 = nn.LayerNorm(d_model)
    self.ln2 = nn.LayerNorm(d_model)
    self.mha = CausalMultiHeadAttention(d_key, d_model, h, max_len)
    self.ann = nn.Sequential(
        nn.Linear(d_model, d_model * 4),
        nn.GELU(),
        nn.Linear(4*d_model, d_model),
        nn.Dropout(dropout_prob)
    )

    self.dropout = nn.Dropout(dropout_prob)

  def forward(self, x, pad_mask = None):
    x = self.ln1(x + self.mha(x, x, x, pad_mask))
    x = self.ln2(x + self.ann(x))
    x = self.dropout(x)

    return x

In [49]:
class Decoder(nn.Module):

  def __init__(self, vocab_size, max_len, d_key, d_model, n_heads, n_layers, dropout_prob): #n_classes is dropped as it is equal to vocab_size
    super().__init__()

    self.embedding = nn.Embedding(vocab_size, d_model)
    self.pos_encoding = PositionalEncoding(d_model, max_len, dropout_prob)
    transformer_blocks = [DecoderTransformerBlock(d_key, d_model, n_heads, max_len, dropout_prob) for _ in range(n_layers)]
    self.transformer_blocks = nn.Sequential(*transformer_blocks)
    self.ln = nn.LayerNorm(d_model)
    self.ff = nn.Linear(d_model, vocab_size)

  def forward(self, x, pad_mask = None):
    x = self.embedding(x)
    x = self.pos_encoding(x)
    # Later on we need to pass the mask which is not None. So, we need a for loop to pass mask to each layer
    for block in self.transformer_blocks:
      x = block(x, pad_mask)

    # Depends on the task need to be performed, we can choose just the output vector
    # x = x[:, 0, :]  # Just picking out a single timestep output, timestep_0
    # many-to-many task

    x = self.ln(x)
    x = self.ff(x)

    return x




In [50]:
decoder_model = Decoder(20000, 1024, 16, 64, 4, 2, 0.1).to(device)

In [51]:
x_dummy = torch.randint(0, 20000, size = (8,512)).to(device)

In [52]:
mask = torch.ones((8,512))
mask[:,256:] = 0
mask_dummy = mask.to(device)

In [53]:
y_ = decoder_model(x_dummy, mask_dummy)

In [54]:
y_.shape

torch.Size([8, 512, 20000])

In [55]:
from transformers import AutoTokenizer, DataCollatorWithPadding

In [56]:
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [57]:
from datasets import load_dataset

In [58]:
decoder_raw_data = load_dataset('glue','sst2')

In [59]:
decoder_raw_data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [60]:
def tokenize_fn(batch):
  return tokenizer(batch['sentence'], truncation = True)

In [61]:
tokenized_decoder_data = decoder_raw_data.map(tokenize_fn, batched = True)
decoder_datacollator = DataCollatorWithPadding(tokenizer)

Map:   0%|          | 0/872 [00:00<?, ? examples/s]

In [62]:
tokenized_decoder_data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [63]:
tokenized_decoder_data = tokenized_decoder_data.remove_columns(['sentence', 'idx', 'label'])

In [64]:
tokenized_decoder_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [65]:
from torch.utils.data import DataLoader

decoder_train_dataloader = DataLoader(
    tokenized_decoder_data['train'],
    shuffle = True,
    batch_size = 32,
    collate_fn = decoder_datacollator
)

decoder_test_dataloader = DataLoader(
    tokenized_decoder_data['validation'],
    shuffle = True,
    batch_size = 1,
    collate_fn = decoder_datacollator
)

In [66]:
for batch in decoder_train_dataloader:
  print(batch)
  break

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'input_ids': tensor([[  101,  1110,  1198,  ...,     0,     0,     0],
        [  101,  8869,  1106,  ...,     0,     0,     0],
        [  101,  3411,  2426,  ...,     0,     0,     0],
        ...,
        [  101,  1463,  1106,  ...,     0,     0,     0],
        [  101,  1243,   102,  ...,     0,     0,     0],
        [  101,  1211, 11826,  ...,  1313,   119,   102]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}


In [67]:
for batch in decoder_train_dataloader:
  for k,v in batch.items():
    print('k: ', k, 'v.shape: ', v.shape)
  break

k:  input_ids v.shape:  torch.Size([32, 38])
k:  attention_mask v.shape:  torch.Size([32, 38])


In [68]:
tokenizer.pad_token_id

0

In [69]:
decoder_model = Decoder(vocab_size = tokenizer.vocab_size,
                        max_len = tokenizer.max_model_input_sizes[checkpoint],
                        d_key = 16,
                        d_model = 64,
                        n_heads = 4,
                        n_layers = 2,
                        dropout_prob = 0.1)
decoder_model.to(device)

Decoder(
  (embedding): Embedding(28996, 64)
  (pos_encoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer_blocks): Sequential(
    (0): DecoderTransformerBlock(
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (ln2): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
      (mha): CausalMultiHeadAttention(
        (key): Linear(in_features=64, out_features=64, bias=False)
        (query): Linear(in_features=64, out_features=64, bias=False)
        (value): Linear(in_features=64, out_features=64, bias=False)
        (W_O): Linear(in_features=64, out_features=64, bias=True)
      )
      (ann): Sequential(
        (0): Linear(in_features=64, out_features=256, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=256, out_features=64, bias=True)
        (3): Dropout(p=0.1, inplace=False)
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): DecoderTransformerBlock(
      (ln1): Laye

In [70]:
loss_fn = nn.CrossEntropyLoss(ignore_index = tokenizer.pad_token_id)
optimizer = torch.optim.Adam(decoder_model.parameters())

In [71]:
def train_decoder(model = decoder_model,
                  loss_fn = loss_fn,
                  optimizer = optimizer,
                  train_loader = decoder_train_dataloader,
                  epochs = 20):

  train_losses = torch.zeros(epochs)

  for epoch in range(epochs):
    model.train()
    train_loss = []
    for batch in train_loader:
      batch = {k:v.to(device) for k,v in batch.items()}
      optimizer.zero_grad()

      #create the targets
      targets = batch['input_ids'].clone().detach()
      # print(targets)
      targets = targets.roll(shifts = -1, dims = 1)
      # print(targets)
      targets[:,-1] = tokenizer.pad_token_id
      # print(targets, targets.shape)

      #forward pass
      outputs = model(batch['input_ids'], batch['attention_mask'])
      # Outputs are (B, T, V) dimensional
      # But loss_fn expects (B, V, T) in case 3 dimensional inputs.
      loss = loss_fn(outputs.transpose(1,2), targets)

      #optimize
      loss.backward()
      optimizer.step()
      train_loss.append(loss.item())
      # break

    train_loss = torch.mean(torch.tensor(train_loss))
    train_losses[epoch] = train_loss
    # break

    print(f'Epoch: {epoch} | Train_loss: {train_loss}')
  return train_losses

In [72]:
decoder_train_losses = train_decoder()

Epoch: 0 | Train_loss: 5.973060131072998
Epoch: 1 | Train_loss: 5.0159592628479
Epoch: 2 | Train_loss: 4.687039375305176
Epoch: 3 | Train_loss: 4.503098011016846
Epoch: 4 | Train_loss: 4.369207859039307
Epoch: 5 | Train_loss: 4.26812744140625
Epoch: 6 | Train_loss: 4.181396007537842
Epoch: 7 | Train_loss: 4.105530738830566
Epoch: 8 | Train_loss: 4.036803722381592
Epoch: 9 | Train_loss: 3.976431131362915
Epoch: 10 | Train_loss: 3.9234111309051514
Epoch: 11 | Train_loss: 3.872299909591675
Epoch: 12 | Train_loss: 3.8283047676086426
Epoch: 13 | Train_loss: 3.7828221321105957
Epoch: 14 | Train_loss: 3.742156744003296
Epoch: 15 | Train_loss: 3.704469919204712
Epoch: 16 | Train_loss: 3.669290542602539
Epoch: 17 | Train_loss: 3.6348977088928223
Epoch: 18 | Train_loss: 3.606605052947998
Epoch: 19 | Train_loss: 3.5748722553253174


In [73]:
decoder_model.eval()
for batch in decoder_test_dataloader:
  batch = {k:v.to(device) for k,v in batch.items()}
  output = decoder_model(batch['input_ids'], batch['attention_mask'])
  break

In [74]:
output.shape

torch.Size([1, 11, 28996])

In [75]:
torch.argmax(output, dim = -1)

tensor([[1103,  112, 1138, 1151,  169,  119,  118,  102, 2168,  102,  102]],
       device='cuda:0')

In [76]:
prediction_ids = torch.argmax(output, dim = -1)
tokenizer.decode(prediction_ids[0])

"the'have been `. - [SEP] action [SEP] [SEP]"

In [77]:
tokenizer.decode(batch['input_ids'][0])

'[CLS] they should have called it gutterball. [SEP]'

In [78]:
tokenizer.decode(torch.concat((batch['input_ids'][0,:5], prediction_ids[:,4])))

'[CLS] they should have called `'

In [91]:
# generate language
prompt = 'So, '

tokenized_prompt = tokenizer(prompt, return_tensors ='pt')
tokenized_prompt

{'input_ids': tensor([[ 101, 1573,  117,  102]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [92]:
outputs = decoder_model(
    tokenized_prompt['input_ids'][:, :-1].to(device),
    tokenized_prompt['attention_mask'][:, :-1].to(device)
)
outputs.shape

torch.Size([1, 3, 28996])

In [93]:
predcition_ids = torch.argmax(outputs[:,-1,:], axis = -1)

In [94]:

tokenizer.decode(prediction_ids[0])

"the'have been `. - [SEP] action [SEP] [SEP]"

In [95]:
def language_generator(prompt, model):
  tokenized_prompt = tokenizer(prompt, return_tensors = 'pt')
  input_ids = tokenized_prompt['input_ids'][:,:-1].to(device)
  mask = tokenized_prompt['attention_mask'][:,:-1].to(device)

  for _ in range(20):
    outputs = model(input_ids, mask)
    prediction_id = torch.argmax(outputs[:,-1,:], dim = -1)

    input_ids = torch.hstack((input_ids, prediction_id.view(1,1)))
    mask = torch.ones_like(input_ids)

    if prediction_id == tokenizer.sep_token_id:
      print(tokenizer.decode(input_ids[0]))
      break

In [96]:
language_generator(prompt = prompt, model = decoder_model)

[CLS] So, and the film's a lot of funniest jokes [SEP]
