<a href="https://colab.research.google.com/github/shusank8/Transformers/blob/main/Transformers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
print("Transformers.... Excited")

Transformers.... Excited


In [2]:
# imports
import torch
import torch.nn as nn
import math

In [3]:
# config = {
#     "batch_size":8,
#     "num_epochs":20,
#     "lr":10**-4,
#     "block_size":512,
#     "embdim":512,
# }

In [4]:
class InputEmbeddings(nn.Module):
  """Lookup table that turns integer token ids into vectors of width ``embdim``."""

  def __init__(self, vocab_size, embdim):
    super().__init__()
    # One learnable row per vocabulary entry.
    self.embeddings = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embdim)

  def forward(self, x):
    """Return the embedding rows selected by the id tensor ``x``."""
    token_vectors = self.embeddings(x)
    return token_vectors


In [5]:
class PositionalEmbeddings(nn.Module):
  """Add fixed sinusoidal position encodings to a batch of token embeddings.

  Args:
    block_size: number of positions the precomputed table covers.
    embdim: width of each embedding vector.
    dropout: dropout probability applied after the addition.
  """

  def __init__(self, block_size, embdim, dropout):
    super().__init__()
    self.dropout = nn.Dropout(dropout)

    pe = torch.zeros(block_size, embdim)

    # Column vector of positions 0..block_size-1.
    position = torch.arange(0, block_size, dtype=torch.float).unsqueeze(1)

    # One frequency per even column index: exp(-ln(10000) * i / embdim).
    div_term = torch.exp(torch.arange(0, embdim, 2).float() * (-math.log(10000.0) / embdim))

    angles = position * div_term
    pe[:, 0::2] = torch.sin(angles)
    # With an odd embdim there is one fewer odd column than even columns,
    # so trim the angles to the number of cosine slots actually available.
    pe[:, 1::2] = torch.cos(angles[:, : embdim // 2])
    pe = pe.unsqueeze(0)
    # Registered as a buffer: moves with the module but is not trained.
    self.register_buffer('pe', pe)

  def forward(self, x):
    """Return ``dropout(x + pe[:, :x.shape[1]])``.

    Raises:
      ValueError: if the sequence dimension of ``x`` exceeds ``block_size``.
    """
    seq_len = x.shape[1]
    if seq_len > self.pe.shape[1]:
      raise ValueError(
          f"sequence length {seq_len} exceeds positional table size {self.pe.shape[1]}"
      )
    # The encodings are ADDED to the embeddings; replacing x would throw
    # away all token information.
    x = x + self.pe[:, :seq_len, :]
    x = self.dropout(x)
    return x

In [6]:
class LayerNormalization(nn.Module):
  """Layer normalization over the last dimension with a learnable scale and shift.

  Args:
    embdim: size of the last dimension of the inputs.
  """

  def __init__(self, embdim):
    super().__init__()
    self.eps = 10**-6  # keeps the division finite when the variance is zero
    self.alpha = nn.Parameter(torch.ones(embdim))   # multiplicative gain
    self.bias = nn.Parameter(torch.zeros(embdim))   # additive shift

  def forward(self, x):
    """Normalize ``x`` along its last dimension, then apply ``alpha`` and ``bias``."""
    xmean = x.mean(dim=-1, keepdim=True)
    # Use the biased (population) variance: this is the layer-norm definition
    # and, unlike the unbiased estimator, it stays finite for a single feature.
    xvar = x.var(dim=-1, keepdim=True, unbiased=False)
    x = self.alpha * ((x - xmean) / torch.sqrt(xvar + self.eps)) + self.bias
    return x



In [7]:
class FeedForward(nn.Module):
  """Position-wise MLP: Linear -> ReLU -> Linear -> Dropout, hidden width ``3 * embdim``."""

  def __init__(self, embdim, dropout):
    super().__init__()
    hidden_dim = 3 * embdim
    stages = [
        nn.Linear(embdim, hidden_dim),   # expand
        nn.ReLU(),
        nn.Linear(hidden_dim, embdim),   # project back
        nn.Dropout(dropout),
    ]
    self.m = nn.Sequential(*stages)

  def forward(self, x):
    """Run ``x`` through the sequential stack."""
    return self.m(x)


In [8]:
class MultiHeadAttentionBlock(nn.Module):
  """Multi-head scaled dot-product attention with an output projection.

  Args:
    embdim: model width; must be divisible by ``no_of_heads``.
    no_of_heads: number of attention heads.
    dropout: dropout probability applied to the attention weights.

  Raises:
    ValueError: if ``embdim`` is not divisible by ``no_of_heads``.
  """

  def __init__(self, embdim, no_of_heads, dropout):
    super().__init__()

    if embdim % no_of_heads != 0:
      raise ValueError(
          f"embdim ({embdim}) must be divisible by no_of_heads ({no_of_heads})"
      )

    self.embdim = embdim
    self.q = nn.Linear(embdim, embdim)
    self.k = nn.Linear(embdim, embdim)
    self.v = nn.Linear(embdim, embdim)
    self.proj = nn.Linear(embdim, embdim)
    self.no_of_heads = no_of_heads
    self.dropout = nn.Dropout(dropout)

  @staticmethod
  def attention(query, key, value, mask, dropout):
    """Scaled dot-product attention over the last two dimensions.

    Args:
      query, key, value: tensors whose last dimension is the head width.
      mask: optional tensor broadcastable to the score matrix; positions
        where ``mask == 0`` receive (effectively) zero attention weight.
      dropout: optional callable applied to the attention weights.

    Returns:
      ``(weights @ value, weights)``.
    """
    head_dim = query.shape[-1]
    attention_scores = (query @ key.transpose(-2, -1)) / math.sqrt(head_dim)
    if mask is not None:
      # masked_fill is out-of-place, so its result must be kept. The most
      # negative finite value is used instead of -inf so a fully masked row
      # yields a uniform distribution rather than NaN.
      fill_value = torch.finfo(attention_scores.dtype).min
      attention_scores = attention_scores.masked_fill(mask == 0, fill_value)
    attention_scores = attention_scores.softmax(dim=-1)
    if dropout is not None:
      attention_scores = dropout(attention_scores)
    return (attention_scores @ value), attention_scores

  def forward(self, query, key, val, mask):
    """Attend from ``query`` to ``key``/``val`` and project the merged heads.

    For self-attention all three inputs are the same tensor; for
    cross-attention ``key`` and ``val`` come from the other sequence.
    """
    q = self.q(query)
    k = self.k(key)
    v = self.v(val)
    hdim = self.embdim // self.no_of_heads
    # (B, T, C) -> (B, T, heads, hdim) -> (B, heads, T, hdim),
    # where heads * hdim == C.
    q = q.view(q.shape[0], q.shape[1], self.no_of_heads, hdim).transpose(1, 2)
    k = k.view(k.shape[0], k.shape[1], self.no_of_heads, hdim).transpose(1, 2)
    v = v.view(v.shape[0], v.shape[1], self.no_of_heads, hdim).transpose(1, 2)

    x, _ = MultiHeadAttentionBlock.attention(q, k, v, mask, self.dropout)
    # (B, heads, T, hdim) -> (B, T, C): merge the heads back together.
    x = x.transpose(1, 2).contiguous().view(x.shape[0], -1, self.embdim)
    return self.proj(x)


In [9]:
class ResidualConnection(nn.Module):
  """Pre-norm residual wrapper computing ``x + dropout(sublayer(norm(x)))``."""

  def __init__(self, dropout, embdim):
    super().__init__()
    self.dropout = nn.Dropout(dropout)
    self.norm = LayerNormalization(embdim)

  def forward(self, x, sublayer):
    """Apply ``sublayer`` to the normalized input and add the result back to ``x``."""
    normalized = self.norm(x)
    branch = self.dropout(sublayer(normalized))
    return x + branch

In [10]:
class EncoderBlock(nn.Module):
  """One encoder layer: self-attention then feed-forward, each in a residual wrapper."""

  def __init__(self, embdim, s_attn, ffwd, dropout):
    super().__init__()
    self.selfattn = s_attn
    self.ffwd = ffwd
    # Index 0 wraps self-attention, index 1 wraps the feed-forward network.
    wrappers = [ResidualConnection(dropout, embdim) for _ in range(2)]
    self.residual_connections = nn.ModuleList(wrappers)

  def forward(self, x, src_mask):
    """Return the block output for input ``x`` under attention mask ``src_mask``."""

    def self_attention(normed):
      # Query, key and value are all the same (normalized) tensor.
      return self.selfattn(normed, normed, normed, src_mask)

    attended = self.residual_connections[0](x, self_attention)
    return self.residual_connections[1](attended, self.ffwd)

In [11]:
class Encoder(nn.Module):
  """Runs the input through every layer in ``layers``, then a final layer norm."""

  def __init__(self, layers, embdim):
    super().__init__()
    self.layers = layers
    self.norm = LayerNormalization(embdim)

  def forward(self, x, mask):
    """Feed ``x`` through each layer in order (all with ``mask``) and normalize."""
    hidden = x
    for block in self.layers:
      hidden = block(hidden, mask)
    return self.norm(hidden)

In [12]:
class DecoderBlock(nn.Module):
  """One decoder layer: self-attention, cross-attention and feed-forward, each residual-wrapped."""

  def __init__(self, embdim, selfattn, crossattn, ffwd, dropout):
    super().__init__()
    self.selfattn = selfattn
    self.crossattn = crossattn
    self.ffwd = ffwd
    # Index 0: self-attention, 1: cross-attention, 2: feed-forward.
    wrappers = [ResidualConnection(dropout, embdim) for _ in range(3)]
    self.residual_connections = nn.ModuleList(wrappers)

  def forward(self, x, encoder_output, src_mask, tgt_mask):
    """Return the block output for decoder input ``x`` given ``encoder_output``."""

    def self_attention(normed):
      return self.selfattn(normed, normed, normed, tgt_mask)

    def cross_attention(normed):
      # Queries come from the decoder; keys and values from the encoder.
      return self.crossattn(normed, encoder_output, encoder_output, src_mask)

    hidden = self.residual_connections[0](x, self_attention)
    hidden = self.residual_connections[1](hidden, cross_attention)
    return self.residual_connections[2](hidden, self.ffwd)





In [13]:
class Decoder(nn.Module):
  """Runs the input through every layer in ``layers``, then a final layer norm."""

  def __init__(self, layers, embdim):
    super().__init__()
    self.layers = layers
    self.norm = LayerNormalization(embdim)

  def forward(self, x, encoder_output, src_mask, tgt_mask):
    """Feed ``x`` through each layer in order, passing the encoder output and both masks."""
    hidden = x
    for block in self.layers:
      hidden = block(hidden, encoder_output, src_mask, tgt_mask)
    return self.norm(hidden)

In [14]:
class ProjectionLayer(nn.Module):
  """Linear map from ``embdim`` to ``vocab_size`` followed by log-softmax."""

  def __init__(self, embdim, vocab_size):
    super().__init__()
    self.proj = nn.Linear(in_features=embdim, out_features=vocab_size)

  def forward(self, x):
    """Return log-probabilities over the last dimension."""
    logits = self.proj(x)
    return logits.log_softmax(dim=-1)

In [15]:
class Transformer(nn.Module):
  """Container that wires embeddings, positional encodings, encoder, decoder and output projection.

  The three stages are exposed separately (``encode``, ``decode``,
  ``projection``) rather than through a single ``forward``.
  """

  def __init__(self, encoder, decoder, src_embed, tgt_embd, src_pos, tgt_pos, projection_layer):
    super().__init__()
    self.encoder = encoder
    self.decoder = decoder
    self.src_embd = src_embed
    self.tgt_embd = tgt_embd
    self.src_pos = src_pos
    self.tgt_pos = tgt_pos
    self.proj_layer = projection_layer

  def encode(self, src, src_mask):
    """Embed ``src``, apply the source positional module and run the encoder."""
    src = self.src_embd(src)
    src = self.src_pos(src)
    return self.encoder(src, src_mask)

  def decode(self, encoder_output, src_mask, tgt, tgt_mask):
    """Embed ``tgt``, apply the target positional module and run the decoder."""
    tgt = self.tgt_embd(tgt)
    tgt = self.tgt_pos(tgt)
    return self.decoder(tgt, encoder_output, src_mask, tgt_mask)

  def projection(self, x):
    """Apply the output projection layer to ``x``."""
    # The layer is stored as ``proj_layer`` in __init__; use that same name.
    return self.proj_layer(x)


In [16]:
def build_transformer(src_vocab_size,tgt_vocab_size, src_seq_len, tgt_seq_len, embdim, n_of_layers, no_of_heads, dropout):
  """Assemble an encoder-decoder Transformer and Xavier-initialize its weight matrices.

  Builds ``n_of_layers`` encoder blocks and ``n_of_layers`` decoder blocks,
  each with its own attention and feed-forward modules, and returns the
  resulting ``Transformer``.
  """

  def new_attention():
    return MultiHeadAttentionBlock(embdim, no_of_heads, dropout)

  def new_ffwd():
    return FeedForward(embdim, dropout)

  # Token embeddings, then positional modules, for each side.
  src_embd = InputEmbeddings(src_vocab_size, embdim)
  tgt_embd = InputEmbeddings(tgt_vocab_size, embdim)
  src_pos = PositionalEmbeddings(src_seq_len, embdim, dropout)
  tgt_pos = PositionalEmbeddings(tgt_seq_len, embdim, dropout)

  # Arguments are evaluated left to right, so submodules are created in the
  # order: attention module(s) first, then the feed-forward network.
  encoder_blocks = [
      EncoderBlock(embdim, new_attention(), new_ffwd(), dropout)
      for _ in range(n_of_layers)
  ]
  decoder_blocks = [
      DecoderBlock(embdim, new_attention(), new_attention(), new_ffwd(), dropout)
      for _ in range(n_of_layers)
  ]

  encoder = Encoder(nn.ModuleList(encoder_blocks), embdim)
  decoder = Decoder(nn.ModuleList(decoder_blocks), embdim)
  projection_layer = ProjectionLayer(embdim, tgt_vocab_size)

  transformer = Transformer(encoder, decoder, src_embd, tgt_embd, src_pos, tgt_pos, projection_layer)

  # Xavier-uniform init for every parameter with at least two dimensions;
  # 1-D parameters (biases, norm gains) keep their default values.
  for weight in transformer.parameters():
    if weight.dim() < 2:
      continue
    nn.init.xavier_uniform_(weight)
  return transformer



In [17]:
!pip install datasets



In [18]:
import pandas as pd

In [19]:
from datasets import load_dataset

# Fetch the parallel corpus from the Hugging Face Hub.
ds = load_dataset("iamTangsang/Nepali-to-English-Translation-Dataset")

# Turn each available split into a pandas DataFrame; a missing split becomes None.
df_train, df_test, df_valid = (
    ds[split_name].to_pandas() if split_name in ds else None
    for split_name in ('train', 'test', 'validation')
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [20]:
# Fold the validation split into the training data and renumber the rows.
df_train = pd.concat((df_train, df_valid), ignore_index=True)

In [21]:
# Rename the two columns positionally: first column -> 'tgt', second -> 'src'.
# NOTE(review): assumes the dataset's column order is (Nepali, English) so that
# 'src' ends up holding the English text — confirm against the dataset card.
df_train.columns = ['tgt', 'src']
df_test.columns = ['tgt', 'src']

In [22]:
# Number of space-separated pieces in each target sentence.
df_train['tgt_len'] = df_train['tgt'].apply(lambda sentence: len(sentence.split(" ")))

In [23]:
# Number of space-separated pieces in each source sentence.
df_train['src_len'] = df_train['src'].apply(lambda sentence: len(sentence.split(" ")))

In [24]:

# Keep only rows whose target sentence has fewer than 100 pieces.
df_train = df_train.loc[df_train['tgt_len'] < 100]

In [25]:
# df_train['src_len']
# Keep only rows whose source sentence has fewer than 100 pieces.
df_train = df_train.loc[df_train['src_len'] < 100]

In [26]:
# Same length columns and < 100 filter as the training split, applied to the test split.
df_test['tgt_len'] = df_test['tgt'].apply(lambda sentence: len(sentence.split(" ")))
df_test['src_len'] = df_test['src'].apply(lambda sentence: len(sentence.split(" ")))
short_enough = (df_test['tgt_len'] < 100) & (df_test['src_len'] < 100)
df_test = df_test[short_enough]

In [27]:
# SOURCE TEXT => ENGLISH
# TARGET TEXT => NEPALI

In [28]:
from tokenizers.trainers import WordLevelTrainer
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE, WordLevel
import os
def create_or_load_tokenizer(df):
  """Return ``(source_tokenizer, target_tokenizer)``, training them if not cached.

  If both ``SourceTokenizer.json`` and ``TargetTokenizer.json`` exist in the
  working directory they are loaded as-is. Otherwise a separate word-level
  tokenizer is trained on ``df['src']`` and on ``df['tgt']``, saved to those
  files, and reloaded.

  Args:
    df: DataFrame with ``'src'`` and ``'tgt'`` text columns.
  """
  src_path = "SourceTokenizer.json"
  tgt_path = "TargetTokenizer.json"

  # Require BOTH cache files; a lone source file would otherwise make the
  # target load fail.
  if os.path.exists(src_path) and os.path.exists(tgt_path):
    srctok = Tokenizer.from_file(src_path)
    tgttok = Tokenizer.from_file(tgt_path)
    return srctok, tgttok

  def train_and_save(texts, path):
    # A fresh tokenizer and trainer per language, so no state is shared
    # between the two vocabularies.
    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    tokenizer.pre_tokenizer = Whitespace()
    trainer = WordLevelTrainer(special_tokens=["[UNK]",  "[PAD]", "[SOS]", "[EOS]"], vocab_size = 50000, min_frequency=8)
    tokenizer.train_from_iterator(texts, trainer=trainer)
    tokenizer.save(path)
    return Tokenizer.from_file(path)

  # Train on the frame that was passed in, not on a module-level global.
  srctokenizer = train_and_save(df['src'], src_path)
  targettokenizer = train_and_save(df['tgt'], tgt_path)
  return srctokenizer, targettokenizer

In [29]:
# Load the cached tokenizer files if present; otherwise train and save them.
src_tok, tgt_tok = create_or_load_tokenizer(df_train)

In [30]:
src_tok.get_vocab_size()

20687

In [31]:
tgt_tok.get_vocab_size()

36061

In [32]:
from torch.utils.data import Dataset
class BiDataSet(Dataset):
  """Dataset of fixed-length encoder/decoder tensors built from a ``src``/``tgt`` DataFrame.

  Each item is padded to ``block_size`` with the ``[PAD]`` id; the ids of the
  special tokens are looked up in ``src_tok``.
  """

  def __init__(self, df, src_tok, tgt_tok,  block_size):
    self.df = df
    self.src_tok = src_tok
    self.tgt_tok = tgt_tok
    self.block_size = block_size

    def special(token):
      # One-element int64 tensor holding the id of a special token.
      return torch.tensor([src_tok.token_to_id(token)], dtype=torch.int64)

    self.eos_tok = special("[EOS]")
    self.sos_tok = special("[SOS]")
    self.pad_tok = special("[PAD]")

  def __len__(self):
    return len(self.df)

  def __getitem__(self, index):
    """Return the tensors, masks and raw texts for row ``index``.

    Raises:
      ValueError: if either tokenized sentence does not fit in ``block_size``.
    """
    row = self.df.iloc[index]
    src_text = row['src']
    tgt_text = row['tgt']

    src_ids = self.src_tok.encode(src_text).ids
    tgt_ids = self.tgt_tok.encode(tgt_text).ids

    # Encoder input carries [SOS] and [EOS]; decoder input and label carry one each.
    enc_pad_count = self.block_size - len(src_ids) - 2
    dec_pad_count = self.block_size - len(tgt_ids) - 1

    if enc_pad_count < 0 or dec_pad_count < 0:
      raise ValueError("Sentence is too long")

    src_tensor = torch.tensor(src_ids, dtype=torch.int64)
    tgt_tensor = torch.tensor(tgt_ids, dtype=torch.int64)
    enc_padding = self.pad_tok.repeat(enc_pad_count)
    dec_padding = self.pad_tok.repeat(dec_pad_count)

    # [SOS] src... [EOS] [PAD]...
    encoder_input = torch.cat([self.sos_tok, src_tensor, self.eos_tok, enc_padding])
    # [SOS] tgt... [PAD]...
    decoder_input = torch.cat([self.sos_tok, tgt_tensor, dec_padding])
    # tgt... [EOS] [PAD]...
    label = torch.cat([tgt_tensor, self.eos_tok, dec_padding])

    assert encoder_input.size(0)==self.block_size
    assert decoder_input.size(0)==self.block_size
    assert label.size(0)==self.block_size

    # Padding masks have shape (1, 1, block_size); the decoder mask is
    # additionally combined with the causal mask.
    encoder_mask = (encoder_input != self.pad_tok)[None, None, :].int()
    decoder_pad_mask = (decoder_input != self.pad_tok)[None, None, :].int()
    decoder_mask = decoder_pad_mask & causal_mask(decoder_input.size(0))

    return {
        "encoder_input": encoder_input,
        "decoder_input": decoder_input,
        "encoder_mask": encoder_mask,
        "decoder_mask": decoder_mask,
        "label": label,
        'src_text': src_text,
        'tgt_text': tgt_text,
    }

def causal_mask(size):
  """Return a boolean ``(1, size, size)`` mask that is True on and below the diagonal."""
  lower_triangle = torch.ones(1, size, size).tril()
  return lower_triangle.to(torch.int) == 1


In [33]:
len(df_train), len(df_test)

(713424, 10864)

In [34]:
# block_size = config['block_size']
# Every example is padded to this many tokens.
block_size = 164
train_ds = BiDataSet(df=df_train, src_tok=src_tok, tgt_tok=tgt_tok, block_size=block_size)
val_ds = BiDataSet(df=df_test, src_tok=src_tok, tgt_tok=tgt_tok, block_size=block_size)

In [35]:
# max_len_src = 0
# max_len_tgt = 0

In [36]:
# x = 0
# ts = 0
# for item in df_train['src']:
#   src_ids = src_tok.encode(item).ids
#   # tgt_ids = tgt_tok.encode(item['tgt']).ids
#   max_len_src = max(max_len_src, len(src_ids))
#   ts+=len(src_ids)
#   # max_len_tgt = max(max_len_tgt, tgt_ids)

In [37]:
# x = 0
# s = 0
# for item in df_train['tgt']:
#   # src_ids = src_tok.encode(item).ids
#   tgt_ids = tgt_tok.encode(item).ids
#   # max_len_src = max(max_len_src, len(src_ids))
#   max_len_tgt = max(max_len_tgt, len(tgt_ids))
#   s+=len(tgt_ids)

In [38]:
# max_len_src, max_len_tgt

In [39]:
from torch.utils.data import DataLoader
# Both loaders use the same batch size and reshuffle every epoch.
batch_size = 32
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_ds, batch_size=batch_size, shuffle=True)


In [40]:
# for x in iter(train_dataloader):
#   inp = x
#   break

In [42]:
# The positional tables must cover every position the datasets emit, so size
# them from block_size (each example is padded to that length) rather than a
# separate hard-coded 128.
model = build_transformer(src_tok.get_vocab_size(), tgt_tok.get_vocab_size(), block_size, block_size, 512, 6, 8, 0.2)
# Fall back to the CPU when no GPU is available instead of failing.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr = 10**-4, eps = 1e-9)

AttributeError: 'NoneType' object has no attribute 'to'