<a href="https://colab.research.google.com/github/praburocking/transformers/blob/main/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [24]:
!pip install -q torchdata==0.3.0 torchtext==0.12 spacy==3.2 altair GPUtil
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.2.0/de_core_news_sm-3.2.0-py3-none-any.whl (19.1 MB)
[K     |████████████████████████████████| 19.1 MB 717 kB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[K     |████████████████████████████████| 13.9 MB 4.4 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [25]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
#import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn import LayerNorm
import numpy as np


# Set to False to skip notebook execution (e.g. for debugging)
warnings.filterwarnings("ignore")
RUN_EXAMPLES = True

In [None]:
def is_interactive_notebook():
    return __name__ == "__main__"


def show_example(fn, args=[]):
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)


def execute_example(fn, args=[]):
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    def __init__(self):
        self.param_groups = [{"lr": 0}]
        None

    def step(self):
        None

    def zero_grad(self, set_to_none=False):
        None


class DummyScheduler:
    def step(self):
        None

In [26]:
class EncoderDecoder(nn.Module):
  """
  first encoderDecoder (standard one)
  """
  def __init__(self,encoder,decoder,src_vocab_len,tgt_vocab_len,d_model,generator):
    super(EncoderDecoder,self).__init__()
    
    self.encoder=encoder
    self.decoder=decoder
    self.src_embed=nn.Embedding(src_vocab_len,d_model)
    self.tgt_embed=nn.Embedding(tgt_vocab_len,d_model)
    self.pos=PositionalEncoding(d_model)
    self.gen=generator
    
  def forward(self,src,src_mask,tgt,tgt_mask):
    return self.decode(self.encode(src,src_mask),src_mask,tgt,tgt_mask)

  def encode(self,src,mask_src):
    # breakpoint()
    return self.encoder(self.pos(self.src_embed(src)),mask_src)

  def decode(self,mem,mem_mask,tgt,tgt_mask):
    # breakpoint()
    return self.gen(self.decoder(mem,mem_mask,self.pos(self.tgt_embed(tgt)),tgt_mask))

In [None]:
class Generator(nn.Module):
  def __init__(self,d_model,vocab_size):
    super(Generator,self).__init__()
    self.linear=nn.Linear(d_model,vocab_size)
    
  def forward(self,x):
    return log_softmax(self.linear(x),dim=-1)

In [None]:
def clone(module,N):
  return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [None]:
class Encoder(nn.Module):
  def __init__(self,N,module):
    super(Encoder,self).__init__()
    self.encodeLayers=clone(module,N)
      
  def forward(self,x,mask):
    for encodeLayer in self.encodeLayers:
      x=encodeLayer(x,mask)
    return x

class Decoder(nn.Module):
  def __init__(self,N,module):
    super(Decoder,self).__init__()
    self.decodeLayers=clone(module,N)
    
  def forward(self,mem,mem_mask,y,y_mask,):
    for decodeLayer in self.decodeLayers:
      x=decodeLayer(mem,mem_mask,y,y_mask)
    return x


In [None]:
class SubLayerConnection(nn.Module):
  def __init__(self,embed_size,dropout=0.01):
    super(SubLayerConnection,self).__init__()
    self.norm = LayerNorm(embed_size)
    self.dropout = nn.Dropout(dropout)
    
  def forward(self,subModule,x):
    return x+self.dropout(self.norm(subModule(x)))

In [None]:
class FeedForward(nn.Module):
  def __init__(self,size):
    super(FeedForward,self).__init__()
    self.ff1=nn.Linear(size,size)
    self.ff2=nn.Linear(size,size)
    self.relu=nn.ReLU()
    
  def forward(self,x):
    x=self.relu(self.ff1(x))
    return self.ff2(x)
   


In [None]:
class EncodeLayer(nn.Module):
  def __init__(self,head,d_model):
    super(EncodeLayer,self).__init__()
    self.multiHeadAtten= MultiHeadAttention(head,d_model)
    self.feedForward=FeedForward(d_model)
    self.subLayerCon=clone(SubLayerConnection(d_model),2)
    

  def forward(self,x,x_mask):
    x=self.subLayerCon[0](lambda x: self.multiHeadAtten(x,x,x,x_mask),x)
    x=self.subLayerCon[1](self.feedForward,x)
    return x
    

In [None]:
class DecodeLayer(nn.Module):
  def __init__(self,head,d_model):
    super(DecodeLayer,self).__init__()
    self.multiHeadAtten1=MultiHeadAttention(head,d_model)
    self.multiHeadAtten2=MultiHeadAttention(head,d_model)
    self.feedForward=FeedForward(d_model)
    self.subLayerCon=clone(SubLayerConnection(d_model),3)
    

  def forward(self,mem,mem_mask,tgt,tgt_mask):
    # breakpoint()
    x=self.subLayerCon[0](lambda tgt: self.multiHeadAtten1(tgt,tgt,tgt,tgt_mask),tgt)
    x=self.subLayerCon[1](lambda mem: self.multiHeadAtten2(x,mem,mem,mem_mask),mem)
    x=self.subLayerCon[2](self.feedForward,x)
    return x

    
    

In [None]:
class Embedding(nn.Module):
  def __init__(self,vocab_len,d_model):
    super(Embedding,self).__init__()
    self.embedding=nn.Embedding(vocab_len,d_model)
  def forward(self,x):
    return self.embedding(x)

In [None]:
q=torch.rand(5,4)

print(q)
q.size()
torch.triu(torch.ones(5,4))==0
mask=torch.triu(torch.ones(5,4))==0
q.masked_fill_(mask,10)
q
float('-inf')
q@q.T

tensor([[0.9778, 0.1773, 0.7058, 0.5408],
        [0.6097, 0.4162, 0.2337, 0.0357],
        [0.3823, 0.0247, 0.6154, 0.8204],
        [0.5381, 0.7503, 0.6415, 0.3229],
        [0.6275, 0.6348, 0.4910, 0.7243]])


tensor([[  1.7782,  10.0360,  12.4294,  18.7839,  24.0177],
        [ 10.0360, 100.2291, 104.3353, 106.5104, 106.8556],
        [ 12.4294, 104.3353, 201.0518, 206.4186, 214.3582],
        [ 18.7839, 106.5104, 206.4186, 300.1043, 303.2293],
        [ 24.0177, 106.8556, 214.3582, 303.2293, 400.0000]])

In [None]:
from torch._C import NoneType
# def attention(key,query,value,mask=None):
#   '''
#   key-size(-1,n,d_k)
#   query-size(-1,n,d_q)
#   value size(-1,n,d_v)
#   d_k,d_v,d_q=d
#   '''
#   d=key.size(dim=-1)
#   attent=torch.matmul(key,torch.transpose(query,-2,-1)) #size(n,n)
#   attent=attent/torch.sqrt(d)#size(n,n)
#   if mask is not None:
#         attent = attent.masked_fill(mask == 0, -1e9)
#   attent=attent.softMax(dim=-1)#size(n,n)

#   attentValue= torch.matmul(attent,value)#size(n,d)
#   return attent,attentValue

class Attention(nn.Module):
  def __init__(self,embed_size,d_k_size,d_v_size):
    super(Attention,self).__init__()
    self.k_lin=nn.Linear(embed_size,d_k_size)
    self.q_lin=nn.Linear(embed_size,d_k_size)
    self.v_lin=nn.Linear(embed_size,d_v_size)
    self.d_k_size=d_k_size
    self.d_v_size=d_v_size
    
  def forward(self,query,key,value,mask=None):
    # breakpoint()
    key=self.k_lin(key)#batch,seq,d_k
    query=self.q_lin(query)#batch,seq,d_k
    value=self.v_lin(value)#batch,seq,d_v
    # breakpoint()
    attent=torch.matmul(query,key.transpose(-1,-2))/math.sqrt(self.d_k_size)#batch,seq(query),seq(value)
    if mask is not None:
      # breakpoint()
      attent.masked_fill_(mask,float('-inf'))
    attent=attent.softmax(dim=-1)
    return attent, attent@value

# class MultiHeadAttention(nn.Module):
#   def __init__(self,h,d_model,dropout=0.1):
#     super(MultiHeadAttention,self).__init__()
#     assert d_model%h==0
#     self.d_k=d_model//h
#     self.linears=clone(nn.Linear(d_model,self.d_k),3*h)
#     self.atten=torch.zeros((0,self.d_k))#d_k should be 
#     self.attenValue=torch.zeros((0,self.d_k))
    

#   def forward(self,x,x_mask):
#     head_count=0
#     for i in range(0,len(self.linears),3):
#       v=self.linear[i](x)
#       k=self.linear[i+1](x)
#       q=self.linear[i+2](x)
#       attent,attentValue=attention(k,q,v,x_mask)

#       head_count=head_count+1
    
class MultiHeadAttention(nn.Module):
  def __init__(self,h,d_model,dropout=0.1):
    super(MultiHeadAttention,self).__init__()
    assert d_model%h==0
    self.d_k=d_model//h
    self.d_v=d_model//h
    self.d_model=d_model
    self.attHeads=clone(Attention(d_model,self.d_k,self.d_v),h)

  def forward(self,query,key,value,x_mask):
    appendVal=torch.Tensor([])
    
    for i,attHead in enumerate(self.attHeads):
      # breakpoint()
      _,attentVal=attHead(query,key,value,x_mask)
      appendVal=torch.cat((appendVal,attentVal),-1)
    return appendVal
    
      



In [None]:
class PositionalEncoding(nn.Module):
    "Implement the PE function."

    def __init__(self, d_model, dropout=0.01, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        print(div_term)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        print(div_term)

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [27]:
batch=2
seq_len=20
d_model=8
tgt_vocab_len=100
src_vocab_len=100
num_head=2
num_layer=2

src=torch.randint(0,99,(batch,seq_len))
tgt=torch.randint(0,99,(batch,seq_len))
tgt_mask=torch.triu(torch.ones(seq_len,seq_len))==0
tgt_mask=torch.unsqueeze(tgt_mask,0)
src_mask=torch.ones(seq_len,seq_len)
src_mask=torch.unsqueeze(src_mask,0)


# enc=EncodeLayer(2,16)
# dec=DecodeLayer(2,16)
enc=Encoder(num_layer,copy.deepcopy(EncodeLayer(num_head,d_model)))
dec=Decoder(8,copy.deepcopy(DecodeLayer(num_head,d_model)))
# mem=enc(src,src_mask)
# output=dec(mem,torch.ones(10,10),mask,torch.rand(2,10,16))
# q=torch.Tensor([])
# torch.cat((q,torch.ones(2,3)),0)
# print(output.size())
encDec=EncoderDecoder(enc,dec,src_vocab_len,tgt_vocab_len,d_model,Generator(d_model,tgt_vocab_len))
print(tgt.size())
print(src.size())
output=encDec(src,src_mask,tgt,tgt_mask)
output.ma
print(output.size())

tensor([1.0000e+00, 1.0000e-01, 1.0000e-02, 1.0000e-03])
tensor([1.0000e+00, 1.0000e-01, 1.0000e-02, 1.0000e-03])
torch.Size([2, 20])
torch.Size([2, 20])
torch.Size([2, 20, 100])


In [29]:
# torch.tensor([[2,3,4],[2,3,4]]).size()
# lin=nn.Linear(5,3)
# a=torch.rand((3,4,5))
# lin(a)
# lin.weight.shape
# a=torch.Tensor([[]])
# a[0,:]=torch.Tensor([1,2,3,4])
# a

q=torch.rand(2,10,10)
print(output.max(dim=-1))
print(src)


torch.return_types.max(
values=tensor([[-1.7748, -2.1283, -1.1489, -2.3102, -1.5027, -1.5435, -2.5304, -1.3228,
         -1.8757, -1.2773, -1.5106, -1.8929, -2.2803, -2.0485, -1.5822, -2.0528,
         -1.7103, -2.1180, -1.8704, -2.0196],
        [-2.1902, -1.9152, -1.3068, -2.3500, -3.0056, -1.8908, -2.4926, -2.2137,
         -1.5956, -1.8084, -2.2959, -1.3956, -2.1741, -2.6560, -2.5848, -2.7051,
         -2.1005, -1.4640, -2.6729, -2.4243]], grad_fn=<MaxBackward0>),
indices=tensor([[87, 87, 87, 87, 87, 87, 27, 87, 87, 22, 87, 27, 87, 27, 27, 87, 87, 22,
         27, 37],
        [27, 87, 87, 27, 36, 27, 36, 36, 87, 27, 57, 27, 87, 87, 87, 27, 37, 87,
         87, 87]]))
tensor([[23, 91, 23, 30, 86, 90, 11, 60, 50, 20, 59, 67, 52, 80, 42, 74, 94, 46,
         90, 24],
        [44, 89, 60, 84, 51, 65, 26, 34, 13, 54, 83, 77, 56, 61, 27, 26, 74, 66,
         33, 87]])


In [None]:
a=torch.Tensor([[1,2,3,4]])
torch.cat((a,torch.Tensor([[1,2,3,4]])),0)
a

tensor([[1., 2., 3., 4.]])