<a href="https://colab.research.google.com/github/praburocking/transformers/blob/main/transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install -q torchdata==0.3.0 torchtext==0.12 spacy==3.2 altair GPUtil
!python -m spacy download de_core_news_sm
!python -m spacy download en_core_web_sm

[K     |████████████████████████████████| 47 kB 2.0 MB/s 
[K     |████████████████████████████████| 6.0 MB 8.7 MB/s 
[K     |████████████████████████████████| 138 kB 10.8 MB/s 
[K     |████████████████████████████████| 127 kB 58.7 MB/s 
[?25h  Building wheel for GPUtil (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
en-core-web-sm 3.3.0 requires spacy<3.4.0,>=3.3.0.dev0, but you have spacy 3.2.0 which is incompatible.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.2.0/de_core_news_sm-3.2.0-py3-none-any.whl (19.1 MB)
[K     |██████████████████████████

In [4]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
#import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.nn import LayerNorm
import numpy as np


# Set to False to skip notebook execution (e.g. for debugging)
warnings.filterwarnings("ignore")
RUN_EXAMPLES = True

In [5]:
q=torch.rand(4,3)
w=torch.rand(2,3)
print(q)
print(w)
print((q@w.T).size())

tensor([[0.2377, 0.7682, 0.7806],
        [0.5799, 0.5986, 0.2128],
        [0.1995, 0.5491, 0.0072],
        [0.2029, 0.5745, 0.0293]])
tensor([[0.1220, 0.2529, 0.8184],
        [0.7535, 0.2571, 0.2870]])
torch.Size([4, 2])


In [6]:
def is_interactive_notebook():
    return __name__ == "__main__"


def show_example(fn, args=[]):
    if __name__ == "__main__" and RUN_EXAMPLES:
        return fn(*args)


def execute_example(fn, args=[]):
    if __name__ == "__main__" and RUN_EXAMPLES:
        fn(*args)


class DummyOptimizer(torch.optim.Optimizer):
    def __init__(self):
        self.param_groups = [{"lr": 0}]
        None

    def step(self):
        None

    def zero_grad(self, set_to_none=False):
        None


class DummyScheduler:
    def step(self):
        None

In [7]:
class EncoderDecoder(nn.Module):
  """
  first encoderDecoder (standard one)
  """
  def __init__(self,encoder,decoder,src_vocab_len,tgt_vocab_len,d_model,generator):
    super(EncoderDecoder,self).__init__()
    
    self.encoder=encoder
    self.decoder=decoder
    self.src_embed=nn.Embedding(src_vocab_len,d_model)
    self.tgt_embed=nn.Embedding(tgt_vocab_len,d_model)
    self.pos=PositionalEncoding(d_model)
    self.gen=generator
    
  def forward(self,src,src_mask,tgt,tgt_mask):
    return self.decode(self.encode(src,src_mask),src_mask,tgt,tgt_mask)

  def encode(self,src,mask_src):
    # breakpoint()
    return self.encoder(self.pos(self.src_embed(src)),mask_src)

  def decode(self,mem,mem_mask,tgt,tgt_mask):
    # breakpoint()
    return self.gen(self.decoder(mem,mem_mask,self.pos(self.tgt_embed(tgt)),tgt_mask))

In [8]:
class Generator(nn.Module):
  def __init__(self,d_model,vocab_size):
    super(Generator,self).__init__()
    self.linear=nn.Linear(d_model,vocab_size)
    
  def forward(self,x):
    return log_softmax(self.linear(x),dim=-1)

In [9]:
def clone(module,N):
  return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])

In [10]:
class Encoder(nn.Module):
  def __init__(self,N,module):
    super(Encoder,self).__init__()
    self.encodeLayers=clone(module,N)
      
  def forward(self,x,mask):
    for encodeLayer in self.encodeLayers:
      x=encodeLayer(x,mask)
    return x

class Decoder(nn.Module):
  def __init__(self,N,module):
    super(Decoder,self).__init__()
    self.decodeLayers=clone(module,N)
    
  def forward(self,mem,mem_mask,y,y_mask,):
    for decodeLayer in self.decodeLayers:
      x=decodeLayer(mem,mem_mask,y,y_mask)
    return x


In [11]:
class SubLayerConnection(nn.Module):
  def __init__(self,embed_size,dropout=0.01):
    super(SubLayerConnection,self).__init__()
    self.norm = LayerNorm(embed_size)
    self.dropout = nn.Dropout(dropout)
    
  def forward(self,subModule,x):
    return x+self.dropout(self.norm(subModule(x)))

In [12]:
class FeedForward(nn.Module):
  def __init__(self,size):
    super(FeedForward,self).__init__()
    self.ff1=nn.Linear(size,size)
    self.ff2=nn.Linear(size,size)
    self.relu=nn.ReLU()
    
  def forward(self,x):
    x=self.relu(self.ff1(x))
    return self.ff2(x)
   


In [13]:
class EncodeLayer(nn.Module):
  def __init__(self,head,d_model):
    super(EncodeLayer,self).__init__()
    self.multiHeadAtten= MultiHeadAttention(head,d_model)
    self.feedForward=FeedForward(d_model)
    self.subLayerCon=clone(SubLayerConnection(d_model),2)
    

  def forward(self,x,x_mask):
    x=self.subLayerCon[0](lambda x: self.multiHeadAtten(x,x,x,x_mask),x)
    x=self.subLayerCon[1](self.feedForward,x)
    return x
    

In [14]:
class DecodeLayer(nn.Module):
  def __init__(self,head,d_model):
    super(DecodeLayer,self).__init__()
    self.multiHeadAtten1=MultiHeadAttention(head,d_model)
    self.multiHeadAtten2=MultiHeadAttention(head,d_model)
    self.feedForward=FeedForward(d_model)
    self.subLayerCon=clone(SubLayerConnection(d_model),3)
    

  def forward(self,mem,mem_mask,tgt,tgt_mask):
    # print("***********decoder")
   
    x=self.subLayerCon[0](lambda tgt: self.multiHeadAtten1(tgt,tgt,tgt,tgt_mask),tgt)
    # print("decoder::: MH---1 size of x  "+str(x.size())+"size of mem"+str(mem.size()))
    x=self.subLayerCon[1](lambda x: self.multiHeadAtten2(x,mem,mem,mem_mask),x)
    # print("decoder::: MH---2 size of x  "+str(x.size())+"size of mem"+str(mem.size()))

    x=self.subLayerCon[2](self.feedForward,x)
    # print("decoder::: FF---1 size of x  "+str(x.size())+"size of mem"+str(mem.size()))
    return x

    
    

In [15]:
class Embedding(nn.Module):
  def __init__(self,vocab_len,d_model):
    super(Embedding,self).__init__()
    self.embedding=nn.Embedding(vocab_len,d_model)
  def forward(self,x):
    return self.embedding(x)

In [16]:
q=torch.rand(5,4)

print(q)
q.size()
torch.triu(torch.ones(5,4))==0
mask=torch.triu(torch.ones(5,4))==0
q.masked_fill_(mask,10)
q
float('-inf')
q@q.T

tensor([[0.8310, 0.0987, 0.9675, 0.6026],
        [0.1576, 0.7603, 0.0789, 0.7471],
        [0.2920, 0.3147, 0.5396, 0.1241],
        [0.9974, 0.9123, 0.6193, 0.2941],
        [0.1998, 0.3838, 0.9892, 0.7518]])


tensor([[  1.9994,   8.9112,   9.8934,  19.1487,  24.9974],
        [  8.9112, 101.1425, 107.7387, 108.6118, 115.8633],
        [  9.8934, 107.7387, 200.3065, 205.4320, 206.6363],
        [ 19.1487, 108.6118, 205.4320, 300.0865, 302.9409],
        [ 24.9974, 115.8633, 206.6363, 302.9409, 400.0000]])

In [17]:
from torch._C import NoneType
# def attention(key,query,value,mask=None):
#   '''
#   key-size(-1,n,d_k)
#   query-size(-1,n,d_q)
#   value size(-1,n,d_v)
#   d_k,d_v,d_q=d
#   '''
#   d=key.size(dim=-1)
#   attent=torch.matmul(key,torch.transpose(query,-2,-1)) #size(n,n)
#   attent=attent/torch.sqrt(d)#size(n,n)
#   if mask is not None:
#         attent = attent.masked_fill(mask == 0, -1e9)
#   attent=attent.softMax(dim=-1)#size(n,n)

#   attentValue= torch.matmul(attent,value)#size(n,d)
#   return attent,attentValue

class Attention(nn.Module):
  def __init__(self,embed_size,d_k_size,d_v_size):
    super(Attention,self).__init__()
    self.k_lin=nn.Linear(embed_size,d_k_size)
    self.q_lin=nn.Linear(embed_size,d_k_size)
    self.v_lin=nn.Linear(embed_size,d_v_size)
    self.d_k_size=d_k_size
    self.d_v_size=d_v_size
    
  def forward(self,query,key,value,mask=None):
    # breakpoint()
    key=self.k_lin(key)#batch,seq,d_k
    query=self.q_lin(query)#batch,seq,d_k
    value=self.v_lin(value)#batch,seq,d_v
    #breakpoint()
    attent=torch.matmul(query,key.transpose(-1,-2))/math.sqrt(self.d_k_size)#batch,seq(query),seq(value)
    if mask is not None:
      # breakpoint()
      attent.masked_fill_(mask,float('-inf'))
    attent=attent.softmax(dim=-1)
    return attent, attent@value

# class MultiHeadAttention(nn.Module):
#   def __init__(self,h,d_model,dropout=0.1):
#     super(MultiHeadAttention,self).__init__()
#     assert d_model%h==0
#     self.d_k=d_model//h
#     self.linears=clone(nn.Linear(d_model,self.d_k),3*h)
#     self.atten=torch.zeros((0,self.d_k))#d_k should be 
#     self.attenValue=torch.zeros((0,self.d_k))
    

#   def forward(self,x,x_mask):
#     head_count=0
#     for i in range(0,len(self.linears),3):
#       v=self.linear[i](x)
#       k=self.linear[i+1](x)
#       q=self.linear[i+2](x)
#       attent,attentValue=attention(k,q,v,x_mask)

#       head_count=head_count+1
    
class MultiHeadAttention(nn.Module):
  def __init__(self,h,d_model,dropout=0.1):
    super(MultiHeadAttention,self).__init__()
    assert d_model%h==0
    self.d_k=d_model//h
    self.d_v=d_model//h
    self.d_model=d_model
    self.attHeads=clone(Attention(d_model,self.d_k,self.d_v),h)

  def forward(self,query,key,value,x_mask):
    appendVal=torch.Tensor([])
    
    for i,attHead in enumerate(self.attHeads):
      # breakpoint()
      _,attentVal=attHead(query,key,value,x_mask)
      appendVal=torch.cat((appendVal,attentVal),-1)
    return appendVal
    
      



In [18]:
class PositionalEncoding(nn.Module):
    "Implement the PE function."

    def __init__(self, d_model, dropout=0.01, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        print(div_term)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        print(div_term)

        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)

In [77]:
batch=2
seq_len=20
d_model=8
tgt_vocab_len=100
src_vocab_len=100
num_head=2
num_layer=2

src=torch.randint(0,99,(batch,seq_len))
tgt=torch.randint(0,99,(batch,seq_len))
tgt_mask=torch.triu(torch.ones(seq_len,seq_len))==0
tgt_mask=torch.unsqueeze(tgt_mask,0)
src_mask=torch.ones(seq_len,seq_len)
src_mask=torch.unsqueeze(src_mask,0)


tgt_mask=torch.zeros(1,1)
print("tgt_mask size "+str(tgt_mask.size()))
print("tgt_mask size "+str(tgt.size()))
# tgt_mask=None
src_mask=None
# enc=EncodeLayer(2,16)
# dec=DecodeLayer(2,16)
enc=Encoder(num_layer,copy.deepcopy(EncodeLayer(num_head,d_model)))
dec=Decoder(num_layer,copy.deepcopy(DecodeLayer(num_head,d_model)))
# mem=enc(src,src_mask)
# output=dec(mem,torch.ones(10,10),mask,torch.rand(2,10,16))
# q=torch.Tensor([])
# torch.cat((q,torch.ones(2,3)),0)
# print(output.size())
encDec=EncoderDecoder(enc,dec,src_vocab_len,tgt_vocab_len,d_model,Generator(d_model,tgt_vocab_len))
# print(tgt.size())
# print(src.size())

# output=encDec(src,src_mask,src,tgt_mask)
# output
# print("tgt size" +str(output.size()))

tgt_mask size torch.Size([1, 1])
tgt_mask size torch.Size([2, 1])
tensor([1.0000e+00, 1.0000e-01, 1.0000e-02, 1.0000e-03])
tensor([1.0000e+00, 1.0000e-01, 1.0000e-02, 1.0000e-03])


In [78]:

epoch=5
optimizer = torch.optim.Adam(encDec.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()

tgt=torch.tensor([[src[0,0]],[src[1,0]]])n

for j in range(epoch):
  temp_tgt=torch.clone(tgt)
  mem=encDec.encode(src,src_mask)
  for i in range(1,99):
    breakpoint()
    x=encDec.decode(mem,src_mask,temp_tgt,tgt_mask)
    tgt=torch.tensor([[src[0,i]],[src[1,i]]])
  
    x=x.argmax(dim=-1)
    x=x.to(dtype=torch.float)
    tgt=tgt.to(dtype=torch.float)
    print(tgt)
    print(x)
    print(tgt)
    print(x.dtype)
    loss=criterion(x,tgt)
    print(loss)

    # # print(x.size())
    temp_tgt=torch.cat((temp_tgt,x), dim=1)
  print(temp_tgt)
  print(temp_tgt.size())
    


> <ipython-input-78-c4d598cd38a3>(13)<module>()->None
-> x=encDec.decode(mem,src_mask,temp_tgt,tgt_mask)
(Pdb) n
> <ipython-input-78-c4d598cd38a3>(14)<module>()->None
-> tgt=torch.tensor([[src[0,i]],[src[1,i]]])
(Pdb) n
> <ipython-input-78-c4d598cd38a3>(16)<module>()->None
-> x=x.argmax(dim=-1)
(Pdb) n
> <ipython-input-78-c4d598cd38a3>(17)<module>()->None
-> x=x.to(dtype=torch.float)
(Pdb) 
> <ipython-input-78-c4d598cd38a3>(18)<module>()->None
-> tgt=tgt.to(dtype=torch.float)
(Pdb) 
> <ipython-input-78-c4d598cd38a3>(19)<module>()->None
-> print(tgt)
(Pdb) 
tensor([[15.],
        [ 8.]])
> <ipython-input-78-c4d598cd38a3>(20)<module>()->None
-> print(x)
(Pdb) 
tensor([[92.],
        [85.]])
> <ipython-input-78-c4d598cd38a3>(21)<module>()->None
-> print(tgt)
(Pdb) 
tensor([[15.],
        [ 8.]])
> <ipython-input-78-c4d598cd38a3>(22)<module>()->None
-> print(x.dtype)
(Pdb) 
torch.float32
> <ipython-input-78-c4d598cd38a3>(23)<module>()->None
-> loss=criterion(x,tgt)
(Pdb) 
> <ipython-input-

RuntimeError: ignored

In [61]:
x = np.array([[
    [[1,0,0],[1,0,0]], # predict class 0 for pixel (0,0) and class 0 for pixel (0,1)
    [[0,1,0],[0,0,1]], # predict class 1 for pixel (1,0) and class 2 for pixel (1,1)
]])*5  # multiply by 5 to give bigger losses
print("logits map :")
print(x)

# ground truth labels
y = np.array([[
    [0,1], # must predict class 0 for pixel (0,0) and class 1 for pixel (0,1)
    [1,2], # must predict class 1 for pixel (1,0) and class 2 for pixel (1,1)
]])  
print("\nlabels map :")
print(y)

x=torch.Tensor(x).permute((0,3,1,2))  # shape of preds must be (N, C, H, W) instead of (N, H, W, C)
y=torch.Tensor(y).long() #  shape of labels must be (N, H, W) and type must be long integer


print(x.size())
print(y.size())
print(x)
print(y)
losses = nn.CrossEntropyLoss(reduction="none")(x, y)  # reduction="none" to get the loss by pixel 
print("\nLosses map :")
print(losses)

logits map :
[[[[5 0 0]
   [5 0 0]]

  [[0 5 0]
   [0 0 5]]]]

labels map :
[[[0 1]
  [1 2]]]
torch.Size([1, 3, 2, 2])
torch.Size([1, 2, 2])
tensor([[[[5., 5.],
          [0., 0.]],

         [[0., 0.],
          [5., 0.]],

         [[0., 0.],
          [0., 5.]]]])
tensor([[[0, 1],
         [1, 2]]])

Losses map :
tensor([[[0.0134, 5.0134],
         [0.0134, 0.0134]]])


In [26]:
loss = nn.CrossEntropyLoss()
input = torch.randn(3, 3, requires_grad=True)
target = torch.randn(3, 5).softmax(dim=1)
output = loss(input, target)
print(input.size())
print(target.size)


RuntimeError: ignored

In [63]:
out = torch.FloatTensor([[0.05, 0.9, 0.05],[0.05, 0.9, 0.05]])
out = torch.autograd.Variable(out)

# Categorical targets
y = torch.LongTensor([1,2])
y = torch.autograd.Variable(y)

# One-hot encoded targets
y1 = torch.FloatTensor([[0, 1, 0], [0, 0, 1], [1, 0, 0]])
y1 = torch.autograd.Variable(y1)

# Calculating the loss
loss_val = nn.CrossEntropyLoss()(out, y)
# loss_val1 = nn.BCEWithLogitsLoss()(out, y1)
print(loss_val)
print(out.size())
print(y.size())

tgt=torch.rand(3,3,5,100)
x=torch.rand(3,3,5,100)
criterion(x,tgt)

tensor(1.0428)
torch.Size([2, 3])
torch.Size([2])


tensor(1.6946)