In [2]:
import copy
import numpy as np
import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, random_split, \
TensorDataset

In [3]:
class Encoder(nn.Module):
    """GRU encoder that maps a source sequence to its sequence of hidden states.

    Args:
        n_features: number of features of each input data point.
        hidden_dim: size of the GRU hidden state.
    """

    def __init__(self, n_features, hidden_dim):
        super().__init__()
        self.n_features = n_features
        self.hidden_dim = hidden_dim
        # Final hidden state of the latest forward pass (None before any call).
        self.hidden = None
        self.basic_rnn = nn.GRU(
            input_size=self.n_features,
            hidden_size=self.hidden_dim,
            batch_first=True,
        )

    def forward(self, x):
        """Run the batch-first GRU over ``x`` and return every hidden state.

        The final hidden state returned by the GRU is kept in ``self.hidden``.
        """
        all_states, final_state = self.basic_rnn(x)
        self.hidden = final_state
        return all_states
    

In [4]:
# One batch holding the full four-point sequence: shape (N=1, L=4, F=2)
full_seq = torch.tensor(
    [[-1, -2], [-1, 3], [1, 4], [1, -5]], dtype=torch.float32
).view(1, 4, 2)
# first two corners feed the encoder, last two corners are the targets
source_seq, target_seq = full_seq[:, :2], full_seq[:, 2:]

In [5]:
# Seed before building the encoder so its randomly initialised weights
# (and therefore the displayed values) are reproducible.
torch.manual_seed(21)
encoder = Encoder(n_features=2, hidden_dim=2)
hidden_seq = encoder(source_seq) # output is N, L, H (here H == F == 2)
hidden_final = hidden_seq[:, -1:]
# hidden_final keeps only the last hidden state; the -1: slice preserves the length dimension
hidden_final,hidden_seq

(tensor([[[ 0.5242, -0.4707]]], grad_fn=<SliceBackward0>),
 tensor([[[ 0.0122,  0.3053],
          [ 0.5242, -0.4707]]], grad_fn=<TransposeBackward1>))

In [6]:
class Decoder(nn.Module):
    """GRU decoder that produces one predicted data point per call.

    The hidden state is kept on the instance between calls, so the decoder
    can be stepped in a loop after ``init_hidden`` has been called.

    Args:
        n_features: number of features of each input / predicted data point.
        hidden_dim: size of the GRU hidden state.
    """

    def __init__(self, n_features, hidden_dim):
        super().__init__()
        self.n_features = n_features
        self.hidden_dim = hidden_dim
        # Current hidden state; set by init_hidden and updated by forward.
        self.hidden = None
        self.basic_rnn = nn.GRU(
            input_size=self.n_features,
            hidden_size=self.hidden_dim,
            batch_first=True,
        )
        # Maps a hidden state back to feature space.
        self.regression = nn.Linear(
            in_features=self.hidden_dim,
            out_features=self.n_features,
        )

    def init_hidden(self, hidden_seq):
        """Use the last state of a batch-first hidden sequence as initial state."""
        last_state = hidden_seq[:, -1:]
        # The GRU takes its hidden state sequence-first: swap the first two dims.
        self.hidden = last_state.permute(1, 0, 2)

    def forward(self, x):
        """Advance the GRU from the stored hidden state and predict one point."""
        rnn_states, next_hidden = self.basic_rnn(x, self.hidden)
        self.hidden = next_hidden
        # Only the last step's state is turned into a prediction.
        prediction = self.regression(rnn_states[:, -1:])
        return prediction.view(-1, 1, self.n_features)
    
        

In [7]:
# Decoder loop WITHOUT teacher forcing: every prediction is fed back as the
# next input. Seed first so the decoder's initial weights are reproducible.
torch.manual_seed(21)
decoder=Decoder(2,2)

# Initial hidden state is the last hidden state produced by the encoder
decoder.init_hidden(hidden_seq)
# Initial data point is the last element of the source sequence
inputs=source_seq[:,-1:]

print('Inputs :',inputs)
target_len=2
for i in range(target_len):
    print(f'Hidden : {decoder.hidden}')
    out=decoder(inputs)
    print(f'outputs :{out}')
    # the prediction becomes the next step's input
    inputs=out

Inputs : tensor([[[-1.,  3.]]])
Hidden : tensor([[[ 0.5242, -0.4707]]], grad_fn=<PermuteBackward0>)
outputs :tensor([[[-0.3348,  0.4237]]], grad_fn=<ViewBackward0>)
Hidden : tensor([[[ 0.6296, -0.8031]]], grad_fn=<StackBackward0>)
outputs :tensor([[[-0.1195,  0.4348]]], grad_fn=<ViewBackward0>)


In [8]:
# Decoder loop WITH teacher forcing: the true target point is always fed as
# the next input, regardless of what the decoder predicted.
# Initial hidden state will be encoder's final hidden state
decoder.init_hidden(hidden_seq)
# Initial data point is the last element of source sequence
inputs = source_seq[:, -1:]

target_len=2
for i in range(target_len):
    print(f'Hidden : {decoder.hidden}')
    out=decoder(inputs)
    print(f'outputs :{out}')
    # ignore the prediction and feed the i-th actual target point instead
    inputs=target_seq[:,i:i+1]
    print(f'i :{target_seq[:,i:i+1]}')

Hidden : tensor([[[ 0.5242, -0.4707]]], grad_fn=<PermuteBackward0>)
outputs :tensor([[[-0.3348,  0.4237]]], grad_fn=<ViewBackward0>)
i :tensor([[[1., 4.]]])
Hidden : tensor([[[ 0.6296, -0.8031]]], grad_fn=<StackBackward0>)
outputs :tensor([[[-0.2940,  0.3910]]], grad_fn=<ViewBackward0>)
i :tensor([[[ 1., -5.]]])


In [9]:
# Decoder loop with RANDOM teacher forcing: at each step, feed the true
# target point with probability teacher_forcing_prob, otherwise feed the
# decoder's own prediction.
# Initial hidden state is the encoder's final hidden state
decoder.init_hidden(hidden_seq)

# Initial data point is the last element of the source sequence
inputs=source_seq[:,-1:]

teacher_forcing_prob=0.5
target_len=2
for i in range(target_len):
    print(f'Hidden state :{decoder.hidden}')
    out=decoder(inputs)
    print(f'Output : {out}')

    # torch.rand samples uniformly from [0, 1), so this test is true with
    # probability teacher_forcing_prob. torch.randn (standard normal) would
    # be true roughly 69% of the time for 0.5, and could be true even for 0.
    if torch.rand(1)<teacher_forcing_prob:
        inputs=target_seq[:,i:i+1]

    else:
        inputs=out

Hidden state :tensor([[[ 0.5242, -0.4707]]], grad_fn=<PermuteBackward0>)
Output : tensor([[[-0.3348,  0.4237]]], grad_fn=<ViewBackward0>)
Hidden state :tensor([[[ 0.6296, -0.8031]]], grad_fn=<StackBackward0>)
Output : tensor([[[-0.1195,  0.4348]]], grad_fn=<ViewBackward0>)


In [16]:
class EncoderDecoder(nn.Module):
    """Sequence-to-sequence wrapper that steps a decoder ``target_len`` times.

    Args:
        encoder: module whose forward returns a batch-first hidden sequence and
            which exposes ``n_features``.
        decoder: module exposing ``init_hidden(hidden_seq)`` and a forward that
            returns one predicted point per call.
        input_len: number of leading points of ``x`` used as the source sequence.
        target_len: number of points to predict.
        teacher_forcing_prob: probability, in training mode, of feeding the
            true target point (instead of the prediction) to the next step.
    """

    def __init__(self,
                 encoder,
                 decoder,
                 input_len,
                 target_len,
                 teacher_forcing_prob=0.5):
        super().__init__()
        self.encoder=encoder
        self.decoder=decoder
        self.input_len=input_len
        self.target_len=target_len
        self.teacher_forcing_prob=teacher_forcing_prob
        # Buffer holding the predictions of the latest forward pass.
        self.outputs=None

    def init_outputs(self,batch_size):
        """Allocate a zeroed (N, target_len, n_features) buffer on the model's device."""
        device = next(self.parameters()).device
        # N ----> batch_size
        # L ----> target_len
        # F ----> encoder features
        # NOTE(review): the message below is labelled "Input_length" but the
        # value it prints is self.target_len; the text is left as-is.
        print(f'\n\nBatch_size ->(N) : {batch_size}\nInput_length ->(L) : {self.target_len} \nEncoder_feature-->(F) : {self.encoder.n_features}')
        self.outputs=torch.zeros(batch_size,self.target_len,
                                 self.encoder.n_features).to(device)

    def store_outputs(self,i,out):
        """Write the prediction of step ``i`` into slot ``i`` of the buffer."""
        print('\n\nStore_outputs')
        self.outputs[:,i:i+1,:]=out
        print(f'Full Output : ',out)
        print('Iteration', i)
        print(f'Storing output from {i} : {i+1}')
        print(f'Stored outputs :{self.outputs[:,i:i+1]}')

    def forward(self,x):
        """Encode the first ``input_len`` points of ``x`` and predict ``target_len`` more.

        ``x`` is sliced along dim 1: the first ``input_len`` points are the
        source sequence, anything after is the target sequence (it is empty
        when only the source is supplied, e.g. at test time).

        Returns:
            The (N, target_len, n_features) tensor of predictions.
        """
        print('\n\nforward')
        # x is N, L, F
        source_seq=x[:,:self.input_len,:]
        print(f'Source Seq : {source_seq}')
        target_seq=x[:,self.input_len:,:]
        print(f'target Seq : {target_seq}')
        self.init_outputs(x.shape[0])

        hidden_seq=self.encoder(source_seq)
        print(f'Hidden_seq Of encoder : {hidden_seq}')
        self.decoder.init_hidden(hidden_seq)

        # The first decoder input is the last point of the source sequence.
        dec_inputs=source_seq[:,-1:,:]
        print('\n\n***************Decoder Iteration******************\n')
        for i in range(self.target_len):

            out=self.decoder(dec_inputs)

            self.store_outputs(i,out)

            prob=self.teacher_forcing_prob

            # Teacher forcing is only meaningful while training.
            if not self.training:
                prob=0

            # torch.rand samples from [0, 1), i.e. it CAN return exactly 0.
            # A strict '<' guarantees prob == 0 never selects teacher forcing
            # (which would feed an empty target slice in evaluation mode).
            if torch.rand(1) < prob:

                dec_inputs=target_seq[:,i:i+1,:]
            else:
                dec_inputs=out

        return self.outputs


In [17]:
# Wrap the encoder and decoder built in earlier cells: the first two points
# of a sequence are the source, the next two are the targets to predict.
encdec=EncoderDecoder(encoder,
                     decoder,
                     input_len=2,
                     target_len=2,
                     teacher_forcing_prob=0.5)

In [18]:
# A new four-point sequence, as one batch of shape (N=1, L=4, F=2)
full_seq = torch.tensor(
    [[-1, -2], [-3, 3], [6, 4], [5, -5]], dtype=torch.float32
).view(1, 4, 2)

In [19]:
# Training mode enables random teacher forcing inside EncoderDecoder.forward
encdec.train()
# full_seq holds source and target points; forward splits them by input_len
encdec(full_seq)



forward
Source Seq : tensor([[[-1., -2.],
         [-3.,  3.]]])
target Seq : tensor([[[ 6.,  4.],
         [ 5., -5.]]])


Batch_size ->(N) : 1
Input_length ->(L) : 2 
Encoder_feature-->(F) : 2
Hidden_seq Of encoder : tensor([[[ 0.0122,  0.3053],
         [ 0.1228, -0.5392]]], grad_fn=<TransposeBackward1>)


***************Decoder Iteration******************



Store_outputs
Full Output :  tensor([[[-0.2861,  0.5117]]], grad_fn=<ViewBackward0>)
Iteration 0
Storing output from 0 : 1
Stored outputs :tensor([[[-0.2861,  0.5117]]], grad_fn=<SliceBackward0>)


Store_outputs
Full Output :  tensor([[[0.0164, 0.4904]]], grad_fn=<ViewBackward0>)
Iteration 1
Storing output from 1 : 2
Stored outputs :tensor([[[0.0164, 0.4904]]], grad_fn=<SliceBackward0>)


tensor([[[-0.2861,  0.5117],
         [ 0.0164,  0.4904]]], grad_fn=<CopySlices>)

# Attentions

## **Values**

We refer to the encoder’s hidden states (or their affine
transformations) as "values" (V). The result of multiplying a "value" by its
corresponding attention score is called an alignment vector. The sum of all alignment vectors (that is, the weighted average of the
hidden states) is called a context vector.

## **context_vector = $\alpha_0 \cdot h_0 + \alpha_1 \cdot h_1 = 0.8 \cdot \text{value}_0 + 0.2 \cdot \text{value}_1$**

The encoder’s hidden states are used as both "keys" (K)
and "values" (V). Later we will apply affine transformations to the hidden states,
one for the "keys," another for the "values," so they will actually have different
values.
**The encoder’s hidden states are called "keys" (K), while the
decoder’s hidden state is called a "query" (Q).**
The general idea is that the encoder works like a key-value store, as if it were
some sort of database, and then the decoder queries it. The attention mechanism
looks the query up in its keys (the matching part) and returns its values.
**The "query" (Q) is matched to both "keys" (K) to compute the attention scores (s)
used to compute the context vector, which is simply the weighted average of the
"values" (V).**

## **Context Vector**

In [15]:
# Corners of a square as one batch of shape (N=1, L=4, F=2)
full_seq = torch.tensor(
    [[-1, -1], [-1, 1], [1, 1], [1, -1]], dtype=torch.float32
).view(1, 4, 2)
# First two points are the source sequence, last two are the targets
source_seq, target_seq = full_seq[:, :2], full_seq[:, 2:]

# **values(v)**

In [22]:
# Re-create the encoder with a fixed seed so the hidden states are reproducible
torch.manual_seed(21)
encoder=Encoder(n_features=2,hidden_dim=2)
hidden_seq=encoder(source_seq)
# The "values" are the encoder's hidden states, used as-is (no transformation yet)
values=hidden_seq #N,L,H
values

tensor([[[ 0.0832, -0.0356],
         [ 0.3105, -0.5263]]], grad_fn=<TransposeBackward1>)

## **keys(k)**

In [23]:
# The "keys" are the very same encoder hidden states as the "values"
keys=hidden_seq #N ,L ,H
keys

tensor([[[ 0.0832, -0.0356],
         [ 0.3105, -0.5263]]], grad_fn=<TransposeBackward1>)

## **Query(Q)**

In [25]:
# Re-create the decoder with a fixed seed and run a single step so that
# decoder.hidden holds the state that will be used as the first "query".
torch.manual_seed(21)
decoder=Decoder(n_features=2,hidden_dim=2)
decoder.init_hidden(hidden_seq)
# The first decoder input is the last point of the source sequence
inputs=source_seq[:,-1:]
out=decoder(inputs)

**The first "query" (Q) is the decoder’s hidden state (remember, hidden states are
always sequence-first, so we’re permuting it to batch-first)**

In [26]:
# Swap the first two dimensions of the decoder's hidden state to get a batch-first query
query=decoder.hidden.permute(1,0,2) # N,1,H
query

tensor([[[ 0.3913, -0.6853]]], grad_fn=<PermuteBackward0>)

In [29]:
def cal_alphas(ks,q):
    """Return uniform attention scores: every key gets weight 1/L.

    Args:
        ks: keys of shape (N, L, H); only its size is used.
        q: query; accepted for interface symmetry but not used.

    Returns:
        Tensor of shape (N, 1, L) filled with 1/L.
    """
    n_batch, n_keys, _ = ks.size()
    uniform = torch.ones(n_batch, 1, n_keys).float()
    return uniform / n_keys
# Uniform attention scores over the keys (cal_alphas does not use the query)
alphas = cal_alphas(keys,query)
alphas

torch.Size([1, 2, 2])



tensor([[[0.5000, 0.5000]]])

In [30]:
# N, 1, L x N, L, H -> 1, L x L, H -> 1, H
# Weighted sum of the values.
# bmm: (N, 1, L) x (N, L, H) -> (N, 1, H); per batch item: (1, L) x (L, H) -> (1, H)
context_vector = torch.bmm(alphas, values)
context_vector

tensor([[[ 0.1968, -0.2809]]], grad_fn=<BmmBackward0>)

**The context vector is ready: we concatenate it to the "query" (the
decoder’s hidden state) and use the result as the input for the linear layer that actually
generates the predictions.**

In [31]:
# Join context vector and query along the last (feature) dimension
concatenated = torch.cat((context_vector, query), dim=-1)
concatenated

tensor([[[ 0.1968, -0.2809,  0.3913, -0.6853]]], grad_fn=<CatBackward0>)

**scoring method will use the transformed "keys" and "queries" to compute
the attention scores,**

                          

**Keys (K) : ----> Encoder ----> Affine Transforms ----> Scoring**

**Queries (Q) : ----> Decoder ----> Affine Transforms ----> Scoring**

**Values (V) : ----> Encoder ----> Alignment Vector**

## Scoring Method

**A "key" (K) is a hidden state from the encoder. A "query" (Q) is a hidden state from
the decoder. Both of them are vectors with the same number of dimensions. If two vectors are
pointing in the same direction, their cosine similarity is a perfect one.**

$\cos\theta \, \left\lVert Q \right\rVert \left\lVert K \right\rVert = Q \cdot K$

In [33]:
# Dot product of the query with every key: (N, 1, H) x (N, H, L) -> (N, 1, L)
product=torch.bmm(query,keys.permute(0,2,1))
product

tensor([[[0.0569, 0.4821]]], grad_fn=<BmmBackward0>)

## **Attention Scores**

In [37]:
# Softmax over the last dimension (one entry per key) so the scores sum to one
alphas =F.softmax(product,dim=-1)
alphas

tensor([[[0.3953, 0.6047]]], grad_fn=<SoftmaxBackward0>)

The attention scores above mean that
the first hidden state(0.3953) contributes to roughly 40% of the context vector while the second hidden state(0.60) contributes to the remaining 60% of the context vector.

## **Updating $\alpha$**

In [38]:
def calc_alphas(ks,q):
    """Compute attention scores as the softmax of query-key dot products.

    Args:
        ks: keys, shape (N, L, H).
        q: query, shape (N, 1, H).

    Returns:
        Attention scores of shape (N, 1, L); each row sums to one.
    """
    # (N, 1, H) x (N, H, L) -> (N, 1, L)
    product=torch.bmm(q,ks.permute(0,2,1))
    # Normalise over the keys (last dimension)
    alphas=F.softmax(product,dim=-1)
    return alphas

## **Scaled Dot Product**
So far, we’ve used simple dot products between a "query" and each of the "keys."
But, given that the dot product between two vectors is the sum of the elements
after an element-wise multiplication of both vectors, guess what happens as the
vectors grow to a larger number of dimensions? The variance gets larger as well.
So, we need to (somewhat) standardize it by scaling the dot product by the inverse
of its standard deviation:

**Scaled dot product** = $\dfrac{Q \cdot K}{\sqrt{d_k}}$

In [39]:
# Number of dimensions of the query (and of each key)
dims = query.size(-1)
# Scale the query-key dot products computed earlier (`product`) by 1/sqrt(d_k);
# the notebook never defines a variable named `products` at top level.
scaled_products = product / np.sqrt(dims)
scaled_products

tensor([[[0.0403, 0.3409]]], grad_fn=<DivBackward0>)

In [41]:
def calc_alphas(ks, q):
    """Compute attention scores with the scaled dot product.

    Args:
        ks: keys, shape (N, L, H).
        q: query, shape (N, 1, H).

    Returns:
        Attention scores of shape (N, 1, L); each row sums to one.
    """
    scale = np.sqrt(q.size(-1))
    keys_t = ks.permute(0, 2, 1)            # N, H, L
    scores = torch.bmm(q, keys_t) / scale   # N, 1, L
    return F.softmax(scores, dim=-1)
# Attention scores from the scaled dot product of the query with the keys
alphas = calc_alphas(keys, query)
# Weighted sum of the values.
# bmm: (N, 1, L) x (N, L, H) -> (N, 1, H); per batch item: (1, L) x (L, H) -> (1, H)
context_vector = torch.bmm(alphas, values)
context_vector

tensor([[[ 0.2138, -0.3175]]], grad_fn=<BmmBackward0>)

# **Attention**
              +-------------+
              |             |
              |   Query q   |
              |             |
              +------+------+
                     |
                     v
              +-------------+
              |             |
              | Key vectors |
              |     ks      |
              |             |
              +------+------+
                     |
                     v
              +-------------+
              |             |
              |    Linear   |
              | Transformation|
              |             |
              +------+------+
                     |
                     v
              +-------------+
              |             |
              |  Key matrix |
              |     K       |
              |             |
              +------+------+
                     |
                     v
              +-------------+
              |             |
              |  Value matrix|
              |     V       |
              |             |
              +------+------+
                     |
                     v
              +-------------+
              |             |
              | Product and |
              |   Scaling   |
              |             |
              +------+------+
                     |
                     v
              +-------------+
              |             |
              |   Softmax   |
              |             |
              +------+------+
                     |
                     v
              +-------------+
              |             |
              |  Attention  |
              |   weights   |
              |    alphas   |
              +------+------+
                     |
                     v
              +-------------+
              |             |
              |  Weighted   |
              |  sum of the |
              |   values    |
              +------+------+
                     |
                     v
                +-------+
                |       |
                |  alphas x  |
                |   values|
                |       |
                +-------+
                     |
                     v
              +-------------+
              |             |
              | Context vec.|
              |             |
              +------+------+
                     |
                     v
                Output
