<a href="https://colab.research.google.com/github/rikudoayush/Transformers_from_scratch/blob/main/Pytorch_Transformers_from_Scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn.functional as F

# Basic Self-Attention

In [None]:
# assume we have some tensor x with size (b, t, k)
##x = 

#The set of all raw dot products w′ij forms a matrix, which we can compute simply by multiplying 𝐗 by its transpose:
##raw_weights = torch.bmm(x,x.transpose(1,2))
# - torch.bmm is a batched matrix multiplication. It 
#   applies matrix multiplication over batches of 
#   matrices.


# Then, to turn the raw weights w′ij into positive values that sum to one, we apply a row-wise softmax:

##weights = F.softmax(raw_weights, dim=2)

In [None]:
# Finally, to compute the output sequence, we just multiply the weight matrix by 𝐗. This results in a batch of output matrices 𝐘 of size (b, t, k) 
# whose rows are weighted sums over the rows of 𝐗.
##y = torch.bmm(weights,x)

# Transformer Self-Attention

\begin{align*}
q_{i} &= W_qx_{i} &
k_{i} &= W_kx_{i} &
v_{i} &= W_vx_{i} 
\end{align*}

\begin{align*}
w'_{{i}{j}} &= q_{i}^Tk_{j} \\
w_{{i}{j}} &= \text{softmax}_{j}(w'_{{i}{j}}) \\
y_{i} &= \sum_{j} w_{{i}{j}}v_{j}
\end{align*}


\begin{align*}
w'_{{i}{j}} = \frac{{q_{i}}^Tk_{j}}{\sqrt{k}}
\end{align*}


Every input vector x_i is used in three different ways in the self-attention operation:

1) It is compared to every other vector to establish the weights for its own output 𝐲_i (this role is the query, q_i).

2) It is compared to every other vector to establish the weights for the output of the j-th vector 𝐲_j (this role is the key, k_i).

3) It is used as part of the weighted sum to compute each output vector once the weights have been established (this role is the value, v_i).

In [None]:
import torch
from torch import nn
import torch.nn.functional as F

class SelfAttention(nn.Module):
  """Multi-head scaled dot-product self-attention.

  Every head sees the full ``k``-dimensional input: the query, key and value
  projections each map ``k -> k * heads``, and ``unifyheads`` projects the
  concatenated head outputs back down to ``k``.

  Args:
    k: Size of each input (and output) vector.
    heads: Number of attention heads.
    mask: If True, position ``i`` may only attend to positions ``<= i``
      (needed for autoregressive text generation). Defaults to False.
  """

  def __init__(self, k, heads=8, mask=False):
    super().__init__()
    self.k, self.heads, self.mask = k, heads, mask
    # These compute the queries, keys and values for all
    # heads (as a single concatenated vector)
    self.tokeys = nn.Linear(k, k * heads, bias=False)
    self.toqueries = nn.Linear(k, k * heads, bias=False)
    self.tovalues = nn.Linear(k, k * heads, bias=False)
    # This unifies the outputs of the different heads into
    # a single k-vector
    self.unifyheads = nn.Linear(heads * k, k)

  @staticmethod
  def _future_positions(t, device):
    """Return a (t, t) boolean matrix that is True strictly above the diagonal."""
    idx = torch.arange(t, device=device)
    return idx[None, :] > idx[:, None]

  def forward(self, x):
    """Apply self-attention to ``x``.

    Args:
      x: Tensor of size (b, t, k).

    Returns:
      Tensor of size (b, t, k).
    """
    b, t, k = x.size()
    h = self.heads

    queries = self.toqueries(x).view(b, t, h, k)
    keys = self.tokeys(x).view(b, t, h, k)
    values = self.tovalues(x).view(b, t, h, k)

    # - fold heads into the batch dimension
    keys = keys.transpose(1, 2).contiguous().view(b * h, t, k)
    queries = queries.transpose(1, 2).contiguous().view(b * h, t, k)
    values = values.transpose(1, 2).contiguous().view(b * h, t, k)

    # Scaling queries and keys by k**(1/4) each is equivalent to dividing the
    # dot product by sqrt(k), but keeps the intermediate values smaller.
    queries = queries / (k ** (1 / 4))
    keys = keys / (k ** (1 / 4))

    # - dot has size (b*h, t, t) containing raw weights
    dot = torch.bmm(queries, keys.transpose(1, 2))

    if self.mask:
      # Disable everything above the diagonal before the softmax, so that an
      # element cannot look forward into the sequence.
      dot = dot.masked_fill(self._future_positions(t, dot.device), float('-inf'))

    # - dot now contains row-wise normalized weights
    dot = F.softmax(dot, dim=2)

    out = torch.bmm(dot, values).view(b, h, t, k)
    # Swap h, t back so that the head dimension and the embedding dimension
    # are next to each other, then project back down to k dimensions.
    out = out.transpose(1, 2).contiguous().view(b, t, h * k)
    return self.unifyheads(out)

  def forward_einsum(self, x):
    """Same computation as ``forward``, expressed with ``torch.einsum``.

    Args:
      x: Tensor of size (b, t, e).

    Returns:
      Tensor of size (b, t, e).
    """
    b, t, e = x.size()
    h = self.heads

    keys = self.tokeys(x).view(b, t, h, e)
    queries = self.toqueries(x).view(b, t, h, e)
    values = self.tovalues(x).view(b, t, h, e)

    # dot[b, h, t, i] = <query at position t, key at position i> / sqrt(e)
    dot = torch.einsum('bthe,bihe->bhti', queries, keys) / (e ** 0.5)
    if self.mask:
      dot = dot.masked_fill(self._future_positions(t, dot.device), float('-inf'))
    dot = F.softmax(dot, dim=-1)

    out = torch.einsum('bhtd,bdhe->bthe', dot, values)

    # The reshape of the weights could be moved to __init__; it is kept here
    # so the method can be compared line by line with ``forward``.
    out = torch.einsum('bthe,khe->btk', out, self.unifyheads.weight.view(e, h, e))
    return out + self.unifyheads.bias

In [None]:
class TransformerBlock(nn.Module):
  """One transformer block.

  The block applies, in sequence: a self-attention layer, layer
  normalization, a feed-forward layer (a single MLP applied independently to
  each vector), and another layer normalization. A residual connection is
  added around both the attention and the feed-forward layer, before each
  normalization.

  Args:
    k: Size of each input (and output) vector.
    heads: Number of attention heads.
  """

  def __init__(self, k, heads):
    super().__init__()

    self.attention = SelfAttention(k, heads=heads)

    self.norm1 = nn.LayerNorm(k)
    self.norm2 = nn.LayerNorm(k)

    # Hidden layer is 4x wider than the input/output.
    self.ff = nn.Sequential(
        nn.Linear(k, 4 * k),
        nn.ReLU(),
        nn.Linear(4 * k, k))

  def forward(self, x):
    """Map a (b, t, k) tensor to a (b, t, k) tensor."""
    attended = self.attention(x)
    # Residual connection around the attention layer.
    x = self.norm1(attended + x)

    fedforward = self.ff(x)

    # Residual connection around the feed-forward layer.
    return self.norm2(fedforward + x)

In [None]:
# Data Reading and Pre-Processing    for REFERENCE visit (https://github.com/pbloem/former/blob/master/experiments/classify.py)

In [None]:
# Data Reading and Pre-Processing

In [None]:
# Data Reading and Pre-Processing

In [None]:
# PRE-TRAINED EMBEDDINGS
# Builds `embedding_matrix`, one row per vocabulary index, from a FastText model.
# NOTE(review): `vocab_size`, `emb_dim`, `t_1` and `FastText` are not defined in
# the visible cells -- they must be provided by an earlier cell before this runs.
import numpy as np

embedding_matrix = np.zeros((vocab_size, emb_dim))
# NOTE(review): the path mentions both cc.en.300 and cc.en.100 -- confirm that
# the vector width of the loaded model matches `emb_dim`.
model = FastText.load_fasttext_format('C:/Users/Admin/Downloads/FILES/Model/cc.en.300.bin/cc.en.100.bin')
for key, value in t_1.word_index.items():
    # `value` is the integer index of the word `key`; it selects the row to fill.
    embedding_matrix[value] = model.wv[key]

In [None]:
# CUSTOM EMBEDDINGS
# NOTE: everything between the triple quotes below is a bare string literal, so
# none of it is executed. It is a disabled alternative to the pre-trained
# embeddings cell: it reads a text file with one word followed by its
# coefficients per line into `embeddings_index`, then copies the vectors of the
# words in `t_1.word_index` into a (vocab_size, 100) `embedding_matrix`.
'''
embeddings_index = dict()
f = open('C:/Users/Admin/Downloads/FILES/Model/Script/final1.txt',encoding='utf8')
for line in f:
	values = line.split()
	word = values[0]
	coefs = asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

embedding_matrix = zeros((vocab_size, 100))
for word, i in t_1.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector
'''    

In [None]:
# NOTE: the helper below is wrapped in a bare string literal, so it is never
# defined or executed. It sketches how to build an nn.Embedding from
# `embedding_matrix` (loading it as the layer's 'weight') and optionally freeze
# it by setting `requires_grad = False`.
'''
def create_emb_layer(embedding_matrix, non_trainable=False):
    num_embeddings, embedding_dim = embedding_matrix.size()
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': embedding_matrix})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim
'''    

In [None]:
class Transformer(nn.Module):
  """Transformer for sequence classification.

  Token and position embeddings are summed, passed through ``depth``
  transformer blocks, average-pooled over the sequence and projected to
  class log-probabilities.

  Args:
    k: Embedding dimension used throughout the model.
    heads: Number of attention heads per block.
    depth: Number of transformer blocks.
    seq_length: Maximum sequence length (size of the position embedding).
    num_tokens: Vocabulary size; used for the token embedding when no
      pre-trained matrix is available.
    num_classes: Number of output classes.
    pretrained_embeddings: Optional (num_embeddings, k) matrix (tensor or
      array-like) used to initialise the token embedding. When omitted, a
      module-level ``embedding_matrix`` is used if one exists; otherwise the
      token embedding is randomly initialised.

  Raises:
    ValueError: If the pre-trained matrix is not 2-D with ``k`` columns.
  """

  def __init__(self, k, heads, depth, seq_length, num_tokens, num_classes,
               pretrained_embeddings=None):
    super().__init__()
    self.num_tokens = num_tokens

    if pretrained_embeddings is None:
      # Notebook behaviour: fall back to the matrix built in the embeddings
      # cell, if that cell has been run.
      pretrained_embeddings = globals().get('embedding_matrix')

    if pretrained_embeddings is None:
      self.token_emb = nn.Embedding(num_tokens, k)
    else:
      # as_tensor accepts NumPy arrays as well as tensors; clone so that
      # training never writes into the caller's matrix.
      weights = torch.as_tensor(pretrained_embeddings, dtype=torch.float).detach().clone()
      if weights.dim() != 2 or weights.size(1) != k:
        raise ValueError(
            f'pretrained embeddings must have shape (num_embeddings, {k}), '
            f'got {tuple(weights.shape)}')
      # freeze=False keeps the pre-trained vectors trainable.
      self.token_emb = nn.Embedding.from_pretrained(weights, freeze=False)

    self.num_embeddings = self.token_emb.num_embeddings
    self.embedding_dim = self.token_emb.embedding_dim

    self.pos_emb = nn.Embedding(seq_length, k)

    # The sequence of transformer blocks that does all the
    # heavy lifting
    tblocks = [TransformerBlock(k=k, heads=heads) for _ in range(depth)]
    self.tblocks = nn.Sequential(*tblocks)

    # Maps the final output sequence to class logits
    self.toprobs = nn.Linear(k, num_classes)

  def forward(self, x):
    """
    :param x: A (b, t) tensor of integer values representing
              words (in some predetermined vocabulary).
    :return: A (b, c) tensor of log-probabilities over the
             classes (where c is the nr. of classes).
    """
    # generate token embeddings
    tokens = self.token_emb(x)
    b, t, k = tokens.size()

    # generate position embeddings (on the same device as the input)
    positions = torch.arange(t, device=x.device)
    positions = self.pos_emb(positions)[None, :, :].expand(b, t, k)

    x = tokens + positions
    x = self.tblocks(x)

    # Average-pool over the t dimension and project to class
    # probabilities
    x = self.toprobs(x.mean(dim=1))
    return F.log_softmax(x, dim=1)

In [None]:
# Model size: attention heads per block and number of stacked blocks.
num_heads, depth = 8, 6

# Training schedule.
batch_size, num_epochs = 16, 10

# Optimiser settings. The scheduler divides `lr_warmup` by `batch_size` to get
# the number of steps over which the learning rate ramps up to `lr`.
lr = 1e-4
lr_warmup = 10_000

# Maximum gradient norm passed to clip_grad_norm_; clipping is skipped when
# this is not positive.
gradient_clipping = 1.0

In [None]:
# Build the classifier, move it to the GPU when one is available, and create
# the optimiser plus a linear learning-rate warm-up schedule.
# NOTE(review): `former`, `embedding_size`, `mx`, `NUM_CLS` and `max_pool` are
# not defined in the visible cells -- confirm an earlier cell provides them.
model = former.CTransformer(
    emb=embedding_size,
    heads=num_heads,
    depth=depth,
    seq_length=mx,
    num_tokens=vocab_size,
    num_classes=NUM_CLS,
    max_pool=max_pool,
)
if torch.cuda.is_available():
  model.cuda()

opt = torch.optim.Adam(params=model.parameters(), lr=lr)
# The multiplier grows linearly from 0 to 1 over (lr_warmup / batch_size)
# scheduler steps, then stays at 1.
sch = torch.optim.lr_scheduler.LambdaLR(
    opt,
    lambda i: min(i / (lr_warmup / batch_size), 1.0),
)

In [None]:
def train():
    """Train the module-level `model` for `num_epochs` epochs, evaluating after each.

    Reads the module-level `model`, `opt`, `sch`, `train_iter`, `test_iter`,
    `mx`, `gradient_clipping`, `tbw` and `arg`. Every epoch runs one pass over
    `train_iter` with gradient updates, then one pass over `test_iter` without
    gradients. Training loss, evaluation loss and accuracy are reported via
    `tbw` and `print`.
    """
    seen = 0
    for e in range(num_epochs):

        print(f'\n epoch {e}')
        model.train(True)

        for batch in tqdm.tqdm(train_iter):

            opt.zero_grad()

            inputs = batch.text[0]
            # presumably labels are 1-based in the data; shift to 0-based -- TODO confirm
            label = batch.label - 1

            # Truncate sequences longer than the maximum the model supports.
            if inputs.size(1) > mx:
                inputs = inputs[:, :mx]
            out = model(inputs)
            loss = F.nll_loss(out, label)

            loss.backward()

            # clip gradients
            # - If the total gradient vector has a length > gradient_clipping,
            #   we clip it back down to gradient_clipping.
            if gradient_clipping > 0.0:
                nn.utils.clip_grad_norm_(model.parameters(), gradient_clipping)

            opt.step()
            sch.step()

            seen += inputs.size(0)
            tbw.add_scalar('classification/train-loss', float(loss.item()), seen)

        with torch.no_grad():

            model.train(False)
            tot, cor, test_loss = 0.0, 0.0, 0.0

            for batch in test_iter:

                inputs = batch.text[0]
                label = batch.label - 1

                if inputs.size(1) > mx:
                    inputs = inputs[:, :mx]
                out = model(inputs)

                # Sum (not mean) per batch so the epoch average is weighted by
                # the number of instances rather than the number of batches.
                test_loss += float(F.nll_loss(out, label, reduction='sum').item())
                tot += float(inputs.size(0))
                cor += float((label == out.argmax(dim=1)).sum().item())

            if tot == 0:
                # Nothing to evaluate on; avoid dividing by zero.
                print('-- no evaluation batches, skipping accuracy')
                continue

            acc = cor / tot
            print(f'-- {"test" if arg.final else "validation"} accuracy {acc:.3}')
            # Log the loss measured on the evaluation data, not the last
            # training batch's loss.
            tbw.add_scalar('classification/test-loss', test_loss / tot, e)


train()            