<a href="https://colab.research.google.com/github/shakib1364/Transfer-Learning-on-Pig-s-White-Blood-Cell/blob/main/Self_Attention_from_Scretch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# prompt: write a self attention code from scratch

import tensorflow as tf

class SelfAttention(tf.keras.layers.Layer):
  """Multi-head attention layer built from four Dense projections.

  Queries, keys and values are each projected to ``num_hiddens`` features,
  split into ``num_heads`` heads by ``transpose_qkv``, passed through
  ``DotProductAttention``, merged again by ``transpose_output`` and finally
  projected by ``W_o``.

  Args:
    num_hiddens: Output width of every Dense projection. Must be a multiple
      of ``num_heads`` because the head split uses floor division.
    num_heads: Number of attention heads.
    dropout: Dropout rate handed to ``DotProductAttention``.
    bias: Whether the Dense projections use a bias term.

  Raises:
    ValueError: If ``num_heads`` is not positive or does not divide
      ``num_hiddens``.
  """

  def __init__(self, num_hiddens, num_heads, dropout, bias=False):
    super(SelfAttention, self).__init__()
    # Fail early: otherwise the mismatch only surfaces later as a reshape
    # error (or a silently wrong batch size) inside transpose_qkv.
    if num_heads <= 0 or num_hiddens % num_heads != 0:
      raise ValueError(
          f"num_hiddens ({num_hiddens}) must be a positive multiple of "
          f"num_heads ({num_heads})")
    self.num_hiddens = num_hiddens
    self.num_heads = num_heads
    self.attention = DotProductAttention(dropout)
    self.W_q = tf.keras.layers.Dense(num_hiddens, use_bias=bias)
    self.W_k = tf.keras.layers.Dense(num_hiddens, use_bias=bias)
    self.W_v = tf.keras.layers.Dense(num_hiddens, use_bias=bias)
    self.W_o = tf.keras.layers.Dense(num_hiddens, use_bias=bias)

  def call(self, queries, keys, values, valid_lens):
    """Project, split into heads, attend, merge heads and project again.

    Args:
      queries, keys, values: Rank-3 inputs; the head split reads axes 1 and 2
        of each projected tensor.
      valid_lens: Forwarded unchanged to ``DotProductAttention``.

    Returns:
      The output of ``W_o`` applied to the re-merged heads.
    """
    queries = transpose_qkv(self.W_q(queries), self.num_heads)
    keys = transpose_qkv(self.W_k(keys), self.num_heads)
    values = transpose_qkv(self.W_v(values), self.num_heads)
    output = self.attention(queries, keys, values, valid_lens)
    output_concat = transpose_output(output, self.num_heads)
    return self.W_o(output_concat)
class DotProductAttention(tf.keras.layers.Layer):
  """Scaled dot-product attention with dropout on the attention weights.

  Args:
    dropout: Rate passed to the internal ``tf.keras.layers.Dropout``.
  """

  def __init__(self, dropout):
    super(DotProductAttention, self).__init__()
    self.dropout = tf.keras.layers.Dropout(dropout)

  def call(self, queries, keys, values, valid_lens=None):
    """Return softmax(Q K^T / sqrt(d)) V.

    Args:
      queries, keys, values: Tensors whose last two axes are multiplied with
        ``tf.matmul``; ``d`` is the static size of the last axis of
        ``queries``.
      valid_lens: Optional term that is *added* to the scaled scores before
        the softmax. Despite the name it is used as an additive mask, not as
        a vector of lengths. ``None`` disables masking.

    Returns:
      The attention-weighted combination of ``values``.
    """
    d = queries.shape[-1]
    scores = tf.matmul(queries, keys, transpose_b=True)
    # Cast the scale to the scores' dtype so non-float32 inputs (e.g. float16
    # or float64) do not hit a dtype mismatch in the division.
    scores = scores / tf.math.sqrt(tf.cast(d, scores.dtype))
    if valid_lens is not None:
      scores = scores + valid_lens
    attention_weights = tf.nn.softmax(scores, axis=-1)
    attention_weights = self.dropout(attention_weights)
    return tf.matmul(attention_weights, values)
def transpose_qkv(X, num_heads):
  """Split the last axis of ``X`` into heads and move the head axis to position 1.

  ``X`` is read as a rank-3 tensor: axis 1 is kept as is and axis 2 is divided
  (floor division) into ``num_heads`` equal chunks. The result has axes
  ordered (leading, head, axis-1, per-head width).
  """
  kept_axis, width = X.shape[1], X.shape[2]
  per_head = width // num_heads
  split = tf.reshape(X, (-1, kept_axis, num_heads, per_head))
  return tf.transpose(split, perm=[0, 2, 1, 3])
def transpose_output(X, num_heads):
  """Swap axes 1 and 2 of a rank-4 tensor, then fold the last axis into a width of ``last * num_heads``.

  NOTE(review): presumably ``num_heads`` equals the size of axis 1 of the
  incoming tensor (the head axis) -- confirm against callers.
  """
  swapped = tf.transpose(X, perm=[0, 2, 1, 3])
  kept_axis = swapped.shape[1]
  merged_width = swapped.shape[3] * num_heads
  return tf.reshape(swapped, (-1, kept_axis, merged_width))


In [None]:
import numpy as np
import math

# Toy sizes: number of rows (L) and the widths of the query/key and value rows.
L, d_k, d_v = 4, 8, 8
# Standard-normal query, key and value matrices, one row per position.
q = np.random.randn(L, d_k)
k = np.random.randn(L, d_k)
v = np.random.randn(L, d_v)

# "\n" (not "/n") so each matrix starts on the line after its label.
print("Q\n", q)
print("K\n", k)
print("V\n", v)


### Why do we need self-attention?

In [None]:
# Unscaled scores: entry [i, j] is the dot product of query row i with key row j.
q @ k.T

In [None]:
# Variance of the queries, of the keys, and of their unscaled dot products.
np.var(q), np.var(k), np.var(q @ k.T)

In [None]:
# Divide the raw scores by sqrt(d_k), then compare the variances again.
scaled = (q @ k.T) / math.sqrt(d_k)
np.var(q), np.var(k), np.var(scaled)

(0.7327166434320771, 0.9096103414889943, 0.40459314499420407)

### **Masking**
- This is to ensure words don't get context from words generated in the future.
- Not required in the encoders but required in the decoders.

In [None]:
# L x L float matrix with ones on and below the diagonal and zeros above it.
mask = np.tri(L)
mask

In [None]:
# Convert the 0/1 matrix in place into an additive mask: 0 entries become -inf
# and 1 entries become 0. Order matters: the zeros must be rewritten first,
# otherwise the freshly written 0s would be turned into -inf as well.
# np.inf is used because the np.infty alias was removed in NumPy 2.0.
mask[mask == 0] = -np.inf
mask[mask == 1] = 0
mask

In [None]:
# Scaled scores with the additive mask applied element-wise.
np.add(scaled, mask)

In [None]:
def softmax(x):
  """Return the softmax of ``x`` computed over its last axis.

  Works for arrays of any rank >= 1; every slice along the last axis sums
  to 1. Entries equal to ``-inf`` receive weight 0 as long as the slice also
  contains a finite value.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; keep the shape and return a float array.
    return np.exp(x)
  # Subtracting the per-slice maximum does not change the result
  # mathematically but keeps np.exp from overflowing to inf, which would
  # otherwise turn the whole slice into nan (inf / inf).
  shifted = x - np.max(x, axis=-1, keepdims=True)
  weights = np.exp(shifted)
  return weights / np.sum(weights, axis=-1, keepdims=True)

# Attention weights: softmax over the last axis of the masked, scaled scores.
attention = softmax(np.add(scaled, mask))
attention

In [None]:
# Each output row is the attention-weighted combination of the rows of v.
new_v = attention @ v
new_v

In [None]:
def softmax(x):
  """Return the softmax of ``x`` computed over its last axis.

  Works for arrays of any rank >= 1; every slice along the last axis sums
  to 1. Entries equal to ``-inf`` receive weight 0 as long as the slice also
  contains a finite value.
  """
  x = np.asarray(x)
  if x.size == 0:
    # Nothing to normalise; keep the shape and return a float array.
    return np.exp(x)
  # Subtracting the per-slice maximum does not change the result
  # mathematically but keeps np.exp from overflowing to inf, which would
  # otherwise turn the whole slice into nan (inf / inf).
  shifted = x - np.max(x, axis=-1, keepdims=True)
  weights = np.exp(shifted)
  return weights / np.sum(weights, axis=-1, keepdims=True)

def scaled_dot_product_attention(q, k, v, mask=None):
  """Compute softmax(q k^T / sqrt(d_k) + mask) v.

  Args:
    q: Queries with shape (..., L_q, d_k).
    k: Keys with shape (..., L_k, d_k).
    v: Values with shape (..., L_k, d_v).
    mask: Optional array added to the scaled scores before the softmax
      (0 keeps an entry, -inf removes it). Must broadcast against
      (..., L_q, L_k).

  Returns:
    A tuple ``(out, attention)``: the weighted values with shape
    (..., L_q, d_v) and the attention weights with shape (..., L_q, L_k).
  """
  d_k = q.shape[-1]
  # Swap only the last two axes of k; a plain ``.T`` reverses every axis and
  # therefore breaks as soon as a leading batch dimension is present.
  scaled = np.matmul(q, np.swapaxes(k, -1, -2)) / math.sqrt(d_k)
  if mask is not None:
    scaled = scaled + mask
  attention = softmax(scaled)
  out = np.matmul(attention, v)
  return out, attention

In [None]:
# Run masked attention on the toy matrices and show inputs next to outputs.
values, attention = scaled_dot_product_attention(q, k, v, mask=mask)
# "\n" (not "/n") so each matrix starts on the line after its label,
# matching the "New V" and "Attention" labels below.
print("Q\n", q)
print("K\n", k)
print("V\n", v)
print("New V\n", values)
print("Attention\n", attention)

Q/n [[-0.22725209  0.19017038 -1.25114645 -1.60011654  0.84150335  1.01990211
   0.34129021  0.00248908]
 [-0.20413995  1.59576592  0.54901898 -0.04465936 -0.43112762 -0.87721749
   0.09576519  0.91020876]
 [-0.28440536  1.10386939  0.00492604 -1.16604145 -0.30778306 -1.73889011
   0.9162179   1.30897592]
 [-0.19456469 -0.16751012  0.62563674  0.46679993 -1.24958566  1.19980282
   0.16474196  0.92063551]]
K/n [[-1.31579679 -0.45811179 -0.51386137  0.91222848  0.55710378 -0.6549167
   0.8844657   0.06065584]
 [ 0.85422319 -0.47238994  0.34522987  0.82301987  0.7667225  -0.47701927
  -0.87768361  0.82211111]
 [ 0.58533978  1.46084103  0.4575586   0.30232704  0.99251451 -1.35205815
  -1.552387   -2.26042259]
 [ 1.72204824 -0.30073142 -0.42415881 -1.70990408  0.56949191  0.39512841
  -0.16085376  0.8839436 ]]
V/n [[-0.68709021  1.86512806 -0.36518021  0.04592316 -0.55808395 -0.81443247
  -0.07224199  0.30840532]
 [-0.75227043 -1.37519284 -0.15201227  0.04156362  0.4868172   1.3067235
  -1.

### **Multi-Head Attention**



In [None]:
# prompt: Write down MultiHead attention code of a ViT model

class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention layer built from four Dense projections.

  Queries, keys and values are each projected to ``num_hiddens`` features,
  split into ``num_heads`` heads by ``transpose_qkv``, passed through
  ``DotProductAttention``, merged again by ``transpose_output`` and finally
  projected by ``W_o``.

  Args:
    key_size, query_size, value_size: Accepted for signature compatibility
      but never read by this class.
    num_hiddens: Output width of every Dense projection. Must be a multiple
      of ``num_heads`` because the head split uses floor division.
    num_heads: Number of attention heads.
    dropout: Dropout rate handed to ``DotProductAttention``.
    bias: Whether the Dense projections use a bias term.

  Raises:
    ValueError: If ``num_heads`` is not positive or does not divide
      ``num_hiddens``.
  """

  def __init__(self, key_size, query_size, value_size, num_hiddens, num_heads, dropout, bias=False):
    super(MultiHeadAttention, self).__init__()
    # Fail early: otherwise the mismatch only surfaces later as a reshape
    # error (or a silently wrong batch size) inside transpose_qkv.
    if num_heads <= 0 or num_hiddens % num_heads != 0:
      raise ValueError(
          f"num_hiddens ({num_hiddens}) must be a positive multiple of "
          f"num_heads ({num_heads})")
    self.num_heads = num_heads
    self.attention = DotProductAttention(dropout)
    self.W_q = tf.keras.layers.Dense(num_hiddens, use_bias=bias)
    self.W_k = tf.keras.layers.Dense(num_hiddens, use_bias=bias)
    self.W_v = tf.keras.layers.Dense(num_hiddens, use_bias=bias)
    self.W_o = tf.keras.layers.Dense(num_hiddens, use_bias=bias)

  def call(self, queries, keys, values, valid_lens):
    """Project, split into heads, attend, merge heads and project again.

    Args:
      queries, keys, values: Rank-3 inputs; the head split reads axes 1 and 2
        of each projected tensor.
      valid_lens: Forwarded unchanged to ``DotProductAttention``.

    Returns:
      The output of ``W_o`` applied to the re-merged heads.
    """
    queries = transpose_qkv(self.W_q(queries), self.num_heads)
    keys = transpose_qkv(self.W_k(keys), self.num_heads)
    values = transpose_qkv(self.W_v(values), self.num_heads)
    output = self.attention(queries, keys, values, valid_lens)
    output_concat = transpose_output(output, self.num_heads)
    return self.W_o(output_concat)


In [3]:
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

# Toy input: one batch of four positions, each a 512-wide vector; the model
# width is set to the same value as the input width.
batch_size, sequence_length = 1, 4
input_dim = d_model = 512
x = torch.randn(batch_size, sequence_length, input_dim)

In [5]:
# Shape of the input tensor: (batch_size, sequence_length, input_dim).
x.shape

torch.Size([1, 4, 512])

In [6]:
# One linear layer mapping input_dim features to 3 * d_model features per
# position -- presumably the query, key and value projections stacked along
# the last axis (confirm against the code that later splits ``qkv``).
qkv_layer = nn.Linear(in_features=input_dim, out_features=3 * d_model)
qkv = qkv_layer(x)

In [8]:
!

sample_data
