<a href="https://colab.research.google.com/github/rahiakela/deep-learning-research-and-practice/blob/main/deep-learning-fundamentals/unit08-NLP/01-bag-of-words/02_self_attention_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Self-attention from scratch

**Reference**

[Understanding and Coding the Self-Attention Mechanism of Large Language Models From Scratch](https://sebastianraschka.com/blog/2023/self-attention-from-scratch.html)


## Setup

In [3]:
import numpy as np
import pandas as pd
import torch

from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, DataLoader

## 1) Embedding Input Sentence

In [1]:
sentence = "Life is short, eat dessert first"

# Build the vocabulary: strip commas, split on whitespace, and give every
# distinct word its rank in sorted order as its integer id.
# De-duplicating with set() before sorting keeps the ids contiguous
# (0 .. len(vocab) - 1) even when a word occurs more than once; without it a
# repeated word would consume two ranks and leave a gap in the id range.
vocab = {
    word: idx
    for idx, word in enumerate(sorted(set(sentence.replace(",", "").split())))
}
vocab

{'Life': 0, 'dessert': 1, 'eat': 2, 'first': 3, 'is': 4, 'short': 5}

In [4]:
# Integer-vector representation: look up the vocabulary id of each token,
# in the order the tokens appear in the sentence.
tokens = sentence.replace(",", "").split()
sentence_vec = torch.tensor([vocab[token] for token in tokens])
sentence_vec

tensor([0, 4, 5, 2, 1, 3])

In [5]:
torch.manual_seed(123)

# Embed the integer-vector representation.
# The embedding table needs one row per vocabulary entry, so its size is taken
# from the vocabulary instead of being hard-coded; 16 is the size of each
# embedding vector.
embedding = torch.nn.Embedding(len(vocab), 16)
# detach() returns the looked-up embeddings without autograd tracking.
embedded_sentence = embedding(sentence_vec).detach()

print(embedded_sentence.shape)
print(embedded_sentence)

torch.Size([6, 16])
tensor([[ 0.3374, -0.1778, -0.3035, -0.5880,  0.3486,  0.6603, -0.2196, -0.3792,
          0.7671, -1.1925,  0.6984, -1.4097,  0.1794,  1.8951,  0.4954,  0.2692],
        [ 0.5146,  0.9938, -0.2587, -1.0826, -0.0444,  1.6236, -2.3229,  1.0878,
          0.6716,  0.6933, -0.9487, -0.0765, -0.1526,  0.1167,  0.4403, -1.4465],
        [ 0.2553, -0.5496,  1.0042,  0.8272, -0.3948,  0.4892, -0.2168, -1.7472,
         -1.6025, -1.0764,  0.9031, -0.7218, -0.5951, -0.7112,  0.6230, -1.3729],
        [-1.3250,  0.1784, -2.1338,  1.0524, -0.3885, -0.9343, -0.4991, -1.0867,
          0.8805,  1.5542,  0.6266, -0.1755,  0.0983, -0.0935,  0.2662, -0.5850],
        [-0.0770, -1.0205, -0.1690,  0.9178,  1.5810,  1.3010,  1.2753, -0.2010,
          0.4965, -1.5723,  0.9666, -1.1481, -1.1589,  0.3255, -0.6315, -2.8400],
        [ 0.8768,  1.6221, -1.4779,  1.1331, -1.2203,  1.3139,  1.0533,  0.1388,
          2.2473, -0.8036, -0.2808,  0.7697, -0.6596, -0.7979,  0.1838,  0.2293]])


## 2) Weight Matrices

In [6]:
torch.manual_seed(123)

# size of each token's embedding vector (second axis of the embedded sentence)
d = embedded_sentence.size(1)

# Query and key vectors are compared with a dot product, so they must contain
# the same number of elements (d_q == d_k); the value size d_v may differ.
d_q = d_k = 24
d_v = 28

# Projection matrices, drawn in the order Q, K, V so that the seeded random
# stream yields the same values on every run.
W_Q = torch.nn.Parameter(torch.rand(d_q, d))
W_K = torch.nn.Parameter(torch.rand(d_k, d))
W_V = torch.nn.Parameter(torch.rand(d_v, d))

In [9]:
# shapes of the three projection matrices, in the order Q, K, V
tuple(weight.shape for weight in (W_Q, W_K, W_V))

(torch.Size([24, 16]), torch.Size([24, 16]), torch.Size([28, 16]))

## 3) Computing Attention Weights

In [None]:
# NOTE(review): X_train is not defined in any cell visible above, and the
# recorded output is a (35000, 10000) shape rather than an attention quantity.
# This cell looks like a leftover from another notebook; confirm where X_train
# comes from before running it on a fresh kernel.
X_train.shape

(35000, 10000)

In [None]:
# First row of X_train.
# NOTE(review): X_train is not defined in the cells visible above; confirm its source.
X_train[0]

<1x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 74 stored elements in Compressed Sparse Row format>

In [None]:
# Dense 1-D array holding the first row of X_train.
# Assumes X_train is a SciPy sparse matrix, as the recorded outputs suggest; TODO confirm.
X_train[0].toarray()[0]

array([0, 0, 0, ..., 0, 0, 0])

In [None]:
# Histogram of the values in the first row: result[k] is the number of entries equal to k.
# Assumes X_train is a SciPy sparse matrix, as the recorded outputs suggest; TODO confirm.
np.bincount(X_train[0].toarray().ravel())

array([9926,   67,    5,    0,    1,    0,    1])

In [None]:
# NOTE(review): X_train is not defined in the cells visible above; confirm its source.
# The recorded output is still a 2-D matrix([[...]]), so flatten() on the
# todense() result does not produce a 1-D array here.
X_train[0].todense().flatten()

matrix([[0, 0, 0, ..., 0, 0, 0]])

In [None]:
# Shape of the full matrix, read directly from X_train.
# Densifying first (np.array(X_train.todense()).shape) returns the same tuple
# but materialises every element just to read two integers.
# NOTE(review): X_train is not defined in the cells visible above; confirm its source.
X_train.shape

(35000, 10000)