In [1]:
import torch

In [1]:
# We always start with a dataset to train on. Let's download the tiny shakespeare dataset
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2024-04-02 21:11:00--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8000::154, 2606:50c0:8001::154, 2606:50c0:8002::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8000::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-04-02 21:11:00 (13.2 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [2]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
# let's look at the first 1000 characters
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [4]:
chars = sorted(set(text))
print(chars)

['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [5]:
vocab_size = len(chars)
vocab_size

65

In [6]:
string_to_int = {}
int_to_string = {}

for i,ch in enumerate(chars):
    int_to_string[i] = ch
    string_to_int[ch] = i

In [7]:
def encode(str_data):
    encoded_data = []
    for d in str_data:
        encoded_data.append(string_to_int[d])
    return encoded_data
        

In [8]:
encode('hi \n')


[46, 47, 1, 0]

In [9]:
all_text_encoded = encode(text)
data = torch.tensor(all_text_encoded, dtype = torch.long)

In [10]:
data.shape

torch.Size([1115394])

In [11]:
n = int(0.9* len(data))

In [12]:
train_data = data[:n]
train_data.shape

torch.Size([1003854])

In [13]:
test_data = data[n:]
test_data.shape

torch.Size([111540])

In [14]:
0.1*n

100385.40000000001

In [15]:
batch_size = 4
context_len = 8

In [16]:
def get_single_batch_data(train_or_test_str):
    if train_or_test_str == 'train':
        used_data = train_data
    else:
        used_data = test_data
    
    batch_beg_idx = torch.randint(0, len(used_data) - context_len, (batch_size,1))
    single_batch_data = torch.zeros(batch_size, context_len)
    single_batch_data_output = torch.zeros(batch_size, context_len)
    
    for i in range(batch_size):
        single_batch_data[i,:] = torch.tensor([ used_data[batch_beg_idx[i] + kk] for kk in range(0,8) ])
        single_batch_data_output[i,:] = torch.tensor([ used_data[batch_beg_idx[i] + kk] for kk in range(1,9) ])
        
    return single_batch_data, single_batch_data_output

In [17]:
get_single_batch_data('test')

(tensor([[47., 56., 12.,  0.,  0., 28., 43., 42.],
         [42.,  1., 58., 53.,  1., 39., 57., 49.],
         [47., 43., 57., 58.,  1., 47., 57.,  1.],
         [56.,  1., 19., 56., 43., 51., 47., 53.]]),
 tensor([[56., 12.,  0.,  0., 28., 43., 42., 39.],
         [ 1., 58., 53.,  1., 39., 57., 49.,  1.],
         [43., 57., 58.,  1., 47., 57.,  1., 56.],
         [ 1., 19., 56., 43., 51., 47., 53.,  6.]]))

In [18]:
# using bag of words to store past context(mean from start to present)
B, T, C = 4, 8, 2
x = torch.randn(B, T, C)
xbow = torch.zeros(B, T, C)

In [19]:
for b in range(B):
    for t in range(T):
        x_at_b_t = x[b,:t+1]
        # print(torch.mean(x_at_b_t, dim=0).shape)
        xbow[b,t,:] = torch.mean(x_at_b_t, dim=0)
        # print(x_at_b_t.shape)
        # break
    # break

In [48]:
x[0]

tensor([[ 0.6224, -0.0270],
        [ 1.4111,  0.2896],
        [-1.3121,  0.0160],
        [ 1.1322,  0.5791],
        [-0.0847,  0.3350],
        [-1.2623,  0.8922],
        [ 0.0700,  0.3648],
        [ 1.1043,  0.7109]])

In [20]:
xbow[0]

tensor([[-1.4504,  0.2132],
        [-0.7731,  0.4824],
        [-0.7209,  0.2608],
        [-0.7917,  0.2771],
        [-0.7492,  0.1327],
        [-0.2860, -0.0838],
        [-0.2514,  0.0463],
        [-0.1560,  0.0522]])

In [21]:
# efficient way to perform the above without loops
weights = torch.tril(torch.ones(T,T)) # T x T @ T x C = T X C
print(weights)
weights_norm = weights/weights.sum(1, keepdim=True)
weights_norm # T X T 
xbow_eff = weights_norm @ x # (B) X T X T @ B x T x C = B x T x C

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])


In [22]:
print(xbow[0])
print(xbow_eff[0])

tensor([[-1.4504,  0.2132],
        [-0.7731,  0.4824],
        [-0.7209,  0.2608],
        [-0.7917,  0.2771],
        [-0.7492,  0.1327],
        [-0.2860, -0.0838],
        [-0.2514,  0.0463],
        [-0.1560,  0.0522]])
tensor([[-1.4504,  0.2132],
        [-0.7731,  0.4824],
        [-0.7209,  0.2608],
        [-0.7917,  0.2771],
        [-0.7492,  0.1327],
        [-0.2860, -0.0838],
        [-0.2514,  0.0463],
        [-0.1560,  0.0522]])


In [23]:
# now using softmax
tril = torch.tril(torch.ones(T,T))
tril
weights_2 = torch.zeros(T,T)
weights_2 = weights_2.masked_fill(tril == 0, float('-inf'))
weights_2


tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [24]:
weights_2_s = torch.nn.functional.softmax(weights_2, 1)
weights_2_s

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [25]:
xbow3 = weights_2_s @ x

In [26]:
print(xbow_eff[0])
print(xbow3[0])

tensor([[-1.4504,  0.2132],
        [-0.7731,  0.4824],
        [-0.7209,  0.2608],
        [-0.7917,  0.2771],
        [-0.7492,  0.1327],
        [-0.2860, -0.0838],
        [-0.2514,  0.0463],
        [-0.1560,  0.0522]])
tensor([[-1.4504,  0.2132],
        [-0.7731,  0.4824],
        [-0.7209,  0.2608],
        [-0.7917,  0.2771],
        [-0.7492,  0.1327],
        [-0.2860, -0.0838],
        [-0.2514,  0.0463],
        [-0.1560,  0.0522]])


In [42]:
import torch.nn as nn

In [43]:
# each token has Query vec(what am I looking for?), Key vec(What do I contain)
# A token's Query(what I am looking for?) . Key of Tok1(what I contian?), . Key of Tok2(what i contain?)
# head size = len of Query/key vector of each token

# but what will be aggregrated is not just input/token directly, but value of token. A new maping

In [69]:
B, T, C = 4, 8, 2 # 4 batches, 8 tokens, 2 channels/embedding len

In [86]:
x = torch.randn(B, T, C)

In [87]:
# a hyper param for attention block
head_size = 16
# keys and querys of size = head_size for all tokens, so T x head_size
# but it is a linear mapping from (B) x C x head_size . {B} x T X C => T x C @ C x head_size = T x head_size
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
# same way, instead of direct token, another mapping from embedding input to a Value
value = nn.Linear(C, head_size, bias=False)



In [88]:
q = query(x)
k = key(x)

In [89]:
q.shape

torch.Size([4, 8, 16])

In [90]:
k.shape

torch.Size([4, 8, 16])

In [91]:
weight_mat = q @ k.transpose(-2,-1) # only last 2 dim, leaving batch size

In [92]:
weight_mat.var()

tensor(0.3302, grad_fn=<VarBackward0>)

In [93]:
weight_mat = q @ k.transpose(-2, -1) * (head_size**-2)

In [94]:
weight_mat.var()

tensor(5.0391e-06, grad_fn=<VarBackward0>)

In [95]:
weight_mat.shape

torch.Size([4, 8, 8])

In [96]:
# but future tokens can't communicate, so have to make them zero
tril = torch.tril(torch.ones(T,T))
weight_mat = weight_mat.masked_fill(tril == 0, float('-inf'))
weight_mat = torch.nn.functional.softmax(weight_mat, -1)
weight_mat[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5004, 0.4996, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3327, 0.3320, 0.3353, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2490, 0.2491, 0.2509, 0.2509, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2003, 0.2005, 0.1990, 0.1989, 0.2013, 0.0000, 0.0000, 0.0000],
        [0.1666, 0.1666, 0.1669, 0.1668, 0.1665, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1430, 0.1426, 0.1425, 0.1432, 0.1428, 0.1430, 0.0000],
        [0.1250, 0.1251, 0.1248, 0.1248, 0.1252, 0.1250, 0.1251, 0.1251]],
       grad_fn=<SelectBackward0>)

In [97]:
# final output
v = value(x)
out = weight_mat @ v
out.shape

torch.Size([4, 8, 16])

In [98]:
# 1. attention can be on any directed graph, like hopfield networks too! 
# 2. attention need not just be on past, future can also be imposed(that is called encoder block). Here it depends on past, its 
# called decoder block

In [99]:
weight_mat[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5004, 0.4996, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3327, 0.3320, 0.3353, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2490, 0.2491, 0.2509, 0.2509, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2003, 0.2005, 0.1990, 0.1989, 0.2013, 0.0000, 0.0000, 0.0000],
        [0.1666, 0.1666, 0.1669, 0.1668, 0.1665, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1430, 0.1426, 0.1425, 0.1432, 0.1428, 0.1430, 0.0000],
        [0.1250, 0.1251, 0.1248, 0.1248, 0.1252, 0.1250, 0.1251, 0.1251]],
       grad_fn=<SelectBackward0>)

In [None]:
# softmax when given input as very neg or very pos numbers, the output is very non-uniform

In [102]:
import torch.nn.functional as F
F.softmax(torch.tensor([100., 90., 80.]), dim=0) # max 8 order diff, min 4 order diff

tensor([9.9995e-01, 4.5398e-05, 2.0611e-09])

In [103]:
F.softmax(torch.tensor([100., 90., 80.])/100, dim=0) # around 0.3

tensor([0.3672, 0.3322, 0.3006])