In [52]:
import torch
import matplotlib
import torch.nn as nn
import torch.nn.functional as F

In [None]:
# importing and saving the raw data 
import requests

url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
response = requests.get(url)


with open("input.txt", "w", encoding="utf-8") as f:
    f.write(response.text)

In [82]:
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [19]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(vocab_size)
print(''.join(chars))

65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [21]:
#encoding and decoding
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s : [stoi[i] for i in s]
decode = lambda l : [''.join(itos[i] for i in l)]


In [23]:
a = encode('Hello world')
b = decode(a)
print(a,b)

[20, 43, 50, 50, 53, 1, 61, 53, 56, 50, 42] ['Hello world']


In [30]:
data = torch.tensor(encode(text),dtype=torch.long)
print(data.shape,data.dtype)
data[:100]


torch.Size([1115394]) torch.int64


tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [32]:
n = int(0.9*len(data))
train_data = data[:n]
val_data = data[n:]


In [39]:
block_size = 8
x = train_data[:block_size]
y = train_data[1:block_size+1]
for i in range(block_size):
    context = x[:i+1]
    target = y[i]
    print(context,target)

tensor([18]) tensor(47)
tensor([18, 47]) tensor(56)
tensor([18, 47, 56]) tensor(57)
tensor([18, 47, 56, 57]) tensor(58)
tensor([18, 47, 56, 57, 58]) tensor(1)
tensor([18, 47, 56, 57, 58,  1]) tensor(15)
tensor([18, 47, 56, 57, 58,  1, 15]) tensor(47)
tensor([18, 47, 56, 57, 58,  1, 15, 47]) tensor(58)


In [None]:
torch.manual_seed(1337)
batch_size = 4
block_size = 8

def get_split(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size,(batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+1+block_size] for i in ix])
    return x , y

xb,yb = get_split('train')

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b,:t+1]
        target = yb[b,t]
        print(f'for input {context.tolist()} -> output {target.item()}')
    print('----------------------')



for input [24] -> output 43
for input [24, 43] -> output 58
for input [24, 43, 58] -> output 5
for input [24, 43, 58, 5] -> output 57
for input [24, 43, 58, 5, 57] -> output 1
for input [24, 43, 58, 5, 57, 1] -> output 46
for input [24, 43, 58, 5, 57, 1, 46] -> output 43
for input [24, 43, 58, 5, 57, 1, 46, 43] -> output 39
----------------------
for input [44] -> output 53
for input [44, 53] -> output 56
for input [44, 53, 56] -> output 1
for input [44, 53, 56, 1] -> output 58
for input [44, 53, 56, 1, 58] -> output 46
for input [44, 53, 56, 1, 58, 46] -> output 39
for input [44, 53, 56, 1, 58, 46, 39] -> output 58
for input [44, 53, 56, 1, 58, 46, 39, 58] -> output 1
----------------------
for input [52] -> output 58
for input [52, 58] -> output 1
for input [52, 58, 1] -> output 58
for input [52, 58, 1, 58] -> output 46
for input [52, 58, 1, 58, 46] -> output 39
for input [52, 58, 1, 58, 46, 39] -> output 58
for input [52, 58, 1, 58, 46, 39, 58] -> output 1
for input [52, 58, 1, 58, 

torch.Size([])

In [None]:
torch.manual_seed(1337)
class BiagramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self,idx,target):
        logits = self.token_embedding_table(idx)
        B,T,C = logits.shape
        logits = logits.view(B*T,C)
        target = target.view(B*T)
        loss = F.cross_entropy(logits,target)

        return logits,loss

m = BiagramLanguageModel(vocab_size)

logits,loss = m.forward(xb,yb)
print(loss)
print(logits.shape)
    

tensor(4.8786, grad_fn=<NllLossBackward0>)
torch.Size([32, 65])


In [59]:
optimizer = torch.optim.AdamW(m.parameters(),lr=0.001)

In [81]:
batch_size = 32
for _ in range(1000):
    xb,yb = get_split('train')
    logits,loss = m.forward(xb,yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
print(loss)




tensor(2.4135, grad_fn=<NllLossBackward0>)


In [99]:
torch.manual_seed(1337)
class BiagramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self,idx,target):
        logits = self.token_embedding_table(idx)
        B,T,C = logits.shape
        logits = logits.view(B*T,C)
        target = target.view(B*T)
        loss = F.cross_entropy(logits,target)
        # print(logits.shape)

        return logits,loss

m = BiagramLanguageModel()
logits, loss = m.forward(xb,yb)


In [101]:
B,T,C = 4,8,32
torch.zeros(T,T)

tensor([[0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [119]:
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros(T,T)
wei = wei.masked_fill(tril == 0 , float('-inf'))
wei

tensor([[0., -inf, -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., -inf, -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., -inf, -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., -inf, -inf, -inf, -inf],
        [0., 0., 0., 0., 0., -inf, -inf, -inf],
        [0., 0., 0., 0., 0., 0., -inf, -inf],
        [0., 0., 0., 0., 0., 0., 0., -inf],
        [0., 0., 0., 0., 0., 0., 0., 0.]])

In [121]:
wei = F.softmax(wei,dim =-1)
wei

tensor([[0.2797, 0.1029, 0.1029, 0.1029, 0.1029, 0.1029, 0.1029, 0.1029],
        [0.1773, 0.1773, 0.1076, 0.1076, 0.1076, 0.1076, 0.1076, 0.1076],
        [0.1519, 0.1519, 0.1519, 0.1089, 0.1089, 0.1089, 0.1089, 0.1089],
        [0.1405, 0.1405, 0.1405, 0.1405, 0.1095, 0.1095, 0.1095, 0.1095],
        [0.1341, 0.1341, 0.1341, 0.1341, 0.1341, 0.1098, 0.1098, 0.1098],
        [0.1300, 0.1300, 0.1300, 0.1300, 0.1300, 0.1300, 0.1100, 0.1100],
        [0.1271, 0.1271, 0.1271, 0.1271, 0.1271, 0.1271, 0.1271, 0.1102],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])

In [126]:
x = torch.randn(B,T,C)
out = wei @ x
out.shape

torch.Size([4, 8, 32])

In [155]:
head_size = 16
x = torch.randn(B,T,C)
query = nn.Linear(C,head_size,bias=None)
key = nn.Linear(C,head_size,bias=None)
value = nn.Linear(C,head_size,bias=None)
q = query(x)
k = key(x)
v = value(x)
wei = q @ k.transpose(-1,-2) * (head_size ** -0.5) 
tril = torch.tril(torch.ones(T,T))
wei = wei.masked_fill(tril == 0,float('-inf'))
wei = F.softmax(wei,dim = -1)
out = wei @ v
out[0]




tensor([[-0.6841, -0.5638, -0.2279,  0.4443,  0.2718,  0.2982, -0.5557, -0.2690,
          0.0644,  0.0468, -0.7681, -0.4201, -0.8925, -1.8488,  0.1754,  0.3729],
        [-0.0793, -0.2409, -0.3245,  0.4120,  0.0099,  0.2230, -0.1024, -0.1015,
          0.0300,  0.2737, -0.2424,  0.2448, -0.4092, -0.8259, -0.2421, -0.1070],
        [-0.0354, -0.1061, -0.1841,  0.2840,  0.0862,  0.0334, -0.0048,  0.1523,
         -0.0907,  0.2187,  0.1581,  0.2854,  0.0535, -0.4993, -0.4435, -0.2363],
        [-0.2628, -0.1917, -0.0780,  0.2148, -0.0968, -0.0111, -0.0878,  0.0442,
          0.0114,  0.0674,  0.2046,  0.0160, -0.0105, -0.4364, -0.2321,  0.0325],
        [-0.0619, -0.1428, -0.0903,  0.1182, -0.2610,  0.0310, -0.1157, -0.0066,
          0.0284,  0.0410,  0.1522,  0.1522, -0.1466, -0.3224, -0.2496,  0.0220],
        [-0.2027, -0.1387, -0.0308,  0.0485, -0.3569,  0.1092, -0.0804, -0.1644,
          0.1587,  0.0290,  0.1002,  0.1984, -0.2671, -0.2561, -0.2712,  0.0610],
        [-0.4286, -0.2

In [156]:
wei[0]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5809, 0.4191, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3558, 0.3724, 0.2718, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2562, 0.2877, 0.2246, 0.2315, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1772, 0.2828, 0.1995, 0.1561, 0.1844, 0.0000, 0.0000, 0.0000],
        [0.1505, 0.2006, 0.1971, 0.1699, 0.1553, 0.1267, 0.0000, 0.0000],
        [0.1023, 0.0937, 0.1447, 0.2979, 0.1256, 0.0694, 0.1665, 0.0000],
        [0.1559, 0.0992, 0.1109, 0.1041, 0.1245, 0.1653, 0.1415, 0.0986]],
       grad_fn=<SelectBackward0>)