In [3]:
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

This notebook just tries to predict next character. Works at character level not token level

In [1]:
# read it in to inspect it
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [2]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [3]:
print("length of text:",len(text))

length of text: 1115394


In [4]:
chars = sorted(list(set(text)))
vocab_size = len(chars)

print(vocab_size)
print(''.join(chars))

65

 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


##### We need to convert this text into sequence of numbers so that the model can understand.
- Let's build a naive tokenizer that assigns a number in the order of vocabulary

In [5]:
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}

encode = lambda s: [stoi[char] for char in s]
decode = lambda s:''.join([itos[i] for i in s])

print(encode("hello there"))
print(decode(encode("hello there")))

[46, 43, 50, 50, 53, 1, 58, 46, 43, 56, 43]
hello there


In [6]:
#tokenize input data
tokenized_text = encode(text)
import torch
data = torch.tensor(tokenized_text,dtype=torch.long)

print(data.shape,data.dtype)

torch.Size([1115394]) torch.int64


In [7]:
#split the data into train and validation
n = int(0.9*len(text))
train = data[:n]
val = data[n:]

In [8]:
torch.manual_seed(121)
batch_size = 4
block_size = 8 #context length

#get sample batch data
def get_batch(split):
    data = train if split=="train" else val
    ix = torch.randint(len(data)-block_size,(batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    
    return x,y

xb,yb = get_batch("train")
print(xb.shape,yb.shape)
print(xb)
print(yb)
    

torch.Size([4, 8]) torch.Size([4, 8])
tensor([[ 1, 57, 53,  1, 40, 53, 50, 42],
        [ 1, 46, 47, 57,  1, 46, 39, 54],
        [58,  1, 53, 44,  0, 58, 46, 63],
        [ 1, 46, 43, 56,  1, 47, 52,  1]])
tensor([[57, 53,  1, 40, 53, 50, 42,  0],
        [46, 47, 57,  1, 46, 39, 54, 54],
        [ 1, 53, 44,  0, 58, 46, 63,  1],
        [46, 43, 56,  1, 47, 52,  1, 46]])


In [9]:
torch.randint(len(data)-8,(4,))

tensor([ 65018, 807717,  96106,  98581])

In [10]:
yb.view(-1)

tensor([57, 53,  1, 40, 53, 50, 42,  0, 46, 47, 57,  1, 46, 39, 54, 54,  1, 53,
        44,  0, 58, 46, 63,  1, 46, 43, 56,  1, 47, 52,  1, 46])

In [12]:
#let's build a simple bigram model
import torch
import torch.nn as nn   
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size,vocab_size)
    
    def forward(self,idx,targets=None):
        
        logits = self.token_embedding(idx) # this returns Batch, Time, Channel (logits)
        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            targets = targets.view(B*T)
            logits = logits.view(B*T,C)  #for loss calculation it should be batch_size, vocab - so adding time to batch dimension
            loss = F.cross_entropy(logits,targets)
        return logits,loss
    
    
    def generate(self, idx, max_tokens):
        # idx BxT
        
        for _ in range(max_tokens):
            logits,loss = self(idx) #logits - BxTxC
            logits = logits[:,-1,:] #take only the last prediction
            probs = F.softmax(logits,dim=1)
            
            idx_next = torch.multinomial(probs,num_samples=1)
            idx = torch.cat((idx,idx_next),dim=1)
        
        return idx            

In [13]:
m = BigramLanguageModel(vocab_size)

In [14]:
logits,loss = m(xb,yb)

In [15]:
logits.shape,loss

(torch.Size([32, 65]), tensor(4.7009, grad_fn=<NllLossBackward0>))

In [21]:
xb[:1]

tensor([[ 1, 57, 53,  1, 40, 53, 50, 42]])

In [22]:
m.generate(xb[:1],max_tokens=10)

tensor([[ 1, 57, 53,  1, 40, 53, 50, 42, 30, 43, 24, 40, 59,  2, 30, 50, 35,  3]])

In [25]:
#optimizer
optimizer = torch.optim.AdamW(m.parameters(),lr=1e-3)

batch_size = 32
for steps in range(10000):
    xb,yb = get_batch("train")
    optimizer.zero_grad(set_to_none=True)
    logits,loss = m(xb,yb)
    loss.backward()
    optimizer.step()
    
print(loss.item())
    

2.5182087421417236


In [26]:
print(decode(m.generate(torch.zeros((1,1),dtype=torch.long),max_tokens=300)[0].tolist()))



Th wa t nouncoreclf see trayodaraceashifie d my,
TI cor seanoreat toneres w'speesins d thext h,
KIIAn r, w, s lethiph iststhau
I ray as VIOris se?
CHAn, IUCals
Belove hay te, s NGBO: d fit:
KEDeeblio me whines d ter me Dohanthin o w th killate, thofoile spe t ceres has; pral ky y po pe te, fachoung


In [27]:
# take average of all previous contexts

T = 8
x = torch.randn((T,T))
wei = torch.tril(torch.ones(T,T))
wei = wei/wei.sum(1,keepdims=True)

xbow = x@wei

In [29]:
n_embed = 32
head_size = 16

class Head(nn.Module):
    def __init__(self,head_size):
        super().__init__()
        self.query = nn.Linear(n_embed,head_size,bias=False)
        self.key = nn.Linear(n_embed,head_size,bias=False)
        self.value = nn.Linear(n_embed,head_size,bias=False)
        self.register_buffer("tril",torch.tril(torch.ones(block_size,block_size)))
    
    def forward(self,x):
        B,T,C = x.shape
        q = self.query(x)
        k = self.key(x)
        v = self.value(v)
        
        wei = q@k.transpose(-2,-1)*C**-0.5
        wei = wei.masked_fill(self.tril[:T,:T]==0,float('-inf'))
        wei = F.softmax(wei,dim=-1)
        
        out = wei@v
        return out

In [33]:
class MultiHeadAttention(nn.Module):
    def __init__(self,num_heads,head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        
    def forward(self,x):
        return torch.cat([h(x) for h in self.heads],dim=-1)

In [36]:
#let's build a simple bigram model
import torch
import torch.nn as nn   
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size,n_embed)
        self.pos_embed = nn.Embedding(block_size,n_embed)
        self.sa_head = MultiHeadAttention(4,n_embed//4)
        self.lm_head = nn.Linear(n_embed,vocab_size)
    
    def forward(self,idx,targets=None):
        B,T = idx.shape
        tok_emb = self.token_embedding(idx) # this returns Batch, Time, Channel (logits)
        pos_emb = self.pos_embed(torch.arange(T))
        x = tok_emb + pos_emb
        x = self.sa_head(x)
        logits = self.lm_head(x)
        
        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            targets = targets.view(B*T)
            logits = logits.view(B*T,C)  #for loss calculation it should be batch_size, vocab - so adding time to batch dimension
            loss = F.cross_entropy(logits,targets)
        return logits,loss
    
    
    def generate(self, idx, max_tokens):
        # idx BxT
        
        for _ in range(max_tokens):
            idx_cond = idx[:,-block_size:]
            logits,loss = self(idx_cond) #logits - BxTxC
            logits = logits[:,-1,:] #take only the last prediction
            probs = F.softmax(logits,dim=1)
            
            idx_next = torch.multinomial(probs,num_samples=1)
            idx = torch.cat((idx,idx_next),dim=1)
        
        return idx            

In [38]:
#optimizer
optimizer = torch.optim.AdamW(m.parameters(),lr=1e-3)

batch_size = 32
for steps in range(5000):
    xb,yb = get_batch("train")
    optimizer.zero_grad(set_to_none=True)
    logits,loss = m(xb,yb)
    loss.backward()
    optimizer.step()
    
print(loss.item())
    

2.369410991668701


In [39]:
print(decode(m.generate(torch.zeros((1,1),dtype=torch.long),max_tokens=300)[0].tolist()))


NCK:
NULatarut mend thit,
ARIther ha p's mpispoterm blat t anme w mbusise,


The bre:
I honf ate p thag chatrt ourime E tweal hevime ave gl yooss, wabed s toforo Vovu IOfon by and h t w ichu l pauso th?
Angeir uck anomean y, prd hanoru d ow anctharshest?
Sus;
CICoyomean, Wis?
NTon, fero, cr: on.
CLo


: 