In [6]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [7]:
# One-time dataset download: uncomment and run to fetch input.txt into the working directory.
#!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

In [8]:
# Read the whole corpus from input.txt into one string.
# The encoding is pinned so the decoded characters (and the vocabulary built
# from them) do not depend on the platform's default locale encoding.
with open('input.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Show the first 1000 characters as a quick sanity check of the data.
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [9]:
# Vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(text))

print("".join(chars))

# Lookup tables between characters and their integer ids (position in `chars`).
char_to_int = {ch: idx for idx, ch in enumerate(chars)}
int_to_char = dict(enumerate(chars))


def encode(x):
    """Map a string to the list of integer ids of its characters."""
    return [char_to_int[ch] for ch in x]


def decode(x):
    """Map an iterable of integer ids back to the string they spell."""
    return ''.join([int_to_char[idx] for idx in x])


vocab_size = len(chars)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz


In [10]:
# Convert the full corpus to integer ids, then peek at the first 1000 of them.
encoded = encode(text)
print(encoded[0:1000])

[18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 14, 43, 44, 53, 56, 43, 1, 61, 43, 1, 54, 56, 53, 41, 43, 43, 42, 1, 39, 52, 63, 1, 44, 59, 56, 58, 46, 43, 56, 6, 1, 46, 43, 39, 56, 1, 51, 43, 1, 57, 54, 43, 39, 49, 8, 0, 0, 13, 50, 50, 10, 0, 31, 54, 43, 39, 49, 6, 1, 57, 54, 43, 39, 49, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 37, 53, 59, 1, 39, 56, 43, 1, 39, 50, 50, 1, 56, 43, 57, 53, 50, 60, 43, 42, 1, 56, 39, 58, 46, 43, 56, 1, 58, 53, 1, 42, 47, 43, 1, 58, 46, 39, 52, 1, 58, 53, 1, 44, 39, 51, 47, 57, 46, 12, 0, 0, 13, 50, 50, 10, 0, 30, 43, 57, 53, 50, 60, 43, 42, 8, 1, 56, 43, 57, 53, 50, 60, 43, 42, 8, 0, 0, 18, 47, 56, 57, 58, 1, 15, 47, 58, 47, 64, 43, 52, 10, 0, 18, 47, 56, 57, 58, 6, 1, 63, 53, 59, 1, 49, 52, 53, 61, 1, 15, 39, 47, 59, 57, 1, 25, 39, 56, 41, 47, 59, 57, 1, 47, 57, 1, 41, 46, 47, 43, 44, 1, 43, 52, 43, 51, 63, 1, 58, 53, 1, 58, 46, 43, 1, 54, 43, 53, 54, 50, 43, 8, 0, 0, 13, 50, 50, 10, 0, 35, 43, 1, 49, 52, 53, 61, 5, 

In [11]:
# Hold out the final 10% of the encoded corpus; the first 90% is used for training.
train_index = int(len(encoded) * 0.9)
train_data, test_data = (
    torch.tensor(encoded[:train_index]),
    torch.tensor(encoded[train_index:]),
)

In [12]:
class Head(nn.Module):
    """A single causal self-attention head.

    The input is projected to queries, keys and values of width
    ``embedding_dims // num_heads``. Scaled dot-product attention is applied
    with a lower-triangular mask, so each position only attends to itself and
    to earlier positions, and the attention-weighted values are returned.
    """

    def __init__(self, embedding_dims, num_heads, dropout) -> None:
        super().__init__()
        self.head_size = embedding_dims // num_heads
        # Query / key / value projections.
        self.Q_network = nn.Linear(embedding_dims, self.head_size)
        self.K_network = nn.Linear(embedding_dims, self.head_size)
        self.V_network = nn.Linear(embedding_dims, self.head_size)
        # Applied to the attention weights, after the softmax.
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Attend over a 3-D input ``x`` (unpacked as B, T, C) and return the weighted values."""
        _, seq_len, _ = x.shape

        queries = self.Q_network(x)
        keys = self.K_network(x)
        values = self.V_network(x)

        # Pairwise query/key scores, scaled by sqrt(head_size).
        scores = queries @ keys.transpose(-2, -1) / np.sqrt(self.head_size)

        # Ones on and below the diagonal mark the allowed positions; the zeros
        # above the diagonal are future positions and are filled with -inf so
        # the softmax assigns them zero weight.
        allowed = torch.ones(seq_len, seq_len, device=x.device).tril().unsqueeze(0)  # shape [1, T, T]
        scores = scores.masked_fill(allowed == 0, float('-inf'))

        weights = self.dropout(F.softmax(scores, dim=-1))
        return weights @ values

In [13]:
class MultiHeadAttention(nn.Module):
    """Several attention heads run on the same input, concatenated and projected.

    Args:
        embedding_dims: Width of the input and output features.
        num_heads: Number of ``Head`` modules. Must be positive and divide
            ``embedding_dims`` evenly, so that the concatenated head outputs
            are exactly ``embedding_dims`` wide for the output projection.
        dropout: Dropout probability passed to every head.

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embedding_dims``.
    """

    def __init__(self, embedding_dims, num_heads, dropout) -> None:
        super().__init__()
        # Each head outputs embedding_dims // num_heads features. Without this
        # check a non-divisible configuration would build fine and only fail
        # later, inside forward, with an opaque shape-mismatch error.
        if num_heads <= 0 or embedding_dims % num_heads != 0:
            raise ValueError(
                f"embedding_dims ({embedding_dims}) must be divisible by a "
                f"positive num_heads (got {num_heads})"
            )
        self.heads = nn.ModuleList([Head(embedding_dims, num_heads, dropout) for _ in range(num_heads)])
        self.proj = nn.Linear(embedding_dims, embedding_dims)

    def forward(self, x):
        """Run every head on ``x``, concatenate on the last axis, and project."""
        head_outputs = [head(x) for head in self.heads]
        return self.proj(torch.cat(head_outputs, dim=-1))
    
class FeedForwardNetwork(nn.Module):
    """Two-layer MLP: expand to 4x the embedding width, ReLU, project back, dropout."""

    def __init__(self, embedding_dims, dropout) -> None:
        super().__init__()
        inner_width = embedding_dims * 4
        layers = [
            nn.Linear(embedding_dims, inner_width),
            nn.ReLU(),
            nn.Linear(inner_width, embedding_dims),
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the MLP to ``x`` and return the result."""
        return self.net(x)

In [14]:
class TransformerBlock(nn.Module):
    """Attention followed by a feed-forward network, each with a residual connection.

    LayerNorm is applied to the input of each sub-layer (before attention and
    before the feed-forward network), and the sub-layer output is added back
    onto its un-normalised input.
    """

    def __init__(self, embedding_dims, num_heads, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(embedding_dims, num_heads, dropout)
        self.feed_forward = FeedForwardNetwork(embedding_dims, dropout)
        # One LayerNorm per sub-layer: ln1 feeds attention, ln2 feeds the MLP.
        self.ln1 = nn.LayerNorm(embedding_dims)
        self.ln2 = nn.LayerNorm(embedding_dims)

    def forward(self, x):
        """Return ``x`` after the attention and feed-forward residual updates."""
        after_attention = x + self.attention(self.ln1(x))
        return after_attention + self.feed_forward(self.ln2(after_attention))

In [15]:
class NanoGPT(nn.Module):
    """Small decoder-only transformer that maps token ids to next-token logits.

    Args:
        embedding_dims: Width of the token/position embeddings and of every block.
        n_blocks: Number of ``TransformerBlock`` layers.
        num_heads: Attention heads per block.
        block_size: Number of rows in the learned position-embedding table,
            i.e. the longest sequence ``forward`` accepts.
        dropout: Dropout probability passed to every block.
        n_vocab: Number of distinct tokens. Defaults to the notebook-level
            ``vocab_size`` so existing calls keep working; pass it explicitly
            to build a model that does not depend on that global.
    """

    def __init__(self, embedding_dims, n_blocks, num_heads, block_size, dropout, n_vocab=None):
        super().__init__()
        if n_vocab is None:
            # Backward-compatible default: fall back to the notebook global.
            n_vocab = vocab_size
        self.blocks = nn.ModuleList([TransformerBlock(embedding_dims, num_heads, dropout) for _ in range(n_blocks)])
        self.ln = nn.LayerNorm(embedding_dims)
        self.proj = nn.Linear(embedding_dims, n_vocab)
        self.token_embed = nn.Embedding(n_vocab, embedding_dims)
        self.position_embed = nn.Embedding(block_size, embedding_dims)

    def forward(self, idx):
        """Return logits of shape (B, T, n_vocab) for token ids ``idx`` of shape (B, T).

        Raises:
            ValueError: If T exceeds the size of the position-embedding table.
        """
        _, T = idx.shape
        max_len = self.position_embed.num_embeddings
        if T > max_len:
            # Fail with a clear message instead of an index error raised from
            # inside the position-embedding lookup.
            raise ValueError(
                f"sequence length {T} exceeds the model's block_size {max_len}"
            )

        token_emb = self.token_embed(idx)
        # Positions 0..T-1, shaped (1, T) so they broadcast over the batch.
        pos = torch.arange(T, device=idx.device).unsqueeze(0)
        pos_emb = self.position_embed(pos)

        x = token_emb + pos_emb

        for block in self.blocks:
            x = block(x)

        x = self.ln(x)
        return self.proj(x)



In [16]:
def get_batch(split, block_size, batch_size, data=None):
    """Sample a random batch of input windows and their next-token targets.

    Args:
        split: ``"train"`` selects ``train_data``; any other value selects
            ``test_data``. Ignored when ``data`` is given.
        block_size: Length of every sampled window.
        batch_size: Number of windows in the batch.
        data: Optional 1-D tensor to sample from instead of the notebook-level
            splits.

    Returns:
        ``(x, y)``, both of shape (batch_size, block_size), where ``y`` is
        ``x`` shifted one position to the right within the data.

    Raises:
        ValueError: If the data holds fewer than ``block_size + 1`` tokens.
    """
    if data is None:
        data = train_data if split == "train" else test_data

    # A window starting at i needs data[i : i + block_size + 1], so the valid
    # starts are 0 .. len(data) - block_size - 1. torch.randint's upper bound
    # is exclusive, hence len(data) - block_size (the previous bound was one
    # too small and could never sample the last window).
    n_starts = len(data) - block_size
    if n_starts <= 0:
        raise ValueError(
            f"need at least block_size + 1 = {block_size + 1} tokens, got {len(data)}"
        )

    ix = torch.randint(0, n_starts, (batch_size,)).tolist()
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    return x, y



In [17]:
# --- Hyperparameters ---------------------------------------------------------
max_iter = 5000        # number of optimisation steps
learning_rate = 1e-3   # AdamW step size
log_every = 100        # print the loss every this many steps

batch_size = 4         # windows per optimisation step
block_size = 8         # window (context) length in tokens
embedding_dims = 64
n_blocks = 4
num_heads = 4
dropout = 0.25

# Seed torch's RNG so weight initialisation, batch sampling and dropout are
# repeatable when the notebook is re-run from a fresh kernel.
torch.manual_seed(1337)

model = NanoGPT(embedding_dims, n_blocks, num_heads, block_size, dropout)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Nothing inside the loop leaves training mode, so enable it once up front.
model.train()

for i in range(max_iter):
    x, y = get_batch("train", block_size, batch_size)
    logits = model(x)

    # Flatten batch and time so every position is one classification example:
    # logits become (B*T, C) and the targets (B*T,).
    B, T, C = logits.shape
    loss = F.cross_entropy(logits.view(B * T, C), y.view(B * T))

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # This is the loss of the current mini-batch only, so it is noisy.
    if i % log_every == 0:
        print(f"Step {i}, Loss: {loss.item():.4f}")

  _torch_pytree._register_pytree_node(


Step 0, Loss: 4.5104
Step 100, Loss: 3.0218
Step 200, Loss: 2.6938
Step 300, Loss: 2.7376
Step 400, Loss: 2.7454
Step 500, Loss: 2.2103
Step 600, Loss: 2.2371
Step 700, Loss: 2.2941
Step 800, Loss: 2.5773
Step 900, Loss: 1.9867
Step 1000, Loss: 2.3334
Step 1100, Loss: 2.3379
Step 1200, Loss: 2.5704
Step 1300, Loss: 2.6942
Step 1400, Loss: 2.4805
Step 1500, Loss: 2.6009
Step 1600, Loss: 2.2222
Step 1700, Loss: 2.5983
Step 1800, Loss: 2.5781
Step 1900, Loss: 2.2502
Step 2000, Loss: 2.3080
Step 2100, Loss: 2.1757
Step 2200, Loss: 2.6110
Step 2300, Loss: 2.2124
Step 2400, Loss: 2.0703
Step 2500, Loss: 2.4372
Step 2600, Loss: 2.2949
Step 2700, Loss: 2.2278
Step 2800, Loss: 2.0941
Step 2900, Loss: 2.0762
Step 3000, Loss: 2.3197
Step 3100, Loss: 2.3651
Step 3200, Loss: 2.2480
Step 3300, Loss: 2.4231
Step 3400, Loss: 2.4199
Step 3500, Loss: 2.3721
Step 3600, Loss: 2.2610
Step 3700, Loss: 2.0963
Step 3800, Loss: 3.0379
Step 3900, Loss: 2.3317
Step 4000, Loss: 1.8799
Step 4100, Loss: 2.4865
Step

In [22]:
def generate(model, start, max_new_tokens, context_len=None):
    """Autoregressively sample ``max_new_tokens`` tokens after ``start``.

    Args:
        model: Module mapping (B, T) token ids to (B, T, vocab) logits. It is
            switched to eval mode and left that way.
        start: Long tensor of shape (B, T0) holding the prompt token ids.
        max_new_tokens: Number of tokens to append to every row.
        context_len: Maximum number of trailing tokens fed to the model at each
            step. Defaults to the size of ``model.position_embed`` when the
            model has one, otherwise to the notebook-level ``block_size``.

    Returns:
        Tensor of shape (B, T0 + max_new_tokens): the prompt followed by the
        sampled tokens.
    """
    if context_len is None:
        # Prefer the model's own positional table over the mutable notebook
        # global, which may have changed since the model was built.
        position_table = getattr(model, "position_embed", None)
        if position_table is not None:
            context_len = position_table.num_embeddings
        else:
            context_len = block_size

    model.eval()

    # Sampling needs no gradients; without no_grad every step would build an
    # autograd graph that is never used.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            context = start[:, -context_len:]  # trim to the context window

            logits = model(context)[:, -1, :]  # logits at the last position

            probs = F.softmax(logits, dim=-1)  # logits -> probabilities
            next_token = torch.multinomial(probs, num_samples=1)  # sample one id per row

            start = torch.cat((start, next_token), dim=1)  # extend the sequence

    return start

# Encode the prompt as a batch containing a single row of token ids.
start_text = "ROMEO: "
start_ids = torch.tensor(encode(start_text), dtype=torch.long).unsqueeze(0)

# Sample 200 more tokens, then take the only row of the batch as a plain list.
generated = generate(model, start_ids, max_new_tokens=200)
out_ids = generated[0].tolist()

print(decode(out_ids))

ROMEO: wing maons all mith but-s
Must stnoe my thalplins?

PORTARUS: Rise,
And fromerres the me shink is will mold shase sheack be mime lond, shave by?

CUMILANF:
Djens, come the sethouer, is go Frame.
 GLON
