In [2]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

In [3]:
print("length of dataset in characters: ", len(text))

length of dataset in characters:  1115394


In [4]:
print(text[:1000])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor citizens, the patricians good.
What authority surfeits on would relieve us: if they
would yield us but the superfluity, while it were
wholesome, we might guess they relieved us humanely;
but they think we are too dear: the leanness that
afflicts us, the object of our misery, is as an
inventory to particularise their abundance; our
sufferance is a gain to them Let us revenge this with
our pikes, ere we become rakes: for the gods know I
speak this in hunger for bread, not in thirst for revenge.



In [5]:
chars = sorted(list(set(text)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


In [6]:
# create a mapping from characters to integers
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

print(stoi)

print(encode("hii there"))
print(decode(encode("hii there")))

{'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}
[46, 47, 47, 1, 58, 46, 43, 56, 43]
hii there


In [7]:
import torch
import torch.nn as nn
import torch.nn.functional as F

data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape)

  cpu = _conversion_method_template(device=torch.device("cpu"))


torch.Size([1115394])


In [8]:
n = int(len(data) * 0.9)
train = data[:n]
val = data[n:]

In [9]:
torch.manual_seed(1337)

batch_size = 4
block_size = 8 # context window size

def get_batch(split: str, batch_size: int):
    data = train if split == "train" else val
    ix = torch.randint(0, len(data)-block_size-1, (batch_size,)) # samples batch_size number of integers
    X = torch.stack([data[i:i+block_size] for i in ix])          # B, block_size 
    Y = torch.stack([data[i+1:i+block_size+1] for i in ix])      # B, block_size

    return X, Y

xb, yb = get_batch("train", batch_size)
xb.shape, yb.shape

(torch.Size([4, 8]), torch.Size([4, 8]))

# Bigram Model

In [10]:
torch.manual_seed(1337) # need to reset every time so we "reset" the randomness tape everytime the cell is run for consistent results, otherwise pos gets consumed and moves to next everytime cell is run

class BigramModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()

        self.embedding_table = nn.Embedding(vocab_size, vocab_size)
    
    def forward(self, x, targets=None):
        '''
        x       - B, T 
        targets - B, T
        '''
        logits = self.embedding_table(x) # B (batch_size), T ("time" [block_size]), C (vocab_size); each int -> produce a tensor of logits for next char (C)
        loss = None

        if targets != None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # pytorch cross entropy expects channel dimension to be in 2nd dimension (B, C, ...)
            targets = targets.view(B*T)  # no one-hot is more stable if the output is not a prob distribution
            print(logits.shape, targets.shape)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, ix, max_new_tokens):
        '''
        ix (B x T) - starting point of generating new output
        max_new_tokens - max length of the additional output
        '''
        for _ in range(max_new_tokens):
            # bigram only needs the last input; keep tensor 2d as forward expects 2d input
            context = ix[:, -1].view(-1, 1)     # B, 1 
            logits, _ = self(context)  # B, 1, C
            logits = logits.squeeze(dim=1) # B, C
            probs = F.softmax(logits, dim=1)
            next_char = torch.multinomial(probs, num_samples=1, replacement=True)
            ix = torch.cat((ix, next_char), dim=1)
        return ix
    
model = BigramModel(vocab_size=vocab_size)
logits, loss = model(xb, yb)

input = torch.zeros((1,1), dtype=torch.long)
output = model.generate(input, max_new_tokens=100) # B, 1+max_new_tokens
print(output.shape)
output_list = output[0].tolist()
print(decode(output_list))

torch.Size([32, 65]) torch.Size([32])
torch.Size([1, 101])

SKIcLT;AcELMoTbvZv C?nq-QE33:CJqkOKH-q;:la!oiywkHjgChzbQ?u!3bLIgwevmyFJGUGp
wnYWmnxKWWev-tDqXErVKLgJ


In [11]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [12]:
batch_size = 32
epochs = 100000

for epoch in range(epochs):
    # get batch
    xb, yb = get_batch("train", batch_size=batch_size)

    # forward pass
    logits, loss = model(x=xb, targets=yb)

    # backward pass
    optimizer.zero_grad()
    loss.backward()

    # update step
    optimizer.step()

    print(f"Epoch {epoch+1}: loss={loss.item()}")
    


torch.Size([256, 65]) torch.Size([256])
Epoch 1: loss=4.750201225280762
torch.Size([256, 65]) torch.Size([256])
Epoch 2: loss=4.60972785949707
torch.Size([256, 65]) torch.Size([256])
Epoch 3: loss=4.710180759429932
torch.Size([256, 65]) torch.Size([256])
Epoch 4: loss=4.694168567657471
torch.Size([256, 65]) torch.Size([256])
Epoch 5: loss=4.683116436004639
torch.Size([256, 65]) torch.Size([256])
Epoch 6: loss=4.676141262054443
torch.Size([256, 65]) torch.Size([256])
Epoch 7: loss=4.669034004211426
torch.Size([256, 65]) torch.Size([256])
Epoch 8: loss=4.6731038093566895
torch.Size([256, 65]) torch.Size([256])
Epoch 9: loss=4.732837200164795
torch.Size([256, 65]) torch.Size([256])
Epoch 10: loss=4.770676136016846
torch.Size([256, 65]) torch.Size([256])
Epoch 11: loss=4.772107124328613
torch.Size([256, 65]) torch.Size([256])
Epoch 12: loss=4.605309009552002
torch.Size([256, 65]) torch.Size([256])
Epoch 13: loss=4.760058879852295
torch.Size([256, 65]) torch.Size([256])
Epoch 14: loss=4.568

In [13]:
input = torch.zeros((1,1), dtype=torch.long)
output = model.generate(input, max_new_tokens=300) # B, 1+max_new_tokens
print(output.shape)
output_list = output[0].tolist()
print(decode(output_list))

torch.Size([1, 301])


Ofows ht IUS:
S:

ING flvenje ssutefr,
M:
War cl igagimous pray whars:
Panalit I It aithit terised the. by fonau buaror VOubed spo mng as chathab llll:
Ware,

ee her,
Thooured aly y hind I'dimashat-owhrees s, share hathure Anfaneof f s llon!

ICLiroushanot

Then
Magend cugss, be jollrty

AROUFLom, 


# Self-Attention

We represent the previous context at timestep x as the average of embeddings up to x

In [None]:
# general idea for self attention

x_sample, y_sample = get_batch("train", batch_size=3) # B x T
B, T = x_sample.shape
x_sample = x_sample.view((B, T, -1)).float()
B, T, C = x_sample.shape # B, T, C=1

# attention table assuming equal weights
permute = torch.tril(torch.ones((T, T), dtype=torch.float32)) # triangle lower (tri l)
permute /= permute.sum(dim=1, keepdim=True)

# attention table with weighted attention
wei = torch.zeros((T, T), dtype=torch.float)
wei = wei.masked_fill(permute == 0, -float("inf")) # ensures no attention comes from future characters
wei = F.softmax(wei, dim=-1)

x_context = wei @ x_sample # (permute - (T, T); x_sample - (B, T, C)) - Batch dimensions appended as 1 until shapes match then performing parallel matrix multiplications, in this case - B permute matrices are formed (B, T, T) then the matrix multiplications ((T, T) @ (T, C)) are performed in parallel across the B matrices

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3333, 0.3333, 0.3333, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2500, 0.2500, 0.2500, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000, 0.0000, 0.0000],
        [0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.1667, 0.0000, 0.0000],
        [0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.1429, 0.0000],
        [0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250, 0.1250]])


In [16]:
torch.manual_seed(1337) # need to reset every time so we "reset" the randomness tape everytime the cell is run for consistent results, otherwise pos gets consumed and moves to next everytime cell is run
emb_size = 32

class BigramModel(nn.Module):

    def __init__(self, vocab_size):
        super().__init__()

        self.embedding_table = nn.Embedding(vocab_size, emb_size)
        self.lm_head = nn.Linear(emb_size, vocab_size)
    
    def forward(self, x, targets=None):
        '''
        x       - B, T 
        targets - B, T
        '''
        tok_emb = self.embedding_table(x) # B (batch_size), T ("time" [block_size]), C (vocab_size); each int -> produce a tensor of logits for next char (C)
        logits = self.lm_head(tok_emb)
        loss = None

        if targets != None:
            B, T, C = logits.shape
            logits = logits.view(B*T, C) # pytorch cross entropy expects channel dimension to be in 2nd dimension (B, C, ...)
            targets = targets.view(B*T)  # no one-hot is more stable if the output is not a prob distribution
            print(logits.shape, targets.shape)
            loss = F.cross_entropy(logits, targets)

        return logits, loss
    
    def generate(self, ix, max_new_tokens):
        '''
        ix (B x T) - starting point of generating new output
        max_new_tokens - max length of the additional output
        '''
        for _ in range(max_new_tokens):
            # bigram only needs the last input; keep tensor 2d as forward expects 2d input
            context = ix[:, -1].view(-1, 1)     # B, 1 
            logits, _ = self(context)  # B, 1, C
            logits = logits.squeeze(dim=1) # B, C
            probs = F.softmax(logits, dim=1)
            next_char = torch.multinomial(probs, num_samples=1, replacement=True)
            ix = torch.cat((ix, next_char), dim=1)
        return ix
    
model = BigramModel(vocab_size=vocab_size)
logits, loss = model(xb, yb)

input = torch.zeros((1,1), dtype=torch.long)
output = model.generate(input, max_new_tokens=100) # B, 1+max_new_tokens
print(output.shape)
output_list = output[0].tolist()
print(decode(output_list))

torch.Size([256, 65]) torch.Size([256])
torch.Size([1, 101])

hYQRnbbmkMTUwbiu$?3KHvybsMEEFNLyb!SZgyGzRX$oNqTs!roUNLjMXM!EjT!hjmfH'ER3cOn.kvgAuau&e;m-CNLkfMW HT'R
