In [512]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import random
import time
import math
import os
import sys
import matplotlib.pyplot as plt

In [513]:
# download file to tmp/data.txt
!wget -O tmp/data.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-09-25 19:46:55--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 2606:50c0:8002::154, 2606:50c0:8000::154, 2606:50c0:8003::154, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|2606:50c0:8002::154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'tmp/data.txt'

     0K .......... .......... .......... .......... ..........  4% 3.16M 0s
    50K .......... .......... .......... .......... ..........  9% 11.1M 0s
   100K .......... .......... .......... .......... .......... 13% 8.51M 0s
   150K .......... .......... .......... .......... .......... 18% 15.6M 0s
   200K .......... .......... .......... .......... .......... 22% 26.1M 0s
   250K .......... .......... .......... .......... .......... 27% 9.42M 0s
   300K .......... .......... .......... .......... .......... 32% 15.2M 0s
  

In [514]:
# Read the whole corpus into memory. The encoding is given explicitly so the
# decoded text does not depend on the platform's default encoding.
with open('tmp/data.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print('text length:', len(text))

text length: 1115394


In [515]:
# Peek at the first 100 characters of the corpus.
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [516]:
# Vocabulary: a "[PAD]" placeholder at index 0, followed by every distinct
# character of the corpus in sorted order.
chars = ["[PAD]"] + sorted(set(text))
vocab_size = len(chars)
print(*chars, sep="")
print("vocab size:", vocab_size)

[PAD]
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 66


In [517]:
# Lookup tables between characters and their integer ids (and back).
stoi = {ch: idx for idx, ch in enumerate(chars)}
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[i] for i in l)


print(encode("hii there"))
print(decode(encode("hii there")))

[47, 48, 48, 2, 59, 47, 44, 57, 44]
hii there


In [518]:
# Encode the whole corpus and keep it as a 1-D tensor of token ids.
data = torch.tensor(encode(text), dtype=torch.long)
print(data.size(), data.dtype)
# The 100 characters we looked at earlier, as the model will see them.
print(data[0:100])

torch.Size([1115394]) torch.int64
tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44, 53, 11,  1, 15, 44, 45,
        54, 57, 44,  2, 62, 44,  2, 55, 57, 54, 42, 44, 44, 43,  2, 40, 53, 64,
         2, 45, 60, 57, 59, 47, 44, 57,  7,  2, 47, 44, 40, 57,  2, 52, 44,  2,
        58, 55, 44, 40, 50,  9,  1,  1, 14, 51, 51, 11,  1, 32, 55, 44, 40, 50,
         7,  2, 58, 55, 44, 40, 50,  9,  1,  1, 19, 48, 57, 58, 59,  2, 16, 48,
        59, 48, 65, 44, 53, 11,  1, 38, 54, 60])


In [519]:
# Train on the first 90% of the tokens and hold out the remainder for validation.
n = int(0.9 * len(data))  # index at which the validation split starts
train_data, val_data = data[:n], data[n:]

In [520]:
# Context length used for the illustration in the next cell.
block_size = 100
# Show block_size + 1 tokens: the extra one supplies the target for the last input.
train_data[:block_size+1]

tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44, 53, 11,  1, 15, 44, 45,
        54, 57, 44,  2, 62, 44,  2, 55, 57, 54, 42, 44, 44, 43,  2, 40, 53, 64,
         2, 45, 60, 57, 59, 47, 44, 57,  7,  2, 47, 44, 40, 57,  2, 52, 44,  2,
        58, 55, 44, 40, 50,  9,  1,  1, 14, 51, 51, 11,  1, 32, 55, 44, 40, 50,
         7,  2, 58, 55, 44, 40, 50,  9,  1,  1, 19, 48, 57, 58, 59,  2, 16, 48,
        59, 48, 65, 44, 53, 11,  1, 38, 54, 60,  2])

In [521]:
# Every prefix of `a` is a training context; the token right after it (taken
# from the one-step-shifted copy `y`) is the target to predict.
a, y = train_data[:block_size], train_data[1:block_size + 1]
for t in range(block_size):
    context, target = a[:t + 1], y[t]
    print(f"when input is {context} the target: {target}")

when input is tensor([19]) the target: 48
when input is tensor([19, 48]) the target: 57
when input is tensor([19, 48, 57]) the target: 58
when input is tensor([19, 48, 57, 58]) the target: 59
when input is tensor([19, 48, 57, 58, 59]) the target: 2
when input is tensor([19, 48, 57, 58, 59,  2]) the target: 16
when input is tensor([19, 48, 57, 58, 59,  2, 16]) the target: 48
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48]) the target: 59
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48, 59]) the target: 48
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48]) the target: 65
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65]) the target: 44
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44]) the target: 53
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44, 53]) the target: 11
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44, 53, 11]) the target: 1
when input is tensor([19, 48, 57, 58, 59,  2, 16, 

In [522]:
# Fix the RNG so the sampled batches below are reproducible.
torch.manual_seed(1337)
# batch_size: how many independent sequences will we process in parallel?
# block_size: what is the maximum context length for predictions?
batch_size, block_size = 32, 10

def get_batch(split):
    """Sample a random batch of inputs x and next-token targets y.

    Reads the notebook-level `train_data`, `val_data`, `batch_size` and
    `block_size`. `split == 'train'` selects the training tokens; any other
    value selects the validation tokens.

    Returns:
        (x, y): two tensors of shape (batch_size, block_size), where y is x
        shifted one token to the right in the underlying sequence.
    """
    source = train_data if split == 'train' else val_data
    # Random window start offsets, chosen so the shifted target window still fits.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    x = torch.stack([source[s:s + block_size] for s in starts])
    y = torch.stack([source[s + 1:s + 1 + block_size] for s in starts])
    return x, y

# Draw one training batch and show the inputs next to their shifted targets.
xb, yb = get_batch('train')
print('inputs:', xb.shape, xb, sep='\n')
print('targets:', yb.shape, yb, sep='\n')

# print('----')

# for b in range(batch_size): # batch dimension
#     for t in range(block_size): # time dimension
#         context = xb[b, :t+1]
#         target = yb[b,t]
#         print(f"when input is {context.tolist()} the target: {target}")

inputs:
torch.Size([32, 10])
tensor([[57, 42, 47,  2, 54,  6,  2, 59, 47, 44],
        [15, 34, 16, 24, 22, 27, 20, 21, 14, 26],
        [59, 47, 44, 57, 11,  1, 15, 60, 59,  2],
        [48, 61, 44,  2, 52, 44,  2, 59, 47, 64],
        [55, 54, 51, 51, 54,  2, 55, 51, 40, 64],
        [62, 40, 51, 51, 54, 62,  2, 52, 44,  2],
        [48, 57, 58, 59,  2, 26, 60, 57, 43, 44],
        [54,  2, 40,  2, 47, 54, 60, 58, 44,  7],
        [47, 44,  2, 62, 54, 44, 45, 60, 51, 51],
        [14, 41, 54, 60, 59,  2, 59, 47, 44, 48],
        [57, 44,  9,  2, 14, 62, 40, 64,  2, 62],
        [59, 47, 48, 58,  2, 42, 54, 61, 44, 53],
        [22,  2, 52, 44, 40, 53,  2, 59, 54,  2],
        [53, 43,  2, 52, 44, 12,  1, 14, 53, 43],
        [44,  2, 55, 44, 40, 42, 44,  2, 62, 48],
        [43,  2, 45, 57, 54, 52,  2, 47, 48, 58],
        [11,  1, 36, 48, 59, 47,  2, 58, 48, 46],
        [58, 48, 57,  7,  2, 22,  2, 43, 54,  2],
        [44, 40, 51,  2, 52, 64,  2, 59, 57, 60],
        [ 2, 43, 60, 

In [523]:
import numpy as np

# Toy example of a left-clamped window: take the elements from index n - 5
# up to and including index n, without letting the start go negative.
arr = np.array(range(10))

# Index at which the window ends.
n = 2

# Clamp the start at 0; the end is n + 1 because slice ends are exclusive.
start_index, end_index = max(0, n - 5), n + 1

result = arr[start_index:end_index]

print(result)

[0 1 2]


In [548]:
class NGramLanguageModel(nn.Module):
    """Character-level n-gram language model built from learned embeddings.

    For every position the model looks at the current token and the ``n - 1``
    tokens before it (left-padded with the PAD id 0 near the start of the
    sequence). Each token in that window is embedded, a learned per-slot
    positional embedding is added, the ``n`` slots are mixed with a learned
    softmax weighting, and the mixture goes through a two-layer MLP that
    produces next-token logits.

    Args:
        vocab_size: number of distinct token ids; id 0 is used as padding.
        n: number of context tokens (window length) used for each prediction.
    """

    def __init__(self, vocab_size, n):
        super().__init__()
        self.n = n
        # The embedding width grows with the context length.
        embed_size = vocab_size + 10 * n
        # padding_idx=0: the PAD row is initialised to zeros and gets no gradient.
        self.token_embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        # One learned vector per window slot (slot n-1 holds the current token).
        self.pos_embedding = nn.Parameter(torch.randn((n, embed_size)))
        # Pre-softmax mixing weights over the n window slots.
        self.adding_weight = nn.Parameter(torch.zeros(n))
        self.fc = nn.Linear(embed_size, 200)
        self.relu = nn.ReLU()
        self.final = nn.Linear(200, vocab_size)

    def forward_calc(self, x):
        """Compute next-token logits for every position of ``x``.

        Args:
            x: integer tensor of token ids with shape (batch, time).

        Returns:
            Tensor of logits with shape (batch, time, vocab_size).
        """
        assert len(x.shape) == 2, "input shape should be (batch, time)"
        time_steps = x.shape[1]

        # Left-pad with n-1 PAD tokens so that every position has a full window.
        padded = F.pad(x, (max(self.n - 1, 0), 0), value=0)
        # window_index[t, k] = t + k, so row t of the gather below holds the n
        # tokens ending at (and including) original position t. Building the
        # index on x.device keeps the model usable on any device.
        window_index = (
            torch.arange(time_steps, device=x.device).unsqueeze(1)
            + torch.arange(self.n, device=x.device)
        )
        windows = padded[:, window_index]  # (batch, time, n)

        x = torch.add(self.token_embedding(windows), self.pos_embedding)
        # (n,) @ (batch, time, n, embed) -> (batch, time, embed):
        # a softmax-weighted sum over the window slots.
        x = F.softmax(self.adding_weight, -1) @ x
        x = self.fc(x)
        x = self.relu(x)
        x = self.final(x)
        return x

    def forward(self, x, targets=None):
        """Run the model, optionally computing the training loss.

        Args:
            x: (batch, time) tensor of token ids. When ``targets`` is None it
                may also be an int (number of sequences to start from a single
                PAD token) or None. None uses the module-level ``batch_size``
                if one is defined, otherwise 1.
            targets: optional tensor of next-token ids with the same number of
                elements as ``x``.

        Returns:
            ``(logits, loss)``. Without targets, ``logits`` has shape
            (batch, time, vocab_size) and ``loss`` is None. With targets,
            ``logits`` is flattened to (batch * time, vocab_size) and ``loss``
            is the cross-entropy against the flattened targets.
        """
        if targets is None:
            # Inference: there are no targets, so no loss is computed.
            if x is None:
                # Generating from scratch. Keep honouring the notebook-level
                # batch_size when it exists, but do not crash without it.
                x = globals().get("batch_size", 1)

            if isinstance(x, int):
                # Start x sequences from a single PAD token, on the model's device.
                x = torch.zeros(
                    x, 1, dtype=torch.long, device=self.pos_embedding.device
                )
            return self.forward_calc(x), None

        # Training: flatten batch and time so cross_entropy sees (N, C) and (N,).
        logits = self.forward_calc(x)
        B, T, C = logits.shape  # batch, time, channel
        logits_flat = logits.reshape(B * T, C)
        # reshape (not view) so non-contiguous targets are accepted as well.
        loss = F.cross_entropy(logits_flat, targets.reshape(B * T))

        return logits_flat, loss

    def generate(self, x, max_len_new, temperature=1.0):
        """Autoregressively append ``max_len_new`` sampled tokens to ``x``.

        Args:
            x: (batch, time) tensor of token ids used as the prompt.
            max_len_new: number of tokens to sample for every sequence.
            temperature: divisor applied to the logits before the softmax.

        Returns:
            Tensor of shape (batch, time + max_len_new): the prompt followed
            by the sampled tokens.
        """
        # Sampling needs no gradients; skip building the autograd graph.
        with torch.no_grad():
            for _ in range(max_len_new):
                # The prediction for the last position only depends on the
                # last n tokens, so there is no need to re-run the whole
                # (growing) sequence at every step.
                context = x[:, -self.n:]
                logits, _ = self(context)
                logits_dist = logits[:, -1] / temperature
                probs = F.softmax(logits_dist, dim=-1)
                next_token = torch.multinomial(probs, num_samples=1)
                x = torch.cat([x, next_token], dim=1)

        return x

In [570]:
# Smoke test: run a freshly initialised model (context length 10) on the
# first two sequences of the current batch and look at the raw logits.
model = NGramLanguageModel(vocab_size, 10)
# x = (torch.arange(12) + 13).reshape(1, -1).repeat(2, 1)
a = xb[:2]
print(f"x.shape = {a.shape}", f"x = {a}", sep="\n")
y, loss = model(a)
print(f"y.shape = {y.shape}", f"y = {y}", sep="\n")

x.shape = torch.Size([2, 10])
x = tensor([[41, 44,  2, 59, 47, 44, 57, 44,  7,  2],
        [51, 11,  2, 64, 44, 59,  2, 45, 54, 57]])
y.shape = torch.Size([2, 10, 66])
y = tensor([[[ 0.0985, -0.1438,  0.0432,  ...,  0.1383, -0.0346, -0.0909],
         [ 0.1221, -0.1712,  0.0353,  ...,  0.1874, -0.0151, -0.0839],
         [ 0.1252, -0.1682,  0.0310,  ...,  0.1864, -0.0613, -0.1180],
         ...,
         [ 0.1739, -0.2397,  0.0743,  ...,  0.2541, -0.0431, -0.1290],
         [ 0.1498, -0.2761,  0.0730,  ...,  0.2748,  0.0013, -0.1505],
         [ 0.1722, -0.2853,  0.0896,  ...,  0.2885, -0.0381, -0.2031]],

        [[ 0.0917, -0.1418,  0.0721,  ...,  0.1373, -0.0217, -0.0240],
         [ 0.0998, -0.1791,  0.0677,  ...,  0.1549, -0.0445, -0.0374],
         [ 0.1122, -0.1758,  0.0508,  ...,  0.1657, -0.0885, -0.0771],
         ...,
         [ 0.0947, -0.2881,  0.0695,  ...,  0.2361, -0.1426, -0.1380],
         [ 0.0974, -0.3134,  0.0670,  ...,  0.2459, -0.1378, -0.0842],
         [ 0.096

In [571]:
# Shape check: a 1-D vector of length 5 matrix-multiplied with a
# (32, 10, 5, 100) tensor contracts the size-5 axis.
a = torch.randn(32, 10, 5, 100)
b = torch.randn(5)
a.size(), b.size(), torch.matmul(b, a).size()

(torch.Size([32, 10, 5, 100]), torch.Size([5]), torch.Size([32, 10, 100]))

In [572]:
# Inspect the model, its loss on one batch, and a 10-token continuation of
# the first sequence in the batch.
print(model)
logits, loss = model(xb, yb)
print('logits:', logits.size())
print('loss:', loss)

first_row_tokens = model.generate(xb, 10)[0].tolist()
print(decode(first_row_tokens))

NGramLanguageModel(
  (token_embedding): Embedding(66, 166, padding_idx=0)
  (fc): Linear(in_features=166, out_features=200, bias=True)
  (relu): ReLU()
  (final): Linear(in_features=200, out_features=66, bias=True)
)
logits: torch.Size([320, 66])
loss: tensor(4.1990, grad_fn=<NllLossBackward>)
be there, QUUCTPNfgH


In [573]:
# Train the model with AdamW on randomly sampled batches, reporting the
# batch loss every 500 steps and once more after the final step.
optimizer = optim.AdamW(model.parameters(), lr=0.002)
batch_size = 32
for step in range(10000):
    xb, yb = get_batch('train')
    # Clear the gradients left over from the previous step before the new pass.
    optimizer.zero_grad()
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()
    if not step % 500:
        print(f'step: {step}, loss: {loss.item()}')

print(f'step: {step}, loss: {loss.item()}')

step: 0, loss: 4.206731796264648


step: 500, loss: 2.5691988468170166
step: 1000, loss: 2.227761745452881
step: 1500, loss: 2.1556649208068848
step: 2000, loss: 2.272289752960205
step: 2500, loss: 1.9747114181518555
step: 3000, loss: 2.158937931060791
step: 3500, loss: 1.9785583019256592
step: 4000, loss: 1.8584703207015991
step: 4500, loss: 2.013803005218506
step: 5000, loss: 1.9220317602157593
step: 5500, loss: 2.159390926361084
step: 6000, loss: 1.945309042930603
step: 6500, loss: 1.765811562538147
step: 7000, loss: 1.9038727283477783
step: 7500, loss: 2.0788521766662598
step: 8000, loss: 1.7741708755493164
step: 8500, loss: 1.9944312572479248
step: 9000, loss: 1.829410195350647
step: 9500, loss: 1.8535282611846924
step: 9999, loss: 1.9209753274917603


In [579]:
# Sample five 200-token continuations of the prompt "LUCENT" at temperature 0.9.
for x in model.generate(torch.tensor([encode("LUCENT")]).repeat(5, 1), 200, 0.9):
    print(decode(x.tolist()), '----', sep='\n')

LUCENTIO:
God have!
What requeence,
O, I am ged we banst.
But formse, what bust.

MARD IV:
Chasce thus prace I cause?
Let thy, then it my nord reyl mand Sher what kno, sireform ighs'd cyour with a han make 
----
LUCENTIO:
Ther to spared alle slandy woustry, to thins me! O puntempays,
But his a darce, be now on wil by tain estrains, rave have my, fave:
No, ad prove!

KING RICHARD II:
Shall:
I ders in's thus my red
A
----
LUCENTESTER:
Here'd.

Least who, frie which Wome stines tall be worly!
Why!
HENNot the came daunne why thencior, nend my the of thin.
A Hes dier gazen:
Blabeliing. Rich whers, are! Letnonce, hons mann!
I th
----
LUCENTIO:
O, will teach fire and aus my ques:
On;
And thuse him.

DUKE VI:
Part, word;
Not,
Your holive me to deret,
And swere your pount what learle, ad greart my dofroth;
This do patom tiven that to maket
----
LUCENTIO:
I ve sleer mad! whall sesuir amirst are many afforess 'enere, wble mak eweak not thenem.

Land the ire atce to-stion's hore.

VOLURENCENTENIO:

In [577]:
# Normalised mixing weights over the n window slots. The dimension is passed
# explicitly: calling softmax without it relies on the deprecated implicit
# dimension choice and emits a warning.
F.softmax(model.adding_weight, dim=-1)

  F.softmax(model.adding_weight)


tensor([0.0059, 0.0058, 0.0046, 0.0042, 0.0059, 0.0132, 0.0396, 0.1321, 0.2804,
        0.5083], grad_fn=<SoftmaxBackward>)

In [578]:
# Inspect the per-slot positional embeddings (an nn.Parameter of shape (n, embed_size)).
model.pos_embedding

Parameter containing:
tensor([[ 1.5394,  0.8320, -0.2921,  ..., -1.4703,  0.7843,  1.4027],
        [-0.1804,  0.7530, -0.4761,  ...,  0.4572, -1.0601,  0.0435],
        [-0.5856,  1.2243,  0.8450,  ..., -1.1198, -0.6950, -0.4432],
        ...,
        [ 0.1839,  1.7512, -0.5384,  ..., -1.1478,  0.7921, -0.3212],
        [ 0.5457, -1.3205,  0.3002,  ...,  0.4525, -1.1747,  0.3891],
        [-0.0320, -1.1261,  0.8571,  ...,  0.1875,  0.6398, -0.0188]],
       requires_grad=True)