In [2]:
# torchinfo provides the `summary` helper imported in the next cell and used to inspect the model.
%pip install torchinfo

Note: you may need to restart the kernel to use updated packages.


In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary

In [4]:
# download file to tmp/data.txt
# `wget -O` does not create missing parent directories, so make sure tmp/ exists first.
!mkdir -p tmp
# -nv keeps errors and a one-line result but drops the per-chunk progress log.
!wget -nv -O tmp/data.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-09-27 20:53:16--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'tmp/data.txt'

     0K .......... .......... .......... .......... ..........  4% 5.01M 0s
    50K .......... .......... .......... .......... ..........  9% 17.4M 0s
   100K .......... .......... .......... .......... .......... 13% 8.23M 0s
   150K .......... .......... .......... .......... .......... 18% 13.8M 0s
   200K .......... .......... .......... .......... .......... 22% 41.1M 0s
   250K .......... .......... .......... .......... .......... 27% 19.9M 0s
   300K .......... .......... .......... .......... .......... 32% 25.9M 0s
   350K ..........

In [5]:
# Read the whole corpus into memory as one string.
# The encoding is pinned so decoding does not depend on the platform's locale default.
with open('tmp/data.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print('text length:', len(text))

text length: 1115394


In [6]:
# Preview the first 100 characters of the corpus.
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [7]:
# Vocabulary: a "[PAD]" entry at index 0, followed by every distinct
# character of the corpus in sorted order.
chars = ["[PAD]"] + sorted(set(text))
vocab_size = len(chars)
print("".join(chars))
print("vocab size:", vocab_size)

[PAD]
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 66


In [8]:
# Lookup tables between vocabulary entries and their integer ids
# (an id is simply the entry's index in `chars`).
stoi = dict(zip(chars, range(len(chars))))
itos = dict(enumerate(chars))


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[i] for i in l)


print(encode("hii there"))
print(decode(encode("hii there")))

[47, 48, 48, 2, 59, 47, 44, 57, 44]
hii there


In [9]:
# Encode the entire corpus and store the ids in a 1-D int64 tensor.
data = torch.tensor([stoi[c] for c in text], dtype=torch.long)
print(data.shape, data.dtype)
# The 100 characters we looked at earlier, as the model will see them.
print(data[:100])

torch.Size([1115394]) torch.int64
tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44, 53, 11,  1, 15, 44, 45,
        54, 57, 44,  2, 62, 44,  2, 55, 57, 54, 42, 44, 44, 43,  2, 40, 53, 64,
         2, 45, 60, 57, 59, 47, 44, 57,  7,  2, 47, 44, 40, 57,  2, 52, 44,  2,
        58, 55, 44, 40, 50,  9,  1,  1, 14, 51, 51, 11,  1, 32, 55, 44, 40, 50,
         7,  2, 58, 55, 44, 40, 50,  9,  1,  1, 19, 48, 57, 58, 59,  2, 16, 48,
        59, 48, 65, 44, 53, 11,  1, 38, 54, 60])


In [10]:
# Hold out the tail of the corpus for validation:
# the first 90% of the tokens are used for training, the remainder for validation.
n = int(len(data) * 0.9)
train_data, val_data = data[:n], data[n:]

In [11]:
# Context length used for the illustration in the next cell only;
# `block_size` is reassigned to 10 in the batching cell further down.
block_size = 100
# One training chunk: block_size inputs plus one extra token so every input has a target.
train_data[:block_size+1]

tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44, 53, 11,  1, 15, 44, 45,
        54, 57, 44,  2, 62, 44,  2, 55, 57, 54, 42, 44, 44, 43,  2, 40, 53, 64,
         2, 45, 60, 57, 59, 47, 44, 57,  7,  2, 47, 44, 40, 57,  2, 52, 44,  2,
        58, 55, 44, 40, 50,  9,  1,  1, 14, 51, 51, 11,  1, 32, 55, 44, 40, 50,
         7,  2, 58, 55, 44, 40, 50,  9,  1,  1, 19, 48, 57, 58, 59,  2, 16, 48,
        59, 48, 65, 44, 53, 11,  1, 38, 54, 60,  2])

In [12]:
# Show every (context, target) pair contained in a single chunk:
# the target for the context a[:t+1] is the token that follows it, y[t].
a = train_data[:block_size]
y = train_data[1:block_size+1]
for t, target in enumerate(y):
    context = a[:t+1]
    print(f"when input is {context} the target: {target}")

when input is tensor([19]) the target: 48
when input is tensor([19, 48]) the target: 57
when input is tensor([19, 48, 57]) the target: 58
when input is tensor([19, 48, 57, 58]) the target: 59
when input is tensor([19, 48, 57, 58, 59]) the target: 2
when input is tensor([19, 48, 57, 58, 59,  2]) the target: 16
when input is tensor([19, 48, 57, 58, 59,  2, 16]) the target: 48
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48]) the target: 59
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48, 59]) the target: 48
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48]) the target: 65
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65]) the target: 44
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44]) the target: 53
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44, 53]) the target: 11
when input is tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44, 53, 11]) the target: 1
when input is tensor([19, 48, 57, 58, 59,  2, 16, 

In [13]:
torch.manual_seed(1337)
batch_size = 32  # number of independent sequences processed in parallel
block_size = 10  # maximum context length for predictions


def get_batch(split):
    """Sample a random batch of inputs x and next-token targets y.

    `split` selects the training set when it equals 'train' and the
    validation set otherwise. Both returned tensors have shape
    (batch_size, block_size); y is x shifted one position to the right.
    """
    source = val_data if split != 'train' else train_data
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs, targets = [], []
    for i in starts:
        inputs.append(source[i:i + block_size])
        targets.append(source[i + 1:i + block_size + 1])
    return torch.stack(inputs), torch.stack(targets)


xb, yb = get_batch('train')
print('inputs:', xb.shape, sep='\n')
print('targets:', yb.shape, sep='\n')

inputs:
torch.Size([32, 10])
targets:
torch.Size([32, 10])


In [14]:
class NGramLanguageModel(nn.Module):
    """Character-level n-gram language model with a learned context mixture.

    For every position t the model looks at a window of the `n` most recent
    token ids (positions t-n+1 .. t, left-padded with id 0 where the sequence
    is shorter). Each token in the window is embedded, a learned per-slot
    positional vector is added, and the `n` vectors are averaged with
    softmax-normalised learned weights (`adding_weight`). The mixed vector
    goes through a two-layer MLP that produces next-token logits.

    Args:
        vocab_size: number of distinct token ids; id 0 is the padding id.
        n: window length (number of most recent tokens the model sees).
        default_batch_size: batch size used when the model is asked to start
            from scratch (`x=None`) at inference time.
    """

    def __init__(self, vocab_size, n, default_batch_size=32):
        super().__init__()
        self.n = n
        self.default_batch_size = default_batch_size
        embed_size = vocab_size + 10 * n
        self.token_embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        # One learned vector per window slot; slot n-1 is the current token.
        self.pos_embedding = nn.Parameter(torch.randn((n, embed_size)))
        # Pre-softmax mixing weights over the n window slots (uniform at init).
        self.adding_weight = nn.Parameter(torch.zeros(n))
        self.fc = nn.Linear(embed_size, 200)
        self.relu = nn.ReLU()
        self.final = nn.Linear(200, vocab_size)

    def _context_windows(self, x):
        """Return, for each position, the window of the last `n` token ids.

        Input is (batch, time); output is (batch, time, n), on the same
        device as `x`. Slots that would reach before the start of the
        sequence hold the padding id 0.
        """
        # Left-pad with n-1 padding ids so every position has a full window.
        padded = F.pad(x, (self.n - 1, 0), value=0)
        starts = torch.arange(x.shape[1], device=x.device).unsqueeze(1)  # (time, 1)
        offsets = torch.arange(self.n, device=x.device).unsqueeze(0)  # (1, n)
        # padded[:, t + k] for k in 0..n-1 is exactly x[:, t-n+1 .. t].
        return padded[:, starts + offsets]

    def _start_tokens(self, x):
        """Turn `None` or an int batch size into a (batch, 1) tensor of padding ids.

        Tensors are returned unchanged.
        """
        if x is None:
            x = self.default_batch_size
        if isinstance(x, int):
            x = torch.zeros(x, 1, dtype=torch.long, device=self.pos_embedding.device)
        return x

    # Create separate function for forward calculation
    def forward_calc(self, x):
        """Compute next-token logits of shape (batch, time, vocab_size).

        `x` must be a 2-D tensor of integer token ids, (batch, time).
        """
        assert len(x.shape) == 2, "input shape should be (batch, time)"
        windows = self._context_windows(x)  # (batch, time, n)
        x = torch.add(self.token_embedding(windows), self.pos_embedding)  # (batch, time, n, embed)
        # Weighted average over the n window slots -> (batch, time, embed).
        x = F.softmax(self.adding_weight, -1) @ x
        x = self.fc(x)
        x = self.relu(x)
        x = self.final(x)
        return x

    def forward(self, x, targets=None):
        """Run the model in inference or training mode.

        Without `targets` (inference): `x` may be a (batch, time) tensor, an
        int batch size, or None (use `default_batch_size`); the latter two
        start from a single padding token per sequence. Returns
        `(logits, None)` with logits of shape (batch, time, vocab_size).

        With `targets` (training): returns `(logits_flat, loss)` where
        `logits_flat` has shape (batch * time, vocab_size) and `loss` is the
        mean cross-entropy against `targets`.
        """
        if targets is None:
            return self.forward_calc(self._start_tokens(x)), None

        logits = self.forward_calc(x)
        B, T, C = logits.shape  # batch, time, channel
        logits_flat = logits.view(B * T, C)
        # reshape (not view) so non-contiguous target tensors are accepted too.
        loss = F.cross_entropy(logits_flat, targets.reshape(B * T))
        return logits_flat, loss

    @torch.no_grad()
    def generate(self, x, max_len_new, temperature=1.0):
        """Autoregressively sample `max_len_new` tokens after the prompt `x`.

        `x` is a (batch, time) tensor of token ids, an int batch size, or
        None (see `forward`). Returns the prompt with the sampled tokens
        appended, shape (batch, time + max_len_new).

        Raises:
            ValueError: if `temperature` is not strictly positive.
        """
        if temperature <= 0:
            raise ValueError("temperature must be positive")
        x = self._start_tokens(x)
        for _ in range(max_len_new):
            # Only the last n tokens can influence the next-token logits, so
            # there is no need to re-run the model on the whole sequence.
            logits, _loss = self(x[:, -self.n:])
            logits_dist = logits[:, -1] / temperature
            probs = F.softmax(logits_dist, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            x = torch.cat([x, next_token], dim=1)

        return x

In [15]:
# Build a model that looks at the 10 most recent tokens, then print a
# per-layer summary for a dummy batch of integer token ids.
model = NGramLanguageModel(vocab_size, n=10)
summary(
    model,
    input_size=(32, 10),
    dtypes=[torch.long],
    verbose=2,
);

Layer (type:depth-idx)                   Output Shape              Param #
NGramLanguageModel                       [32, 10, 66]              1,670
├─pos_embedding                                                    ├─1,660
├─adding_weight                                                    └─10
├─Embedding: 1-1                         [32, 10, 10, 166]         10,956
│    └─weight                                                      └─10,956
├─Linear: 1-2                            [32, 10, 200]             33,400
│    └─weight                                                      ├─33,200
│    └─bias                                                        └─200
├─ReLU: 1-3                              [32, 10, 200]             --
├─Linear: 1-4                            [32, 10, 66]              13,266
│    └─weight                                                      ├─13,200
│    └─bias                                                        └─66
Total params: 59,292
Trainable params: 5

In [16]:
# Smoke test of the untrained model on the batch sampled earlier.
print(model)
# With targets supplied, the model returns logits flattened to (batch * time, vocab_size)
# together with the cross-entropy loss.
logits, loss = model(xb, yb)
print('logits:', logits.shape)
print('loss:', loss)

# Sample 10 new tokens after each prompt in the batch and decode the first sequence.
print(decode(model.generate(xb, 10)[0].tolist()))

NGramLanguageModel(
  (token_embedding): Embedding(66, 166, padding_idx=0)
  (fc): Linear(in_features=166, out_features=200, bias=True)
  (relu): ReLU()
  (final): Linear(in_features=200, out_features=66, bias=True)
)
logits: torch.Size([320, 66])
loss: tensor(4.1999, grad_fn=<NllLossBackward>)
rch o' the$;
Cg'vJ$-


In [17]:
# Train the model with AdamW on randomly sampled training batches.
optimizer = optim.AdamW(model.parameters(), lr=0.004)
batch_size = 32
for step in range(10001):
    # Clear gradients left over from the previous step before the new backward pass.
    optimizer.zero_grad()
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    loss.backward()
    optimizer.step()
    # Report the loss of the current batch every 500 steps.
    if not step % 500:
        print(f'step: {step}, loss: {loss.item():.3f}')

step: 0, loss: 4.201
step: 500, loss: 2.498
step: 1000, loss: 2.048
step: 1500, loss: 2.114
step: 2000, loss: 2.080
step: 2500, loss: 2.035
step: 3000, loss: 1.909
step: 3500, loss: 2.089
step: 4000, loss: 1.997
step: 4500, loss: 1.930
step: 5000, loss: 2.008
step: 5500, loss: 2.021
step: 6000, loss: 1.982
step: 6500, loss: 1.802
step: 7000, loss: 2.079
step: 7500, loss: 2.018
step: 8000, loss: 1.901
step: 8500, loss: 1.960
step: 9000, loss: 1.924
step: 9500, loss: 1.896
step: 10000, loss: 1.800


In [18]:
# Sample five 200-token continuations of the same prompt at temperature 0.9.
for x in model.generate(
    torch.tensor([encode("LUCENT") for _ in range(5)]),
    max_len_new=200,
    temperature=0.9,
):
    print(decode(x.tolist()), '----', sep='\n')

LUCENTIO:
Which be sition be to not his se won, I hwill there brothis,
No Edwickepakes will if herave lit, I unce frimfor, whereas upon honoursespercombly grain me fore, brame band grier much hour live true
----
LUCENTIONENIUS:
I hat to man your cilt much and sour I his of his andubleful, here such salour cal I strue, thous to his for that prexcetstaiong hours.

HORMIONE Edwerishe him
Tis
Thou and a puporacef, the 
----
LUCENTIONE:
I ward, goontly fivoly donencles,
Rearthou fforer.

FLORD III:
Tha was that doo fresce, lay, your wilt, and the conto more, wheretring se'ees must hore no me the pries youry mamen thous but whis
----
LUCENTIO:
Pricannot me, sill what is carhe with the spering him. and say sil spe of theme
And-nurs man shat majughty hat o heads thath and dot of enjurespe.

DUKE PEDITHARD III:
Sestannot ith the dall epock
----
LUCENTIO:
Come is nis the bes to sancon,
woulder suse nang dill this spreate trume of or the four what make deaty that my herous there seat then their R

In [19]:
# Learned mixing weights over the context-window slots, normalised to sum to 1.
F.softmax(model.adding_weight, dim=-1)

tensor([0.0084, 0.0054, 0.0048, 0.0052, 0.0068, 0.0144, 0.0429, 0.1393, 0.2806,
        0.4921], grad_fn=<SoftmaxBackward>)

In [20]:
# Inspect the learned per-slot positional vectors (one row per context-window slot).
model.pos_embedding

Parameter containing:
tensor([[ 1.1868, -0.5051,  0.3174,  ..., -0.6831, -0.0644, -0.9378],
        [ 0.4732, -0.8354, -0.1158,  ...,  0.3992, -0.6814, -0.3946],
        [ 0.3872, -0.6577,  0.8098,  ...,  1.3233, -0.4416, -0.4947],
        ...,
        [-0.5210,  0.3679,  0.2710,  ...,  1.4388,  0.8832,  0.1417],
        [ 0.4946,  1.1246, -0.5485,  ...,  0.0711,  0.1944,  0.7019],
        [-0.2717, -0.3539,  0.1141,  ..., -1.1909,  0.2755,  0.7620]],
       requires_grad=True)