In [1]:
# Install torchinfo into the kernel's environment; the import cell below pulls `summary` from it.
%pip install torchinfo

Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchinfo import summary

In [3]:
# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [4]:
# download file to tmp/data.txt
!wget -O tmp/data.txt https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-09-27 22:44:28--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: 'tmp/data.txt'

     0K .......... .......... .......... .......... ..........  4% 3.80M 0s
    50K .......... .......... .......... .......... ..........  9% 20.6M 0s
   100K .......... .......... .......... .......... .......... 13% 7.33M 0s
   150K .......... .......... .......... .......... .......... 18% 28.0M 0s
   200K .......... .......... .......... .......... .......... 22% 23.9M 0s
   250K .......... .......... .......... .......... .......... 27% 17.7M 0s
   300K .......... .......... .......... .......... .......... 32% 23.8M 0s
   350K ..........

In [5]:
# Read the whole corpus into memory as a single string.
# The encoding is spelled out so the result does not depend on the platform's
# default text encoding.
with open('tmp/data.txt', 'r', encoding='utf-8') as f:
    text = f.read()
print('text length:', len(text))

text length: 1115394


In [6]:
# Peek at the first 100 characters of the corpus.
print(text[:100])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [7]:
# Vocabulary: every distinct character of the corpus in sorted order,
# with the "[PAD]" token placed in front so that it gets index 0.
chars = ["[PAD]"] + sorted(set(text))
vocab_size = len(chars)
print("".join(chars))
print("vocab size:", vocab_size)

[PAD]
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
vocab size: 66


In [8]:
# Lookup tables between vocabulary entries and their integer ids.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Encoder: take a string, output a list of integers."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string."""
    return ''.join(itos[i] for i in l)


print(encode("hii there"))
print(decode(encode("hii there")))

[47, 48, 48, 2, 59, 47, 44, 57, 44]
hii there


In [9]:
# Encode the whole corpus and store it as a single int64 tensor on `device`
data = torch.tensor(encode(text), dtype=torch.int64, device=device)
print(data.shape, data.dtype)
print(data[:100]) # the 100 characters we looked at earlier, as the model will see them

torch.Size([1115394]) torch.int64
tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44, 53, 11,  1, 15, 44, 45,
        54, 57, 44,  2, 62, 44,  2, 55, 57, 54, 42, 44, 44, 43,  2, 40, 53, 64,
         2, 45, 60, 57, 59, 47, 44, 57,  7,  2, 47, 44, 40, 57,  2, 52, 44,  2,
        58, 55, 44, 40, 50,  9,  1,  1, 14, 51, 51, 11,  1, 32, 55, 44, 40, 50,
         7,  2, 58, 55, 44, 40, 50,  9,  1,  1, 19, 48, 57, 58, 59,  2, 16, 48,
        59, 48, 65, 44, 53, 11,  1, 38, 54, 60], device='cuda:0')


In [10]:
# Hold out the tail of the corpus for validation:
# the first 90% of the tokens is used for training, the remaining 10% for validation.
n = int(0.9 * data.shape[0])
train_data, val_data = data[:n], data[n:]

In [11]:
# Context length used for the illustration in this cell and the next one.
# NOTE: block_size is reassigned to 10 in the batching cell before any batch is sampled.
block_size = 100
# One window of block_size + 1 tokens: enough for block_size inputs plus their next-token targets.
train_data[:block_size+1]

tensor([19, 48, 57, 58, 59,  2, 16, 48, 59, 48, 65, 44, 53, 11,  1, 15, 44, 45,
        54, 57, 44,  2, 62, 44,  2, 55, 57, 54, 42, 44, 44, 43,  2, 40, 53, 64,
         2, 45, 60, 57, 59, 47, 44, 57,  7,  2, 47, 44, 40, 57,  2, 52, 44,  2,
        58, 55, 44, 40, 50,  9,  1,  1, 14, 51, 51, 11,  1, 32, 55, 44, 40, 50,
         7,  2, 58, 55, 44, 40, 50,  9,  1,  1, 19, 48, 57, 58, 59,  2, 16, 48,
        59, 48, 65, 44, 53, 11,  1, 38, 54, 60,  2], device='cuda:0')

In [12]:
# Input/target pair for a single window: `y` is the same slice shifted one token
# to the right, i.e. the next token for every input position.
# NOTE(review): neither `a` nor `y` is referenced again in the visible notebook;
# `a` may have been meant to be `x` -- confirm.
a = train_data[:block_size]
y = train_data[1:block_size+1]

In [13]:
torch.manual_seed(1337)
batch_size = 32  # number of independent sequences processed in parallel
block_size = 10  # maximum context length for predictions


def get_batch(split):
    """Sample a random mini-batch of input windows and next-token targets.

    Reads the notebook-level `batch_size` and `block_size` at call time.

    Args:
        split: 'train' samples from `train_data`; any other value samples
            from `val_data`.

    Returns:
        A pair `(x, y)` of tensors of shape (batch_size, block_size), where
        `y` holds the tokens that follow each position of `x`.
    """
    source = train_data if split == 'train' else val_data
    # Random window start offsets, drawn so that the shifted target window still fits.
    starts = torch.randint(len(source) - block_size, (batch_size,)).tolist()
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs, targets


xb, yb = get_batch('train')
print('inputs:')
print(xb.shape, xb.dtype, xb.device)
print('targets:')
print(yb.shape, yb.dtype, yb.device)

inputs:
torch.Size([32, 10]) torch.int64 cuda:0
targets:
torch.Size([32, 10]) torch.int64 cuda:0


In [14]:
class NGramLanguageModel(nn.Module):
    """Neural n-gram language model over token ids.

    For every position the model looks at the current token and the `n - 1`
    tokens before it (left-padded with token id 0, the embedding's
    `padding_idx`). Each token of that window is embedded, a learned
    per-slot position embedding is added, the `n` slots are mixed with
    softmax-normalised learned weights, and the result goes through a
    two-layer MLP that produces logits over the vocabulary.

    Args:
        vocab_size: number of token ids; id 0 is treated as padding.
        n: size of the context window (current token included).
    """

    def __init__(self, vocab_size, n):
        super().__init__()
        # No device handling here: the module has no parameters yet at this point,
        # so callers place the model with `model.to(...)` after construction.
        self.n = n
        embed_size = vocab_size + 10 * n
        self.token_embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.pos_embedding = nn.Parameter(torch.randn(n, embed_size))
        # Zeros give a uniform softmax, i.e. every window slot starts with equal weight.
        self.adding_weight = nn.Parameter(torch.zeros(n))
        self.fc = nn.Linear(embed_size, 200)
        self.relu = nn.ReLU()
        self.final = nn.Linear(200, vocab_size)

    def forward(self, x):
        """Compute next-token logits for every position.

        Args:
            x: integer tensor of token ids with shape (batch, time).

        Returns:
            Float tensor of logits with shape (batch, time, vocab_size).
            Position `t` only depends on `x[:, t - n + 1 : t + 1]`.

        Raises:
            AssertionError: if `x` is not 2-dimensional.
        """
        assert len(x.shape) == 2, "input shape should be (batch, time)"
        # Work on the model's own device with int64 ids, whatever the caller passed in.
        model_device = self.pos_embedding.device
        x = x.to(device=model_device, dtype=torch.int64)

        # Left-pad with the padding id so the first positions also get a full window,
        # then gather all windows at once: windows[b, t, j] == padded[b, t + j].
        padded = F.pad(x, (self.n - 1, 0), value=0)
        starts = torch.arange(x.shape[1], device=model_device)
        offsets = torch.arange(self.n, device=model_device)
        windows = padded[:, starts[:, None] + offsets]  # (batch, time, n)

        x = self.token_embedding(windows) + self.pos_embedding  # (batch, time, n, embed)
        # Weighted sum over the n window slots -> (batch, time, embed)
        x = F.softmax(self.adding_weight, -1) @ x
        x = self.fc(x)
        x = self.relu(x)
        x = self.final(x)
        return x

    def loss(self, logits, targets):
        """Mean cross-entropy between logits and next-token targets.

        Args:
            logits: tensor of shape (batch, time, vocab_size).
            targets: integer tensor of shape (batch, time).

        Returns:
            Scalar loss tensor.
        """
        B, T, C = logits.shape
        # reshape (rather than view) also accepts non-contiguous inputs
        return F.cross_entropy(logits.reshape(B * T, C), targets.reshape(B * T))

    @torch.no_grad()
    def generate(self, x, max_len_new, temperature=1.0):
        """Autoregressively sample `max_len_new` tokens after the prompt `x`.

        Args:
            x: integer tensor of prompt token ids with shape (batch, time).
            max_len_new: number of tokens to append to each sequence.
            temperature: positive divisor applied to the logits before the
                softmax; values below 1 sharpen the distribution.

        Returns:
            Integer tensor of shape (batch, time + max_len_new) holding the
            prompt followed by the sampled tokens.
        """
        for _ in range(max_len_new):
            # Only the last n tokens can influence the next-token logits, so the
            # model is not re-run on the whole, ever-growing sequence.
            context = x[:, -self.n:]
            logits = self(context)[:, -1] / temperature
            probs = F.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
            x = torch.cat([x, next_token], dim=1)

        return x

In [15]:
# Build a model with a 7-token context window and print a per-layer summary
# for a (32, 10) batch of int64 token ids.
model = NGramLanguageModel(vocab_size, 7)
summary(model, input_size=(32, 10), dtypes=[torch.long], verbose=2, device=device)
# Move the model to `device`; as the cell's last expression, the returned module is displayed.
model.to(device)

Layer (type:depth-idx)                   Output Shape              Param #
NGramLanguageModel                       [32, 10, 66]              959
├─pos_embedding                                                    ├─952
├─adding_weight                                                    └─7
├─Embedding: 1-1                         [32, 10, 7, 136]          8,976
│    └─weight                                                      └─8,976
├─Linear: 1-2                            [32, 10, 200]             27,400
│    └─weight                                                      ├─27,200
│    └─bias                                                        └─200
├─ReLU: 1-3                              [32, 10, 200]             --
├─Linear: 1-4                            [32, 10, 66]              13,266
│    └─weight                                                      ├─13,200
│    └─bias                                                        └─66
Total params: 50,601
Trainable params: 50,601
N

NGramLanguageModel(
  (token_embedding): Embedding(66, 136, padding_idx=0)
  (fc): Linear(in_features=136, out_features=200, bias=True)
  (relu): ReLU()
  (final): Linear(in_features=200, out_features=66, bias=True)
)

In [16]:
print(model)
# Sanity check before training: forward pass and loss on the sample batch
logits = model(xb)
loss = model.loss(logits, yb)
print('logits:', logits.shape)
print('loss:', loss)

# Sample 10 new tokens after every sequence in the batch and decode the first one
print(decode(model.generate(xb, 10)[0].tolist()))

NGramLanguageModel(
  (token_embedding): Embedding(66, 136, padding_idx=0)
  (fc): Linear(in_features=136, out_features=200, bias=True)
  (relu): ReLU()
  (final): Linear(in_features=200, out_features=66, bias=True)
)
logits: torch.Size([32, 10, 66])
loss: tensor(4.1809, device='cuda:0', grad_fn=<NllLossBackward0>)
rch o' the;peywaCjQY


In [17]:
# Train with AdamW on random mini-batches, reporting the batch loss every 50 steps.
optimizer = optim.AdamW(model.parameters(), lr=0.004)
batch_size = 256  # get_batch reads this global, so batches are larger from here on
for step in range(1001):
    xb, yb = get_batch('train')
    optimizer.zero_grad()
    logits = model(xb)
    loss = model.loss(logits, yb)
    loss.backward()
    optimizer.step()
    if not step % 50:
        print(f'step: {step}, loss: {loss.item():.3f}')

step: 0, loss: 4.174
step: 50, loss: 2.824
step: 100, loss: 2.578
step: 150, loss: 2.409
step: 200, loss: 2.264
step: 250, loss: 2.243
step: 300, loss: 2.164
step: 350, loss: 2.170
step: 400, loss: 2.107
step: 450, loss: 2.121
step: 500, loss: 2.060
step: 550, loss: 2.029
step: 600, loss: 2.051
step: 650, loss: 2.034
step: 700, loss: 2.042
step: 750, loss: 2.040
step: 800, loss: 2.003
step: 850, loss: 1.976
step: 900, loss: 1.920
step: 950, loss: 1.932
step: 1000, loss: 1.984


In [23]:
# Sample five continuations of 200 new tokens each for the prompt "LUCENT" at temperature 0.9.
prompt = torch.tensor([encode("LUCENT")] * 5, device=device)
for sample in model.generate(prompt, 200, 0.9):
    print(decode(sample.tolist()))
    print('----')

LUCENTIONTERD IVER:
Prove the beenuse o to arhank.

AUS:
Wheret in his the their fall the lost fring hor fot word and deis,
And hear for willif and.

ANGERITONEN LINA:
We pertion.

CKING EDWARD III:
The see
----
LUCENTIO:
's this
cI conde therence hat onor Penobles
Your geand Carnise fear whose good mans and me Clarefoly band he's butir;
And the thy im ble arsecusbe you, un comoners wiced not thou lcus he courgh;
A
----
LUCENTES:
Wher bones noord, and the vers of him you shot. Han thou we thou come; you she im'st, bace was if that with and us in heesatine, shat be corly worduen: me sould the the of unace
det roves gin to h
----
LUCENTER:
Andiedr him nirn bece:
'Ttian ther! Gody geme know, hom onoter wifine the it and sort', and mer his londe?

QUEENCUS:
Verre as an whing on him then weatid: on gre be
And to mobe!

Leard live he di
----
LUCENTES:
But our fle, 'eat his ding.

KINGHARD IV:
It I drown.
Whith worcit: will odinfest thisce;
As Resuming cout your he mor loved it the come of to

In [27]:
# A very small temperature sharpens the softmax so much that sampling is
# practically greedy; show the raw token ids of 10 new tokens after "hello".
model.generate(torch.tensor([encode("hello")], device=device), 10, 0.0001)

tensor([[47, 44, 51, 51, 54, 62,  2, 59, 47, 44,  2, 59, 47, 44,  2]],
       device='cuda:0')

In [19]:
# Mixing weights over the n context slots, normalised the same way the model's forward pass does.
F.softmax(model.adding_weight, -1)

tensor([0.0218, 0.0231, 0.0289, 0.0437, 0.1194, 0.2628, 0.5004],
       device='cuda:0', grad_fn=<SoftmaxBackward0>)

In [29]:
# save model
import os, json

# Make sure the output directory exists before writing into it
os.makedirs("tmp/ngram", exist_ok=True)

# Store the vocabulary and the context size n alongside the exported graph
with open("tmp/ngram/config.json", "w") as f:
    json.dump({"chars": chars, "vocab_size": vocab_size, "n": model.n}, f)

# Export to ONNX using a dummy int32 input of shape (1, 10); the batch and time
# axes of both input and output are declared dynamic.
# NOTE(review): verify that the exported graph really accepts sequence lengths
# other than 10 -- TODO confirm.
torch.onnx.export(
    model,
    torch.zeros(1, 10, dtype=torch.int32, device=device),
    "tmp/ngram/model.onnx",
    input_names=["input"],
    output_names=["output"],
    dynamic_axes={
        "input": {0: "batch", 1: "time"},
        "output": {0: "batch", 1: "time"},
    },
)