In [1]:
with open("data/paul_graham_essay.txt", "r") as f:
    data = f.read()

In [2]:
len(data)

75012

In [3]:
vocab = set(data)

In [4]:
"".join(vocab)

'YWgs4e.tILj;cy3K+\'ia,v zN7[0E!DuHMC6p1T\nArqR8"Vlx]$XB%Umnkâ€”-29&So/O:(f5hJwGbFPd)?'

In [5]:
len(vocab)

81

In [6]:
stoi = {char: idx for idx, char in enumerate(vocab)}
itos = {idx: char for char, idx in stoi.items()}

def encode(text):
    return [stoi[char] for char in text]

def decode(tokens):
    return "".join(itos[token] for token in tokens)

test_text = "some random string"

print(decode(encode(test_text)))

some random string


In [7]:
import torch

tokenized_data = torch.tensor(encode(data), dtype=torch.long)

In [8]:
n = int(0.9 * len(tokenized_data))
train_data = tokenized_data[:n]
test_data = tokenized_data[n:]

In [10]:
import transformer
import importlib
importlib.reload(transformer)
from transformer import Transformer

In [None]:
device = "cuda"
VOCAB_SIZE = len(vocab)
EMBED_SIZE = 384
NUM_HEADS = 8
CONTEXT_LENGTH = 16
NUM_LAYERS = 6

In [11]:
m = Transformer(vocab_size=VOCAB_SIZE, embed_size=EMBED_SIZE, num_heads=NUM_HEADS, context_length=CONTEXT_LENGTH, num_layers=NUM_LAYERS)
m.to(device)

Transformer(
  (token_embedding): Embedding(81, 384)
  (pos_embedding): Embedding(16, 384)
  (attn_blocks): ModuleList(
    (0-5): 6 x AttentionBlock(
      (attn_heads): ModuleList(
        (0-7): 8 x CausalSelfAttention(
          (Q): Linear(in_features=384, out_features=48, bias=False)
          (K): Linear(in_features=384, out_features=48, bias=False)
          (V): Linear(in_features=384, out_features=48, bias=False)
        )
      )
      (mlp): MLP(
        (fcn): Linear(in_features=384, out_features=1536, bias=True)
        (activation): ReLU()
        (proj): Linear(in_features=1536, out_features=384, bias=True)
      )
      (layer_norm_1): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      (layer_norm_2): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
    )
  )
  (lm_head): Linear(in_features=384, out_features=81, bias=False)
)

In [12]:
context_length = CONTEXT_LENGTH
batch_size = 32
unif = torch.ones(train_data.shape[0] - context_length)
unif.to(device)
def get_batch():
    batch_indices = unif.multinomial(batch_size, replacement=False)
    inputs = torch.stack([train_data[i:i+context_length] for i in batch_indices])
    targets = torch.stack([train_data[i+1:i+1+context_length] for i in batch_indices])
    inputs = inputs.to(device)
    targets = targets.to(device)
    return inputs, targets

logits, loss = m(*get_batch())

In [13]:
logits.shape

torch.Size([32, 16, 81])

In [14]:
loss

tensor(4.6083, device='cuda:0', grad_fn=<NllLoss2DBackward0>)

In [15]:
start_idx = get_batch()[0]
print(start_idx)
m.generate(start_idx, 1)

tensor([[78, 22, 75, 13, 22, 71,  5, 41, 22,  3,  7, 31, 78,  5, 56,  7],
        [ 7, 71,  5, 22, 12,  5, 18, 47, 18, 56,  2,  6, 22,  1, 71, 18],
        [ 5,  5, 56, 22, 69,  5, 19,  3, 18, 75, 47,  5, 22, 19,  7, 22],
        [56,  7, 47, 13, 22,  3, 18, 56, 12,  5, 22, 73,  5, 22,  3,  7],
        [19, 12, 57, 22, 18, 56,  7, 64, 22,  3, 71, 19, 36,  5, 20, 22],
        [78, 22,  3,  7, 41, 19, 56,  2,  5, 22, 19, 78, 21, 18, 12,  5],
        [ 7, 71, 18,  3, 22, 56,  5, 73, 22, 12, 64, 55, 36, 19, 56, 13],
        [56, 18, 23,  5, 78, 20, 22, 12,  5, 41,  7, 19, 18, 56, 47, 13],
        [18, 56,  2, 22, 47, 18, 69,  5, 22, 41, 18,  2, 71,  7,  6, 22],
        [64, 31, 47, 78, 22,  5, 21,  5, 56, 22, 41,  5, 19, 47, 18, 23],
        [22, 69,  5, 73, 22, 55, 64, 56,  7, 71,  3,  6, 22, 24, 64, 41],
        [75, 31,  7, 22,  5, 48, 36, 47, 18, 12, 18,  7, 47, 13, 22, 78],
        [56, 22,  5,  3,  3, 19, 13, 22,  8, 17, 55, 22,  2, 64, 18, 56],
        [22,  5, 56, 78, 22, 64, 69, 2

tensor([[78, 22, 75, 13, 22, 71,  5, 41, 22,  3,  7, 31, 78,  5, 56,  7, 65],
        [ 7, 71,  5, 22, 12,  5, 18, 47, 18, 56,  2,  6, 22,  1, 71, 18, 76],
        [ 5,  5, 56, 22, 69,  5, 19,  3, 18, 75, 47,  5, 22, 19,  7, 22, 34],
        [56,  7, 47, 13, 22,  3, 18, 56, 12,  5, 22, 73,  5, 22,  3,  7, 52],
        [19, 12, 57, 22, 18, 56,  7, 64, 22,  3, 71, 19, 36,  5, 20, 22, 34],
        [78, 22,  3,  7, 41, 19, 56,  2,  5, 22, 19, 78, 21, 18, 12,  5, 11],
        [ 7, 71, 18,  3, 22, 56,  5, 73, 22, 12, 64, 55, 36, 19, 56, 13, 55],
        [56, 18, 23,  5, 78, 20, 22, 12,  5, 41,  7, 19, 18, 56, 47, 13, 63],
        [18, 56,  2, 22, 47, 18, 69,  5, 22, 41, 18,  2, 71,  7,  6, 22, 13],
        [64, 31, 47, 78, 22,  5, 21,  5, 56, 22, 41,  5, 19, 47, 18, 23, 59],
        [22, 69,  5, 73, 22, 55, 64, 56,  7, 71,  3,  6, 22, 24, 64, 41, 16],
        [75, 31,  7, 22,  5, 48, 36, 47, 18, 12, 18,  7, 47, 13, 22, 78, 60],
        [56, 22,  5,  3,  3, 19, 13, 22,  8, 17, 55, 22,  2, 64,

In [16]:
decode(m.generate(start_idx, 100)[0].tolist())

"d by her studentVe$%7I8Tz$UwOf$3v$CTkCnEOH+(fUA]-UY,poaUz8/DW[CCP]4,XtC37OwyD6k9vCCIC\nmyYiCG3TK!V9OTDD[WE3Iw]jGPT'C&"

In [19]:
optim = torch.optim.AdamW(params=m.parameters(), lr=1e-4)

In [28]:
# this is not the first run
for step in range(1000):
    _, loss = m(*get_batch())
    optim.zero_grad()
    loss.backward()
    optim.step()
    if step % 100 == 0:
        print(loss.item())

1.044907569885254
1.1167418956756592
1.0756452083587646
1.0323002338409424
1.014737606048584
0.9621953964233398
1.0066713094711304
0.8820263147354126
0.9256763458251953
0.9642428159713745


In [29]:
with torch.no_grad():
    print(decode(m.generate(torch.tensor([encode("I was a")], device=device), 1000)[0].tolist()))

I was an early ram, so off maine days, to but to do) I was in retain me, but painting the that Interleaf in I was onaY was a penter of the store that sourgent now that we had to beauild about retail, so it as puccally later anything to write the student prestigious on this poitten by Bento Word: witch. At the time thinking. Witor kind of work on it on my writing Rtm to offer raisons vesters that art nurned ough something last compositions of with AI, as we should I ran ealient proximation of the right to be one dayer in New York.

Computs of new kind of some sort of be than about Lisp was hacking when I husked for the language is a book. [2]

I wanted not me to RISD, but it was not documentary to layous, so off the visual cues begied. In because that it on my model to get a mother of days of Harvalual stocks wouldn't need very engaging. So main more than intestood is a book, now I was a difficult surprise,  the started to sign but to du0. I kept that instead the channel language. We kn