<a href="https://colab.research.google.com/github/steelannelida/nanoGPT/blob/master/nanogpt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!git clone https://github.com/steelannelida/nanoGPT.git

Cloning into 'nanoGPT'...
remote: Enumerating objects: 700, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (12/12), done.[K
remote: Total 700 (delta 10), reused 10 (delta 6), pack-reused 682 (from 1)[K
Receiving objects: 100% (700/700), 1.10 MiB | 20.43 MiB/s, done.
Resolving deltas: 100% (395/395), done.


In [2]:
!pip install torch numpy transformers datasets tiktoken wandb tqdm


Collecting datasets
  Downloading datasets-3.0.2-py3-none-any.whl.metadata (20 kB)
Collecting tiktoken
  Downloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.2-py3-none-any.whl (472 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m472.7/472.7 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tiktoken-0.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none

In [3]:
!yes  | python nanoGPT/data/shakespeare/prepare.py

train has 301,966 tokens
val has 36,059 tokens


In [1]:
with open('nanoGPT/data/shakespeare/input.txt') as f:
  text = f.read()

chars = sorted(set(text))
vocab_size=len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}

def encode(t):
  return [stoi[c] for c in t]

def decode(seq):
  return ''.join([chars[i] for i in seq])

decode(encode("sandwitch"))

'sandwitch'

In [2]:
import torch

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

data = torch.tensor(encode(text), device=device)
data.shape, data.dtype

n = int(data.shape[0] * 0.9)
train_data = data[:n]
valid_data = data[n:]

In [3]:
#torch.manual_seed(1337)


def get_batch(data_set=train_data, batch_size=32, seq_length=128):
    x = torch.zeros([batch_size, seq_length], dtype=torch.int, device=device)
    y = torch.zeros([batch_size, seq_length], dtype=torch.int, device=device)
    for b in range(batch_size):
        t = torch.randint(0, data_set.shape[0] - seq_length - 1, [1])
        x[b] = data_set[t:t+seq_length]
        y[b] = data_set[t+1:t+seq_length+1]
    return x, y

x, y = get_batch()
print(decode(x[13]))
print(decode(y[13]))

l news at hand:
My bosom's lord sits lightly in his throne;
And all this day an unaccustom'd spirit
Lifts me above the ground wi
 news at hand:
My bosom's lord sits lightly in his throne;
And all this day an unaccustom'd spirit
Lifts me above the ground wit


In [7]:
import random
arith_chars = '0123456789-=*+'
arith_idx = {c:idx for idx, c in enumerate(arith_chars)}


def gen_expression(digits=5, terms=3):
  s = ''
  for i in range(terms):
    if i > 0:
      s += random.choice('*-+')
    num = ''
    for j in range(digits):
      num += random.choice('0123456789')
    num = str(int(num))
    s += num
  return s

def gen_eq():
  expr = gen_expression()
  value = eval(expr)
  return f'{expr}={value}'

gen_eq()


'70059-41345*43069=-1780617746'

In [4]:
import torch.nn as nn

class DecoderLayer(nn.Module):
  def __init__(self, embed_size=256, wide_size=1024, nheads=16, dropout=0.5):
    super().__init__()
    self.attn = nn.MultiheadAttention(embed_size, nheads,
                                      batch_first=True)
    self.ln1 = nn.LayerNorm(embed_size)
    self.ln2 = nn.LayerNorm(embed_size)
    self.lin1 = nn.Linear(embed_size, wide_size)
    self.lin2 = nn.Linear(wide_size, embed_size)
    self.drop = nn.Dropout(dropout)

  def forward(self, x, mask):
    x = self.ln1(x)
    a, w = self.attn.forward(x, x, x, attn_mask=mask, is_causal=True)
    x = x + self.drop(a)
    x = self.ln2(x)
    xx = self.lin1(x)
    xx = nn.functional.gelu(xx)
    y = self.lin2(xx)
    return x + y



class LM(nn.Module):
  def __init__(self, vocab_size, embed_size=256, nheads=16, max_pos=2048, num_layers=3, dropout=0.5):
    super().__init__()
    self.embeddings = nn.Embedding(vocab_size, embed_size)
    self.pos_embeddings = nn.Embedding(max_pos, embed_size)
    self.layers = [
        DecoderLayer(embed_size=embed_size, wide_size=4 * embed_size,
                     nheads=nheads, dropout=dropout)
        for _ in range(num_layers)
    ]
    for i, layer in enumerate(self.layers):
      self.add_module(f'decoder-{i}', layer)
    self.ln = nn.LayerNorm(embed_size)
    self.out = nn.Linear(embed_size, vocab_size)
    self.max_pos = max_pos
    self.to(device)

  def forward(self, idx):
    idx = torch.tensor(idx, device=device)
    l = idx.shape[-1]
    pe = self.pos_embeddings(torch.arange(0, l, device=device))
    e = self.embeddings(idx)
    e = e + pe.view(1, *pe.shape)
    mask = ~torch.tril(torch.ones([l, l], dtype=torch.bool, device=device))
    #print(mask)
    #print(w)
    for layer in self.layers:
      e = layer(e, mask)
    logits = self.out.forward(self.ln(e))
    return logits

  def generate(self, prompt, l):
    prompt = torch.tensor(prompt)
    pl = prompt.shape[0]
    result = torch.zeros([pl + l], dtype=torch.int, device=device)
    result[:pl] = prompt
    for i in range(l):
      logits = self.forward(result[:pl + i])
      sm = logits[:,-1].flatten().softmax(0)
      next_idx = torch.multinomial(sm, 1)
      result[i+pl] = next_idx
    return result

model = LM(vocab_size)

k=x
q=x[:,:-1]
model.forward(encode('jello'))
decode(model.generate(encode('hello'), 10))

  idx = torch.tensor(idx, device=device)


'hellomblgYonD!M'

In [5]:
x, y = get_batch()
logits = model(x)
loss_fun = nn.CrossEntropyLoss()
loss = loss_fun(logits.permute(0, 2, 1), y.long())
loss

  idx = torch.tensor(idx, device=device)


tensor(4.2949, device='cuda:0', grad_fn=<NllLoss2DBackward0>)

In [18]:
torch.manual_seed(1337)

model = LM(vocab_size, num_layers=8)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3)
xv, yv = get_batch(valid_data, batch_size=256)
scaler = torch.cuda.amp.GradScaler()

  scaler = torch.cuda.amp.GradScaler()


In [22]:
torch.set_float32_matmul_precision('medium')
with torch.profiler.profile(with_stack=True, profile_memory=True) as prof:
  for step in range(10):
    x, y = get_batch(batch_size=32, seq_length=128)
    with torch.autocast(device_type='cuda', dtype=torch.float16):
      model.train()
      logits = model.forward(x)
      loss = loss_fun(logits.permute(0, 2, 1), y.long())

    #scaler.scale(loss).backward()
    #scaler.step(opt)
    #scaler.update()
    loss.backward()
    opt.step()
    opt.zero_grad()

    if step % 10 == 0:
      with torch.no_grad():
        with torch.autocast(device_type='cuda', dtype=torch.float16):
          model.eval()
          vlogits = model.forward(xv)
          vloss = loss_fun(vlogits.permute(0,2,1), yv.long())
        print('%d\t%f\t%f'%(step, loss, vloss))


  idx = torch.tensor(idx, device=device)


0	2.124569	2.153862
10	2.075121	2.118199
20	2.077763	2.122209
30	2.019752	2.102204
40	2.061837	2.096250
50	2.008243	2.091561
60	2.057459	2.079584
70	2.045761	2.063578
80	2.027742	2.054062
90	1.969361	2.037554


In [27]:
print(prof.export_chrome_trace('trace1.json'))


None


In [28]:
!ls -lh trace1.json

-rw-r--r-- 1 root root 524M Oct 31 14:35 trace1.json


In [9]:
with torch.no_grad():
  with torch.autocast(device_type='cuda', dtype=torch.float16):
    model.eval()
    print(decode(model.generate(encode('To dream'), 256)))

  idx = torch.tensor(idx, device=device)


To dream revenge with my devil land Montagued
When are here this living ere such wears escape
For shall then place in thine own: prucacrumelde, fowese din.
Wathe, hese thesers allay se the Juse bade, wbur rbear fo iethe warrurie mino ambon adld
Nes sene rine, ouse


In [None]:
import matplotlib.pyplot as plt

plt.imshow(model.embeddings.weight.detach().cpu().numpy())
# plt.plot(model.pos_embeddings.weight[:,37].detach().cpu().numpy())
# plt.plot(model.pos_embeddings.weight[:,4].detach().cpu().numpy())


In [None]:
from sklearn.manifold import TSNE

tsne = TSNE()
y = tsne.fit_transform(model.out.weight.detach().cpu().numpy())

plt.scatter(y[:,0], y[:,1], s=1, alpha=0.1)
for i, c in enumerate(chars):
  plt.text(y[i,0], y[i,1], c)

In [None]:
model.out.weight.shape