In [1]:
!mkdir data
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
!mv input.txt data/shakespeare.txt
!pip install git+https://github.com/shuiruge/nanogpt.git

--2024-03-16 06:54:50--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2024-03-16 06:54:50 (135 MB/s) - ‘input.txt’ saved [1115394/1115394]

Collecting git+https://github.com/shuiruge/nanogpt.git
  Cloning https://github.com/shuiruge/nanogpt.git to /tmp/pip-req-build-351mjjhc
  Running command git clone --filter=blob:none --quiet https://github.com/shuiruge/nanogpt.git /tmp/pip-req-build-351mjjhc
  Resolved https://github.com/shuiruge/nanogpt.git to commit 8d4b2125238af07de365540828b6ea98397db953
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: nanogpt
  Building wheel for

In [2]:
import numpy as np
import tensorflow as tf
from keras.layers import Layer, Dense
from keras.models import Model
from keras.optimizers import AdamW
from keras.losses import SparseCategoricalCrossentropy
from keras.utils import Progbar
from nanogpt.utils import CharacterTokenizer, Embed, ResBlockWrapper, FeedForward, attention, MultiHeadWrapper

In [3]:
with open('data/shakespeare.txt', 'r') as f:
    corpus = ''.join(f.readlines())
print(corpus[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [4]:
tokenizer = CharacterTokenizer(corpus)
tokenizer.vocab_size

65

In [5]:
seq_len = 100

In [None]:
# import pickle

# with open('tmp.pkl', 'rb') as f:
#     contexts, targets = pickle.load(f)

In [6]:
n_data = len(corpus) - seq_len
contexts = []
targets = []
for i in range(n_data):
    context = tokenizer.encode(corpus[i:(i+seq_len)])
    target = tokenizer.encode(corpus[i+seq_len])[0]
    contexts.append(context)
    targets.append(target)
contexts = np.asarray(contexts, dtype='int32')
targets = np.asarray(targets, dtype='int32')

# import pickle

# with open('tmp.pkl', 'wb') as f:
#     pickle.dump((contexts, targets), f)

In [None]:
# contexts[0], targets[0]

In [12]:
num_train = int(contexts.shape[0] * 0.9)

contexts_train = contexts[:num_train]
targets_train = targets[:num_train]

contexts_test = contexts[num_train:]
targets_test = targets[num_train:]

In [7]:
class FullSelfAttention(Layer):
    def __init__(self, model_dim, num_heads):
        super().__init__()
        self.model_dim = model_dim
        self.num_heads = num_heads

        self.multihead = MultiHeadWrapper(self.num_heads)
        self.attention_weights = None

        self.get_key = Dense(model_dim)
        self.get_value = Dense(model_dim)

    def call(self, x):
        query = self.multihead.split_heads(x)
        key = self.multihead.split_heads(self.get_key(x))
        value = self.multihead.split_heads(self.get_value(x))

        output, attention_weights = attention(query, key, value)
        output = self.multihead.concat_heads(output)
        self.attention_weights = attention_weights
        return output

In [49]:
class MyGPT(Model):

    def __init__(self, vocab_size, seq_len, embed_dim, model_dim, num_heads, ffd_dim, num_blocks):
        super().__init__()
        self.vocab_size = vocab_size
        self.seq_len = seq_len
        self.embed_dim = embed_dim
        self.model_dim = model_dim
        self.num_heads = num_heads
        self.ffd_dim = ffd_dim
        self.num_blocks = num_blocks

        self.hidden_layers = [Embed(seq_len, seq_len, embed_dim)]
        for i in range(num_blocks):
            self.hidden_layers += [
                ResBlockWrapper(FullSelfAttention(model_dim, num_heads)),
                ResBlockWrapper(FeedForward([ffd_dim], model_dim)),
            ]
        self.output_layer = Dense(vocab_size)

    def call(self, x):
        for layer in self.hidden_layers:
            x = layer(x)
        x = x[:, -1, :]
        x = self.output_layer(x)
        return x

In [50]:
model = MyGPT(tokenizer.vocab_size, seq_len, 384, 384, 6, 4*384, 6)
model.build([None, seq_len])
model.summary()

Model: "my_gpt_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed_4 (Embed)             multiple                  76800     
                                                                 
 res_block_wrapper_16 (ResB  multiple                  0 (unused)
 lockWrapper)                                                    
                                                                 
 res_block_wrapper_17 (ResB  multiple                  0 (unused)
 lockWrapper)                                                    
                                                                 
 res_block_wrapper_18 (ResB  multiple                  0 (unused)
 lockWrapper)                                                    
                                                                 
 res_block_wrapper_19 (ResB  multiple                  0 (unused)
 lockWrapper)                                             

In [51]:
model.compile(
    optimizer=AdamW(3e-4),
    loss=SparseCategoricalCrossentropy(from_logits=True),
)
model.fit(
    contexts_train, targets_train,
    batch_size=128,
    validation_data=[contexts_test, targets_test],
    epochs=5,
)

Epoch 1/5
1144/7842 [===>..........................] - ETA: 30:24 - loss: 2.6475

KeyboardInterrupt: 

In [48]:
def generate_text(model, token_ids, max_new_tokens, T=1e+0):
    for _ in range(max_new_tokens):
        # [batch_size, vocab_size]
        logits = model(token_ids[:, -seq_len:])
        next_token_id = tf.random.categorical(T * logits, 1)
        token_ids = tf.concat([token_ids, next_token_id], axis=1)
    return token_ids

In [47]:
generated = generate_text(model, contexts[:1], 500)
print(tokenizer.decode(generated.numpy()[0])[500:])

 trom him
sood father a your worn's tentiing banon:
And valubtig Hastief'd: boy I your and to dow an


In [25]:
model(contexts[:1])

<tf.Tensor: shape=(1, 65), dtype=float32, numpy=
array([[  1.3672082 ,   8.305435  ,   2.4076118 ,  -4.6232176 ,
         -3.911023  ,   3.531605  ,   6.5494742 ,   1.7726717 ,
          3.360558  ,  -2.8447194 ,   2.3191185 ,   3.2174513 ,
          2.9754303 ,  -4.548216  ,  -3.108481  , -10.734207  ,
         -3.2567148 ,  -9.551634  ,  -4.2646604 ,  -4.017103  ,
         -6.056969  ,  -8.996709  ,  -5.9012647 ,  -6.7119308 ,
         -7.5072613 ,  -5.8581533 ,  -1.4336154 , -10.264325  ,
         -4.4024167 ,  -4.600944  ,  -8.042351  ,  -5.4092865 ,
         -7.328374  ,  -7.3929725 ,  -5.041228  ,  -5.2883825 ,
         -4.456541  ,  -6.830352  ,  -3.2494333 ,   0.24196754,
          2.697912  ,   1.790708  ,   1.685373  ,   2.357638  ,
          1.4519004 ,   4.3814187 ,   0.31953815,  -0.10114458,
         -0.6006948 ,   2.5732815 ,   4.046399  ,   1.9746957 ,
          3.5377014 ,  -1.3791897 ,  -0.19261901,  -1.4315499 ,
          6.8487144 ,   4.8945813 ,   3.1387863 ,   1.5