In [1]:
# # On Google colab:

# # Data
# !mkdir data
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# !mv input.txt data/shakespeare.txt

# # Module
# !pip install git+https://github.com/shuiruge/nanogpt.git

In [2]:
# Locally:
import sys
sys.path.append('../nanogpt')

In [3]:
import numpy as np
import tensorflow as tf
from keras.layers import Layer, Dense, LayerNormalization, Dropout
from keras.models import Model
from keras.optimizers import AdamW
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping
from dataclasses import dataclass
from typing import List

# from nanogpt.utils import (
from utils import (
    CharacterTokenizer, LanguageModelDataGenerator, TokPosEmbedding, FeedForward,
    MultiHeadWrapper, ResNetWrapper, luong_attention,
)

tf.random.set_seed(42)

2024-03-17 19:53:42.426062: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
with open('data/shakespeare.txt', 'r') as f:
    corpus = ''.join(f.readlines())
print(corpus[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [5]:
tokenizer = CharacterTokenizer(corpus)
tokenizer.vocab_size

65

In [6]:
seq_len = 64
data = LanguageModelDataGenerator(tokenizer.encode(corpus))

contexts = []
targets = []
context = None
while True:
    try:
        next_context, _ = data(seq_len, False)
        if context is None:
            context = next_context
            continue
        contexts.append(context)
        targets.append(next_context)
        context = next_context
    except StopIteration:
        break
contexts = np.stack(contexts).astype('int64')
targets = np.stack(targets).astype('int64')

In [7]:
contexts[:3], targets[:3]

(array([[18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14,
         43, 44, 53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,
          1, 39, 52, 63,  1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43,
         39, 56,  1, 51, 43,  1, 57, 54, 43, 39, 49,  8,  0,  0, 13, 50],
        [47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43,
         44, 53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1,
         39, 52, 63,  1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39,
         56,  1, 51, 43,  1, 57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50],
        [56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
         53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39,
         52, 63,  1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,
          1, 51, 43,  1, 57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10]]),
 array([[47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43,
         44, 53, 56, 43,  1, 61, 43,  1, 54, 5

In [8]:
class CausalSelfAttention(Layer):
    """A standard version of self-attention for GPT.

    Args:
        num_heads: int
    """

    def __init__(self, model_dim, num_heads):
        super().__init__()
        self.model_dim = model_dim
        self.num_heads = num_heads

        self.multihead = MultiHeadWrapper(self.num_heads)
        self.get_query = Dense(self.model_dim)
        self.get_key = Dense(self.model_dim)
        self.get_value = Dense(self.model_dim)

    def call(self, x, return_weights=False):
        """
        Args:
            x: tf.Tensor
                Shape [..., seq_len, dim]
            return_weights: bool
                If return the attention weights. Defaults to False.

        Returns: tf.Tensor or (tf.Tensor, tf.Tensor)
            If return_weights is false, then return the output only, which has
            the same shape and dtype as the x. Otherwise, return the output as
            well as the attention weights, which has shape
            [..., num_heads, seq_len, seq_len].
        """
        query = self.get_query(x)
        key = self.get_key(x)
        value = self.get_value(x)

        # Split the last dimension into multiple heads.
        query = self.multihead.split_heads(query)
        key = self.multihead.split_heads(key)
        value = self.multihead.split_heads(value)

        # Mask the self-communication.
        seq_len = tf.shape(x)[-2]
        # Like
        # [[0., 1., 1., 1.],
        #  [0., 0., 1., 1.],
        #  [0., 0., 0., 1.],
        #  [0., 0., 0., 0.]]
        mask = 1 - tf.linalg.band_part(tf.ones([seq_len, seq_len]), -1, 0)

        # The communication is implemented by a Luong-style attention.
        output, attention_weights = luong_attention(query, key, value, mask)

        # Concatenate the heads together.
        output = self.multihead.concat_heads(output)

        # The ResNet trick
        return (output, attention_weights) if return_weights else output

In [10]:
@dataclass
class GPTConfig:
    vocab_size: int
    seq_len: int
    embed_dim: int
    model_dim: int
    num_heads: int
    ffd_hidden_units: List[int]
    num_trans_blocks: int


class NanoGPT(Model):

    def __init__(self, cfg: GPTConfig, **kwargs):
        super().__init__(**kwargs)
        self.cfg = cfg

        self.embedding_layer = TokPosEmbedding(
            cfg.vocab_size, cfg.seq_len, cfg.embed_dim)

        # The so-called transformer-blocks.
        self.trans_blocks = []
        for _ in range(cfg.num_trans_blocks):
            self.trans_blocks.append(
                ResNetWrapper(CausalSelfAttention(cfg.model_dim, cfg.num_heads))
            )
            self.trans_blocks.append(
                ResNetWrapper(FeedForward(cfg.ffd_hidden_units, cfg.model_dim))
            )

        self.output_layer = Dense(cfg.vocab_size)

    def call(self, x):
        x = self.embedding_layer(x)
        for layer in self.trans_blocks:
            x = layer(x)
        x = self.output_layer(x)
        return x

    def generate(self, init_token_ids, num_new_tokens, T):
        """Generates new tokens from the initial.

        The "temperature" T controls the randomness, as in the Boltzmann
        distributions.

        Args:
            init_token_ids: List[int]
            num_new_tokens: int
            T: float

        Returns: List[int]
            It also includes the initial token-IDs. So, the length is the
            `len(initial_token_ids) + 
        """
        init_token_ids = tf.convert_to_tensor(init_token_ids)

        # Add batch_size for matching the input shape of `self.call`.
        # [1, len(init_token_ids)]
        token_ids = tf.expand_dims(init_token_ids, axis=0)

        for _ in range(num_new_tokens):
            # [1, seq_len, vocab_size]
            logits = self(token_ids[:, -self.cfg.seq_len:])
            # We only use the last sequence element for output.
            # [1, vocab_size]
            logits = logits[:, -1, :]
            # [1, 1]
            next_token_id = tf.random.categorical(logits/T, 1)
            token_ids = tf.concat([token_ids, next_token_id], axis=1)

        # Drop the batch_size
        token_ids = tf.squeeze(token_ids, axis=0)
        return token_ids

In [11]:
# Andrej Karpathy's configuration.
# See: https://www.youtube.com/watch?v=kCc8FmEb1nY&t=6068s, at 1:40:30.
# This configuration will get a minimal validation loss about 1.48.
# And this configuration is not for our vanilla GPT, but Andrej's nano GPT,
# which is much more complicated than ours.
# cfg = GPTConfig(tokenizer.vocab_size, seq_len,
#                 embed_dim=384,
#                 model_dim=384,
#                 num_heads=6,
#                 ffd_hidden_units=[4*384],
#                 num_trans_blocks=6)

# cfg = GPTConfig(tokenizer.vocab_size, seq_len,
#                 embed_dim=64,
#                 model_dim=64,
#                 num_heads=4,
#                 ffd_hidden_units=[4*64],
#                 num_trans_blocks=4)

# We try a much much smaller one.
cfg = GPTConfig(tokenizer.vocab_size, seq_len,
                embed_dim=64,
                model_dim=64,
                num_heads=4,
                ffd_hidden_units=[4*64],
                num_trans_blocks=2)

model = NanoGPT(cfg)
model.build([None, seq_len])
model.summary()

Model: "nano_gpt"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tok_pos_embedding (TokPosE  multiple                  8256      
 mbedding)                                                       
                                                                 
 trans_block (TransBlock)    multiple                  45824     
                                                                 
 trans_block_1 (TransBlock)  multiple                  45824     
                                                                 
 dense_10 (Dense)            multiple                  4225      
                                                                 
Total params: 104129 (406.75 KB)
Trainable params: 104129 (406.75 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [12]:
# model(contexts[:10])

In [13]:
model.compile(
    optimizer=AdamW(),
    loss=SparseCategoricalCrossentropy(from_logits=True),
)
model.fit(
    x=contexts,
    y=targets,
    batch_size=64,
    validation_split=0.1,
    # The epochs argument shall be as large as possible. And we control the
    # true epochs by early-stopping.
    epochs=100,
    callbacks=[EarlyStopping()]
)
# For our smaller configuration, the training will overfit after epoch 9.

2024-03-17 19:53:55.974183: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 513943552 exceeds 10% of free system memory.
2024-03-17 19:53:57.140134: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 513943552 exceeds 10% of free system memory.


Epoch 1/100
Epoch 2/100
Epoch 3/100


<keras.src.callbacks.History at 0x7f3f86b5de10>

In [14]:
generated = model.generate(contexts[0, :], 500, 0.5)
print(tokenizer.decode(generated.numpy())[seq_len:])

l:
Well, I did that are shore punishes, to the consul!

CLARENCE:
I will not be purgundy her and and am and the horse.

FLORIZEL:
And say the word with a bones of the world,
But strees with the commons of the fault of father's daughter.

KING RICHARD III:
My lord, I let thee counterfeit were in grace
That he like a long man of bend the strength was should
In a to the earth. Thou art not beat intend
And so his father had made the fear was are prove
With this hours are a wash
And the should be dea
