In [1]:
# # On Google colab:

# # Data
# !mkdir data
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# !mv input.txt data/shakespeare.txt

# # Module
# !pip install git+https://github.com/shuiruge/nanogpt.git

In [2]:
# Locally:
# Extend the module search path with the sibling `nanogpt` directory so that
# the `from utils import ...` statement in the imports cell can be resolved
# without installing the package.
import sys
sys.path.append('../nanogpt')

In [3]:
import numpy as np
import tensorflow as tf
from keras.layers import Layer, Dense, LayerNormalization, Dropout
from keras.models import Model, Sequential
from keras.optimizers import AdamW
from keras.losses import SparseCategoricalCrossentropy
from keras.callbacks import EarlyStopping, TensorBoard
from dataclasses import dataclass
from typing import List
from datetime import datetime

# from nanogpt.utils import (
from utils import (
    CharacterTokenizer, LanguageModelDataGenerator, TokPosEmbedding, FeedForward,
    MultiHeadWrapper, ResNetWrapper, luong_attention,
)

tf.random.set_seed(42)

2024-03-17 18:56:07.403557: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# Read the whole training corpus into memory as one string.
with open('data/shakespeare.txt', 'r') as f:
    corpus = f.read()

# Show the first 200 characters to confirm the file was read as expected.
print(corpus[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [5]:
# Build the tokenizer from the corpus.
# NOTE(review): presumably the vocabulary is the set of distinct characters
# found in `corpus` -- confirm against `CharacterTokenizer` in `utils`.
tokenizer = CharacterTokenizer(corpus)
# Last expression of the cell: display the vocabulary size.
tokenizer.vocab_size

65

In [6]:
# Number of tokens in each context window fed to the model.
seq_len = 64
data = LanguageModelDataGenerator(tokenizer.encode(corpus))

# Drain the generator: each call yields one (context, target) pair and
# `StopIteration` marks the end of the data.
# NOTE(review): the second positional argument (False) presumably disables
# random sampling so that windows are produced in order -- confirm against
# `LanguageModelDataGenerator` in `utils`.
contexts, targets = [], []
while True:
    try:
        context, target = data(seq_len, False)
    except StopIteration:
        break
    else:
        contexts.append(context)
        targets.append(target)

# Pack the collected samples into int64 arrays for Keras.
contexts = np.stack(contexts).astype('int64')
targets = np.asarray(targets, dtype='int64')

In [7]:
# Sanity check: display the first three contexts together with their targets.
contexts[:3], targets[:3]

(array([[18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14,
         43, 44, 53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,
          1, 39, 52, 63,  1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43,
         39, 56,  1, 51, 43,  1, 57, 54, 43, 39, 49,  8,  0,  0, 13, 50],
        [47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43,
         44, 53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1,
         39, 52, 63,  1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39,
         56,  1, 51, 43,  1, 57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50],
        [56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
         53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39,
         52, 63,  1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,
          1, 51, 43,  1, 57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10]]),
 array([50, 10,  0]))

In [8]:
class VanillaSelfAttention(Layer):
    r"""A vanilla version of self-attention.

    It is called vanilla because it has no trainable variables at all! The
    function of this kind of self-attention is just communicating between
    tokens (or between nodes, if you picture the tokens as a community).
    So, it has no responsibility for computation, which is completely left to
    the feed-forward layers.

    For each node i with state $x_i$ and each node j with state $x_j$, the
    information propagated from node i to node j is $w_{ij} x_i$, where
    $w_{ij}$ is proportional to $\exp(x_i \cdot x_j)$ and is normalized so
    that, over all nodes that send to node j, the total $\sum_i w_{ij}$ is
    one.

    In this communication viewpoint, multi-head means multi-channel
    communication. Each channel propagates one kind of information. And
    different kinds of information are sent to different target nodes.

    We mask out the self-communication. This is like a Hopfield network, where
    self-interaction is neglected (the weight matrix has a vanishing
    diagonal).

    Args:
        num_heads: int
            Number of heads (communication channels) the last dimension of
            the input is split into.
        **kwargs:
            Forwarded unchanged to the base `Layer` constructor (for example
            `name`).
    """

    def __init__(self, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.num_heads = num_heads

        # Helper used for splitting the feature axis into heads and for
        # merging the heads back afterwards.
        self.multihead = MultiHeadWrapper(self.num_heads)

    def call(self, x, return_weights=False):
        """Propagates information between the sequence elements of `x`.

        Args:
            x: tf.Tensor
                Shape [..., seq_len, dim]
            return_weights: bool
                If return the attention weights. Defaults to False.

        Returns: tf.Tensor or (tf.Tensor, tf.Tensor)
            If return_weights is false, then return the output only, which has
            the same shape and dtype as the x. Otherwise, return the output as
            well as the attention weights, which has shape
            [..., num_heads, seq_len, seq_len].
        """
        # Split the last dimension into multiple heads. Query, key and value
        # are all derived from the same input: there is no learned projection.
        query = self.multihead.split_heads(x)
        key = self.multihead.split_heads(x)
        value = self.multihead.split_heads(x)

        # Mask the self-communication: the mask is the identity matrix, i.e.
        # ones exactly on the (i, i) positions.
        seq_len = tf.shape(x)[-2]
        mask = tf.linalg.diag(tf.ones([seq_len]))

        # The communication is implemented by a Luong-style attention.
        output, attention_weights = luong_attention(query, key, value, mask)

        # Concatenate the heads together.
        output = self.multihead.concat_heads(output)
        return (output, attention_weights) if return_weights else output

In [9]:
@dataclass
class GPTConfig:
    """Hyper-parameters consumed by `VanillaGPT`."""

    # Number of distinct tokens; sizes the embedding and the output logits.
    vocab_size: int
    # Context length; passed to the embedding layer and used as the maximal
    # window fed to the model during generation.
    seq_len: int
    # Embedding dimension, passed to `TokPosEmbedding`.
    embed_dim: int
    # Second argument of `FeedForward`.
    # NOTE(review): presumably its output dimension, which would have to equal
    # `embed_dim` for the residual connections to type-check -- confirm.
    model_dim: int
    # Number of heads of the self-attention layer.
    num_heads: int
    # First argument of `FeedForward` (hidden layer sizes).
    ffd_hidden_units: List[int]
    # How many times the transformer-block is applied in `VanillaGPT.call`.
    num_trans_blocks: int


class VanillaGPT(Model):
    """Build a vanilla GPT with vanilla self-attention.

    The model embeds the input token-IDs, applies the transformer-block
    `cfg.num_trans_blocks` times, and maps the state of the last sequence
    element to logits over the vocabulary.

    Note that a single transformer-block instance is created and applied
    repeatedly in `call`, so its variables are reused on every pass.

    Args:
        cfg: GPTConfig
        **kwargs:
            Forwarded unchanged to the base `Model` constructor.
    """

    def __init__(self, cfg: GPTConfig, **kwargs):
        super().__init__(**kwargs)
        self.cfg = cfg

        self.embedding_layer = TokPosEmbedding(
            cfg.vocab_size, cfg.seq_len, cfg.embed_dim)

        # The so-called transformer-block: communication (self-attention)
        # followed by computation (feed-forward), each with a residual wrapper.
        self.trans_block = Sequential([
            ResNetWrapper(VanillaSelfAttention(cfg.num_heads)),
            ResNetWrapper(FeedForward(cfg.ffd_hidden_units, cfg.model_dim)),
        ])

        self.output_layer = Dense(cfg.vocab_size)

    def call(self, x):
        """Computes the logits of the next token.

        Args:
            x: tf.Tensor
                Token-IDs, with shape [batch_size, seq_len].

        Returns: tf.Tensor
            Logits with shape [batch_size, vocab_size].
        """
        x = self.embedding_layer(x)
        # The same block instance is applied on every iteration.
        for _ in range(self.cfg.num_trans_blocks):
            x = self.trans_block(x)
        # We only use the last sequence element for output.
        # It is such a waste!
        # And why it is the last one? Why not the others? Maybe some of the
        # others out-performs the last.
        x = x[:, -1, :]
        x = self.output_layer(x)
        return x

    def generate(self, init_token_ids, num_new_tokens, T):
        """Generates new tokens from the initial.

        The "temperature" T controls the randomness, as in the Boltzmann
        distributions.

        Args:
            init_token_ids: List[int]
                Any integer sequence convertible to a rank-1 tensor.
            num_new_tokens: int
            T: float
                Shall be positive.

        Returns: tf.Tensor
            Rank-1 tensor of token-IDs. It also includes the initial
            token-IDs. So, the length is
            `len(init_token_ids) + num_new_tokens`.

        Raises:
            ValueError: If T is not positive.
        """
        if T <= 0:
            raise ValueError(f'Temperature T shall be positive, but got {T}.')

        init_token_ids = tf.convert_to_tensor(init_token_ids)

        # Add batch_size for matching the input shape of `self.call`.
        # [1, len(init_token_ids)]
        token_ids = tf.expand_dims(init_token_ids, axis=0)

        for _ in range(num_new_tokens):
            # Only the most recent `seq_len` tokens are fed to the model.
            # [1, vocab_size]
            logits = self(token_ids[:, -self.cfg.seq_len:])
            # [1, 1]
            next_token_id = tf.random.categorical(logits/T, 1)
            # The sampled IDs are int64 by default, while the initial IDs may
            # be int32 (e.g. when given as a list of Python ints). Align the
            # dtypes, otherwise the concatenation fails.
            next_token_id = tf.cast(next_token_id, token_ids.dtype)
            token_ids = tf.concat([token_ids, next_token_id], axis=1)

        # Drop the batch_size
        token_ids = tf.squeeze(token_ids, axis=0)
        return token_ids

In [10]:
# Andrej Karpathy's configuration.
# See: https://www.youtube.com/watch?v=kCc8FmEb1nY&t=6068s, at 1:40:30.
# This configuration will get a minimal validation loss about 1.48.
# And this configuration is not for our vanilla GPT, but Andrej's nano GPT,
# which is much more complicated than ours.
# cfg = GPTConfig(tokenizer.vocab_size, seq_len,
#                 embed_dim=384,
#                 model_dim=384,
#                 num_heads=6,
#                 ffd_hidden_units=[4*384],
#                 num_trans_blocks=6)

# cfg = GPTConfig(tokenizer.vocab_size, seq_len,
#                 embed_dim=64,
#                 model_dim=64,
#                 num_heads=4,
#                 ffd_hidden_units=[4*64],
#                 num_trans_blocks=4)

# We try a much much smaller one.
# This configuration will get a minimal validation loss about 1.70.
cfg = GPTConfig(
    vocab_size=tokenizer.vocab_size,
    seq_len=seq_len,
    embed_dim=64,
    model_dim=64,
    num_heads=4,
    ffd_hidden_units=[4 * 64],
    num_trans_blocks=4,
)

# Instantiate the model and build it with an unspecified batch size, then
# display its layers and parameter counts.
model = VanillaGPT(cfg)
model.build([None, seq_len])
model.summary()

Model: "vanilla_gpt"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tok_pos_embedding (TokPosE  multiple                  8256      
 mbedding)                                                       
                                                                 
 sequential (Sequential)     (None, 64, 64)            33344     
                                                                 
 dense_2 (Dense)             multiple                  4225      
                                                                 
Total params: 45825 (179.00 KB)
Trainable params: 45825 (179.00 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [11]:
model.compile(
    optimizer=AdamW(),
    # The model outputs raw logits (no softmax in the output layer).
    loss=SparseCategoricalCrossentropy(from_logits=True),
)
callbacks = [
    # Stop as soon as the validation loss stops improving, and roll the model
    # back to the weights of the best epoch. Without `restore_best_weights`
    # the model would keep the weights of the last (already worse) epoch.
    EarlyStopping(monitor='val_loss', restore_best_weights=True),
    # TensorBoard(log_dir=('data/logs/' + datetime.now().strftime("%Y%m%d-%H%M%S"))),
]
model.fit(
    x=contexts,
    y=targets,
    batch_size=64,
    # Keras takes the validation samples from the end of `x` and `y`, before
    # shuffling.
    validation_split=0.1,
    # The epochs argument shall be as large as possible. And we control the
    # true epochs by early-stopping.
    epochs=100,
    callbacks=callbacks,
)
# For our smaller configuration, the training will overfit after epoch 9.

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
   91/15685 [..............................] - ETA: 43:26 - loss: 1.6484

KeyboardInterrupt: 

In [13]:
# Use the first training context as the prompt and sample 1000 new tokens at
# temperature 1.
generated = model.generate(
    init_token_ids=contexts[0],
    num_new_tokens=1000,
    T=1,
)
# Decode everything, then print only the newly generated part (the first
# `seq_len` characters are the prompt itself).
generated_text = tokenizer.decode(generated.numpy())
print(generated_text[seq_len:])

d-grue canneratis.

HOMBROSEM:
Hast wills I was will whose for by was full him burge
ventretieve
Than latied-were ro and beseech'd a but heed ander'd: I merch we on soness,
That in fromlon, to perform my pake hours,
Isser'd all give instry, and with speak; nobcass
At
Mosting in roming, sight, not we like, feash.

ESCALUS:
Whilming of the clat the we inders
The the herm, dukes of must of I weigd;
indear Aarnalmy that bright Pisit; but
Warwick hook in mopk and go, frienfate true;
And how in and sinst life, hime to light
TLEY:
LeOf altherve my dine grace? clay, we thand.
Lord as hest I darthemmemb? this no on think, and word;
Piegreeams yeftions in't: bure than heep they longer:
Your their brave? boye to her ender.
Your on Berievoss vut sach, whom hast agains. Clarench'd I bekeem,
I warlient him, aswomen knew the wars, in to kendly an might
God but be dreid comforeg noing; took a princest not off may?

KING EDWARD:
I know the can
This thou frest them, frow flus of her of heighbide:
She be

In [None]:
# %load_ext tensorboard
# %tensorboard --logdir data/logs