In [1]:
# # On Google colab:

# # Data
# !mkdir data
# !wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
# !mv input.txt data/shakespeare.txt

# # Module
# !pip install git+https://github.com/shuiruge/nanogpt.git

In [2]:
# Locally:
import sys
sys.path.append('../nanogpt')

In [4]:
from dataclasses import dataclass
from typing import List

import numpy as np
import tensorflow as tf
from keras.callbacks import EarlyStopping
from keras.layers import Layer, Dense
from keras.losses import SparseCategoricalCrossentropy
from keras.models import Model
from keras.optimizers import AdamW
from keras.utils import Progbar

# from nanogpt.utils import (
from utils import (
    CharacterTokenizer, LanguageModelDataGenerator, TokPosEmbedding, FeedForward,
    luong_attention, MultiHeadWrapper,
)

tf.random.set_seed(42)

In [5]:
# Read the whole corpus into a single string. The encoding is pinned so the
# result does not depend on the platform's default locale.
with open('data/shakespeare.txt', 'r', encoding='utf-8') as f:
    corpus = f.read()
# Preview the first 200 characters as a sanity check.
print(corpus[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [6]:
# Build the tokenizer from the full corpus text.
tokenizer = CharacterTokenizer(corpus)
# Last expression of the cell: displays the vocabulary size.
tokenizer.vocab_size

65

In [7]:
seq_len = 20
data = LanguageModelDataGenerator(tokenizer.encode(corpus))

# Drain the generator: keep requesting (context, target) pairs until it
# signals exhaustion by raising StopIteration.
# NOTE(review): the meaning of the second positional argument (False) is
# defined by LanguageModelDataGenerator and is not visible here -- confirm.
context_rows, target_ids = [], []
try:
    while True:
        ctx, tgt = data(seq_len, False)
        context_rows.append(ctx)
        target_ids.append(tgt)
except StopIteration:
    pass

# Materialise the collected samples as arrays for `model.fit`.
contexts = np.stack(context_rows)
targets = np.asarray(target_ids, dtype='int32')

In [8]:
class VanillaSelfAttention(Layer):
    """A vanilla version of self-attention.

    It is called vanilla because it has no trainable variables at all! The
    function of this kind of self-attention is just communicating between
    tokens (or between nodes, if you picture the tokens as a community).
    So, it has no responsibility for computation, which is completely left
    to the feed-forward layers.

    In this communication viewpoint, multi-head means multi-channel
    communication. Each channel propagates one kind of information, and
    different kinds of information are sent to different target nodes.

    Args:
        num_heads: int
            Number of heads, forwarded to `MultiHeadWrapper`.
        **kwargs:
            Standard Keras `Layer` keyword arguments (e.g. `name`, `dtype`),
            forwarded to the base class.
    """

    def __init__(self, num_heads, **kwargs):
        # Accept and forward the base-class keyword arguments so the layer
        # can be named and re-created from its config.
        super().__init__(**kwargs)
        self.num_heads = num_heads

        self.multihead = MultiHeadWrapper(self.num_heads)

    def call(self, x, return_weights=False):
        """Applies parameter-free multi-head self-attention to `x`.

        Args:
            x: tf.Tensor
                Shape [..., seq_len, dim]
            return_weights: bool
                If return the attention weights. Defaults to False.

        Returns: tf.Tensor, or a tuple of two tf.Tensor
            The attention output, with the same shape and dtype as the x.
            When `return_weights` is True, the tuple
            `(output, attention_weights)` is returned instead, where
            `attention_weights` is whatever `luong_attention` returns as its
            second value.
        """
        # Query, key and value are all the same head-split view of `x`
        # (there are no projection weights), so split once and reuse it.
        # NOTE(review): assumes `split_heads` is deterministic and
        # side-effect free -- confirm against `MultiHeadWrapper`.
        query = key = value = self.multihead.split_heads(x)

        output, attention_weights = luong_attention(query, key, value)
        output = self.multihead.concat_heads(output)
        return (output, attention_weights) if return_weights else output

    def get_config(self):
        """Returns the layer config, including `num_heads`, for serialization."""
        config = super().get_config()
        config.update({'num_heads': self.num_heads})
        return config

In [10]:
@dataclass
class GPTConfig:
    """Hyper-parameters consumed by `VanillaGPT`."""
    # Passed to the token/position embedding and used as the number of units
    # of the final Dense output layer (one logit per vocabulary entry).
    vocab_size: int
    # Passed to the token/position embedding; also the maximum number of
    # trailing tokens fed to the model at each generation step.
    seq_len: int
    # Third argument of `TokPosEmbedding`.
    embed_dim: int
    # Second argument of every `FeedForward` block.
    # NOTE(review): presumably has to equal `embed_dim` for the stacked
    # blocks to be shape-compatible -- confirm against `utils`.
    model_dim: int
    # Passed to every `VanillaSelfAttention` layer.
    num_heads: int
    # First argument of every `FeedForward` block.
    ffd_hidden_units: List[int]
    # Number of (self-attention, feed-forward) pairs stacked in the model.
    num_trans_blocks: int


class VanillaGPT(Model):
    """Build a vanilla GPT with vanilla self-attention.

    The model embeds the token IDs, passes them through alternating
    `VanillaSelfAttention` and `FeedForward` layers, and maps the last
    sequence position to logits over the vocabulary.

    Args:
        cfg: GPTConfig
            The hyper-parameters of the model.
        **kwargs:
            Forwarded to `keras.models.Model`.
    """

    def __init__(self, cfg: GPTConfig, **kwargs):
        super().__init__(**kwargs)
        self.cfg = cfg

        self.embedding_layer = TokPosEmbedding(
            cfg.vocab_size, cfg.seq_len, cfg.embed_dim)

        # The so-called transformer-blocks.
        self.trans_blocks = []
        for _ in range(cfg.num_trans_blocks):
            # As we have discussed in the docstring of VanillaSelfAttention,
            # the task of computation is left to feed-forward layers. So,
            # for communication:
            self.trans_blocks.append(
                VanillaSelfAttention(cfg.num_heads)
            )
            # and for computation:
            self.trans_blocks.append(
                FeedForward(cfg.ffd_hidden_units, cfg.model_dim)
            )

        # No activation: the model outputs raw logits.
        self.output_layer = Dense(cfg.vocab_size)

    def call(self, x):
        """Computes the logits of the token following the input sequence.

        Args:
            x: tf.Tensor
                Token IDs, shape [batch_size, seq_len].

        Returns: tf.Tensor
            Logits, shape [batch_size, vocab_size].
        """
        x = self.embedding_layer(x)
        for layer in self.trans_blocks:
            x = layer(x)
        x = x[:, -1, :]  # output the last sequence element.
        x = self.output_layer(x)
        return x

    def generate(self, init_token_ids, num_new_tokens, T):
        """Generates new tokens from the initial.

        The "temperature" T controls the randomness, as in the Boltzmann
        distributions: the logits are divided by T before sampling.

        Args:
            init_token_ids: List[int]
                Any 1-D integer sequence convertible to a tensor.
            num_new_tokens: int
            T: float
                Must be positive.

        Returns: tf.Tensor
            1-D tensor of token-IDs with the dtype of the converted
            `init_token_ids`. It also includes the initial token-IDs. So,
            the length is `len(init_token_ids) + num_new_tokens`.

        Raises:
            ValueError: If `T` is not positive.
        """
        # The logits are divided by T below, so a zero temperature would
        # produce inf/nan logits; reject it (and negative values) up front.
        if T <= 0:
            raise ValueError(f'Temperature T must be positive, got {T}.')

        init_token_ids = tf.convert_to_tensor(init_token_ids)

        # Add batch_size for matching the input shape of `self.call`.
        # [1, len(init_token_ids)]
        token_ids = tf.expand_dims(init_token_ids, axis=0)

        for _ in range(num_new_tokens):
            # Only the trailing `seq_len` tokens are fed to the model.
            # [1, vocab_size]
            logits = self(token_ids[:, -self.cfg.seq_len:])
            # [1, 1]
            next_token_id = tf.random.categorical(logits/T, 1)
            # `tf.random.categorical` returns int64 by default, while a plain
            # Python list of ints converts to int32; cast so that the concat
            # below never fails on a dtype mismatch.
            next_token_id = tf.cast(next_token_id, token_ids.dtype)
            token_ids = tf.concat([token_ids, next_token_id], axis=1)

        # Drop the batch_size
        token_ids = tf.squeeze(token_ids, axis=0)
        return token_ids

In [12]:
# Andrej Karpathy's configuration.
# See: https://www.youtube.com/watch?v=kCc8FmEb1nY&t=6068s, at 1:40:30.
# This configuration will get a minimal validation loss about 1.48.
# And this configuration is not for our vanilla GPT, but Andrej's nano GPT,
# which is much more complicated than ours.
# cfg = GPTConfig(tokenizer.vocab_size, seq_len,
#                 embed_dim=384,
#                 model_dim=384,
#                 num_heads=6,
#                 ffd_hidden_units=[4*384],
#                 num_trans_blocks=6)

# A far smaller configuration than the one above.
# It reaches a minimal validation loss of about 1.77.
cfg = GPTConfig(
    vocab_size=tokenizer.vocab_size,
    seq_len=seq_len,
    embed_dim=64,
    model_dim=64,
    num_heads=4,
    ffd_hidden_units=[256],
    num_trans_blocks=2,
)

# Create the model, build it for inputs of shape [batch_size, seq_len],
# and print the per-layer parameter summary.
model = VanillaGPT(cfg)
model.build([None, seq_len])
model.summary()

Model: "vanilla_gpt"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 tok_pos_embedding (TokPosE  multiple                  5440      
 mbedding)                                                       
                                                                 
 vanilla_self_attention (Va  multiple                  0         
 nillaSelfAttention)                                             
                                                                 
 feed_forward (FeedForward)  multiple                  33216     
                                                                 
 vanilla_self_attention_1 (  multiple                  0         
 VanillaSelfAttention)                                           
                                                                 
 feed_forward_1 (FeedForwar  multiple                  33216     
 d)                                                    

In [13]:
# The model outputs raw logits (its final Dense layer has no activation),
# hence `from_logits=True`.
model.compile(
    optimizer=AdamW(),
    loss=SparseCategoricalCrossentropy(from_logits=True),
)
# Stop once the validation loss has not improved for a few epochs and roll
# back to the best weights, instead of relying on a manual interrupt.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)
model.fit(
    x=contexts,
    y=targets,
    batch_size=64,
    validation_split=0.1,
    epochs=100,  # upper bound only; early stopping normally ends training sooner.
    callbacks=[early_stopping],
)
# For our smaller configuration, the training starts to overfit at about epoch 8.

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
 3014/15685 [====>.........................] - ETA: 1:56 - loss: 1.5580

KeyboardInterrupt: 

In [21]:
# Use the first training context as the prompt, then sample 500 new tokens
# at temperature 0.5.
prompt_ids = contexts[0, :]
generated = model.generate(prompt_ids, 500, 0.5)

# Decode all token IDs and print the text with its first `seq_len`
# characters dropped.
# NOTE(review): presumably this strips the prompt, assuming the prompt has
# `seq_len` tokens and each token decodes to one character -- confirm.
decoded_text = tokenizer.decode(generated.numpy())
print(decoded_text[seq_len:])

KING EDWARD IV:
Ay, how they will do his present of the griefes now in the earth by mine she would sun the charment we for more for the dest the pade in the rement,
The lived the death, and how is is so death.

CLAUDIO:
I will she had so me of the fly make of the set face you conters I tell have to the head we better left it he service the for thee come in the put, and their a preth the short in the soul: and their w
