In [1]:
import numpy as np
import tensorflow as tf
from keras.layers import Layer, Dense
from keras.models import Model
from keras.optimizers import AdamW
from keras.losses import SparseCategoricalCrossentropy
from keras.utils import Progbar
from nanogpt.utils import CharacterTokenizer, Embed, ResBlockWrapper, FeedForward, attention, MultiHeadWrapper

2024-03-16 14:33:53.157388: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
with open('data/shakespeare.txt', 'r') as f:
    corpus = ''.join(f.readlines())
print(corpus[:200])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you


In [3]:
tokenizer = CharacterTokenizer(corpus)
tokenizer.vocab_size

65

In [4]:
seq_len = 100

In [5]:
import pickle

with open('tmp.pkl', 'rb') as f:
    contexts, targets = pickle.load(f)

In [6]:
# n_data = len(corpus) - seq_len
# contexts = []
# targets = []
# for i in range(n_data):
#     context = tokenizer.encode(corpus[i:(i+seq_len)])
#     target = tokenizer.encode(corpus[i+seq_len])[0]
#     contexts.append(context)
#     targets.append(target)
# contexts = np.asarray(contexts, dtype='int32')
# targets = np.asarray(targets, dtype='int32')

# import pickle

# with open('tmp.pkl', 'wb') as f:
#     pickle.dump((contexts, targets), f)

In [7]:
# contexts[0], targets[0]

In [8]:
class FullSelfAttention(Layer):
    def __init__(self, model_dim, num_heads):
        super().__init__()
        self.model_dim = model_dim
        self.num_heads = num_heads

        self.multihead = MultiHeadWrapper(self.num_heads)
        self.attention_weights = None

        self.get_key = Dense(model_dim)
        self.get_value = Dense(model_dim)

    def call(self, x):
        query = self.multihead.split_heads(x)
        key = self.multihead.split_heads(self.get_key(x))
        value = self.multihead.split_heads(self.get_value(x))

        output, attention_weights = attention(query, key, value)
        output = self.multihead.concat_heads(output)
        self.attention_weights = attention_weights
        return output

In [9]:
class MyGPT(Model):
    
    def __init__(self, vocab_size, seq_len, embed_dim, model_dim, num_heads, ffd_dim, num_blocks):
        super().__init__()

        self.hidden_layers = [Embed(seq_len, seq_len, embed_dim)]
        for i in range(num_blocks):
            self.hidden_layers += [
                ResBlockWrapper(FullSelfAttention(model_dim, num_heads)),
                ResBlockWrapper(FeedForward([ffd_dim], model_dim)),
            ]
        self.output_layer = Dense(vocab_size)

    def call(self, x):
        for layer in self.hidden_layers:
            x = layer(x)
        x = x[:, -1, :]
        x = self.output_layer(x)
        return x

In [15]:
model = MyGPT(tokenizer.vocab_size, seq_len, 64, 64, 8, 128, 2)
model.build([None, seq_len])
model.summary()

Model: "my_gpt_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embed_4 (Embed)             multiple                  12800     
                                                                 
 res_block_wrapper_16 (ResB  multiple                  0 (unused)
 lockWrapper)                                                    
                                                                 
 res_block_wrapper_17 (ResB  multiple                  0 (unused)
 lockWrapper)                                                    
                                                                 
 res_block_wrapper_18 (ResB  multiple                  0 (unused)
 lockWrapper)                                                    
                                                                 
 res_block_wrapper_19 (ResB  multiple                  0 (unused)
 lockWrapper)                                             

In [16]:
model.compile(optimizer='adam', loss=SparseCategoricalCrossentropy(from_logits=True))
model.fit(contexts, targets)

2024-03-16 14:38:37.436332: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 446117600 exceeds 10% of free system memory.


 3075/34853 [=>............................] - ETA: 31:53 - loss: 2.3592

KeyboardInterrupt: 

In [None]:
def get_train_step(model, optimizer, loss):
    step = tf.Variable(0, dtype=tf.int64, trainable=False)

    @tf.function
    def train_step(x, y):
        with tf.GradientTape() as tape:
            y_pred = model(x)
            loss_value = tf.reduce_mean(loss(y, y_pred))
        grads = tape.gradient(loss_value, model.weights)
        optimizer.apply_gradients(zip(grads, model.weights))
        step.assign_add(1)
        return loss_value

    return train_step, step

In [None]:
train_step, step = get_train_step(model, AdamW(), SparseCategoricalCrossentropy(from_logits=True))

In [None]:
ds = tf.data.Dataset.from_tensor_slices((contexts, targets))
ds = ds.batch(128)

In [None]:
process_bar = Progbar(len(ds))
for x, y in ds:
    loss_value = train_step(x, y)
    process_bar.update(current=tf.cast(step, tf.float32), values=[('loss', loss_value)])