In [1]:
!pip install tensorflow



In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization, Dropout
from tensorflow.keras import Model

class MultiHeadSelfAttention(Layer):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are each projected with their own Dense layer to
    `embed_dim` features, split into `num_heads` heads of size
    `embed_dim // num_heads`, attended per head, then merged and passed
    through a final Dense projection.

    Args:
        embed_dim: Width of the projections and of the layer output.
        num_heads: Number of attention heads; must divide `embed_dim`.

    Raises:
        ValueError: If `embed_dim` is not a multiple of `num_heads`.
    """

    def __init__(self, embed_dim, num_heads):
        super().__init__()
        # Without this check the integer division below would truncate and
        # `split_heads` could reshape into a silently wrong layout.
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.embed_dim = embed_dim
        self.depth = embed_dim // num_heads

        self.wq = Dense(embed_dim)
        self.wk = Dense(embed_dim)
        self.wv = Dense(embed_dim)

        self.dense = Dense(embed_dim)

    def split_heads(self, x, batch_size):
        """Reshape the last axis into heads.

        Returns a tensor laid out as (batch_size, num_heads, seq, depth).
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask=None):
        """Attend from `q` over `k`/`v`.

        Args:
            v, k, q: Value, key and query tensors. Note the argument order is
                value first, query last.
            mask: Optional tensor broadcastable to the attention logits in
                which non-zero entries mark positions to suppress. `None`
                disables masking.

        Returns:
            Tensor whose last axis has size `embed_dim`.
        """
        batch_size = tf.shape(q)[0]
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        matmul_qk = tf.matmul(q, k, transpose_b=True)
        # After split_heads the last axis of k is exactly `self.depth`.
        dk = tf.cast(self.depth, matmul_qk.dtype)
        scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

        if mask is not None:
            # A large negative offset drives masked positions to ~0 after the
            # softmax. Casting lets boolean/integer masks be used as well.
            mask = tf.cast(mask, scaled_attention_logits.dtype)
            scaled_attention_logits += (mask * -1e9)

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, v)
        # Undo the head split: move heads next to depth, then merge them.
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(output, (batch_size, -1, self.embed_dim))
        return self.dense(concat_attention)

class FeedForwardNetwork(Layer):
    """Two stacked Dense layers: widen to `dff` with GELU, then project back.

    Args:
        embed_dim: Output width of the second Dense layer.
        dff: Width of the hidden (first) Dense layer.
    """

    def __init__(self, embed_dim, dff):
        super().__init__()
        self.dense1 = Dense(units=dff, activation='gelu')
        self.dense2 = Dense(units=embed_dim)

    def call(self, x):
        """Apply the hidden layer followed by the output projection."""
        hidden = self.dense1(x)
        projected = self.dense2(hidden)
        return projected

class TransformerBlock(Layer):
    """Self-attention followed by a feed-forward network.

    Each sub-layer's output goes through dropout, is added to its input
    (residual connection) and is then layer-normalized.

    Args:
        embed_dim: Feature width of the block input and output.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        dropout_rate: Drop probability used by both Dropout layers.
    """

    def __init__(self, embed_dim, num_heads, dff, dropout_rate=0.1):
        super().__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = FeedForwardNetwork(embed_dim, dff)
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, x, mask=None, training=None):
        """Run the block.

        Args:
            x: Input tensor; used as query, key and value for self-attention.
            mask: Optional attention mask handed to the attention layer.
            training: Whether dropout should be active. `None` leaves the
                decision to Keras (it falls back to the surrounding call
                context, which is inference mode for a direct call).

        Returns:
            Tensor produced by the second layer normalization.
        """
        attn_output = self.att(x, x, x, mask)
        # Forward the flag explicitly so dropout follows the caller's mode.
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.norm1(x + attn_output)

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.norm2(out1 + ffn_output)

class GPT2(Model):
    """Causal Transformer language model assembled from `TransformerBlock`s.

    Token ids are embedded, summed with learned position embeddings, passed
    through `num_layers` causally masked blocks, layer-normalized and
    projected to `vocab_size` logits (no softmax is applied).

    Args:
        vocab_size: Number of token ids; also the width of the output logits.
        max_length: Number of rows in the position-embedding table. Inputs
            longer than this index past the table, so callers must crop
            sequences to at most `max_length` tokens.
        embed_dim: Embedding and hidden width.
        num_heads: Attention heads per block.
        dff: Hidden width of each block's feed-forward network.
        num_layers: Number of stacked blocks.
        dropout_rate: Dropout probability inside each block.
    """

    def __init__(self, vocab_size, max_length, embed_dim=768, num_heads=12, dff=3072, num_layers=12, dropout_rate=0.1):
        super().__init__()

        self.token_emb = Embedding(vocab_size, embed_dim)
        self.pos_emb = Embedding(max_length, embed_dim)

        self.transformer_blocks = [
            TransformerBlock(embed_dim, num_heads, dff, dropout_rate)
            for _ in range(num_layers)
        ]

        self.norm = LayerNormalization(epsilon=1e-6)
        self.out = Dense(vocab_size)

    def create_causal_mask(self, seq_len):
        """Return a (seq_len, seq_len) mask with 1 above the diagonal.

        `band_part(..., -1, 0)` keeps the lower triangle including the
        diagonal; subtracting from 1 therefore flags exactly the future
        positions, which the attention layer suppresses.
        """
        mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return 1 - mask

    def call(self, x, training=None):
        """Compute next-token logits.

        Args:
            x: Integer token ids with the sequence on axis 1.
            training: Whether to run in training mode (dropout active).
                `None` lets Keras decide, which means inference mode for a
                direct call.

        Returns:
            Logits with a trailing axis of size `vocab_size`.
        """
        seq_len = tf.shape(x)[1]
        mask = self.create_causal_mask(seq_len)

        token_embeddings = self.token_emb(x)
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_embeddings = self.pos_emb(position_ids)

        # Position embeddings carry no batch axis and broadcast over it.
        x = token_embeddings + position_embeddings

        for transformer in self.transformer_blocks:
            x = transformer(x, mask, training=training)

        x = self.norm(x)
        return self.out(x)

# Full-size configuration: every other hyperparameter comes from the GPT2
# class defaults (embed_dim=768, num_heads=12, dff=3072, num_layers=12).
VOCAB_SIZE = 50257
MAX_LENGTH = 1024

# Call the subclassed model on a symbolic input and wrap the result in a
# functional Model, so that summary() has concrete shapes to report.
inputs = tf.keras.Input(shape=(MAX_LENGTH,), dtype=tf.int32)
outputs = GPT2(vocab_size=VOCAB_SIZE, max_length=MAX_LENGTH)(inputs)
gpt2 = Model(inputs=inputs, outputs=outputs)

gpt2.build(input_shape=(1, MAX_LENGTH))
gpt2.summary()

In [8]:
# Example toy dataset (you can replace this with any text)
text = """
To be, or not to be, that is the question:
Whether 'tis nobler in the mind to suffer
The slings and arrows of outrageous fortune,
Or to take arms against a sea of troubles
"""

# Character-level tokenizer: every distinct character gets an integer id.
# Real GPT uses BPE, but for a demo characters are fine.
vocab = sorted(set(text))
char2idx = {ch: idx for idx, ch in enumerate(vocab)}
idx2char = dict(enumerate(vocab))

VOCAB_SIZE = len(vocab)
print("Vocab size:", VOCAB_SIZE)

# Convert text to ints
encoded = [char2idx[c] for c in text]

# Build input/target pairs for next-character prediction: the target is the
# input window shifted one position to the right.
seq_length = 40
assert len(encoded) > seq_length, "text must be longer than seq_length"
examples = [
    (encoded[start:start + seq_length], encoded[start + 1:start + seq_length + 1])
    for start in range(len(encoded) - seq_length)
]

# Split the pairs into parallel lists of inputs and targets.
inputs = [pair[0] for pair in examples]
targets = [pair[1] for pair in examples]

# Create dataset. The shuffle buffer spans every example so each epoch is a
# full uniform shuffle; a buffer smaller than the dataset only shuffles
# locally.
dataset = tf.data.Dataset.from_tensor_slices((inputs, targets))
dataset = dataset.shuffle(len(examples)).batch(8, drop_remainder=True)  # batch size=8


MAX_LENGTH = seq_length  # match dataset sequence length

model = GPT2(
    vocab_size=VOCAB_SIZE,
    max_length=MAX_LENGTH,
    embed_dim=64,     # small embedding for demo
    num_heads=4,      # fewer heads
    dff=128,          # smaller FFN
    num_layers=2      # just 2 transformer blocks
)

Vocab size: 27


In [3]:
# Adam with a fixed learning rate performs the parameter updates.
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

# Targets are integer ids and the model's final Dense layer has no
# activation, so the loss consumes raw logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True,
)

In [4]:
@tf.function
def train_step(inputs, targets):
    """Run one optimization step on a batch and return its loss.

    Args:
        inputs: Batch of input token ids.
        targets: Batch of target token ids aligned with `inputs`.

    Returns:
        Scalar loss tensor for the batch.
    """
    with tf.GradientTape() as tape:
        # Keras layers run in inference mode when called directly, which
        # keeps Dropout switched off; request training mode explicitly.
        predictions = model(inputs, training=True)
        loss = loss_fn(targets, predictions)
    variables = model.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))
    return loss

In [9]:
EPOCHS = 5
for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    for step, batch in enumerate(dataset):
        inp, tgt = batch
        loss = train_step(inp, tgt)
        # Report the loss only on every tenth step to keep the log short.
        if step % 10 != 0:
            continue
        print(f"Step {step}, Loss: {loss.numpy():.4f}")


Epoch 1/5
Step 0, Loss: 4.0620
Step 10, Loss: 2.4756

Epoch 2/5
Step 0, Loss: 1.9230
Step 10, Loss: 1.6194

Epoch 3/5
Step 0, Loss: 1.5331
Step 10, Loss: 1.4367

Epoch 4/5
Step 0, Loss: 1.3186
Step 10, Loss: 1.1082

Epoch 5/5
Step 0, Loss: 1.0269
Step 10, Loss: 0.9317


In [10]:
def generate_text(start_string, num_chars=100):
    """Sample `num_chars` characters from the model, continuing `start_string`.

    Args:
        start_string: Non-empty prompt; every character must be in `char2idx`.
        num_chars: Number of characters to generate.

    Returns:
        `start_string` followed by the sampled characters.

    Raises:
        ValueError: If `start_string` is empty.
        KeyError: If the prompt contains a character outside the vocabulary.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")

    # The position-embedding table has one row per position, so the model
    # can only be fed that many tokens at once.
    max_context = model.pos_emb.input_dim

    input_eval = [char2idx[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)  # batch size 1

    text_generated = []

    for _ in range(num_chars):
        # Feed only the most recent tokens; without cropping, position ids
        # run past the embedding table once the sequence outgrows it.
        context = input_eval[:, -max_context:]
        predictions = model(context, training=False)
        predictions = predictions[:, -1, :]  # logits for the last position
        predicted_id = int(
            tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()
        )

        input_eval = tf.concat([input_eval, [[predicted_id]]], axis=-1)
        text_generated.append(idx2char[predicted_id])

    return start_string + "".join(text_generated)

print(generate_text("To be"))

To be is ti
:
Wheo thes is and of onos se qunows ques ain:
Wherin:
Wherrutof or to: s
Whethetinganonoble 
