In [32]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LayerNormalization, MultiHeadAttention, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Dense, Dropout, LayerNormalization
from tensorflow.keras.layers import MultiHeadAttention


import logging

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Setup Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [19]:
class LoggingCallback(tf.keras.callbacks.Callback):
    def on_epoch_begin(self, epoch, logs=None):
        logger.info(f"Starting Epoch {epoch + 1}")

    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        logger.info(f"End of Epoch {epoch + 1}")
        for key, value in logs.items():
            logger.info(f"... {key}: {value:.4f}")


In [4]:
# Detect TPU and initialize
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()
    strategy = tf.distribute.TPUStrategy(tpu)
    print("TPU found and initialized!")
except ValueError:
    tpu = None
    strategy = tf.distribute.get_strategy()  # Default strategy that works on CPU and single GPU

print("Number of replicas:", strategy.num_replicas_in_sync)


TPU found and initialized!
Number of replicas: 8


In [5]:
# Generate Look-ahead Mask
def generate_lookahead_mask(size):
    """
    Generate a lookahead mask to mask future tokens in a sequence.
    """
    mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return mask  # (seq_len, seq_len)

In [6]:
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

In [34]:
def subsequent_mask(sz):
    mask = 1 - tf.linalg.band_part(tf.ones((sz, sz)), -1, 0)
    return mask  # shape [sz, sz]


In [42]:
class CustomMultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self, embed_size, num_heads):
        super(CustomMultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.embed_size = embed_size

        assert self.embed_size % self.num_heads == 0

        self.depth = embed_size // num_heads

        self.wq = Dense(embed_size)
        self.wk = Dense(embed_size)
        self.wv = Dense(embed_size)

        self.dense = Dense(embed_size)

    def split_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, query, key, value, mask):
        batch_size = tf.shape(query)[0]

        query = self.split_heads(self.wq(query), batch_size)
        key = self.split_heads(self.wk(key), batch_size)
        value = self.split_heads(self.wv(value), batch_size)

        scaled_attention_logits = tf.matmul(query, key, transpose_b=True) / tf.math.sqrt(float(self.depth))
        if mask is not None:
            scaled_attention_logits += (mask * -1e9)

        attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)
        output = tf.matmul(attention_weights, value)
        output = tf.transpose(output, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(output, (batch_size, -1, self.embed_size))
        return self.dense(concat_attention)


In [50]:
class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self, embed_size, heads, dropout_rate, forward_expansion):
        super(TransformerBlock, self).__init__()
        self.attention = CustomMultiHeadAttention(embed_size, heads)
        self.norm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.norm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.feed_forward = tf.keras.Sequential([
            tf.keras.layers.Dense(forward_expansion * embed_size, activation="relu"),
            tf.keras.layers.Dense(embed_size)
        ])

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs, training=False):
        x, mask = inputs
        attn_output = self.attention(query=x, key=x, value=x, mask=mask)
        attn_output = self.dropout(attn_output)
        out1 = self.norm1(x + attn_output)

        ffn_output = self.feed_forward(out1)
        ffn_output = self.dropout(ffn_output)
        return self.norm2(out1 + ffn_output)


In [48]:
def build_nanogpt(vocab_size, embed_size, num_layers, num_heads, ff_hidden_dim, dropout_rate, max_length):
    inputs = tf.keras.layers.Input(shape=(None,), dtype=tf.int32)
    mask_input = tf.keras.layers.Input(shape=(max_length, max_length), dtype=tf.float32)

    word_embeddings = tf.keras.layers.Embedding(vocab_size, embed_size)(inputs)
    position_embeddings = tf.keras.layers.Embedding(max_length, embed_size)(tf.range(tf.shape(inputs)[1]))
    x = word_embeddings + position_embeddings

    for _ in range(num_layers):
        x = TransformerBlock(embed_size, num_heads, dropout_rate, ff_hidden_dim)([x, mask_input])

    outputs = tf.keras.layers.Dense(vocab_size)(x)

    return tf.keras.Model(inputs=[inputs, mask_input], outputs=outputs)


In [22]:
def create_book_dataset(book_path, vocab, sequence_length, batch_size):
    """
    Create a tf.data.Dataset from the text of a book.

    Args:
    - book_path (str): Path to the book's text file.
    - vocab (dict): Dictionary mapping characters to integer tokens.
    - sequence_length (int): Length of each input sequence.
    - batch_size (int): Number of sequences in each batch.

    Returns:
    - dataset (tf.data.Dataset): Dataset of tokenized sequences.
    """
    # Load and preprocess the book text
    with open(book_path, 'r') as f:
        text = f.read()

    # Convert text to tokens
    tokens = [vocab[char] for char in text if char in vocab]

    # Create TensorFlow dataset from tokens
    token_dataset = tf.data.Dataset.from_tensor_slices(tokens)

    # Convert individual tokens to sequences of desired length
    sequences = token_dataset.batch(sequence_length, drop_remainder=True)

    # Batch sequences
    dataset = sequences.batch(batch_size, drop_remainder=True)

    return dataset


In [58]:
# Hyperparameters and data loading
book_path = "/content/drive/MyDrive/Harry Potter.txt"  # Adjust path as needed
text = open(book_path, 'r').read()
vocab = {char: idx for idx, char in enumerate(sorted(set(text)))}
sequence_length = 100
batch_size = 16 * strategy.num_replicas_in_sync
EMBED_SIZE = 256
NUM_LAYERS = 2
NUM_HEADS = 2
FF_HIDDEN_DIM = EMBED_SIZE

with strategy.scope():
    model = build_nanogpt(len(vocab), EMBED_SIZE, NUM_LAYERS, NUM_HEADS, FF_HIDDEN_DIM, DROPOUT_RATE, MAX_LENGTH)
    model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
    # Dataset
    dataset = create_book_dataset(book_path, vocab, sequence_length, batch_size).prefetch(tf.data.AUTOTUNE)

    # Training
    EPOCHS = 10
    logging_callback = LoggingCallback()
    model.fit(dataset, epochs=EPOCHS, callbacks=[logging_callback])

    # Define the path to save the model
    save_path = "/content/drive/MyDrive/tensorflow_nanogpt"

    # Save the entire model (architecture + weights)
    model.save(save_path)




ResourceExhaustedError: ignored

In [None]:
loaded_model = tf.keras.models.load_model("/content/drive/MyDrive/NanoGPT_model")


In [None]:
def generate_text(model, start_string, vocab, inv_vocab, num_generate=500, temperature=1.0):
    """
    Generate text using a trained model.

    Args:
    - model (tf.keras.Model): Trained model.
    - start_string (str): Initial string to start the text generation.
    - vocab (dict): Dictionary mapping characters to integer tokens.
    - inv_vocab (dict): Dictionary mapping integer tokens back to characters.
    - num_generate (int): Number of characters to generate.
    - temperature (float): Controls the randomness of the output. Higher value is more random.

    Returns:
    - generated_text (str): Generated text.
    """

    # Convert start string to tokens
    input_eval = [vocab[s] for s in start_string]
    input_eval = tf.expand_dims(input_eval, 0)

    # Empty string to store our results
    generated_text = []

    model.reset_states()
    for _ in range(num_generate):
        predictions = model(input_eval)
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Update the input to the model for the next token prediction
        input_eval = tf.expand_dims([predicted_id], 0)

        # Append the predicted character to the generated text
        generated_text.append(inv_vocab[predicted_id])

    return start_string + ''.join(generated_text)


In [None]:
inv_vocab = {v: k for k, v in vocab.items()}  # Inverse vocabulary mapping integer tokens to characters
start_string = "Once upon a time"  # You can choose any starting string
generated_output = generate_text(loaded_model, start_string, vocab, inv_vocab, num_generate=1000, temperature=0.7)

print(generated_output)



In [None]:
inv_vocab = {v: k for k, v in vocab.items()}  # Inverse vocabulary mapping integer tokens to characters

question = "Who is the main protagonist in the book?"
prompt = f"Answering a question about the book: {question}"
answer = generate_text(loaded_model, prompt, vocab, inv_vocab, num_generate=150, temperature=0.7)

print(answer)
