<a href="https://colab.research.google.com/github/pranukrish/CMPE297-SpecialTopics/blob/main/Assignment3/NanoGPT_Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q tensorflow tensorflow_datasets

In [None]:
import tensorflow as tf
import tensorflow_datasets as tfds

In [None]:
# Load IMDB dataset
# Request the train and test splits in one call; `info` carries the dataset
# metadata returned because of with_info=True.
splits, info = tfds.load(
    'imdb_reviews',
    split=['train', 'test'],
    as_supervised=True,
    with_info=True,
)
train_data, test_data = splits

In [None]:
# Tokenization and preprocessing
tokenizer = tfds.deprecated.text.Tokenizer()

In [None]:
# Build vocabulary
vocabulary = set()
for text, _ in train_data:
    vocabulary.update(tokenizer.tokenize(text.numpy().lower()))

In [None]:
# Encoder
# - sorted(): a set has no stable iteration order, so passing it directly
#   would assign different ids to the same token from one run to the next.
# - lowercase=True: the vocabulary was built from lowercased text, so the
#   text being encoded must be lowercased too; otherwise every capitalised
#   word is mapped to the out-of-vocabulary bucket.
encoder = tfds.deprecated.text.TokenTextEncoder(sorted(vocabulary),
                                                lowercase=True)

In [None]:
# Encode data
def encode(text_tensor, label):
    """Encode one review eagerly and pass its label through unchanged.

    Runs inside ``tf.py_function``, so ``text_tensor`` is an eager tensor
    whose ``.numpy()`` value can be handed to the Python-side encoder.

    Returns:
        A ``(token_ids, label)`` pair.
    """
    return encoder.encode(text_tensor.numpy()), label


def encode_map_fn(text, label):
    """Dataset-map wrapper around :func:`encode`.

    Returns:
        ``(encoded_text, label)`` as int64 tensors: a 1-D sequence of token
        ids of unknown length and a scalar label.
    """
    encoded_text, label = tf.py_function(encode,
                                         inp=[text, label],
                                         Tout=(tf.int64, tf.int64))
    # py_function outputs have no static shape; restore the ranks so that
    # padded_batch(padded_shapes=([None], [])) can pad and batch them.
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label

In [None]:
# Run the encoder over every example of both splits.
train_data, test_data = (
    split.map(encode_map_fn) for split in (train_data, test_data)
)

In [None]:
# Pad data
train_data = train_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))
test_data = test_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

In [None]:
# Hyperparameters
VOCAB_SIZE = len(vocabulary) + 1
D_MODEL = 128
NHEAD = 4
NUM_LAYERS = 2
MAX_LENGTH = 1000  # Adjust based on your dataset
LR = 0.001
EPOCHS = 5
BATCH_SIZE = 64

In [None]:
# Define the model
class _DecoderBlock(tf.keras.layers.Layer):
    """One pre-norm Transformer block: causal self-attention, then an MLP."""

    def __init__(self, d_model, nhead, feed_forward_dim, dropout_rate,
                 **kwargs):
        super().__init__(**kwargs)
        self.attention_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5)
        self.attention = tf.keras.layers.MultiHeadAttention(
            num_heads=nhead,
            key_dim=d_model // nhead,
            dropout=dropout_rate)
        self.feed_forward_norm = tf.keras.layers.LayerNormalization(
            epsilon=1e-5)
        self.feed_forward_in = tf.keras.layers.Dense(feed_forward_dim,
                                                     activation="gelu")
        self.feed_forward_out = tf.keras.layers.Dense(d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x, training=False):
        # Causal mask: position t may only attend to positions <= t.
        normed = self.attention_norm(x)
        attended = self.attention(normed, normed,
                                  use_causal_mask=True,
                                  training=training)
        x = x + self.dropout(attended, training=training)

        hidden = self.feed_forward_in(self.feed_forward_norm(x))
        hidden = self.feed_forward_out(hidden)
        return x + self.dropout(hidden, training=training)


class NanoGPT(tf.keras.Model):
    """Small decoder-only Transformer language model.

    Keras has no ``tf.keras.layers.Transformer`` layer, so the decoder stack
    is assembled here from ``MultiHeadAttention``, ``LayerNormalization`` and
    ``Dense`` layers.

    Args:
        vocab_size: Number of distinct token ids (size of the embedding table
            and of the output logits).
        d_model: Width of the token/position embeddings and residual stream.
        nhead: Number of attention heads; each head uses
            ``d_model // nhead`` dimensions.
        num_layers: Number of stacked decoder blocks.
        max_length: Longest sequence the learned position embedding supports.
        feed_forward_dim: Hidden width of the per-block MLP.
        dropout_rate: Dropout applied to embeddings, attention and MLP output.
    """

    def __init__(self, vocab_size, d_model, nhead, num_layers, max_length,
                 feed_forward_dim=256, dropout_rate=0.1):
        super().__init__()
        self.max_length = max_length

        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model)
        # Attention is order-agnostic, so positions are injected explicitly.
        self.position_embedding = tf.keras.layers.Embedding(max_length,
                                                            d_model)
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.blocks = [
            _DecoderBlock(d_model, nhead, feed_forward_dim, dropout_rate)
            for _ in range(num_layers)
        ]
        self.final_norm = tf.keras.layers.LayerNormalization(epsilon=1e-5)
        self.fc = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, training=False):
        """Map token ids ``(batch, seq)`` to logits ``(batch, seq, vocab)``.

        Sequences longer than ``max_length`` are rejected, because they would
        index past the end of the position-embedding table.
        """
        seq_len = tf.shape(inputs)[1]
        tf.debugging.assert_less_equal(
            seq_len, self.max_length,
            message="Input sequence is longer than max_length.")

        positions = tf.range(seq_len)
        # (seq, d_model) position vectors broadcast over the batch dimension.
        x = self.embedding(inputs) + self.position_embedding(positions)
        x = self.dropout(x, training=training)
        for block in self.blocks:
            x = block(x, training=training)
        x = self.final_norm(x)
        return self.fc(x)

In [None]:
# Instantiate the model and compile
# The model outputs raw logits, hence from_logits=True on the loss.
model = NanoGPT(
    vocab_size=VOCAB_SIZE,
    d_model=D_MODEL,
    nhead=NHEAD,
    num_layers=NUM_LAYERS,
    max_length=MAX_LENGTH,
)
model.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(learning_rate=LR),
    metrics=['accuracy'],
)

In [None]:
# Train the model
def _to_language_model_batch(tokens, _label):
    """Turn a padded ``(tokens, label)`` batch into next-token training data.

    The model emits one vocabulary distribution per position, so its target
    must be the following token at each position, not the sentiment label.

    Returns:
        ``(inputs, targets, weights)`` where ``targets`` is ``inputs`` shifted
        left by one token and ``weights`` is 0.0 on padding targets.
    """
    # Keep one extra token so that, after the shift, inputs and targets each
    # have at most MAX_LENGTH positions.
    tokens = tokens[:, :MAX_LENGTH + 1]
    inputs, targets = tokens[:, :-1], tokens[:, 1:]
    # padded_batch pads with 0 by default, and the TFDS text encoder reserves
    # id 0 for padding; zero weight keeps those positions out of the loss.
    weights = tf.cast(tf.not_equal(targets, 0), tf.float32)
    return inputs, targets, weights


model.fit(train_data.map(_to_language_model_batch),
          epochs=EPOCHS,
          validation_data=test_data.map(_to_language_model_batch))