## Setup

In [1]:
!pip install keras-nlp



In [2]:
import os

# The backend is chosen when keras_nlp is first imported, so the variable must
# be set BEFORE that import; setting it afterwards is silently ignored.
# This notebook builds and compiles its models with `tensorflow.keras`, so the
# TensorFlow backend is the one it needs ("jax" / "torch" are the other
# accepted values, but they are not compatible with the tf.keras code below).
os.environ["KERAS_BACKEND"] = "tensorflow"

import time

import keras_nlp
import tensorflow as tf
from tensorflow import keras


Using TensorFlow backend




## Settings & hyperparameters

In [3]:
# --- Data pipeline ---------------------------------------------------------
BATCH_SIZE = 64  # lines per batch of the SimpleBooks datasets
SEQ_LEN = 128  # sequence length given to the tokenizer, packer and embedding
MIN_TRAINING_SEQ_LEN = 450  # lines whose string length is not above this are dropped

# --- From-scratch transformer ----------------------------------------------
EMBED_DIM = 256
FEED_FORWARD_DIM = 256
# NOTE(review): EMBED_DIM is not a multiple of NUM_HEADS — confirm the
# per-head size the decoder derives from these two is the intended one.
NUM_HEADS = 3
NUM_LAYERS = 2
VOCAB_SIZE = 5000  # Limits parameters in model.

# --- Training --------------------------------------------------------------
EPOCHS = 6

# --- Inference -------------------------------------------------------------
NUM_TOKENS_TO_GENERATE = 80

## Load the data

In [4]:
# Download the SimpleBooks archive and unpack it into the Keras cache.
keras.utils.get_file(
    origin="https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip",
    extract=True,
)
# Named `data_dir` rather than `dir` so the Python builtin is not shadowed.
data_dir = os.path.expanduser("~/.keras/datasets/simplebooks/")


def _load_long_lines(split):
    """Return the batched lines of one simplebooks-92 split, short lines removed.

    Args:
        split: File stem inside `simplebooks-92-raw` ("train" or "valid").

    Returns:
        A `tf.data.Dataset` of string batches of size `BATCH_SIZE`, containing
        only lines whose `tf.strings.length` exceeds `MIN_TRAINING_SEQ_LEN`.
    """
    path = os.path.join(data_dir, "simplebooks-92-raw", f"{split}.txt")
    return (
        tf.data.TextLineDataset(path)
        .filter(lambda line: tf.strings.length(line) > MIN_TRAINING_SEQ_LEN)
        .batch(BATCH_SIZE)
    )


# Training split: shuffle is applied after batching, so whole batches are
# reordered within a 256-element buffer.
raw_train_ds = _load_long_lines("train").shuffle(buffer_size=256)

# Validation split: same filtering and batching, no shuffling.
raw_val_ds = _load_long_lines("valid")

Downloading data from https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip


## Train the tokenizer

In [5]:
# Learn a lowercased WordPiece vocabulary of at most VOCAB_SIZE entries from
# the training lines. The three special tokens are reserved up front.
# NOTE(review): the Perplexity metric later masks token id 0 — presumably the
# id "[PAD]" receives as the first reserved token; confirm.
vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    raw_train_ds,
    reserved_tokens=["[PAD]", "[UNK]", "[BOS]"],
    lowercase=True,
    vocabulary_size=VOCAB_SIZE,
)

## Load tokenizer

In [6]:
# Tokenizer built on the vocabulary trained above; it lowercases its input
# (matching how the vocabulary was computed) and is configured with a
# sequence length of SEQ_LEN.
tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
    lowercase=True,
    sequence_length=SEQ_LEN,
    vocabulary=vocab,
)

## Tokenize data

In [7]:
# Packer configured with the id of "[BOS]" as its start value and the same
# sequence length (SEQ_LEN) as the tokenizer.
bos_token_id = tokenizer.token_to_id("[BOS]")
start_packer = keras_nlp.layers.StartEndPacker(
    start_value=bos_token_id,
    sequence_length=SEQ_LEN,
)


def preprocess(inputs):
    """Convert a batch of raw text into a `(features, labels)` pair.

    Args:
        inputs: Raw strings, as produced by the batched text-line datasets.

    Returns:
        A tuple `(features, labels)`: `labels` is the tokenizer output for
        `inputs`, and `features` is that same output passed through
        `start_packer`.
    """
    token_ids = tokenizer(inputs)
    return start_packer(token_ids), token_ids


# Apply `preprocess` to both splits in parallel and prefetch the results so
# tokenization can overlap with model execution.
train_ds = raw_train_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

val_ds = raw_val_ds.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE)
val_ds = val_ds.prefetch(tf.data.AUTOTUNE)

## Build the model

In [8]:
# Variable-length sequences of int32 token ids.
inputs = keras.layers.Input(shape=(None,), dtype=tf.int32)

# Token ids -> summed token and position embeddings (id 0 is masked).
x = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=VOCAB_SIZE,
    sequence_length=SEQ_LEN,
    embedding_dim=EMBED_DIM,
    mask_zero=True,
)(inputs)

# Stack NUM_LAYERS decoder blocks. Each block is called with a single
# argument, which skips cross-attention.
for _ in range(NUM_LAYERS):
    x = keras_nlp.layers.TransformerDecoder(
        num_heads=NUM_HEADS,
        intermediate_dim=FEED_FORWARD_DIM,
    )(x)

# Project every position to VOCAB_SIZE logits (no softmax: the loss and the
# metric below are both configured with from_logits=True).
outputs = keras.layers.Dense(VOCAB_SIZE)(x)
model = keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer="adam",
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)],
)

In [9]:
# Print the layer-by-layer structure and parameter counts of the model built above.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 token_and_position_embeddi  (None, None, 256)         1312768   
 ng (TokenAndPositionEmbedd                                      
 ing)                                                            
                                                                 
 transformer_decoder (Trans  (None, None, 256)         394749    
 formerDecoder)                                                  
                                                                 
 transformer_decoder_1 (Tra  (None, None, 256)         394749    
 nsformerDecoder)                                                
                                                                 
 dense (Dense)               (None, None, 5000)        128500

## Training

In [10]:
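# Training of the from-scratch model is skipped in this run; uncomment the
# line below to train it.
# model.fit(train_ds, validation_data=val_ds, verbose=2, epochs=EPOCHS)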

## Load reddit dataset

In [11]:
import tensorflow_datasets as tfds

# Load the training split of reddit_tifu. With as_supervised=True every
# element is a 2-tuple, which the following cells unpack as (document, title).
reddit_ds = tfds.load(
    "reddit_tifu",
    split="train",
    as_supervised=True,
)

[1mDownloading and preparing dataset 639.54 MiB (download: 639.54 MiB, generated: 141.46 MiB, total: 781.00 MiB) to /root/tensorflow_datasets/reddit_tifu/short/1.1.2...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/1 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/79740 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/reddit_tifu/short/1.1.2.incompleteXULVBK/reddit_tifu-train.tfrecord*...:  …

[1mDataset reddit_tifu downloaded and prepared to /root/tensorflow_datasets/reddit_tifu/short/1.1.2. Subsequent calls will reuse this data.[0m


In [12]:
# Show the first (document, title) pair only.
for document, title in reddit_ds.take(1):
    print(document.numpy())
    print(title.numpy())

b"me and a friend decided to go to the beach last sunday. we loaded up and headed out. we were about half way there when i decided that i was not leaving till i had seafood. \n\nnow i'm not talking about red lobster. no friends i'm talking about a low country boil. i found the restaurant and got directions. i don't know if any of you have heard about the crab shack on tybee island but let me tell you it's worth it. \n\nwe arrived and was seated quickly. we decided to get a seafood sampler for two and split it. the waitress bought it out on separate platters for us. the amount of food was staggering. two types of crab, shrimp, mussels, crawfish, andouille sausage, red potatoes, and corn on the cob. i managed to finish it and some of my friends crawfish and mussels. it was a day to be a fat ass. we finished paid for our food and headed to the beach. \n\nfunny thing about seafood. it runs through me faster than a kenyan \n\nwe arrived and walked around a bit. it was about 45min since we a

## Define training dataset

In [13]:
# Keep only the document text of each pair, then batch (32), cache and
# prefetch. This rebinds `train_ds`, which until now held the tokenized
# SimpleBooks data.
train_ds = reddit_ds.map(lambda text, _title: text)
train_ds = train_ds.batch(32).cache()
train_ds = train_ds.prefetch(tf.data.AUTOTUNE)

# GPT2

## Load the model

In [14]:
# Load the "gpt2_base_en" preset: first its preprocessor, configured with a
# sequence length of 128, then the causal LM wired to that preprocessor.
GPT2_PRESET = "gpt2_base_en"

preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(
    GPT2_PRESET, sequence_length=128
)
gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(
    GPT2_PRESET,
    preprocessor=preprocessor,
)

Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/vocab.json
Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/merges.txt
Downloading data from https://storage.googleapis.com/keras-nlp/models/gpt2_base_en/v1/model.h5


## Test the model

In [15]:
# Time a single generation from the pretrained GPT-2 (before any fine-tuning).
started_at = time.time()

output = gpt2_lm.generate("My trip to Yosemite was", max_length=200)
print("\nGPT-2 output:")
print(output)

elapsed = time.time() - started_at
print(f"TOTAL TIME ELAPSED: {elapsed:.2f}s")


GPT-2 output:
My trip to Yosemite was a bit of a roller coaster ride, and I was really excited to see the spectacular beauty of the Yosemite Valley. It was a very nice and beautiful place and I was very excited to see the beautiful scenery of the Yosemite Valley. There was so much to see and see and see. The trail was very easy and there were a few other trails that you could do on the trails. It was pretty much the same as the other hikes on my list, but there was some new stuff to see. There were many different types of hiking and camping options. I did find a lot of people who wanted to do a lot of camping, and I was glad to see that. There are some other campsites that were a bit different from the others on my list. I was also able to see some of the more unique places that were out there. I was able to find a few people who were looking for a more relaxing way to do something different. There were some
TOTAL TIME ELAPSED: 21.75s


## Fine tune - Train GPT on reddit dataset

In [16]:
# NOTE: this rebinds `train_ds` to its first 10 batches, so every later cell
# that reads `train_ds` sees only those 10 batches.
train_ds = train_ds.take(10)
num_epochs = 1

# Linearly decaying learning rate: from 5e-5 down to 0 over all training steps.
total_steps = train_ds.cardinality() * num_epochs
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    initial_learning_rate=5e-5,
    decay_steps=total_steps,
    end_learning_rate=0.0,
)

loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = keras.optimizers.Adam(learning_rate)
gpt2_lm.compile(optimizer=optimizer, loss=loss, weighted_metrics=["accuracy"])

gpt2_lm.fit(train_ds, epochs=num_epochs)



<keras.src.callbacks.History at 0x789e0518b340>

## Test the output after finetuning

In [17]:
# Time a single generation from GPT-2 after the first fine-tuning run.
started_at = time.time()

output = gpt2_lm.generate("I like basketball", max_length=200)
print("\nGPT-2 output:")
print(output)

elapsed = time.time() - started_at
print(f"TOTAL TIME ELAPSED: {elapsed:.2f}s")


GPT-2 output:
I like basketball, but I also love to play the game. I have been a basketball fan since I was 12.

My parents are both former NBA players. They were drafted by the Cleveland Cavaliers in the third round of the 2003 NBA draft. My mom and dad were the first to play for the Cavaliers. My mom was drafted in the first round and my dad, a college coach, was the second one. They played for a team that went to the NBA Finals and won the championship. I was a freshman at the University of Michigan, where I majored in history. My mom played for the team in high school, and my dad played college ball. I had to play in the NBA finals to get my degree, but after the game, my dad said, "That's a pretty good one."

I've been a fan since I was a little kid. It's hard to believe that my mom, who was born and raised in Michigan, would play for
TOTAL TIME ELAPSED: 13.40s


## GPT2 Version 2 - Adjust learning rate

In [18]:
# Fine-tune again with a higher initial learning rate (1e-4).
# `gpt2_lm` is not reloaded, so this continues from the weights left by the
# previous fine-tuning cell.
train_ds_subset = train_ds.take(10)
num_epochs = 1  # Increase the number of epochs for longer training

# Linear decay from 1e-4 to 0. The schedule length is taken from the dataset
# that is actually passed to fit(), so the rate reaches 0 exactly when
# training ends.
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    1e-4,
    decay_steps=train_ds_subset.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
gpt2_lm.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

gpt2_lm.fit(
    train_ds_subset,
    epochs=num_epochs,
)



<keras.src.callbacks.History at 0x789dec12ef20>

In [19]:
# Time a single generation from GPT-2 after the higher-learning-rate run.
started_at = time.time()

output = gpt2_lm.generate("The day started with", max_length=200)
print("\nGPT-2 output:")
print(output)

elapsed = time.time() - started_at
print(f"TOTAL TIME ELAPSED: {elapsed:.2f}s")


GPT-2 output:
The day started with a little bit of a bit of a joke. I was walking down the street when someone said, "Hey, look how you're doing today."

"Yeah, it's a little bit different, but it's not bad."

"Yeah, it's a bit different."

"Oh, yeah. I don't know what that means."

"It's not really, like, a joke."

"Yeah, but it's funny."

I looked up and saw a girl with a large nose, who was wearing a long, dark, white, long sleeve dress.

"Hey, what's up?"

The girl said, "Well I've been dating a friend of ours since the day we were in elementary school."

"Yeah, that's right."

"Yeah? So we went to a movie theater, and she came home and said something funny to me."

"
TOTAL TIME ELAPSED: 13.23s


## GPT2 Version 3 - Try the SGD optimizer, then switch back to Adam with more epochs

In [20]:
# Same setup as the previous fine-tuning run, but with SGD instead of Adam.
# `gpt2_lm` is not reloaded, so this continues from the current weights.
train_ds_subset = train_ds.take(10)
num_epochs = 1  # Increase the number of epochs for longer training

# Linear decay from 1e-4 to 0. The schedule length is taken from the dataset
# that is actually passed to fit(), so the rate reaches 0 exactly when
# training ends.
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    1e-4,
    decay_steps=train_ds_subset.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
gpt2_lm.compile(
    optimizer=keras.optimizers.SGD(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

gpt2_lm.fit(
    train_ds_subset,
    epochs=num_epochs,
)




<keras.src.callbacks.History at 0x789de455c6a0>

In [21]:
# Time a single generation from GPT-2 after the SGD run.
started_at = time.time()

output = gpt2_lm.generate("Solar system has", max_length=200)
print("\nGPT-2 output:")
print(output)

elapsed = time.time() - started_at
print(f"TOTAL TIME ELAPSED: {elapsed:.2f}s")


GPT-2 output:
Solar system has been around since ancient times and has been used as a power source for centuries, with a variety of uses including solar panels, power generators, and power generators for various applications.

In the past, solar power was used for electricity generation and distribution. However, solar power has been used in the form of a solar array. Solar arrays are used in many different applications. For instance, solar arrays are used to store and transmit electricity and to operate a solar power station.

Solar power is used for various applications including solar arrays, wind turbines, and other types of energy.
TOTAL TIME ELAPSED: 13.08s


In [22]:
# Back to Adam, now for two epochs.
# NOTE: `train_ds` was already cut to 10 batches by the first fine-tuning
# cell, so take(15) yields at most 10 batches here.
train_ds_subset = train_ds.take(15)
num_epochs = 2  # Increased the number of epochs for longer training

# Linear decay from 1e-4 to 0. The schedule length is taken from the dataset
# that is actually passed to fit(), so the rate reaches 0 exactly when
# training ends.
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    1e-4,
    decay_steps=train_ds_subset.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
gpt2_lm.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

gpt2_lm.fit(
    train_ds_subset,
    epochs=num_epochs,
)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x78a20c69a1a0>

In [23]:
# Time a single generation from GPT-2 after the two-epoch Adam run.
started_at = time.time()

output = gpt2_lm.generate("This book is", max_length=200)
print("\nGPT-2 output:")
print(output)

elapsed = time.time() - started_at
print(f"TOTAL TIME ELAPSED: {elapsed:.2f}s")


GPT-2 output:
This book is about how a girl named Chloe came into a relationship with a guy named Nick and he started dating her for some reason. 

I'm talking about a girl named Chloe who is a very shy girl, and is often seen in public with a large amount of confidence. 

Chloe is a shy girl with a shyness that makes it impossible to talk to her. 

She's a shy girl with a shyness that makes it impossible for her to get out of her relationship with him. 

She has an extremely shy and unassuming way of being, and often tries to hide her shyness by pretending to talk about it. 

She doesn't want anyone to know about her shyness because it's embarrassing and it would make her look bad, but it's a good thing she didn't tell anyone. 

I'm talking about a young girl named Chloe, who is a shy girl with a shyness that makes
TOTAL TIME ELAPSED: 12.44s


# OPT

In [24]:
# Load the "opt_125m_en" preset of the OPT causal language model.
opt_lm = keras_nlp.models.OPTCausalLM.from_preset("opt_125m_en")

Downloading data from https://storage.googleapis.com/keras-nlp/models/opt_125m_en/v1/vocab.json
Downloading data from https://storage.googleapis.com/keras-nlp/models/opt_125m_en/v1/merges.txt
Downloading data from https://storage.googleapis.com/keras-nlp/models/opt_125m_en/v1/model.h5


In [25]:
# Time a single generation from the pretrained OPT model (before fine-tuning).
started_at = time.time()

output = opt_lm.generate("I like basketball", max_length=200)
print("\nOPT output:")
print(output)

elapsed = time.time() - started_at
print(f"TOTAL TIME ELAPSED: {elapsed:.2f}s")


OPT output:
I like basketball too! It's a sport that's fun to play and I like to learn from it and enjoy it.
I love basketball as well and I've been playing since the 90s, but I'm still not sure what's going to get me back into it.  I'm still a bit confused on what the hell is wrong with the NBA, and what's the point in playing if you don't enjoy it?
TOTAL TIME ELAPSED: 14.65s


In [None]:
# Fine-tune OPT on a small slice of the reddit data.
train_ds_subset = train_ds.take(10)
num_epochs = 1

# Linear decay from 1e-4 to 0. The schedule length is taken from the dataset
# that is actually passed to fit(), so the rate reaches 0 exactly when
# training ends.
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    1e-4,
    decay_steps=train_ds_subset.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
opt_lm.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)
opt_lm.fit(
    train_ds_subset,
    epochs=num_epochs,
)

In [29]:
# Time a single generation from the OPT model and print the result.
start = time.time()

output = opt_lm.generate("My trip to Yosemite was", max_length=200)
# The model here is OPT; label the output the same way as the other OPT cells.
print("\nOPT output:")
print(output)

end = time.time()
print(f"TOTAL TIME ELAPSED: {end - start:.2f}s")


OPT-2 output:
My trip to Yosemite was amazing! I had a really good time there too!
I had a good time in Yosemite too! I had my first day there too!
I was there for the first time too! It was amazing!
TOTAL TIME ELAPSED: 11.89s


# OPT - Adjust Learning Rate

In [None]:
# Fine-tune OPT on a single batch.
# The subset is bound to its own name (leaving `train_ds` untouched) and is
# used for BOTH the schedule length and fit(): if the two come from different
# datasets, the learning rate reaches `end_learning_rate` before (or long
# after) training actually ends.
train_ds_subset = train_ds.take(1)
num_epochs = 1

# Linearly decaying learning rate, from 1e-4 down to 0 over the training steps.
learning_rate = keras.optimizers.schedules.PolynomialDecay(
    1e-4,
    decay_steps=train_ds_subset.cardinality() * num_epochs,
    end_learning_rate=0.0,
)
loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
opt_lm.compile(
    optimizer=keras.optimizers.Adam(learning_rate),
    loss=loss,
    weighted_metrics=["accuracy"],
)

opt_lm.fit(
    train_ds_subset,
    epochs=num_epochs,
)

In [30]:
# Time a single generation from OPT after the second fine-tuning run.
started_at = time.time()

output = opt_lm.generate("This book is", max_length=200)
print("\nOPT output:")
print(output)

elapsed = time.time() - started_at
print(f"TOTAL TIME ELAPSED: {elapsed:.2f}s")


OPT output:
This book is about an ancient Roman god who ruled over the Roman Empire. This god was known as the “Sorin,” and the book is a great introduction to his godly nature. This book is written by a man named John C. B. B. C. and includes a lot of great historical facts about the Roman Empire. It is a must-read for those who are interested in the Roman Empire.

This book is about an ancient Roman god who ruled over the Roman Empire. This god was known as the “Sorin,” and the book is a great introduction to his godly nature. This book is written by a man named John C. B. C. and includes a lot of great historical facts about the Roman Empire. It is a must-read for those who are interested in the Roman Empire.

This book is about an ancient Roman god who ruled over the Roman Empire. This god was known as the
TOTAL TIME ELAPSED: 1.26s


After multiple memory issues and IDE crashes, I was able to try two CausalLMs (GPT-2 and OPT) with different hyperparameters.