In [1]:
# 1. Dataset Preparation and Preprocessing
import requests

# Download dataset
url = 'https://www.gutenberg.org/files/11/11-0.txt'
response = requests.get(url)

if response.status_code == 200:
    raw_text = response.text
else:
    raise Exception("Failed to download dataset")

# Clean Project Gutenberg header/footer
start_phrase = '*** START OF THIS PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***'
end_phrase = '*** END OF THIS PROJECT GUTENBERG EBOOK ALICE’S ADVENTURES IN WONDERLAND ***'

start_idx = raw_text.find(start_phrase) + len(start_phrase)
end_idx = raw_text.find(end_phrase)

# Extract and clean the main content
clean_text = raw_text[start_idx:end_idx].strip()

# Limit text size for faster processing (optional)
text = clean_text[:100000]

# Display some info
print(f"Length of cleaned text: {len(text)} characters")
print("\nSample Text:\n")
print(text[:500])


Length of cleaned text: 100000 characters

Sample Text:

dventures in Wonderland

by Lewis Carroll

THE MILLENNIUM FULCRUM EDITION 3.0

Contents

 CHAPTER I.     Down the Rabbit-Hole
 CHAPTER II.    The Pool of Tears
 CHAPTER III.   A Caucus-Race and a Long Tale
 CHAPTER IV.    The Rabbit Sends in a Little Bill
 CHAPTER V.     Advice from a Caterpillar
 CHAPTER VI.    Pig and Pepper
 CHAPTER VII.   A Mad Tea-Party
 CHAPTER VIII.  The Queen’s Croquet-Ground
 CHAPTER IX.    The Mock Turtle’s Story
 CHAPTER X.     The Lobster Quadrille
 CHAPTER XI.    Wh


In [2]:
# 2. Exploring Generative Pre-trained Transformers (GPTs)
import numpy as np
import tensorflow as tf

# Assume `text` is already loaded and cleaned (from previous steps)

# 1. Create character vocabulary and mappings
vocab = sorted(set(text))
char2idx = {c: i for i, c in enumerate(vocab)}
idx2char = np.array(vocab)
text_as_int = np.array([char2idx[c] for c in text])

# 2. Create input-target sequences
seq_length = 100
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)
sequences = char_dataset.batch(seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

# 3. Shuffle and batch the dataset
BATCH_SIZE = 64
BUFFER_SIZE = 10000
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# 4. Build the LSTM model
vocab_size = len(vocab)
embedding_dim = 256
rnn_units = 512

model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.LSTM(rnn_units, return_sequences=True),
    tf.keras.layers.Dense(vocab_size)
])

# 5. Define loss function and compile model
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

model.compile(optimizer='adam', loss=loss)

# 6. Train the model
EPOCHS = 5
history = model.fit(dataset, epochs=EPOCHS)


Epoch 1/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 2s/step - loss: 3.8637
Epoch 2/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2s/step - loss: 3.1290
Epoch 3/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2s/step - loss: 2.9129
Epoch 4/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2s/step - loss: 2.6885
Epoch 5/5
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2s/step - loss: 2.5368


In [3]:
# 3. Application Demonstration: Content Creation Tool
import tensorflow as tf
import numpy as np

# Rebuild the model for text generation (batch size = 1)
model_gen = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim),
    tf.keras.layers.LSTM(rnn_units, return_sequences=True),
    tf.keras.layers.Dense(vocab_size)
])

# Build with input shape for single prediction
model_gen.build(tf.TensorShape([1, None]))

# Set weights from the trained model
model_gen.set_weights(model.get_weights())

# Text generation function
def generate_text(model, start_string, num_generate=400, temperature=1.0):
    input_eval = [char2idx.get(s, 0) for s in start_string]  # Map to int IDs
    input_eval = tf.expand_dims(input_eval, 0)  # Batch dimension

    text_generated = []

    for _ in range(num_generate):
        predictions = model(input_eval)
        predictions = predictions[:, -1, :]  # Last character prediction
        predictions = predictions / temperature  # Adjust temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # Add predicted character
        text_generated.append(idx2char[predicted_id])

        # Feed prediction back as next input
        input_eval = tf.expand_dims([predicted_id], 0)

    return start_string + ''.join(text_generated)

# Generate example text
prompt = "Once upon a time, there was a curious rabbit"
generated_text = generate_text(model_gen, prompt, temperature=0.4)
print(generated_text)


Once upon a time, there was a curious rabbit o athe an whe helin on’dN
_, ig?F“n he the hendMbZKbff-rs beron;’dushe haghe ad t us aro the and wan oun he he andthe wad
s;;’tit onrm_on he ond llongus he t Int st r mywheg an ithithed h?GKke hingm_ s id athe ad he wn he weshan shee the an fe he h?XThs
“M)gulD“Wre t whe l s.Rje the athe athe and t an the t ingas he wlen thetound s thegle th, the Is he thin adombn the aty””?VI at he he b)in win!‘
