In [54]:
import nltk
from nltk.corpus import gutenberg

In [55]:
nltk.download('gutenberg')

[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

In [56]:
import requests
url = "https://www.gutenberg.org/files/1342/1342-0.txt"
response = requests.get(url)

In [57]:
with open("pride_and_prejudice.txt", "w", encoding="utf-8") as f:
    f.write(response.text)

In [58]:
with open("pride_and_prejudice.txt", "r", encoding="utf-8") as f:
    text = f.read()

In [59]:
import string
import numpy as np

In [60]:
def preprocess_text(text):

    text = text.lower()

    text = text.translate(str.maketrans("", "", string.punctuation))

    tokens = text.split()
    return tokens

tokens = preprocess_text(text)


In [61]:
def create_sequences(tokens, seq_length):
    sequences = []
    for i in range(len(tokens) - seq_length):
        sequences.append(tokens[i:i + seq_length + 1])
    return sequences

seq_length = 30
sequences = create_sequences(tokens, seq_length)

In [62]:
vocab = sorted(set(tokens))
vocab_size = len(vocab)

token_to_int = {token: i for i, token in enumerate(vocab)}
int_to_token = {i: token for i, token in enumerate(vocab)}
encoded_sequences = [[token_to_int[token] for token in seq] for seq in sequences]
X = np.array([seq[:-1] for seq in encoded_sequences])
y = np.array([seq[1:] for seq in encoded_sequences])

In [63]:
import tensorflow as tf


model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=50, input_length=seq_length),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(vocab_size, activation='softmax')
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [64]:
encoded_sequences = [[token_to_int[token] for token in seq] for seq in sequences]
X = np.array([seq[:-1] for seq in encoded_sequences])
y = np.array([seq[-1] for seq in encoded_sequences])

In [65]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [66]:
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
model.fit(X, y, epochs=20, batch_size=128, validation_split=0.1, callbacks=[early_stopping])

Epoch 1/20
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 66ms/step - accuracy: 0.0420 - loss: 6.9264 - val_accuracy: 0.0665 - val_loss: 6.2003
Epoch 2/20
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 64ms/step - accuracy: 0.0811 - loss: 5.9304 - val_accuracy: 0.1017 - val_loss: 5.8467
Epoch 3/20
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m64s[0m 71ms/step - accuracy: 0.1181 - loss: 5.3806 - val_accuracy: 0.1122 - val_loss: 5.8172
Epoch 4/20
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m77s[0m 65ms/step - accuracy: 0.1419 - loss: 4.9803 - val_accuracy: 0.1178 - val_loss: 5.8826
Epoch 5/20
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 67ms/step - accuracy: 0.1709 - loss: 4.6171 - val_accuracy: 0.1170 - val_loss: 6.0241
Epoch 6/20
[1m895/895[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 65ms/step - accuracy: 0.2041 - loss: 4.2633 - val_accuracy: 0.1086 - val_loss: 6.2682


<keras.src.callbacks.history.History at 0x7846c1742fb0>

In [67]:
def prepare_seed(seed_text, token_to_int):
    seed_tokens = seed_text.lower().split()
    return [token_to_int[token] for token in seed_tokens if token in token_to_int]

In [68]:
def generate_text(model, start_seed, gen_length):
    result = []
    input_sequence = prepare_seed(start_seed, token_to_int)

    for _ in range(gen_length):
        padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(
            [input_sequence], maxlen=seq_length, truncating='pre'
        )
        predicted = model.predict(padded_sequence, verbose=0)
        next_token = np.random.choice(range(vocab_size), p=predicted[0])

        input_sequence.append(next_token)
        result.append(int_to_token[next_token])
        input_sequence = input_sequence[1:]

    return ' '.join(result)

In [69]:
import time
seed_text = "it is a truth universally acknowledged"

try:
    while True:
        generated_text = generate_text(model, seed_text, 10)
        print("Generated Text:")
        print(generated_text)
        time.sleep(10)
except KeyboardInterrupt:
    print("Text generation stopped.")

Generated Text:
overpowered every whole good character like execution hearing his most
Generated Text:
having be gardiner lucas shall be belong to receive suffer
Generated Text:
little own pleasant dear suppressed unrestrained bennet seems taken very


KeyboardInterrupt: 