In [87]:
# loading libraries for data manipulation
import numpy as np
import pandas as pd

# loading libraries for data visualization
import matplotlib.pyplot as plt
from plotnine import *
from PIL import Image

# import tensorflow and keras packages
import tensorflow as tf
from tensorflow import keras

# let's also include different Models, Layers directly from keras
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Dense,Dropout,LSTM,Embedding,Input,GRU

# use requests package to download some text
import requests

import warnings
warnings.filterwarnings('ignore')

This notebook details the steps to train an LSTM to predict the next **word** given some input. We will use a larger corpus of text (Pride and Prejudice). 

In [None]:
# url to Pride and Prejudice in text form
url = "https://gutenberg.org/cache/epub/1342/pg1342.txt"

# Download once into `raw_text` so the cleaning steps below can be re-run
# without overwriting their own input.
response = requests.get(url, timeout=30)  # do not hang forever on a stalled connection
response.raise_for_status()               # fail loudly instead of parsing an HTTP error page
raw_text = response.text

# clean text: keep only the span between the two markers (exclude metadata)
start_marker = "Chapter I.]"
end_marker = "*** END OF THE PROJECT"
start_idx = raw_text.find(start_marker)
end_idx = raw_text.find(end_marker)
if start_idx == -1 or end_idx == -1:
    # str.find returns -1 on a miss, which would otherwise produce a silently wrong slice
    raise ValueError("Could not find the expected start/end markers in the downloaded text")

# NOTE(review): the start marker is 11 characters long, so an offset of 10 leaves
# its closing ']' as the first character of `text`. The offset is kept unchanged
# because altering it would change the vocabulary (and therefore the word IDs) -
# confirm before adjusting.
text = raw_text[start_idx + 10:end_idx]
text = text.lower()
print(f"Length of text: {len(text)} characters")

In [None]:
# tokenize on whitespace: every whitespace-separated token counts as one word
# (duplicates are kept here; the unique vocabulary is built separately)
words = text.split()
total_words = len(words)
print(f"Total words: {total_words}")

In [None]:
# vocabulary: every distinct token, sorted so that the ID assignment is deterministic
vocab = sorted(set(words))
print(f"Unique words: {len(vocab)}")

# lookup tables in both directions: ID -> word and word -> ID
idx2word = dict(enumerate(vocab))
word2idx = {word: idx for idx, word in idx2word.items()}

Now we can convert the entire text into a series of integers. Here, each word is represented by a unique integer ID. 

In [None]:
# encode the whole corpus as a 1-D int32 array of word IDs
text_as_int = np.fromiter(
    (word2idx[token] for token in words),
    dtype=np.int32,
    count=len(words),
)
print("First 20 encoded words:", text_as_int[:20])

For this network, we will use a sequence length of 20 (words).

In [None]:
seq_length = 20  # smaller since words carry more info

# each training chunk holds seq_length inputs plus one extra word for the shifted target
words_per_chunk = seq_length + 1
examples_per_epoch = len(text_as_int) // words_per_chunk
print(f"Number of sequences: {examples_per_epoch}")

Next, we will use TensorFlow's `from_tensor_slices` function to turn the encoded text into a stream of word IDs, and then `batch` that stream into sequences of `seq_length + 1` words. 

In [66]:
# stream of individual word IDs, one element per word
word_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# group consecutive IDs into fixed-size chunks; an incomplete final chunk is dropped
chunk_size = seq_length + 1
sequences = word_dataset.batch(chunk_size, drop_remainder=True)

In [None]:
# print the first 10 word IDs in the data
for item in word_dataset.take(10):
    print(item.numpy())

# print the first sequence (seq_length + 1 word IDs)
for item in sequences.take(1):
    print(item.numpy())


Next, we can define a function that creates our dataset of sequences. 

In [None]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element of `chunk` except the last; the target is
    every element except the first, i.e. the same sequence shifted by one
    position. For a chunk of seq_length + 1 word IDs, both halves therefore
    hold seq_length IDs.
    """
    return chunk[:-1], chunk[1:]

# turn every chunk in `sequences` into an (input, target) pair
dataset = sequences.map(split_input_target)

In [None]:
# Peek at one (input, target) pair. When the notebook is run top to bottom,
# `dataset` has not been batched yet at this point, so each element is a single
# 1-D sequence of word IDs. Print the whole sequence: indexing with [0] would
# show only the first word ID rather than the example itself.
for input_example, target_example in dataset.take(1):
    print("Input shape:", input_example.shape)
    print("Target shape:", target_example.shape)
    print("First input example (as IDs):", input_example.numpy())
    print("First target example (as IDs):", target_example.numpy())

In [101]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Rebuild the pipeline from `sequences` rather than from `dataset` itself, so that
# re-running this cell does not batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

We have now created a dataset where each input sequence is 20 words long and its target is the same sequence shifted by one word (so it is also 20 words long). We have also shuffled the sequences so that the model sees them in a random order. Note that a shuffle buffer at least as large as the dataset is the ideal case: every sequence is then equally likely to be drawn at each step. 

The Embedding layer will allow us to learn the relationships between words. This is much better than one-hot encoding. So, as part of predicting a sequence of words, our model will also learn a better representation of each word. 

In [71]:
# define hyperparameters for the network
vocab_size = len(vocab)   # one output logit (and one embedding row) per vocabulary word
embedding_dim = 256
rnn_units = 512

# Layer stack: word IDs -> embeddings -> LSTM (one output per time step)
# -> dropout -> one logit per vocabulary word at every position.
layer_stack = [
    Input(shape=(None,)),
    Embedding(vocab_size, embedding_dim),
    LSTM(rnn_units, return_sequences=True),
    Dropout(0.2),
    Dense(vocab_size),
]
model = Sequential(layer_stack)

# The Dense layer has no activation, so the loss is told to expect raw logits.
loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn)


In [72]:
# train model on the shuffled, batched dataset
EPOCHS = 20
history = model.fit(dataset, epochs=EPOCHS, verbose=1)

Epoch 1/20
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 342ms/step - loss: 7.4948
Epoch 2/20
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 344ms/step - loss: 6.7607
Epoch 3/20
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 359ms/step - loss: 6.4968
Epoch 4/20
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 403ms/step - loss: 6.2973
Epoch 5/20
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 391ms/step - loss: 6.0838
Epoch 6/20
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 381ms/step - loss: 5.8560
Epoch 7/20
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 383ms/step - loss: 5.6671
Epoch 8/20
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 384ms/step - loss: 5.5017
Epoch 9/20
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 403ms/step - loss: 5.3525
Epoch 10/20
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 386ms

In [73]:
# show the model's layers and parameter counts
model.summary()

In [None]:
# Optional shortcut: if training takes too long, load a pretrained model instead.
# This rebinds `model`, so the loaded network replaces the one defined earlier.
pretrained_path = "pride_lstm_word_model.keras"
model = load_model(pretrained_path)

In [89]:
def generate_text(model, start_seq, num_generate=50, temperature=1.0, max_context=20):
    """Generate text word by word, starting from `start_seq`.

    Parameters
    ----------
    model : trained model that maps a (1, n_words) batch of word IDs to
        per-position logits over the vocabulary.
    start_seq : str
        Prompt text. It is lower-cased and split on whitespace; words that are
        not in `word2idx` fall back to ID 0.
    num_generate : int
        Number of words to sample.
    temperature : float
        Divides the logits before sampling. Values below 1 make the sampling
        more conservative, values above 1 make it more random. Must be > 0.
    max_context : int or None
        Maximum number of most recent words fed to the model at each step
        (the default of 20 equals the sequence length used for training in
        this notebook). Pass None to always feed the full history.

    Returns
    -------
    str
        `start_seq` followed by a space and the generated words joined by spaces.

    Raises
    ------
    ValueError
        If `temperature` is not positive, `max_context` is not positive, or
        `start_seq` contains no words.
    """
    if temperature <= 0:
        raise ValueError("temperature must be a positive number")
    if max_context is not None and max_context <= 0:
        raise ValueError("max_context must be a positive integer or None")

    # Tokenize the starting sequence into word IDs
    context_ids = [word2idx.get(w, 0) for w in start_seq.lower().split()]
    if not context_ids:
        raise ValueError("start_seq must contain at least one word")

    generated_words = []

    for _ in range(num_generate):
        # Feed the recent history on every step. The LSTM is not built with
        # stateful=True, so passing only the last predicted word would make each
        # prediction depend on that single word and discard the earlier context.
        window = context_ids if max_context is None else context_ids[-max_context:]
        input_eval = tf.expand_dims(window, 0)

        predictions = model.predict(input_eval, verbose=0)
        predictions = tf.squeeze(predictions, 0)
        predictions = predictions / temperature

        # Sample the next word from the logits at the last position only
        predicted_id = int(
            tf.random.categorical(predictions[-1:], num_samples=1)[0, 0].numpy()
        )

        context_ids.append(predicted_id)
        generated_words.append(idx2word[predicted_id])

    return start_seq + ' ' + ' '.join(generated_words)

In [None]:
generate_text(model, "jane remained", num_generate=10, temperature=1.0)

In [None]:
generate_text(model, "he was", num_generate=5, temperature=5.0)

In [None]:
generate_text(model, "it was", num_generate=5, temperature=0.1)

In [None]:
# generate a long passage, then show it one '.'-delimited piece per line
generated = generate_text(model, "jane ", num_generate=1000, temperature=0.5)
output = generated.split(".")
print("\n".join(output))