In [2]:
# loading libraries for data manipulation
import numpy as np
import pandas as pd

# loading libraries for data visualization
import matplotlib.pyplot as plt
from plotnine import *
from PIL import Image

# import tensorflow and keras packages
import tensorflow as tf
from tensorflow import keras

# let's also include different Models, Layers directly from keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,LSTM,Embedding,Input,GRU

# use requests package to download some text
import requests

import warnings
warnings.filterwarnings('ignore')

Let's train a sequential model using the text of Alice in Wonderland. The Project Gutenberg website hosts plain-text versions of many classics, so feel free to download a different book if you want to.

In [None]:
# url to Alice in Wonderland in text form
url = "https://gutenberg.org/cache/epub/11/pg11.txt"

# use a timeout so the cell cannot hang forever, and fail loudly on an HTTP error
# instead of silently training on the body of an error page
response = requests.get(url, timeout=30)
response.raise_for_status()

# exclude metadata: skip the first 1451 characters of the download
# (the offset was chosen for this particular file - adjust it for another book)
text = response.text[1451:]
print(f"Length of text: {len(text)} characters")

In [None]:
# let's print the first 500 characters from text
# (print, rather than a bare expression, so that newlines are rendered as line breaks)
print(text[:500])

Models do not understand text like we do. We will need to create some mapping from text to integers to then pass that along to the model.

In [None]:
# fold everything to lowercase so that e.g. 'A' and 'a' share a single ID
text = text.lower()

# the vocabulary: every distinct character that occurs in the text, sorted
chars = sorted(set(text))

print("Characters in the text:",chars,"\n")

# two look up tables built from the same enumeration of the vocabulary
## idx2char: integer ID -> character
## char2idx: character  -> integer ID (the inverse mapping)
idx2char = dict(enumerate(chars))
char2idx = {ch: idx for idx, ch in idx2char.items()}

print("char2idx",char2idx,"\n")
print("idx2char",idx2char)

Now we can convert the entire text into a series of integers.

In [None]:
# encode the whole text as a 1-D int32 array of character IDs
text_as_int = np.fromiter(
    (char2idx[ch] for ch in text),
    dtype=np.int32,
    count=len(text),
)

print("Characters in the text:",text_as_int.size)
print(text_as_int[:500])

Now let's define the input for the model. This model will predict the **next character** given an input and not the next word. We will create our inputs to be 100 characters long. Think of this as a time window with 100 steps. 

In [None]:
# number of input characters (time steps) the model sees per training example
seq_length = 100

# every example consumes seq_length + 1 characters (the input plus one extra character
# for the shifted target), so this is how many whole examples fit in the text
chunk_size = seq_length + 1
examples_per_epoch = text_as_int.size // chunk_size

print(examples_per_epoch,"sequences in the input")

Next, we will use tensorflow's from_tensor_slices function to create a stream of sequences. 


Text: "alice in wonderland"


↓


Integer IDs: [1, 12, 9, 3, 5, ...]


↓


Dataset from tensor slices:


  [1] → [12] → [9] → [3] → [5] ...

In [121]:
# stream the encoded text one character ID at a time
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# group consecutive characters into chunks of seq_length + 1:
# the first 100 characters of a chunk become the input and the last 100
# (the same text shifted by one position) become the target
# drop_remainder discards the final chunk if it is shorter than the desired length
sequences = char_dataset.batch(
    batch_size=seq_length + 1,
    drop_remainder=True,
)


In [None]:
# the first 10 elements of the character stream: one integer ID each
for char_id in char_dataset.take(10):
    print(char_id.numpy())

# the first chunk: seq_length + 1 integer IDs
for sequence in sequences.take(1):
    print(sequence.numpy())


Next, we can define a function that creates our dataset of sequences. 

In [124]:
def split_input_target(chunk):
    """Split one chunk into an (input, target) pair shifted by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i].

    Args:
        chunk: any sliceable sequence (list, string, array, tensor, ...).

    Returns:
        A tuple ``(chunk[:-1], chunk[1:])``.
    """
    return chunk[:-1], chunk[1:]

# apply the function to sequences
# each element of `dataset` is now one (input, target) pair; it is not batched yet
dataset = sequences.map(split_input_target)

In [None]:
# `dataset` has not been batched yet, so each element is ONE (input, target) pair.
# Print the whole pair: indexing with [0] here would only show a single character ID,
# not the example itself.
for input_example, target_example in dataset.take(1):
    print("Input shape:", input_example.shape)
    print("Target shape:", target_example.shape)
    print("First input example (as IDs):", input_example.numpy())
    print("First target example (as IDs):", target_example.numpy())

In [126]:
BATCH_SIZE = 64 # how many sequences the model sees at once
BUFFER_SIZE = 10000 # controls randomness of shuffle

# Rebuild the pipeline from `sequences` instead of re-assigning `dataset` from itself:
# on a fresh kernel the result is the same, but re-running this cell no longer
# batches an already-batched dataset.
# shuffle randomly picks elements from a buffer of size 10000 - large buffer = more random shuffling
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

We have now created a dataset where each input sequence is 100 characters long and its target is the same 100-character window shifted forward by one character. We have also shuffled the sequences to add some randomness to the order in which the model sees them. Note that when the buffer size is at least as large as the dataset, the shuffle is a true uniform random shuffle of the whole dataset, which is the ideal case.

Let's also define the parameters for our network. Converting characters into integers was the first step, but these integer IDs are arbitrary: they say nothing about how characters relate to one another. The Embedding layer lets the model learn a vector representation for each character, which captures those relationships far better than one-hot encoding. So, as part of learning to predict the next character, our model also learns a better representation of each character.

In [127]:
# define hyperparameters for the network
vocab_size = len(chars)   # number of unique characters
embedding_dim = 256       # dimensions of character embeddings
rnn_units = 512           # LSTM hidden units

# character-level language model: IDs -> embeddings -> LSTM -> one score per character
model = Sequential([
    Input(shape=(None,)), # None makes the model general to different sizes of inputs
    Embedding(vocab_size, embedding_dim), # learn a vector of size embedding_dim for each
                                          # character ID instead of using the raw integer
    LSTM(rnn_units, return_sequences=True), # return output at each time step
    Dropout(0.2),
    Dense(vocab_size) # no activation: the output is one raw score (logit) per character, not a probability
])

model.compile(
    optimizer='adam',
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True) # targets are integer IDs (sparse); from_logits=True because the model outputs logits, so softmax is applied inside the loss
)

In [None]:
# train the model on the shuffled, batched dataset and keep the per-epoch loss history
history = model.fit(
    dataset,
    epochs=20,
    verbose=1,
)

In [None]:
# list the layers of the model together with their parameter counts
model.summary()

Once our network is trained, we can now make predictions from it. We will define a function that takes in a starting sequence and then predicts what the next character should be. That then becomes the input again and we can keep predicting the next character to build sentences. 

Temperature regulates how conservative or random the predictions are. The model outputs raw scores (logits), which are passed through a softmax function to obtain probabilities. By dividing the logits by the temperature before the softmax, we can change the shape of the resulting probability distribution.
- T = 1.0: no change 
- T < 1.0: model is more predictable/confident - largest logits become more prominent
- T > 1.0: model is more random/creative - flattens distribution

In [130]:
def generate_text(model,starting_seq,num_generate=1000,temperature=1.0,context_length=100):
    """Generate text one character at a time, starting from `starting_seq`.

    The model is not stateful, so it remembers nothing between calls. Every
    step therefore feeds it the most recent `context_length` characters (the
    seed plus everything generated so far). Feeding back only the last
    predicted character would make every prediction after the first depend
    on a single character.

    Args:
        model: trained model mapping a (1, T) batch of character IDs to
            (1, T, vocab_size) logits.
        starting_seq: seed text; it is lowercased before being encoded and
            every character must be present in `char2idx`.
        num_generate: number of characters to generate.
        temperature: > 0. Logits are divided by it before sampling: values
            below 1 make the output more predictable, values above 1 make it
            more random.
        context_length: maximum number of most recent characters passed to
            the model at each step. The default matches the 100-character
            sequences used for training in this notebook.

    Returns:
        `starting_seq` followed by the `num_generate` generated characters.

    Raises:
        ValueError: if `temperature` or `context_length` is not positive, if
            `starting_seq` is empty, or if it contains characters that are
            not in the vocabulary.
    """
    # validate up front: a zero temperature would divide by zero and an empty seed
    # would give the model nothing to predict from
    if temperature <= 0:
        raise ValueError("temperature must be greater than 0")
    if context_length < 1:
        raise ValueError("context_length must be at least 1")

    seed_text = starting_seq.lower()
    if not seed_text:
        raise ValueError("starting_seq must contain at least one character")

    unknown_chars = sorted(set(seed_text) - set(char2idx))
    if unknown_chars:
        raise ValueError(
            f"starting_seq contains characters that are not in the vocabulary: {unknown_chars}"
        )

    context_ids = [char2idx[c] for c in seed_text] # convert input chars to ints
    generated_text = [] # here we will store the predicted characters

    for _ in range(num_generate):
        # most recent characters only, with a batch dimension added for tf
        input_eval = tf.expand_dims(context_ids[-context_length:], 0)

        # call the model directly: this is a single tiny batch, and model.predict
        # adds per-call overhead that dominates inside a loop
        logits = model(input_eval, training=False)

        # only the prediction for the last time step is needed; apply temperature to it
        last_logits = logits[:, -1, :] / temperature

        # sample the next character ID from the distribution defined by the logits
        predicted_id = int(tf.random.categorical(last_logits, num_samples=1)[0, 0].numpy())

        # the predicted character becomes part of the context for the next prediction
        context_ids.append(predicted_id)
        generated_text.append(idx2char[predicted_id]) # add prediction to list

    return starting_seq + ''.join(generated_text)

In [None]:
# 5 characters at temperature 1.0: sample from the unmodified distribution
generate_text(model, "alice ", num_generate=5, temperature=1.0)

In [None]:
# 5 characters at temperature 5.0: flatter distribution, more random output
generate_text(model, "alice ", num_generate=5, temperature=5.0)

In [None]:
# 5 characters at temperature 0.1: sharper distribution, more predictable output
generate_text(model, "alice ", num_generate=5, temperature=0.1)

In [None]:
# a longer sample; print so that generated newlines are rendered as line breaks
print(
    generate_text(
        model,
        starting_seq="alice ",
        num_generate=1000,
        temperature=0.5,
    )
)

Let's now train a GRU using the same setup. 

In [None]:
# define hyperparameters for the network (same values as for the LSTM model above)
vocab_size = len(chars)   # number of unique characters
embedding_dim = 256       # dimensions of character embeddings
rnn_units = 512           # GRU hidden units

# the dropout here is within the GRU layer call
# dropout will randomly drop input features at each time step
# recurrent_dropout will randomly drop units of the hidden state passed from one time step to the next
# NOTE: per the Keras GRU documentation, recurrent_dropout > 0 rules out the fast cuDNN
#       implementation, so training on a GPU can be noticeably slower than for the LSTM above
# NOTE: this rebinds `model`, so the LSTM model trained above is no longer reachable under that name
model = Sequential([
    Input(shape=(None,)), # None makes the model general to different sizes of inputs
    Embedding(vocab_size, embedding_dim), # learn a vector of size embedding_dim for each
                                          # character ID instead of using the raw integer
    GRU(rnn_units, return_sequences=True,dropout=0.3,recurrent_dropout=0.3), 
    Dense(vocab_size) # no activation: the output is one raw score (logit) per character, not a probability
])

model.compile(
    optimizer='adam',
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True) # targets are integer IDs (sparse); from_logits=True because the model outputs logits, so softmax is applied inside the loss
)

In [None]:
# train the GRU model on the same shuffled, batched dataset
history = model.fit(
    dataset,
    epochs=20,
    verbose=1,
)

In [None]:
# list the layers of the GRU model together with their parameter counts
model.summary()

In [None]:
# 5 characters at temperature 1.0: sample from the unmodified distribution
generate_text(model, "alice ", num_generate=5, temperature=1.0)

In [None]:
# 5 characters at temperature 5.0: flatter distribution, more random output
generate_text(model, "alice ", num_generate=5, temperature=5.0)

In [None]:
# 5 characters at temperature 0.1: sharper distribution, more predictable output
generate_text(model, "alice ", num_generate=5, temperature=0.1)

In [None]:
# a longer sample from the GRU model; print so that newlines are rendered as line breaks
print(
    generate_text(
        model,
        starting_seq="alice ",
        num_generate=1000,
        temperature=0.5,
    )
)