In [1]:
import sys
import numpy as np

# TensorFlow and tf.keras
import tensorflow as tf
from sklearn.utils import shuffle
from tensorflow import keras
import tensorflow.keras as keras
from keras.models import Model, Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras import utils as np_utils 

In [2]:
#open the file

with open("/content/drive/MyDrive/data/hold-out.txt", "r") as f:
    text = f.read().lower()

In [3]:
# Replace '\n' by ' '
text = text.replace('\n', ' ')

In [4]:
# Split the data into train and test set
split = int(0.9 * len(text))
train_text = text[:split]
test_test = text[split:]

In [5]:
# create a set of all unique characters in the text
chars = sorted(list(set(text)))
print(f'Total number of characters: {len(chars)}.')

Total number of characters: 28.


In [6]:
# create a dictionary to map characters to integers and vice versa
char_to_int = dict((c, i) for i, c in enumerate(chars))
idx_to_char = dict((i, c) for i, c in enumerate(chars))

In [7]:
# prepare the dataset of input to output encoded as integers 

def make_sequence(text, seq_length=40, step=3):
    seq_in = []
    seq_out = []
    for i in range(0, len(text)-seq_length,1):
        seq_in.append(text[i:(i + seq_length)])
        seq_out.append(text[i + seq_length])
    return seq_in, seq_out

In [8]:
seq_length = 40
step = 3

seq_in, seq_out = make_sequence(train_text, seq_length, step)
seq_in_test, seq_out_test = make_sequence(test_test, seq_length, step=10)

In [9]:
print(f'There are {len(seq_in)} train sequences and {len(seq_out)} test sequences.')

There are 498 train sequences and 498 test sequences.


In [10]:
seq_in, seq_out = shuffle(seq_in, seq_out, random_state=42) #shuffle the sequences

In [11]:
print(f'The first sequence is `{seq_in[0]}` and the first next character is `{seq_out[0]}`.')

The first sequence is `stress' thrall, came there for cure and ` and the first next character is `t`.


In [12]:
#convert training data to one-hot vectors

n_sequences = len(seq_in)
n_sequences_test = len(seq_in_test)
vocab_size = len(chars)

X = np.zeros((n_sequences, seq_length, vocab_size), dtype=np.float32)            
X_test = np.zeros((n_sequences_test, seq_length, vocab_size), dtype=np.float32)         
y = np.zeros((n_sequences, vocab_size), dtype=np.float32)
y_test = np.zeros((n_sequences_test, vocab_size), dtype=np.float32)
                  

# Fill the training data
for i, sequence in enumerate(seq_in):
    y[i, char_to_int[seq_out[i]]] = 1
    for j, char in enumerate(sequence):
        X[i, j, char_to_int[char]] = 1
        
# Fill the test data
for i, sequence in enumerate(seq_in_test):
    y_test[i, char_to_int[seq_out_test[i]]] = 1
    for j, char in enumerate(sequence):
        X_test[i, j, char_to_int[char]] = 1

In [13]:
print(f'Shape of the tensor X: {X.shape}, shape of the matrix y: {y.shape}.')

Shape of the tensor X: (498, 40, 28), shape of the matrix y: (498, 28).


In [14]:
#Compute the per-character perplexity of model predictions

def perplexity(y_true, y_pred):
    likelihoods = np.sum(y_pred * y_true, axis=1)
    return 2 ** (-np.mean(np.log2(likelihoods)))

In [15]:
# define the LSTM model

model = Sequential()
model.add(LSTM(512, input_shape=(X.shape[1], X.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer="adam") #optimize the model

In [16]:
def model_perplexity(model, X, y):
    predictions = model(X)
    return perplexity(y, predictions)

In [17]:
print(f'Model perplexity on the untrained model is {model_perplexity(model, X_test, y_test)}.')

Model perplexity on the untrained model is 27.953109766532002.


Let's train the model for one epoch on a very small subset of the training set to check that it is well defined.

In [18]:
small_train = slice(0, None, 40)
historic_run = model.fit(X[small_train], y[small_train], validation_split=0.1, batch_size=512, epochs=40)                  

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


In [19]:
print(f'The model perplexity on the model trained one epoch is {model_perplexity(model, X_test, y_test)}.')

The model perplexity on the model trained one epoch is 218.2481985324807.
