#### LSTM with variable length input sequences to one character output

In [5]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.utils import np_utils
from tensorflow.keras.preprocessing.sequence import pad_sequences 

#### Fix random seeds for Reproducibility 

In [7]:
np.random.seed(7)

#### Define row dataset

In [1]:
alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"

#### Creating mapping of characters to integers (0-25) and reverse

In [23]:
char_to_int = dict((c, i) for i, c in enumerate(alphabet))
int_to_char = dict((i, c) for i, c in enumerate(alphabet))

#### Prepare the dataset of input and output pairs encoded as integers

In [46]:
num_inputs = 1000
max_len = 5
dataX = []
dataY = []
for i in range(num_inputs):
    start = np.random.randint(len(alphabet)-2)
    end = np.random.randint(start, min(start+max_len,len(alphabet)-1))
    sequence_in = alphabet[start:end+1]
    sequence_out = alphabet[end + 1]
    dataX.append([char_to_int[char] for char in sequence_in])
    dataY.append(char_to_int[sequence_out])
    seq += sequence_in, '->', sequence_out
print(seq[:150])

[[4, 5, 6, 7], 8, 'EFGH', '->', 'I', [14, 15, 16, 17, 18], 19, 'OPQRS', '->', 'T', [2, 3], 4, 'CD', '->', 'E', [10, 11], 12, 'KL', '->', 'M', [14, 15, 16, 17, 18], 19, 'OPQRS', '->', 'T', [3], 4, 'D', '->', 'E', [14, 15, 16, 17], 18, 'OPQR', '->', 'S', [4, 5, 6, 7], 8, 'EFGH', '->', 'I', [16, 17, 18], 19, 'QRS', '->', 'T', [6], 7, 'G', '->', 'H', [22], 23, 'W', '->', 'X', [20, 21, 22], 23, 'UVW', '->', 'X', [13, 14, 15], 16, 'NOP', '->', 'Q', [9, 10, 11, 12], 13, 'JKLM', '->', 'N', [9, 10, 11, 12], 13, 'JKLM', '->', 'N', [18, 19], 20, 'ST', '->', 'U', [2, 3, 4, 5, 6], 7, 'CDEFG', '->', 'H', [21, 22, 23], 24, 'VWX', '->', 'Y', [11], 12, 'L', '->', 'M', [11], 12, 'L', '->', 'M', [12, 13], 14, 'MN', '->', 'O', [10, 11, 12, 13, 14], 15, 'KLMNO', '->', 'P', [9, 10, 11, 12, 13], 14, 'JKLMN', '->', 'O', [20, 21, 22, 23], 24, 'UVWX', '->', 'Y', [13, 14, 15], 16, 'NOP', '->', 'Q', [14, 15, 16], 17, 'OPQ', '->', 'R', [13, 14, 15, 16], 17, 'NOPQ', '->', 'R', [18, 19, 20, 21, 22], 23, 'STUVW', '->

#### Convert the List of Lists to an Array and pad_sequences if needed

In [48]:
# convert list of lists to array and pad sequences if needed
X = pad_sequences(dataX, maxlen=max_len, dtype='float32')
# reshape X to be [samples, time steps, features]
X = numpy.reshape(X, (X.shape[0], max_len, 1))
# normalize
X = X / float(len(alphabet))
# one hot encode the output variable
y = np_utils.to_categorical(dataY)
# create and fit the model

In [50]:
batch_size = 50
model = Sequential()
model.add(LSTM(32, input_shape=(X.shape[1], 1)))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=500, batch_size=batch_size, verbose=0)

2022-09-10 13:51:51.772576: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-10 13:51:51.860907: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-10 13:51:51.897606: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


<keras.callbacks.History at 0x166cb0940>

In [51]:
# summarize performance of the model
scores = model.evaluate(X, y, verbose=0)
print("Model Accuracy: %.2f%%" % (scores[1]*100))


2022-09-10 13:53:05.332486: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-10 13:53:05.404554: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Model Accuracy: 89.20%


In [52]:
# demonstrate some model predictions
for i in range(20):
    pattern_index = numpy.random.randint(len(dataX))
    pattern = dataX[pattern_index]
    x = pad_sequences([pattern], maxlen=max_len, dtype='float32')
    x = numpy.reshape(x, (1, max_len, 1))
    x = x / float(len(alphabet))
    prediction = model.predict(x, verbose=0)
    index = numpy.argmax(prediction)
    result = int_to_char[index]
    seq_in = [int_to_char[value] for value in pattern]
    print(seq_in, "->", result)

2022-09-10 13:53:16.035537: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2022-09-10 13:53:16.073444: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


['P', 'Q'] -> R
['G', 'H', 'I', 'J', 'K'] -> L
['E', 'F', 'G'] -> H
['O', 'P'] -> Q
['W'] -> Y
['N', 'O', 'P', 'Q'] -> R
['W', 'X'] -> Y
['Q', 'R', 'S', 'T', 'U'] -> V
['F', 'G'] -> H
['M', 'N', 'O', 'P', 'Q'] -> R
['J', 'K', 'L'] -> M
['X'] -> Y
['I'] -> J
['R', 'S', 'T', 'U'] -> V
['E', 'F'] -> G
['W', 'X', 'Y'] -> Z
['M', 'N', 'O', 'P', 'Q'] -> R
['J', 'K', 'L', 'M', 'N'] -> O
['O', 'P', 'Q', 'R', 'S'] -> T
['P', 'Q', 'R'] -> S
