In [20]:
# LSTM with Variable Length Input Sequences to One Character Output
import numpy
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.utils import np_utils
from keras.preprocessing.sequence import pad_sequences

In [21]:
# Seed numpy's global random generator so the sampled dataset is repeatable
# from run to run.
SEED = 7
numpy.random.seed(SEED)

In [22]:
# Raw dataset: the 26 uppercase letters 'A'..'Z' in order, generated from
# consecutive code points instead of being typed out.
alphabet = "".join(chr(ord("A") + offset) for offset in range(26))

In [23]:
# Lookup tables between letters and their position in `alphabet`:
#   char_to_int: letter -> index, int_to_char: index -> letter
char_to_int = {letter: position for position, letter in enumerate(alphabet)}
int_to_char = dict(enumerate(alphabet))

In [24]:
# prepare the dataset of input to output pairs encoded as integers
#
# Each sample is a run of 1..max_len consecutive letters (the input) and the
# letter that immediately follows the run (the output).
num_inputs = 1000
max_len = 5
dataX = []
dataY = []
# Flat tuple of (input, '->', output) triples, used only for the preview
# printed below. It has to be bound before the loop: `seq += ...` on an
# unbound name raises NameError on a fresh kernel.
seq = ()
for i in range(num_inputs):
    # randint's upper bound is exclusive, so start is at most len(alphabet)-3
    start = numpy.random.randint(len(alphabet)-2)
    # end stays below len(alphabet)-1 so that alphabet[end + 1] always exists,
    # and below start+max_len so the input never exceeds max_len letters
    end = numpy.random.randint(start, min(start+max_len,len(alphabet)-1))
    sequence_in = alphabet[start:end+1]
    sequence_out = alphabet[end + 1]
    dataX.append([char_to_int[char] for char in sequence_in])
    dataY.append(char_to_int[sequence_out])
    seq += sequence_in, '->', sequence_out
# preview the first 50 samples (3 tuple entries per sample)
print(seq[:150])


('QR', '->', 'S', 'BCDEF', '->', 'G', 'PQRS', '->', 'T', 'BC', '->', 'D', 'TUVW', '->', 'X', 'DEFGH', '->', 'I', 'TU', '->', 'V', 'B', '->', 'C', 'VW', '->', 'X', 'MN', '->', 'O', 'CDEFG', '->', 'H', 'DEFG', '->', 'H', 'QRST', '->', 'U', 'RS', '->', 'T', 'OPQ', '->', 'R', 'KLMNO', '->', 'P', 'GHIJ', '->', 'K', 'STUV', '->', 'W', 'OPQRS', '->', 'T', 'L', '->', 'M', 'GHIJ', '->', 'K', 'JKL', '->', 'M', 'AB', '->', 'C', 'OP', '->', 'Q', 'IJ', '->', 'K', 'ABC', '->', 'D', 'HIJK', '->', 'L', 'IJKLM', '->', 'N', 'TUVWX', '->', 'Y', 'R', '->', 'S', 'X', '->', 'Y', 'IJKLM', '->', 'N', 'S', '->', 'T', 'PQ', '->', 'R', 'DEF', '->', 'G', 'KLMNO', '->', 'P', 'NOP', '->', 'Q', 'EF', '->', 'G', 'M', '->', 'N', 'KLMN', '->', 'O', 'NO', '->', 'P', 'ST', '->', 'U', 'C', '->', 'D', 'QR', '->', 'S', 'D', '->', 'E', 'KLM', '->', 'N', 'LMN', '->', 'O', 'X', '->', 'Y', 'STU', '->', 'V', 'WX', '->', 'Y')


In [25]:
# convert list of lists to array and pad sequences if needed
# NOTE(review): pad_sequences presumably fills with 0 by default, which is
# also the integer code of 'A' (char_to_int['A'] == 0) -- confirm whether the
# padding should be made distinguishable from a real 'A'.
X = pad_sequences(dataX, maxlen=max_len, dtype='float32')
# reshape X to be [samples, time steps, features]
X = numpy.reshape(X, (X.shape[0], max_len, 1))
# normalize the integer codes into the range [0, 1)
X = X / float(len(alphabet))
# one hot encode the output variable; num_classes is pinned to the alphabet
# size so the number of columns does not depend on which letters happened to
# be sampled (without it the width is inferred from the largest label present)
y = np_utils.to_categorical(dataY, num_classes=len(alphabet))
# (the model is created and fitted in the next cell)

In [39]:
# Create and fit the model: one LSTM layer that reads sequences of
# X.shape[1] time steps with a single feature each, followed by a softmax
# layer with one unit per column of the one-hot encoded targets.
batch_size = 50
model = Sequential([
    LSTM(32, input_shape=(X.shape[1], 1)),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.fit(X, y, epochs=500, batch_size=batch_size, verbose=0)

<keras.callbacks.callbacks.History at 0x1f4c3398e48>

In [40]:
# Summarize performance of the model. X and y are the same arrays the model
# was fitted on, so this is training accuracy rather than a held-out score.
scores = model.evaluate(X, y, verbose=0)
# scores[0] is the loss; scores[1] is the 'accuracy' metric requested at compile time
accuracy = scores[1]
print("Model Accuracy: %.2f%%" % (accuracy * 100))

Model Accuracy: 88.50%


In [41]:
# Demonstrate some model predictions: draw 20 random samples from the
# generated dataset, preprocess each one exactly as the training data was
# (pad, reshape to [1, time steps, 1], normalize) and print the letter with
# the highest predicted probability.
for i in range(20):
    pattern_index = numpy.random.randint(len(dataX))
    pattern = dataX[pattern_index]
    x = pad_sequences([pattern], maxlen=max_len, dtype='float32').reshape((1, max_len, 1))
    x = x / float(len(alphabet))
    prediction = model.predict(x, verbose=0)
    # position of the largest softmax output = predicted class index
    index = prediction.argmax()
    result = int_to_char[index]
    seq_in = list(map(int_to_char.__getitem__, pattern))
    print(seq_in, "->", result)

['I'] -> J
['G', 'H', 'I', 'J', 'K'] -> L
['U', 'V', 'W', 'X'] -> Y
['T', 'U', 'V', 'W', 'X'] -> Y
['T', 'U', 'V'] -> W
['V', 'W', 'X', 'Y'] -> Z
['J', 'K'] -> L
['H', 'I', 'J', 'K'] -> L
['W', 'X'] -> Y
['K', 'L', 'M', 'N', 'O'] -> P
['C', 'D'] -> E
['M'] -> N
['V', 'W', 'X'] -> Y
['N'] -> N
['T', 'U', 'V', 'W'] -> X
['Q'] -> R
['I', 'J', 'K'] -> L
['I'] -> J
['J', 'K', 'L', 'M', 'N'] -> O
['M'] -> N
