In [None]:
import numpy as np
import pickle
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

In [None]:
# load
def load_doc(filename):
    """Return the entire contents of the text file `filename` as one string."""
    # Context manager guarantees the handle is closed even if read() fails.
    with open(filename, 'r') as file:
        return file.read()


in_filename = 'data/republic_sequences.txt'
doc = load_doc(in_filename)
# One training sequence per line. Blank lines (e.g. from a trailing newline)
# are skipped: they would not yield a full-length sequence, and the later
# 2-D slicing of `sequences` needs every row to have the same length.
lines = [line for line in doc.split('\n') if line.strip()]

In [None]:
# integer encode sequences of words
# A Tokenizer created with default arguments is fitted on every line, and the
# same lines are then converted to integer sequences. `sequences` is turned
# into a NumPy array and split into inputs/targets in a later cell.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [None]:
# save the tokenizer
# The context manager flushes and closes tokenizer.pkl as soon as the dump
# finishes, so the file is complete on disk before anything reads it back.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# load the tokenizer
# NOTE: unpickling can execute arbitrary code; only load a tokenizer.pkl that
# was produced by this notebook or another trusted source.
# The context manager closes the file handle once the object is loaded.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [None]:
# vocabulary size
# One more than the number of distinct words the tokenizer knows: per the
# Keras Tokenizer docs, word indices start at 1 and index 0 is reserved, so
# the embedding and the one-hot targets need len(word_index) + 1 slots.
vocab_size = 1 + len(tokenizer.word_index)

In [None]:
# separate into input and output
# Every row of `sequences` is one training example: all tokens except the
# last form the model input, and the last token is the word to predict.
sequences = np.array(sequences)
X = sequences[:, :-1]
seq_length = X.shape[1]
# One-hot encode the target word over the whole vocabulary.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [None]:
# define model
# Stack: word embedding (50 dims) -> two LSTM layers (100 units each)
# -> dense ReLU layer (100 units) -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
# return_sequences=True makes the first LSTM emit its output at every
# timestep, which is what the second LSTM layer consumes.
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so it must not be
# wrapped in print() (that would append a stray "None" to the output).
model.summary()

In [None]:
# compile model
# Categorical cross-entropy pairs with the one-hot targets produced by
# to_categorical; accuracy is tracked as an additional metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# fit model
# Training settings; both values are reused when the saved model file is named.
batch_size = 256
epochs = 100
model.fit(
    X,
    y,
    batch_size=batch_size,
    epochs=epochs,
)

In [None]:
# save the model to file
# The file name records the training settings: "<batch_size>-<epochs>.h5".
model_name = '{}-{}.h5'.format(batch_size, epochs)
model.save(model_name)