In [1]:
# One-word-in, one-word-out sequences: the model predicts the next word from the current word
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# define the model
def define_model(vocab_size, embedding_dim=10, lstm_units=50, input_length=1):
	"""Build and compile the next-word prediction network.

	The network is Embedding -> LSTM -> Dense(softmax) and is compiled with
	categorical cross-entropy loss, the Adam optimizer and an accuracy metric.

	Args:
		vocab_size: number of distinct word indices; used both as the embedding
			input dimension and as the width of the softmax output layer.
		embedding_dim: size of the real-valued vector learned for each word.
		lstm_units: number of units in the LSTM layer.
		input_length: number of words per input sequence (1 for the
			one-word-in, one-word-out setup).

	Returns:
		The compiled Sequential model.
	"""
	model = Sequential()
	# real-valued vector for each word; by default each input sequence is a single word
	model.add(Embedding(vocab_size, embedding_dim, input_length=input_length))
	model.add(LSTM(lstm_units))
	# one output per vocabulary index, normalised into a probability distribution
	model.add(Dense(vocab_size, activation='softmax'))
	# compile network
	model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
	return model



In [5]:
# generate a sequence from the model
def generate_seq(model, tokenizer, seed_text, n_words):
	"""Generate text by repeatedly predicting the next word from the current word.

	Args:
		model: trained network. ``predict_classes(x, verbose=0)`` is used when the
			model has it; otherwise ``predict(x, verbose=0)`` is called and the
			highest-scoring class is taken.
		tokenizer: fitted tokenizer providing ``texts_to_sequences`` and ``word_index``.
		seed_text: starting text. Only its last in-vocabulary word is fed to the
			model, because the model takes a single word as input.
		n_words: maximum number of words to append.

	Returns:
		``seed_text`` followed by the generated words, each preceded by a space.
		Generation stops early as soon as the current text contains no known word.
	"""
	# invert the vocabulary once instead of scanning word_index for every generated word
	index_to_word = {index: word for word, index in tokenizer.word_index.items()}
	in_text, result = seed_text, seed_text
	# generate a fixed number of words
	for _ in range(n_words):
		# encode the text as integer
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		if not encoded:
			# nothing the model knows about (unknown seed or empty prediction): stop here
			break
		# feed exactly one word; a longer batch would yield several predictions at once
		encoded = array(encoded[-1:])
		# predict a word in the vocabulary
		if hasattr(model, 'predict_classes'):
			yhat = model.predict_classes(encoded, verbose=0)
		else:
			# models without predict_classes: take the arg-max of the class scores
			yhat = model.predict(encoded, verbose=0).argmax(axis=-1)
		# map predicted word index to word ('' when the index has no word)
		out_word = index_to_word.get(int(yhat[0]), '')
		# append to input
		in_text, result = out_word, result + ' ' + out_word
	return result

In [6]:
# source text: the four-line rhyme below is the whole training corpus
data = """ Jack and Jill went up the hill\n
		To fetch a pail of water\n
		Jack fell down and broke his crown\n
		And Jill came tumbling after\n """
# integer encode text: fit the tokenizer on the corpus, then turn the corpus
# into a flat list with one integer index per word
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
encoded = tokenizer.texts_to_sequences([data])[0]
# determine the vocabulary size; the +1 leaves room for index 0, which no word
# uses (the encoded output starts at 1)
vocab_size = len(tokenizer.word_index) + 1
print('Vocabulary Size: %d' % vocab_size)


Vocabulary Size: 22


In [7]:
# display the integer-encoded corpus: one index per word, in reading order
encoded

[2,
 1,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 2,
 14,
 15,
 1,
 16,
 17,
 18,
 1,
 3,
 19,
 20,
 21]

In [8]:
# create word -> word sequences: each entry is [current word, next word]
sequences = [encoded[i - 1:i + 1] for i in range(1, len(encoded))]
print('Total Sequences: %d' % len(sequences))


Total Sequences: 24


In [9]:
# display the [current word, next word] index pairs built above
sequences

[[2, 1],
 [1, 3],
 [3, 4],
 [4, 5],
 [5, 6],
 [6, 7],
 [7, 8],
 [8, 9],
 [9, 10],
 [10, 11],
 [11, 12],
 [12, 13],
 [13, 2],
 [2, 14],
 [14, 15],
 [15, 1],
 [1, 16],
 [16, 17],
 [17, 18],
 [18, 1],
 [1, 3],
 [3, 19],
 [19, 20],
 [20, 21]]

In [10]:
# first column is the input word, second column is the word to predict
sequences = array(sequences)
X = sequences[:, 0]
# one hot encode the targets so they line up with the softmax output layer
y = to_categorical(sequences[:, 1], num_classes=vocab_size)
# build the network and train it on the word pairs
model = define_model(vocab_size)
model.fit(X, y, epochs=100, verbose=2)
# generate six words starting from the seed word
print(generate_seq(model, tokenizer, 'Jack', 6))

Epoch 1/100
 - 1s - loss: 3.0912 - acc: 0.0000e+00
Epoch 2/100
 - 0s - loss: 3.0904 - acc: 0.0833
Epoch 3/100
 - 0s - loss: 3.0896 - acc: 0.0833
Epoch 4/100
 - 0s - loss: 3.0888 - acc: 0.1250
Epoch 5/100
 - 0s - loss: 3.0881 - acc: 0.1250
Epoch 6/100
 - 0s - loss: 3.0873 - acc: 0.1250
Epoch 7/100
 - 0s - loss: 3.0865 - acc: 0.1250
Epoch 8/100
 - 0s - loss: 3.0857 - acc: 0.1250
Epoch 9/100
 - 0s - loss: 3.0849 - acc: 0.1250
Epoch 10/100
 - 0s - loss: 3.0841 - acc: 0.1250
Epoch 11/100
 - 0s - loss: 3.0832 - acc: 0.1250
Epoch 12/100
 - 0s - loss: 3.0824 - acc: 0.1250
Epoch 13/100
 - 0s - loss: 3.0815 - acc: 0.1250
Epoch 14/100
 - 0s - loss: 3.0807 - acc: 0.1250
Epoch 15/100
 - 0s - loss: 3.0798 - acc: 0.1250
Epoch 16/100
 - 0s - loss: 3.0789 - acc: 0.1250
Epoch 17/100
 - 0s - loss: 3.0780 - acc: 0.1250
Epoch 18/100
 - 0s - loss: 3.0770 - acc: 0.1250
Epoch 19/100
 - 0s - loss: 3.0761 - acc: 0.1250
Epoch 20/100
 - 0s - loss: 3.0751 - acc: 0.1250
Epoch 21/100
 - 0s - loss: 3.0741 - acc: 0.12

In [16]:
# Spot-check the trained model: predict the word that follows a single seed word
in_text = 'Jack'
print(in_text)
# Keep the seed encoding under its own name: rebinding `encoded` here would
# overwrite the corpus-level encoding that the word-pair cell slices, so
# re-running that cell afterwards would silently build no sequences.
seed_encoded = array(tokenizer.texts_to_sequences([in_text])[0])
yhat = model.predict_classes(seed_encoded, verbose=0)
# reverse lookup: print the vocabulary word whose index matches the prediction
for word, index in tokenizer.word_index.items():
	if index == yhat:
		print(word)

Jack
and
