In [1]:
from numpy import array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
# from keras.utils.vis_utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

In [2]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, max_length, seed_text, n_words):
    """Greedily extend ``seed_text`` with ``n_words`` words predicted by ``model``.

    Args:
        model: Trained network whose ``predict`` returns, for each encoded
            input row, one score per word index.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        max_length: Length the encoded text is pre-padded to before every
            prediction (the callers in this notebook pass the model's
            input length).
        seed_text: Text the generation starts from.
        n_words: Number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, separated by single
        spaces. A predicted index that has no entry in
        ``tokenizer.word_index`` contributes an empty word.
    """
    # Invert word -> index once, instead of scanning the vocabulary for
    # every generated word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text produced so far as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]

        # pre-pad sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=max_length, padding='pre')

        # Pick the highest-scoring word index for the single input row.
        # `Sequential.predict_classes` is deprecated; taking the argmax of
        # `predict` is the replacement its deprecation notice recommends.
        yhat = model.predict(encoded, verbose=0).argmax(axis=-1)
        print('yhat :', yhat)

        # map predicted word index to word ('' when the index is unknown)
        out_word = index_to_word.get(int(yhat[0]), '')
        # append to input
        in_text += ' ' + out_word
    return in_text

In [3]:
# define the model
def define_model(vocab_size, max_length):
    """Build, compile and summarise the next-word prediction network.

    Args:
        vocab_size: Number of rows in the embedding table and number of
            units in the softmax output layer.
        max_length: Length of a full padded sequence; the network's input
            length is ``max_length - 1``.

    Returns:
        The compiled ``Sequential`` model.
    """
    input_length = max_length - 1
    network = Sequential([
        Embedding(vocab_size, 10, input_length=input_length),
        LSTM(50),
        Dense(vocab_size, activation='softmax'),
    ])

    # compile network
    network.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    # print the layer-by-layer summary of the defined model
    network.summary()
    return network

In [4]:
# source text: the four lines of the rhyme; each line break is doubled
# (an escaped newline followed by a literal one)
data = (
    " Jack and Jill went up the hill\n\n"
    "To fetch a pail of water\n\n"
    "Jack fell down and broke his crown\n\n"
    "And Jill came tumbling after\n "
)

In [5]:
# prepare the tokenizer on the source text; fitting populates
# tokenizer.word_index, which the following cells read
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

In [6]:
# determine the vocabulary size: one slot per known word, plus one extra
# because index 0 is the padding value and not a word
vocab_size = 1 + len(tokenizer.word_index)
print('Vocabulary Size: %d' % vocab_size)

Vocabulary Size: 22


In [7]:
# create line-based sequences: for every line of the text, collect each
# prefix of at least two word indices
sequences = []

for line in data.split('\n'):
    encoded = tokenizer.texts_to_sequences([line])[0]
    # prefixes encoded[:2], encoded[:3], ..., encoded[:len(encoded)]
    sequences.extend(encoded[:end] for end in range(2, len(encoded) + 1))

print('Total Sequences: %d' % len(sequences))

Total Sequences: 21


In [8]:
# inspect the prefix sequences before padding
sequences

[[2, 1],
 [2, 1, 3],
 [2, 1, 3, 4],
 [2, 1, 3, 4, 5],
 [2, 1, 3, 4, 5, 6],
 [2, 1, 3, 4, 5, 6, 7],
 [8, 9],
 [8, 9, 10],
 [8, 9, 10, 11],
 [8, 9, 10, 11, 12],
 [8, 9, 10, 11, 12, 13],
 [2, 14],
 [2, 14, 15],
 [2, 14, 15, 1],
 [2, 14, 15, 1, 16],
 [2, 14, 15, 1, 16, 17],
 [2, 14, 15, 1, 16, 17, 18],
 [1, 3],
 [1, 3, 19],
 [1, 3, 19, 20],
 [1, 3, 19, 20, 21]]

In [9]:
# pad input sequences on the left so that every row is as long as the
# longest sequence
max_length = max(map(len, sequences))
sequences = pad_sequences(sequences, padding='pre', maxlen=max_length)
print('Max Sequence Length: %d' % max_length)

Max Sequence Length: 7


In [10]:
# split into input and output elements: every column but the last is the
# input context, the last column is the word to predict
sequences = array(sequences)
X = sequences[:, :-1]
y = sequences[:, -1]


In [11]:
# inspect the input contexts (left-padded with 0)
X

array([[ 0,  0,  0,  0,  0,  2],
       [ 0,  0,  0,  0,  2,  1],
       [ 0,  0,  0,  2,  1,  3],
       [ 0,  0,  2,  1,  3,  4],
       [ 0,  2,  1,  3,  4,  5],
       [ 2,  1,  3,  4,  5,  6],
       [ 0,  0,  0,  0,  0,  8],
       [ 0,  0,  0,  0,  8,  9],
       [ 0,  0,  0,  8,  9, 10],
       [ 0,  0,  8,  9, 10, 11],
       [ 0,  8,  9, 10, 11, 12],
       [ 0,  0,  0,  0,  0,  2],
       [ 0,  0,  0,  0,  2, 14],
       [ 0,  0,  0,  2, 14, 15],
       [ 0,  0,  2, 14, 15,  1],
       [ 0,  2, 14, 15,  1, 16],
       [ 2, 14, 15,  1, 16, 17],
       [ 0,  0,  0,  0,  0,  1],
       [ 0,  0,  0,  0,  1,  3],
       [ 0,  0,  0,  1,  3, 19],
       [ 0,  0,  1,  3, 19, 20]])

In [12]:
# inspect the integer target word of each input row
y

array([ 1,  3,  4,  5,  6,  7,  9, 10, 11, 12, 13, 14, 15,  1, 16, 17, 18,
        3, 19, 20, 21])

In [13]:
# One-hot encode the target words. The targets are re-derived from the last
# column of `sequences` instead of from `y` itself, so re-running this cell
# cannot one-hot encode an already one-hot `y`.
y = to_categorical(sequences[:, -1], num_classes=vocab_size)

In [14]:
# define model; define_model takes the full padded sequence length and uses
# max_length - 1 as the input length, which is the number of columns in X
model = define_model(vocab_size, max_length)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 6, 10)             220       
_________________________________________________________________
lstm (LSTM)                  (None, 50)                12200     
_________________________________________________________________
dense (Dense)                (None, 22)                1122      
Total params: 13,542
Trainable params: 13,542
Non-trainable params: 0
_________________________________________________________________


In [15]:
# fit network for 500 epochs; verbose=2 prints the loss and accuracy of
# every epoch
model.fit(X, y, epochs=500, verbose=2)

Epoch 1/500
1/1 - 0s - loss: 3.0907 - accuracy: 0.0000e+00
Epoch 2/500
1/1 - 0s - loss: 3.0892 - accuracy: 0.0000e+00
Epoch 3/500
1/1 - 0s - loss: 3.0877 - accuracy: 0.0000e+00
Epoch 4/500
1/1 - 0s - loss: 3.0861 - accuracy: 0.1429
Epoch 5/500
1/1 - 0s - loss: 3.0846 - accuracy: 0.1905
Epoch 6/500
1/1 - 0s - loss: 3.0829 - accuracy: 0.1905
Epoch 7/500
1/1 - 0s - loss: 3.0813 - accuracy: 0.1905
Epoch 8/500
1/1 - 0s - loss: 3.0795 - accuracy: 0.1905
Epoch 9/500
1/1 - 0s - loss: 3.0777 - accuracy: 0.1905
Epoch 10/500
1/1 - 0s - loss: 3.0758 - accuracy: 0.1429
Epoch 11/500
1/1 - 0s - loss: 3.0739 - accuracy: 0.0952
Epoch 12/500
1/1 - 0s - loss: 3.0718 - accuracy: 0.0952
Epoch 13/500
1/1 - 0s - loss: 3.0696 - accuracy: 0.0952
Epoch 14/500
1/1 - 0s - loss: 3.0673 - accuracy: 0.0952
Epoch 15/500
1/1 - 0s - loss: 3.0648 - accuracy: 0.0952
Epoch 16/500
1/1 - 0s - loss: 3.0622 - accuracy: 0.0952
Epoch 17/500
1/1 - 0s - loss: 3.0594 - accuracy: 0.0952
Epoch 18/500
1/1 - 0s - loss: 3.0563 - accura

Epoch 147/500
1/1 - 0s - loss: 0.8853 - accuracy: 0.8095
Epoch 148/500
1/1 - 0s - loss: 0.8773 - accuracy: 0.8095
Epoch 149/500
1/1 - 0s - loss: 0.8693 - accuracy: 0.8095
Epoch 150/500
1/1 - 0s - loss: 0.8615 - accuracy: 0.8095
Epoch 151/500
1/1 - 0s - loss: 0.8539 - accuracy: 0.8095
Epoch 152/500
1/1 - 0s - loss: 0.8466 - accuracy: 0.8095
Epoch 153/500
1/1 - 0s - loss: 0.8395 - accuracy: 0.8095
Epoch 154/500
1/1 - 0s - loss: 0.8323 - accuracy: 0.8095
Epoch 155/500
1/1 - 0s - loss: 0.8252 - accuracy: 0.8095
Epoch 156/500
1/1 - 0s - loss: 0.8182 - accuracy: 0.8095
Epoch 157/500
1/1 - 0s - loss: 0.8114 - accuracy: 0.8095
Epoch 158/500
1/1 - 0s - loss: 0.8048 - accuracy: 0.8095
Epoch 159/500
1/1 - 0s - loss: 0.7983 - accuracy: 0.8095
Epoch 160/500
1/1 - 0s - loss: 0.7920 - accuracy: 0.8095
Epoch 161/500
1/1 - 0s - loss: 0.7858 - accuracy: 0.8095
Epoch 162/500
1/1 - 0s - loss: 0.7797 - accuracy: 0.8095
Epoch 163/500
1/1 - 0s - loss: 0.7738 - accuracy: 0.8095
Epoch 164/500
1/1 - 0s - loss: 

Epoch 291/500
1/1 - 0s - loss: 0.3260 - accuracy: 0.9048
Epoch 292/500
1/1 - 0s - loss: 0.3240 - accuracy: 0.9048
Epoch 293/500
1/1 - 0s - loss: 0.3221 - accuracy: 0.9048
Epoch 294/500
1/1 - 0s - loss: 0.3201 - accuracy: 0.9048
Epoch 295/500
1/1 - 0s - loss: 0.3181 - accuracy: 0.9048
Epoch 296/500
1/1 - 0s - loss: 0.3162 - accuracy: 0.9048
Epoch 297/500
1/1 - 0s - loss: 0.3142 - accuracy: 0.9048
Epoch 298/500
1/1 - 0s - loss: 0.3123 - accuracy: 0.9048
Epoch 299/500
1/1 - 0s - loss: 0.3103 - accuracy: 0.9048
Epoch 300/500
1/1 - 0s - loss: 0.3084 - accuracy: 0.9048
Epoch 301/500
1/1 - 0s - loss: 0.3065 - accuracy: 0.9048
Epoch 302/500
1/1 - 0s - loss: 0.3046 - accuracy: 0.9048
Epoch 303/500
1/1 - 0s - loss: 0.3027 - accuracy: 0.9048
Epoch 304/500
1/1 - 0s - loss: 0.3009 - accuracy: 0.9048
Epoch 305/500
1/1 - 0s - loss: 0.2990 - accuracy: 0.9048
Epoch 306/500
1/1 - 0s - loss: 0.2972 - accuracy: 0.9524
Epoch 307/500
1/1 - 0s - loss: 0.2953 - accuracy: 0.9524
Epoch 308/500
1/1 - 0s - loss: 

Epoch 435/500
1/1 - 0s - loss: 0.1335 - accuracy: 0.9524
Epoch 436/500
1/1 - 0s - loss: 0.1329 - accuracy: 0.9524
Epoch 437/500
1/1 - 0s - loss: 0.1323 - accuracy: 0.9524
Epoch 438/500
1/1 - 0s - loss: 0.1317 - accuracy: 0.9524
Epoch 439/500
1/1 - 0s - loss: 0.1311 - accuracy: 0.9524
Epoch 440/500
1/1 - 0s - loss: 0.1305 - accuracy: 0.9524
Epoch 441/500
1/1 - 0s - loss: 0.1299 - accuracy: 0.9524
Epoch 442/500
1/1 - 0s - loss: 0.1293 - accuracy: 0.9524
Epoch 443/500
1/1 - 0s - loss: 0.1287 - accuracy: 0.9524
Epoch 444/500
1/1 - 0s - loss: 0.1282 - accuracy: 0.9524
Epoch 445/500
1/1 - 0s - loss: 0.1276 - accuracy: 0.9524
Epoch 446/500
1/1 - 0s - loss: 0.1271 - accuracy: 0.9524
Epoch 447/500
1/1 - 0s - loss: 0.1265 - accuracy: 0.9524
Epoch 448/500
1/1 - 0s - loss: 0.1260 - accuracy: 0.9524
Epoch 449/500
1/1 - 0s - loss: 0.1255 - accuracy: 0.9524
Epoch 450/500
1/1 - 0s - loss: 0.1249 - accuracy: 0.9524
Epoch 451/500
1/1 - 0s - loss: 0.1244 - accuracy: 0.9524
Epoch 452/500
1/1 - 0s - loss: 

<tensorflow.python.keras.callbacks.History at 0x1ccbd237048>

In [16]:
""" Jack and Jill went up the hill\n
To fetch a pail of water\n
Jack fell down and broke his crown\n
And Jill came tumbling after\n """

' Jack and Jill went up the hill\n\nTo fetch a pail of water\n\nJack fell down and broke his crown\n\nAnd Jill came tumbling after\n '

In [17]:
# evaluate model: continue a seed taken from the first line of the rhyme;
# max_length-1 is the input length the model was built with
print(generate_seq(model, tokenizer, max_length-1, 'Jack and Jill', 3))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
yhat : [4]
yhat : [5]
yhat : [6]
Jack and Jill went up the


In [18]:
# seed taken from the third line of the rhyme, generate three words
print(generate_seq(model, tokenizer, max_length-1, 'Jack fell down', 3))

yhat : [1]
yhat : [16]
yhat : [17]
Jack fell down and broke his


In [19]:
# same seed, four words: enough to reach the end of the third line
print(generate_seq(model, tokenizer, max_length-1, 'Jack fell down', 4))

yhat : [1]
yhat : [16]
yhat : [17]
yhat : [18]
Jack fell down and broke his crown


In [20]:
# longer seed from the third line, generate the remaining three words
print(generate_seq(model, tokenizer, max_length-1, 'Jack fell down and', 3))

yhat : [16]
yhat : [17]
yhat : [18]
Jack fell down and broke his crown


In [21]:
# seed from the fourth line; only two words follow this seed in the source
# text, so the third generated word has no training example behind it
print(generate_seq(model, tokenizer, max_length-1, 'And Jill came', 3))

yhat : [20]
yhat : [21]
yhat : [7]
And Jill came tumbling after hill
