In [1]:
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding

Using TensorFlow backend.


In [2]:
# Toy training corpus: two short sentences whose periods are set off by spaces.
data = "My name is prakhar mishra . prakhar mishra writes blog on medium ."

In [4]:
# Fit the word -> integer-id vocabulary on the corpus, then encode that same
# corpus as its sequence of ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# One text goes in, so exactly one id sequence comes back; unpack it.
(encoded,) = tokenizer.texts_to_sequences([data])
encoded

[3, 4, 5, 1, 2, 1, 2, 6, 7, 8, 9]

In [5]:
# Vocabulary size: the ids in the encoded corpus start at 1, so one extra slot
# is added to cover id 0 as well.
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

10

In [9]:
# Build the input/output pairs: every word id together with the id that
# follows it, i.e. [current, next] for each adjacent position in the corpus.
print(len(encoded))
sequences = [encoded[start:start + 2] for start in range(len(encoded) - 1)]

11


In [10]:
# Show the [current word id, next word id] training pairs built above.
print(sequences)

[[3, 4], [4, 5], [5, 1], [1, 2], [2, 1], [1, 2], [2, 6], [6, 7], [7, 8], [8, 9]]


In [11]:
# Stack the pairs into a 2-column array and split it column-wise:
# column 0 is the current word id (input), column 1 the next word id (target).
sequences = np.array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]

In [12]:
# Inspect the inputs: the first id of every pair.
X

array([3, 4, 5, 1, 2, 1, 2, 6, 7, 8])

In [13]:
# Inspect the targets: the second id of every pair (still integer ids here).
y

array([4, 5, 1, 2, 1, 2, 6, 7, 8, 9])

In [14]:
# Convert the next-word ids to one-hot rows of width vocab_size.
# The ids are read from column 1 of `sequences` rather than from `y` itself, so
# re-running this cell always starts from the integer ids instead of one-hot
# encoding a matrix that is already one-hot.
y = to_categorical(sequences[:, 1], num_classes=vocab_size)

In [15]:
# Inspect the one-hot targets: one row per pair, one column per token id.
y

array([[ 0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.]])

In [16]:
# Next-word network definition
embedding_size = 10
def model(vocab_size, embedding_dim=None, lstm_units=50):
    """Build and compile the one-word-in, next-word-out network.

    The stack is Embedding -> LSTM -> softmax Dense, compiled with categorical
    cross-entropy loss, the Adam optimizer and an accuracy metric. The layer
    summary is printed as a side effect before the model is returned.

    Args:
        vocab_size: Number of distinct token ids. Used both as the embedding
            input dimension and as the width of the softmax output.
        embedding_dim: Width of the embedding vectors. ``None`` (the default)
            falls back to the module-level ``embedding_size``.
        lstm_units: Number of units in the LSTM layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    if embedding_dim is None:
        # Resolved at call time so a changed `embedding_size` is picked up.
        embedding_dim = embedding_size

    # Named `network` so the local does not shadow this function's own name.
    network = Sequential()
    # input_length=1: each sample is a single word id
    network.add(Embedding(vocab_size, embedding_dim, input_length=1))
    network.add(LSTM(lstm_units))
    network.add(Dense(vocab_size, activation="softmax"))
    network.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
    network.summary()
    return network

In [17]:
# `model` is the factory function until this cell runs, and the built network
# afterwards. Keep a separate handle on the factory so that re-running this
# cell builds a fresh network instead of calling the already built Sequential
# instance with an integer.
if not isinstance(model, Sequential):
    build_model = model
model = build_model(vocab_size)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1, 10)             100       
_________________________________________________________________
lstm_1 (LSTM)                (None, 50)                12200     
_________________________________________________________________
dense_1 (Dense)              (None, 10)                510       
Total params: 12,810
Trainable params: 12,810
Non-trainable params: 0
_________________________________________________________________


In [18]:
# Train on the (current word -> next word) pairs; verbose=2 prints a compact
# per-epoch log of loss and accuracy.
model.fit(x=X, y=y, epochs=500, verbose=2)

Epoch 1/500
1s - loss: 2.3030 - acc: 0.0000e+00
Epoch 2/500
0s - loss: 2.3017 - acc: 0.0000e+00
Epoch 3/500
0s - loss: 2.3005 - acc: 0.1000
Epoch 4/500
0s - loss: 2.2992 - acc: 0.3000
Epoch 5/500
0s - loss: 2.2979 - acc: 0.3000
Epoch 6/500
0s - loss: 2.2967 - acc: 0.3000
Epoch 7/500
0s - loss: 2.2954 - acc: 0.4000
Epoch 8/500
0s - loss: 2.2941 - acc: 0.4000
Epoch 9/500
0s - loss: 2.2928 - acc: 0.4000
Epoch 10/500
0s - loss: 2.2915 - acc: 0.4000
Epoch 11/500
0s - loss: 2.2902 - acc: 0.4000
Epoch 12/500
0s - loss: 2.2888 - acc: 0.4000
Epoch 13/500
0s - loss: 2.2875 - acc: 0.4000
Epoch 14/500
0s - loss: 2.2861 - acc: 0.4000
Epoch 15/500
0s - loss: 2.2847 - acc: 0.4000
Epoch 16/500
0s - loss: 2.2833 - acc: 0.4000
Epoch 17/500
0s - loss: 2.2819 - acc: 0.4000
Epoch 18/500
0s - loss: 2.2804 - acc: 0.4000
Epoch 19/500
0s - loss: 2.2789 - acc: 0.4000
Epoch 20/500
0s - loss: 2.2774 - acc: 0.4000
Epoch 21/500
0s - loss: 2.2758 - acc: 0.4000
Epoch 22/500
0s - loss: 2.2742 - acc: 0.4000
Epoch 23/50

0s - loss: 0.5506 - acc: 0.9000
Epoch 223/500
0s - loss: 0.5441 - acc: 0.9000
Epoch 224/500
0s - loss: 0.5378 - acc: 0.9000
Epoch 225/500
0s - loss: 0.5315 - acc: 0.9000
Epoch 226/500
0s - loss: 0.5252 - acc: 0.9000
Epoch 227/500
0s - loss: 0.5191 - acc: 0.9000
Epoch 228/500
0s - loss: 0.5130 - acc: 0.9000
Epoch 229/500
0s - loss: 0.5070 - acc: 0.9000
Epoch 230/500
0s - loss: 0.5011 - acc: 0.9000
Epoch 231/500
0s - loss: 0.4952 - acc: 0.9000
Epoch 232/500
0s - loss: 0.4895 - acc: 0.9000
Epoch 233/500
0s - loss: 0.4838 - acc: 0.9000
Epoch 234/500
0s - loss: 0.4782 - acc: 0.9000
Epoch 235/500
0s - loss: 0.4726 - acc: 0.9000
Epoch 236/500
0s - loss: 0.4672 - acc: 0.9000
Epoch 237/500
0s - loss: 0.4618 - acc: 0.9000
Epoch 238/500
0s - loss: 0.4565 - acc: 0.9000
Epoch 239/500
0s - loss: 0.4513 - acc: 0.9000
Epoch 240/500
0s - loss: 0.4461 - acc: 0.9000
Epoch 241/500
0s - loss: 0.4410 - acc: 0.9000
Epoch 242/500
0s - loss: 0.4360 - acc: 0.9000
Epoch 243/500
0s - loss: 0.4311 - acc: 0.9000
Ep

0s - loss: 0.1589 - acc: 0.9000
Epoch 469/500
0s - loss: 0.1587 - acc: 0.9000
Epoch 470/500
0s - loss: 0.1586 - acc: 0.9000
Epoch 471/500
0s - loss: 0.1585 - acc: 0.9000
Epoch 472/500
0s - loss: 0.1583 - acc: 0.9000
Epoch 473/500
0s - loss: 0.1582 - acc: 0.9000
Epoch 474/500
0s - loss: 0.1581 - acc: 0.9000
Epoch 475/500
0s - loss: 0.1579 - acc: 0.9000
Epoch 476/500
0s - loss: 0.1578 - acc: 0.9000
Epoch 477/500
0s - loss: 0.1577 - acc: 0.9000
Epoch 478/500
0s - loss: 0.1575 - acc: 0.9000
Epoch 479/500
0s - loss: 0.1574 - acc: 0.9000
Epoch 480/500
0s - loss: 0.1573 - acc: 0.9000
Epoch 481/500
0s - loss: 0.1572 - acc: 0.9000
Epoch 482/500
0s - loss: 0.1570 - acc: 0.9000
Epoch 483/500
0s - loss: 0.1569 - acc: 0.9000
Epoch 484/500
0s - loss: 0.1568 - acc: 0.9000
Epoch 485/500
0s - loss: 0.1567 - acc: 0.9000
Epoch 486/500
0s - loss: 0.1566 - acc: 0.9000
Epoch 487/500
0s - loss: 0.1564 - acc: 0.9000
Epoch 488/500
0s - loss: 0.1563 - acc: 0.9000
Epoch 489/500
0s - loss: 0.1562 - acc: 0.9000
Ep

<keras.callbacks.History at 0x7fdfaf79c278>

In [19]:
# testing the model: predict the id of the word that follows `seed`
seed = "writes"
# The seed ids get their own name so the training sequence held in `encoded`
# is not overwritten; the pair-building cell reads `encoded` and would
# otherwise pick up the one-word seed when re-run.
seed_ids = tokenizer.texts_to_sequences([seed])[0]
if not seed_ids:
    raise ValueError("seed %r contains no word known to the tokenizer" % seed)
# One row per seed word, one time step per row (the model was built with input_length=1).
seed_ids = np.array(seed_ids).reshape(-1, 1)
# Taking the argmax over the softmax output yields the predicted class ids
# without relying on Sequential.predict_classes, which current Keras
# releases no longer provide.
y_pred = np.argmax(model.predict(seed_ids, verbose=0), axis=-1)

In [20]:
# Inspect the predicted token id(s) for the seed word.
y_pred

array([7])

In [22]:
# Invert the tokenizer's word -> id mapping once so each predicted id is a
# direct lookup. Iterating over the flattened predictions avoids comparing an
# int against a whole array, which only has an unambiguous truth value when
# y_pred holds exactly one element.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}
for predicted_id in np.ravel(y_pred):
    word = index_to_word.get(int(predicted_id))
    # ids with no vocabulary entry are skipped silently
    if word is not None:
        print(word)

blog
