## Predict the next words in a sentence using an LSTM Recurrent Neural Network (RNN)

In [88]:
from numpy import array
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding

In [89]:
#source text
# Read the whole training corpus into memory. The context manager closes the
# file handle as soon as the text has been read instead of leaving it open.
# NOTE(review): this absolute, machine-specific path only resolves on the
# original author's computer - consider a path relative to the notebook.
with open('C:\\Users\\HP.DESKTOP-85IK0UN\\Machine_Learning_Bootcamp\\MinorProject\\word_prediction_model\\story.txt') as story_file:
    data = story_file.read()

In [90]:
# Integer-encode the text: fit the tokenizer on the corpus, then convert the
# corpus into its list of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])
# A single text goes in, so exactly one encoded sequence comes back.
(encoded_data,) = tokenizer.texts_to_sequences([data])
encoded_data

[12,
 11,
 9,
 1,
 348,
 586,
 8,
 6,
 587,
 7,
 349,
 74,
 47,
 2,
 246,
 19,
 198,
 350,
 29,
 351,
 21,
 5,
 352,
 3,
 588,
 83,
 589,
 9,
 22,
 7,
 70,
 145,
 199,
 590,
 5,
 12,
 11,
 353,
 20,
 86,
 8,
 1,
 133,
 34,
 591,
 354,
 47,
 2,
 76,
 95,
 3,
 592,
 593,
 64,
 12,
 68,
 31,
 594,
 8,
 87,
 595,
 47,
 50,
 247,
 1,
 596,
 164,
 3,
 597,
 1,
 108,
 134,
 1,
 355,
 50,
 31,
 598,
 5,
 599,
 48,
 356,
 6,
 600,
 601,
 146,
 51,
 31,
 602,
 3,
 603,
 59,
 35,
 604,
 357,
 1,
 200,
 358,
 146,
 201,
 31,
 605,
 121,
 12,
 606,
 3,
 607,
 83,
 7,
 202,
 608,
 27,
 47,
 25,
 165,
 11,
 609,
 610,
 2,
 68,
 611,
 9,
 25,
 359,
 3,
 360,
 96,
 612,
 23,
 50,
 613,
 3,
 614,
 1,
 349,
 3,
 87,
 361,
 615,
 134,
 203,
 6,
 248,
 23,
 3,
 1,
 616,
 617,
 38,
 33,
 618,
 24,
 1,
 86,
 9,
 619,
 620,
 249,
 3,
 6,
 621,
 88,
 351,
 21,
 11,
 622,
 83,
 623,
 250,
 9,
 251,
 624,
 625,
 250,
 47,
 362,
 3,
 626,
 48,
 627,
 8,
 109,
 146,
 31,
 1,
 252,
 7,
 22,
 7,
 25,
 253,
 628,
 89

In [91]:
# Determine the vocabulary size.
# Index 0 is reserved for padding, so one extra slot is added on top of the
# number of distinct words the tokenizer has indexed.
num_distinct_words = len(tokenizer.word_index)
vocab_size = num_distinct_words + 1
print(f'Vocabulary Size: {vocab_size}')

Vocabulary Size: 1431


### Next, we need to create word sequences to fit the model, with one word as the input and the following word as the output

In [92]:
# Create word -> word sequences: every pair of adjacent encoded words becomes
# one [input_word, output_word] training example.
sequences = [
    encoded_data[start:start + 2]
    for start in range(len(encoded_data) - 1)
]

# Length of the longest sequence.
max_sequence_len = max(len(pair) for pair in sequences)

print(f'Total Sequences: {len(sequences)}')

Total Sequences: 6525


In [93]:
sequences #Running this piece lists the [input, output] word pairs used to train the network (6525 pairs for this corpus, per the count printed by the previous cell)

[[12, 11],
 [11, 9],
 [9, 1],
 [1, 348],
 [348, 586],
 [586, 8],
 [8, 6],
 [6, 587],
 [587, 7],
 [7, 349],
 [349, 74],
 [74, 47],
 [47, 2],
 [2, 246],
 [246, 19],
 [19, 198],
 [198, 350],
 [350, 29],
 [29, 351],
 [351, 21],
 [21, 5],
 [5, 352],
 [352, 3],
 [3, 588],
 [588, 83],
 [83, 589],
 [589, 9],
 [9, 22],
 [22, 7],
 [7, 70],
 [70, 145],
 [145, 199],
 [199, 590],
 [590, 5],
 [5, 12],
 [12, 11],
 [11, 353],
 [353, 20],
 [20, 86],
 [86, 8],
 [8, 1],
 [1, 133],
 [133, 34],
 [34, 591],
 [591, 354],
 [354, 47],
 [47, 2],
 [2, 76],
 [76, 95],
 [95, 3],
 [3, 592],
 [592, 593],
 [593, 64],
 [64, 12],
 [12, 68],
 [68, 31],
 [31, 594],
 [594, 8],
 [8, 87],
 [87, 595],
 [595, 47],
 [47, 50],
 [50, 247],
 [247, 1],
 [1, 596],
 [596, 164],
 [164, 3],
 [3, 597],
 [597, 1],
 [1, 108],
 [108, 134],
 [134, 1],
 [1, 355],
 [355, 50],
 [50, 31],
 [31, 598],
 [598, 5],
 [5, 599],
 [599, 48],
 [48, 356],
 [356, 6],
 [6, 600],
 [600, 601],
 [601, 146],
 [146, 51],
 [51, 31],
 [31, 602],
 [602, 3],
 [3, 

### We can then split the sequences into input (X) and output (y) elements

In [94]:
# Convert the list of pairs into a 2-column array, then take the first column
# as the input words and the second column as the words to predict.
sequences = array(sequences)
X = sequences[:, 0]
y = sequences[:, 1]

In [95]:
# Preview the first five input word indices.
X[:5]

array([ 12,  11,   9,   1, 348])

In [96]:
# Preview the first five target word indices (each is the word following the matching X entry).
y[:5]

array([ 11,   9,   1, 348, 586])

In [97]:
#one hot encode outputs, using vocab_size as the number of classes
# NOTE: this overwrites y in place, so re-running this cell would feed the
# already-encoded array back into to_categorical.
y = to_categorical(y, num_classes=vocab_size)

#preview the first five encoded targets
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [98]:
# Next-word classifier: a single word index goes in, and the final softmax
# layer emits one probability per vocabulary entry.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length = 1)) # Embedding Layer: a 10-dimensional vector per word index
model.add(Bidirectional(LSTM(150, return_sequences=True)))  # Bidirectional LSTM Layer; keeps the sequence axis for the next LSTM
model.add(Dropout(0.2)) # Dropout Layer
model.add(LSTM(100)) # LSTM Layer
# Dense expects an integer unit count. In Python 3 `vocab_size/2` is a float
# (e.g. 715.5), so floor division is used to pass a real int.
model.add(Dense(vocab_size // 2, activation='relu'))  # Hidden Dense Layer (no regularizers are configured)
model.add(Dense(vocab_size, activation='softmax'))  # Output Dense Layer: one unit per vocabulary entry
# summary() prints the table itself and returns None, so wrapping it in
# print() would emit an extra "None" line.
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 1, 10)             14310     
_________________________________________________________________
bidirectional (Bidirectional (None, 1, 300)            193200    
_________________________________________________________________
dropout_2 (Dropout)          (None, 1, 300)            0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_5 (Dense)              (None, 715)               72215     
_________________________________________________________________
dense_6 (Dense)              (None, 1431)              1024596   
Total params: 1,464,721
Trainable params: 1,464,721
Non-trainable params: 0
____________________________________________

In [99]:
# Compile the network: categorical cross-entropy pairs with the one-hot
# targets, Adam is the optimizer, and accuracy is reported during training.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [100]:
#fit network
# Train for 250 epochs on the (X, y) word pairs. fit() returns a History
# object, which the notebook echoes because the call is the cell's last expression.
model.fit(X, y, epochs = 250)

Epoch 1/250
Epoch 2/250
Epoch 3/250
Epoch 4/250
Epoch 5/250
Epoch 6/250
Epoch 7/250
Epoch 8/250
Epoch 9/250
Epoch 10/250
Epoch 11/250
Epoch 12/250
Epoch 13/250
Epoch 14/250
Epoch 15/250
Epoch 16/250
Epoch 17/250
Epoch 18/250
Epoch 19/250
Epoch 20/250
Epoch 21/250
Epoch 22/250
Epoch 23/250
Epoch 24/250
Epoch 25/250
Epoch 26/250
Epoch 27/250
Epoch 28/250
Epoch 29/250
Epoch 30/250
Epoch 31/250
Epoch 32/250
Epoch 33/250
Epoch 34/250
Epoch 35/250
Epoch 36/250
Epoch 37/250
Epoch 38/250
Epoch 39/250
Epoch 40/250
Epoch 41/250
Epoch 42/250
Epoch 43/250
Epoch 44/250
Epoch 45/250
Epoch 46/250
Epoch 47/250
Epoch 48/250
Epoch 49/250
Epoch 50/250
Epoch 51/250
Epoch 52/250
Epoch 53/250
Epoch 54/250
Epoch 55/250
Epoch 56/250
Epoch 57/250
Epoch 58/250
Epoch 59/250
Epoch 60/250
Epoch 61/250
Epoch 62/250
Epoch 63/250
Epoch 64/250
Epoch 65/250
Epoch 66/250
Epoch 67/250
Epoch 68/250
Epoch 69/250
Epoch 70/250
Epoch 71/250
Epoch 72/250
Epoch 73/250
Epoch 74/250
Epoch 75/250
Epoch 76/250
Epoch 77/250
Epoch 78

Epoch 80/250
Epoch 81/250
Epoch 82/250
Epoch 83/250
Epoch 84/250
Epoch 85/250
Epoch 86/250
Epoch 87/250
Epoch 88/250
Epoch 89/250
Epoch 90/250
Epoch 91/250
Epoch 92/250
Epoch 93/250
Epoch 94/250
Epoch 95/250
Epoch 96/250
Epoch 97/250
Epoch 98/250
Epoch 99/250
Epoch 100/250
Epoch 101/250
Epoch 102/250
Epoch 103/250
Epoch 104/250
Epoch 105/250
Epoch 106/250
Epoch 107/250
Epoch 108/250
Epoch 109/250
Epoch 110/250
Epoch 111/250
Epoch 112/250
Epoch 113/250
Epoch 114/250
Epoch 115/250
Epoch 116/250
Epoch 117/250
Epoch 118/250
Epoch 119/250
Epoch 120/250
Epoch 121/250
Epoch 122/250
Epoch 123/250
Epoch 124/250
Epoch 125/250
Epoch 126/250
Epoch 127/250
Epoch 128/250
Epoch 129/250
Epoch 130/250
Epoch 131/250
Epoch 132/250
Epoch 133/250
Epoch 134/250
Epoch 135/250
Epoch 136/250
Epoch 137/250
Epoch 138/250
Epoch 139/250
Epoch 140/250
Epoch 141/250
Epoch 142/250
Epoch 143/250
Epoch 144/250
Epoch 145/250
Epoch 146/250
Epoch 147/250
Epoch 148/250
Epoch 149/250
Epoch 150/250
Epoch 151/250
Epoch 152/25

Epoch 158/250
Epoch 159/250
Epoch 160/250
Epoch 161/250
Epoch 162/250
Epoch 163/250
Epoch 164/250
Epoch 165/250
Epoch 166/250
Epoch 167/250
Epoch 168/250
Epoch 169/250
Epoch 170/250
Epoch 171/250
Epoch 172/250
Epoch 173/250
Epoch 174/250
Epoch 175/250
Epoch 176/250
Epoch 177/250
Epoch 178/250
Epoch 179/250
Epoch 180/250
Epoch 181/250
Epoch 182/250
Epoch 183/250
Epoch 184/250
Epoch 185/250
Epoch 186/250
Epoch 187/250
Epoch 188/250
Epoch 189/250
Epoch 190/250
Epoch 191/250
Epoch 192/250
Epoch 193/250
Epoch 194/250
Epoch 195/250
Epoch 196/250
Epoch 197/250
Epoch 198/250
Epoch 199/250
Epoch 200/250
Epoch 201/250
Epoch 202/250
Epoch 203/250
Epoch 204/250
Epoch 205/250
Epoch 206/250
Epoch 207/250
Epoch 208/250
Epoch 209/250
Epoch 210/250
Epoch 211/250
Epoch 212/250
Epoch 213/250
Epoch 214/250
Epoch 215/250
Epoch 216/250
Epoch 217/250
Epoch 218/250
Epoch 219/250
Epoch 220/250
Epoch 221/250
Epoch 222/250
Epoch 223/250
Epoch 224/250
Epoch 225/250
Epoch 226/250
Epoch 227/250
Epoch 228/250
Epoch 

Epoch 234/250
Epoch 235/250
Epoch 236/250
Epoch 237/250
Epoch 238/250
Epoch 239/250
Epoch 240/250
Epoch 241/250
Epoch 242/250
Epoch 243/250
Epoch 244/250
Epoch 245/250
Epoch 246/250
Epoch 247/250
Epoch 248/250
Epoch 249/250
Epoch 250/250


<tensorflow.python.keras.callbacks.History at 0x1549b6ec748>

In [101]:
# generate a sequence from the model
def generate_seq(model, tokenizer, enter_text, n_pred):
    """Extend ``enter_text`` by repeatedly predicting the next word.

    The model is a word -> word predictor, so each step feeds it only the most
    recent word: the last word of ``enter_text`` on the first step, and the
    previously predicted word afterwards.

    Args:
        model: Object with ``predict(x, verbose=...)`` returning an array of
            shape (batch, vocabulary) with one score per word index.
        tokenizer: Object exposing ``texts_to_sequences`` and ``word_index``
            (word -> integer index).
        enter_text: Seed text; may contain more than one word.
        n_pred: Maximum number of words to append.

    Returns:
        The seed text followed by the predicted words, separated by single
        spaces. Fewer than ``n_pred`` words are appended if the current word
        is not in the tokenizer's vocabulary or the predicted index has no
        word (e.g. the padding index 0).
    """
    # Build the index -> word lookup once instead of scanning word_index on
    # every generation step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    in_text, result = enter_text, enter_text
    # generate a fixed number of words
    for _ in range(n_pred):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        if not encoded:
            # Nothing in the current text is in the vocabulary, so there is
            # no valid input to condition the model on.
            break

        # Only the last word is used, shaped as one sample with one timestep.
        model_input = array([[encoded[-1]]])

        # predict_classes() was removed from Keras; take the arg-max of the
        # predicted distribution instead.
        probabilities = model.predict(model_input, verbose=0)
        yhat = int(probabilities.argmax(axis=-1)[0])

        # map predicted word index to word
        out_word = index_to_word.get(yhat)
        if out_word is None:
            break

        # the predicted word becomes the next input and is appended to the result
        in_text = out_word
        result = result + ' ' + out_word
    return result

In [102]:
# evaluate: generate 10 words following the seed word 'there'
print(generate_seq(model, tokenizer, 'there',10))

there is a man who was a man who was a


In [103]:
# evaluate: generate 6 words following the seed word 'english'
print(generate_seq(model, tokenizer, 'english', 6))

english charters researches here and i have


In [104]:
# evaluate: generate 6 words following the seed word 'then'
print(generate_seq(model, tokenizer, 'then', 6))

then it is a man who was


In [105]:
# evaluate: generate 6 words following the seed word 'what'
print(generate_seq(model, tokenizer, 'what', 6))

what could not a man who was


In [106]:
# evaluate: generate 6 words following the seed word 'if'
print(generate_seq(model, tokenizer, 'if', 6))

if you can you can you can


In [107]:
# evaluate: generate 6 words following the seed word 'busy'
print(generate_seq(model, tokenizer, 'busy', 6))

busy just as we are aware that


In [108]:
# evaluate: generate 6 words following the seed word 'right'
print(generate_seq(model, tokenizer, 'right', 6))

right path said holmes that he was
