In [43]:
#Import important libraries
import nltk
#nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,GRU
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
#load the dataset
data=gutenberg.raw('shakespeare-hamlet.txt')

In [3]:
#Saving the file
with open ('hamlet.txt','w') as file:
    file.write(data)

In [4]:
data



In [5]:
#Preprocess the dataset
with open('hamlet.txt','r') as file:
    text=file.read().lower()
    print(text)

[the tragedie of hamlet by william shakespeare 1599]


actus primus. scoena prima.

enter barnardo and francisco two centinels.

  barnardo. who's there?
  fran. nay answer me: stand & vnfold
your selfe

   bar. long liue the king

   fran. barnardo?
  bar. he

   fran. you come most carefully vpon your houre

   bar. 'tis now strook twelue, get thee to bed francisco

   fran. for this releefe much thankes: 'tis bitter cold,
and i am sicke at heart

   barn. haue you had quiet guard?
  fran. not a mouse stirring

   barn. well, goodnight. if you do meet horatio and
marcellus, the riuals of my watch, bid them make hast.
enter horatio and marcellus.

  fran. i thinke i heare them. stand: who's there?
  hor. friends to this ground

   mar. and leige-men to the dane

   fran. giue you good night

   mar. o farwel honest soldier, who hath relieu'd you?
  fra. barnardo ha's my place: giue you goodnight.

exit fran.

  mar. holla barnardo

   bar. say, what is horatio there?
  hor. a peece of

In [6]:
#Tozenize the Text -creating index for words
tozenizer=Tokenizer()
tozenizer.fit_on_texts([text])
total_words=len(tozenizer.word_index)+1
total_words

4818

In [7]:
tozenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'i': 5,
 'you': 6,
 'a': 7,
 'my': 8,
 'it': 9,
 'in': 10,
 'that': 11,
 'ham': 12,
 'is': 13,
 'not': 14,
 'his': 15,
 'this': 16,
 'with': 17,
 'your': 18,
 'but': 19,
 'for': 20,
 'me': 21,
 'lord': 22,
 'as': 23,
 'what': 24,
 'he': 25,
 'be': 26,
 'so': 27,
 'him': 28,
 'haue': 29,
 'king': 30,
 'will': 31,
 'no': 32,
 'our': 33,
 'we': 34,
 'on': 35,
 'are': 36,
 'if': 37,
 'all': 38,
 'then': 39,
 'shall': 40,
 'by': 41,
 'thou': 42,
 'come': 43,
 'or': 44,
 'hamlet': 45,
 'good': 46,
 'do': 47,
 'hor': 48,
 'her': 49,
 'let': 50,
 'now': 51,
 'thy': 52,
 'how': 53,
 'more': 54,
 'they': 55,
 'from': 56,
 'enter': 57,
 'at': 58,
 'was': 59,
 'oh': 60,
 'like': 61,
 'most': 62,
 'there': 63,
 'well': 64,
 'know': 65,
 'selfe': 66,
 'would': 67,
 'them': 68,
 'loue': 69,
 'may': 70,
 "'tis": 71,
 'vs': 72,
 'sir': 73,
 'qu': 74,
 'which': 75,
 'did': 76,
 'why': 77,
 'laer': 78,
 'giue': 79,
 'thee': 80,
 'ile': 81,
 'must': 82,
 'hath': 

In [8]:
#Creating input sequence
input_sequence=[]
for line in text.split('\n'):
    token_list=tozenizer.texts_to_sequences([line])[0]
    for i in range(1,len(token_list)):
        n_gram_sequence=token_list[:i+1]
        input_sequence.append(n_gram_sequence)

In [9]:
input_sequence

[[1, 687],
 [1, 687, 4],
 [1, 687, 4, 45],
 [1, 687, 4, 45, 41],
 [1, 687, 4, 45, 41, 1886],
 [1, 687, 4, 45, 41, 1886, 1887],
 [1, 687, 4, 45, 41, 1886, 1887, 1888],
 [1180, 1889],
 [1180, 1889, 1890],
 [1180, 1889, 1890, 1891],
 [57, 407],
 [57, 407, 2],
 [57, 407, 2, 1181],
 [57, 407, 2, 1181, 177],
 [57, 407, 2, 1181, 177, 1892],
 [407, 1182],
 [407, 1182, 63],
 [408, 162],
 [408, 162, 377],
 [408, 162, 377, 21],
 [408, 162, 377, 21, 247],
 [408, 162, 377, 21, 247, 882],
 [18, 66],
 [451, 224],
 [451, 224, 248],
 [451, 224, 248, 1],
 [451, 224, 248, 1, 30],
 [408, 407],
 [451, 25],
 [408, 6],
 [408, 6, 43],
 [408, 6, 43, 62],
 [408, 6, 43, 62, 1893],
 [408, 6, 43, 62, 1893, 96],
 [408, 6, 43, 62, 1893, 96, 18],
 [408, 6, 43, 62, 1893, 96, 18, 566],
 [451, 71],
 [451, 71, 51],
 [451, 71, 51, 1894],
 [451, 71, 51, 1894, 567],
 [451, 71, 51, 1894, 567, 378],
 [451, 71, 51, 1894, 567, 378, 80],
 [451, 71, 51, 1894, 567, 378, 80, 3],
 [451, 71, 51, 1894, 567, 378, 80, 3, 273],
 [451, 71

In [10]:
#Max length of the sentence in the data
max_sequence_len=max([len(x) for x in input_sequence])
max_sequence_len

14

In [11]:
#Applying pad sequence to create same length of the sentences
input_sequence=np.array(pad_sequences(input_sequence,maxlen=max_sequence_len,padding='pre'))
input_sequence

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [12]:
#create predictors and labels
x,y=input_sequence[:,:-1],input_sequence[:,-1]
y

array([ 687,    4,   45, ..., 1047,    4,  193])

In [13]:
#convert the output features to categories so wherever the index word will be present that will be 1 rest will be 0
y=tf.keras.utils.to_categorical(y,num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [28]:
#Spliting the data to train and test
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)

In [29]:
print(x_train.shape),print(x_test.shape),print(y_train.shape),print(y_test.shape)

(20585, 13)
(5147, 13)
(20585, 4818)
(5147, 4818)


(None, None, None, None)

In [19]:
max_sequence_len

14

In [42]:
#Train the LSTM model to the dataset
model=Sequential()
model.add(Embedding(total_words,100,input_length=max_sequence_len-1))
model.add(LSTM(150,return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words,activation='softmax'))

#Compile the model
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])



In [32]:
#Train the Model
history=model.fit(x_train,y_train,epochs=50,validation_data=(x_test,y_test),verbose=1)

Epoch 1/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 34ms/step - accuracy: 0.0328 - loss: 7.1318 - val_accuracy: 0.0332 - val_loss: 6.7518
Epoch 2/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 32ms/step - accuracy: 0.0388 - loss: 6.4443 - val_accuracy: 0.0437 - val_loss: 6.8320
Epoch 3/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 33ms/step - accuracy: 0.0417 - loss: 6.3142 - val_accuracy: 0.0490 - val_loss: 6.8741
Epoch 4/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 31ms/step - accuracy: 0.0546 - loss: 6.1542 - val_accuracy: 0.0517 - val_loss: 6.9056
Epoch 5/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 31ms/step - accuracy: 0.0567 - loss: 6.0372 - val_accuracy: 0.0544 - val_loss: 6.9455
Epoch 6/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 32ms/step - accuracy: 0.0606 - loss: 5.8981 - val_accuracy: 0.0591 - val_loss: 6.9829
Epoch 7/50
[1m6

In [44]:
#Train the GRU model to the dataset
model_GRU=Sequential()
model_GRU.add(Embedding(total_words,100,input_length=max_sequence_len-1))
model_GRU.add(GRU(150,return_sequences=True))
model_GRU.add(Dropout(0.2))
model_GRU.add(GRU(100))
model_GRU.add(Dense(total_words,activation='softmax'))

#Compile the model
model_GRU.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [45]:
#Train the GRU Model
history_GRU=model_GRU.fit(x_train,y_train,epochs=50,validation_data=(x_test,y_test),verbose=1)

Epoch 1/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 34ms/step - accuracy: 0.0325 - loss: 7.1952 - val_accuracy: 0.0375 - val_loss: 6.8263
Epoch 2/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 33ms/step - accuracy: 0.0387 - loss: 6.4655 - val_accuracy: 0.0495 - val_loss: 6.8108
Epoch 3/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 34ms/step - accuracy: 0.0545 - loss: 6.1609 - val_accuracy: 0.0651 - val_loss: 6.7735
Epoch 4/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 33ms/step - accuracy: 0.0809 - loss: 5.8012 - val_accuracy: 0.0690 - val_loss: 6.8621
Epoch 5/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 33ms/step - accuracy: 0.0916 - loss: 5.4840 - val_accuracy: 0.0711 - val_loss: 6.9084
Epoch 6/50
[1m644/644[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 33ms/step - accuracy: 0.1034 - loss: 5.2025 - val_accuracy: 0.0732 - val_loss: 6.9993
Epoch 7/50
[1m6

In [34]:
#Function to predict the next word
def predict_next_word(model,tokenizer,text,max_sequence_len):
    token_list=tokenizer.texts_to_sequences([text])[0]
    if len(token_list)>=max_sequence_len:
        token_list=token_list[-(max_sequence_len-1):]
    token_list=pad_sequences([token_list],maxlen=max_sequence_len-1,padding='pre')
    predicted=model.predict(token_list,verbose=0)
    predicted_word_next=np.argmax(predicted,axis=1)
    for word,index in tokenizer.word_index.items():
        if index==predicted_word_next:
            return word
    return None

In [46]:
input_text='To be or not to be'
print(f'input text:{input_text}')
max_sequence_len=model_GRU.input_shape[1]+1
next_word=predict_next_word(model_GRU,tozenizer,input_text,max_sequence_len)
print(f"Next Word Prediction:{next_word}")

input text:To be or not to be
Next Word Prediction:time


In [48]:
#Saving the created model
model.save('next_word_lstm.h5')



In [49]:
model.save('next_word_GRU.h5')



In [50]:
#Saving the tokenizer
import pickle
with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tozenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)