In [2]:
import nltk
import pandas as pd

In [4]:
# Fetch the NLTK 'gutenberg' corpus package, then import the reader object
# that is used in the next cell to load the play.
nltk.download('gutenberg')
from nltk.corpus import gutenberg

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [5]:
# Load the raw text of Hamlet from the Gutenberg corpus.
data=gutenberg.raw('shakespeare-hamlet.txt')

In [6]:
# Persist the corpus text to a local file in the working directory.
# NOTE(review): no explicit encoding is passed, so the platform default is used.
with open('hamlet.txt', 'w') as f:
    f.write(data)

In [7]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split


In [8]:
# Read the saved play back in and lowercase it before tokenizing.
# NOTE(review): no explicit encoding is passed, so the platform default is used.
with open('hamlet.txt', 'r') as f:
    text =f.read().lower()

In [9]:
# Fit a word-level tokenizer on the whole play (passed as a single document).
tokenizer=Tokenizer()
tokenizer.fit_on_texts([text])
# +1 because the learned word indices start at 1; index 0 appears only as the
# padding value in the padded sequences built further down.
total_words=len(tokenizer.word_index)+1

In [10]:
# Vocabulary size, including the extra slot for index 0.
total_words

4818

In [13]:
# Word -> integer index mapping learned by the tokenizer.
# NOTE(review): the name `x` is rebound to the model's input matrix in a later cell.
x=tokenizer.word_index

In [15]:
# Show the first ten (word, index) pairs of the vocabulary mapping.
i = 0  # stays 0 if the mapping is empty
for i, (key, val) in enumerate(x.items(), start=1):
    print(f'{key}---{val}')
    if i == 10:
        break

the---1
and---2
to---3
of---4
i---5
you---6
a---7
my---8
it---9
in---10


In [17]:
# Build the training samples: for every line of the play, emit each leading
# n-gram of two or more tokens. The last token of each n-gram later becomes
# the prediction target.
inputsequences = []
for line in text.split('\n'):
    tk_list = tokenizer.texts_to_sequences([line])[0]
    inputsequences.extend(tk_list[:end] for end in range(2, len(tk_list) + 1))
    

In [20]:
# Number of n-gram training sequences generated.
len(inputsequences)

25732

In [21]:
# Length of the longest n-gram; the padding step uses it as the common width.
max_seq_len = max(map(len, inputsequences))
max_seq_len

14

In [24]:
# Left-pad every sequence to max_seq_len (padding='pre') and keep the result as a NumPy array.
# NOTE(review): this rebinds `inputsequences` from the ragged list of n-grams to the padded matrix.
inputsequences=np.array(pad_sequences(inputsequences,maxlen=max_seq_len,padding='pre'))

In [25]:
# Inspect the padded sequence matrix.
inputsequences

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [30]:
# x = every row, all columns except the last one (the context words)
# y = the last column of every row (the word to predict)
import tensorflow as tf
x,y=inputsequences[:,:-1], inputsequences[:,-1]


In [32]:
# One-hot encode the target word indices over the full vocabulary.
# NOTE(review): `y` is rebound in place, so re-running this cell feeds the
# already-encoded array back into to_categorical.
y=tf.keras.utils.to_categorical(y,num_classes=total_words)

In [36]:
# Inspect the one-hot encoded targets.
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [37]:
# Hold out 20% of the samples for validation. A fixed random_state makes the
# shuffle, and therefore the split, identical on every Restart & Run All.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [39]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout

In [44]:
# Next-word model, assembled layer by layer:
#   Embedding(input_dim, output_dim, input_length) -> 100-d vector per word index
#   LSTM(150) returning the full sequence so a second LSTM can be stacked on it
#   Dropout(0.2) between the two recurrent layers
#   LSTM(100) returning only its final state
#   Dense softmax with one output per vocabulary index
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_seq_len - 1))
model.add(LSTM(150, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(100))
model.add(Dense(total_words, activation='softmax'))

In [45]:

# Multi-class objective over the one-hot targets, optimised with Adam.
# `metrics` is passed as a list: that is the documented form, and newer Keras
# releases raise a ValueError when given a bare string.
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

In [46]:
# Print the layer-by-layer architecture and parameter counts.
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 13, 100)           481800    
                                                                 
 lstm_4 (LSTM)               (None, 13, 150)           150600    
                                                                 
 dropout_2 (Dropout)         (None, 13, 150)           0         
                                                                 
 lstm_5 (LSTM)               (None, 100)               100400    
                                                                 
 dense_2 (Dense)             (None, 4818)              486618    
                                                                 
Total params: 1,219,418
Trainable params: 1,219,418
Non-trainable params: 0
_________________________________________________________________


In [48]:
# Train for 100 epochs, evaluating on the held-out split after each epoch;
# the value returned by fit() is kept in `history`.
history=model.fit(xtrain,ytrain,epochs=100,validation_data=(xtest,ytest), verbose=1)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [49]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text: str, max_sequence_len: int) -> "str | None":
    """Return the word the model scores highest as the continuation of ``text``.

    Args:
        model: Trained model; its ``predict`` is called with a single row of
            ``max_sequence_len - 1`` word indices and must return one score
            per vocabulary index for that row.
        tokenizer: The fitted tokenizer used to build the training data. It
            must provide ``texts_to_sequences`` and the ``index_word`` mapping.
        text: Seed text to continue.
        max_sequence_len: Length of the training sequences *including* the
            target word, i.e. the model's input length plus one.

    Returns:
        The predicted word, or ``None`` when the winning index has no word
        attached to it (for example index 0, which is only used for padding).
    """
    context_len = max_sequence_len - 1
    token_list = tokenizer.texts_to_sequences([text])[0]
    if len(token_list) > context_len:
        # Keep only the most recent words; the model input cannot hold more.
        token_list = token_list[-context_len:]
    padded = pad_sequences([token_list], maxlen=context_len, padding='pre')
    predicted = model.predict(padded, verbose=0)
    # Exactly one input row: take that row's arg-max as a plain int so the
    # lookup below never compares an int against an array.
    predicted_word_index = int(np.argmax(predicted, axis=-1)[0])
    # index_word is the tokenizer's reverse mapping, which avoids scanning
    # every (word, index) pair of word_index.
    return tokenizer.index_word.get(predicted_word_index)

In [54]:
# Demo: ask the trained model for the word that follows a line from the play.
input_text = "This Spirit dumbe to vs, will speake to"
print(f"Input text:{input_text}")
# The model input holds the context only, so the full sequence length is one more.
max_sequence_len = model.input_shape[1] + 1
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_len=max_sequence_len,
)
print(f"Next Word Prediction:{next_word}")

Input text:This Spirit dumbe to vs, will speake to
Next Word Prediction:him


In [53]:
## Save the trained model to "model.h5" in the working directory
model.save("model.h5")

## Save the fitted tokenizer so the same word -> index mapping can be reloaded later
## NOTE(review): pickle files must only ever be loaded from trusted sources.
import pickle
with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer,handle,protocol=pickle.HIGHEST_PROTOCOL)