## Next Word Prediction

### Importing required libraries:

In [1]:
import nltk
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
# Make sure the Gutenberg corpus is available locally, then read the full text
# of Hamlet into `data`.
nltk.download('gutenberg')
from nltk.corpus import gutenberg
data = gutenberg.raw('shakespeare-hamlet.txt')




[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\sachi\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


### Loading Data

In [2]:
# Keep a local copy of the play so the following cells can work from a plain file.
with open('hamlet.txt','w') as corpus_out:
    corpus_out.write(data)

### Data Processing and Tokenization

In [3]:
# Read the saved play back in, lower-cased.
with open('hamlet.txt','r') as corpus_in:
    text = corpus_in.read().lower()

# Tokenize the text: fitting on the corpus builds the word -> integer index.
tockenize = Tokenizer()
tockenize.fit_on_texts([text])

# One more than the number of indexed words; index 0 is the value used for
# padding further down, so it needs its own slot in the output layer.
total_words = len(tockenize.word_index) + 1
total_words

4818

In [4]:
# Sanity check: encode one short line with the fitted tokenizer and show its word ids.
tockenize.texts_to_sequences(["Barnardo. Who's there?"])[0]

[407, 1182, 63]

#### Create the input sequences (n-gram prefixes)

In [5]:


# Encode every line of the play, then expand each encoded line into all of its
# prefixes that contain at least two tokens: a line with k tokens contributes
# k-1 sequences, and lines with fewer than two tokens contribute none.
encoded_lines = tockenize.texts_to_sequences(text.split('\n'))
input_sequences = [
    line_tokens[:prefix_end]
    for line_tokens in encoded_lines
    for prefix_end in range(2, len(line_tokens) + 1)
]



In [6]:
# Show the first ten n-gram prefixes to confirm they grow one token at a time.
input_sequences[:10]

[[1, 687],
 [1, 687, 4],
 [1, 687, 4, 45],
 [1, 687, 4, 45, 41],
 [1, 687, 4, 45, 41, 1886],
 [1, 687, 4, 45, 41, 1886, 1887],
 [1, 687, 4, 45, 41, 1886, 1887, 1888],
 [1180, 1889],
 [1180, 1889, 1890],
 [1180, 1889, 1890, 1891]]

#### Apply Padding

In [7]:
# Length of the longest n-gram; every sequence is padded up to this size.
max_sequence_length = max(len(sequence) for sequence in input_sequences)
max_sequence_length

14

In [8]:
# Preview of the left-padded matrix; the result is only displayed here, not stored.
pad_sequences(input_sequences,maxlen=max_sequence_length,padding='pre')

array([[   0,    0,    0, ...,    0,    1,  687],
       [   0,    0,    0, ...,    1,  687,    4],
       [   0,    0,    0, ...,  687,    4,   45],
       ...,
       [   0,    0,    0, ...,    4,   45, 1047],
       [   0,    0,    0, ...,   45, 1047,    4],
       [   0,    0,    0, ..., 1047,    4,  193]])

In [9]:
# Left-pad every n-gram with zeros to a common length, so the last column of
# the resulting matrix always holds the final token of the n-gram.
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_length,
        padding='pre',
    )
)

In [10]:
# Context tokens: every column except the last one.
x = input_sequences[:, :-1]
# Target token: the last column, i.e. the word that follows the context.
y = input_sequences[:, -1]

In [11]:
import tensorflow as tf

# One-hot encode the target word ids (one column per vocabulary entry).
# The labels are read from the last column of `input_sequences` instead of from
# `y` itself, so running this cell a second time yields the same matrix rather
# than one-hot encoding an already one-hot `y`.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

### Model Training

In [12]:
# Hold out 20% of the sequences for validation. The seed is fixed so the split
# (and therefore every metric reported below) is the same on each run.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=42)

In [20]:
# Two-layer GRU language model.
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,GRU

# Define the model as a single stack:
#   embedding -> GRU (full sequence) -> dropout -> GRU (last state) -> softmax over the vocabulary
model = Sequential([
    # The context is one token shorter than the padded n-gram (the last token is the target).
    Embedding(total_words, 100, input_length=max_sequence_length - 1),
    GRU(150, return_sequences=True),
    Dropout(0.2),
    GRU(100),
    Dense(total_words, activation='softmax'),
])

### Callbacks

In [24]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.callbacks import EarlyStopping
# NOTE(review): every callback below watches the *training* loss, although
# validation data is passed to fit(); consider monitoring "val_loss" instead.

# Stop when the loss has not improved for 5 epochs and roll back to the best weights.
early_stopping = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True)

# Save the model whenever the monitored loss reaches a new best value.
# The keyword is `mode` (how "better" is decided), not `model`.
check_point = ModelCheckpoint("gru1.h5", monitor="loss", save_best_only=True, mode="auto", verbose=1)

# Multiply the learning rate by 0.2 after 3 epochs without improvement, never going below 1e-4.
reduced = ReduceLROnPlateau(monitor="loss", factor=0.2, patience=3, min_lr=0.0001, verbose=1)

# Write TensorBoard event files to ./logs.
logdir = "logs"
tensor_board_visualizations = TensorBoard(log_dir=logdir)

### Compile the model

In [25]:
# Multi-class objective over the vocabulary; accuracy is tracked as an extra metric.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 13, 100)           481800    
                                                                 
 gru_2 (GRU)                 (None, 13, 150)           113400    
                                                                 
 dropout_1 (Dropout)         (None, 13, 150)           0         
                                                                 
 gru_3 (GRU)                 (None, 100)               75600     
                                                                 
 dense_1 (Dense)             (None, 4818)              486618    
                                                                 
Total params: 1157418 (4.42 MB)
Trainable params: 1157418 (4.42 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


### Plot the Model

In [26]:
# Render the layer graph to model.png.
# plot_model needs both the `pydot` package and a Graphviz installation; without
# them it only prints an installation hint instead of drawing the figure.
from tensorflow import keras
from tensorflow.keras.utils import plot_model
plot_model(model,to_file="model.png",show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


### Fit the Model

In [27]:
# Train for at most 200 epochs; the callbacks handle early stopping,
# checkpointing, TensorBoard logging and learning-rate reduction.
training_callbacks = [
    early_stopping,
    check_point,
    tensor_board_visualizations,
    reduced,
]
history = model.fit(
    xtrain,
    ytrain,
    epochs=200,
    validation_data=(xtest, ytest),
    verbose=1,
    callbacks=training_callbacks,
)

Epoch 1/200
Epoch 1: loss improved from inf to 0.90199, saving model to gru1.h5
Epoch 2/200
  9/644 [..............................] - ETA: 8s - loss: 0.8411 - accuracy: 0.7674

  saving_api.save_model(


Epoch 2: loss improved from 0.90199 to 0.86609, saving model to gru1.h5
Epoch 3/200
Epoch 3: loss improved from 0.86609 to 0.85250, saving model to gru1.h5
Epoch 4/200
Epoch 4: loss improved from 0.85250 to 0.84126, saving model to gru1.h5
Epoch 5/200
Epoch 5: loss did not improve from 0.84126
Epoch 6/200
Epoch 6: loss did not improve from 0.84126
Epoch 7/200
Epoch 7: loss did not improve from 0.84126

Epoch 7: ReduceLROnPlateau reducing learning rate to 0.00020000000949949026.
Epoch 8/200
Epoch 8: loss improved from 0.84126 to 0.75128, saving model to gru1.h5
Epoch 9/200
Epoch 9: loss improved from 0.75128 to 0.72122, saving model to gru1.h5
Epoch 10/200
Epoch 10: loss improved from 0.72122 to 0.70690, saving model to gru1.h5
Epoch 11/200
Epoch 11: loss improved from 0.70690 to 0.70016, saving model to gru1.h5
Epoch 12/200
Epoch 12: loss improved from 0.70016 to 0.69554, saving model to gru1.h5
Epoch 13/200
Epoch 13: loss improved from 0.69554 to 0.68996, saving model to gru1.h5
Epoch

### Graph

In [34]:
# Enable the TensorBoard notebook extension so the %tensorboard magic is available.
%load_ext tensorboard

In [35]:
# Open TensorBoard inline, pointed at the directory the TensorBoard callback logs to.
%tensorboard --logdir="./logs"


Reusing TensorBoard on port 6007 (pid 298620), started 0:16:20 ago. (Use '!kill 298620' to kill it.)

## Function for next word prediction

In [None]:
def predict_next_word(model,input_text,max_token_length,tockenize):
    """Predict the word most likely to follow ``input_text``.

    Args:
        model: Trained model exposing ``predict``; it is called with an integer
            array of shape ``(1, max_token_length - 1)``.
        input_text: Prompt to continue.
        max_token_length: Length of the padded training sequences, i.e. the
            context window plus the one target word.
        tockenize: Fitted tokenizer exposing ``texts_to_sequences`` and
            ``word_index``.

    Returns:
        The predicted word, or ``None`` when the prompt contains no token known
        to the tokenizer or the predicted index has no entry in ``word_index``
        (for example the padding index 0).
    """
    context_length = max_token_length - 1

    token_ids = tockenize.texts_to_sequences([input_text])[0]
    if not token_ids:
        # Nothing but padding would reach the model, which it never saw during
        # training; report "no prediction" instead of an arbitrary word.
        return None

    # Keep only the most recent tokens that fit in the context window.
    token_ids = token_ids[-context_length:]
    padded = pad_sequences([token_ids], maxlen=context_length, padding='pre')

    probabilities = model.predict(padded, verbose=0)
    # Reduce the (1,)-shaped argmax result to a plain int before comparing it
    # with the integer ids stored in word_index.
    predicted_index = int(np.argmax(probabilities, axis=1)[0])

    return next(
        (word for word, index in tockenize.word_index.items() if index == predicted_index),
        None,
    )


#### Sample test

In [None]:
# Named `seed_text` rather than `input` so the built-in input() is not shadowed.
seed_text = "To be or not to be"

predict_next_word(
    model=model,
    input_text=seed_text,
    max_token_length=max_sequence_length,
    tockenize=tockenize,
)

'honest'

### Save the model and tokenizer

In [None]:
import pickle

# Save the trained network in HDF5 format.
# NOTE(review): the file name says "lstm" although the layers above are GRUs.
model.save("next_word_lstm.h5")

# Save the fitted tokenizer next to it, so the same word index is available at
# prediction time.
with open('tokenize.pickel','wb') as tokenizer_out:
    pickle.dump(tockenize, tokenizer_out)


  saving_api.save_model(


## Test a prediction

In [None]:
# Reload the saved network from disk to check that the exported file works on its own.
m1 = tf.keras.models.load_model("next_word_lstm.h5")

In [None]:
# Restore the tokenizer that was pickled above.
# Unpickling executes code stored in the file, so only load pickles you created yourself.
with open('tokenize.pickel','rb') as tokenizer_in:
    tok = pickle.load(tokenizer_in)

In [None]:
# Prompt for the reloaded model; it is read by the prediction call in the next cell.
# NOTE(review): the name `input` shadows the built-in input(); rename it together
# with the cell that reads it.
input = "Fran. You come most carefully vpon your houre Bar. 'Tis now strook twelue, get thee to"

In [None]:
# The model's input width is the context length (padded sequence length minus one),
# so adding 1 recovers the max_token_length that predict_next_word expects.
predict_next_word(model = m1,input_text=input,max_token_length=m1.input_shape[1]+1 ,tockenize=tok)

'night'