In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [1]:
import nltk
nltk.download('gutenberg')
from nltk.corpus import gutenberg
import pandas as pd

[nltk_data] Downloading package gutenberg to /usr/share/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
import numpy as np 
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [3]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [4]:
#Loading the dataset
# Raw text of Jane Austen's "Persuasion" from the NLTK Gutenberg corpus.
data = gutenberg.raw('austen-persuasion.txt')
#Save to a File
# NOTE(review): the file name says Pride & Prejudice but the content is Persuasion.
# The name is kept as-is because the next cell reads this exact path.
# An explicit encoding keeps the write independent of the platform's locale default.
with open('Pride&Prejudice.txt','w', encoding='utf-8') as file:
    file.write(data)

In [5]:
#Loading the text file
# Read the saved corpus back and lower-case it so the vocabulary is case-insensitive.
# An explicit encoding keeps the read independent of the platform's locale default.
with open('Pride&Prejudice.txt','r', encoding='utf-8') as file:
    text = file.read().lower()

In [6]:
#Tokenizing the text & creating the indexes for words
# Fit a word-level tokenizer on the whole corpus (passed as a single document).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# +1 because word indices start at 1; index 0 is the value used for padding later on.
total_words = len(tokenizer.word_index)+1
# Display the vocabulary size (including the padding index).
total_words

5875

In [7]:
# Inspect the learned word -> index mapping. The full dict is large, so expect long output.
tokenizer.word_index

{'the': 1,
 'to': 2,
 'and': 3,
 'of': 4,
 'a': 5,
 'in': 6,
 'was': 7,
 'her': 8,
 'had': 9,
 'she': 10,
 'i': 11,
 'it': 12,
 'he': 13,
 'be': 14,
 'not': 15,
 'that': 16,
 'as': 17,
 'for': 18,
 'but': 19,
 'his': 20,
 'with': 21,
 'you': 22,
 'have': 23,
 'at': 24,
 'all': 25,
 'been': 26,
 'him': 27,
 'could': 28,
 'anne': 29,
 'very': 30,
 'they': 31,
 'were': 32,
 'by': 33,
 'which': 34,
 'is': 35,
 'on': 36,
 'so': 37,
 'no': 38,
 'would': 39,
 'captain': 40,
 'from': 41,
 'their': 42,
 'mrs': 43,
 'there': 44,
 'or': 45,
 'more': 46,
 'them': 47,
 'mr': 48,
 'elliot': 49,
 'this': 50,
 'an': 51,
 'than': 52,
 'one': 53,
 'must': 54,
 'when': 55,
 'my': 56,
 'being': 57,
 'only': 58,
 'lady': 59,
 'such': 60,
 'do': 61,
 'much': 62,
 'if': 63,
 'any': 64,
 'what': 65,
 'who': 66,
 'wentworth': 67,
 'should': 68,
 'me': 69,
 'good': 70,
 'little': 71,
 'said': 72,
 'will': 73,
 'might': 74,
 'own': 75,
 'well': 76,
 'did': 77,
 'herself': 78,
 'now': 79,
 'never': 80,
 'charles'

In [8]:
#Creating input sequence
# Encode every line of the corpus in a single batched tokenizer call instead of
# one call per line; the result per line is the same list of word indices.
encoded_lines = tokenizer.texts_to_sequences(text.split('\n'))

# Expand each encoded line into all of its prefixes of length >= 2.
# The last token of each prefix later becomes the label and the tokens before it
# the context, so a line with fewer than two known tokens yields no sequences.
input_sequences = [
    token_list[:end]
    for token_list in encoded_lines
    for end in range(2, len(token_list) + 1)
]

In [9]:
# Inspect the generated n-gram prefixes. This is the full Python list, so the output is long.
input_sequences

[[1103, 33],
 [1103, 33, 3293],
 [1103, 33, 3293, 3294],
 [1103, 33, 3293, 3294, 3295],
 [422, 1953],
 [84, 99],
 [84, 99, 49],
 [84, 99, 49, 4],
 [84, 99, 49, 4, 161],
 [84, 99, 49, 4, 161, 361],
 [84, 99, 49, 4, 161, 361, 6],
 [84, 99, 49, 4, 161, 361, 6, 1646],
 [84, 99, 49, 4, 161, 361, 6, 1646, 7],
 [84, 99, 49, 4, 161, 361, 6, 1646, 7, 5],
 [84, 99, 49, 4, 161, 361, 6, 1646, 7, 5, 93],
 [84, 99, 49, 4, 161, 361, 6, 1646, 7, 5, 93, 66],
 [18, 20],
 [18, 20, 75],
 [18, 20, 75, 833],
 [18, 20, 75, 833, 80],
 [18, 20, 75, 833, 80, 516],
 [18, 20, 75, 833, 80, 516, 130],
 [18, 20, 75, 833, 80, 516, 130, 64],
 [18, 20, 75, 833, 80, 516, 130, 64, 1009],
 [18, 20, 75, 833, 80, 516, 130, 64, 1009, 19],
 [18, 20, 75, 833, 80, 516, 130, 64, 1009, 19, 1],
 [18, 20, 75, 833, 80, 516, 130, 64, 1009, 19, 1, 2422],
 [44, 13],
 [44, 13, 143],
 [44, 13, 143, 1418],
 [44, 13, 143, 1418, 18],
 [44, 13, 143, 1418, 18, 51],
 [44, 13, 143, 1418, 18, 51, 1954],
 [44, 13, 143, 1418, 18, 51, 1954, 337],
 

In [10]:
#Padding sequences
# Length of the longest n-gram prefix; used as the padding target below.
max_sequence_len = max(map(len, input_sequences))
max_sequence_len

18

In [11]:
# Left-pad every prefix with zeros so all rows share the same length (max_sequence_len).
input_sequences = np.asarray(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_len,
        padding='pre',
    )
)
input_sequences

array([[   0,    0,    0, ...,    0, 1103,   33],
       [   0,    0,    0, ..., 1103,   33, 3293],
       [   0,    0,    0, ...,   33, 3293, 3294],
       ...,
       [   0,    0,    0, ...,   52,    6,  164],
       [   0,    0,    0, ...,    6,  164, 5873],
       [   0,    0,    0, ...,  164, 5873,  776]], dtype=int32)

In [12]:
#Creation of predictors and labels
# Every column except the last is the context fed to the model;
# the last column is the word index the model has to predict.
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [13]:
# Preview the predictor matrix (padded context tokens, one row per sample).
X

array([[   0,    0,    0, ...,    0,    0, 1103],
       [   0,    0,    0, ...,    0, 1103,   33],
       [   0,    0,    0, ..., 1103,   33, 3293],
       ...,
       [   0,    0,    0, ...,    0,   52,    6],
       [   0,    0,    0, ...,   52,    6,  164],
       [   0,    0,    0, ...,    6,  164, 5873]], dtype=int32)

In [14]:
# Preview the integer labels (one target word index per sample).
y

array([  33, 3293, 3294, ...,  164, 5873,  776], dtype=int32)

In [15]:
# One-hot encode the integer labels into vectors of length total_words.
# NOTE(review): this cell overwrites y with its own output, so it is not idempotent.
#   Re-running it without first re-running the cell that builds y would encode
#   already-encoded data.
# NOTE(review): the result is a dense (samples x total_words) matrix, which is large in
#   memory. Integer labels with a sparse categorical loss would avoid it, but that
#   also requires changing the compile cell. Confirm before changing.
y = tf.keras.utils.to_categorical(y, num_classes = total_words)
y

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# Confirm the one-hot label shape: (number of samples, total_words).
y.shape

(76330, 5875)

In [17]:
#Splitting the data into train and validation split
# random_state pins the shuffle so the split (and every metric computed from it)
# is identical after Restart & Run All.
# NOTE(review): rows are overlapping prefixes of the same text lines, so a random
#   row-level split places near-duplicate contexts on both sides. Validation
#   metrics are therefore likely optimistic; splitting by line before building
#   prefixes would avoid this.
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size = 0.20, random_state = 42)
Xtrain.shape,Xtest.shape

((61064, 17), (15266, 17))

In [32]:
#Defining early Stopping
# Watch the validation loss; the direction of "better" is inferred from the metric
# ('auto'). A change only counts as an improvement if it exceeds 0.01. After 20
# epochs without such an improvement, training stops with a message and the model
# is rolled back to the weights of the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    mode='auto',
    min_delta=0.01,
    patience=20,
    restore_best_weights=True,
    verbose=1,
)

In [33]:
# Re-check the vocabulary size and the padded sequence length used to build the model.
total_words, max_sequence_len

(5875, 18)

In [38]:
# Defining the LSTM model (training happens in a later cell)
# Each sample holds max_sequence_len-1 context tokens; the final token of every
# padded sequence was split off as the label.
model = Sequential()
# Declare the input shape with an explicit Input object rather than passing
# input_shape to the first layer, which newer Keras versions warn against.
model.add(tf.keras.Input(shape=(max_sequence_len-1,)))
# Map each word index (vocabulary of total_words) to a 150-dimensional vector.
model.add(Embedding(input_dim = total_words, output_dim = 150))
# First LSTM returns the full sequence so the second LSTM can consume it.
model.add(LSTM(200, return_sequences=True))
model.add(Dropout(0.3))
model.add(LSTM(150))
# One softmax probability per vocabulary entry.
model.add(Dense(total_words, activation='softmax'))


In [39]:
# Compile the model
# Categorical cross-entropy matches the one-hot labels and the softmax output layer.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [40]:
# Print the model's layer-by-layer summary.
model.summary()

In [41]:
#Training the Model
# The early_stopping callback defined above must be passed here to have any effect.
# Without it the model trains for all 150 epochs even after the validation loss
# starts rising.
history = model.fit(
    Xtrain, ytrain,
    epochs = 150,
    validation_data = (Xtest,ytest),
    callbacks = [early_stopping],
    verbose = 1,
)

Epoch 1/150
[1m1909/1909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 11ms/step - accuracy: 0.0395 - loss: 6.6865 - val_accuracy: 0.0702 - val_loss: 6.1682
Epoch 2/150
[1m1909/1909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.0751 - loss: 5.9623 - val_accuracy: 0.1034 - val_loss: 5.8748
Epoch 3/150
[1m1909/1909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.1132 - loss: 5.5169 - val_accuracy: 0.1188 - val_loss: 5.7825
Epoch 4/150
[1m1909/1909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.1317 - loss: 5.2581 - val_accuracy: 0.1266 - val_loss: 5.7726
Epoch 5/150
[1m1909/1909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.1389 - loss: 5.0758 - val_accuracy: 0.1287 - val_loss: 5.8076
Epoch 6/150
[1m1909/1909[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 10ms/step - accuracy: 0.1456 - loss: 4.9133 - val_accuracy: 0.1315 - val_loss: 5.883

In [42]:
# Persist the trained model to 'LSTM_Next_word.h5' and the fitted tokenizer to
# 'tokenizer.pickle' (presumably so inference can reuse the same word -> index mapping).
model.save('LSTM_Next_word.h5')
import pickle
# NOTE: only unpickle this file from a trusted source; pickle.load can execute arbitrary code.
with open('tokenizer.pickle','wb') as handle:
    pickle.dump(tokenizer,handle,protocol = pickle.HIGHEST_PROTOCOL)

In [43]:
#Function to predict the next word
def predict_next_word(model,tokenizer,text, max_sequence_len):
    """Return the single most likely word to follow ``text``.

    Args:
        model: Trained model exposing ``predict``; it is fed one batch row of
            ``max_sequence_len - 1`` token indices.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` reverse mapping.
        text: Seed text whose continuation is predicted.
        max_sequence_len: Sequence length used during training, including the
            label position; the model input is one token shorter.

    Returns:
        The predicted word, or ``None`` when the predicted index has no entry in
        the tokenizer vocabulary (e.g. index 0, which is the padding value).
    """
    token_list = tokenizer.texts_to_sequences([text])[0]
    # Keep only the most recent tokens that fit into the model's input window.
    if len(token_list)>=max_sequence_len:
        token_list = token_list[-(max_sequence_len-1):]
    token_list = pad_sequences([token_list],maxlen= max_sequence_len-1,padding = 'pre')
    predicted = model.predict(token_list,verbose = 1)
    # argmax over the vocabulary axis yields a 1-element array for the single
    # batch row; take the scalar so the lookup does not compare against an array.
    predicted_word_idx = int(np.argmax(predicted,axis = 1)[0])
    # Direct reverse lookup instead of scanning the whole word_index dict.
    return tokenizer.index_word.get(predicted_word_idx)

In [44]:
# Demo: predict the word that follows a short seed phrase.
input_text = "to be or not to be"
print(f'Input: {input_text}')
# Derive the sequence length from the model itself: its input width plus one for
# the label position. Note that this rebinds the notebook-level max_sequence_len.
max_sequence_len = model.input_shape[1]+1
next_word = predict_next_word(model,tokenizer,input_text,max_sequence_len)
print(f'Output:{next_word}')

Input: to be or not to be
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 170ms/step
Output:in
