In [17]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding ,LSTM,Dense


**Printing Text**

In [18]:
with open ("/kaggle/input/next-word-prediction/1661-0.txt","r",encoding="utf-8") as file:
    text=file.read()
words=text.split()
print(words[-10:])

['subscribe', 'to', 'our', 'email', 'newsletter', 'to', 'hear', 'about', 'new', 'eBooks.']


**Tokenizing the text**

In [20]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts([text])
total_words=len(tokenizer.word_index)+1


In [21]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'i': 6,
 '”': 7,
 'in': 8,
 'that': 9,
 'it': 10,
 'he': 11,
 'was': 12,
 'you': 13,
 'his': 14,
 'is': 15,
 'my': 16,
 'have': 17,
 'with': 18,
 'as': 19,
 'had': 20,
 'at': 21,
 'which': 22,
 'for': 23,
 'be': 24,
 'not': 25,
 'me': 26,
 'but': 27,
 'from': 28,
 'we': 29,
 'this': 30,
 'said': 31,
 'upon': 32,
 'there': 33,
 'holmes': 34,
 'him': 35,
 'so': 36,
 'her': 37,
 'she': 38,
 'all': 39,
 '’': 40,
 'been': 41,
 'your': 42,
 'on': 43,
 'very': 44,
 'by': 45,
 'one': 46,
 'are': 47,
 '“i': 48,
 'were': 49,
 'an': 50,
 'no': 51,
 'would': 52,
 'out': 53,
 'what': 54,
 'then': 55,
 'up': 56,
 'when': 57,
 'man': 58,
 'could': 59,
 'has': 60,
 'do': 61,
 'into': 62,
 'or': 63,
 'little': 64,
 'will': 65,
 'who': 66,
 'mr': 67,
 'if': 68,
 'some': 69,
 'down': 70,
 'see': 71,
 'now': 72,
 'our': 73,
 'should': 74,
 'may': 75,
 'am': 76,
 'us': 77,
 'over': 78,
 'they': 79,
 'can': 80,
 'more': 81,
 'think': 82,
 'about': 83,
 'mu

**N Gram implementation**

In [23]:
input_sequences=[]
for line in text.split("\n"):
    token_list=tokenizer.texts_to_sequences([line])[0]
    for i in range (1,len(token_list)):
        n_gram_sequence=token_list[:i+1]
        input_sequences.append(n_gram_sequence)
print(input_sequences[:10])

[[145, 4790], [145, 4790, 1], [145, 4790, 1, 1020], [145, 4790, 1, 1020, 4], [145, 4790, 1, 1020, 4, 128], [145, 4790, 1, 1020, 4, 128, 34], [145, 4790, 1, 1020, 4, 128, 34, 45], [145, 4790, 1, 1020, 4, 128, 34, 45, 611], [145, 4790, 1, 1020, 4, 128, 34, 45, 611, 2235], [145, 4790, 1, 1020, 4, 128, 34, 45, 611, 2235, 2236]]


**Padding the sequence**

In [24]:
max_sequence_len=max ([len (seq) for seq in input_sequences ])
input_sequences=np.array (pad_sequences(input_sequences,maxlen=max_sequence_len,padding="pre"))

In [25]:
input_sequences[:10]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,  145, 4790],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,  145, 4790,    1],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,  145, 4790,    1, 1020],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,  145, 4790,    1, 1020,    4],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,  145, 4790,    1, 1020,    4,  128],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,  145, 4790,    1, 1020,    4,  128,   34],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,  145, 4790,    1, 1020,    4,  128,   34,   45],
       [   0,    0,    0,    0,    0,    

**Standardizing x and y**

In [26]:
X=input_sequences[:,:-1]
y=input_sequences[:,-1]




In [27]:
# printing input and output
print("X",X)
print("Y",y)

X [[   0    0    0 ...    0    0  145]
 [   0    0    0 ...    0  145 4790]
 [   0    0    0 ...  145 4790    1]
 ...
 [   0    0    0 ... 8931    3  360]
 [   0    0    0 ...    3  360   83]
 [   0    0    0 ...  360   83  358]]
Y [4790    1 1020 ...   83  358 1673]


**Converting Y to one hot vector**

In [28]:
y=np.array(tf.keras.utils.to_categorical(y,num_classes=total_words))


**Constructing our model**

In [11]:
model=Sequential()
model.add(Embedding(total_words,100,input_length=max_sequence_len-1))
model.add(LSTM(150))
model.add(Dense(total_words,activation="softmax"))
print(model.summary())



None


In [None]:
model.compile(loss="categorical_crossentropy",optimizer="adam",metrics=["accuracy"])
model.fit(X,y ,epochs=1,verbose=1)
# due to ram insufficieny and time taking i just tried it for 1 epoch 
# use 100 epochs recommmended

In [33]:
input_text = "favourable"
predict_next_words = 3
for _ in range(predict_next_words):
    token_list = tokenizer.texts_to_sequences([input_text])[0]  # Note: Changed `text_to_sequences` to `texts_to_sequences`
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding="pre")
    predicted = np.argmax(model.predict(token_list), axis=-1)  # Added a comma here
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    input_text += " " + output_word
    
    print(input_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step
favourable to
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
favourable to me
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
favourable to me to


In [34]:

input_text = "Project Gutenberg's"
predict_next_words = 3
for _ in range(predict_next_words):
    token_list = tokenizer.texts_to_sequences([input_text])[0]  # Note: Changed `text_to_sequences` to `texts_to_sequences`
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding="pre")
    predicted = np.argmax(model.predict(token_list), axis=-1)  # Added a comma here
    output_word = ""
    for word, index in tokenizer.word_index.items():
        if index == predicted:
            output_word = word
            break
    input_text += " " + output_word
    
    print(input_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
Project Gutenberg's the
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
Project Gutenberg's the adventures
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
Project Gutenberg's the adventures of
