## Importing the necessary libraries

In [11]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers import Dense, Activation
from keras.optimizers import RMSprop
import matplotlib.pyplot as plt
import pickle
import heapq

## reading the text file

In [14]:
path = '1661-0.txt'
text = open(path).read().lower()
print('corpus length:', len(text))

corpus length: 581888


## Tokenizing - This will divide the dataset into  tokens.without special characters

In [15]:
tokenizer = RegexpTokenizer(r'w+')
words = tokenizer.tokenize(text)

In [16]:
unique_words = np.unique(words)
unique_word_index = dict((c, i) for i, c in enumerate(unique_words))

In [17]:
WORD_LENGTH = 5
prev_words = []
next_words = []
for i in range(len(words) - WORD_LENGTH):
    prev_words.append(words[i:i + WORD_LENGTH])
    next_words.append(words[i + WORD_LENGTH])
print(prev_words[0])
print(next_words[0])

['w', 'w', 'w', 'w', 'w']
www


In [24]:
features = np.zeros((len(prev_words), WORD_LENGTH, len(unique_words)), dtype=bool)
labels = np.zeros((len(next_words), len(unique_words)), dtype=bool)
for i, each_words in enumerate(prev_words):
    for j, each_word in enumerate(each_words):
        features[i, j, unique_word_index[each_word]] = 1
    labels[i, unique_word_index[next_words[i]]] = 1

## MODEL CREATION

In [25]:
model = Sequential()
model.add(LSTM(128, input_shape=(WORD_LENGTH, len(unique_words))))
model.add(Dense(len(unique_words)))
model.add(Activation('softmax'))

In [29]:
optimizer = RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer,loss='categorical_crossentropy', metrics=['accuracy'])
model.fit(features, labels, validation_split=0.05, batch_size=128, epochs=10, shuffle=True)
hist = model.history

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:

model.save('nwp.h5')
pickle.dump(hist, open("history.p", "wb"))
model = load_model('nwp.h5')
history = pickle.load(open("history.p", "rb"))

  saving_api.save_model(


In [35]:
import random

def prepare_input(text):
    x = np.zeros((1, WORD_LENGTH, len(unique_words)))
    words = text.split()
    for t in range(min(len(words), WORD_LENGTH)):
        word = words[t]
        index = unique_word_index.get(word, None)
        if index is not None:
            x[0, t, index] = 1
    return x

def predict_next_word(model, text): 
    input_vec = prepare_input(text)
    prediction = model.predict(input_vec)[0]
    next_word_probabilities = prediction / np.sum(prediction)
    return random.choices(list(unique_words), weights=next_word_probabilities)[0]

# Generate a sentence using the trained model
sentence = "Once upon a time"
print("---")
print(f"Starting with: {sentence}")
for _ in range(len(sentence.split()), len(sentence) + 10):  # Iterate for a few more words than the initial sentence length
    word = predict_next_word(model, sentence)
    sentence += f" {word}"
    if word == ".":
        break
print("---")
print(f"Completed sentence: \"{sentence}\"")


---
Starting with: Once upon a time
---
Completed sentence: "Once upon a time w w w w w w w w w w w w w w w w w w w w w w"
