In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pickle
import heapq
from nltk.tokenize import RegexpTokenizer
from keras.models import Sequential, load_model
from keras.layers import LSTM
from keras.layers.core import Dense, Activation
from tensorflow.keras.optimizers import RMSprop

In [6]:
# Path of the plain-text corpus; later cells reuse `link`.
link = 'dataset/1661-0.txt'
# Read the corpus as a list of lines. The context manager closes the handle.
# Note that the following cell rebinds `text` to the whole file as one string.
with open(link, encoding='utf8') as corpus_file:
    text = corpus_file.readlines()

In [8]:
# Load the whole corpus as one lower-cased string.
# The context manager closes the file handle deterministically instead of
# leaving it for the garbage collector.
with open(link, encoding='utf8') as corpus_file:
    text = corpus_file.read().lower()
print('corpus length:', len(text))

corpus length: 581888


In [9]:
# Tokenise on runs of word characters, so punctuation and whitespace never
# become tokens.
tokenizer = RegexpTokenizer(r'\w+')
wo = text.lower()
words = tokenizer.tokenize(wo)
# Vocabulary: the distinct tokens, plus a word -> column lookup used for
# one-hot encoding in later cells.
unique_words = np.unique(words)
unique_word_index = {word: position for position, word in enumerate(unique_words)}

In [10]:
# Number of context words fed to the model for every prediction.
LENGTH = 5
# Slide a LENGTH-word window over the corpus: `prev` holds each window and
# `next` the word that immediately follows it.
# NOTE: `next` shadows the built-in of the same name; the name is kept because
# later cells read it.
prev = [words[start:start + LENGTH] for start in range(len(words) - LENGTH)]
next = [words[start + LENGTH] for start in range(len(words) - LENGTH)]
print("Previous :", prev[0])
print("Next :", next[0])

Previous : ['project', 'gutenberg', 's', 'the', 'adventures']
Next : of


In [11]:
# Zero-initialised boolean tensors, filled in by the following cell:
#   X -> (number of windows, LENGTH, vocabulary size)
#   Y -> (number of windows, vocabulary size)
X = np.zeros(shape=(len(prev), LENGTH, len(unique_words)), dtype=bool)
Y = np.zeros(shape=(len(next), len(unique_words)), dtype=bool)

In [12]:
# Set one flag per (window, position) in X for the context word at that
# position, and one flag per window in Y for the word that follows it.
for row, (context, target) in enumerate(zip(prev, next)):
    for position, token in enumerate(context):
        X[row, position, unique_word_index[token]] = 1
    Y[row, unique_word_index[target]] = 1

In [13]:
# Single LSTM layer over the LENGTH one-hot word vectors, followed by a dense
# softmax layer with one output per vocabulary word.
model = Sequential([
    LSTM(128, input_shape=(LENGTH, len(unique_words))),
    Dense(len(unique_words)),
    Activation('softmax'),
])
optimizer = RMSprop(learning_rate=0.01)

In [14]:
# Targets are one-hot rows, hence categorical cross-entropy; accuracy is
# tracked alongside the loss.
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Train for two epochs with 5% of the samples held out for validation.
# Only the metrics dictionary of the returned object is kept.
history = model.fit(
    X,
    Y,
    batch_size=128,
    epochs=2,
    validation_split=0.05,
    shuffle=True,
).history

Epoch 1/2
Epoch 2/2


In [16]:
# Persist the trained network and its training metrics to disk.
model.save('next_word_model.h5')
# The `with` block closes (and therefore flushes) the file before it is read
# back below; the bare open() calls used previously leaked both handles.
with open("history.p", "wb") as history_file:
    pickle.dump(history, history_file)

# Reload both artefacts so the rest of the notebook works from what is on disk.
model = load_model('next_word_model.h5')
# SECURITY: pickle.load can execute arbitrary code contained in the file.
# This is acceptable here only because history.p was written by this very
# cell; never point it at a file from an untrusted source.
with open("history.p", "rb") as history_file:
    history = pickle.load(history_file)

In [17]:
def prep(text, verbose=True):
    """One-hot encode a whitespace-separated word sequence for the model.

    Parameters
    ----------
    text : str
        Words separated by whitespace. The words are lower-cased because the
        vocabulary is built from lower-cased text. If more than ``LENGTH``
        words are given, only the last ``LENGTH`` (the most recent context)
        are encoded; previously this raised ``IndexError``.
    verbose : bool, default True
        When true, print every encoded word (the historical behaviour).
        Pass ``False`` to silence the output.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(1, LENGTH, len(unique_words))``. Row ``a``
        holds a single 1 in the column of the ``a``-th encoded word. Rows for
        words missing from ``unique_word_index``, and rows beyond the number
        of supplied words, stay all-zero. Unknown words previously raised
        ``KeyError``.
    """
    x = np.zeros((1, LENGTH, len(unique_words)))
    tokens = text.lower().split()
    # Keep at most the trailing LENGTH words. The max() guard avoids the
    # `[-0:]` pitfall, which would select the whole list.
    context = tokens[max(len(tokens) - LENGTH, 0):]
    for a, word in enumerate(context):
        if verbose:
            print(word)
        column = unique_word_index.get(word)
        if column is not None:
            x[0, a, column] = 1
    return x

In [18]:
def sample(preds, top_n=3):
    """Return the indices of the ``top_n`` largest entries of ``preds``.

    Parameters
    ----------
    preds : array-like
        One-dimensional sequence of scores (e.g. softmax probabilities).
    top_n : int, default 3
        How many indices to return.

    Returns
    -------
    list of int
        Positions in ``preds`` ordered from the highest score downwards.
        At most ``len(preds)`` indices are returned.

    Notes
    -----
    Earlier revisions passed the scores through ``log`` -> ``exp`` ->
    renormalise before ranking. That round trip is strictly increasing, so it
    could never change which indices are largest. It did emit a
    divide-by-zero ``RuntimeWarning`` whenever a probability was exactly 0,
    and floating-point rounding could reorder near-ties. The scores are
    therefore ranked directly.
    """
    preds = np.asarray(preds).astype('float64')
    return heapq.nlargest(top_n, range(len(preds)), preds.take)

In [19]:
def completion(text):
    """Return the model's single most likely next word for ``text``.

    The previous body was character-level code: it looked predictions up in
    an empty ``indices_char`` dict (always ``KeyError``) and looped until a
    space character was produced. With a word vocabulary each prediction is
    already a whole word, so one prediction step is the complete result.

    Parameters
    ----------
    text : str
        Whitespace-separated context words, encoded by ``prep``.

    Returns
    -------
    str
        The vocabulary word with the highest predicted score.
    """
    x = prep(text)
    preds = model.predict(x, verbose=0)[0]
    next_index = sample(preds, top_n=1)[0]
    return str(unique_words[next_index])


def completions(text, n=3):
    """Return ``n`` two-word continuations of ``text``.

    For each of the ``n`` best next words, the context window is slid forward
    by one word and ``completion`` predicts the word after it. The previous
    body sliced characters (``text[1:]``) and subscripted the ``completion``
    function itself, which raised ``TypeError``.

    NOTE: the following notebook cell defines ``completions`` again (returning
    single words) and replaces this definition once it is run.

    Parameters
    ----------
    text : str
        Whitespace-separated context words.
    n : int, default 3
        Number of candidate continuations.

    Returns
    -------
    list of str
        Strings of the form ``"<next word> <word after that>"``, best first.
    """
    x = prep(text)
    preds = model.predict(x, verbose=0)[0]
    next_indices = sample(preds, n)
    context = text.split()
    suggestions = []
    for idx in next_indices:
        word = str(unique_words[idx])
        # Append the candidate and keep only the trailing LENGTH words, so
        # the model sees the same window size it was trained on.
        extended = context + [word]
        window = extended[max(len(extended) - LENGTH, 0):]
        suggestions.append(word + ' ' + completion(' '.join(window)))
    return suggestions

In [20]:
def completions(text, n=3):
    """Suggest the ``n`` most likely next words for ``text``.

    Parameters
    ----------
    text : str
        Whitespace-separated context words, encoded by ``prep``.
    n : int, default 3
        Number of suggestions to return.

    Returns
    -------
    list or str
        The suggested vocabulary entries, best first. For an empty ``text``
        the string ``"0"`` is returned instead of a list.
    """
    if text == "":
        # Sentinel for "nothing to predict from"; callers receive a string.
        return "0"
    ranked = sample(model.predict(prep(text), verbose=0)[0], n)
    suggestions = []
    for word_id in ranked:
        suggestions.append(unique_words[word_id])
    return suggestions

In [22]:
# Demo: predict the word that follows the first LENGTH words of a sentence.
q =  "Your life will never be the same again"
print("correct sentence: ",q)
# Tokenise exactly as the training corpus was tokenised, and take LENGTH words
# (rather than a hard-coded 5) so the context always matches the window size
# the model was built with.
seq = " ".join(tokenizer.tokenize(q.lower())[:LENGTH])
print("Sequence: ",seq)
print("next possible words: ", completions(seq, 5))

correct sentence:  Your life will never be the same again
Sequence:  your life will never be
your
life
will
never
be
next possible words:  ['so', 'a', 'able', 'no', 'in']


In [26]:
# Demo: predict the word that follows the first LENGTH words of a sentence.
# The tokenizer splits "I’m" into "i" and "m", so the apostrophe is dropped.
q =  "What I’m trying to say"
print("correct sentence: ",q)
# Take LENGTH words (rather than a hard-coded 5) so the context always matches
# the window size the model was built with.
seq = " ".join(tokenizer.tokenize(q.lower())[:LENGTH])
print("Sequence: ",seq)
print("next possible words: ", completions(seq, 5))

correct sentence:  What I’m trying to say
Sequence:  what i m trying to
what
i
m
trying
to
next possible words:  ['have', 'the', 'be', 'say', 'do']
