In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, Dense, LSTM, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np 

In [None]:
# Read the whole corpus into memory and lower-case it so the vocabulary
# built later is case-insensitive.
corpus_path = "../data/nietzsche.txt"
with open(corpus_path, mode="r", encoding="utf8") as corpus_file:
    text = corpus_file.read().lower()

# Corpus size in characters.
print(len(text))

In [None]:
# Split the corpus on newlines; each line is tokenized on its own, so the
# training sequences built from `lines` never span a line break.
lines = text.split("\n")

# Fit the vocabulary on every line of the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
# One more than the number of distinct words; this value sizes the Embedding
# and the output layer. NOTE(review): the +1 presumably reserves id 0, which
# pad_sequences uses as the fill value - confirm against the Keras docs.
num_words = len(tokenizer.word_index) + 1

In [None]:
# Sanity check: show one raw line, then its integer encoding as the cell output.
print(lines[3])
tokenizer.texts_to_sequences([lines[3]])[0]

In [None]:
# Encode every line in a single tokenizer call, then expand each encoded line
# into all of its prefixes of length >= 2. For a line encoded as [a, b, c]
# this yields [a, b] and [a, b, c]; lines with fewer than two tokens
# contribute nothing.
tokenized_lines = tokenizer.texts_to_sequences(lines)

input_sequences = [
    token_ids[:prefix_end]
    for token_ids in tokenized_lines
    for prefix_end in range(2, len(token_ids) + 1)
]

In [None]:
# Peek at the first generated prefix sequence.
input_sequences[0]

In [None]:
# Length of the longest prefix; every sequence is padded to this length next.
max_sequence_len = max(map(len, input_sequences))

In [None]:
# Left-pad every prefix to a common length so the final column always holds
# the word to be predicted.
padded_sequences = pad_sequences(
    input_sequences,
    padding='pre',
    maxlen=max_sequence_len,
)
input_sequences = np.array(padded_sequences)

In [None]:
# Fixed seed so the row order of X / y is reproducible after a kernel restart.
SEED = 42
rng = np.random.default_rng(SEED)

# permutation() returns a shuffled copy (rows are permuted along axis 0) and
# leaves `input_sequences` untouched, so re-running this cell always yields
# the same X and y instead of re-shuffling already shuffled data in place.
shuffled_sequences = rng.permutation(input_sequences)

# Inputs are all tokens but the last; the target is the last token, kept as a
# column of shape (n_samples, 1).
X, y = shuffled_sequences[:, :-1], shuffled_sequences[:, -1:]

In [None]:
# Dimensions of the model inputs (each padded sequence minus its last token).
X.shape

In [None]:
# First input row.
X[0]

In [None]:
# Container type of the inputs.
type(X)

In [None]:
# Dimensions of the targets (last token of each sequence, with an added trailing axis).
y.shape

In [None]:
# Target word ids.
y

In [None]:
# Vocabulary size used for the Embedding input dimension and the softmax output width.
num_words

In [None]:
import tensorflow as tf
from tensorflow.keras.utils import plot_model

# Dimensionality of the learned word vectors.
emb_size = 256

# Next-word classifier: embedded context tokens -> LSTM summary -> hidden
# layer -> probability distribution over the vocabulary.
model = tf.keras.Sequential([
    # Explicit input spec instead of Embedding(input_length=...): that argument
    # is deprecated in Keras 3, and declaring the input here builds the model
    # immediately so model.summary() can report shapes and parameter counts.
    tf.keras.Input(shape=(max_sequence_len - 1,), dtype="int32"),
    tf.keras.layers.Embedding(num_words, emb_size),
    tf.keras.layers.LSTM(120),
    tf.keras.layers.Dropout(0.2),
    # A non-linearity is required here: without it this layer and the softmax
    # layer below compose into a single linear map and the extra layer adds
    # no modelling capacity.
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dense(units=num_words, activation='softmax')
])

# Targets are integer word ids (not one-hot vectors), hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy',
              optimizer="adam", metrics=['accuracy'])

model.summary()

In [None]:
# Render the layer graph with tensor shapes annotated on each layer.
plot_model(model, show_shapes=True)

In [None]:
# Early stopping watches the *training* loss (no validation data is passed to
# fit) with a patience of 3 epochs.
callback = EarlyStopping(patience=3, monitor="loss")

history = model.fit(
    X,
    y,
    batch_size=128,
    epochs=50,
    callbacks=[callback],
    verbose=1,
)

In [None]:
import matplotlib.pyplot as plt

# Training-loss curve, one point per completed epoch.
fig, ax = plt.subplots()
ax.plot(history.history['loss'], label='(training data)', color='blue')
ax.set_title('Neural Network training loss')
ax.set_ylabel('value')
ax.set_xlabel('No. epoch')
# Without a legend the label passed to plot() is never drawn.
ax.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Training-accuracy curve, one point per completed epoch.
fig, ax = plt.subplots()
ax.plot(history.history['accuracy'], label='(training data)', color='blue')
ax.set_title('Neural Network training accuracy')
ax.set_ylabel('value')
ax.set_xlabel('No. epoch')
# Without a legend the label passed to plot() is never drawn.
ax.legend()
plt.show()

In [None]:
import sys

# Prompt to continue. Kept in its own variable so the corpus held in `text`
# is not overwritten: re-running the tokenizer cell after this one would
# otherwise fit the vocabulary on the generated string instead of the corpus.
seed_text = "The philosophy of the dogmatists, it is to be hoped, was only a promise for thousands of years afterwards"
next_words = 100

# Reverse vocabulary (word id -> word) maintained by the fitted Tokenizer;
# a direct lookup replaces scanning all of word_index on every step.
index_to_word = tokenizer.index_word

generated_text = seed_text
print(seed_text)
for _ in range(next_words):
    # Encode everything generated so far. pad_sequences left-pads short
    # inputs and, with its default truncating mode, keeps only the most
    # recent max_sequence_len - 1 tokens of long ones.
    token_ids = tokenizer.texts_to_sequences([generated_text])[0]
    model_input = pad_sequences([token_ids], maxlen=max_sequence_len-1, padding='pre')

    # Greedy decoding: pick the single most probable next word id. Indexing
    # the batch of one gives a plain int rather than a length-1 array.
    probabilities = model.predict(model_input, verbose=0)
    predicted_index = int(np.argmax(probabilities[0]))

    # Ids missing from the vocabulary (e.g. 0) map to an empty string,
    # matching the previous behaviour of emitting nothing for them.
    output_word = index_to_word.get(predicted_index, "")

    sys.stdout.write(output_word + " ")
    generated_text += " " + output_word