In [1]:
# Importing Libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Read the entire training corpus from disk into a single string
corpus_path = 'word-predict.txt'
with open(corpus_path, mode='r', encoding='utf-8') as file:
    data = file.read()

In [3]:
# Fit a word-level tokenizer on the whole corpus (passed as one document)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Vocabulary size used for the embedding and softmax layers: one slot per
# entry in word_index plus one extra slot (Keras word indices start at 1,
# leaving index 0 free for the padding value).
vocabulary = tokenizer.word_index
total_words = 1 + len(vocabulary)

In [4]:
# Turn each line of the corpus into its token ids, then emit every prefix of
# length >= 2 as one training sequence (the last token of a prefix is the
# word to predict; lines with fewer than two tokens contribute nothing).
tokenized_lines = tokenizer.texts_to_sequences(data.split('\n'))
input_sequences = [
    tokens[:end]
    for tokens in tokenized_lines
    for end in range(2, len(tokens) + 1)
]

In [5]:
# Pad sequences to have same length
# Fail early with a readable message instead of the opaque error max() gives
# on an empty iterable (e.g. when the corpus has no line with two or more words).
if len(input_sequences) == 0:
    raise ValueError("input_sequences is empty: the corpus produced no n-gram sequences")

# Length of the longest n-gram sequence; every row is padded up to this length.
max_sequence_len = max(len(seq) for seq in input_sequences)

# padding='pre' puts the fill values in front so the label stays in the last
# column. pad_sequences already returns a NumPy array, so it is not wrapped in
# np.array(), which would only make a second full copy of the matrix.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

In [6]:
# Creating predictors and labels
# Every padded row is split into its context (all columns but the last) and
# the id of the word to predict (the last column).
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the labels over the vocabulary, as required by the
# categorical_crossentropy loss. to_categorical already returns a NumPy array,
# so it is not wrapped in np.array(): that would duplicate an array of shape
# (number of sequences, total_words) for no benefit.
# NOTE(review): this encodes every row although a later cell keeps only the
# first `subset_size` rows; slicing before encoding would save memory.
y = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [7]:
# Restrict training to the first `subset_size` samples to keep it tractable
subset_size = 50000
subset_rows = slice(0, subset_size)
X_subset = X[subset_rows]
y_subset = y[subset_rows]

In [8]:
# Next-word model: embedding -> two stacked LSTMs -> softmax over the vocabulary
model = Sequential([
    # Maps each token id to a 100-dimensional vector; inputs are the padded
    # contexts, i.e. one token shorter than the padded sequences.
    Embedding(total_words, 100, input_length=max_sequence_len-1),
    # First LSTM returns the full sequence so the second LSTM can consume it.
    LSTM(150, return_sequences=True),
    # Second LSTM returns only its final state.
    LSTM(100),
    # One probability per vocabulary entry.
    Dense(total_words, activation='softmax'),
])

In [9]:
# Configure training: Adam optimizer, cross-entropy against the one-hot
# labels, and accuracy reported as the monitoring metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [10]:
# Training the model
# Keep the History object returned by fit() so the per-epoch metrics can be
# inspected or plotted afterwards, instead of discarding it and leaving its
# bare repr as the cell output.
history = model.fit(
    X_subset,
    y_subset,
    epochs=10,
    batch_size=64,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x190ad7281f0>

In [37]:
# Extend the seed phrase "I am good"; the recorded output below shows four
# words appended to the seed, matching the second argument.
# NOTE(review): generate_text is not defined in any cell visible here (the
# execution count jumps from 10 to 37) — presumably it lived in a cell that
# was removed. Confirm it is defined above this cell so that
# Restart Kernel -> Run All succeeds.
generated_text = generate_text("I am good", 4, max_sequence_len)
print(generated_text)

I am good to be a little


In [38]:
# Extend the seed phrase "Where are"; the recorded output below shows five
# words appended to the seed, matching the second argument.
# NOTE(review): relies on generate_text, which no visible cell defines —
# confirm it is defined earlier in the notebook.
generated_text = generate_text("Where are", 5, max_sequence_len)
print(generated_text)

Where are as i have been a
