In [1]:
# Importing libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

In [2]:
# Read the whole training corpus into memory, then normalise it to lower case
with open('word-predict.txt', mode='r', encoding='utf-8') as corpus_file:
    raw_text = corpus_file.read()
text = raw_text.lower()

In [3]:
# Fit a word-level tokenizer on the full corpus (passed as a single document)
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size used for the embedding and the softmax layer: one slot per
# known word plus one extra, because the Keras Tokenizer never assigns index 0
total_words = 1 + len(tokenizer.word_index)

In [4]:
# Preparing the data for training:
# every line of the corpus is encoded to word indices, and every prefix of that
# line containing at least two tokens becomes one training sequence
# (e.g. [a, b, c] -> [a, b] and [a, b, c]).
encoded_lines = (
    tokenizer.texts_to_sequences([line])[0]
    for line in text.split('\n')
)
input_sequences = [
    tokens[:end]
    for tokens in encoded_lines
    for end in range(2, len(tokens) + 1)
]

In [5]:
# Length of the longest n-gram sequence; every sequence is padded up to it
max_sequence_len = max(len(seq) for seq in input_sequences)

# Pad at the front (padding='pre') so that the final token of every sequence
# stays in the last column of the resulting 2-D array
padded_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)
input_sequences = np.array(padded_sequences)

In [6]:
# Shuffling the data and keeping a fixed-size random subset of it.
# A dedicated, seeded generator is used instead of the global NumPy RNG so the
# same rows are selected on every Restart & Run All; change SHUFFLE_SEED to
# draw a different subset.
SHUFFLE_SEED = 42
MAX_TRAINING_SEQUENCES = 10000  # upper bound on the number of rows kept

# Generator.shuffle reorders the rows of the array in place
np.random.default_rng(SHUFFLE_SEED).shuffle(input_sequences)
input_sequences = input_sequences[:MAX_TRAINING_SEQUENCES]

In [7]:
# Creating predictors and label:
# all columns but the last are the model input, the last column is the word
# to be predicted
xs = input_sequences[:, :-1]
labels = input_sequences[:, -1]

# One-hot encode the target word indices over the whole vocabulary
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [8]:
# Building a LSTM model:
# embedding -> LSTM (full sequence) -> LSTM (last state) -> softmax over vocabulary
model = Sequential([
    Embedding(total_words, 100, input_length=max_sequence_len-1),
    LSTM(150, return_sequences=True),  # hands the whole sequence to the next LSTM
    LSTM(100),
    Dense(total_words, activation='softmax'),
])
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 19, 100)           893200    
                                                                 
 lstm (LSTM)                 (None, 19, 150)           150600    
                                                                 
 lstm_1 (LSTM)               (None, 100)               100400    
                                                                 
 dense (Dense)               (None, 8932)              902132    
                                                                 
Total params: 2,046,332
Trainable params: 2,046,332
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Fit the network on the prepared predictors / one-hot labels
model.fit(
    x=xs,
    y=ys,
    batch_size=128,
    epochs=20,
    verbose=1,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x250566fbd00>

In [10]:
# Function to generate predictions
def generate_text(seed_text, next_words):
    """Sample up to `next_words` successive words that continue `seed_text`.

    Each sampled word is appended to the running prompt before the next
    prediction, so the returned words form one continuation rather than
    several independent guesses for the same position.

    Relies on the notebook-level `tokenizer`, `model` and `max_sequence_len`.

    Args:
        seed_text: Prompt text; it is encoded with `tokenizer`.
        next_words: Number of words to generate.

    Returns:
        List of generated words in order (the seed itself is not included).
        The list is shorter than `next_words` only if the model puts no
        usable probability mass on any real word.
    """
    predictions = []
    running_text = seed_text
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([running_text])[0]
        # pad_sequences left-pads short prompts and trims long ones to the model's input length
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted_probs = model.predict(token_list, verbose=0)[0]

        # Work on a float64 copy: np.random.choice rejects probability vectors
        # whose sum drifts from 1, which float32 softmax output can do.
        predicted_probs = np.array(predicted_probs, dtype=np.float64)
        # Index 0 is the padding slot and has no word; never sample it.
        predicted_probs[0] = 0.0
        total = predicted_probs.sum()
        if not total > 0:
            # Nothing left to sample from (also covers NaN output)
            break
        predicted_probs /= total

        # Sample from the predicted probabilities
        predicted_index = int(np.random.choice(len(predicted_probs), p=predicted_probs))

        # Convert index back to word
        output_word = tokenizer.index_word.get(predicted_index, "")

        predictions.append(output_word)
        # Feed the new word back in so the next step is conditioned on it
        running_text = f"{running_text} {output_word}"

    return predictions

In [13]:
# Seed prompts used to try out the trained model
sentences = [
    'what is the current',
    'in the garden of',
    'have a good',
    'what is your',
    'i love',
]

In [14]:
# For every seed prompt: show the prompt, then five generated words, then a blank line
for prompt in sentences:
    print(prompt)
    print(generate_text(prompt, 5))
    print()

what is the current
['is', 'looked', 'unreasoning', 'by', 'vivid']

in the garden of
['as', 'fell', 'over', 'quick', 'part']

have a good
['the', 'who', 'must', 'inclined', 'marked']

what is your
['most', 'us', 'matter', 'observe', 'situation']

i love
['holmes', 'mortal', 'left', 'am', 'appear']

