# Next Word Prediction

In [2]:
import numpy as np
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.models import Sequential, load_model  # Modified import
from tensorflow.keras.layers import LSTM  # Modified import
from tensorflow.keras.layers import Dense, Activation  # Modified import
from tensorflow.keras.optimizers.legacy import Adam
import matplotlib.pyplot as plt
import pickle
import heapq

In [3]:
import requests

url = r'https://raw.githubusercontent.com/simranjeet97/75DayHard_GenAI_LLM_Challenge/main/NextWordPrediction_DeepLearning/1661-0.txt'
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)

# Fail loudly here: every later cell needs `text`, so carrying on after a
# failed download would only surface later as a confusing NameError.
if response.status_code != 200:
    raise RuntimeError(f"Error fetching file. Status code: {response.status_code}")

# Lower-case the corpus so that "The" and "the" map to the same vocabulary entry.
text = response.text.lower()
print('corpus length:', len(text))

corpus length: 594199


In [4]:
# Split the whole corpus into an ordered list of word tokens. The pattern \w+
# keeps runs of letters, digits and underscores, so punctuation and other
# special characters are dropped.
tokenizer = RegexpTokenizer(r'\w+')
words = tokenizer.tokenize(text)

In [5]:
# Vocabulary: the distinct words of the corpus, plus a lookup table that maps
# each word from unique_words to its position in that array.
unique_words = np.unique(words)
unique_word_index = {word: position for position, word in enumerate(unique_words)}

In [12]:
# Number of context words fed to the model for each prediction.
WORD_LENGTH = 5

# Slide a WORD_LENGTH-wide window over the corpus: every window is one training
# context, and the word immediately after it is the label for that context.
prev_words = [
    words[start:start + WORD_LENGTH]
    for start in range(len(words) - WORD_LENGTH)
]
next_words = words[WORD_LENGTH:]

print(prev_words[0])
print(next_words[0])

['project', 'gutenberg', 's', 'the', 'adventures']
of


In [13]:
# Allocate the one-hot training arrays, all False until they are filled in:
#   X[sample, position, word_id] -- the context words of each sample (features)
#   Y[sample, word_id]           -- the word that follows the context (label)
X = np.zeros(
    shape=(len(prev_words), WORD_LENGTH, len(unique_words)),
    dtype=bool,
)
Y = np.zeros(
    shape=(len(next_words), len(unique_words)),
    dtype=bool,
)

In [17]:
# Sanity check: show the integer id the vocabulary assigns to the word 'new'.
print(unique_word_index['new'])

4841


In [18]:
# Spot-check a single cell of X (sample 1, position 2, word 'new'); X has only
# been zero-initialised at this point, so the value is still False.
X[1,2,unique_word_index['new']]

False

In [None]:
# One-hot encode the training data: for every sample, switch on the cell of
# each context word in X and the cell of the following word in Y.
for sample_idx, (context, target) in enumerate(zip(prev_words, next_words)):
    for position, token in enumerate(context):
        X[sample_idx, position, unique_word_index[token]] = 1
    Y[sample_idx, unique_word_index[target]] = 1

# LSTM Model 

In [None]:
# Single-layer LSTM with 128 units, followed by a fully connected layer sized
# to the vocabulary and a softmax, so the output is one probability per word.
model = Sequential([
    LSTM(128, input_shape=(WORD_LENGTH, len(unique_words))),
    Dense(len(unique_words)),
    Activation('softmax'),
])

# Adam optimizer with a learning rate of 0.01.
optimizer = Adam(learning_rate=0.01)

# The labels are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train for 5 epochs with 5% of the samples held out for validation; only the
# per-epoch metrics dictionary of the fit result is kept.
history = model.fit(
    X,
    Y,
    batch_size=128,
    epochs=5,
    validation_split=0.05,
    shuffle=True,
).history

In [None]:
# Persist the trained model and its training history, then load both back so
# the following cells run against exactly what was written to disk.
model.save('next_word_model.h5')
# Context managers make sure the file is flushed and closed before it is
# re-opened for reading below.
with open("history.p", "wb") as history_file:
    pickle.dump(history, history_file)

model = load_model('next_word_model.h5')
# NOTE: unpickling is acceptable only because history.p was written just above;
# never call pickle.load on a file from an untrusted source.
with open("history.p", "rb") as history_file:
    history = pickle.load(history_file)

In [None]:
# Evaluating the model: training vs. validation accuracy per epoch.
for metric in ('accuracy', 'val_accuracy'):
    plt.plot(history[metric])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
# Legend entries follow the plotting order above (training curve first).
plt.legend(['train', 'test'], loc='upper left')

In [None]:
# Training vs. validation loss per epoch.
for metric in ('loss', 'val_loss'):
    plt.plot(history[metric])
plt.title('model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
# Legend entries follow the plotting order above (training curve first).
plt.legend(['train', 'test'], loc='upper left')

In [None]:
# Testing next word
def prepare_input(text, word_length=None, word_index=None):
    """One-hot encode the words of *text* as a single model input sample.

    Args:
        text: Whitespace-separated words, already lower-cased by the caller.
        word_length: Number of word positions in the sample. Defaults to the
            module-level ``WORD_LENGTH``.
        word_index: Mapping from word to vocabulary id. Defaults to the
            module-level ``unique_word_index``, which has one entry per item
            of ``unique_words``.

    Returns:
        A float array of shape ``(1, word_length, len(word_index))``. Row
        ``t`` has a single 1 at the id of the t-th word; rows for missing
        positions and for words outside the vocabulary stay all-zero.
    """
    if word_length is None:
        word_length = WORD_LENGTH
    if word_index is None:
        word_index = unique_word_index

    tokens = text.split()
    # Keep only the most recent words: a longer prompt would otherwise index
    # past the window and raise IndexError.
    if len(tokens) > word_length:
        tokens = tokens[len(tokens) - word_length:]

    x = np.zeros((1, word_length, len(word_index)))
    for t, word in enumerate(tokens):
        idx = word_index.get(word)
        # Unknown words are left as an all-zero row instead of raising
        # KeyError, so one unseen word does not abort the whole prediction.
        if idx is not None:
            x[0, t, idx] = 1
    return x

# Try the encoder on a five-word prompt (lower-cased to match the corpus); as
# the last expression of the cell, the returned array is the cell's value.
prepare_input("It is not a lack".lower())

In [None]:
def sample(preds, top_n=3):
    """Return the indices of the *top_n* largest entries of *preds*.

    Args:
        preds: Sequence or array of scores, e.g. the softmax output for one
            sample.
        top_n: How many indices to return.

    Returns:
        A list of at most ``top_n`` integer indices into ``preds``, ordered
        from the highest score to the lowest.
    """
    preds = np.asarray(preds, dtype='float64')
    # The scores are ranked as they are. Taking log, then exp, then
    # renormalising does not change the ranking and makes np.log emit a
    # divide-by-zero warning whenever a probability is exactly 0.
    return heapq.nlargest(top_n, range(len(preds)), key=preds.take)

# predict_completions_with_probabilities asks the current model for the n most
# likely next words of a prompt and pairs each word with its probability.
def predict_completions_with_probabilities(text, n=3):
    """Return ``(word, probability)`` pairs for the *n* best next words.

    An empty *text* short-circuits to the placeholder ``[("0", 0.0)]``.
    Otherwise the prompt is encoded with ``prepare_input``, scored with
    ``model.predict``, and the indices chosen by ``sample`` are mapped to
    their words in ``unique_words``; the pairs keep the order ``sample``
    returned them in.
    """
    if text == "":
        # Placeholder word with probability 0.0 when there is nothing to predict from.
        return [("0", 0.0)]
    preds = model.predict(prepare_input(text), verbose=0)[0]
    return [(unique_words[idx], preds[idx]) for idx in sample(preds, n)]

# Your example
q = "Your life will never be there in the same situation again"
print("Correct sentence:", q)
# Use the first WORD_LENGTH tokens as the prompt, so its length always matches
# the model's input window instead of relying on a hard-coded 5.
seq = " ".join(tokenizer.tokenize(q.lower())[:WORD_LENGTH])
print("Sequence:", seq)

# Get the five most likely next words for the prompt, with their probabilities
predictions_with_probabilities = predict_completions_with_probabilities(seq, 5)

# Display predictions with probabilities (scaled to percent)
for word, probability in predictions_with_probabilities:
    print(f"Word: {word}, Probability: {probability * 100}")

# LSTM Bidirectional Model

In [None]:
from tensorflow.keras.layers import LSTM, Dense, Activation, Bidirectional

# Bidirectional variant: the same 128-unit LSTM wrapped in Bidirectional,
# followed by the vocabulary-sized dense layer and softmax. This rebinds
# `model`, so later predictions use this network.
model = Sequential([
    Bidirectional(LSTM(128), input_shape=(WORD_LENGTH, len(unique_words))),
    Dense(len(unique_words)),
    Activation('softmax'),
])

# Adam optimizer with a learning rate of 0.01.
optimizer = Adam(learning_rate=0.01)

# The labels are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Same training setup as the unidirectional run; the metrics are stored under
# a separate name so that both runs can be compared afterwards.
history_2 = model.fit(
    X,
    Y,
    batch_size=128,
    epochs=5,
    validation_split=0.05,
    shuffle=True,
).history

In [None]:
def sample(preds, top_n=3):
    """Return the indices of the *top_n* largest entries of *preds*.

    Args:
        preds: Sequence or array of scores, e.g. the softmax output for one
            sample.
        top_n: How many indices to return.

    Returns:
        A list of at most ``top_n`` integer indices into ``preds``, ordered
        from the highest score to the lowest.
    """
    preds = np.asarray(preds, dtype='float64')
    # The scores are ranked as they are. Taking log, then exp, then
    # renormalising does not change the ranking and makes np.log emit a
    # divide-by-zero warning whenever a probability is exactly 0.
    return heapq.nlargest(top_n, range(len(preds)), key=preds.take)

# predict_completions_with_probabilities asks the current model for the n most
# likely next words of a prompt and pairs each word with its probability.
def predict_completions_with_probabilities(text, n=3):
    """Return ``(word, probability)`` pairs for the *n* best next words.

    An empty *text* short-circuits to the placeholder ``[("0", 0.0)]``.
    Otherwise the prompt is encoded with ``prepare_input``, scored with
    ``model.predict``, and the indices chosen by ``sample`` are mapped to
    their words in ``unique_words``; the pairs keep the order ``sample``
    returned them in.
    """
    if text == "":
        # Placeholder word with probability 0.0 when there is nothing to predict from.
        return [("0", 0.0)]
    preds = model.predict(prepare_input(text), verbose=0)[0]
    return [(unique_words[idx], preds[idx]) for idx in sample(preds, n)]

# Your example
q = "Your life will never be there in the same situation again"
print("Correct sentence:", q)
# Use the first WORD_LENGTH tokens as the prompt, so its length always matches
# the model's input window instead of relying on a hard-coded 5.
seq = " ".join(tokenizer.tokenize(q.lower())[:WORD_LENGTH])
print("Sequence:", seq)

# Get the five most likely next words for the prompt, with their probabilities
predictions_with_probabilities = predict_completions_with_probabilities(seq, 5)

# Display predictions with probabilities (scaled to percent)
for word, probability in predictions_with_probabilities:
    print(f"Word: {word}, Probability: {probability * 100}")

In [None]:
# Compare the last-epoch validation accuracy of both runs, as a percentage.
for label, run_history in (
    ("Unidirectional LSTM - Validation Accuracy:", history),
    ("Bidirectional LSTM - Validation Accuracy:", history_2),
):
    print(label, run_history['val_accuracy'][-1]*100)