##### Next Word Predictor Project using LSTM Architecture

In [None]:
import warnings
warnings.filterwarnings('ignore')  # Suppress Python warnings emitted from here on

# Read the whole cricket corpus into memory as one string.
with open('cricket_data.txt', 'r') as file:
    data = file.read()

# Lower-case the text and collapse every run of whitespace (newlines, tabs,
# repeated spaces) into a single space. str.split() with no argument treats
# consecutive whitespace as one separator and ignores leading/trailing
# whitespace, so no separate strip() is needed. A single .replace('  ', ' ')
# pass would still leave doubled spaces wherever three or more whitespace
# characters occur in a row (e.g. blank lines between paragraphs).
data = ' '.join(data.lower().split())
print(data)  # Show the normalised corpus held in `data`

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras as kr
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Learn the vocabulary from the corpus. fit_on_texts takes a list of texts,
# so the single corpus string is wrapped in a one-element list. '<oov>' is the
# placeholder token reserved for words that are not in the vocabulary.
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts([data])
print('Word Indices:', tokenizer.word_index)

In [None]:
# Build the training sequences: for every sentence (split on '.'), collect each
# prefix that contains at least two tokens. The last token of a prefix will
# later serve as the target and the preceding tokens as the context.
input_sequences = []
for sentence in data.split('.'):
    token_ids = tokenizer.texts_to_sequences([sentence])[0]
    # Prefix ends run from 2 up to the full sentence length (slice end is exclusive).
    input_sequences.extend(
        token_ids[:end] for end in range(2, len(token_ids) + 1)
    )

print(input_sequences)

In [None]:
# The sequences have different lengths; record the longest one so that all of
# them can be zero-padded to a common length.
length = list(map(len, input_sequences))
max_len = max(length)
print(f'Max length: {max_len}')

In [None]:
from keras.utils import pad_sequences

# Zero-pad on the left ('pre') so that the final position of every row is
# always the word to be predicted, whatever the original sequence length.
padded_input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_len,
    padding='pre',
)
padded_input_sequences

In [None]:
# Each padded row is one training example: its last token is the word to
# predict (target) and everything before it is the context (input).
X, y = padded_input_sequences[:, :-1], padded_input_sequences[:, -1]
print(X, '\n\n', y)

In [None]:
# Display the shapes of the input matrix X and the target vector y
X.shape, y.shape

In [None]:
# Number of entries in the tokenizer's word index. It is used below, as
# vocab_size + 1, for the one-hot width (num_classes), the embedding input
# dimension and the size of the output layer.
vocab_size = len(tokenizer.word_index)

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the targets. Tokenizer word indices start at 1 while one-hot
# positions start at 0, so vocab_size + 1 classes are required; with only
# vocab_size classes the word holding the highest index would not fit.
y = to_categorical(y, num_classes=vocab_size + 1)
y.shape

##### Model Creation

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

# The network is fed X, i.e. padded_input_sequences without its last column,
# so every input row holds max_len - 1 token ids (not max_len). The declared
# input shape must match that length; the prediction cell pads its input to
# max_len - 1 as well.
model = Sequential()
# Embedding: maps each token id (0 .. vocab_size; 0 is the padding value) to a
# dense 100-dimensional vector. output_dim is a free hyperparameter to tune.
model.add(Embedding(input_dim=vocab_size + 1, output_dim=100, input_shape=(max_len - 1,)))
# LSTM: reads the embedded sequence and returns one 150-unit vector per sample.
model.add(LSTM(units=150))
# Output: softmax over vocab_size + 1 classes, matching the one-hot width of y.
model.add(Dense(units=vocab_size + 1, activation='softmax'))

##### Compilation and Fitting

In [None]:
# Configure training: Adam optimiser with categorical cross-entropy (the
# targets are one-hot encoded), tracking accuracy.
model.compile(
    optimizer=kr.optimizers.Adam(),
    loss=kr.losses.categorical_crossentropy,
    metrics=['accuracy'],
)
# Train for 100 epochs, holding out 10% of the samples for validation.
history = model.fit(X, y, epochs=100, verbose=1, validation_split=0.1)
model.summary()

##### Prediction of the next 'n' words

In [None]:
# Interactively extend a seed text by n words: on each step the current text is
# tokenised, padded to the model's input length, and the most probable next
# word is appended to it.
n = int(input('How many words you want to be predicted? '))
new_word = input('Enter a word: ').lower()
for _ in range(n):
    tokenized_new_word = tokenizer.texts_to_sequences([new_word])[0]
    # The model was trained on X = padded_input_sequences[:, :-1], i.e. rows of
    # max_len - 1 tokens, so the prompt is padded/truncated to that length.
    padded_tokenized_new_word = pad_sequences([tokenized_new_word], maxlen=max_len - 1, padding='pre')
    print(padded_tokenized_new_word)

    # Probability vector over all classes for the single input row.
    raw_prob = model.predict(padded_tokenized_new_word)
    # argmax yields the INDEX of the most probable class, not the probability.
    pred = int(np.argmax(raw_prob, axis=-1)[0])

    # index_word is the tokenizer's reverse mapping (index -> word); a direct
    # lookup replaces a linear scan over word_index on every step.
    key = tokenizer.index_word.get(pred)
    if key is None:
        # The predicted index has no word (index 0 is the padding value). The
        # text would stay unchanged, so every remaining step would repeat the
        # same prediction; stop instead of looping silently.
        print('No further word could be predicted.')
        break
    new_word = new_word + " " + key
    print(new_word)