##### Next Word Predictor Project using LSTM Architecture

In [242]:
import warnings
warnings.filterwarnings('ignore')

with open('cricket_data.txt', 'r') as file:
    data = file.read() # Reading the entire file

data = data.lower().replace('\n', ' ').replace('  ', ' ').strip()
print(data) # Our cricket data is loaded to the data variable

cricket is not merely a game played with bat and ball; it is a sprawling epic, a multi-generational saga written on fields of green across the globe. it is a language of emotion, understood through the visceral crack of willow on leather, the collective gasp of a stadium, and the silent, agonizing wait for an umpire's decision. it is a heartbeat that synchronizes millions, a shared rhythm that transcends borders, cultures, and time itself. from the dusty, sun-baked maidans of mumbai where children learn to defend with makeshift bats, to the hallowed turf of lord's in london where history is etched with every delivery, the spirit of cricket courses like a lifeblood. every ball bowled is a question, every shot played an answer. every session is a chapter, every match a self-contained drama, and every series an unfolding novel filled with heroes, villains, triumphs, and heartbreaks. imagine the first light of a test match morning, a sacred ritual for the purist. the sky, a soft canvas of 

In [243]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras as kr
from tensorflow.keras.preprocessing.text import Tokenizer

In [244]:
tokenizer = Tokenizer(oov_token = '<oov>')
tokenizer.fit_on_texts([data]) # Fitting the tokenizer on our textual data. Note, we passed our data in the form of a list because there can be multiple texts. So we need to pass them in the form of a list
print('Word Indices:', tokenizer.word_index)

Word Indices: {'<oov>': 1, 'the': 2, 'a': 3, 'of': 4, 'and': 5, 'in': 6, 'is': 7, 'to': 8, 'that': 9, 'cricket': 10, 'for': 11, 'it': 12, 'with': 13, 'on': 14, 'was': 15, 'ball': 16, 'his': 17, 'an': 18, 'their': 19, 'by': 20, 'are': 21, 'game': 22, 'from': 23, 'like': 24, 'who': 25, 'players': 26, 'has': 27, 'test': 28, 'or': 29, 'as': 30, 'world': 31, 'into': 32, 'match': 33, 'its': 34, 'can': 35, 'one': 36, 'most': 37, 'at': 38, 'sport': 39, 'team': 40, 'have': 41, 'day': 42, 'bowling': 43, 'this': 44, 'where': 45, 'he': 46, 'new': 47, 'often': 48, 'australia': 49, 'they': 50, 'bowler': 51, 'all': 52, 'but': 53, 'batsman': 54, 'every': 55, 'play': 56, 'between': 57, 'batting': 58, 'not': 59, 'field': 60, 'off': 61, 'art': 62, 'be': 63, 'bat': 64, 'cricketing': 65, 'runs': 66, 'over': 67, 'swing': 68, 'modern': 69, 'fast': 70, 'there': 71, 'india': 72, 't20': 73, 'run': 74, 'fans': 75, 'then': 76, 'played': 77, 'history': 78, 'delivery': 79, 'series': 80, 'will': 81, 'overs': 82, 'wh

In [245]:
input_sequences = list()
for sentence in data.split('.'):
    # print(sentence, end = ' | ') # To see the sentences in our data
    tokenized_sent = tokenizer.texts_to_sequences([sentence])[0]
    # print(tokenized_sent, end = ' | ') # To see the tokenized sentence
    for i in range(1, len(tokenized_sent)):
        input_sequences.append(tokenized_sent[:i+1]) # From the starting till i (i+1 is excluded as per rule)

print(input_sequences)

[[10, 7], [10, 7, 59], [10, 7, 59, 1028], [10, 7, 59, 1028, 3], [10, 7, 59, 1028, 3, 22], [10, 7, 59, 1028, 3, 22, 77], [10, 7, 59, 1028, 3, 22, 77, 13], [10, 7, 59, 1028, 3, 22, 77, 13, 64], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5, 16], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5, 16, 12], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5, 16, 12, 7], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5, 16, 12, 7, 3], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5, 16, 12, 7, 3, 1029], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5, 16, 12, 7, 3, 1029, 381], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5, 16, 12, 7, 3, 1029, 381, 3], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5, 16, 12, 7, 3, 1029, 381, 3, 272], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5, 16, 12, 7, 3, 1029, 381, 3, 272, 1030], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5, 16, 12, 7, 3, 1029, 381, 3, 272, 1030, 1031], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5, 16, 12, 7, 3, 1029, 381, 3, 272, 1030, 1031, 382], [10, 7, 59, 1028, 3, 22, 77, 13, 64, 5,

In [246]:
# Now to adjust all the input sequences to a same dimension, we need to find the max length sentence and apply zero padding
length = [len(x) for x in input_sequences]
max_len = max(length)
print('Max length:', max_len)

Max length: 60


In [247]:
from keras.utils import pad_sequences
padded_input_sequences = pad_sequences(sequences = input_sequences, maxlen = max_len, padding = 'pre') # We applied padding at the starting because we want to create an input output behaviour. We want to keep the output at the end of sequence
padded_input_sequences

array([[   0,    0,    0, ...,    0,   10,    7],
       [   0,    0,    0, ...,   10,    7,   59],
       [   0,    0,    0, ...,    7,   59, 1028],
       ...,
       [   0,    0,    0, ..., 2744,  200,    5],
       [   0,    0,    0, ...,  200,    5,   88],
       [   0,    0,    0, ...,    5,   88, 2745]],
      shape=(8840, 60), dtype=int32)

In [248]:
# Now to create a input output behaviour out of the sequences, as we can see that, the last number in the sequence is the output and the rest all of them are the input..
X = padded_input_sequences[:, :-1] # All rows and all columns excluding the last (-1) indexed col
y = padded_input_sequences[:, -1] # All rows and only the -1 column
print(X, '\n\n',  y)

[[   0    0    0 ...    0    0   10]
 [   0    0    0 ...    0   10    7]
 [   0    0    0 ...   10    7   59]
 ...
 [   0    0    0 ...   81 2744  200]
 [   0    0    0 ... 2744  200    5]
 [   0    0    0 ...  200    5   88]] 

 [   7   59 1028 ...    5   88 2745]


In [249]:
# Shape
X.shape, y.shape

((8840, 59), (8840,))

In [250]:
# We need our vocabulary size for the num_classes and input_dimension
vocab_size = len(tokenizer.word_index)

In [251]:
# Now we will apply One Hot Encoding on our data
from tensorflow.keras.utils import to_categorical
y = to_categorical(y, num_classes = vocab_size + 1) # We added an extra 1 because in tokenizer, words are tokenized starting from 1, and OHE always starts from index 0. So if we do not give the extra 1, the last word will always be missed
y.shape

(8840, 2746)

##### Model Creation

In [252]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
model = Sequential()
model.add(Embedding(input_dim = vocab_size + 1, output_dim = 100, input_shape = (max_len,))) # Output_dimension is our choice. It is a hyperparameter which we can tune to find out the best results. Input_shape is the size of each input in our input sequence. Embedding layer does a very simple job. It takes each one of the unique word in our vocabulary and converts it into a 100 (since here our output_dim is 100) dimension numeric vector.
model.add(LSTM(units = 150))
model.add(Dense(units = vocab_size + 1, activation = 'softmax'))

##### Compilation and Fitting

In [253]:
model.compile(optimizer = kr.optimizers.Adam(), loss = kr.losses.categorical_crossentropy, metrics = ['accuracy'])
history = model.fit(X, y, epochs = 100, verbose = 1, validation_split = 0.1)
model.summary()

Epoch 1/100
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 32ms/step - accuracy: 0.0698 - loss: 6.9367 - val_accuracy: 0.0713 - val_loss: 6.8190
Epoch 2/100
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 31ms/step - accuracy: 0.0853 - loss: 6.3843 - val_accuracy: 0.0939 - val_loss: 6.8773
Epoch 3/100
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 31ms/step - accuracy: 0.0979 - loss: 6.1811 - val_accuracy: 0.0995 - val_loss: 6.9700
Epoch 4/100
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.1070 - loss: 5.9795 - val_accuracy: 0.0973 - val_loss: 7.0308
Epoch 5/100
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 30ms/step - accuracy: 0.1110 - loss: 5.7667 - val_accuracy: 0.1018 - val_loss: 7.1097
Epoch 6/100
[1m249/249[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 30ms/step - accuracy: 0.1254 - loss: 5.5336 - val_accuracy: 0.0916 - val_loss: 7.1963
Epoch 7/100
[1m

##### Prediction of the next 'n' words

In [258]:
n = int(input('How many words you want to be predicted? '))
new_word = input('Enter a word: ').lower()
for _ in range(n):
    tokenized_new_word = tokenizer.texts_to_sequences([new_word])[0]
    padded_tokenized_new_word = pad_sequences([tokenized_new_word], maxlen = max_len - 1, padding = 'pre') # max_len - 1 for padding is used because our model input (X) was padded_input_sequences[:, :-1]
    print(padded_tokenized_new_word)

    raw_prob = model.predict(padded_tokenized_new_word) # Returns the raw probability vector
    pred = np.argmax(raw_prob) # This returns the highest probability
    conf = np.max(raw_prob)
    for key, val in tokenizer.word_index.items():
        if val == pred:
            # print(f'Predicted word is "{key}" with a confidence of {conf:.4f}.')
            new_word = new_word + " " + key
            print(new_word)
            break

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    2  610
  1095    4   10]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
the primary objective of cricket is
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    2  610 1095
     4   10    7]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
the primary objective of cricket is for
[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0 