#**Importing Libraries**

In [None]:
# Dictionary operations
import itertools

# Data preprocessing
import numpy as np
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences

# Model Training
import keras
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional

#**Reading the text document**

In [None]:
# Load the whole story into memory as a single string. The encoding is given
# explicitly so decoding does not depend on the platform default.
with open('SherlockHolmesStory.txt', mode='r', encoding='utf-8') as story_file:
  text = story_file.read()
# Bare expression: when run as a notebook cell this echoes the loaded text.
text

#**Tokenizing the text**

In [None]:
# Tokenization breaks a piece of text down into smaller units called tokens.
# The tokenizer is fitted on the full story, passed as a one-element list.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Vocabulary size plus one extra slot; index 0 is used as the padding value
# further down (assumes word indices start at 1 - Keras Tokenizer convention).
vocabulary = tokenizer.word_index
total_words = len(vocabulary) + 1
print("Total no. of unique words in the whole book :", total_words)

# Preview the first 10 (word -> index) entries of the vocabulary
vocabulary_preview = dict(itertools.islice(vocabulary.items(), 10))
print(vocabulary_preview, '...')

#**Preparing our dataset**

In [None]:
# Build the training set of n-gram prefixes.
#
# N-Gram:
# Text    ==> This is a Big Data AI Book
# Unigram ==> This, is, a, Big, Data, AI, Book
# Bigram  ==> This is, is a, a Big, Big Data, Data AI, AI Book
# Trigram ==> This is a, is a Big, a Big Data, Big Data AI, Data AI Book
#
# Here every line of the text contributes all of its prefixes of length >= 2,
# e.g. tokens [a, b, c, d] yield [a, b], [a, b, c] and [a, b, c, d].
input_sequences = []
token_list_chk = []  # per-line token lists, kept only for the sanity print below
for line in text.split('\n'):
  token_list = tokenizer.texts_to_sequences([line])[0]
  token_list_chk.append(token_list)

  # Prefixes from the bi-gram up to the whole line; lines with fewer than two
  # known tokens contribute nothing.
  input_sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

# Slices (rather than fixed indices) so short texts do not raise IndexError.
print("Sentences after applying separator of \\n : ", token_list_chk[:5], '|| where [1, 1561, 5, 129, 34] ==> [the adventures of sherlock holmes]')
print("The sentences after N-gram :", input_sequences[:4])

# Without at least one line of two or more tokens there is nothing to train on;
# fail with a clear message instead of max()'s "empty sequence" error.
if not input_sequences:
  raise ValueError("No n-gram sequences could be built: the text has no line with at least two known tokens")

# Extracting the maximum length among the sentences' length
max_sequence_len = max(map(len, input_sequences))

# Left-pad every sequence with zeroes to the same length. pad_sequences already
# returns a 2-D numpy array, so no extra np.array() copy is needed.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
print("Input after padding zeroes :", input_sequences[0])

# 'X' holds every token of each row except the last one: the input context.
# 'y' holds the last column: the target word the model has to predict.
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

#**DO THIS STEP ONLY IF YOU HAVE A HUGE AMOUNT OF RAM**

In [None]:
# One-hot encoding our output parameters (Forming categories out of the whole set)
# NOTE: the two triple-quoted blocks below are bare string literals used as
# notes; nothing inside them is executed. `y` therefore stays as the integer
# word indices taken from the last column of `input_sequences`, and the model
# is later compiled with loss='sparse_categorical_crossentropy' instead of the
# 'categorical_crossentropy' variant shown here.
''' y = np.array(keras.utils.to_categorical(y, num_classes = total_words))
    Note: We skip this step since the categorical encoding of nearly 100,000 output values exhausts all of the RAM memory and system crashes!!'''
'''If your system is capable of doing this, then compile the model as:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
'''

#**Creating model**

In [None]:
# Next-word prediction network, declared as a single layer list:
#   1. Embedding - maps each word index to a 100-dimensional vector; the input
#      length is max_sequence_len - 1 because the last token of every padded
#      sequence is split off as the target.
#   2. LSTM      - 150 units reading the embedded context.
#   3. Dense     - softmax over the whole vocabulary (total_words classes).
model = Sequential([
  Embedding(total_words, 100, input_length=max_sequence_len - 1),
  LSTM(150),
  Dense(total_words, activation='softmax'),
])
model.summary()

#**Training**

In [None]:
# The targets in `y` are plain integer word indices (not one-hot vectors),
# hence the sparse variant of categorical cross-entropy.
model.compile(
  optimizer='adam',
  loss='sparse_categorical_crossentropy',
  metrics=['accuracy'],
)
# Train for 100 epochs with per-epoch progress output.
model.fit(x=X, y=y, epochs=100, verbose=1)

#**Output prediction**

In [None]:
# Generate `next_words` words, one at a time: each predicted word is appended
# to `seed_text`, which is then fed back in as the context for the next step.
seed_text = "I will close the door if"
next_words = 5

for _ in range(next_words):
  # Convert the current text to token indices
  token_list = tokenizer.texts_to_sequences([seed_text])[0]
  # Pad sequences to the input length the model was built with
  token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
  # Model prediction: argmax over the vocabulary axis gives a shape-(1,) array
  # for this single-row batch; take its only element as a plain int.
  predicted = int(np.argmax(model.predict(token_list), axis=-1)[0])
  # Direct index -> word lookup instead of scanning word_index on every step.
  # An index with no word (e.g. 0, the padding value) yields "".
  output_word = tokenizer.index_word.get(predicted, "")
  if not output_word:
    # Nothing to append, so the context would not change and every further
    # step would repeat the same prediction; stop instead of adding blanks.
    break
  seed_text += " " + output_word
print(seed_text)