In [1]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Load the whole corpus into memory as a single string; the tokenizer and the
# n-gram builder further down both work from this `text` variable.
with open('sherlock-holm.es_stories_plain-text_advs.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

In [3]:
# Fit a word-level tokenizer on the full corpus (passed as a one-document list).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Word indices start at 1 (see the word_index output below), so one extra slot
# is added; this value sizes the Embedding input and the softmax output.
total_words = 1 + len(tokenizer.word_index)

In [33]:
# Inspect the word -> integer index mapping that fit_on_texts built above.
tokenizer.word_index

{'the': 1,
 'and': 2,
 'i': 3,
 'to': 4,
 'of': 5,
 'a': 6,
 'in': 7,
 'that': 8,
 'it': 9,
 'he': 10,
 'you': 11,
 'was': 12,
 'his': 13,
 'is': 14,
 'my': 15,
 'have': 16,
 'as': 17,
 'with': 18,
 'had': 19,
 'which': 20,
 'at': 21,
 'for': 22,
 'but': 23,
 'me': 24,
 'not': 25,
 'be': 26,
 'we': 27,
 'from': 28,
 'there': 29,
 'this': 30,
 'said': 31,
 'upon': 32,
 'so': 33,
 'holmes': 34,
 'him': 35,
 'her': 36,
 'she': 37,
 "'": 38,
 'very': 39,
 'your': 40,
 'been': 41,
 'all': 42,
 'on': 43,
 'no': 44,
 'what': 45,
 'one': 46,
 'then': 47,
 'were': 48,
 'by': 49,
 'are': 50,
 'an': 51,
 'would': 52,
 'out': 53,
 'when': 54,
 'up': 55,
 'man': 56,
 'could': 57,
 'has': 58,
 'do': 59,
 'into': 60,
 'mr': 61,
 'who': 62,
 'little': 63,
 'will': 64,
 'if': 65,
 'some': 66,
 'now': 67,
 'see': 68,
 'down': 69,
 'should': 70,
 'our': 71,
 'or': 72,
 'they': 73,
 'may': 74,
 'well': 75,
 'am': 76,
 'us': 77,
 'over': 78,
 'more': 79,
 'think': 80,
 'room': 81,
 'know': 82,
 'shall': 83

- In the above code, the text is tokenized, which means it is divided into individual words or tokens. The ‘Tokenizer’ object is created, which will handle the tokenization process.
- The ‘fit_on_texts’ method of the tokenizer is called, passing the ‘text’ as input. This method analyzes the text and builds a vocabulary of unique words, assigning each word a numerical index.
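- The ‘total_words’ variable is then assigned the length of the word index plus one. The extra one accounts for index 0, which is never assigned to a word (word indices start at 1) and is later used as the padding value, so ‘total_words’ is the vocabulary size that the model’s layers must cover.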


Now let’s create input-output pairs by splitting the text into sequences of tokens and forming n-grams from the sequences:

In [37]:
# Build the training examples: for every line of the corpus, emit each prefix
# of its token ids that has at least two tokens. The final token of a prefix
# is the word to predict; everything before it is the context.
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        [token_list[:prefix_len] for prefix_len in range(2, len(token_list) + 1)]
    )

- In the above code, the text data is split into lines using the ‘\n’ character as a delimiter.
- For each line in the text, the ‘texts_to_sequences’ method of the tokenizer is used to convert the line into a sequence of numerical tokens based on the previously created vocabulary.
- The resulting token list is then iterated over using a for loop. For each iteration, a subsequence, or n-gram, of tokens is extracted, ranging from the beginning of the token list up to the current index ‘i’.



- This n-gram sequence represents the input context, with the last token being the target or predicted word.
- This n-gram sequence is then appended to the ‘input_sequences’ list.
- This process is repeated for all lines in the text, generating multiple input-output sequences that will be used for training the next word prediction model.

Now let’s pad the input sequences to have equal length:

In [7]:
# The longest n-gram fixes the common width of the training matrix.
max_sequence_len = max(map(len, input_sequences))

# Left-pad every shorter sequence so the target word stays in the last column.
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

- In the above code, the input sequences are padded to ensure all sequences have the same length.
- The variable ‘max_sequence_len’ is assigned the maximum length among all the input sequences.
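- The ‘pad_sequences’ function is used to pad the input sequences to this maximum length. Because ‘maxlen’ is set to the length of the longest sequence, no sequence is actually truncated here; shorter ones are only padded.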

- The ‘pad_sequences’ function takes the input_sequences list, sets the maximum length to ‘max_sequence_len’, and specifies that the padding should be added at the beginning of each sequence using the ‘padding=pre’ argument.
- Finally, the input sequences are converted into a numpy array to facilitate further processing.

Now let’s split the sequences into input and output:

In [9]:
# Context tokens are every column but the last; the last column is the target.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

- In the above code, the input sequences are split into two arrays, ‘X’ and ‘y’, to create the input and output for training the next word prediction model.
- The ‘X’ array is assigned the values of all rows in the ‘input_sequences’ array except for the last column.
- It means that ‘X’ contains all the tokens in each sequence except for the last one, representing the input context.


- On the other hand, the ‘y’ array is assigned the values of the last column in the ‘input_sequences’ array, which represents the target or predicted word.

Now let’s convert the output to one-hot encoded vectors:

In [11]:
# One-hot encode the integer targets, one column per vocabulary index
# (`total_words` columns in total).
# NOTE: `y` is overwritten with its own encoding, so this cell must run exactly
# once after the X/y split; re-running it would pass the already-encoded array
# back into to_categorical.
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))

- In the above code, we are converting the output array into a suitable format for training a model, where each target word is represented as a binary vector.

Now let’s build a neural network architecture to train the model:

In [13]:
# Hyperparameters for this cell, named so they can be tuned in one place.
EMBEDDING_DIM = 100   # size of each word vector
LSTM_UNITS = 150      # hidden units in each LSTM layer
EPOCHS = 50           # full passes over the training data

# Architecture: embedding -> two stacked LSTMs -> softmax over the vocabulary.
model = Sequential()
# NOTE(review): newer Keras releases appear to flag `input_length` as
# deprecated -- confirm against the installed version before removing it.
model.add(Embedding(total_words, EMBEDDING_DIM, input_length=max_sequence_len - 1))
# return_sequences=True so the second LSTM receives one vector per time step.
model.add(LSTM(LSTM_UNITS, return_sequences=True))
model.add(LSTM(LSTM_UNITS))
model.add(Dense(total_words, activation="softmax"))

# Targets are one-hot vectors, hence categorical (not sparse) cross-entropy.
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=["accuracy"])
history = model.fit(X, y, epochs=EPOCHS, verbose=1)

# Show the layer-by-layer summary; print(model) would only show the object repr.
model.summary()



Epoch 1/50
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 126s 39ms/step - accuracy: 0.0589 - loss: 6.6143
Epoch 2/50
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 118s 39ms/step - accuracy: 0.0940 - loss: 5.7634
Epoch 3/50
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 128s 42ms/step - accuracy: 0.1204 - loss: 5.4191
Epoch 4/50
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 119s 39ms/step - accuracy: 0.1365 - loss: 5.1807
Epoch 5/50
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 119s 40ms/step - accuracy: 0.1465 - loss: 4.9725
Epoch 6/50
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 117s 39ms/step - accuracy: 0.1551 - loss: 4.7832
Epoch 7/50
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 140s 46ms/step - accuracy: 0.1616 - loss: 4.6221
Epoch 8/50
3010/3010 ━━━━━━━━━━━━━━━━━━━━ 142s 47ms/step - accuracy: 0.1677 - loss: 4.4655


- The code above defines the model architecture for the next word prediction model.
- The ‘Sequential’ model is created, which represents a linear stack of layers.
- The first layer added to the model is the ‘Embedding’ layer, which is responsible for converting the input sequences into dense vectors of fixed size. It takes three arguments:

1. ‘total_words’, which is the size of the vocabulary (the number of distinct words plus one for the reserved index 0);
2. ‘100’, which denotes the dimensionality of the word embeddings;
3. ‘input_length’, which specifies the length of the input sequences.

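- The next layer added is the ‘LSTM’ layer, a type of recurrent neural network (RNN) layer designed for capturing sequential dependencies in the data. It has 150 units, which means it will learn 150 internal representations or memory cells. Because ‘return_sequences=True’ is set, it outputs its hidden state at every time step rather than only at the last one, which is the input format the following LSTM layer requires.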

- The next layer added is a second ‘LSTM’ layer, also with 150 units, stacked on top of the first. It does not set ‘return_sequences’, so it outputs only its final hidden state, a single vector summarizing the whole input context.

- Finally, the ‘Dense’ layer is added, which is a fully connected layer that produces the output predictions.
- It has ‘total_words’ units and uses the ‘softmax’ activation function to convert the predicted scores into probabilities, indicating the likelihood of each word being the next one in the sequence.

The same cell then compiles and trains the model:


- The ‘compile’ method configures the model for training.
- The ‘loss’ parameter is set to ‘categorical_crossentropy’, a commonly used loss function for multi-class classification problems.
- The ‘optimizer’ parameter is set to ‘adam’, an optimization algorithm that adapts the learning rate during training.


- The ‘metrics’ parameter is set to ‘accuracy’ to monitor the accuracy during training.
- After compiling the model, the ‘fit’ method is called to train the model on the input sequences ‘X’ and the corresponding output ‘y’.
- The ‘epochs’ parameter specifies the number of times the training process will iterate over the entire dataset.
- The ‘verbose’ parameter is set to ‘1’ to display the training process.



In [19]:
def generate_next_words(text, next_words):
    """Greedily extend ``text`` with up to ``next_words`` predicted words.

    Relies on the notebook-level ``tokenizer``, ``model`` and
    ``max_sequence_len`` created in earlier cells.

    Args:
        text: Seed string to continue.
        next_words: Maximum number of words to append.

    Returns:
        The seed string followed by the predicted words, separated by single
        spaces. Fewer than ``next_words`` words are appended if the model
        predicts an index with no word attached (e.g. the padding index 0).
    """
    context_len = max_sequence_len - 1  # the model was trained on this input width

    for _ in range(next_words):
        token_ids = tokenizer.texts_to_sequences([text])[0]
        padded = pad_sequences([token_ids], maxlen=context_len, padding='pre')

        probabilities = model.predict(padded, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # Reverse lookup via the tokenizer's own index -> word map instead of
        # scanning word_index on every step.
        output_word = tokenizer.index_word.get(predicted)
        if not output_word:
            # No word for this index: the input would not change, so the next
            # prediction would be identical. Stop instead of appending blanks.
            break

        text += " " + output_word

    return text



In [31]:

# Keep the prompt in its own variable: assigning it to `text` would overwrite
# the corpus loaded in the first cell and break any re-run of the cells that
# build the training sequences from it.
seed_text = "You would"
next_words = 5
generated_text = generate_next_words(seed_text, next_words)
print(generated_text)


You would not advise me that you
