In [9]:
# Training corpus: a single passage kept as one hard-wrapped string. Later cells
# fit the tokenizer on the whole string and split it on '\n' to build n-grams,
# so the line breaks below determine where training sequences start and end.
Text = '''To Sherlock Holmes she is always _the_ woman. I have seldom heard him
mention her under any other name. In his eyes she eclipses and
predominates the whole of her sex. It was not that he felt any emotion
akin to love for Irene Adler. All emotions, and that one particularly,
were abhorrent to his cold, precise but admirably balanced mind. He
was, I take it, the most perfect reasoning and observing machine that
the world has seen, but as a lover he would have placed himself in a
false position. He never spoke of the softer passions, save with a gibe
and a sneer. They were admirable things for the observer—excellent for
drawing the veil from men’s motives and actions. But for the trained
reasoner to admit such intrusions into his own delicate and finely
adjusted temperament was to introduce a distracting factor which might
throw a doubt upon all his mental results. Grit in a sensitive
instrument, or a crack in one of his own high-power lenses, would not
be more disturbing than a strong emotion in a nature such as his. And
yet there was but one woman to him, and that woman was the late Irene
Adler, of dubious and questionable memory.'''


In [6]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences, to_categorical
import pandas as pd

In [11]:
# Learn the vocabulary from the corpus. The passage is passed as a
# one-element list, i.e. as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([Text])

# One slot more than the number of distinct words: this value is reused as the
# embedding input size, the one-hot width and the softmax width further down.
# (Presumably the extra slot covers id 0, which pad_sequences fills with.)
total_words = 1 + len(tokenizer.word_index)

In [13]:
# Turn every line of the corpus into growing-prefix examples: for token ids
# [a, b, c, d] this yields [a, b], [a, b, c], [a, b, c, d]. Prefixes never cross
# a line break, and lines with fewer than two known tokens contribute nothing.
input_sequences = []
for raw_line in Text.split('\n'):
    line_ids = tokenizer.texts_to_sequences([raw_line])[0]
    input_sequences.extend(
        line_ids[:end] for end in range(2, len(line_ids) + 1)
    )

In [59]:
# The longest n-gram sets the common width. Shorter ones are padded on the
# left ('pre') so that the last column always holds the word to predict.
max_len = max(map(len, input_sequences))
padded_input_sequences = pad_sequences(
    input_sequences,
    maxlen=max_len,
    padding='pre',
)

In [21]:
# Inspect the padded matrix: one row per n-gram, left-padded to max_len columns.
padded_input_sequences

array([[  0,   0,   0, ...,   0,   4,  32],
       [  0,   0,   0, ...,   4,  32,  33],
       [  0,   0,   0, ...,  32,  33,  15],
       ...,
       [  0,   0,   0, ...,   8, 126,   3],
       [  0,   0,   0, ..., 126,   3, 127],
       [  0,   0,   0, ...,   3, 127, 128]], dtype=int32)

In [20]:
# Context = every column except the last; target = the last column, one-hot
# encoded over the vocabulary so it matches the softmax output layer.
X = padded_input_sequences[:, :-1]
y = to_categorical(padded_input_sequences[:, -1], num_classes=total_words)


In [26]:
# (number of n-grams, context width); the context width is max_len - 1.
X.shape

(190, 14)

In [28]:
# (number of n-grams, total_words) after one-hot encoding.
y.shape

(190, 129)

In [22]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [60]:
# LSTM next-word model: word ids -> 100-d embeddings -> final LSTM state
# (150 units) -> softmax over the vocabulary.
# NOTE(review): newer Keras releases reportedly deprecate `input_length` on
# Embedding — confirm against the installed version before upgrading.
model = Sequential()
model.add(Embedding(total_words, 100, input_length=max_len-1))
model.add(LSTM(150, return_sequences=False))
model.add(Dense(total_words, activation='softmax'))



In [30]:
# Targets are one-hot vectors (see to_categorical above), hence the
# categorical rather than sparse cross-entropy loss.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [37]:
# Print the layer-by-layer overview of the LSTM model.
model.summary()

In [49]:
# Train the LSTM model, holding out 20% of the n-grams for validation.
# NOTE(review): the saved log below already starts at ~0.93 training accuracy
# in "Epoch 1", which suggests this cell was re-run on already-trained weights;
# rebuild the model first for a clean run. val_accuracy stays at 0 in that log.
history = model.fit(X, y, epochs=200, batch_size=128, validation_split=0.2)

Epoch 1/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 151ms/step - accuracy: 0.9353 - loss: 0.6791 - val_accuracy: 0.0000e+00 - val_loss: 8.0886
Epoch 2/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 96ms/step - accuracy: 0.9327 - loss: 0.6824 - val_accuracy: 0.0000e+00 - val_loss: 8.0826
Epoch 3/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 133ms/step - accuracy: 0.9371 - loss: 0.6785 - val_accuracy: 0.0000e+00 - val_loss: 8.0295
Epoch 4/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 93ms/step - accuracy: 0.9467 - loss: 0.6544 - val_accuracy: 0.0000e+00 - val_loss: 8.0718
Epoch 5/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 104ms/step - accuracy: 0.9467 - loss: 0.6507 - val_accuracy: 0.0000e+00 - val_loss: 8.1443
Epoch 6/200
2/2 ━━━━━━━━━━━━━━━━━━━━ 0s 122ms/step - accuracy: 0.9371 - loss: 0.6458 - val_accuracy: 0.0000e+00 - val_loss: 8.1257
Epoch 7/200


In [54]:
def predict_next_word(seed_text, num_words=1):
    """Return the most likely next word(s) after ``seed_text``.

    The seed is tokenized with the notebook-level ``tokenizer``, left-padded to
    the ``max_len - 1`` context width used for training, and scored by whichever
    network is currently bound to the global ``model``.

    Args:
        seed_text: Text whose continuation should be predicted.
        num_words: How many candidate words to return.

    Returns:
        Up to ``num_words`` words ordered from most to least probable; an empty
        list when ``num_words`` is not positive.
    """
    if num_words <= 0:
        return []

    tokens = tokenizer.texts_to_sequences([seed_text])[0]
    # Pad to the training context width. `max_len` is the name defined by the
    # padding cell; `max_sequence_len` does not exist in this notebook.
    padded = pad_sequences([tokens], maxlen=max_len - 1, padding='pre')
    preds = model.predict(padded, verbose=0)[0]

    # Most probable ids first. Ids with no vocabulary entry (e.g. the padding
    # id 0) are skipped instead of raising KeyError.
    ranked = preds.argsort()[::-1]
    words = [tokenizer.index_word[idx] for idx in ranked if idx in tokenizer.index_word]
    return words[:num_words]


In [56]:
# Quick check: ask the current model for the single most likely next word.
print(predict_next_word("To sherlock"))

['holmes']


In [62]:
from keras.layers import Embedding, GRU, Dense


In [63]:
# GRU variant of the next-word model (64-d embeddings, 64 recurrent units).
# Rebinding `model` replaces the network built earlier in the notebook.
model = Sequential([
    Embedding(total_words, 64, input_length=max_len-1),
    GRU(64),
    Dense(total_words, activation='softmax'),
])



In [64]:
# Same training setup as before: one-hot targets with categorical
# cross-entropy, Adam optimizer, accuracy reported per epoch.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [70]:
# Call the method: a bare `model.summary` only echoes the bound-method object
# and never prints the layer table.
model.summary()

In [66]:
# Train the GRU model on every n-gram (no validation split in this run).
model.fit(
    X, y,
    epochs=200,
    verbose=1,
)

Epoch 1/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 3s 16ms/step - accuracy: 0.0050 - loss: 4.8611
Epoch 2/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 15ms/step - accuracy: 0.0653 - loss: 4.8476
Epoch 3/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step - accuracy: 0.0782 - loss: 4.8331
Epoch 4/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 17ms/step - accuracy: 0.1049 - loss: 4.8129
Epoch 5/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 17ms/step - accuracy: 0.0638 - loss: 4.7916
Epoch 6/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step - accuracy: 0.0675 - loss: 4.7360
Epoch 7/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step - accuracy: 0.0332 - loss: 4.6284
Epoch 8/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 16ms/step - accuracy: 0.0523 - loss: 4.5922
Epoch 9/200
6/6 ━━━━━━━━━━━━━━━━━━━━ [output truncated]

<keras.src.callbacks.history.History at 0x7c701b339990>

In [67]:
def predict_next_word(seed_text, num_words=1):
    """Return the most likely next word(s) after ``seed_text``.

    The seed is tokenized with the notebook-level ``tokenizer``, left-padded to
    the ``max_len - 1`` context width used for training, and scored by whichever
    network is currently bound to the global ``model``.

    Args:
        seed_text: Text whose continuation should be predicted.
        num_words: How many candidate words to return.

    Returns:
        Up to ``num_words`` words ordered from most to least probable; an empty
        list when ``num_words`` is not positive.
    """
    if num_words <= 0:
        return []

    tokens = tokenizer.texts_to_sequences([seed_text])[0]
    # Pad to the training context width. `max_len` is the name defined by the
    # padding cell; `max_sequence_len` does not exist in this notebook.
    padded = pad_sequences([tokens], maxlen=max_len - 1, padding='pre')
    preds = model.predict(padded, verbose=0)[0]

    # Most probable ids first. Ids with no vocabulary entry (e.g. the padding
    # id 0) are skipped instead of raising KeyError.
    ranked = preds.argsort()[::-1]
    words = [tokenizer.index_word[idx] for idx in ranked if idx in tokenizer.index_word]
    return words[:num_words]

In [68]:
# Quick check of the model currently bound to `model`, using a one-word seed.
print(predict_next_word("To "))

['sherlock']


In [75]:
from keras.layers import Embedding, SimpleRNN, Dense


In [76]:
# SimpleRNN variant of the next-word model (64-d embeddings, 64 recurrent
# units). Rebinding `model` replaces the network built earlier in the notebook.
model = Sequential([
    Embedding(total_words, 64, input_length=max_len-1),
    SimpleRNN(64),
    Dense(total_words, activation='softmax'),
])



In [77]:
# Same training setup as before: one-hot targets with categorical
# cross-entropy, Adam optimizer, accuracy reported per epoch.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [78]:
# Call the method: a bare `model.summary` only echoes the bound-method object
# and never prints the layer table.
model.summary()

In [79]:
# Train the SimpleRNN model on every n-gram (no validation split in this run).
model.fit(
    X, y,
    epochs=200,
    verbose=1,
)

Epoch 1/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 2s 10ms/step - accuracy: 0.0000e+00 - loss: 4.8688
Epoch 2/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.0206 - loss: 4.7857
Epoch 3/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.0414 - loss: 4.7113
Epoch 4/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 11ms/step - accuracy: 0.0791 - loss: 4.6289
Epoch 5/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.0961 - loss: 4.5560
Epoch 6/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 14ms/step - accuracy: 0.0902 - loss: 4.4599
Epoch 7/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.1343 - loss: 4.4411
Epoch 8/200
6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 10ms/step - accuracy: 0.1434 - loss: 4.3418
Epoch 9/200
6/6 ━━━━━━━━━━━━━━━━━━━━ [output truncated]

<keras.src.callbacks.history.History at 0x7c701e68af50>

In [80]:
def predict_next_word(seed_text, num_words=1):
    """Return the most likely next word(s) after ``seed_text``.

    The seed is tokenized with the notebook-level ``tokenizer``, left-padded to
    the ``max_len - 1`` context width used for training, and scored by whichever
    network is currently bound to the global ``model``.

    Args:
        seed_text: Text whose continuation should be predicted.
        num_words: How many candidate words to return.

    Returns:
        Up to ``num_words`` words ordered from most to least probable; an empty
        list when ``num_words`` is not positive.
    """
    if num_words <= 0:
        return []

    tokens = tokenizer.texts_to_sequences([seed_text])[0]
    # Pad to the training context width. `max_len` is the name defined by the
    # padding cell; `max_sequence_len` does not exist in this notebook.
    padded = pad_sequences([tokens], maxlen=max_len - 1, padding='pre')
    preds = model.predict(padded, verbose=0)[0]

    # Most probable ids first. Ids with no vocabulary entry (e.g. the padding
    # id 0) are skipped instead of raising KeyError.
    ranked = preds.argsort()[::-1]
    words = [tokenizer.index_word[idx] for idx in ranked if idx in tokenizer.index_word]
    return words[:num_words]

In [81]:
# Quick check of the model currently bound to `model`, using a one-word seed.
print(predict_next_word("To "))

['sherlock']
