In [29]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sherlock-holmes-next-word-prediction-corpus/Sherlock Holmes.txt


## import the libraries

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [2]:
import re
def preprocess(text):
    text = text.lower()
#     remove non alphanumeric or whitespace characters
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip()
    text = ' '.join([word for word in text.split() if len(word) > 1]) 
    return text

In [33]:
file = open("/kaggle/input/sherlock-holmes-next-word-prediction-corpus/Sherlock Holmes.txt", "r", encoding = "utf8")
lines = []
for i in file:
    preprocessed_i = preprocess(i)
    if preprocessed_i != '':
        lines.append(preprocessed_i)
    
print("The First Line: ", lines[0])
print("The Last Line: ", lines[-1])
lines

The First Line:  the adventures of sherlock holmes
The Last Line:  this text comes from the collections version 31


['the adventures of sherlock holmes',
 'arthur conan doyle',
 'table of contents',
 'scandal in bohemia',
 'the redheaded league',
 'case of identity',
 'the boscombe valley mystery',
 'the five orange pips',
 'the man with the twisted lip',
 'the adventure of the blue carbuncle',
 'the adventure of the speckled band',
 'the adventure of the engineers thumb',
 'the adventure of the noble bachelor',
 'the adventure of the beryl coronet',
 'the adventure of the copper beeches',
 'scandal in bohemia',
 'table of contents',
 'chapter',
 'chapter',
 'chapter',
 'chapter',
 'to sherlock holmes she is always the woman have seldom heard him',
 'mention her under any other name in his eyes she eclipses and',
 'predominates the whole of her sex it was not that he felt any',
 'emotion akin to love for irene adler all emotions and that one',
 'particularly were abhorrent to his cold precise but admirably',
 'balanced mind he was take it the most perfect reasoning and',
 'observing machine that the

In [4]:
tokenizer = Tokenizer()

In [5]:
tokenizer.fit_on_texts(lines)

In [6]:
tokenizer.index_word

{1: 'the',
 2: 'and',
 3: 'to',
 4: 'of',
 5: 'in',
 6: 'that',
 7: 'it',
 8: 'you',
 9: 'he',
 10: 'was',
 11: 'his',
 12: 'is',
 13: 'my',
 14: 'have',
 15: 'as',
 16: 'with',
 17: 'had',
 18: 'which',
 19: 'at',
 20: 'for',
 21: 'but',
 22: 'not',
 23: 'me',
 24: 'be',
 25: 'we',
 26: 'there',
 27: 'from',
 28: 'this',
 29: 'said',
 30: 'upon',
 31: 'holmes',
 32: 'so',
 33: 'him',
 34: 'her',
 35: 'she',
 36: 'very',
 37: 'your',
 38: 'been',
 39: 'no',
 40: 'all',
 41: 'what',
 42: 'on',
 43: 'one',
 44: 'then',
 45: 'were',
 46: 'by',
 47: 'are',
 48: 'an',
 49: 'would',
 50: 'when',
 51: 'out',
 52: 'up',
 53: 'man',
 54: 'could',
 55: 'has',
 56: 'do',
 57: 'into',
 58: 'mr',
 59: 'who',
 60: 'little',
 61: 'will',
 62: 'if',
 63: 'some',
 64: 'now',
 65: 'see',
 66: 'down',
 67: 'should',
 68: 'our',
 69: 'or',
 70: 'they',
 71: 'may',
 72: 'am',
 73: 'well',
 74: 'us',
 75: 'over',
 76: 'more',
 77: 'think',
 78: 'know',
 79: 'about',
 80: 'shall',
 81: 'can',
 82: 'before',


In [7]:
input_sequences = []
for sentence in lines:
    tokenized_sentence = tokenizer.texts_to_sequences([sentence])[0]
    # tokenizer.texts_to_sequences([sentence]) each sentence is a 2d list so take 0th idx

    # target= we have to generate [93, 1], [93, 1, 13], [93, 1, 13, 20] from [[93, 1, 13, 20]]
    # for every sentence this loop will run (n - 1) times the length of that sentence\
    # to avoid single element like [93] we took i from 1 not 0. But slicing starts from 0
    for i in range (1, len(tokenized_sentence)):
        input_sequences.append(tokenized_sentence[:i+1])

In [8]:
input_sequences

[[1, 1510],
 [1, 1510, 4],
 [1, 1510, 4, 126],
 [1, 1510, 4, 126, 31],
 [597, 4400],
 [597, 4400, 4401],
 [254, 4],
 [254, 4, 1511],
 [792, 5],
 [792, 5, 844],
 [1, 524],
 [1, 524, 631],
 [112, 4],
 [112, 4, 2022],
 [1, 653],
 [1, 653, 1324],
 [1, 653, 1324, 478],
 [1, 306],
 [1, 306, 913],
 [1, 306, 913, 845],
 [1, 53],
 [1, 53, 16],
 [1, 53, 16, 1],
 [1, 53, 16, 1, 977],
 [1, 53, 16, 1, 977, 846],
 [1, 553],
 [1, 553, 4],
 [1, 553, 4, 1],
 [1, 553, 4, 1, 440],
 [1, 553, 4, 1, 440, 1325],
 [1, 553],
 [1, 553, 4],
 [1, 553, 4, 1],
 [1, 553, 4, 1, 1740],
 [1, 553, 4, 1, 1740, 736],
 [1, 553],
 [1, 553, 4],
 [1, 553, 4, 1],
 [1, 553, 4, 1, 2406],
 [1, 553, 4, 1, 2406, 654],
 [1, 553],
 [1, 553, 4],
 [1, 553, 4, 1],
 [1, 553, 4, 1, 737],
 [1, 553, 4, 1, 737, 1199],
 [1, 553],
 [1, 553, 4],
 [1, 553, 4, 1],
 [1, 553, 4, 1, 2023],
 [1, 553, 4, 1, 2023, 377],
 [1, 553],
 [1, 553, 4],
 [1, 553, 4, 1],
 [1, 553, 4, 1, 793],
 [1, 553, 4, 1, 793, 847],
 [792, 5],
 [792, 5, 844],
 [254, 4],
 [254

In [9]:
max_len = max([len(x) for x in input_sequences]) #57
max_len

17

In [10]:
padded_input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

## Create X and Y 

In [11]:
X = padded_input_sequences[:, :-1]
y = padded_input_sequences[:, -1]
print(X.shape)
print(y.shape)

(89274, 16)
(89274,)


In [12]:
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8400

In [13]:
y = to_categorical(y, num_classes = vocab_size)

In [14]:
y.shape

(89274, 8400)

## Architechture

In [15]:
vocab_size
max_len

17

In [16]:
# model = Sequential()
# model.add(Embedding(8400, 1000, input_length = (max_len - 1))) #bcuz x does not contain the last layer(output) so size = max_len - 1
# model.add(LSTM(200, return_sequences=False))
# model.add(Dense(vocab_size, activation = 'softmax'))

In [17]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input

In [18]:
input_layer = Input(shape=(max_len - 1,))
embedding_layer = Embedding(input_dim=vocab_size, output_dim=500)(input_layer)
lstm_layer = LSTM(256)(embedding_layer)
output_layer = Dense(vocab_size, activation='softmax')(lstm_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.summary()

In [19]:
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [20]:
# model.summary()

In [37]:
model.fit(X, y, epochs = 30)

Epoch 1/30
[1m2790/2790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 10ms/step - accuracy: 0.7145 - loss: 1.3314
Epoch 2/30
[1m2790/2790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 10ms/step - accuracy: 0.7617 - loss: 1.1237
Epoch 3/30
[1m2790/2790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 10ms/step - accuracy: 0.7961 - loss: 0.9686
Epoch 4/30
[1m2790/2790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 10ms/step - accuracy: 0.8229 - loss: 0.8383
Epoch 5/30
[1m2790/2790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 10ms/step - accuracy: 0.8422 - loss: 0.7453
Epoch 6/30
[1m2790/2790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 10ms/step - accuracy: 0.8573 - loss: 0.6656
Epoch 7/30
[1m2790/2790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 10ms/step - accuracy: 0.8657 - loss: 0.6158
Epoch 8/30
[1m2790/2790[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 10ms/step - accuracy: 0.8766 - loss: 0.5660
Epoch 9/

<keras.src.callbacks.history.History at 0x7b72446c4bb0>

In [38]:
import numpy as np

In [43]:
text = "sherlock"
for i in range (11):
    # tokenize
    token_text = tokenizer.texts_to_sequences([text])[0]
    # then pad
    padded_token_text = pad_sequences([token_text], maxlen = max_len - 1, padding = 'pre')
    # predict
    # model.predict(padded_token_text) #this will give a vector of dim 283(i.e vocab size)
    predicted_index = np.argmax(model.predict(padded_token_text))
    predicted_word = tokenizer.index_word[predicted_index]
    text = text + " " +predicted_word
    print(text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
sherlock holmes
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
sherlock holmes clapped
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
sherlock holmes clapped his
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
sherlock holmes clapped his hands
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
sherlock holmes clapped his hands softly
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
sherlock holmes clapped his hands softly together
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
sherlock holmes clapped his hands softly together and
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
sherlock holmes clapped his hands softly together and chuckled
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
sherlock holmes clapped his hands softly t