In [1]:
!kaggle datasets download -d muhammadbilalhaneef/sherlock-holmes-next-word-prediction-corpus

Dataset URL: https://www.kaggle.com/datasets/muhammadbilalhaneef/sherlock-holmes-next-word-prediction-corpus
License(s): CC0-1.0
sherlock-holmes-next-word-prediction-corpus.zip: Skipping, found more recently modified local copy (use --force to force download)


In [2]:
import zipfile
import os

# Define the path to the zip file and the destination directory
zip_file_path = 'sherlock-holmes-next-word-prediction-corpus.zip'  # or provide the full path
destination_dir = 'text_data.txt'

# Create the destination directory if it doesn't exist
os.makedirs(destination_dir, exist_ok=True)

# Unzip the file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(destination_dir)

print(f'Unzipped {zip_file_path} to {destination_dir}')


Unzipped sherlock-holmes-next-word-prediction-corpus.zip to text_data.txt


In [3]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split

In [4]:
data_path = r'C:\Users\Personal\Documents\text_data.txt\Sherlock Holmes.txt'
# data_path =r'C:\Users\Personal\Documents\text_data.txt'
with open(data_path, 'r') as file:
    chat_data = file.read()

# Display the first few lines of the chat data
print(chat_data[:500])  # Print the first 500 characters as an example





                        THE ADVENTURES OF SHERLOCK HOLMES

                               Arthur Conan Doyle



                                Table of contents

               A Scandal in Bohemia
               The Red-Headed League
               A Case of Identity
               The Boscombe Valley Mystery
               The Five Orange Pips
               The Man with the Twisted Lip
               The Adventure of the Blue Carbuncle
               The Adventure of the Speckled Band
  


In [5]:
print(type(chat_data))


<class 'str'>


In [6]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([chat_data])
tokenizer.word_index

{'the': 1,
 'and': 2,
 'i': 3,
 'to': 4,
 'of': 5,
 'a': 6,
 'in': 7,
 'that': 8,
 'it': 9,
 'he': 10,
 'you': 11,
 'was': 12,
 'his': 13,
 'is': 14,
 'my': 15,
 'have': 16,
 'as': 17,
 'with': 18,
 'had': 19,
 'which': 20,
 'at': 21,
 'for': 22,
 'but': 23,
 'me': 24,
 'not': 25,
 'be': 26,
 'we': 27,
 'from': 28,
 'there': 29,
 'this': 30,
 'said': 31,
 'upon': 32,
 'so': 33,
 'holmes': 34,
 'him': 35,
 'her': 36,
 'she': 37,
 "'": 38,
 'very': 39,
 'your': 40,
 'been': 41,
 'all': 42,
 'on': 43,
 'no': 44,
 'what': 45,
 'one': 46,
 'then': 47,
 'were': 48,
 'by': 49,
 'are': 50,
 'an': 51,
 'would': 52,
 'out': 53,
 'when': 54,
 'up': 55,
 'man': 56,
 'could': 57,
 'has': 58,
 'do': 59,
 'into': 60,
 'mr': 61,
 'who': 62,
 'little': 63,
 'will': 64,
 'if': 65,
 'some': 66,
 'now': 67,
 'see': 68,
 'down': 69,
 'should': 70,
 'our': 71,
 'or': 72,
 'they': 73,
 'may': 74,
 'well': 75,
 'am': 76,
 'us': 77,
 'over': 78,
 'more': 79,
 'think': 80,
 'room': 81,
 'know': 82,
 'shall': 83

In [7]:
tokenized_sequence =[]
for sentence in chat_data.split('\n'):
  tokenized_sentence= tokenizer.texts_to_sequences([sentence])[0]

  for i in range(1 , len(tokenized_sentence)):
    tokenized_sequence.append(tokenized_sentence[:i+1])

In [8]:
# (tokenized_sequence)

In [9]:
max_len = max([len(x) for x in tokenized_sequence])
print(max_len)

18


In [10]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_input_sequences= pad_sequences(tokenized_sequence , maxlen= max_len , padding = 'pre')

In [11]:
X = padded_input_sequences[: , :-1]
Y = padded_input_sequences[:,-1]

In [12]:
print(X.shape , Y.shape)

(96314, 17) (96314,)


In [13]:
len(tokenizer.word_index)

8199

In [14]:
from tensorflow.keras.utils import to_categorical
Y=to_categorical(Y , num_classes=8200)
Y.shape

(96314, 8200)

In [15]:
print(Y)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [16]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding , LSTM ,Dense

In [25]:
model = Sequential()
model.add(Embedding(8200 , 100  , input_length=17))
model.add(LSTM(150))
model.add(Dense(8200 , activation='softmax'))

In [26]:
model.compile(loss='categorical_crossentropy' , optimizer='adam' , metrics=['accuracy'])

In [27]:
model.summary()

In [28]:
X_train, X_test , Y_train , Y_test = train_test_split(X , Y  , test_size=0.3 , random_state=2)

In [29]:
model.fit(X  , Y , epochs =100)

Epoch 1/100
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m101s[0m 32ms/step - accuracy: 0.0598 - loss: 6.5643
Epoch 2/100
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 31ms/step - accuracy: 0.1188 - loss: 5.5683
Epoch 3/100
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 32ms/step - accuracy: 0.1430 - loss: 5.1244
Epoch 4/100
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 31ms/step - accuracy: 0.1648 - loss: 4.7732
Epoch 5/100
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 31ms/step - accuracy: 0.1834 - loss: 4.4778
Epoch 6/100
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 31ms/step - accuracy: 0.2034 - loss: 4.1759
Epoch 7/100
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 32ms/step - accuracy: 0.2322 - loss: 3.8877
Epoch 8/100
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m96s[0m 32ms/step - accuracy: 0.2639 - loss: 3.6277

<keras.src.callbacks.history.History at 0x206fcaf7ec0>

In [30]:
model.fit(X  , Y , epochs =10)

Epoch 1/10
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 37ms/step - accuracy: 0.8759 - loss: 0.4748
Epoch 2/10
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m106s[0m 35ms/step - accuracy: 0.8755 - loss: 0.4734
Epoch 3/10
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 36ms/step - accuracy: 0.8772 - loss: 0.4652
Epoch 4/10
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m109s[0m 36ms/step - accuracy: 0.8775 - loss: 0.4693
Epoch 5/10
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 37ms/step - accuracy: 0.8760 - loss: 0.4687
Epoch 6/10
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m110s[0m 36ms/step - accuracy: 0.8772 - loss: 0.4639
Epoch 7/10
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m127s[0m 42ms/step - accuracy: 0.8750 - loss: 0.4701
Epoch 8/10
[1m3010/3010[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 45ms/step - accuracy: 0.8743 - loss: 0.4718


<keras.src.callbacks.history.History at 0x206fcb37380>

In [31]:
# Save the entire model to a file
model.save('next_word_prediction_model.h5')

print("Model saved to next_word_prediction_model.h5")



Model saved to next_word_prediction_model.h5


In [32]:
# From here I need to start 

from tensorflow.keras.models import load_model

# Load the saved model
saved_model = load_model('next_word_prediction_model.h5')

print("Model loaded successfully!")



Model loaded successfully!


In [35]:
import numpy as np
import time
text = input()
for i in range(8):

# tokenize
  token_text =tokenizer.texts_to_sequences([text])[0]
# padding
  padded_token_text =pad_sequences([token_text] , maxlen = 17 , padding = 'pre')
# predict
  ind =np.argmax(saved_model.predict(padded_token_text))

  for word , index in tokenizer.word_index.items():

    if index==ind:
      text= text+" "+word
      print(text)
      time.sleep(2)

 how


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 59ms/step
how did
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
how did you
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
how did you trace
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 63ms/step
how did you trace it
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
how did you trace it then
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
how did you trace it then he
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step
how did you trace it then he remarked
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 64ms/step
how did you trace it then he remarked what
