In [1]:
import numpy as np # for numerical mathematical calculatiion 
import tensorflow as tf 
from nltk.tokenize import RegexpTokenizer # regular expression tokenizer (condition based tokenization)
from tensorflow.keras.models import Sequential # to stack one layer over other 
from tensorflow.keras.preprocessing.text import Tokenizer # using this we can have token/ words from a sequence of word
from tensorflow.keras.preprocessing.sequence import pad_sequences # ensure equal length in a batch of sequence 
from tensorflow.keras.layers import Embedding,LSTM,Dense # embedding to convert word to vector, dense -> fully connected layer 



In [2]:
# Load the raw corpus from disk into a single string.
# The context manager closes the file handle as soon as the read completes.
corpus_path = "DataSet/Sherlock Holmes Dataset.txt"
with open(corpus_path, mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()  # entire file contents, decoded as UTF-8


In [17]:
# Report how many characters the loaded corpus contains
# (len() of a str counts characters, not words).
corpus_report = f"length of the corpus : {len(text)}"
print(corpus_report)

length of the corpus : 610871


In [18]:
# Build the word-level vocabulary: fitting on the full corpus assigns an
# integer id to every distinct word it contains.
tokenize = Tokenizer()
tokenize.fit_on_texts([text])

# Size used for the model's embedding/output layers: one slot per known word
# plus one extra, since Keras word ids start at 1 and id 0 is left for padding.
vocabulary = tokenize.word_index
total_word = 1 + len(vocabulary)
print(f"total number of distinct word in the sequence : {total_word}")

total number of distinct word in the sequence : 8200


In [23]:
# Turn every line of the corpus into training samples: the line is encoded as
# word ids, and each prefix of length >= 2 becomes one n-gram sequence
# (the last id of a prefix is later used as the word to predict).
input_sequences = [
    token_ids[:end]
    for corpus_line in text.split('\n')
    for token_ids in tokenize.texts_to_sequences([corpus_line])
    for end in range(2, len(token_ids) + 1)
]

# Show a small sample (positions 1 through 5) as a sanity check.
print(f" printing 5 sequence from the list input sequence ")
print('-'*50)
print([input_sequences[position] for position in range(1, 6)])

 printing 5 sequence from the list input sequence 
--------------------------------------------------
[[1, 1561, 5], [1, 1561, 5, 129], [1, 1561, 5, 129, 34], [647, 4498], [647, 4498, 4499]]


In [24]:
# The longest n-gram prefix determines the width every sequence is padded to.
max_len_seq = max(map(len, input_sequences))
print(f"maximum length of the sequence that is there in input seq : {max_len_seq}")

maximum length of the sequence that is there in input seq : 18


In [25]:
# Bring every sequence to the same length so they can be stacked into one
# 2-D array: shorter sequences are left-padded ('pre') up to max_len_seq.
padded_sequences = pad_sequences(input_sequences, maxlen=max_len_seq, padding='pre')
input_sequences = np.array(padded_sequences)

In [26]:
# Separate each padded sequence into model input and target:
#   X -> every column except the last (the context words)
#   y -> the last column (the word to be predicted)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [27]:
# One-hot encode the target word ids so they match the softmax output layer
# (one column per vocabulary entry).
#
# The labels are taken from the last column of `input_sequences` rather than
# from `y` itself, so this cell is idempotent: re-running it no longer tries to
# one-hot encode an already one-hot matrix. to_categorical already returns a
# NumPy array, so no extra np.array() copy of this large matrix is made.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_word)

In [30]:
# Build the next-word prediction network as a plain stack of layers.
model = Sequential()

# Embedding: maps each of the `total_word` token ids to a 120-dimensional vector.
# The input length is max_len_seq - 1 because the last token of every padded
# sequence was split off as the prediction target.
model.add(Embedding(total_word, 120, input_length=max_len_seq - 1))

# Recurrent layer with 150 units that summarises the context sequence.
model.add(LSTM(150))

# Output layer: softmax over the whole vocabulary (multi-class classification).
model.add(Dense(total_word, activation='softmax'))

# summary() prints the table itself and returns None, so it is not wrapped in
# print() (which would emit a stray "None" line after the table).
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 17, 120)           984000    
                                                                 
 lstm_1 (LSTM)               (None, 150)               162600    
                                                                 
 dense_1 (Dense)             (None, 8200)              1238200   
                                                                 
Total params: 2384800 (9.10 MB)
Trainable params: 2384800 (9.10 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [32]:
# Configure training: Adam optimiser, cross-entropy against the one-hot
# targets, and accuracy reported after each epoch.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on the full set of n-gram samples; the returned object is kept in
# `result` for later inspection.
result = model.fit(
    X,
    y,
    epochs=50,
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [40]:
# Greedy next-word generation: repeatedly feed the text produced so far back
# into the model and append the single most probable next word.
seed_text = "i will leave if they"
next_words = 3

for _ in range(next_words):
    # Encode the current text and left-pad it to the width the model was
    # trained on (max_len_seq - 1 context tokens).
    seed_ids = tokenize.texts_to_sequences([seed_text])[0]
    model_input = pad_sequences([seed_ids], maxlen=max_len_seq - 1, padding='pre')

    # predict() returns one probability row per input; take the arg-max of the
    # single row as a plain int so it can be used directly as a dictionary key.
    probabilities = model.predict(model_input, verbose=0)
    predicted_id = int(np.argmax(probabilities, axis=-1)[0])

    # index_word is the tokenizer's id -> word mapping, so no scan over
    # word_index is needed. An id with no word (e.g. the padding id 0) yields "".
    output_word = tokenize.index_word.get(predicted_id, "")
    if not output_word:
        # Nothing to append; the input would be unchanged on the next step, so
        # further iterations would only repeat the same prediction.
        break
    seed_text += " " + output_word

print(seed_text)

i will leave if they come to night
