In [2]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.7-py3-none-any.whl (1.5 MB)
Collecting regex>=2021.8.3
  Downloading regex-2022.4.24-cp39-cp39-win_amd64.whl (262 kB)
Collecting click
  Using cached click-8.1.3-py3-none-any.whl (96 kB)
Installing collected packages: regex, click, nltk
Successfully installed click-8.1.3 nltk-3.7 regex-2022.4.24


In [3]:
import numpy as np
import matplotlib.pyplot as plt
from nltk.tokenize import RegexpTokenizer

In [4]:
text = open('1661-0.txt',encoding='utf-8').read().lower()

FileNotFoundError: [Errno 2] No such file or directory: '1661-0.txt'

In [3]:
len(text)

21408

In [4]:
text[:200]

"\ufeff\nproject gutenberg's the adventures of sherlock holmes, by arthur conan doyle\n\nthis ebook is for the use of anyone anywhere at no cost and with\nalmost no restrictions whatsoever.  you may copy it, gi"

#### Cleaning the Text

In [5]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [6]:
text = re.sub(r'[^a-zA-Z]', r' ',text)
text = re.sub(r' +', r' ',text)
text = text.strip()
text

'project gutenberg s the adventures of sherlock holmes by arthur conan doyle this ebook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever you may copy it give it away or re use it under the terms of the project gutenberg license included with this ebook or online at www gutenberg net title the adventures of sherlock holmes author arthur conan doyle release date november ebook last updated may language english character set encoding utf start of this project gutenberg ebook the adventures of sherlock holmes produced by an anonymous project gutenberg volunteer and jose menendez cover the adventures of sherlock holmes by arthur conan doyle contents i a scandal in bohemia ii the red headed league iii a case of identity iv the boscombe valley mystery v the five orange pips vi the man with the twisted lip vii the adventure of the blue carbuncle viii the adventure of the speckled band ix the adventure of the engineer s thumb x the adventure of the noble b

In [7]:
tokens = word_tokenize(text)

In [8]:
tokens

['project',
 'gutenberg',
 's',
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes',
 'by',
 'arthur',
 'conan',
 'doyle',
 'this',
 'ebook',
 'is',
 'for',
 'the',
 'use',
 'of',
 'anyone',
 'anywhere',
 'at',
 'no',
 'cost',
 'and',
 'with',
 'almost',
 'no',
 'restrictions',
 'whatsoever',
 'you',
 'may',
 'copy',
 'it',
 'give',
 'it',
 'away',
 'or',
 're',
 'use',
 'it',
 'under',
 'the',
 'terms',
 'of',
 'the',
 'project',
 'gutenberg',
 'license',
 'included',
 'with',
 'this',
 'ebook',
 'or',
 'online',
 'at',
 'www',
 'gutenberg',
 'net',
 'title',
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes',
 'author',
 'arthur',
 'conan',
 'doyle',
 'release',
 'date',
 'november',
 'ebook',
 'last',
 'updated',
 'may',
 'language',
 'english',
 'character',
 'set',
 'encoding',
 'utf',
 'start',
 'of',
 'this',
 'project',
 'gutenberg',
 'ebook',
 'the',
 'adventures',
 'of',
 'sherlock',
 'holmes',
 'produced',
 'by',
 'an',
 'anonymous',
 'project',
 'gutenberg',
 'volunteer'

In [9]:
print(f'Total Tokens:{len(tokens)}')
print(f'Unique Tokens:{len(set(tokens))}')

Total Tokens:3857
Unique Tokens:1232


In [10]:
length = 31
sequences = list()
for i in range(length, len(tokens)):
    seq = tokens[i-length:i]
    line = ' '.join(seq)
    sequences.append(line)
print(f'Total Sequences: {len(sequences)}')

Total Sequences: 3826


#### Model Building

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM,SimpleRNN
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

In [12]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sequences)
seqs = tokenizer.texts_to_sequences(sequences)

In [13]:
vocab_size = len(tokenizer.word_index) + 1

In [14]:
vocab_size

1233

In [15]:
from tensorflow.keras.utils import to_categorical
seqs = np.array(seqs)
X, y = seqs[:,:-1], seqs[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [16]:
# define model
model = Sequential()
model.add(Embedding(vocab_size, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 50)            61650     
_________________________________________________________________
lstm (LSTM)                  (None, 30, 100)           60400     
_________________________________________________________________
lstm_1 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense (Dense)                (None, 100)               10100     
_________________________________________________________________
dense_1 (Dense)              (None, 1233)              124533    
Total params: 337,083
Trainable params: 337,083
Non-trainable params: 0
_________________________________________________________________
None


In [17]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=128, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x17e8b5f80a0>

In [18]:
model.save('next_word.h5')

In [19]:
from pickle import dump
dump(tokenizer, open('tokenizer.pkl', 'wb'))

#### Testing the model

In [57]:
seq_length = len(sequences[0].split()) - 1

In [64]:
sample_texts = []
for _ in range(15):
    seed_text = sequences[np.random.randint(0,len(sequences))]
    sample_texts.append(seed_text)

In [65]:
encoded_texts = []
for seed_text in sample_texts:
    encoded = tokenizer.texts_to_sequences([seed_text])[0]
    encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
    encoded_texts.append(encoded)

In [66]:
encoded_texts

[array([[ 43,  93,  31,  24,   4, 472,  11,  50,  17, 473,  94,  10,   4,
         474, 475,  11, 250, 251,   2,   1, 476, 477, 478,  15,   4, 479,
           3,   4, 480, 252]]),
 array([[ 15,   1, 439, 243, 440,   1,  85,   2,   1, 154, 441, 442,   1,
          85,   2,   1, 443, 444, 445,   1,  85,   2,   1, 446,  83, 447,
         448,   1,  85,   2]]),
 array([[709,  22,   1, 710,  15,  18,  11, 711,   7, 321,   2, 318, 124,
           5, 712,   8, 113,  28, 713,   5, 102,   1, 714, 155, 715,   6,
          29,   6,  21,  41]]),
 array([[ 213, 1077,   10,  194,  133,   41,   28,   70,   24,    5,  145,
         1078, 1079,   15,   30,  231,  108,  205,   48,   75, 1080,  232,
            3,   14,   99, 1081,    2,  304,  171,  232]]),
 array([[736, 131,  59, 133,  41,   8,  17,  20, 326,   3,  66,   8,  17,
          93,  12,  14, 104,  16, 737,  99,   5,  59,  12,  37,  40, 738,
         325, 327,   5,  17]]),
 array([[1034,    8,   39,  220,  332,   27,   19, 1035,    7,   64,  

In [67]:
classes = []
for encoded in encoded_texts:
    pred_x = model.predict(encoded)
    class_x = np.argmax(pred_x,axis = 1)
    classes.append(class_x)

In [68]:
classes

[array([65], dtype=int64),
 array([1], dtype=int64),
 array([5], dtype=int64),
 array([3], dtype=int64),
 array([20], dtype=int64),
 array([64], dtype=int64),
 array([5], dtype=int64),
 array([7], dtype=int64),
 array([13], dtype=int64),
 array([108], dtype=int64),
 array([254], dtype=int64),
 array([20], dtype=int64),
 array([23], dtype=int64),
 array([1], dtype=int64),
 array([5], dtype=int64)]

In [69]:
for idx,class_x in enumerate(classes):
    out_word = ''
    for word, index in tokenizer.word_index.items():
        if index == class_x:
            out_word = word
            break
    print(f"Text: {sample_texts[idx]}\nNext Word:{out_word}")
    print()

Text: world has seen but as a lover he would have placed himself in a false position he never spoke of the softer passions save with a gibe and a sneer they
Next Word:were

Text: man with the twisted lip vii the adventure of the blue carbuncle viii the adventure of the speckled band ix the adventure of the engineer s thumb x the adventure of
Next Word:the

Text: help laughing at the ease with which he explained his process of deduction when i hear you give your reasons i remarked the thing always appears to me to be so
Next Word:i

Text: stage ha living in london quite so your majesty as i understand became entangled with this young person wrote her some compromising letters and is now desirous of getting those letters
Next Word:and

Text: i don t know quite so you have not observed and yet you have seen that is just my point now i know that there are seventeen steps because i have
Next Word:not

Text: of consulting you then pray consult said holmes shutting his eyes once more the fact