In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [2]:
"""
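    Dataset: http://www.gutenberg.org/cache/epub/5200/pg5200.txt
    Remove everything that is not part of the story text and save the cleaned
    version (referred to here as Metamorphosis-clean; the code below opens it
    as 'metamorphosis.txt').
    The first and last lines of the cleaned file should match the ones printed below.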

"""


# Load the cleaned corpus as a list of raw lines (newline characters kept).
# The context manager closes the file handle as soon as the lines are read,
# instead of leaving it open for the lifetime of the kernel.
with open("metamorphosis.txt", "r", encoding="utf8") as file:
    lines = file.readlines()

# Sanity check: these should be the first and last lines of the story text.
print("The First Line: ", lines[0])
print("The Last Line: ", lines[-1])

The First Line:  ﻿One morning, when Gregor Samsa woke from troubled dreams, he found

The Last Line:  first to get up and stretch out her young body.


In [3]:
# Collapse the list of lines into one string. A single join over the whole
# list is sufficient; no per-line loop is needed (an empty list yields "").
data = ' '.join(lines)

# Strip newline / carriage-return characters and the byte-order mark that the
# first line of the file carries.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data[:360]

'One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.  He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections.  The bedding was hardly able to cover it and seemed ready to slide off any moment.'

In [4]:
import string

# Translation table that sends every ASCII punctuation character to a single
# space (code point -> code point, the same mapping str.maketrans builds).
translator = dict.fromkeys(map(ord, string.punctuation), ord(' '))
new_data = data.translate(translator)

new_data[:500]

'One morning  when Gregor Samsa woke from troubled dreams  he found himself transformed in his bed into a horrible vermin   He lay on his armour like back  and if he lifted his head a little he could see his brown belly  slightly domed and divided by arches into stiff sections   The bedding was hardly able to cover it and seemed ready to slide off any moment   His many legs  pitifully thin compared with the size of the rest of him  waved about helplessly as he looked    What s happened to me   he'

In [5]:
# Keep only the first occurrence of every whitespace-separated token, in order
# of first appearance. dict.fromkeys preserves insertion order and
# de-duplicates in linear time (a list membership test per token is quadratic).
# NOTE(review): this drops every repeated token, so the resulting text is no
# longer the original running prose; confirm that is intended for a next-word
# model. Also note that the punctuation-stripped `new_data` is not used here.
z = list(dict.fromkeys(data.split()))

data = ' '.join(z)
data[:500]

'One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. He lay on armour-like back, and if lifted head little could see brown belly, slightly domed divided by arches stiff sections. The bedding was hardly able to cover it seemed ready slide off any moment. His many legs, pitifully thin compared with the size of rest him, waved about helplessly as looked. "What\'s happened me?" thought. It wasn\'t dream. room, proper human room altho'

In [6]:
# Fit the word -> integer-id vocabulary on the corpus (passed as one document).
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Save the fitted tokenizer for the predict function. The context manager
# makes sure the pickle file is flushed and closed right away.
with open('tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Encode the whole corpus as a single flat list of word ids.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:10]

[17, 53, 293, 2, 18, 729, 135, 730, 294, 8]

In [7]:
# Number of distinct words plus one: Keras word indices start at 1, so the
# extra slot accounts for the reserved index 0.
vocab_size = 1 + len(tokenizer.word_index)
print(vocab_size)

2617


In [8]:
# Slide a two-token window over the encoded corpus; each entry is
# [current_word_id, next_word_id].
sequences = [
    sequence_data[start:start + 2]
    for start in range(len(sequence_data) - 1)
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  3889


array([[ 17,  53],
       [ 53, 293],
       [293,   2],
       [  2,  18],
       [ 18, 729],
       [729, 135],
       [135, 730],
       [730, 294],
       [294,   8],
       [  8, 731]])

In [9]:
# Split each [current_word_id, next_word_id] pair into model input (X) and
# target (y) with column slicing instead of a Python-level loop.
# reshape(-1, 2) keeps this working when there are no pairs at all
# (an empty array would otherwise be 1-D and reject the column index).
# X and y are views into the pair array; nothing downstream mutates them in place.
pairs = np.asarray(sequences).reshape(-1, 2)
X = pairs[:, 0]
y = pairs[:, 1]

In [10]:
# Preview the first five inputs and their matching next-word targets.
for label, values in (("The Data is: ", X), ("The responses are: ", y)):
    print(label, values[:5])

The Data is:  [ 17  53 293   2  18]
The responses are:  [ 53 293   2  18 729]


In [11]:
# One-hot encode the target word ids into rows of length vocab_size.
# The ndim guard makes the cell safe to re-run: without it, a second run would
# feed the already one-hot matrix back into to_categorical and try to build a
# (samples, vocab_size, vocab_size) array.
if y.ndim == 1:
    y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [12]:
# Next-word model: a single word id goes in, a probability distribution over
# the vocabulary comes out.
model = Sequential([
    # Map each word id to a 10-dimensional dense vector (one word per sample).
    Embedding(vocab_size, 10, input_length=1),
    # Three stacked LSTMs; the first two return the full sequence so the next
    # LSTM receives a 3-D input, the last one returns only its final state.
    LSTM(1000, return_sequences=True),
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    # One softmax unit per vocabulary entry.
    Dense(vocab_size, activation="softmax"),
])


In [None]:
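# Environment note (kept disabled): %pip install numpy==1.19.5
# NOTE(review): presumably pins NumPy to a version compatible with the installed
# TensorFlow build — confirm, and if still needed move it to a setup cell at the top.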

In [13]:
# Print the layer-by-layer summary (output shapes and parameter counts) as a
# sanity check of the architecture before training.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 10)             26170     
_________________________________________________________________
lstm (LSTM)                  (None, 1, 1000)           4044000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1, 1000)           8004000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 1000)              1001000   
_________________________________________________________________
dense_1 (Dense)              (None, 2617)              2619617   
Total params: 23,698,787
Trainable params: 23,698,787
Non-trainable params: 0
____________________________________________

In [14]:
from tensorflow import keras
# Import plot_model from the same tf.keras namespace the model was built with,
# rather than from the standalone `keras.utils.vis_utils` module path.
from tensorflow.keras.utils import plot_model

# Write the architecture diagram to model.png. This needs the optional
# `pydot` package and a Graphviz install; without them Keras only reports
# that the import failed and no image is produced.
plot_model(model, to_file='model.png', show_layer_names=True)

('Failed to import pydot. You must `pip install pydot` and install graphviz (https://graphviz.gitlab.io/download/), ', 'for `pydotprint` to work.')


In [15]:
from tensorflow.keras.callbacks import (
    ModelCheckpoint,
    ReduceLROnPlateau,
    TensorBoard,
)

# TensorBoard event files go to this directory.
logdir = 'logsnextword1'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

# Shrink the learning rate by a factor of 0.2 when the training loss has not
# improved for 3 epochs, never going below 1e-4.
reduce = ReduceLROnPlateau(
    monitor='loss',
    factor=0.2,
    patience=3,
    min_lr=0.0001,
    verbose=1,
)

# Save the model to nextword1.h5 whenever the training loss reaches a new best.
checkpoint = ModelCheckpoint(
    "nextword1.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)

In [16]:
# Targets are one-hot vectors over the vocabulary, hence categorical
# cross-entropy. `learning_rate` is the supported keyword for the step size;
# `lr` is a deprecated alias that newer Keras releases reject.
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))

In [18]:
# Train for 150 epochs with mini-batches of 64, checkpointing the best model,
# lowering the learning rate on plateaus, and logging to TensorBoard.
# NOTE(review): the recorded output starts with "loss did not improve from
# 1.87881" at epoch 1 and the execution counter jumps from 16 to 18, so this
# output appears to come from re-running the cell on an already-trained model;
# a fresh Restart & Run All will likely not reproduce it exactly — confirm.
model.fit(X, y, epochs=150, batch_size=64, callbacks=[checkpoint, reduce, tensorboard_Visualization])

Epoch 1/150
Epoch 00001: loss did not improve from 1.87881
Epoch 2/150
Epoch 00002: loss improved from 1.87881 to 1.84093, saving model to nextword1.h5
Epoch 3/150
Epoch 00003: loss did not improve from 1.84093
Epoch 4/150
Epoch 00004: loss improved from 1.84093 to 1.84065, saving model to nextword1.h5
Epoch 5/150
Epoch 00005: loss improved from 1.84065 to 1.78883, saving model to nextword1.h5
Epoch 6/150
Epoch 00006: loss did not improve from 1.78883
Epoch 7/150
Epoch 00007: loss improved from 1.78883 to 1.78838, saving model to nextword1.h5
Epoch 8/150
Epoch 00008: loss improved from 1.78838 to 1.75918, saving model to nextword1.h5
Epoch 9/150
Epoch 00009: loss improved from 1.75918 to 1.73787, saving model to nextword1.h5
Epoch 10/150
Epoch 00010: loss improved from 1.73787 to 1.70511, saving model to nextword1.h5
Epoch 11/150
Epoch 00011: loss improved from 1.70511 to 1.69745, saving model to nextword1.h5
Epoch 12/150
Epoch 00012: loss improved from 1.69745 to 1.68818, saving model

Epoch 73/150
Epoch 00073: loss did not improve from 0.78476
Epoch 74/150
Epoch 00074: loss improved from 0.78476 to 0.78100, saving model to nextword1.h5
Epoch 75/150
Epoch 00075: loss improved from 0.78100 to 0.77817, saving model to nextword1.h5
Epoch 76/150
Epoch 00076: loss improved from 0.77817 to 0.77805, saving model to nextword1.h5
Epoch 77/150
Epoch 00077: loss improved from 0.77805 to 0.77589, saving model to nextword1.h5
Epoch 78/150
Epoch 00078: loss did not improve from 0.77589
Epoch 79/150
Epoch 00079: loss did not improve from 0.77589
Epoch 80/150
Epoch 00080: loss improved from 0.77589 to 0.77336, saving model to nextword1.h5
Epoch 81/150
Epoch 00081: loss improved from 0.77336 to 0.77064, saving model to nextword1.h5
Epoch 82/150
Epoch 00082: loss improved from 0.77064 to 0.76750, saving model to nextword1.h5
Epoch 83/150
Epoch 00083: loss did not improve from 0.76750
Epoch 84/150
Epoch 00084: loss did not improve from 0.76750
Epoch 85/150
Epoch 00085: loss improved fr

Epoch 00148: loss did not improve from 0.67117
Epoch 149/150
Epoch 00149: loss improved from 0.67117 to 0.67082, saving model to nextword1.h5
Epoch 150/150
Epoch 00150: loss did not improve from 0.67082


<tensorflow.python.keras.callbacks.History at 0x296286d0730>