In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [None]:
# Read the corpus into a list of lines. The context manager closes the file
# handle as soon as reading is done, even if an error occurs.
with open("metamorphosis_clean.txt", "r", encoding="utf8") as file:
    lines = list(file)

# Quick sanity check of what was loaded.
print("The First Line: ", lines[0])
print("The Last Line: ", lines[-1])

The First Line:  One morning, when Gregor Samsa woke from troubled dreams, he found

The Last Line:  first to get up and stretch out her young body.


In [None]:
## Data cleaning: join the lines, then strip newlines, \r and the BOM character
# One join over all lines is enough; repeating the identical join once per
# line produced the same string at quadratic cost.
data = ' '.join(lines)

data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data[:360]

'One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.  He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections.  The bedding was hardly able to cover it and seemed ready to slide off any moment.'

In [None]:
### Further cleaning: replace every punctuation character with a space
import string

# Translation table keyed by code point: each punctuation mark maps to a space.
translator = {ord(mark): ord(' ') for mark in string.punctuation}
new_data = data.translate(translator)

new_data[:500]

'One morning  when Gregor Samsa woke from troubled dreams  he found himself transformed in his bed into a horrible vermin   He lay on his armour like back  and if he lifted his head a little he could see his brown belly  slightly domed and divided by arches into stiff sections   The bedding was hardly able to cover it and seemed ready to slide off any moment   His many legs  pitifully thin compared with the size of the rest of him  waved about helplessly as he looked    What s happened to me   he'

In [None]:
## Create a vocab: the unique whitespace-separated tokens, in first-seen order
# dict.fromkeys de-duplicates in O(n) and keeps insertion order (guaranteed
# from Python 3.7), replacing the O(n^2) "if token not in list" scan.
z = list(dict.fromkeys(data.split()))

# NOTE(review): this overwrites `data` with the de-duplicated tokens, so the
# tokenizer and the bigrams built from `data` in later cells see this reduced
# text rather than the original running text. Confirm that is intended.
data = ' '.join(z)
data[:500]

'One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. He lay on armour-like back, and if lifted head little could see brown belly, slightly domed divided by arches stiff sections. The bedding was hardly able to cover it seemed ready slide off any moment. His many legs, pitifully thin compared with the size of rest him, waved about helplessly as looked. "What\'s happened me?" thought. It wasn\'t dream. room, proper human room altho'

In [None]:
# Display the vocabulary list (unique tokens in first-seen order).
z

['One',
 'morning,',
 'when',
 'Gregor',
 'Samsa',
 'woke',
 'from',
 'troubled',
 'dreams,',
 'he',
 'found',
 'himself',
 'transformed',
 'in',
 'his',
 'bed',
 'into',
 'a',
 'horrible',
 'vermin.',
 'He',
 'lay',
 'on',
 'armour-like',
 'back,',
 'and',
 'if',
 'lifted',
 'head',
 'little',
 'could',
 'see',
 'brown',
 'belly,',
 'slightly',
 'domed',
 'divided',
 'by',
 'arches',
 'stiff',
 'sections.',
 'The',
 'bedding',
 'was',
 'hardly',
 'able',
 'to',
 'cover',
 'it',
 'seemed',
 'ready',
 'slide',
 'off',
 'any',
 'moment.',
 'His',
 'many',
 'legs,',
 'pitifully',
 'thin',
 'compared',
 'with',
 'the',
 'size',
 'of',
 'rest',
 'him,',
 'waved',
 'about',
 'helplessly',
 'as',
 'looked.',
 '"What\'s',
 'happened',
 'me?"',
 'thought.',
 'It',
 "wasn't",
 'dream.',
 'room,',
 'proper',
 'human',
 'room',
 'although',
 'too',
 'small,',
 'peacefully',
 'between',
 'its',
 'four',
 'familiar',
 'walls.',
 'A',
 'collection',
 'textile',
 'samples',
 'spread',
 'out',
 'table'

In [None]:
## Creating the tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# Save the tokenizer for the predict function. The context manager flushes and
# closes the file, so the pickle is complete on disk before it is read back.
with open('tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Each word in the text is converted to its integer index by the tokenizer.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:10]

[17, 53, 293, 2, 18, 729, 135, 730, 294, 8]

In [None]:
# Display the tokenizer's index -> word mapping.
tokenizer.index_word

{1: 'now',
 2: 'gregor',
 3: 'well',
 4: 'it',
 5: 'that',
 6: 'then',
 7: 'father',
 8: 'he',
 9: 'in',
 10: 'out',
 11: 'this',
 12: 'so',
 13: 'before',
 14: 'no',
 15: 'mother',
 16: 'grete',
 17: 'one',
 18: 'samsa',
 19: 'himself',
 20: 'and',
 21: 'him',
 22: 'there',
 23: 'all',
 24: 'you',
 25: 'yes',
 26: 'again',
 27: 'work',
 28: 'here',
 29: 'on',
 30: 'like',
 31: 'was',
 32: 'room',
 33: 'too',
 34: 'be',
 35: 'but',
 36: 'god',
 37: 'is',
 38: 'soon',
 39: 'enough',
 40: 'would',
 41: 'seven',
 42: 'did',
 43: 'come',
 44: 'round',
 45: 'door',
 46: 'while',
 47: 'said',
 48: 'already',
 49: 'will',
 50: 'help',
 51: 'we',
 52: 'anyway',
 53: 'morning',
 54: 'bed',
 55: 'back',
 56: 'if',
 57: 'little',
 58: 'the',
 59: 'to',
 60: 'moment',
 61: "what's",
 62: 'happened',
 63: 'me',
 64: 'table',
 65: 'had',
 66: 'upright',
 67: 'her',
 68: 'look',
 69: 'something',
 70: 'do',
 71: 'right',
 72: 'however',
 73: 'hard',
 74: 'pain',
 75: 'oh',
 76: 'what',
 77: 'business

In [None]:
# Number of token indices in the encoded text.
len(sequence_data)

3890

In [None]:
# Size used for the embedding table and the output layer. Tokenizer indices
# start at 1 (see tokenizer.index_word), so add 1 to keep the largest index
# in range.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

2617


In [None]:
### Build bigrams: every pair of consecutive token indices

sequences = [
    sequence_data[pos - 1:pos + 1]
    for pos in range(1, len(sequence_data))
]

print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

[17, 53]
[53, 293]
[293, 2]
[2, 18]
[18, 729]
[729, 135]
[135, 730]
[730, 294]
[294, 8]
[8, 731]
[731, 19]
[19, 732]
[732, 9]
[9, 295]
[295, 54]
[54, 733]
[733, 296]
[296, 297]
[297, 734]
[734, 8]
[8, 735]
[735, 29]
[29, 736]
[736, 30]
[30, 55]
[55, 20]
[20, 56]
[56, 737]
[737, 136]
[136, 57]
[57, 298]
[298, 137]
[137, 738]
[738, 138]
[138, 299]
[299, 739]
[739, 740]
[740, 300]
[300, 741]
[741, 301]
[301, 742]
[742, 58]
[58, 743]
[743, 31]
[31, 302]
[302, 744]
[744, 59]
[59, 745]
[745, 4]
[4, 303]
[303, 304]
[304, 746]
[746, 305]
[305, 747]
[747, 60]
[60, 295]
[295, 748]
[748, 139]
[139, 749]
[749, 750]
[750, 751]
[751, 140]
[140, 58]
[58, 752]
[752, 141]
[141, 306]
[306, 21]
[21, 753]
[753, 142]
[142, 754]
[754, 143]
[143, 144]
[144, 61]
[61, 62]
[62, 63]
[63, 145]
[145, 4]
[4, 755]
[755, 307]
[307, 32]
[32, 756]
[756, 308]
[308, 32]
[32, 146]
[146, 33]
[33, 309]
[309, 310]
[310, 757]
[757, 758]
[758, 759]
[759, 760]
[760, 147]
[147, 296]
[296, 761]
[761, 762]
[762, 763]
[763, 764]
[7

array([[ 17,  53],
       [ 53, 293],
       [293,   2],
       [  2,  18],
       [ 18, 729],
       [729, 135],
       [135, 730],
       [730, 294],
       [294,   8],
       [  8, 731]])

In [None]:
# Split each bigram: the first index is the input, the second is the target.
X = np.array([pair[0] for pair in sequences])
y = np.array([pair[1] for pair in sequences])

In [None]:
# Preview the first five inputs and targets: each target is the second index
# of the bigram whose first index is the corresponding input.
print("The Data is: ", X[:5])
print("The responses are: ", y[:5])

The Data is:  [ 17  53 293   2  18]
The responses are:  [ 53 293   2  18 729]


In [None]:
# One-hot encode the target indices into vocab_size columns.
# The ndim guard keeps this cell idempotent: re-running it on the already
# one-hot (2-D) array would otherwise encode every 0/1 entry again and build
# a huge 3-D array.
if y.ndim == 1:
    y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [None]:
# Next-word model: one token index in, a softmax over all vocab_size indices out.
model = Sequential([
    Embedding(vocab_size, 10, input_length=1),
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])

In [None]:
# Print the layer-by-layer summary (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1, 10)             26170     
_________________________________________________________________
lstm (LSTM)                  (None, 1, 1000)           4044000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 1000)              8004000   
_________________________________________________________________
dense (Dense)                (None, 1000)              1001000   
_________________________________________________________________
dense_1 (Dense)              (None, 2617)              2619617   
Total params: 15,694,787
Trainable params: 15,694,787
Non-trainable params: 0
_________________________________________________________________


In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Callback that writes the model to nextword1.h5, keeping only the version
# with the best monitored training loss.
checkpoint = ModelCheckpoint(
    filepath="nextword1.h5",
    monitor='loss',
    verbose=1,
    save_best_only=True,
    mode='auto',
)


In [None]:
# Targets are one-hot vectors, hence categorical cross-entropy.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras versions reject.
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))

In [None]:
# Train for up to 150 epochs in batches of 64 samples; the checkpoint callback
# is passed in so it runs during training.
model.fit(X, y, epochs=150, batch_size=64, callbacks=[checkpoint])

Epoch 1/150
Epoch 00001: loss improved from inf to 6.82446, saving model to nextword1.h5
Epoch 2/150
Epoch 00002: loss improved from 6.82446 to 6.60558, saving model to nextword1.h5
Epoch 3/150
Epoch 00003: loss improved from 6.60558 to 6.35038, saving model to nextword1.h5
Epoch 4/150
Epoch 00004: loss improved from 6.35038 to 6.09490, saving model to nextword1.h5
Epoch 5/150
Epoch 00005: loss improved from 6.09490 to 5.85156, saving model to nextword1.h5
Epoch 6/150
Epoch 00006: loss improved from 5.85156 to 5.65967, saving model to nextword1.h5
Epoch 7/150
Epoch 00007: loss improved from 5.65967 to 5.45402, saving model to nextword1.h5
Epoch 8/150
Epoch 00008: loss improved from 5.45402 to 5.25266, saving model to nextword1.h5
Epoch 9/150
Epoch 00009: loss improved from 5.25266 to 5.08153, saving model to nextword1.h5
Epoch 10/150
Epoch 00010: loss improved from 5.08153 to 4.90104, saving model to nextword1.h5
Epoch 11/150
Epoch 00011: loss improved from 4.90104 to 4.77432, saving m

KeyboardInterrupt: 

In [None]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model saved by the checkpoint callback and the pickled tokenizer.

model = load_model('nextword1.h5')

# NOTE: unpickling can execute arbitrary code, so only load a tokenizer file
# that this notebook wrote itself. The context manager closes the file handle.
with open('tokenizer1.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

def Predict_Next_Words(model, tokenizer, text):
    """
        Predict the word that follows ``text``, print it and return it.

        ``text`` is converted to word indices with ``tokenizer``. Only the
        last known word is passed to ``model`` (as a batch holding a single
        index), and the index with the highest predicted score is mapped
        back to a word through ``tokenizer.index_word``.

        Parameters:
            model: object whose ``predict`` returns one row of class scores
                per input sample.
            tokenizer: object providing ``texts_to_sequences`` and an
                ``index_word`` mapping from index to word.
            text: the input string; may contain one or several words.

        Returns:
            The predicted word. If ``text`` contains no word known to the
            tokenizer, nothing is printed and an empty string is returned.
            If the predicted index has no word, an empty string is printed
            and returned.
    """
    sequence = tokenizer.texts_to_sequences([text])[0]
    if not sequence:
        # Nothing the tokenizer recognises: there is no input for the model.
        return ""

    # Shape (1, 1): one sample holding one token index.
    last_token = np.array([[sequence[-1]]])

    # predict_classes is deprecated; take the argmax of the class scores instead.
    scores = model.predict(last_token, verbose=0)
    predicted_index = int(np.argmax(scores, axis=-1)[0])

    # Direct index -> word lookup; indices without a word give "".
    predicted_word = tokenizer.index_word.get(predicted_index, "")

    print(predicted_word)
    return predicted_word

In [None]:
# Interactive test of the model: keep asking for a line until the user enters
# "stop the script". The last word of each line is passed to
# Predict_Next_Words; if no prediction can be made, the reason is reported and
# the loop carries on.
#
# Sample inputs: "at the dull", "collection of textile", "what a strenuous".

while True:

    text = input("Enter your line: ")

    if text == "stop the script":
        print("Ending The Program.....")
        break

    # split() without arguments ignores leading, trailing and repeated
    # whitespace, so the last element is always a real word.
    words = text.split()
    if not words:
        continue

    try:
        Predict_Next_Words(model, tokenizer, words[-1])
    except Exception as error:
        # Report the failure instead of hiding it. Unlike a bare `except`,
        # this does not swallow KeyboardInterrupt, so Ctrl-C still stops the loop.
        print("Could not predict a next word:", error)

Enter your line:  One fine morning


Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
copy


Enter your line:  one fine morning


copy


Enter your line:  how should we


this


Enter your line:  i am


less


Enter your line:  please wake me


before


1. Add pre-trained embeddings
2. Use trigrams instead of bigrams
3. Optimize the network
4. Train for all 150 epochs (the run above was interrupted)