In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): none of the cells visible here read from or write to
# /content/drive — confirm the mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pickle
import time

import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding,GRU, LSTM, Dense, Bidirectional, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [None]:
# Load the text file used as the training corpus into a single string.
# The encoding is pinned so the result does not depend on the platform's
# default locale; 'utf-8-sig' decodes UTF-8 and additionally drops a leading
# byte-order mark if the file starts with one, so it cannot end up glued to
# the first word of the corpus.
with open('/content/1661-0.txt', encoding='utf-8-sig') as f:
    data = f.read()

In [None]:
# Word-level Keras tokenizer created with its default settings (not fitted yet).
tokenizer = Tokenizer()

In [None]:
# Build the vocabulary from the whole corpus, passed as one single document.
tokenizer.fit_on_texts([data])

In [None]:
# Persist the fitted tokenizer so the exact same word -> index mapping can be
# reloaded later (written to the current working directory).
with open('tokenizer.pickle', mode='wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# word -> integer index mapping learned by the tokenizer
# (a dict, despite the variable's name).
keys_list = tokenizer.word_index
vocab_count = len(keys_list)
print("No. of words = ", vocab_count)

No. of words =  8931


In [None]:
# Turn every line of the corpus into a list of token ids (one list per line).
line_token_ids = tokenizer.texts_to_sequences(data.split('\n'))

# For each line, keep every prefix that is at least two tokens long. Lines
# with fewer than two tokens contribute nothing.
input_sequences = [
    token_ids[:end]
    for token_ids in line_token_ids
    for end in range(2, len(token_ids) + 1)
]


In [None]:
# Number of tokens in the longest sample; used as the padded width.
max_len = max(map(len, input_sequences))

In [None]:
# Bring every sample to the same length so they stack into one rectangular
# array. Padding is added in front, which keeps the most recent token of each
# sample in the last column. (pad_sequences is imported in the import cell at
# the top of the notebook.)
padded_input_sequences = pad_sequences(
    input_sequences,   # variable-length lists of token ids
    maxlen=max_len,    # width of every row after padding
    padding='pre',     # pad at the start of each sequence
)

In [None]:
# Display the padded matrix as the cell output for a quick visual check.
padded_input_sequences

array([[   0,    0,    0, ...,    0,  145, 4790],
       [   0,    0,    0, ...,  145, 4790,    1],
       [   0,    0,    0, ..., 4790,    1, 1020],
       ...,
       [   0,    0,    0, ...,    3,  360,   83],
       [   0,    0,    0, ...,  360,   83,  358],
       [   0,    0,    0, ...,   83,  358, 1673]], dtype=int32)

In [None]:
# X: every column except the last one (the context tokens).
# y: the last column only (the token that follows the context).
X, y = padded_input_sequences[:, :-1], padded_input_sequences[:, -1]

In [None]:
# Show both shapes side by side; the first dimension (number of samples) should match.
X.shape, y.shape

((101619, 19), (101619,))

In [None]:
# Number of distinct words known to the tokenizer.
word_count = len(tokenizer.word_index)
print(" Total number of word: ", word_count)

# One more than the word count. Despite its name, this value is used as the
# number of classes / vocabulary size by the one-hot encoding, the embedding
# layer and the output layer.
INPUT_LENGTH = word_count + 1

 Total number of word:  8931


In [None]:
# One-hot encode the target token ids into INPUT_LENGTH classes.
# The targets are taken from padded_input_sequences rather than from `y`
# itself, so re-running this cell always yields the same result instead of
# one-hot encoding an already one-hot matrix. (to_categorical is imported in
# the import cell at the top of the notebook.)
# NOTE(review): this materialises a dense (samples x vocabulary) matrix —
# about 0.9 billion entries for the shapes recorded in this notebook. Keeping
# integer targets with loss='sparse_categorical_crossentropy' would avoid it,
# but that also requires changing the compile cell.
y = to_categorical(padded_input_sequences[:, -1], num_classes=INPUT_LENGTH)

In [None]:
y.shape

(101619, 8932)

In [None]:
# Seed the Python, NumPy and TensorFlow random generators before any layer is
# created, so weight initialisation, dropout and batch shuffling are
# repeatable on a fresh "Restart & Run All" (as far as the backend allows).
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# Next-word model:
#   - Embedding: maps each token id to a 100-dimensional vector.
#   - Three stacked bidirectional GRU layers with 80 units per direction; the
#     first two pass the full sequence on, the last one does not set
#     return_sequences and so feeds a single vector to the classifier.
#   - Dropout of 0.2 after each of the first two recurrent layers.
#   - Dense softmax over INPUT_LENGTH classes.
model = Sequential([
    Embedding(INPUT_LENGTH, 100),
    Bidirectional(GRU(units=80, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(GRU(units=80, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(GRU(units=80)),
    Dense(INPUT_LENGTH, activation='softmax'),
])

In [None]:
# Training setup: Adam optimizer, categorical cross-entropy against the
# one-hot targets, and accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print a layer-by-layer overview of the model.
model.summary()

In [None]:
# Number of passes over the training data.
# NOTE(review): the output recorded under this cell reports "Epoch n/100" and
# ends after epoch 8, so it comes from a run with a different setting than the
# value below — re-run the cell to refresh the log.
N_EPOCHS = 86

# Train on the full dataset; `history` keeps the object returned by fit().
history = model.fit(x=X, y=y, epochs=N_EPOCHS)

Epoch 1/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 16ms/step - accuracy: 0.0611 - loss: 6.5585
Epoch 2/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 17ms/step - accuracy: 0.1110 - loss: 5.5886
Epoch 3/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 19ms/step - accuracy: 0.1356 - loss: 5.2272
Epoch 4/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 16ms/step - accuracy: 0.1478 - loss: 5.0077
Epoch 5/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 16ms/step - accuracy: 0.1613 - loss: 4.7754
Epoch 6/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 16ms/step - accuracy: 0.1704 - loss: 4.5998
Epoch 7/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 16ms/step - accuracy: 0.1821 - loss: 4.4218
Epoch 8/100
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 16ms/step - accuracy: 0.1919 - loss: 4.2712


In [None]:
# Save the model to a .keras file.
# NOTE(review): this is a relative path, so the file is written to the current
# working directory rather than under /content/drive — confirm that is intended.
model.save('next_word_prediction.keras')