In [1]:
# Import Libraries

import os
import re
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional

In [2]:
# Read the text from a file

with open('1661-0.txt', encoding='utf-8') as file:
    text = file.read()
text



In [3]:
# Create a function for text preparation: convert the text to lowercase, remove punctuation and words of 2 or less characters

def text_preparation(data_text):
    # lower case text
    corr_text = data_text.lower()
    corr_text = re.sub(r"'s\b",'',corr_text)
    # remove punctuations
    corr_text = re.sub('[^a-zA-Z]', ' ', corr_text) 
    words=[]
    # remove short word
    for i in corr_text.split():
        if len(i)>=3:                  
            words.append(i)
    return (' '.join(words)).strip()

# preparation of the text
prepared_text = text_preparation(text)
prepared_text



In [4]:
# Assign each word its own numerical value using the tokenizer from tensorflow.keras.preprocessing.text. 
# Print the number of words and several of them with an assigned number as an example

tokenizer = Tokenizer()
tokenizer.fit_on_texts([prepared_text])
total_words = len(tokenizer.word_index)+1

print('Total number of words: ', total_words)
print('-'*15)
print('paper: ', tokenizer.word_index['paper'])
print('bohemia: ', tokenizer.word_index['bohemia'])
print("little: ", tokenizer.word_index['little'])

Total number of words:  8002
---------------
paper:  245
bohemia:  863
little:  41


In [6]:
# Convert the text into the corresponding numerical sequence using tokenizer

token_list = tokenizer.texts_to_sequences([prepared_text])[0]
# Print the first 20 characters of a sequence
example_for_print = str()
for i in range(0, 20):
    example_for_print += str(token_list[i]) + ' '
print(f'First 20 tokens:\n{example_for_print}...')

First 20 tokens:
114 102 1 941 99 18 441 2081 2082 14 942 11 1 235 352 2083 1556 2 8 531 ...


In [7]:
# Form a list of sequences of 5 tokens

sequences = []
len_sequence = 5

for i in range(0, len(token_list)-len_sequence):
    n_gram_sequence = token_list[i:i+len_sequence]
    sequences.append(n_gram_sequence)

# print(input_sequences)
print("Total input sequences: ", len(sequences))

Total input sequences:  82026


In [8]:
sequences[:10]

[[114, 102, 1, 941, 99],
 [102, 1, 941, 99, 18],
 [1, 941, 99, 18, 441],
 [941, 99, 18, 441, 2081],
 [99, 18, 441, 2081, 2082],
 [18, 441, 2081, 2082, 14],
 [441, 2081, 2082, 14, 942],
 [2081, 2082, 14, 942, 11],
 [2082, 14, 942, 11, 1],
 [14, 942, 11, 1, 235]]

In [9]:
# Convert list to numpy array

sequences_array = np.asarray(sequences)
sequences_array

array([[ 114,  102,    1,  941,   99],
       [ 102,    1,  941,   99,   18],
       [   1,  941,   99,   18,  441],
       ...,
       [8000,   47, 4469, 8001,  320],
       [  47, 4469, 8001,  320,   53],
       [4469, 8001,  320,   53,  318]])

In [10]:
# Divide the array into x and y, where x is the first four tokens in sequences, y is the desired value and, accordingly, 
# the last token in each sequence

x, y = sequences_array[:,:-1],sequences_array[:,-1]
# Converts integers y class vector to binary class matrix
target = tf.keras.utils.to_categorical(y, num_classes=total_words)

In [14]:
# Create and compiling a model with Embedding layer with 100 neurons to map integer indices to dense vectors, bi-directional 
# LSTM layer with 150 neurons and Dense layer with unit=total words number. Take categorical_crossentropy as a loss function and
# accuracy as a metric. 

model = Sequential()
model.add(Embedding(total_words, 100, input_length=len_sequence-1))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(total_words, activation='softmax'))
# Compile the model with Adam as optimizer with learning_rate=0.01
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 100)            800200    
                                                                 
 bidirectional (Bidirectiona  (None, 300)              301200    
 l)                                                              
                                                                 
 dense (Dense)               (None, 8002)              2408602   
                                                                 
Total params: 3,510,002
Trainable params: 3,510,002
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Fit model on 100 epochs

history = model.fit(x, target, epochs=100, verbose=0)

In [46]:
# Print result

score = model.evaluate(x, target, verbose=0)
print('test loss:', round(score[0], 4))
print('test accuracy:', round(score[1], 4))

test loss: 1.9208
test accuracy: 0.5591


In [None]:
# Save model to be able to call later and predict without fitting

model.save('next_word_prediction.h5')

In [48]:
# Perform a test on a random phrase from the text

test_phrase = "from each other own complete happiness and the home centred interests"
test_without_target = ' '.join(test_phrase.split()[:-1])
print(test_without_target)
# original next word is 'interests' 
token_list = tokenizer.texts_to_sequences([' '.join(test_without_target.split()[-4:])])[0]
token_array = np.asarray(token_list)
token_array = np.expand_dims(token_array, axis=1)
token_array = token_array.reshape(1, 4)
pred = model.predict(token_array, verbose=0)
pred = np.argmax(pred[0])
next_word = ''
for word, index in tokenizer.word_index.items():
    if index == pred:
        next_word = word
        break
complemented_phrase = test_without_target + ' ' + next_word
print('Next word:', next_word)
print(f'\nTest phrase:\n"{test_phrase}"')
print(f'\nComplemented phrase:\n"{test_phrase}"')

from each other own complete happiness and the home centred
Next word: interests

Test phrase:
"from each other own complete happiness and the home centred interests"

Complemented phrase:
"from each other own complete happiness and the home centred interests"
