In [1]:
import tensorflow.keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import InputLayer

from tensorflow.keras.optimizers import Adam

from tensorflow.keras.utils import to_categorical

import numpy as np
import nltk
from nltk.tokenize import RegexpTokenizer
import matplotlib.pyplot as plt
import random

from tqdm.notebook import tqdm as log_progress

In [2]:
#load and clean data
#Reads the corpus, tokenizes it into lowercase words, builds the vocabulary and
#turns the token stream into (input window, next word) training pairs.
#Names defined here and used by later cells: SEQUENCE_LENGTH, words, find, x, y, vocab_size.
data_file_name = "a-great-cloud-of-witnesses-the-catholic-churchs-experience-i.epub.txt"
SEQUENCE_LENGTH = 10 #how many items are fed into the ai per sequence

lines = []
print("Loading data...")
#the context manager closes the file even if reading fails part-way through
with open("data/" + data_file_name, "r", encoding='utf-8') as raw:
    for line in log_progress(raw):
        if line != '\n': #skip blank lines
            lines.append(line.lower())

print("Tokenizing...")
#putting the entirety of the input data into one string (join is linear, += in a loop is not)
text = "".join(lines)
#tokenizer = RegexpTokenizer(r'[a-zA-Z]')
tokenizer = RegexpTokenizer(r'\w+')
tokenized = tokenizer.tokenize(text)

#converting the text into numbers to be processed by the embedding layer of the model
words = [] #one of each word in tokenized will be in here, in order of first appearance
word_to_index = {} #word -> position in `words`, so lookups do not scan the list


def filtering(word):
    """Return True when `word` is not in the vocabulary yet and should be added."""
    return word not in word_to_index


def find(word):
    """Convert a word into its vocabulary index.

    Words that are not in the vocabulary get the index len(words), i.e. one past
    the last valid entry of `words`.
    """
    return word_to_index.get(word, len(words))


print("Finding every unique word...")
for word in log_progress(tokenized):
    if filtering(word):
        word_to_index[word] = len(words)
        words.append(word)

vocab_size = len(words)

#x data is a window of SEQUENCE_LENGTH consecutive words
#y data is the single word that comes directly after the corresponding window
x = []
y = []
print("Compiling dataset...")
#windows do not overlap; the range stops early enough that every window is complete
#and the word right after it exists, so no padding or wrap-around target is needed
for start in log_progress(range(0, len(tokenized) - SEQUENCE_LENGTH, SEQUENCE_LENGTH)):
    window = tokenized[start:start + SEQUENCE_LENGTH]
    x.append([find(word) for word in window])
    y.append(find(tokenized[start + SEQUENCE_LENGTH]))

if not x:
    raise ValueError(
        "Not enough text: need more than %d tokens, found %d" % (SEQUENCE_LENGTH, len(tokenized))
    )

x = np.asarray(x, np.int32)
y = np.asarray(y, np.int32)

x = np.reshape(x, (x.shape[0], SEQUENCE_LENGTH, 1))
#fix the one-hot width to the vocabulary size so that column i always means words[i],
#even when the highest-indexed words never occur as a target
y = to_categorical(y, num_classes=vocab_size)

print(x.shape)
print(y.shape)

Loading data...


0it [00:00, ?it/s]

Tokenizing...


  0%|          | 0/1577 [00:00<?, ?it/s]

Finding every unique word...


  0%|          | 0/82928 [00:00<?, ?it/s]

Compiling dataset...


  0%|          | 0/8292 [00:00<?, ?it/s]

(8292, 10, 1)
(8292, 7411)


In [3]:
#create and compile model
#Next-word classifier: word indices -> embedding -> LSTM -> dense -> softmax over the classes in y.
model = Sequential()

#embedding: number of distinct indices, dimension of each value
#find() returns the index vocab_size for words outside the vocabulary, so one extra
#row is reserved to keep that index inside the embedding table
model.add(Embedding(vocab_size + 1, 24, input_length=SEQUENCE_LENGTH))
model.add(LSTM(40, dropout=0.01))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.05))
#one output unit per column of the one-hot encoded targets
model.add(Dense(y.shape[1], activation='softmax'))

opt = Adam(learning_rate=0.001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])
#summary() prints the table itself and returns None, so it must not be wrapped in print()
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 10, 32)            248896    
_________________________________________________________________
lstm (LSTM)                  (None, 40)                11680     
_________________________________________________________________
dense (Dense)                (None, 32)                1312      
_________________________________________________________________
dropout (Dropout)            (None, 32)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 7411)              244563    
Total params: 506,451
Trainable params: 506,451
Non-trainable params: 0
_________________________________________________________________
None


In [34]:
#train model
#Fits the model on the whole dataset, saves it for the test cell and plots the training loss.
hist = model.fit(x, y, epochs=200, verbose=1, batch_size=100)
model.save("TestModel.h5")

#draw the curve first: a legend created before any line exists stays empty
plt.plot(hist.history['loss'], label='loss')
#hist.history only has a 'val_loss' entry when fit() is given validation data
#plt.plot(hist.history['val_loss'], label='val_loss')
plt.title("loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.show()

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200

KeyboardInterrupt: 

In [4]:
#test model
#Loads the saved model and greedily generates sample_length words, feeding each
#prediction back in as the newest word of the input window.
sample_length = 100
model = load_model("TestModel.h5")

#get input sentence and process
sentence = input("Enter first 10 words...").lower()
tokenizer = RegexpTokenizer(r'\w+')
#encode every word of the prompt, not just the first one
root = [find(token) for token in tokenizer.tokenize(sentence)]
#find() gives unknown words the index len(words), which has no entry in `words`; drop them
root = [index for index in root if index < len(words)]
if len(root) < SEQUENCE_LENGTH:
    raise ValueError(
        "Need at least %d words from the training vocabulary, got %d" % (SEQUENCE_LENGTH, len(root))
    )
#the model takes exactly SEQUENCE_LENGTH words; keep the most recent ones
root = root[-SEQUENCE_LENGTH:]

generated = []

for i in log_progress(range(sample_length)):
    tmp = np.array(root, dtype=np.int32)
    tmp = np.reshape(tmp, (1, SEQUENCE_LENGTH, 1))
    pred = model.predict(tmp, verbose=0)
    #greedy choice: the batch holds one row, so the flat argmax is the class index
    next_word = int(pred.argmax())
    #sampling alternative to the greedy choice:
    #next_word = np.random.choice(len(pred[0]), p=pred[0])
    generated.append(words[next_word])
    #slide the window: drop the oldest word and append the prediction
    root.pop(0)
    root.append(next_word) #setting the next word

output = " ".join(generated)
print(output)

  0%|          | 0/100 [00:00<?, ?it/s]

young have which church from from an with of of in the to the alive church church more of in in in in in in the state no the up instead please all live live live is is is is is is we we is we during during faith live live live we we we we we we we we saying warfare its its us its were were the the the in from his how his come was to to gaza i no between between parents his of of of of by one remind political israeli a of of of 
