In [3]:
import pandas as pd
import re
import requests
from bs4 import BeautifulSoup
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import keras
from tensorflow import keras
from tensorflow.keras.layers import *
import tensorflow as tf
import numpy as np

In [4]:
from tensorflow.keras.models import model_from_json

In [5]:
def slicer_front(my_str,sub):
    """Trim unwanted leading text (introduction, etc.) from a string.

    Returns ``my_str`` starting at the first occurrence of ``sub``; the
    marker ``sub`` itself is kept in the result.

    Raises:
        Exception: if ``sub`` does not occur in ``my_str``.
    """
    start = my_str.find(sub)
    if start == -1:
        raise Exception('Sub string not found!')
    return my_str[start:]

In [6]:
def slicer_back(my_str,sub):
    """Trim unwanted trailing text (conclusion, etc.) from a string.

    Returns the part of ``my_str`` that precedes the first occurrence of
    ``sub``; the marker ``sub`` itself is dropped.

    Raises:
        Exception: if ``sub`` does not occur in ``my_str``.
    """
    cut = my_str.find(sub)
    if cut == -1:
        raise Exception('Sub string not found!')
    return my_str[:cut]

In [7]:
# Write a function to read in data from a URL
def get_soup(target_url, timeout=30):
    """Download ``target_url`` and parse the response body with BeautifulSoup.

    Args:
        target_url: Address of the page or text file to fetch.
        timeout: Seconds to wait for the server before giving up, so that a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A ``BeautifulSoup`` object built from the response text using the
        ``html.parser`` backend.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if no response arrives within ``timeout`` seconds.
    """
    r = requests.get(target_url, timeout=timeout)
    # Fail here instead of parsing an error page as if it were the book text.
    r.raise_for_status()
    return BeautifulSoup(r.text, "html.parser")

In [8]:
# Fetch Project Gutenberg e-text 21700 and keep only the text between the two marker strings
DJ = get_soup('http://www.gutenberg.org/cache/epub/21700/pg21700.txt')
# Clean preface, conclusion and titles
DJ_txt = slicer_front(DJ.get_text(), 'I want a hero: an uncommon want,')  # delete the preface
DJ_txt = slicer_back(DJ_txt, 'End of the Project Gutenberg EBook')  # delete the conclusion
# From here on DJ holds the list of blank-line-separated chunks, no longer the parsed page
DJ = DJ_txt.split('\r\n\r\n')
# Keep only chunks longer than 50 and shorter than 500 characters
DJ_c = [chunk for chunk in DJ if 50 < len(chunk) < 500]
len(DJ_c)

1991

In [9]:
# Fetch Project Gutenberg e-text 38520
JRL = get_soup('http://www.gutenberg.org/files/38520/38520-0.txt')
# Clean preface, conclusion and titles
JRL_txt = slicer_back(slicer_front(JRL.get_text(),'If some small savor creep into my rhyme'),'But is lord of the earldom as much as he.') # Delete the preface and conclusion
# The titles are all capitalized: replace each run of capitals/digits/non-word
# characters that ends in a blank line with the five-CRLF poem separator.
# Raw strings keep \d, \W and \. as regex escapes rather than (invalid) Python string escapes.
JRL_txt = re.sub(r'[A-Z \d\W]+\r\n\r\n','\r\n\r\n\r\n\r\n\r\n',JRL_txt) # Clean the titles
JRL_txt = re.sub(r'[1-9]\d*\.','',JRL_txt) # Clean the numbers
# Split the text into individual poems on the five-CRLF separator
JRL = JRL_txt.split('\r\n\r\n\r\n\r\n\r\n')
# Keep only pieces longer than 50 and shorter than 500 characters
JRL_c = [i for i in JRL if len(i)>50 and len(i)<500]
len(JRL_c)

851

In [10]:
# Fetch Project Gutenberg e-text 19188
CGR = get_soup('http://www.gutenberg.org/cache/epub/19188/pg19188.txt')
# Clean preface, conclusion and titles
CGR_txt = slicer_back(slicer_front(CGR.get_text(),'Morning and evening'),'We trust to Thee.') # Delete the preface and conclusion

# Some of the titles are capitalized, some of the titles are numbers.
# Get rid of capitalized words and numbers; both are replaced by the five-CRLF
# separator so that each title position becomes a split point.
# Raw strings keep \d, \W and \. as regex escapes rather than (invalid) Python string escapes.
CGR_txt = re.sub(r'[A-Z \d\W]+\r\n\r\n','\r\n\r\n\r\n\r\n\r\n',CGR_txt) # Clean the titles
CGR_txt = re.sub(r'[1-9]\d*\.','\r\n\r\n\r\n\r\n\r\n',CGR_txt) # Clean the numbers
# Split the text into individual poems on the five-CRLF separator
CGR = CGR_txt.split('\r\n\r\n\r\n\r\n\r\n')
# Keep only pieces longer than 50 and shorter than 500 characters
CGR_c = [i for i in CGR if len(i)>50 and len(i)<500]
len(CGR_c)

1427

In [11]:
# Merge the three cleaned collections into one training corpus (JRL first, then DJ, then CGR)
corpus = [*JRL_c, *DJ_c, *CGR_c]
len(corpus)

4269

In [12]:
# Build the word -> integer-id vocabulary from every text in the corpus
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# One more than the number of distinct words; presumably the extra slot covers
# id 0, which the Keras Tokenizer never assigns to a word — confirm against the Keras docs
total_words = len( tokenizer.word_index ) + 1
total_words

20743

In [16]:
# Turn every text into all of its prefixes of length >= 2 (as token ids):
# the last id of each prefix is later split off as the word to predict.
input_sequences = []

for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

In [17]:
# Length of every n-gram sequence, and the longest one (used to size the padding
# and the model input).
# A comprehension is used on purpose: a plain `for x in ...` loop would leave a
# notebook-global `x` behind and clobber the training matrix `x` if this cell is
# re-run after the padding cell.
sequence_lengths = [len(sequence) for sequence in input_sequences]
max_sequence_len = max(sequence_lengths)
max_sequence_len

103

In [18]:
# Left-pad every sequence with zeros to a common width of max_sequence_len + 1.
# Because that width exceeds the longest sequence, each row gets at least one leading zero.
# Note: this rebinds input_sequences from a list of lists to a 2-D array.
input_sequences = np.array(pad_sequences(input_sequences,
                                         maxlen=max_sequence_len+1, padding='pre'))
# All columns but the last are the model input (max_sequence_len wide);
# the last column is the id of the word to predict.
x, y = input_sequences[:, :-1], input_sequences[:, -1]
# One-hot encode the targets: one row of width total_words per training sample.
# NOTE(review): this dense matrix can be very large for a big vocabulary — confirm it fits in memory.
y = keras.utils.to_categorical(y, num_classes=total_words)

In [19]:
# Hyper-parameters shared by the layers below
dropout_rate = 0.2
activation_func = keras.activations.relu

# Layer stack for next-word prediction, passed in order to keras.Sequential
SCHEMA = [

    # Token ids -> 128-dimensional vectors; expects inputs max_sequence_len wide
    Embedding( total_words ,128, input_length=max_sequence_len ),
    # Single recurrent layer with 64 units
    LSTM( 64 ) ,
    Dropout(dropout_rate),
    Dense( 64 , activation=activation_func ) ,
    Dropout(dropout_rate),
    # One output per vocabulary id, normalised with softmax
    Dense( total_words, activation=tf.nn.softmax )

]
model = keras.Sequential(SCHEMA)
# Categorical cross-entropy matches the one-hot targets produced by to_categorical
model.compile(
    optimizer=keras.optimizers.Adam() ,
    loss=keras.losses.categorical_crossentropy ,
    metrics=[ 'accuracy' ]
)
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 103, 128)          2655104   
_________________________________________________________________
lstm (LSTM)                  (None, 64)                49408     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 64)                4160      
_________________________________________________________________
dropout_1 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)      

In [20]:
from tensorflow.keras.callbacks import ModelCheckpoint
# Callback list handed to model.fit (note: `checkpoint` is a list, not a single callback).
# period=5 presumably means "save every 5 epochs" — confirm against the ModelCheckpoint docs.
# The file name has no formatting placeholder, so each save goes to the same path.
checkpoint = [ModelCheckpoint(filepath='2_TY_1Layer_LongCorp.hdf5',period=5)]

In [22]:
# Train on the padded prefixes (x) and one-hot next-word targets (y) for 15 epochs
# in mini-batches of 32; `checkpoint` is the callback list built in the previous cell.
# The returned History object is kept so the per-epoch metrics can be exported later.
history = model.fit(
    x,
    y,
    batch_size=32 ,
    epochs=15,
    callbacks=checkpoint
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [24]:
# Save the model in two parts: architecture as JSON, weights as HDF5
# NOTE(review): the file names say "4629" while the corpus size printed earlier is 4269 —
# possibly a transposed digit; the load cell uses the same names, so rename both together if at all.
model_json = model.to_json()
with open("2_TY_1Layer_4629_LongerCorp.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("2_TY_1Layer_4629_LongerCorp.h5")
print("Saved model to disk")

Saved model to disk


In [25]:
# Save the per-epoch accuracy and loss recorded by model.fit to a CSV file.
# Depending on the Keras version, the 'accuracy' metric is logged under the key
# 'acc' or 'accuracy'; use whichever is present (a KeyError still surfaces if neither is).
result = pd.DataFrame({
    'accuracy': history.history['acc' if 'acc' in history.history else 'accuracy'],
    'loss': history.history['loss'],
})
result.to_csv(r'2_TY_1Layer_4629_LongerCorp_Accuracy.csv')

In [14]:
# Load Model
# Import from tensorflow.keras, like the rest of the notebook, instead of the
# standalone `keras` package, so saving and loading use the same implementation.
from tensorflow.keras.models import model_from_json
# Load the JSON architecture and create the model; the context manager closes
# the file even if reading fails.
with open('2_TY_1Layer_4629_LongerCorp.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)
# Load the trained weights into the new model
loaded_model.load_weights("2_TY_1Layer_4629_LongerCorp.h5")
print("Loaded model from disk")

Loaded model from disk


In [24]:
def predict(seed_text , seed=10 ):
    """Extend ``seed_text`` by repeatedly appending the model's most likely next word.

    Relies on the notebook globals ``tokenizer``, ``max_sequence_len`` and
    ``loaded_model``.

    Args:
        seed_text: Starting text; it is tokenised with ``tokenizer``.
        seed: Number of words to append (despite the name, this is not a
            random seed). Values <= 0 return ``seed_text`` unchanged.

    Returns:
        ``seed_text`` followed by one space plus the predicted word for each
        iteration. If the predicted id has no entry in the vocabulary, an
        empty word is appended (i.e. just the space).
    """
    # Build the id -> word lookup once instead of scanning the vocabulary for every word.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range( seed ):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len, padding='pre')
        # argmax over the softmax output gives the same class id as the older
        # Sequential.predict_classes helper, which newer TensorFlow releases no longer provide.
        probabilities = loaded_model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])
        output_word = index_to_word.get(predicted, "")
        seed_text += " " + output_word

    return seed_text

# Ask for the starter text first, then the number of words to generate, and print the result
starter_text = input( 'Enter some starter text ( I want ... ) : ')
n_words = int( input( 'Enter the desired length of the generated sentence : '))
print(predict(starter_text, n_words))

Enter some starter text ( I want ... ) : How 
Enter the desired length of the generated sentence : 8
How  the world was a world of the world
