In [1]:
import re
import numpy as np
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Dropout
from keras.layers import LSTM
from keras.optimizers import RMSprop
import math
import random
import sys
import matplotlib.pyplot as plt
from keras.utils import plot_model

Using TensorFlow backend.


In [None]:
import pandas as pd
import ezodf

def read_ods(filename, sheet_no=0, header=0):
    """Load one sheet of an OpenDocument spreadsheet into a DataFrame.

    Args:
        filename: path of the document, passed to ``ezodf.opendoc``.
        sheet_no: index of the sheet to read within the document.
        header: row index whose cell values become the column names;
            only the rows after it are used as data.

    Returns:
        A ``pandas.DataFrame`` with one column per sheet column.
    """
    sheet = ezodf.opendoc(filename=filename).sheets[sheet_no]

    frame_data = {}
    for column in sheet.columns():
        column_name = column[header].value
        body_cells = column[header + 1:]
        frame_data[column_name] = [cell.value for cell in body_cells]

    return pd.DataFrame(frame_data)
# NOTE(review): read_ods() declares `filename` with no default, so this call
# raises TypeError as written -- the spreadsheet path still needs to be passed.
# The name `x` is also reassigned later by the one-hot encoding step.
x =  read_ods()

### Load File and Build Vocabulary

In [2]:
data_path = "test_data_long.txt"

# Read the whole corpus lower-cased; the context manager closes the file
# handle as soon as the text is in memory.
with open(data_path) as corpus_file:
    raw_text = corpus_file.read().lower()

# Tokenize into words plus a small set of punctuation tokens: ! newline . , ;
# A raw string is used because '\!' is not a valid escape in a normal literal.
pattern = re.compile(r'[a-z]+|!|\n|\.|,|;')
all_words = re.findall(pattern, raw_text)

# Vocabulary: every distinct token, sorted so that the index assignment is
# reproducible from run to run.
unique_words = sorted(set(all_words))

# token -> integer index, used when one-hot encoding the training data
word_to_int = {word: index for index, word in enumerate(unique_words)}

# integer index -> token, used to turn model predictions back into text
int_to_word = dict(enumerate(unique_words))

total_num_words = len(all_words)
len_vocab = len(unique_words)

print("Total number of words:\t" + str(total_num_words))
print("Length of vocabulary:\t" + str(len_vocab))

Total number of words:	100440
Length of vocabulary:	5686


### Create Training Data from Text File

In [3]:
# Number of consecutive words fed to the model per training pattern,
# i.e. each pattern consists of `sequence_length` time steps.
sequence_length = 15
# Distance (in words) between the starts of two consecutive windows.
step_window = 3

# Slide a window over the token list. The stop value guarantees that the
# word directly after every window exists and can serve as its target.
window_starts = range(0, total_num_words - sequence_length, step_window)

# x_data: list of word lists (the input windows), still as strings
x_data = [all_words[start : start + sequence_length] for start in window_starts]
# y_data: the word that follows each window (the prediction target)
y_data = [all_words[start + sequence_length] for start in window_starts]

num_train_patters = len(x_data)
print('Total patterns:\t' + str(num_train_patters))

Total patterns:	33475


### Prepare Training and Testing Data

In [5]:
# One-hot encode the input windows and their target words.
#
# The arrays are allocated as booleans (1 byte per element) rather than
# NumPy's default float64 (8 bytes per element): the dense tensor of shape
# (patterns, sequence_length, len_vocab) is by far the largest object in this
# notebook, and the float64 allocation is what ran out of memory.
x = np.zeros((num_train_patters, sequence_length, len_vocab), dtype=bool)
y = np.zeros((num_train_patters, len_vocab), dtype=bool)

for i, (sentence, next_word) in enumerate(zip(x_data, y_data)):
    # mark the vocabulary slot of the word at every time step of the window
    for t, word in enumerate(sentence):
        x[i, t, word_to_int[word]] = True
    # mark the vocabulary slot of the word that follows the window
    y[i, word_to_int[next_word]] = True

MemoryError: 

### Define Model

In [24]:
# Hyper-parameters of the network and its optimizer.
learning_rate = 0.01
num_memory_units = 256
optimizer = RMSprop(lr=learning_rate)

# Single LSTM layer over the one-hot word windows, followed by dropout and a
# softmax over the vocabulary that scores every candidate next word.
# (A stacked variant would use return_sequences=True on the first LSTM and
# add a second LSTM + Dropout pair before the Dense layer.)
model = Sequential([
    LSTM(num_memory_units, input_shape=(sequence_length, len_vocab)),
    Dropout(0.2),
    Dense(len_vocab),
    Activation('softmax'),
])

model.compile(optimizer=optimizer, loss='categorical_crossentropy')

In [6]:
# Print the layer-by-layer summary of `model`; the model-definition cell has
# to be executed first, otherwise `model` does not exist yet.
model.summary()

NameError: name 'model' is not defined

### Train Model

In [7]:
def add_temperature(predictions, temperature=1.0):
    """Sample one class index from a probability vector after temperature scaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalized, then a single index is drawn from that distribution.
    Temperatures below 1 sharpen the distribution towards the most likely
    class; temperatures above 1 flatten it.

    Args:
        predictions: 1-D array-like of non-negative class probabilities.
        temperature: positive scaling factor.

    Returns:
        The integer index of the sampled class.

    Raises:
        ValueError: if ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive, got {!r}".format(temperature))

    probs = np.asarray(predictions, dtype='float64')

    # log(0) is -inf, which maps back to probability 0 after exp(); silence
    # the divide-by-zero warning NumPy would otherwise emit for it.
    with np.errstate(divide='ignore'):
        scaled_logits = np.log(probs) / temperature

    # Softmax is invariant to a constant shift. Moving the largest logit to 0
    # keeps exp() from underflowing to all zeros (0/0 -> NaN) at low temperature.
    scaled_logits = scaled_logits - np.max(scaled_logits)
    weights = np.exp(scaled_logits)
    probs = weights / np.sum(weights)

    # one multinomial trial yields a one-hot row; argmax recovers its index
    draw = np.random.multinomial(1, probs, 1)
    return np.argmax(draw)

In [8]:
num_iterations = 50
batch_size = 128
words_to_generate = 300

prev_loss = math.inf
loss_history = []
# The three lists below are created here but never filled by this cell
# (fit() is called without metrics or validation data).
accuracy_history = []

val_loss_history = []
val_accuracy_history = []

# Train one epoch at a time so that sample text can be generated after each one.
for epoch in range(num_iterations):

    print('\n' + '-'*10 + ' epoch ' + str(epoch+1) + '/' + str(num_iterations) + ' ' + '-'*10)

    history = model.fit(x, y, batch_size=batch_size, epochs=1)

    curr_loss = history.history['loss'][0]
    loss_history.append(curr_loss)

    # save weights if loss improves
    if curr_loss < prev_loss:
        print("Loss improved from " + str(prev_loss) + " to " + str(curr_loss) + ". Saving weights.")
        model.save_weights('weights_epoch-{}_loss-{}.hdf5'.format(epoch, curr_loss))
        prev_loss = curr_loss

    # pick a random window of the corpus as the seed for text generation
    start_index = random.randint(0, total_num_words - sequence_length - 1)
    seed_sentence = all_words[start_index : start_index + sequence_length]

    print('\n-> seed: "' + ' '.join(seed_sentence) + '" ...\n')

    # The generation loop uses its own counter. Reusing the epoch variable
    # here made the "last epoch" check below compare against the word index.
    for _ in range(words_to_generate):

        # one-hot encode the current window as a batch of size 1
        x_input = np.zeros((1, sequence_length, len_vocab))
        for t, word in enumerate(seed_sentence):
            x_input[0, t, word_to_int[word]] = 1.

        predictions = model.predict(x_input, verbose=0)[0]

        # keep the prediction vector produced during the final epoch
        if epoch == num_iterations - 1:
            final_predicted = predictions

        # greedy decoding; swap in add_temperature(predictions, 0.5) to sample instead
        predicted_word_index = np.argmax(predictions)
        predicted_word = int_to_word[predicted_word_index]

        # slide the window: drop the oldest word, append the new one
        seed_sentence = seed_sentence[1:] + [predicted_word]

        # words are separated by a space, punctuation is attached directly
        if re.match('[a-z]', predicted_word):
            sys.stdout.write(" " + predicted_word)
        else:
            sys.stdout.write(predicted_word)

        sys.stdout.flush()

    print()


---------- iteration 1/10 ----------
Epoch 1/1
Loss improved from inf to 3.55284714699. Saving weights.

-> seed: "worried look on his usually" ...

 usually cheerful cheerful.. his he he he a a, he he he over the the second second before before it it to to... his he he he a a, he he he over the the second second before before it it to to... his he he he a a, he he he over the the second second before before it it to to... his he he he a a, he he he over the the second second before before it it to to... his he he he a a, he he he over the the second second before before it it to to... his he he he a a, he he he over the the second second before before it it to to... his he he he a a, he he he over the the second second before before it it to to... his he he he a a, he he he over the the second second before before it it to to... his he he he a a, he he he over the the second second before before it it to to... his he he he a a, he he he over the the second second before before it it 

In [11]:
# Evaluate on the same x / y arrays that were passed to fit(), so this is the
# loss on the training data, not on held-out data.
loss = model.evaluate(x, y, batch_size=batch_size, verbose=1)
print("loss: ", loss)

loss:  0.0209426973015


In [19]:
# Dump the per-epoch training losses collected by the training loop.
print('loss history:')
print(loss_history)

# Disabled plotting code, kept for reference: it would draw loss_history as a
# line chart and write the figure to loss.png.
# plt.figure(figsize=(15,8))
# plt.rc('font', size=20)
# plt.plot(loss_history, lw=3, c='orange')
# plt.title('Cross Entropy Loss of LSTM Model over Epoch Iterations', fontsize=25)
# plt.ylabel('Loss')
# plt.xlabel('Epochs')
# plt.savefig("loss.png")
# plt.grid()
# plt.show()

loss history:
[3.552847146987915, 2.9307136535644531, 3.2151951789855957, 1.755587100982666, 0.70277565717697144, 0.50920349359512329, 0.28506067395210266, 0.21564257144927979, 0.087182797491550446, 0.042824957519769669]


In [22]:
# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 5, 256)            299008    
_________________________________________________________________
dropout_2 (Dropout)          (None, 5, 256)            0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dropout_3 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 35)                4515      
_________________________________________________________________
activation_2 (Activation)    (None, 35)                0         
Total params: 500,643
Trainable params: 500,643
Non-trainable params: 0
_________________________________________________________________
None

In [25]:
# Write a diagram of the model architecture to model_plot.png.
plot_model(model, to_file='model_plot.png')