In [92]:
import numpy as np
from keras.utils.data_utils import get_file
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from keras.optimizers import RMSprop
import sys
import random
import math
import re
import pandas as pd
import ezodf

In [93]:
def read_ods(filename, sheet_no=0, header=0):
    """Load one sheet of an OpenDocument spreadsheet into a DataFrame.

    Parameters
    ----------
    filename : path of the ``.ods`` document, passed to ``ezodf.opendoc``.
    sheet_no : index of the sheet to read (default: first sheet).
    header : row index, within every column, of the cell whose value
        becomes the column name; the cells after it become the data.

    Returns
    -------
    pandas.DataFrame with one column per spreadsheet column.
    """
    sheet = ezodf.opendoc(filename=filename).sheets[sheet_no]

    frame_data = {}
    for column in sheet.columns():
        column_name = column[header].value
        frame_data[column_name] = [cell.value for cell in column[header + 1:]]

    return pd.DataFrame(frame_data)
# Read the spreadsheet and build the training text: every row contributes
# "<act> <sentence>", and each of those pieces is followed by a single space.
x = read_ods(filename="sentenceGenerator.ods")
sentences = x["act"] + " " + x["sentence"]
modSentence = "".join(eachSentence + " " for eachSentence in sentences)
print(modSentence)

greet Hey there! greet Hi greet Hey how may I Help you greet Hello bye Good bye bye Thank you. Take care good bye. request can I have your request number. request Can you please provide me your request number. request I need your request number to go ahead with your request 


### Load Text File and Build Vocabulary

In [94]:
# Corpus file name; it is not read anywhere in the cells shown here -- the text
# comes from the spreadsheet sentences collected in `modSentence`.
data_path = "test_data_long.txt"
raw_text = modSentence.lower()

# Tokenise into words plus a few punctuation marks (! . , ;) and newlines.
# A raw string is used so the regex escapes reach `re` untouched instead of
# being (invalid) Python string escapes.
pattern = re.compile(r'[a-z]+|!|\n|\.|,|;')
all_words = re.findall(pattern, raw_text)

# Sorted so that the word <-> index mapping is deterministic between runs.
unique_words = sorted(set(all_words))

# word -> integer id, used to one-hot encode the model input
word_to_int = {word: index for index, word in enumerate(unique_words)}

print(word_to_int)

# integer id -> word, used to turn predicted indices back into readable words
int_to_word = {index: word for index, word in enumerate(unique_words)}

total_num_words = len(all_words)
len_vocab = len(unique_words)

print("Total number of words:\t" + str(total_num_words))
print("Length of vocabulary:\t" + str(len_vocab))

{'!': 0, '.': 1, 'ahead': 2, 'bye': 3, 'can': 4, 'care': 5, 'go': 6, 'good': 7, 'greet': 8, 'have': 9, 'hello': 10, 'help': 11, 'hey': 12, 'hi': 13, 'how': 14, 'i': 15, 'may': 16, 'me': 17, 'need': 18, 'number': 19, 'please': 20, 'provide': 21, 'request': 22, 'take': 23, 'thank': 24, 'there': 25, 'to': 26, 'with': 27, 'you': 28, 'your': 29}
Total number of words:	57
Length of vocabulary:	30


### Define Model

I will use a single hidden LSTM layer with 256 memory units, followed by dropout with a probability of 20%. The dense layer uses a softmax activation to output, for each word in the vocabulary, a probability between 0 and 1 that it is the next word.

In [95]:
# Hyper-parameters
sequence_length = 2      # number of words fed to the network per prediction
learning_rate = 0.01
num_memory_units = 256   # size of the LSTM layer

optimizer = RMSprop(lr=learning_rate)

# One-hot word windows -> LSTM -> dropout -> softmax over the vocabulary.
model = Sequential([
    LSTM(num_memory_units, input_shape=(sequence_length, len_vocab)),
    Dropout(0.2),
    Dense(len_vocab),
    Activation('softmax'),
])

model.compile(optimizer=optimizer, loss='categorical_crossentropy')

"We are not interested in the most accurate (classification accuracy) model of the training dataset. This would be a model that predicts each character in the training dataset perfectly. Instead we are interested in a generalization of the dataset that minimizes the chosen loss function. We are seeking a balance between generalization and overfitting but short of memorization."

## Predict

In [96]:
def add_temperature(predictions, temperature=1.0):
    """Sample an index from ``predictions`` after temperature re-scaling.

    The probabilities are raised to the power ``1 / temperature`` and
    re-normalised, then a single index is drawn from the resulting
    distribution. Temperatures below 1 sharpen the distribution, temperatures
    above 1 flatten it.

    Parameters
    ----------
    predictions : array-like of non-negative weights (e.g. a softmax output).
    temperature : non-zero scaling factor.

    Returns
    -------
    Index (NumPy integer) of the sampled entry.

    Raises
    ------
    ValueError
        If ``temperature`` is zero.
    """
    if temperature == 0:
        raise ValueError("temperature must be non-zero")

    probabilities = np.asarray(predictions).astype('float64')

    # log(0) is a legitimate -inf here (that entry simply gets probability 0),
    # so silence the divide-by-zero warning NumPy would otherwise emit.
    with np.errstate(divide='ignore'):
        logits = np.log(probabilities) / temperature

    # Shift by the maximum before exponentiating: the normalised result is
    # mathematically unchanged, but the largest term becomes exp(0) == 1, so a
    # small temperature can no longer underflow every term to 0 (-> 0/0 NaN).
    logits = logits - np.max(logits)
    weights = np.exp(logits)
    probabilities = weights / np.sum(weights)

    # One multinomial trial yields a one-hot row; argmax recovers its index.
    draw = np.random.multinomial(1, probabilities, 1)
    return np.argmax(draw)

In [97]:
def get_random_word(n=1, array=unique_words):
    """Return ``n`` distinct, randomly chosen elements of ``array``.

    Parameters
    ----------
    n : number of elements to pick.
    array : sequence to pick from (defaults to the vocabulary list).

    Returns
    -------
    A new list with ``n`` elements of ``array`` in random order.

    Raises
    ------
    ValueError
        If ``n`` is negative or larger than ``len(array)``.
    """
    # random.sample draws without replacement into a fresh list, so the
    # caller's sequence (by default the shared vocabulary list) is left
    # untouched instead of being shuffled in place.
    return random.sample(array, n)

In [98]:
# Restore the weights stored in the checkpoint file, then compile the model
# again, this time with the Adam optimizer.
weights_path = 'weights_epoch-27_loss-0.4866659641265869.hdf5'
model.load_weights(filepath=weights_path)
model.compile(optimizer='adam', loss='categorical_crossentropy')

### Handle User Input

In [99]:
# Seed selection switch: False -> the seed sentence is built from `user_input`;
# True -> a random slice of the corpus (`all_words`) is used instead.
random_seed = False

In [141]:
# user_input = "this is some string entered by a user this is some string entered by a user this is some string entered by a user"
user_input = "request"
words_to_generate = 6

if not random_seed:

    # Tokenise the input the same way the vocabulary was built: lower-case
    # first, then keep words and the punctuation marks ! . , ; plus newlines.
    pattern = re.compile(r'[a-z]+|!|\n|\.|,|;')
    user_input_edit = re.findall(pattern, user_input.lower())

    if len(user_input_edit) >= sequence_length:
        # enough tokens (possibly too many): keep the first sequence_length
        seed_sentence = user_input_edit[:sequence_length]
    else:
        # too few tokens: left-pad with random vocabulary words so the seed is
        # exactly sequence_length long and ends with the user's words
        missing_elems = sequence_length - len(user_input_edit)
        seed_sentence = get_random_word(missing_elems) + user_input_edit

    # every seed word must be encodable; swap unknown words for random known ones
    for i, word in enumerate(seed_sentence):
        if word not in word_to_int:
            seed_sentence[i] = get_random_word()[0]

    print('-> seed: "' + user_input + '" ...\n')

else:

    # pick a random window of sequence_length consecutive corpus words
    random_index_start = np.random.randint(0, total_num_words - sequence_length - 1)
    seed_sentence = all_words[random_index_start : random_index_start + sequence_length]

    print('-> seed: "' + ' '.join(seed_sentence) + '" ...')

-> seed: "request" ...



In [142]:
# Generate words one at a time, sliding the seed window forward after each one.
for i in range(words_to_generate):

    # one-hot encode the current seed window
    x_input = np.zeros((1, sequence_length, len_vocab))
    for position, word in enumerate(seed_sentence):
        x_input[0, position, word_to_int[word]] = 1.

    # sample the next word from the temperature-adjusted prediction
    predictions = model.predict(x_input, verbose=0)[0]
    predicted_word_index = add_temperature(predictions, 0.5)
    predicted_word = int_to_word[predicted_word_index]

    # drop the oldest word and append the new one
    seed_sentence = seed_sentence[1:] + [predicted_word]

    # words get a leading space, punctuation is attached directly
    separator = " " if re.match('[a-z]', predicted_word) else ""
    print(separator + predicted_word, end="", flush=True)

 i have your request number.