In [30]:
import numpy as np
import keras
from keras.models import Model, Sequential, load_model
from keras.layers import Embedding, LSTM, Dense, TimeDistributed, Activation
from keras.utils import to_categorical

First, preprocess the data the same way as for the HMM: get the list of known words, then strip punctuation from the start and end of each token.

In [2]:
# Load the vocabulary from the syllable dictionary: the first whitespace-separated
# token on each line is taken as the word.
word_list = []      # words in file order; this order later fixes each word's integer id
dictionary = set()  # the same words as a set, for fast membership tests
with open('data/Syllable_dictionary.txt') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # A blank line carries no word; indexing [0] on it would raise IndexError.
            continue
        word_list.append(fields[0])
        dictionary.add(fields[0])

In [3]:
def remove_punctuation(words, vocabulary=None):
    """Lower-case each token and strip punctuation from its ends until it is a known word.

    A token that is already in the vocabulary is left untouched, so words whose
    spelling includes punctuation (for example a leading or trailing apostrophe)
    survive as long as the vocabulary lists them in that form.

    Parameters
    ----------
    words : list of str
        Tokens to clean. The list is modified in place.
    vocabulary : container of str, optional
        Known words. Defaults to the notebook-level ``dictionary`` set.

    Returns
    -------
    list of str
        The same list object as ``words``, with every entry cleaned.

    Notes
    -----
    If a token cannot be reduced to a known word, stripping stops as soon as
    neither end is a punctuation character (or the token is empty) and the
    partially stripped token is stored. Without this stop condition such a
    token would loop forever or raise IndexError.
    """
    if vocabulary is None:
        # Fall back to the vocabulary built from the syllable dictionary file.
        vocabulary = dictionary
    punctuation = ",.:?;!'\"()"
    for i, word in enumerate(words):
        word = word.lower()

        while word and word not in vocabulary:
            stripped = False

            # Trailing punctuation is removed first ...
            if word[-1] in punctuation:
                word = word[:-1]
                stripped = True

            # ... and leading punctuation only if the token is still unknown,
            # so a known word that begins with an apostrophe is kept intact.
            if word and word[0] in punctuation and word not in vocabulary:
                word = word[1:]
                stripped = True

            if not stripped:
                # Nothing left to strip and still unknown: give up on this token.
                break

        words[i] = word

    return words

Turn the raw data from the file into a flat list of words, treating the newline at the end of each line as its own word.

In [19]:
def get_words_from_data(filename):
    """Read a text file into one flat list of cleaned word tokens.

    Each line is split on whitespace. Lines with zero or one token are skipped
    (presumably blank lines and one-token headings; verify against the data
    file). Every kept line is passed through ``remove_punctuation`` and is
    followed by a ``'\\n'`` token marking the line break.

    Parameters
    ----------
    filename : str
        Path of the text file to read.

    Returns
    -------
    list of str
        Cleaned tokens of all kept lines, in file order, with a ``'\\n'``
        token after each line.
    """
    tokens = []
    with open(filename) as handle:
        for raw_line in handle:
            line_words = raw_line.split()
            if len(line_words) > 1:
                tokens.extend(remove_punctuation(line_words) + ['\n'])
    return tokens

Assign each word a unique integer

In [27]:
def encode_data(data, word_list):
    """Map every word in ``data`` to its integer position in ``word_list``.

    Parameters
    ----------
    data : iterable of str
        Words to encode.
    word_list : sequence of str
        Vocabulary; a word's id is its index here. If a word occurs more than
        once, its last index is used.

    Returns
    -------
    list of int
        One id per word of ``data``, in order.

    Raises
    ------
    KeyError
        If a word of ``data`` is not in ``word_list``.
    """
    word_to_id = {token: index for index, token in enumerate(word_list)}
    return [word_to_id[token] for token in data]

In [20]:
# Read the sonnets into one flat token list (words plus '\n' line markers)
# and report how many tokens there are.
word_data = get_words_from_data('data/shakespeare.txt')
print(len(word_data))

19737


In [32]:
# Put the newline token first so it is encoded as id 0 (decode_sequence relies on
# that); every dictionary word's id is therefore its index in word_list plus one.
word_list2 = ['\n'] + word_list
vocab_size = len(word_list2)

In [31]:
# Replace every token by its integer id and store the ids as a NumPy array so
# that windows can be taken by slicing.
word_data_encoded = encode_data(word_data, word_list2)
word_data_encoded = np.array(word_data_encoded)

Create the training data by taking fixed-length sequences of words; the target for each sequence is the same sequence shifted one word ahead.

In [33]:
def build_training_data(words, vocab_size, skip=3, seq_len=20):
    """Cut an encoded word stream into input windows and one-hot next-word targets.

    For each start position ``s`` the input is ``words[s : s + seq_len]`` and
    the target is the same window shifted one position ahead,
    ``words[s + 1 : s + seq_len + 1]``, one-hot encoded over ``vocab_size``
    classes.

    Parameters
    ----------
    words : sequence of int
        Encoded word ids (sliced, so a list or 1-D array works).
    vocab_size : int
        Number of classes for the one-hot targets.
    skip : int, default 3
        Step between consecutive start positions.
    seq_len : int, default 20
        Length of every window.

    Returns
    -------
    X : numpy.ndarray
        Input windows, one row per start position.
    y : numpy.ndarray
        One-hot targets, one ``(seq_len, vocab_size)`` block per start position.

    Notes
    -----
    Start positions run over ``range(0, len(words) - seq_len - 1, skip)``, so
    the last window that would still fit in ``words`` is not emitted.
    """
    starts = range(0, len(words) - seq_len - 1, skip)
    inputs = [words[s : s + seq_len] for s in starts]
    targets = [
        to_categorical(words[s + 1 : s + seq_len + 1], num_classes=vocab_size)
        for s in starts
    ]
    return np.array(inputs), np.array(targets)

In [34]:
# Use every start position (skip=1) with 20-word windows, then check the
# resulting array shapes.
skip = 1
seq_len = 20
X, y = build_training_data(word_data_encoded, vocab_size, skip, seq_len)
print(X.shape, y.shape)

(19716, 20) (19716, 20, 3206)


In [50]:
def LSTM_model2(seq_len=20):
    """Build the word-level model: Embedding -> LSTM -> softmax at every time step.

    Parameters
    ----------
    seq_len : int, default 20
        Number of word ids in each input sequence.

    Returns
    -------
    keras.models.Sequential
        Uncompiled model taking integer word ids and producing, for every
        position, a softmax distribution over ``vocab_size`` words. Reads the
        notebook-level ``vocab_size``.
    """
    model = Sequential()
    # 200-dimensional embedding per word id; this first layer fixes the input length.
    model.add(Embedding(vocab_size, 200, input_length=seq_len))
    # No input_shape on this layer: its input comes from the Embedding above, and
    # taking the shape from the global X would tie the model to that array
    # instead of to the seq_len argument.
    model.add(LSTM(200, return_sequences=True))
    # return_sequences=True keeps one output per time step, so the Dense softmax
    # is applied at every position.
    model.add(Dense(vocab_size, activation='softmax'))
    return model

In [52]:
# Build the model with the default sequence length, compile it with categorical
# cross-entropy (the targets are one-hot) and the Adam optimizer, then print the
# layer summary.
embedding_model = LSTM_model2()
embedding_model.compile(loss='categorical_crossentropy', optimizer='adam')
embedding_model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 20, 200)           641200    
_________________________________________________________________
lstm_7 (LSTM)                (None, 20, 200)           320800    
_________________________________________________________________
dense_6 (Dense)              (None, 20, 3206)          644406    
Total params: 1,606,406
Trainable params: 1,606,406
Non-trainable params: 0
_________________________________________________________________


In [104]:
# Train on all windows for 40 epochs in mini-batches of 32 sequences.
embedding_model.fit(X, y, batch_size=32, epochs=40)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.callbacks.History at 0x2266141dc08>

In [128]:
# Save the trained model to disk so it does not have to be retrained
# (presumably to be reloaded later with the imported load_model).
embedding_model.save('wordLSTM.h5')

In [106]:
def sample_from_softmax(prediction, temp=1.0):
    """Draw one index from a probability vector after temperature rescaling.

    The probabilities are re-weighted as ``p_i ** (1 / temp)`` and renormalized:
    ``temp < 1`` sharpens the distribution towards its largest entry,
    ``temp > 1`` flattens it, and ``temp == 1`` leaves it unchanged.

    Parameters
    ----------
    prediction : array-like of float
        Probabilities (for example one softmax output row).
    temp : float, default 1.0
        Sampling temperature.

    Returns
    -------
    int
        Index of the sampled entry (a NumPy integer).
    """
    prediction = np.asarray(prediction).astype('float64')
    with np.errstate(divide='ignore'):
        # log(0) = -inf is intended here: zero-probability entries stay at zero
        # after exponentiation, so the divide-by-zero warning is just noise.
        scaled = np.log(prediction) / temp
    # Shift so the largest term becomes exp(0) = 1. The normalized result is
    # mathematically unchanged, but at low temperatures this prevents every term
    # from underflowing to 0, which would turn the probabilities into 0/0 = NaN.
    scaled = scaled - np.max(scaled)
    weights = np.exp(scaled)
    p = weights / np.sum(weights)
    # One multinomial draw yields a one-hot row; argmax recovers the drawn index.
    return np.argmax(np.random.multinomial(1, p, 1))

In [107]:
def decode_sequence(sequence, word_list):
    """Turn a sequence of word ids back into text.

    Every word is followed by a single space, except the word with id 0 (the
    newline token in this notebook's vocabulary), which is emitted without one.

    Parameters
    ----------
    sequence : iterable of int
        Word ids to decode.
    word_list : sequence of str
        Vocabulary indexed by word id.

    Returns
    -------
    str
        The decoded text.
    """
    pieces = (
        word_list[token_id] + ("" if token_id == 0 else " ")
        for token_id in sequence
    )
    return "".join(pieces)

In [108]:
def generate_sequence(num_words, temp=1):
    """Generate ``num_words`` word ids by repeatedly sampling the next word.

    Starts from the notebook-level ``seed`` window. At each step the model's
    prediction for the last position is sampled with ``sample_from_softmax``,
    and the window is shifted left by one with the new word appended.

    Parameters
    ----------
    num_words : int
        Number of words to generate.
    temp : float, default 1
        Sampling temperature passed to ``sample_from_softmax``.

    Returns
    -------
    list
        The generated word ids, in order (the seed is not included).
    """
    generated = []
    # Work on a copy: the window is shifted in place below, and aliasing the
    # notebook-level `seed` would overwrite it, so a second call would start
    # from the tail of the previous generation instead of the original seed.
    sequence = np.array(seed)
    for _ in range(num_words):
        # The model expects a batch dimension: shape (1, window length).
        x_pred = np.zeros((1, sequence.shape[0]))
        x_pred[0] = sequence
        prediction = embedding_model.predict(x_pred)
        # prediction[0][-1] is the distribution for the word after the window.
        next_word_id = sample_from_softmax(prediction[0][-1], temp)
        generated.append(next_word_id)
        # Slide the window: drop the oldest word, append the new one.
        sequence[:-1] = sequence[1:]
        sequence[-1] = next_word_id
    return generated

In [109]:
# Seed text for generation. Splitting on single spaces keeps each "\n" as its own
# token, giving 20 tokens in total, the same length as the training windows.
init = "shall i compare thee to a summer's day \n thou art more lovely and more temperate \n rough winds do"
initwords = init.split(" ")
seed = np.array(encode_data(initwords, word_list2))

In [126]:
# Sample 150 words; temp=2 flattens the predicted distribution, so less likely
# words are picked more often than at temp=1.
generated = generate_sequence(150, temp=2)

  This is separate from the ipykernel package so we can avoid doing imports until


In [127]:
# Show the seed text followed by the decoded generated words.
print(init + " " + decode_sequence(generated, word_list2))

shall i compare thee to a summer's day 
 thou art more lovely and more temperate 
 rough winds do a story of his spring 
for it depends upon that love doth part 
and summer's lease hath all too short a date 
sometime too hot the self who eyes have been on every vulgar thief 
thee have i not any thing sinful then striving to mend 
to mar the subject that before was any dear prepare her 
made old offences of affections new 
most true worse say thy beauty's form upon desired change 
o hear and dove it will 
so him i lose through my woeful age 
by this be error and upon thine own desire is admitted when i since others voices that my adder's sense 
to critic and to flatterer stopped are seen 
and heavily in eyes 
showing their garments though new-fangled no 
for at a death hath do that harvest reap 
at 
