In [18]:
from __future__ import print_function
from keras.callbacks import LambdaCallback, ModelCheckpoint, EarlyStopping
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, LSTM, Bidirectional, Embedding
import numpy as np
import sys
import io
import os

import argparse
import re
from keras.models import load_model

from multiplicative_lstm import MultiplicativeLSTM

import matplotlib.pyplot as plt

# Parameters: change to experiment different configurations
SEQUENCE_LEN = 15
MIN_WORD_FREQUENCY = 500
STEP = 1
BATCH_SIZE = 32

In [19]:
def shuffle_and_split_training_set(sentences_original, next_original, percentage_test=2):
    # shuffle at unison
    print('Shuffling sentences')

    tmp_sentences = []
    tmp_next_word = []
    for i in np.random.permutation(len(sentences_original)):
        tmp_sentences.append(sentences_original[i])
        tmp_next_word.append(next_original[i])

    cut_index = int(len(sentences_original) * (1.-(percentage_test/100.)))
    x_train, x_test = tmp_sentences[:cut_index], tmp_sentences[cut_index:]
    y_train, y_test = tmp_next_word[:cut_index], tmp_next_word[cut_index:]

    print("Size of training set = %d" % len(x_train))
    print("Size of test set = %d" % len(y_test))
    return (x_train, y_train), (x_test, y_test)



# Data generator for fit and evaluate
def generator(sentence_list, next_word_list, batch_size):
    index = 0
    while True:
        x = np.zeros((batch_size, SEQUENCE_LEN), dtype=np.int32)
        y = np.zeros((batch_size), dtype=np.int32)
        for i in range(batch_size):
            for t, w in enumerate(sentence_list[index % len(sentence_list)]):
                x[i, t] = word_indices[w]
            y[i] = word_indices[next_word_list[index % len(sentence_list)]]
            index = index + 1
        yield x, y


def get_model(dropout=0.2):
    print('Build model...')
    model = Sequential()
    model.add(Embedding(input_dim=len(words), output_dim=1024))
    model.add(MultiplicativeLSTM(128))
    if dropout > 0:
        model.add(Dropout(dropout))
#     model.add(Bidirectional(MultiplicativeLSTM(128)))
#     if dropout > 0:
#         model.add(Dropout(dropout))
#     model.add(Bidirectional(MultiplicativeLSTM(128)))
#     if dropout > 0:
#         model.add(Dropout(dropout))
    model.add(Dense(len(words)))
    model.add(Activation('softmax'))
    return model


# Functions from keras-team/keras/blob/master/examples/lstm_text_generation.py
def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    examples_file.write('\n----- Generating text after Epoch: %d\n' % epoch)

    # Randomly pick a seed sequence
    seed_index = np.random.randint(len(sentences+sentences_test))
    seed = (sentences+sentences_test)[seed_index]

    for diversity in [0.3, 0.4, 0.5, 0.6, 0.7]:
        sentence = seed
        examples_file.write('----- Diversity:' + str(diversity) + '\n')
        examples_file.write('----- Generating with seed:\n"' + ' '.join(sentence) + '"\n')
        examples_file.write(' '.join(sentence))

        for i in range(150):
            x_pred = np.zeros((1, SEQUENCE_LEN))
            for t, word in enumerate(sentence):
                x_pred[0, t] = word_indices[word]

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_word = indices_word[next_index]

            sentence = sentence[1:]
            sentence.append(next_word)

            examples_file.write(" "+next_word)
        examples_file.write('\n')
    examples_file.write('='*80 + '\n')
    examples_file.flush()

In [20]:
if not os.path.isdir('./checkpoints/'):
    os.makedirs('./checkpoints/')

In [11]:
text = ""
for corpus in os.listdir(os.getcwd()+"/aclImdb/train/pos"):
    try:
        with io.open(os.getcwd()+"/aclImdb/train/pos/"+corpus, encoding='utf-8') as f:
            text += re.sub(r'[^a-zA-Z0-9., ]', '', f.read().lower().replace('<br />', ' ').replace('.', ' . ').replace(',', ' , '))
    except:
        print("File: "+os.getcwd()+"/aclImdb/train/pos/"+corpus+" not found.")

for corpus in os.listdir(os.getcwd()+"/aclImdb/test/pos"):
    try:
        with io.open(os.getcwd()+"/aclImdb/test/pos/"+corpus, encoding='utf-8') as f:
            text += re.sub(r'[^a-zA-Z0-9., ]', '', f.read().lower().replace('<br />', ' ').replace('.', ' . ').replace(',', ' , '))
    except:
        print("File: "+os.getcwd()+"/aclImdb/test/pos/"+corpus+" not found.")

with open('pos.txt','w') as f:
    print(text, file=f)

In [12]:
text = ""
for corpus in os.listdir(os.getcwd()+"/aclImdb/train/neg"):
    try:
        with io.open(os.getcwd()+"/aclImdb/train/neg/"+corpus, encoding='utf-8') as f:
            text += re.sub(r'[^a-zA-Z0-9., ]', '', f.read().lower().replace('<br />', ' ').replace('.', ' . ').replace(',', ' , '))
    except:
        print("File: "+os.getcwd()+"/aclImdb/train/neg/"+corpus+" not found.")

for corpus in os.listdir(os.getcwd()+"/aclImdb/test/neg"):
    try:
        with io.open(os.getcwd()+"/aclImdb/test/neg/"+corpus, encoding='utf-8') as f:
            text += re.sub(r'[^a-zA-Z0-9., ]', '', f.read().lower().replace('<br />', ' ').replace('.', ' . ').replace(',', ' , '))
    except:
        print("File: "+os.getcwd()+"/aclImdb/test/neg/"+corpus+" not found.")

with open('neg.txt','w') as f:
    print(text, file=f)

In [None]:
# Positive training
examples = "results_pos.txt"
vocabulary = "vocabulary_pos.txt"


text = ""
with open('pos.txt','r') as f:
    text = f.read()


print('Corpus length in characters:', len(text))

text_in_words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']
print('Corpus length in words:', len(text_in_words))

# Calculate word frequency
word_freq = {}
for word in text_in_words:
    word_freq[word] = word_freq.get(word, 0) + 1

ignored_words = set()
for k, v in word_freq.items():
    if word_freq[k] < MIN_WORD_FREQUENCY:
        ignored_words.add(k)

words = set(text_in_words)
print('Unique words before ignoring:', len(words))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
words = sorted(set(words) - ignored_words)
print('Unique words after ignoring:', len(words))

with open(vocabulary, 'w') as f:
    for item in words:
        f.write("%s\n" % item)

word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

# cut the text in semi-redundant sequences of SEQUENCE_LEN words
sentences = []
next_words = []
ignored = 0
for i in range(0, len(text_in_words) - SEQUENCE_LEN, STEP):
    # Only add the sequences where no word is in ignored_words
    if len(set(text_in_words[i: i+SEQUENCE_LEN+1]).intersection(ignored_words)) == 0:
        sentences.append(text_in_words[i: i + SEQUENCE_LEN])
        next_words.append(text_in_words[i + SEQUENCE_LEN])
    else:
        ignored = ignored + 1
print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences))

# x, y, x_test, y_test
(sentences, next_words), (sentences_test, next_words_test) = shuffle_and_split_training_set(sentences, next_words, 10)

model = get_model()
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

file_path="checkpoints/MLSTM-{epoch:02d}-{loss:.4f}.hdf5"

checkpoint = ModelCheckpoint(file_path, monitor='val_acc', save_best_only=True)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_acc', patience=20)
callbacks_list = [checkpoint, print_callback, early_stopping]

examples_file = open(examples, "w")
history = model.fit_generator(generator(sentences, next_words, BATCH_SIZE),
                    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
                    epochs=100,
                    callbacks=callbacks_list,
                    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE),
                    validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1)


Corpus length in characters: 33425456
Corpus length in words: 6353168
Unique words before ignoring: 106044
Ignoring words with frequency < 500
Unique words after ignoring: 1133
Ignored sequences: 6045424
Remaining sequences: 307729
Shuffling sentences
Size of training set = 276956
Size of test set = 30773
Build model...
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
 536/8655 [>.............................] - ETA: 4:16 - loss: 3.8871 - acc: 0.2298

In [1]:
# list all data in history
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

NameError: name 'history' is not defined

In [None]:
train_dir = "data/train/neg"
examples = "results_neg.txt"
vocabulary = "vocabulary_neg.txt"

if not os.path.isdir('./checkpoints/'):
    os.makedirs('./checkpoints/')

text = ""
for corpus in os.listdir(os.getcwd()):
    with io.open(corpus, encoding='utf-8') as f:
        text.append(f.read().lower().replace('<br />', ' ').replace('\n', ' ').replace(u"\u201c", " \" ").replace(u"\u201d", " \" ").replace(u"\u2018", " ' ").replace(u"\u2019", " ' ").replace("--", "-").replace("-", " - ").replace("?", " ? ").replace(".", " . ").replace(",", " , ").replace(";", " ; ").replace("(", " ( ").replace(")", " ) ").replace("_", " ").replace("[", " [ ").replace("]", " ] ").replace(":", " : ").replace("!", " ! "))
#     text = re.sub(r'[^a-zA-Z0-9' ]', '', text)
print('Corpus length in characters:', len(text))

text_in_words = [w for w in text.split(' ') if w.strip() != '' or w == '\n']
print('Corpus length in words:', len(text_in_words))

# Calculate word frequency
word_freq = {}
for word in text_in_words:
    word_freq[word] = word_freq.get(word, 0) + 1

ignored_words = set()
for k, v in word_freq.items():
    if word_freq[k] < MIN_WORD_FREQUENCY:
        ignored_words.add(k)

words = set(text_in_words)
print('Unique words before ignoring:', len(words))
print('Ignoring words with frequency <', MIN_WORD_FREQUENCY)
words = sorted(set(words) - ignored_words)
print('Unique words after ignoring:', len(words))

with open(vocabulary, 'w') as f:
    for item in words:
        f.write("%s\n" % item)

word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

# cut the text in semi-redundant sequences of SEQUENCE_LEN words
sentences = []
next_words = []
ignored = 0
for i in range(0, len(text_in_words) - SEQUENCE_LEN, STEP):
    # Only add the sequences where no word is in ignored_words
    if len(set(text_in_words[i: i+SEQUENCE_LEN+1]).intersection(ignored_words)) == 0:
        sentences.append(text_in_words[i: i + SEQUENCE_LEN])
        next_words.append(text_in_words[i + SEQUENCE_LEN])
    else:
        ignored = ignored + 1
print('Ignored sequences:', ignored)
print('Remaining sequences:', len(sentences))

# x, y, x_test, y_test
(sentences, next_words), (sentences_test, next_words_test) = shuffle_and_split_training_set(
    sentences, next_words
)

model = get_model()
model.compile(loss='sparse_categorical_crossentropy', optimizer="adam", metrics=['accuracy'])

# file_path = "./checkpoints/LSTM_LYRICS-epoch{epoch:03d}-words%d-sequence%d-minfreq%d-" \
#             "loss{loss:.4f}-acc{acc:.4f}-val_loss{val_loss:.4f}-val_acc{val_acc:.4f}" % \

file_path="checkpoints/MLSTM-{epoch:02d}-{loss:.4f}-bigger.hdf5"

checkpoint = ModelCheckpoint(file_path, monitor='val_acc', save_best_only=True)
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)
early_stopping = EarlyStopping(monitor='val_acc', patience=20)
callbacks_list = [checkpoint, print_callback, early_stopping]

examples_file = open(examples, "w")
history = model.fit_generator(generator(sentences, next_words, BATCH_SIZE),
                    steps_per_epoch=int(len(sentences)/BATCH_SIZE) + 1,
                    epochs=100,
                    callbacks=callbacks_list,
                    validation_data=generator(sentences_test, next_words_test, BATCH_SIZE),
                    validation_steps=int(len(sentences_test)/BATCH_SIZE) + 1)

