In [3]:
import keras
import numpy
from nltk.corpus import inaugural
from nltk.tokenize.moses import MosesDetokenizer

Using TensorFlow backend.


In [4]:
# File ids of the documents in NLTK's inaugural corpus; each id is later passed
# to inaugural.sents() to load one address.
fileids = inaugural.fileids()

In [5]:
# Detokenizer used below to turn each tokenized sentence back into a string.
# NOTE(review): newer NLTK releases appear to no longer ship
# nltk.tokenize.moses (the Moses tools live in the separate `sacremoses`
# package) -- confirm which NLTK version this notebook is pinned to.
detokenizer = MosesDetokenizer()

In [6]:
# Rebuild every address as a single string: detokenize each sentence of the
# file, then join the sentences with single spaces.
addresses = [
    ' '.join(
        detokenizer.detokenize(sentence, return_str=True)
        for sentence in inaugural.sents(fileid)
    )
    for fileid in fileids
]

In [7]:
# Sanity check: one rebuilt string per corpus file id.
len(addresses)

56

In [8]:
# Display the corpus file ids (named '<year>-<president>.txt').
fileids

[u'1789-Washington.txt',
 u'1793-Washington.txt',
 u'1797-Adams.txt',
 u'1801-Jefferson.txt',
 u'1805-Jefferson.txt',
 u'1809-Madison.txt',
 u'1813-Madison.txt',
 u'1817-Monroe.txt',
 u'1821-Monroe.txt',
 u'1825-Adams.txt',
 u'1829-Jackson.txt',
 u'1833-Jackson.txt',
 u'1837-VanBuren.txt',
 u'1841-Harrison.txt',
 u'1845-Polk.txt',
 u'1849-Taylor.txt',
 u'1853-Pierce.txt',
 u'1857-Buchanan.txt',
 u'1861-Lincoln.txt',
 u'1865-Lincoln.txt',
 u'1869-Grant.txt',
 u'1873-Grant.txt',
 u'1877-Hayes.txt',
 u'1881-Garfield.txt',
 u'1885-Cleveland.txt',
 u'1889-Harrison.txt',
 u'1893-Cleveland.txt',
 u'1897-McKinley.txt',
 u'1901-McKinley.txt',
 u'1905-Roosevelt.txt',
 u'1909-Taft.txt',
 u'1913-Wilson.txt',
 u'1917-Wilson.txt',
 u'1921-Harding.txt',
 u'1925-Coolidge.txt',
 u'1929-Hoover.txt',
 u'1933-Roosevelt.txt',
 u'1937-Roosevelt.txt',
 u'1941-Roosevelt.txt',
 u'1945-Roosevelt.txt',
 u'1949-Truman.txt',
 u'1953-Eisenhower.txt',
 u'1957-Eisenhower.txt',
 u'1961-Kennedy.txt',
 u'1965-Johnson.tx

In [9]:
# The seq2seq layers come from keras-seq2seq, which must be fetched from: https://github.com/bstriner/keras-seq2seq
import kerasseq2seq

In [10]:
import itertools

def get_charset(words):
    """
    Collect the distinct characters occurring in a collection of strings
    :param words: iterable of strings
    :return: sorted list of the unique characters, and a dictionary mapping
             each character to its position in that list
    """
    alphabet = sorted(set(itertools.chain.from_iterable(words)))
    index_of = dict((ch, pos) for pos, ch in enumerate(alphabet))
    return alphabet, index_of

In [28]:
def map_word(word, charmap):
    """
    Encode a string as the list of its characters' indexes
    :param word: string to encode
    :param charmap: dictionary from characters to indexes
    :return: list with one index per character of word, in order
    """
    indexes = []
    for ch in word:
        # A character missing from charmap raises KeyError.
        indexes.append(charmap[ch])
    return indexes

def map_words(words, charmap):
    """
    Encode every string in a collection with map_word
    :param words: iterable of strings
    :param charmap: dictionary from characters to indexes
    :return: list of index lists, one per input string, in input order
    """
    encoded = []
    for word in words:
        encoded.append(map_word(word, charmap))
    return encoded

In [18]:
def clean_word(word):
    """
    Lowercase a string and drop its non-ASCII characters
    :param word: string to clean
    :return: lowercased string containing only characters with code point < 128
    """
    kept = []
    # Lowercase first, then filter: the filter is applied to the lowercased text.
    for ch in word.lower():
        if ord(ch) < 128:
            kept.append(ch)
    return "".join(kept)


def clean_words(words):
    """
    Clean every word with clean_word and remove words < 3 characters
    (the length is measured after cleaning)
    :param words: iterable of strings
    :return: list of cleaned strings, in input order
    """
    # Clean each word exactly once; the filter and the result share that value
    # instead of calling clean_word twice per element.
    cleaned = (clean_word(w) for w in words)
    return [w for w in cleaned if len(w) >= 3]

In [16]:
from keras.callbacks import LambdaCallback
from keras.layers import Input, Lambda
from keras.optimizers import Adam
from keras.models import Model
from kerasseq2seq.s2s_loss import s2sloss
from kerasseq2seq.s2s_layer import S2SLayer
from kerasseq2seq.s2s_data import process_sequences, process_test_sequences, one_hot_2d

In [29]:
# NOTE(review): the saved output of this cell is
# `TypeError: can only concatenate list (not "tuple") to list`; the traceback
# was not preserved, so the failing statement is unknown from the notebook.

# Hyperparameters
hidden_dim = 512
batch_size = 128
steps_per_epoch = 512
epochs = 1000
# learning rate handed to Adam below
lr = 1e-3

# Load and clean data
# NOTE(review): `addresses` holds one joined string per inaugural address, so
# each "word" here is an entire address and max_word is the length of the
# longest cleaned address -- confirm this is intended rather than splitting
# the text into individual words first.
words = clean_words(addresses)
charset, charmap = get_charset(words)
# number of distinct characters in the cleaned text
x_k = len(charset)
vectors = map_words(words, charmap)
max_word = max(len(w) for w in vectors)
# only consumed by on_epoch_end below; its meaning there is not visible here
depth = max_word * 2 + 4

# Create model
x = Input((None, 3), dtype='float32')
s2s = S2SLayer(x_k, hidden_dim, stochastic=True)
# output of layer is softmax and prediction concatenated; slice the output
y = s2s(x)
# every channel of y except the last one
ysoftmax = Lambda(lambda z: z[:, :, :-1], output_shape=lambda z: (z[0], z[1], z[2] - 1))(y)
# only the last channel of y
# NOTE(review): z[:, :, -1] drops the last axis, while output_shape declares a
# trailing dimension of 1 -- confirm which is intended (z[:, :, -1:] would
# keep the axis).
ypred = Lambda(lambda z: z[:, :, -1], output_shape=lambda z: (z[0], z[1], 1))(y)
# model for training
m = Model(inputs=[x], outputs=[ysoftmax])
m.summary()
# the loss is built from channel 1 of the input tensor
m.compile(Adam(lr), s2sloss(x[:, :, 1]))
# model for testing
mtest = Model(inputs=[x], outputs=[ypred])
# callback to print results
# NOTE(review): on_epoch_end and bigram_generator are not defined or imported
# in any cell visible in this notebook -- confirm where they come from before
# relying on Restart & Run All.
cb = LambdaCallback(on_epoch_end=on_epoch_end(mtest, vectors, charset, depth))
# train model
m.fit_generator(bigram_generator(vectors, batch_size, x_k), callbacks=[cb],
                steps_per_epoch=steps_per_epoch, epochs=epochs,
                verbose=1)

TypeError: can only concatenate list (not "tuple") to list

In [26]:
# Quick check: an already-lowercase ASCII string comes back unchanged.
clean_word('dfd')

'dfd'