In [1]:
import numpy as np
import keras

Using TensorFlow backend.


In [2]:
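# References:
# - TensorFlow seq2seq tutorial: https://www.tensorflow.org/tutorials/seq2seq
# - Keras blog, "A ten-minute introduction to sequence-to-sequence learning in Keras":
#   https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html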

In [3]:
from gensim.corpora import Dictionary
from sklearn.preprocessing import OneHotEncoder

class SentenceToCharVecEncoder:
    """One-hot encode sentences character by character.

    Every character is looked up in ``dictionary.token2id`` and its integer
    id is one-hot encoded, so a sentence of ``n`` characters becomes a matrix
    with ``n`` rows (one per character).
    """

    def __init__(self, dictionary):
        """Fit the one-hot encoder on the ids ``0 .. len(dictionary) - 1``.

        :param dictionary: object supporting ``len()`` and exposing a
            ``token2id`` mapping from character to integer id (in this
            notebook, a ``gensim.corpora.Dictionary`` built over characters).
        """
        self.dictionary = dictionary
        numchars = len(self.dictionary)
        self.onehot_encoder = OneHotEncoder()
        # Fit on a single column holding every id once, so the encoder knows
        # all ids up front regardless of which ones a given sentence uses.
        self.onehot_encoder.fit(np.arange(numchars).reshape((numchars, 1)))

    def encode_sentence(self, sent):
        """One-hot encode a single sentence.

        :param sent: sequence of characters (e.g. a string).
        :return: whatever ``onehot_encoder.transform`` returns for the column
            of character ids (a sparse matrix in this notebook).
        :raises KeyError: if a character is missing from
            ``dictionary.token2id``.
        """
        char_ids = [self.dictionary.token2id[c] for c in sent]
        # transform() expects a 2-D column: one row per character.
        return self.onehot_encoder.transform(
            np.array(char_ids).reshape((len(sent), 1))
        )

    def encode_sentences(self, sentences, sparse=True):
        """One-hot encode several sentences.

        Always returns a real ``list`` (not a lazy ``map`` object) so callers
        can take ``len()`` of the result and slice it on any Python version.

        :param sentences: iterable of sentences (see ``encode_sentence``).
        :param sparse: if True (default) keep each encoded sentence as
            returned by ``encode_sentence``; if False call ``.toarray()`` on
            each to obtain a dense array.
        :return: list with one encoded matrix per sentence, in input order.
        """
        if sparse:
            return [self.encode_sentence(sent) for sent in sentences]
        return [self.encode_sentence(sent).toarray() for sent in sentences]
    
def initSentenceToCharVecEncoder(textfile):
    """Build a SentenceToCharVecEncoder from an iterable of text lines.

    Each line is stripped of surrounding whitespace, empty lines are dropped,
    and every remaining line is split into its characters; the resulting
    character lists are the documents from which the Dictionary is built.

    :param textfile: iterable yielding lines of text (file object, list, ...).
    :return: SentenceToCharVecEncoder wrapping the character Dictionary.
    """
    stripped_lines = (raw_line.strip() for raw_line in textfile)
    char_documents = [list(line) for line in stripped_lines if len(line) > 0]
    return SentenceToCharVecEncoder(Dictionary(char_documents))

In [4]:
import urllib2

# Download the corpus and build the character-level encoder from it.
# urlopen's second positional argument is the request body (`data`), not a
# file mode: passing 'r' turned the request into a POST, so it is omitted.
corpus_response = urllib2.urlopen('http://norvig.com/big.txt')
try:
    sent_encoder = initSentenceToCharVecEncoder(corpus_response)
finally:
    # Release the HTTP connection once the lines have been consumed.
    corpus_response.close()

In [5]:
# Sanity check: a single sentence encodes to one sparse matrix with one row
# per character.
sent_encoder.encode_sentence('abAtrE.')

<7x92 sparse matrix of type '<type 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [6]:
# Encode several sentences at once; with the default sparse=True each
# sentence yields its own sparse matrix.
sent_encoder.encode_sentences(['I love you.', 'seq2seq model in Keras', 'language model'])

[<11x92 sparse matrix of type '<type 'numpy.float64'>'
 	with 11 stored elements in Compressed Sparse Row format>,
 <22x92 sparse matrix of type '<type 'numpy.float64'>'
 	with 22 stored elements in Compressed Sparse Row format>,
 <14x92 sparse matrix of type '<type 'numpy.float64'>'
 	with 14 stored elements in Compressed Sparse Row format>]

In [9]:
# Same sentences with sparse=False: each encoded matrix is converted with
# .toarray() before being returned.
sent_encoder.encode_sentences(['I love you.', 'seq2seq model in Keras', 'language model'], sparse=False)

[array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 1.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]]),
 array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        ..., 
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.],
        [ 0.,  0.,  0., ...,  0.,  0.,  0.]])]

In [10]:
# Number of entries in the character dictionary; used below as the width of
# the model's input and output vectors.
numchars = len(sent_encoder.dictionary)
# Size of the LSTM state used by the model below: 20 more than numchars.
latent_dim = numchars + 20

# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the call style used in the later cells.
print(numchars)
print(latent_dim)

92
112


In [11]:
# Download the corpus again and keep its non-empty, stripped lines.
# urlopen's second positional argument is the request body (`data`), not a
# file mode: passing 'r' turned the request into a POST, so it is omitted.
textfile = urllib2.urlopen('http://norvig.com/big.txt')
try:
    # Build a real list: `text` is sliced (text[:-1], text[1:]) further down.
    text = [t for t in (raw.strip() for raw in textfile) if len(t) > 0]
finally:
    # Release the HTTP connection once all lines have been read.
    textfile.close()

In [10]:
from keras.models import Model
from keras.layers import Input, LSTM, Dense

# Build the training-time encoder/decoder (seq2seq) model.
# Relies on `numchars` and `latent_dim` defined in an earlier cell.

# Define an input sequence and process it.
# Each timestep is a vector of width `numchars`; the sequence-length
# dimension is left as None.
encoder_inputs = Input(shape=(None, numchars))
# return_state=True makes the layer return its two state tensors in
# addition to its output (unpacked on the next line).
encoder = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_inputs)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

# Set up the decoder, using `encoder_states` as initial state.
decoder_inputs = Input(shape=(None, numchars))
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_inputs,
                                     initial_state=encoder_states)
# Project the decoder's output back to `numchars` units with a softmax
# activation.
decoder_dense = Dense(numchars, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [14]:
# Prepare the training data: one-hot encode the corpus lines for the encoder and the decoder.

In [17]:
# Build the encoder used for the training data from the stripped, non-empty
# corpus lines already held in `text`.
chartovec_encoder = initSentenceToCharVecEncoder(text)

In [19]:
# Pair every line with the line that follows it: text[:-1] feeds the encoder
# and text[1:] (the same lines shifted by one) feeds the decoder.
encoder_input = chartovec_encoder.encode_sentences(text[:-1])
# NOTE(review): decoder_input and decoder_output are built from the same
# slice, so their contents are identical. The Keras seq2seq tutorial linked
# at the top of this notebook offsets the decoder target by one timestep
# relative to the decoder input — confirm whether that shift is intended here.
decoder_input = chartovec_encoder.encode_sentences(text[1:])
decoder_output = chartovec_encoder.encode_sentences(text[1:])

In [24]:
# Number of encoded encoder sentences, then the shapes of the first 100.
print(len(encoder_input))
print(list(e.shape for e in encoder_input[:100]))

103500
[(64, 92), (25, 92), (45, 92), (68, 92), (68, 92), (42, 92), (68, 92), (68, 92), (34, 92), (68, 92), (68, 92), (68, 92), (68, 92), (55, 92), (63, 92), (63, 92), (64, 92), (40, 92), (30, 92), (40, 92), (42, 92), (11, 92), (17, 92), (29, 92), (79, 92), (37, 92), (17, 92), (15, 92), (2, 92), (22, 92), (8, 92), (23, 92), (25, 92), (23, 92), (31, 92), (23, 92), (32, 92), (40, 92), (40, 92), (41, 92), (38, 92), (38, 92), (40, 92), (35, 92), (2, 92), (1146, 92), (1300, 92), (978, 92), (337, 92), (114, 92), (20, 92), (175, 92), (24, 92), (150, 92), (394, 92), (67, 92), (820, 92), (383, 92), (241, 92), (13, 92), (12, 92), (31, 92), (26, 92), (25, 92), (489, 92), (62, 92), (492, 92), (76, 92), (217, 92), (74, 92), (208, 92), (109, 92), (124, 92), (41, 92), (59, 92), (649, 92), (40, 92), (491, 92), (146, 92), (228, 92), (39, 92), (138, 92), (19, 92), (137, 92), (161, 92), (23, 92), (1126, 92), (193, 92), (170, 92), (270, 92), (183, 92), (298, 92), (25, 92), (8, 92), (232, 92), (39, 92), (2

In [25]:
# Number of encoded decoder-input sentences, then the shapes of the first 100.
print(len(decoder_input))
print(list(e.shape for e in decoder_input[:100]))

103500
[(25, 92), (45, 92), (68, 92), (68, 92), (42, 92), (68, 92), (68, 92), (34, 92), (68, 92), (68, 92), (68, 92), (68, 92), (55, 92), (63, 92), (63, 92), (64, 92), (40, 92), (30, 92), (40, 92), (42, 92), (11, 92), (17, 92), (29, 92), (79, 92), (37, 92), (17, 92), (15, 92), (2, 92), (22, 92), (8, 92), (23, 92), (25, 92), (23, 92), (31, 92), (23, 92), (32, 92), (40, 92), (40, 92), (41, 92), (38, 92), (38, 92), (40, 92), (35, 92), (2, 92), (1146, 92), (1300, 92), (978, 92), (337, 92), (114, 92), (20, 92), (175, 92), (24, 92), (150, 92), (394, 92), (67, 92), (820, 92), (383, 92), (241, 92), (13, 92), (12, 92), (31, 92), (26, 92), (25, 92), (489, 92), (62, 92), (492, 92), (76, 92), (217, 92), (74, 92), (208, 92), (109, 92), (124, 92), (41, 92), (59, 92), (649, 92), (40, 92), (491, 92), (146, 92), (228, 92), (39, 92), (138, 92), (19, 92), (137, 92), (161, 92), (23, 92), (1126, 92), (193, 92), (170, 92), (270, 92), (183, 92), (298, 92), (25, 92), (8, 92), (232, 92), (39, 92), (293, 92), (

In [26]:
# Number of encoded decoder-output sentences, then the shapes of the first 100.
print(len(decoder_output))
print(list(e.shape for e in decoder_output[:100]))

103500
[(25, 92), (45, 92), (68, 92), (68, 92), (42, 92), (68, 92), (68, 92), (34, 92), (68, 92), (68, 92), (68, 92), (68, 92), (55, 92), (63, 92), (63, 92), (64, 92), (40, 92), (30, 92), (40, 92), (42, 92), (11, 92), (17, 92), (29, 92), (79, 92), (37, 92), (17, 92), (15, 92), (2, 92), (22, 92), (8, 92), (23, 92), (25, 92), (23, 92), (31, 92), (23, 92), (32, 92), (40, 92), (40, 92), (41, 92), (38, 92), (38, 92), (40, 92), (35, 92), (2, 92), (1146, 92), (1300, 92), (978, 92), (337, 92), (114, 92), (20, 92), (175, 92), (24, 92), (150, 92), (394, 92), (67, 92), (820, 92), (383, 92), (241, 92), (13, 92), (12, 92), (31, 92), (26, 92), (25, 92), (489, 92), (62, 92), (492, 92), (76, 92), (217, 92), (74, 92), (208, 92), (109, 92), (124, 92), (41, 92), (59, 92), (649, 92), (40, 92), (491, 92), (146, 92), (228, 92), (39, 92), (138, 92), (19, 92), (137, 92), (161, 92), (23, 92), (1126, 92), (193, 92), (170, 92), (270, 92), (183, 92), (298, 92), (25, 92), (8, 92), (232, 92), (39, 92), (293, 92), (