# Calculate with seq2seq model

Use neural networks to solve sequence-to-sequence prediction tasks. Seq2Seq models are very popular these days because they achieve great results in Machine Translation, Text Summarization, Conversational Modeling and more.

In [1]:
# Import libraries
import random
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.metrics import mean_absolute_error

  from ._conv import register_converters as _register_converters


In [2]:
# Generates pairs of equations and solutions to them
def generate_equations(allowed_operators, dataset_size, min_value, max_value):
    """Build random ``(equation, solution)`` string pairs.

    Each equation is ``<int><operator><int>`` where both integers are drawn
    uniformly from ``[min_value, max_value]`` and the operator is picked at
    random from ``allowed_operators``.

    Args:
        allowed_operators: indexable collection of operator strings, e.g. ['+', '-'].
        dataset_size: number of pairs to generate.
        min_value: smallest operand value (inclusive).
        max_value: largest operand value (inclusive).

    Returns:
        A list of ``dataset_size`` tuples ``(equation, solution)``, both strings.
    """
    def make_pair():
        # Draw order (left operand, operator, right operand) matters for
        # reproducibility under a fixed random seed.
        left = str(random.randint(min_value, max_value))
        operator = allowed_operators[random.randint(0, len(allowed_operators) - 1)]
        right = str(random.randint(min_value, max_value))
        expression = left + operator + right
        # eval only ever sees the expression assembled above from generated
        # integers and the caller-supplied operators.
        return expression, str(eval(expression))

    return [make_pair() for _ in range(dataset_size)]

In [3]:
# Check the correctness of your implementation
def test_generate_equations():
    """Sanity-check ``generate_equations`` on a small '+'/'-' sample.

    Returns:
        "Tests passed." when every generated pair consists of two strings and
        the solution equals the evaluated equation; otherwise a message
        describing the first failure.
    """
    pairs = generate_equations(['+', '-'], 10, 0, 100)
    for input_, output_ in pairs:
        if type(input_) is not str or type(output_) is not str:
            return "Both parts should be strings."
        expected_value = eval(input_)
        if expected_value != int(output_):
            message = "The (equation: {!r}, solution: {!r}) pair is incorrect."
            return message.format(input_, output_)
    return "Tests passed."

In [4]:
# Run the self-check for generate_equations and show its verdict.
print(test_generate_equations())

Tests passed.


In [5]:
# Generate the train and test data for the neural network
# Seed Python's RNG so that the generated equations (and the per-epoch shuffles
# done later with random.shuffle) are the same after Restart & Run All.
# train_test_split is already made deterministic through random_state.
random.seed(42)

allowed_operators = ['+', '-']
dataset_size = 100000
data = generate_equations(allowed_operators, dataset_size, min_value=0, max_value=9999)

# Hold out 20% of the (equation, solution) pairs for evaluation.
train_set, test_set = train_test_split(data, test_size=0.2, random_state=42)

## 1. Prepare data for the neural network

Generate the vocabulary by creating mappings of the characters to their indices in some vocabulary. Other strategies that you should consider are: data normalization (lowercasing, tokenization, how to consider punctuation marks), separate vocabulary for input and for output (e.g. for machine translation), some specifics of the task.

In [6]:
# Character-level vocabulary: every symbol that can occur in an equation or a
# solution (digits, '+', '-') plus the three special symbols '^', '$' and '#'.
# word2id maps symbol -> index (its position in the string below);
# id2word is the inverse mapping, index -> symbol.
# (For word-level tasks the vocabulary would instead be built from frequent words.)
word2id = {symbol:i for i, symbol in enumerate('^$#+-1234567890')}
id2word = {i:symbol for symbol, i in word2id.items()}

In [7]:
# Special symbols; all three are part of the vocabulary string used to build word2id.
start_symbol = '^' # beginning of decoding procedure 
end_symbol = '$' # end of a string, both input and output sequences
padding_symbol = '#' # padding character to make lengths of all strings equal in training batch

In [8]:
# Convert sentence to a list of vocabulary word indices 
def sentence_to_ids(sentence, word2id, padded_len, end_symbol='$', padding_symbol='#'):
    """Encode ``sentence`` as exactly ``padded_len`` vocabulary indices.

    The encoded sequence always contains the end symbol. A sentence shorter
    than ``padded_len`` gets the end symbol appended and is then filled with
    the padding symbol; a longer (or equally long) sentence is cut to
    ``padded_len`` symbols and its last kept symbol is replaced by the end
    symbol.

    Args:
        sentence: sequence of symbols (e.g. a string) to encode.
        word2id: mapping from symbol to index; must contain the end and
            padding symbols.
        padded_len: length of the returned id list; must be at least 1 so
            that there is room for the end symbol.
        end_symbol: symbol marking the end of a sequence (default '$').
        padding_symbol: symbol used to fill up short sequences (default '#').

    Returns:
        ``(ids, length)`` where ``ids`` has ``padded_len`` entries and
        ``length`` is the number of entries up to and including the end symbol.

    Raises:
        ValueError: if ``padded_len`` is smaller than 1.
        KeyError: if a symbol is missing from ``word2id``.
    """
    if padded_len < 1:
        # Without this guard a zero length fails with an opaque IndexError and
        # a negative one silently slices symbols off the end of the sentence.
        raise ValueError(
            "padded_len must be at least 1 to hold the end symbol, got {}".format(padded_len))

    end_id = word2id[end_symbol]
    sent_ids = [word2id[w] for w in sentence[:padded_len]]

    if padded_len > len(sentence):
        sent_ids.append(end_id)
        sent_ids.extend([word2id[padding_symbol]] * (padded_len - len(sent_ids)))
        return sent_ids, len(sentence) + 1

    # No room left: the last kept symbol gives way to the end symbol.
    sent_ids[-1] = end_id
    return sent_ids, padded_len

In [9]:
# Check implementation
def test_sentence_to_ids():
    """Check ``sentence_to_ids`` against three hand-computed cases.

    Covers truncation (padded_len == len), an exact fit including the end
    symbol, and padding. Returns "Tests passed." or a message describing the
    first mismatch.
    """
    # (sentence, padded_len, expected ids, expected actual length)
    cases = [
        ("123+123", 7, [5, 6, 7, 3, 5, 6, 1], 7),
        ("123+123", 8, [5, 6, 7, 3, 5, 6, 7, 1], 8),
        ("123+123", 10, [5, 6, 7, 3, 5, 6, 7, 1, 2, 2], 8),
    ]
    for sentence, padded_len, sentence_ids, expected_length in cases:
        output, length = sentence_to_ids(sentence, word2id, padded_len)
        if output != sentence_ids:
            template = "Convertion of '{}' for padded_len={} to {} is incorrect."
            return template.format(sentence, padded_len, output)
        if length != expected_length:
            template = "Convertion of '{}' for padded_len={} has incorrect actual length {}."
            return template.format(sentence, padded_len, length)
    return "Tests passed."

In [10]:
# Run the self-check for sentence_to_ids and show its verdict.
print(test_sentence_to_ids())

Tests passed.


In [11]:
# Define function to return padded indices to symbols
def ids_to_sentence(ids, id2word):
    """Map every index in ``ids`` back to its symbol.

    Args:
        ids: iterable of vocabulary indices (padding and special ids included).
        id2word: mapping from index to symbol.

    Returns:
        A list with one symbol per index, in the same order.
    """
    symbols = []
    for index in ids:
        symbols.append(id2word[index])
    return symbols

In [12]:
# Generate batches of indices
def batch_to_ids(sentences, word2id, max_len):
    """Encode a batch of sentences as equally long lists of vocabulary indices.

    Every sentence is encoded with ``sentence_to_ids`` using a common length:
    the longest sentence in the batch plus one slot for the end symbol, capped
    at ``max_len``.

    Args:
        sentences: iterable of sentences (strings); may be a one-shot iterator.
        word2id: mapping from symbol to index.
        max_len: upper bound for the encoded length of each sentence.

    Returns:
        ``(batch_ids, batch_ids_len)``: the encoded sentences and, for each of
        them, the length reported by ``sentence_to_ids``. Both lists are empty
        for an empty batch.
    """
    # Materialize first: the batch is scanned twice (longest length, then
    # encoding), which would exhaust a generator on the first pass.
    sentences = list(sentences)
    if not sentences:
        # max() over an empty batch would raise ValueError.
        return [], []

    max_len_in_batch = min(max(len(s) for s in sentences) + 1, max_len)
    batch_ids, batch_ids_len = [], []
    for sentence in sentences:
        ids, ids_len = sentence_to_ids(sentence, word2id, max_len_in_batch)
        batch_ids.append(ids)
        batch_ids_len.append(ids_len)
    return batch_ids, batch_ids_len

In [13]:
# Generate batches with set size
def generate_batches(samples, batch_size=64):
    """Yield ``(inputs, targets)`` batches from an iterable of pairs.

    Args:
        samples: iterable of ``(x, y)`` pairs.
        batch_size: number of pairs per batch; the final batch may be smaller.

    Yields:
        Tuples ``(X, Y)`` of two lists holding the first and second elements
        of the pairs in the batch, in input order.
    """
    inputs, targets = [], []
    seen = 0
    for source, target in samples:
        seen += 1
        inputs.append(source)
        targets.append(target)
        if seen % batch_size != 0:
            continue
        yield inputs, targets
        # Start fresh lists so batches already handed out are never mutated.
        inputs, targets = [], []
    # Hand out the incomplete last batch, if any.
    if inputs and targets:
        yield inputs, targets

In [14]:
# Demo of batch_to_ids on one training sample.
# Note: train_set[0] is a single (equation, solution) tuple, so the "batch"
# encoded here consists of the equation and its solution, padded to a common length.
sentences = train_set[0]
ids, sent_lens = batch_to_ids(sentences, word2id, max_len=10)
print('Input:', sentences)
print('Ids: {}\nSentences lengths: {}'.format(ids, sent_lens))

Input: ('3764-3386', '378')
Ids: [[7, 11, 10, 8, 4, 7, 7, 12, 10, 1], [7, 11, 12, 1, 2, 2, 2, 2, 2, 2]]
Sentences lengths: [10, 4]


## 2. Encoder-Decoder architecture

Encoder-Decoder is a successful architecture for Seq2Seq tasks with different lengths of input and output sequences. The main idea is to use two recurrent neural networks, where the first neural network *encodes* the input sequence into a real-valued vector and then the second neural network *decodes* this vector into the output sequence.

In [15]:
# Specify network architecture
class Seq2SeqModel(object):
    """Encoder-decoder model for the equation-solving task.

    The class body is intentionally empty: the graph-building, training and
    prediction functions are defined in the following cells and attached with
    ``Seq2SeqModel.<name> = classmethod(<function>)``. Because they are
    attached as classmethods, the ``self`` they receive is the class itself,
    so the tensors they create are stored as class attributes.
    """
    pass

In [16]:
# Create placeholders to specify data feed
def declare_placeholders(self):
    """Create every placeholder through which data is fed to the model.

    Sets on ``self``:
        input_batch, input_batch_lengths: int32 placeholders of rank 2 and 1
            for the encoder input ids and their actual lengths.
        ground_truth, ground_truth_lengths: int32 placeholders of rank 2 and 1
            for the target ids and their actual lengths.
        dropout_ph: scalar float32 keep probability; evaluates to 1.0 when not fed.
        learning_rate_ph: scalar float32 learning rate; has no default.
    """
    # Encoder side.
    self.input_batch = tf.placeholder(tf.int32, shape=(None, None), name='input_batch')
    self.input_batch_lengths = tf.placeholder(tf.int32, shape=(None,), name='input_batch_lengths')

    # Decoder side (ground truth).
    self.ground_truth = tf.placeholder(tf.int32, shape=(None, None), name='ground_truth')
    self.ground_truth_lengths = tf.placeholder(tf.int32, shape=(None,), name='ground_truth_lengths')

    # Hyper-parameters fed per run.
    keep_everything = tf.cast(1.0, tf.float32)
    self.dropout_ph = tf.placeholder_with_default(keep_everything, shape=[])
    self.learning_rate_ph = tf.placeholder(tf.float32, shape=[])

Seq2SeqModel.__declare_placeholders = classmethod(declare_placeholders)

In [17]:
# Specify embedding layers of neural network 
def create_embeddings(self, vocab_size, embeddings_size):
    """Create the embedding matrix and embed the input batch.

    Args:
        vocab_size: number of rows of the embedding matrix.
        embeddings_size: number of columns of the embedding matrix.

    Sets ``self.embeddings`` (a float32 variable initialized uniformly in
    [-1, 1)) and ``self.input_batch_embedded`` (lookup of ``self.input_batch``).
    """
    initial_matrix = tf.random_uniform(
        shape=(vocab_size, embeddings_size), minval=-1.0, maxval=1.0)
    self.embeddings = tf.Variable(
        initial_value=initial_matrix,
        dtype=tf.float32,
        name='embeddings',
    )

    # Replace every input id by its embedding row.
    self.input_batch_embedded = tf.nn.embedding_lookup(
        params=self.embeddings, ids=self.input_batch)

Seq2SeqModel.__create_embeddings = classmethod(create_embeddings)

#### Encoder

The first RNN of the current architecture is called an *encoder* and serves for encoding an input sequence to a real-valued vector. Input of this RNN is an embedded input batch. Since sentences in the same batch could have different actual lengths, we also provide input lengths to avoid unnecessary computations. The final encoder state will be passed to the second RNN (decoder), which we will create soon. 

- TensorFlow provides a number of [RNN cells](https://www.tensorflow.org/api_guides/python/contrib.rnn#Core_RNN_Cells_for_use_with_TensorFlow_s_core_RNN_methods) ready for use. We suggest that you use [GRU cell](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/GRUCell), but you can also experiment with other types. 
- Wrap your cells with [DropoutWrapper](https://www.tensorflow.org/api_docs/python/tf/contrib/rnn/DropoutWrapper). Dropout is an important regularization technique for neural networks. Specify input keep probability using the dropout placeholder that we created before.
- Combine the defined encoder cells with [Dynamic RNN](https://www.tensorflow.org/api_docs/python/tf/nn/dynamic_rnn). Use the embedded input batches and their lengths here.
- Use *dtype=tf.float32* everywhere.

In [18]:
# Specify RNN encoder architecture to compute input sequence to a real-valued vector
def build_encoder(self, hidden_size):
    """Build the encoder RNN over the embedded input batch.

    Args:
        hidden_size: number of units of the GRU cell.

    Sets ``self.final_encoder_state`` to the final state returned by
    ``tf.nn.dynamic_rnn``; the per-step outputs are discarded.
    """
    # GRU cell whose inputs go through dropout (keep probability from dropout_ph).
    gru_cell = tf.contrib.rnn.GRUCell(hidden_size)
    cell_with_dropout = tf.contrib.rnn.DropoutWrapper(
        gru_cell,
        input_keep_prob=self.dropout_ph,
        dtype=tf.float32,
    )

    # Unroll over the batch; sequence_length is set to the actual input lengths.
    _step_outputs, final_state = tf.nn.dynamic_rnn(
        cell=cell_with_dropout,
        inputs=self.input_batch_embedded,
        sequence_length=self.input_batch_lengths,
        dtype=tf.float32,
    )
    self.final_encoder_state = final_state

Seq2SeqModel.__build_encoder = classmethod(build_encoder)

In [19]:
# Specify decoder architecture for computing output sequence
# Two helpers are used: TrainingHelper and GreedyEmbeddingHelper
def build_decoder(self, hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id):
    """Build the training and the inference decoder on top of the encoder state.

    Both decoders are produced by the inner ``decode`` function under the
    variable scope 'decode'; the second call passes ``reuse=True`` so that the
    inference decoder reuses the variables created for the training decoder.

    Reads ``self.input_batch``, ``self.ground_truth``,
    ``self.ground_truth_lengths``, ``self.embeddings``, ``self.dropout_ph`` and
    ``self.final_encoder_state``, which must have been created before.

    Args:
        hidden_size: number of units of the decoder GRU cell.
        vocab_size: output size of the projection wrapper (one logit per symbol).
        max_iter: passed to ``dynamic_decode`` as ``maximum_iterations``.
        start_symbol_id: id used as the first decoder input of every sequence.
        end_symbol_id: id given to ``GreedyEmbeddingHelper`` as the end token.

    Sets ``self.ground_truth_embedded``, ``self.train_outputs`` and
    ``self.infer_outputs``.
    """
    # Use start symbols as the decoder inputs at the first time step.
    batch_size = tf.shape(self.input_batch)[0]
    start_tokens = tf.fill([batch_size], start_symbol_id)
    # Prepend the start symbol to the ground truth along axis 1.
    ground_truth_as_input = tf.concat([tf.expand_dims(start_tokens, 1), self.ground_truth], 1)
    
    # Use the embedding layer defined before to look up embeddings for ground_truth_as_input. 
    self.ground_truth_embedded = tf.nn.embedding_lookup(self.embeddings,ground_truth_as_input)
     
    # Create TrainingHelper for the train stage.
    train_helper = tf.contrib.seq2seq.TrainingHelper(self.ground_truth_embedded, 
                                                     self.ground_truth_lengths)
    # Create GreedyEmbeddingHelper for the inference stage.
    # You should provide the embedding layer, start_tokens and index of the end symbol.
    infer_helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(self.embeddings,start_tokens,end_symbol_id)    
  
    # Define decoder and return results
    def decode(helper, scope, reuse=None):        
        """Build one decoder driven by ``helper`` inside variable scope ``scope``."""
        with tf.variable_scope(scope, reuse=reuse):
            # Create GRUCell with dropout. Do not forget to set the reuse flag properly.
            decoder_cell = tf.contrib.rnn.GRUCell(num_units=hidden_size,reuse=reuse)
            decoder_cell = tf.contrib.rnn.DropoutWrapper(decoder_cell,input_keep_prob=self.dropout_ph)
            
            # Create a projection wrapper.
            decoder_cell = tf.contrib.rnn.OutputProjectionWrapper(decoder_cell, vocab_size, reuse=reuse)
            
            # Create BasicDecoder, pass the defined cell, a helper, and initial state.
            # The initial state should be equal to the final state of the encoder!
            decoder = tf.contrib.seq2seq.BasicDecoder(
                cell=decoder_cell,
                helper=helper,
                initial_state=self.final_encoder_state)
            
            
            # The first returning argument of dynamic_decode contains two fields: 1) rnn_output (predicted logits) and 2) sample_id (predictions)
            outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoder=decoder, maximum_iterations=max_iter, 
                                                              output_time_major=False, impute_finished=True)
            return outputs
        
    # Same scope name for both calls; the second one reuses the first one's variables.
    self.train_outputs = decode(train_helper, 'decode')
    self.infer_outputs = decode(infer_helper, 'decode', reuse=True)
    
Seq2SeqModel.__build_decoder = classmethod(build_decoder)

In [20]:
# Compute sequence loss (masked cross-entropy loss with logits)
def compute_loss(self):
    """Define ``self.loss`` as the masked sequence loss of the training decoder.

    The logits come from ``self.train_outputs.rnn_output`` and the targets
    from ``self.ground_truth``. Positions beyond each sequence's
    ``ground_truth_lengths`` entry get weight 0 and do not contribute.
    """
    # Boolean mask from the target lengths, converted to float weights.
    length_mask = tf.sequence_mask(self.ground_truth_lengths)
    weights = tf.cast(length_mask, dtype=tf.float32)

    self.loss = tf.contrib.seq2seq.sequence_loss(
        logits=self.train_outputs.rnn_output,
        targets=self.ground_truth,
        weights=weights,
        average_across_timesteps=True,
        average_across_batch=True,
        softmax_loss_function=None,
        name=None,
    )

Seq2SeqModel.__compute_loss = classmethod(compute_loss)

In [21]:
# Specify train_op to optimize self.loss
def perform_optimization(self):
    """Define ``self.train_op``, the op that minimizes ``self.loss``.

    Uses the 'Adam' optimizer with the learning rate fed through
    ``self.learning_rate_ph`` and gradient clipping set to 1.0.
    """
    step_counter = tf.train.get_global_step()
    self.train_op = tf.contrib.layers.optimize_loss(
        loss=self.loss,
        global_step=step_counter,
        learning_rate=self.learning_rate_ph,
        optimizer='Adam',
        clip_gradients=1.0,
    )

Seq2SeqModel.__perform_optimization = classmethod(perform_optimization)

In [22]:
# Initialize model
def init_model(self, vocab_size, embeddings_size, hidden_size, 
               max_iter, start_symbol_id, end_symbol_id, padding_symbol_id):
    """Build the whole graph: placeholders, embeddings, encoder, decoder, loss, optimizer.

    The build steps run in this fixed order because each one reads attributes
    created by the previous ones.

    Args:
        vocab_size: vocabulary size, forwarded to the embeddings and the decoder.
        embeddings_size: embedding dimension.
        hidden_size: number of units of the encoder and decoder cells.
        max_iter: maximum number of decoding steps, forwarded to the decoder.
        start_symbol_id: id of the start symbol, forwarded to the decoder.
        end_symbol_id: id of the end symbol, forwarded to the decoder.
        padding_symbol_id: accepted but not used anywhere in this function.
    """
    
    # These double-underscore names resolve as written: this function is
    # defined at module level, so no private-name mangling is applied.
    self.__declare_placeholders()
    self.__create_embeddings(vocab_size, embeddings_size)
    self.__build_encoder(hidden_size)
    self.__build_decoder(hidden_size, vocab_size, max_iter, start_symbol_id, end_symbol_id)
    
    # Compute loss and back-propagate.
    self.__compute_loss()
    self.__perform_optimization()
    
    # Get predictions for evaluation.
    self.train_predictions = self.train_outputs.sample_id
    self.infer_predictions = self.infer_outputs.sample_id

# Installed as a classmethod: `self` above is the Seq2SeqModel class itself,
# so the attributes are stored on the class rather than on the new instance.
Seq2SeqModel.__init__ = classmethod(init_model)

## 3. Train the network and predict output

[Session.run](https://www.tensorflow.org/api_docs/python/tf/Session#run) is the call that triggers computation in the graph that we have defined. To train the network, we need to compute *self.train_op*. To predict output, we just need to compute *self.infer_predictions*. In either case, we need to feed actual data through the placeholders that we defined above. 

In [23]:
# Define training function
def train_on_batch(self, session, X, X_seq_len, Y, Y_seq_len, learning_rate, dropout_keep_probability):
    """Run one optimization step on a batch.

    Args:
        session: session in which the graph is run.
        X, X_seq_len: input ids and their actual lengths.
        Y, Y_seq_len: ground-truth ids and their actual lengths.
        learning_rate: value fed to the learning-rate placeholder.
        dropout_keep_probability: value fed to the dropout placeholder.

    Returns:
        ``(pred, loss)``: the training decoder's predictions and the loss value.
    """
    feed = {
        self.input_batch: X,
        self.input_batch_lengths: X_seq_len,
        self.ground_truth: Y,
        self.ground_truth_lengths: Y_seq_len,
        self.learning_rate_ph: learning_rate,
        self.dropout_ph: dropout_keep_probability,
    }
    fetches = [self.train_predictions, self.loss, self.train_op]
    # The third result belongs to train_op and is not needed by callers.
    pred, loss, _ = session.run(fetches, feed_dict=feed)
    return pred, loss

Seq2SeqModel.train_on_batch = classmethod(train_on_batch)

In [24]:
# Implement function to predict output for some input sequence
def predict_for_batch(self, session, X, X_seq_len):
    """Run the inference decoder on a batch of inputs.

    Args:
        session: session in which the graph is run.
        X, X_seq_len: input ids and their actual lengths.

    Returns:
        The value of ``self.infer_predictions`` for the batch. The dropout
        placeholder is not fed, so its default is used.
    """
    feed = {
        self.input_batch: X,
        self.input_batch_lengths: X_seq_len,
    }
    fetched = session.run([self.infer_predictions], feed_dict=feed)
    pred = fetched[0]
    return pred

Seq2SeqModel.predict_for_batch = classmethod(predict_for_batch)

In [25]:
# Define function to compute loss and validate results
def predict_for_batch_with_loss(self, session, X, X_seq_len, Y, Y_seq_len):
    """Predict for a batch and also evaluate the loss against the ground truth.

    Args:
        session: session in which the graph is run.
        X, X_seq_len: input ids and their actual lengths.
        Y, Y_seq_len: ground-truth ids and their actual lengths.

    Returns:
        ``(pred, loss)``: the inference decoder's predictions and the value of
        ``self.loss``. No optimization step is run.
    """
    feed = {
        self.input_batch: X,
        self.input_batch_lengths: X_seq_len,
        self.ground_truth: Y,
        self.ground_truth_lengths: Y_seq_len,
    }
    fetches = [self.infer_predictions, self.loss]
    pred, loss = session.run(fetches, feed_dict=feed)
    return pred, loss

Seq2SeqModel.predict_for_batch_with_loss = classmethod(predict_for_batch_with_loss)

## 4. Run experiment and evaluate results

In [26]:
# Define model parameters
# Start from an empty default graph so that re-running this cell does not
# pile a second copy of the model on top of the previous one.
tf.reset_default_graph()

model = Seq2SeqModel(
    vocab_size = len(word2id),
    embeddings_size=20,
    max_iter=7,  # upper bound on decoding steps (forwarded to dynamic_decode)
    hidden_size=512,
    start_symbol_id=word2id['^'],
    end_symbol_id=word2id['$'],
    padding_symbol_id = word2id['#']
)

# Training hyper-parameters
batch_size = 128
n_epochs = 10
learning_rate = 0.001
dropout_keep_probability = 0.5  # keep probability fed to the dropout placeholder while training
max_len = 20  # cap on the encoded sequence length passed to batch_to_ids

# Number of full batches per epoch; only used in the progress messages.
n_step = int(len(train_set) / batch_size)

In [27]:
# Train data
session = tf.Session()
session.run(tf.global_variables_initializer())
            
# Per-epoch evaluation results; one entry is appended per epoch.
invalid_number_prediction_counts = []
all_model_predictions = []
all_ground_truth = []

print('Start training... \n')
for epoch in range(n_epochs):  
    random.shuffle(train_set)
    random.shuffle(test_set)
    
    print('Train: epoch', epoch + 1)
    for n_iter, (X_batch, Y_batch) in enumerate(generate_batches(train_set, batch_size=batch_size)):
        # prepare the data (X_batch and Y_batch) for training
        X_ids, X_sent_lens = batch_to_ids(X_batch, word2id, max_len=max_len)
        Y_ids, Y_sent_lens = batch_to_ids(Y_batch, word2id, max_len=max_len)
        # using function batch_to_ids
        predictions, loss = model.train_on_batch(
            session,
            X_ids,
            X_sent_lens,
            Y_ids,
            Y_sent_lens,
            learning_rate,
            dropout_keep_probability
        )
        
        if n_iter % 200 == 0:
            print("Epoch: [%d/%d], step: [%d/%d], loss: %f" % (epoch + 1, n_epochs, n_iter + 1, n_step, loss))
                
    # Quick qualitative check on the first batch of the (shuffled) test set.
    X_sent, Y_sent = next(generate_batches(test_set, batch_size=batch_size))
    # prepare test data (X_sent and Y_sent) for predicting 
    X, X_sent_lens = batch_to_ids(X_sent, word2id, max_len=max_len)
    Y, Y_sent_lens = batch_to_ids(Y_sent, word2id, max_len=max_len)
    # quality and computing value of the loss function
    # using function batch_to_ids
    
    predictions, loss = model.predict_for_batch_with_loss(
        session,
        X,
        X_sent_lens,
        Y,
        Y_sent_lens
    )
    print('Test: epoch', epoch + 1, 'loss:', loss,)
    for x, y, p  in list(zip(X, Y, predictions))[:3]:
        print('X:',''.join(ids_to_sentence(x, id2word)))
        print('Y:',''.join(ids_to_sentence(y, id2word)))
        print('O:',''.join(ids_to_sentence(p, id2word)))
        print('')

    model_predictions = []
    ground_truth = []
    invalid_number_prediction_count = 0
    # For the whole test set calculate ground-truth values (as integer numbers) and prediction values (also as integers) to calculate metrics.
    for X_batch, Y_batch in generate_batches(test_set, batch_size=batch_size):
        X_ids, X_sent_lens = batch_to_ids(X_batch, word2id, max_len=max_len)
        Y_ids, Y_sent_lens = batch_to_ids(Y_batch, word2id, max_len=max_len)
        predictions = model.predict_for_batch(session, X_ids, X_sent_lens)
        for y, p in zip(Y_ids, predictions):
            # Keep only the text before the first end symbol. split() is used
            # instead of slicing with str.find(): find() returns -1 when the
            # symbol is absent, and [:-1] would then silently drop the last
            # predicted character and score a truncated number as valid.
            y_sent = ''.join(ids_to_sentence(y, id2word)).split(end_symbol, 1)[0]
            p_sent = ''.join(ids_to_sentence(p, id2word)).split(end_symbol, 1)[0]
            # Only predictions that parse as an (optionally negative) integer
            # enter the metric; the rest are counted as invalid.
            if p_sent.isdigit() or (p_sent.startswith('-') and p_sent[1:].isdigit()):
                model_predictions.append(int(p_sent))
                ground_truth.append(int(y_sent))
            else:
                invalid_number_prediction_count += 1
    all_model_predictions.append(model_predictions)
    all_ground_truth.append(ground_truth)
    invalid_number_prediction_counts.append(invalid_number_prediction_count)
    print(invalid_number_prediction_count)
            
print('\n...training finished.')

Start training... 

Train: epoch 1
Epoch: [1/10], step: [1/625], loss: 2.709073
Epoch: [1/10], step: [201/625], loss: 1.821524
Epoch: [1/10], step: [401/625], loss: 1.693949
Epoch: [1/10], step: [601/625], loss: 1.664004
Test: epoch 1 loss: 1.5824859
X: 2444-171$#
Y: 2273$#
O: 2991$^

X: 7054-4337$
Y: 2717$#
O: 2999$^

X: 7958-1360$
Y: 6598$#
O: 7477$^

0
Train: epoch 2
Epoch: [2/10], step: [1/625], loss: 1.601699
Epoch: [2/10], step: [201/625], loss: 1.566108
Epoch: [2/10], step: [401/625], loss: 1.512849
Epoch: [2/10], step: [601/625], loss: 1.517137
Test: epoch 2 loss: 1.414193
X: 3204+4834$
Y: 8038$#
O: 7922$^

X: 1853-2096$
Y: -243$#
O: -106$^

X: 4679+6367$
Y: 11046$
O: 11242$

0
Train: epoch 3
Epoch: [3/10], step: [1/625], loss: 1.422339
Epoch: [3/10], step: [201/625], loss: 1.455282
Epoch: [3/10], step: [401/625], loss: 1.408459
Epoch: [3/10], step: [601/625], loss: 1.383311
Test: epoch 3 loss: 1.3355594
X: 8208-7624$
Y: 584$##
O: 1022$^

X: 3091-3001$
Y: 90$###
O: -222$^

X: 5

In [28]:
# Use MAE metric to evaluate the trained model
# One line per epoch. The MAE covers only the predictions that parsed as
# integers during evaluation; the unparsable ones are reported separately
# as "Invalid numbers".
for i, (gts, predictions, invalid_number_prediction_count) in enumerate(zip(all_ground_truth,
                                                                            all_model_predictions,
                                                                            invalid_number_prediction_counts), 1):
    mae = mean_absolute_error(gts, predictions)
    print("Epoch: %i, MAE: %f, Invalid numbers: %i" % (i, mae, invalid_number_prediction_count))

Epoch: 1, MAE: 949.963400, Invalid numbers: 0
Epoch: 2, MAE: 337.342200, Invalid numbers: 0
Epoch: 3, MAE: 234.103500, Invalid numbers: 0
Epoch: 4, MAE: 195.828450, Invalid numbers: 0
Epoch: 5, MAE: 171.797500, Invalid numbers: 0
Epoch: 6, MAE: 104.704935, Invalid numbers: 1
Epoch: 7, MAE: 54.836600, Invalid numbers: 0
Epoch: 8, MAE: 54.332850, Invalid numbers: 0
Epoch: 9, MAE: 37.148500, Invalid numbers: 0
Epoch: 10, MAE: 27.736800, Invalid numbers: 0
