In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from mxnet import nd, autograd, gluon
import mxnet as mx
from mxnet.gluon import nn, rnn

In [2]:
def n(digits =3):
    number = ''
    for i in range(np.random.randint(1, digits + 1)):
        number += np.random.choice(list('0123456789'))
    return int(number)

def padding(chars, maxlen):
    return chars + ' ' * (maxlen - len(chars))

N = 50000
N_train = int(N * .9)
N_validation = N - N_train

digits = 3
input_digits = digits * 2 + 3
output_digits = digits + 3

added = set()
questions = []
answers = []
answers_y = []

while len(questions) < N:
    a, b = n(), n()
    pair = tuple(sorted((a, b)))
    if pair in added:
        continue
        
    question = 'S{}+{}E'.format(a, b)
    question = padding(question, input_digits)
    answer = 'S' + str(a + b) + 'E'
    answer = padding(answer, output_digits)
    answer_y = str(a + b) + 'E'
    answer_y = padding(answer_y, output_digits)
    
    added.add(pair)
    questions.append(question)
    answers.append(answer)
    answers_y.append(answer_y)
    
chars = '0123456789+SE '
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

X = np.zeros((len(questions), input_digits, len(chars)), dtype=np.integer)
Y = np.zeros((len(questions), digits + 3, len(chars)), dtype=np.integer)
Z = np.zeros((len(questions), digits + 3, len(chars)), dtype=np.integer)

for i in range(N):
    for t, char in enumerate(questions[i]):
        X[i, t, char_indices[char]] = 1
    for t, char in enumerate(answers[i]):
        Y[i, t, char_indices[char]] = 1
    for t, char in enumerate(answers_y[i]):
        Z[i, t, char_indices[char]] = 1
    
X_train, X_validation, Y_train, Y_validation, Z_train, Z_validation = \
    train_test_split(X, Y, Z, train_size=N_train)



### Testset

In [3]:
def gen_n_test(N):
    q = []
    y = []
    for i in range(N):
        a, b = n(), n() 
        question = '{}+{}'.format(a, b)
        answer_y = str(a + b)
        q.append(question)
        y.append(answer_y)
    return(q,y)

In [4]:
class colors:
    ok = '\033[92m'
    fail = '\033[91m'
    close = '\033[0m'

In [5]:
class calculator(gluon.Block):
    def __init__(self, n_hidden, in_seq_len, out_seq_len, vocab_size, **kwargs):
        super(calculator, self).__init__(**kwargs)
        self.in_seq_len = in_seq_len
        self.out_seq_len = out_seq_len
        self.n_hidden = n_hidden
        self.vocab_size = vocab_size
        
        with self.name_scope():
            self.encoder = rnn.LSTMCell(hidden_size = n_hidden)
            self.decoder = rnn.LSTMCell(hidden_size = n_hidden)
            self.batchnorm = nn.BatchNorm(axis = 2)
            self.dense = nn.Dense(self.vocab_size, flatten = False)
            
    def forward(self, inputs, outputs):
        # Since we don't use intermediate states for 'though vector', we don't need to unroll it.
        # In the later examples, we will use LSTM class rather than LSTMCell class.
        enout, (next_h, next_c) = self.encoder.unroll(inputs = inputs
                                                    , length = self.in_seq_len
                                                    , merge_outputs = True)
        for i in range(self.out_seq_len):
            deout, (next_h, next_c) = self.decoder(outputs[:, i, :], [next_h, next_c],)
            if i == 0:
                deouts = deout
            else:
                deouts = nd.concat(deouts, deout, dim = 1)
            #print('i= {}, deouts= {}'.format(i, deouts.shape))
        
        deouts = nd.reshape(deouts, (-1, self.out_seq_len, self.n_hidden))
        deouts = self.batchnorm(deouts)
        deouts_fc = self.dense(deouts)
        return deouts_fc
    
    def calculation(self, input_str, char_indices, indices_char, input_digits = 9, lchars = 14, ctx = mx.gpu()):
        input_str = 'S' + input_str + 'E'
        X = nd.zeros((1, input_digits, lchars), ctx = ctx)
        for t, char in enumerate(input_str):
            X[0, t, char_indices[char]] = 1
        Y_init = nd.zeros((1, lchars), ctx = ctx)
        Y_init[0, char_indices['S']] = 1
        enout, (next_h, next_c) = self.encoder.unroll(inputs = X, length = self.in_seq_len, merge_outputs = True)
        deout = Y_init
        
        for i in range(self.out_seq_len):
            deout, (next_h, next_c) = self.decoder(deout, [next_h, next_c])
            #print('dim deout = {}'.format(deout.shape))
            deout = nd.expand_dims(deout, axis = 1)
            #print('dim deout = {}'.format(deout.shape))
            deout = self.batchnorm(deout)
            deout = deout[:, 0, :]
            #print('dim deout = {}'.format(deout.shape))

            deout_sm = self.dense(deout)
            deout = nd.one_hot(nd.argmax(nd.softmax(deout_sm, axis = 1), axis = 1), depth = self.vocab_size)
            if i == 0:
                ret_seq = indices_char[nd.argmax(deout_sm, axis = 1).asnumpy()[0].astype('int')]
            else:
                ret_seq += indices_char[nd.argmax(deout_sm, axis = 1).asnumpy()[0].astype('int')]

            if ret_seq[-1] == ' ' or ret_seq[-1] == 'E':
                break
        return ret_seq.strip('E').strip()
        

In [6]:
tr_set = gluon.data.ArrayDataset(X_train, Y_train, Z_train)
tr_data_iterator = gluon.data.DataLoader(tr_set, batch_size=256, shuffle=True)

te_set =gluon.data.ArrayDataset(X_validation, Y_validation, Z_validation)
te_data_iterator = gluon.data.DataLoader(te_set, batch_size=256, shuffle=True)

ctx = mx.gpu()
model = calculator(300, 9, 6, 14)
model.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

trainer = gluon.Trainer(model.collect_params(), 'rmsprop')
loss = gluon.loss.SoftmaxCrossEntropyLoss(axis = 2, sparse_label = False)

In [7]:
print(model)

calculator(
  (encoder): LSTMCell(None -> 1200)
  (decoder): LSTMCell(None -> 1200)
  (batchnorm): BatchNorm(axis=2, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=None)
  (dense): Dense(None -> 14, linear)
)


In [8]:
def calculate_loss(model, data_iter, loss_obj, ctx = ctx):
    test_loss = []
    for i, (x_data, y_data, z_data) in enumerate(data_iter):
        x_data = x_data.as_in_context(ctx).astype('float32')
        y_data = y_data.as_in_context(ctx).astype('float32')
        z_data = z_data.as_in_context(ctx).astype('float32')
        with autograd.predict_mode():
            z_output = model(x_data, y_data)
            loss_te = loss_obj(z_output, z_data)
        curr_loss = nd.mean(loss_te).asscalar()
        test_loss.append(curr_loss)
    return np.mean(test_loss)

In [9]:
epochs = 201

tot_test_loss = []
tot_train_loss = []
for e in range(epochs):
    train_loss = []
    for i, (x_data, y_data, z_data) in enumerate(tr_data_iterator):
        x_data = x_data.as_in_context(ctx).astype('float32')
        y_data = y_data.as_in_context(ctx).astype('float32')
        z_data = z_data.as_in_context(ctx).astype('float32')
        
        with autograd.record():
            z_output = model(x_data, y_data)
            loss_ = loss(z_output, z_data)
        loss_.backward()
        trainer.step(x_data.shape[0])
        curr_loss = nd.mean(loss_).asscalar()
        train_loss.append(curr_loss)
        
    if e % 10 == 0:
        q, y = gen_n_test(10)
        for i in range(10):
            with autograd.predict_mode():
                p = model.calculation(q[i], char_indices, indices_char).strip()
                iscorr = 1 if p == y[i] else 0
                if iscorr == 1:
                    print(colors.ok + '☑' + colors.close, end=' ')
                else:
                    print(colors.fail + '☒' + colors.close, end=' ')
                print("{} = {}({}) 1/0 {}".format(q[i], p, y[i], str(iscorr) ))
    #caculate test loss
    test_loss = calculate_loss(model, te_data_iterator, loss_obj = loss, ctx=ctx) 

    print("Epoch %s. Train Loss: %s, Test Loss : %s" % (e, np.mean(train_loss), test_loss))    
    tot_test_loss.append(test_loss)
    tot_train_loss.append(np.mean(train_loss))


[91m☒[0m 252+26 = 222(278) 1/0 0
[91m☒[0m 89+84 = 1032(173) 1/0 0
[91m☒[0m 14+66 = 144(80) 1/0 0
[91m☒[0m 1+0 = 122(1) 1/0 0
[91m☒[0m 161+175 = 1122(336) 1/0 0
[91m☒[0m 0+270 = 122(270) 1/0 0
[91m☒[0m 1+85 = 122(86) 1/0 0
[91m☒[0m 572+661 = 1144(1233) 1/0 0
[91m☒[0m 19+905 = 1003(924) 1/0 0
[91m☒[0m 0+13 = 122(13) 1/0 0
Epoch 0. Train Loss: 1.1944102, Test Loss : 1.1342056
Epoch 1. Train Loss: 1.1213632, Test Loss : 1.1067674
Epoch 2. Train Loss: 1.0915885, Test Loss : 1.0877845
Epoch 3. Train Loss: 1.0276933, Test Loss : 0.96420777
Epoch 4. Train Loss: 0.92164785, Test Loss : 0.87567604
Epoch 5. Train Loss: 0.83364654, Test Loss : 0.830315
Epoch 6. Train Loss: 0.76330084, Test Loss : 0.73404014
Epoch 7. Train Loss: 0.69588923, Test Loss : 0.6664765
Epoch 8. Train Loss: 0.6245909, Test Loss : 0.5847117
Epoch 9. Train Loss: 0.54041433, Test Loss : 0.51308846
[91m☒[0m 4+28 = 212(32) 1/0 0
[91m☒[0m 1+62 = 632(63) 1/0 0
[91m☒[0m 0+952 = 942(952) 1/0 0
[91m☒[0m 5

[92m☑[0m 5+55 = 60(60) 1/0 1
[92m☑[0m 2+9 = 11(11) 1/0 1
[92m☑[0m 38+9 = 47(47) 1/0 1
[92m☑[0m 136+950 = 1086(1086) 1/0 1
[91m☒[0m 13+7 = 19(20) 1/0 0
[92m☑[0m 6+5 = 11(11) 1/0 1
[91m☒[0m 2+8 = 110(10) 1/0 0
[91m☒[0m 5+97 = 101(102) 1/0 0
[91m☒[0m 159+288 = 437(447) 1/0 0
[91m☒[0m 76+8 = 94(84) 1/0 0
Epoch 90. Train Loss: 0.0024783742, Test Loss : 0.048588507
Epoch 91. Train Loss: 0.0018179704, Test Loss : 0.0076959417
Epoch 92. Train Loss: 0.0016809956, Test Loss : 0.0071055098
Epoch 93. Train Loss: 0.001687593, Test Loss : 0.0075776307
Epoch 94. Train Loss: 0.002192769, Test Loss : 0.0075569926
Epoch 95. Train Loss: 0.0028593973, Test Loss : 0.0055113
Epoch 96. Train Loss: 0.0019556386, Test Loss : 0.016315576
Epoch 97. Train Loss: 0.0018179577, Test Loss : 0.008407067
Epoch 98. Train Loss: 0.0018593405, Test Loss : 0.006684537
Epoch 99. Train Loss: 0.0026867779, Test Loss : 0.008272896
[92m☑[0m 22+415 = 437(437) 1/0 1
[92m☑[0m 700+35 = 735(735) 1/0 1
[92m☑[

Epoch 175. Train Loss: 2.749527e-06, Test Loss : 0.002226091
Epoch 176. Train Loss: 2.7499957e-06, Test Loss : 0.0022506018
Epoch 177. Train Loss: 2.6849543e-06, Test Loss : 0.0021794266
Epoch 178. Train Loss: 2.6770176e-06, Test Loss : 0.0021291904
Epoch 179. Train Loss: 2.646597e-06, Test Loss : 0.0022371626
[91m☒[0m 103+7 = 100(110) 1/0 0
[92m☑[0m 9+1 = 10(10) 1/0 1
[92m☑[0m 933+85 = 1018(1018) 1/0 1
[92m☑[0m 7+721 = 728(728) 1/0 1
[91m☒[0m 8+0 = 9(8) 1/0 0
[92m☑[0m 67+3 = 70(70) 1/0 1
[92m☑[0m 65+9 = 74(74) 1/0 1
[92m☑[0m 137+449 = 586(586) 1/0 1
[92m☑[0m 0+566 = 566(566) 1/0 1
[92m☑[0m 769+12 = 781(781) 1/0 1
Epoch 180. Train Loss: 2.6078012e-06, Test Loss : 0.0021729602
Epoch 181. Train Loss: 2.5343752e-06, Test Loss : 0.0025367003
Epoch 182. Train Loss: 2.5487607e-06, Test Loss : 0.0022060536
Epoch 183. Train Loss: 2.5458949e-06, Test Loss : 0.0022452483
Epoch 184. Train Loss: 2.495283e-06, Test Loss : 0.0021666258
Epoch 185. Train Loss: 2.4933092e-06, Test L