In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from mxnet import nd, autograd, gluon
import mxnet as mx
from mxnet.gluon import nn, rnn

In [2]:
def n(digits=3):
    """Draw a random non-negative integer written with 1 to ``digits`` decimal digits.

    The number of digits is drawn first, then each digit is drawn from
    '0'-'9'.  Leading zeros can occur and are dropped by the final ``int``
    conversion (e.g. '007' becomes 7).
    """
    length = np.random.randint(1, digits + 1)
    drawn = [np.random.choice(list('0123456789')) for _ in range(length)]
    return int(''.join(drawn))

def padding(chars, maxlen):
    """Right-pad ``chars`` with spaces up to ``maxlen`` characters.

    A string that is already ``maxlen`` characters or longer is returned unchanged.
    """
    return chars.ljust(maxlen)

# Dataset size and train/validation split.
N = 50000
N_train = int(N * .9)
N_validation = N - N_train

# Sequence lengths, derived from the maximum number of digits per operand.
digits = 3
input_digits = digits * 2 + 3   # 'S' + operand + '+' + operand + 'E'
output_digits = digits + 3      # 'S' + sum (at most digits + 1 characters) + 'E'

# The generation loop below only ends once N distinct unordered operand pairs
# have been drawn, so fail fast when that many pairs do not exist.
max_unique_pairs = 10 ** digits * (10 ** digits + 1) // 2
if N > max_unique_pairs:
    raise ValueError(
        'N={} exceeds the {} distinct operand pairs available with {} digits'.format(
            N, max_unique_pairs, digits))

added = set()      # operand pairs already used
questions = []     # encoder input,  e.g. 'S12+345E '
answers = []       # decoder input,  e.g. 'S357E '
answers_y = []     # decoder target, e.g. '357E  ' (decoder input shifted left by one)

while len(questions) < N:
    a, b = n(digits), n(digits)
    # a+b and b+a count as the same problem, so key on the sorted pair.
    pair = tuple(sorted((a, b)))
    if pair in added:
        continue

    question = padding('S{}+{}E'.format(a, b), input_digits)
    answer = padding('S' + str(a + b) + 'E', output_digits)
    answer_y = padding(str(a + b) + 'E', output_digits)

    added.add(pair)
    questions.append(question)
    answers.append(answer)
    answers_y.append(answer_y)

# Character vocabulary; the trailing space is the padding symbol.
chars = '0123456789+SE '
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

# One-hot tensors of shape (sample, time step, vocabulary).
# np.int_ is the concrete dtype that the abstract np.integer used to resolve to;
# NumPy has deprecated passing abstract scalar types such as np.integer as a dtype.
X = np.zeros((len(questions), input_digits, len(chars)), dtype=np.int_)
Y = np.zeros((len(questions), output_digits, len(chars)), dtype=np.int_)
Z = np.zeros((len(questions), output_digits, len(chars)), dtype=np.int_)

for target, strings in ((X, questions), (Y, answers), (Z, answers_y)):
    for i, text in enumerate(strings):
        for t, char in enumerate(text):
            target[i, t, char_indices[char]] = 1

# Split all three tensors with the same shuffle so that rows stay aligned.
X_train, X_validation, Y_train, Y_validation, Z_train, Z_validation = \
    train_test_split(X, Y, Z, train_size=N_train)



### Test set

In [3]:
def gen_n_test(N):
    """Generate ``N`` random addition problems for spot-checking the model.

    Returns a tuple ``(questions, sums)`` of two equally long lists of strings:
    each question looks like '12+345' (no start/end markers, no padding) and
    the matching entry in ``sums`` is the decimal string of the result.
    """
    operand_pairs = [(n(), n()) for _ in range(N)]
    problems = ['{}+{}'.format(a, b) for a, b in operand_pairs]
    results = [str(a + b) for a, b in operand_pairs]
    return (problems, results)

In [4]:
class colors:
    """ANSI escape sequences used to colour the console markers printed during training."""
    close = '\033[0m'   # reset all attributes
    fail = '\033[91m'   # bright red foreground
    ok = '\033[92m'     # bright green foreground

In [5]:
class calculator(gluon.Block):
    """Sequence-to-sequence LSTM that reads a one-hot encoded question and emits the answer.

    A multi-layer LSTM encodes the question.  The final hidden/cell states of
    its first two layers initialise two stacked ``LSTMCell`` decoders, whose
    per-step outputs go through batch normalisation and a dense projection
    onto the vocabulary.

    Parameters
    ----------
    n_hidden : int
        Hidden size of the encoder and of both decoder cells.
    in_seq_len : int
        Length of the encoder input sequence (stored, not otherwise used here).
    out_seq_len : int
        Number of decoding steps.
    vocab_size : int
        Size of the one-hot vocabulary and of the output projection.
    enc_layer : int
        Number of encoder layers; must be at least 2 because the states of
        layers 0 and 1 seed the two decoder cells.
    dec_layer : int
        Accepted for interface compatibility; the decoder always has two cells.
    """
    def __init__(self, n_hidden, in_seq_len, out_seq_len, vocab_size, enc_layer, dec_layer = 1, **kwargs):
        super(calculator, self).__init__(**kwargs)
        if enc_layer < 2:
            # forward()/calculation() index encoder states [0] and [1].
            raise ValueError('enc_layer must be >= 2, got {}'.format(enc_layer))
        self.in_seq_len = in_seq_len
        self.out_seq_len = out_seq_len
        self.n_hidden = n_hidden
        self.vocab_size = vocab_size
        self.enc_layer = enc_layer

        with self.name_scope():
            self.encoder = rnn.LSTM(hidden_size = n_hidden, num_layers = enc_layer, layout = 'NTC')
            self.decoder_0 = rnn.LSTMCell(hidden_size = n_hidden)
            self.decoder_1 = rnn.LSTMCell(hidden_size = n_hidden)
            self.batchnorm = nn.BatchNorm(axis = 2)
            self.dense = nn.Dense(self.vocab_size, flatten = False)

    def forward(self, inputs, outputs):
        """Teacher-forced pass used for training and loss evaluation.

        ``inputs`` is the one-hot question batch in 'NTC' layout and
        ``outputs`` the one-hot decoder input whose step ``i`` is read as
        ``outputs[:, i, :]``.  Returns the dense projection of the decoder
        outputs, reshaped to (batch, out_seq_len, ...) before normalisation.
        """
        self.batch_size = inputs.shape[0]
        # Allocate the initial state on the device that holds the batch instead
        # of relying on a notebook-level ``ctx`` global being defined.
        begin_state = self.encoder.begin_state(batch_size = self.batch_size, ctx = inputs.context)
        _, (h, c) = self.encoder(inputs, begin_state) # h, c: n_layer * batch_size * n_hidden
        # Final states of encoder layers 0 and 1 seed decoder cells 0 and 1.
        next_h_0, next_c_0 = h[0], c[0] # batch_size * n_hidden
        next_h_1, next_c_1 = h[1], c[1] # batch_size * n_hidden
        step_outputs = []
        for i in range(self.out_seq_len):
            deout, (next_h_0, next_c_0) = self.decoder_0(outputs[:, i, :], [next_h_0, next_c_0])
            deout, (next_h_1, next_c_1) = self.decoder_1(deout, [next_h_1, next_c_1])
            step_outputs.append(deout)
        # Join the per-step outputs side by side, then unfold them into a time axis.
        deouts = nd.concat(*step_outputs, dim = 1)
        deouts = nd.reshape(deouts, (-1, self.out_seq_len, self.n_hidden))
        deouts = self.batchnorm(deouts)
        deouts_fc = self.dense(deouts)
        return deouts_fc

    def calculation(self, input_str, char_indices, indices_char, input_digits = 9, lchars = 14, ctx = mx.gpu()):
        """Greedily decode the answer to a single question such as '12+345'.

        ``input_str`` is wrapped in 'S'/'E' markers and padded with spaces to
        ``input_digits`` characters, so ``char_indices`` must contain ' '.
        Each decoding step feeds the one-hot of the previously predicted
        character back in, and decoding stops at the first 'E' or ' '.
        ``lchars`` is the vocabulary size used for the one-hot input and
        ``ctx`` the device on which the tensors are created.

        Returns the predicted string without the end marker or padding.
        Raises ``ValueError`` if the wrapped question exceeds ``input_digits``.
        """
        input_str = 'S' + input_str + 'E'
        if len(input_str) > input_digits:
            raise ValueError('question {!r} is longer than input_digits={}'.format(input_str, input_digits))
        # Training questions are space-padded and the padding is one-hot encoded
        # like any other character, so do the same here; all-zero vectors in the
        # trailing positions would be an input the encoder never saw.
        input_str = input_str + ' ' * (input_digits - len(input_str))
        X = nd.zeros((1, input_digits, lchars), ctx = ctx)
        for t, char in enumerate(input_str):
            X[0, t, char_indices[char]] = 1
        # The decoder starts from the one-hot start marker.
        Y_init = nd.zeros((1, lchars), ctx = ctx)
        Y_init[0, char_indices['S']] = 1
        begin_state = self.encoder.begin_state(batch_size = 1, ctx = ctx)
        _, (h, c) = self.encoder(X, begin_state)
        next_h_0, next_c_0 = h[0], c[0] # batch_size * n_hidden
        next_h_1, next_c_1 = h[1], c[1] # batch_size * n_hidden
        deout = Y_init

        ret_seq = ''
        for i in range(self.out_seq_len):
            deout, (next_h_0, next_c_0) = self.decoder_0(deout, [next_h_0, next_c_0])
            deout, (next_h_1, next_c_1) = self.decoder_1(deout, [next_h_1, next_c_1])
            # BatchNorm normalises axis 2, so add and remove a length-1 time axis.
            deout = nd.expand_dims(deout, axis = 1)
            deout = self.batchnorm(deout)
            deout = deout[:, 0, :]
            deout_sm = self.dense(deout)
            # Softmax is monotonic, so take the argmax of the logits directly and
            # reuse it for both the emitted character and the next decoder input.
            best = nd.argmax(deout_sm, axis = 1)
            deout = nd.one_hot(best, depth = self.vocab_size)
            ret_seq += indices_char[best.asnumpy()[0].astype('int')]

            if ret_seq[-1] == ' ' or ret_seq[-1] == 'E':
                break
        return ret_seq.strip('E').strip()
        

In [6]:
BATCH_SIZE = 256

# Training batches are reshuffled on every pass.
tr_set = gluon.data.ArrayDataset(X_train, Y_train, Z_train)
tr_data_iterator = gluon.data.DataLoader(tr_set, batch_size=BATCH_SIZE, shuffle=True)

# Evaluation does not need shuffling; a fixed order makes every validation
# pass see identical batches.
te_set = gluon.data.ArrayDataset(X_validation, Y_validation, Z_validation)
te_data_iterator = gluon.data.DataLoader(te_set, batch_size=BATCH_SIZE, shuffle=False)

In [7]:
ctx = mx.gpu()

# Take the sequence lengths and vocabulary size from the dataset constants so
# the network cannot drift out of sync with the encoding.
model = calculator(n_hidden = 300,
                   in_seq_len = input_digits,
                   out_seq_len = output_digits,
                   vocab_size = len(chars),
                   enc_layer = 2)
model.collect_params().initialize(mx.init.Xavier(), ctx = ctx)

trainer = gluon.Trainer(model.collect_params(), 'rmsprop')
# Labels are one-hot vectors along axis 2, hence sparse_label = False.
loss = gluon.loss.SoftmaxCrossEntropyLoss(axis = 2, sparse_label = False)

In [8]:
# Print the model's layer summary.
print(model)

calculator(
  (encoder): LSTM(None -> 300, NTC, num_layers=2)
  (decoder_0): LSTMCell(None -> 1200)
  (decoder_1): LSTMCell(None -> 1200)
  (batchnorm): BatchNorm(axis=2, eps=1e-05, momentum=0.9, fix_gamma=False, use_global_stats=False, in_channels=None)
  (dense): Dense(None -> 14, linear)
)


In [9]:
def calculate_loss(model, data_iter, loss_obj, ctx = ctx):
    """Return the mean loss of ``model`` over every batch in ``data_iter``.

    Each batch is an ``(x, y, z)`` triple: ``x`` and ``y`` are fed to the
    model and its output is scored against ``z`` with ``loss_obj``.  Batches
    are moved to ``ctx`` and cast to float32, and the model runs in predict
    mode.  Note that the default for ``ctx`` is bound to the global ``ctx``
    at the time this function is defined.

    The result is averaged over all loss values rather than over per-batch
    means, so a smaller final batch is not over-weighted.  Returns NaN when
    ``data_iter`` yields no batches.
    """
    loss_sum = 0.0
    n_values = 0
    for x_data, y_data, z_data in data_iter:
        x_data = x_data.as_in_context(ctx).astype('float32')
        y_data = y_data.as_in_context(ctx).astype('float32')
        z_data = z_data.as_in_context(ctx).astype('float32')
        with autograd.predict_mode():
            z_output = model(x_data, y_data)
            loss_te = loss_obj(z_output, z_data)
        loss_sum += float(nd.sum(loss_te).asscalar())
        n_values += loss_te.size
    if n_values == 0:
        return np.float32(np.nan)
    return np.float32(loss_sum / n_values)

In [10]:
epochs = 201

tot_test_loss = []    # validation loss per epoch
tot_train_loss = []   # mean training batch loss per epoch
for e in range(epochs):
    train_loss = []
    for x_data, y_data, z_data in tr_data_iterator:
        x_data = x_data.as_in_context(ctx).astype('float32')
        y_data = y_data.as_in_context(ctx).astype('float32')
        z_data = z_data.as_in_context(ctx).astype('float32')

        with autograd.record():
            z_output = model(x_data, y_data)
            loss_ = loss(z_output, z_data)
        loss_.backward()
        trainer.step(x_data.shape[0])
        curr_loss = nd.mean(loss_).asscalar()
        train_loss.append(curr_loss)

    # Every 10th epoch, decode a few random problems to eyeball progress.
    if e % 10 == 0:
        q, y = gen_n_test(10)
        for question, expected in zip(q, y):
            with autograd.predict_mode():
                # Pass the notebook's device and encoding sizes explicitly instead of
                # relying on the hard-coded defaults of ``calculation``.
                p = model.calculation(question, char_indices, indices_char,
                                      input_digits = input_digits,
                                      lchars = len(chars),
                                      ctx = ctx)
            iscorr = 1 if p == expected else 0
            if iscorr == 1:
                print(colors.ok + '☑' + colors.close, end=' ')
            else:
                print(colors.fail + '☒' + colors.close, end=' ')
            print("{} = {}({}) 1/0 {}".format(question, p, expected, str(iscorr) ))

    # calculate test loss
    test_loss = calculate_loss(model, te_data_iterator, loss_obj = loss, ctx=ctx)
    epoch_train_loss = np.mean(train_loss)

    print("Epoch %s. Train Loss: %s, Test Loss : %s" % (e, epoch_train_loss, test_loss))
    tot_test_loss.append(test_loss)
    tot_train_loss.append(epoch_train_loss)


[91m☒[0m 29+798 = 1005(827) 1/0 0
[91m☒[0m 5+0 = 105(5) 1/0 0
[91m☒[0m 92+74 = 1005(166) 1/0 0
[91m☒[0m 2+474 = 143(476) 1/0 0
[91m☒[0m 486+848 = 1005(1334) 1/0 0
[91m☒[0m 7+8 = 1005(15) 1/0 0
[91m☒[0m 29+0 = 1005(29) 1/0 0
[91m☒[0m 383+5 = 1305(388) 1/0 0
[91m☒[0m 621+53 = 135(674) 1/0 0
[91m☒[0m 44+58 = 550(102) 1/0 0
Epoch 0. Train Loss: 1.1944661, Test Loss : 1.1354314
Epoch 1. Train Loss: 1.1253184, Test Loss : 1.113188
Epoch 2. Train Loss: 1.099548, Test Loss : 1.0727321
Epoch 3. Train Loss: 1.034573, Test Loss : 0.9679411
Epoch 4. Train Loss: 0.91749996, Test Loss : 0.8765491
Epoch 5. Train Loss: 0.8379429, Test Loss : 0.8202093
Epoch 6. Train Loss: 0.77298045, Test Loss : 0.76619637
Epoch 7. Train Loss: 0.68192005, Test Loss : 0.65772104
Epoch 8. Train Loss: 0.5897682, Test Loss : 0.5635521
Epoch 9. Train Loss: 0.5030496, Test Loss : 0.47183222
[91m☒[0m 6+42 = 496(48) 1/0 0
[92m☑[0m 471+52 = 523(523) 1/0 1
[91m☒[0m 1+321 = 332(322) 1/0 0
[91m☒[0m 779

Epoch 88. Train Loss: 7.1180707e-06, Test Loss : 0.0024517984
Epoch 89. Train Loss: 6.8402123e-06, Test Loss : 0.0025251582
[92m☑[0m 97+507 = 604(604) 1/0 1
[92m☑[0m 432+856 = 1288(1288) 1/0 1
[92m☑[0m 228+63 = 291(291) 1/0 1
[92m☑[0m 9+2 = 11(11) 1/0 1
[92m☑[0m 704+0 = 704(704) 1/0 1
[92m☑[0m 4+718 = 722(722) 1/0 1
[92m☑[0m 44+9 = 53(53) 1/0 1
[92m☑[0m 86+931 = 1017(1017) 1/0 1
[92m☑[0m 1+5 = 6(6) 1/0 1
[92m☑[0m 27+65 = 92(92) 1/0 1
Epoch 90. Train Loss: 6.8194176e-06, Test Loss : 0.0023782158
Epoch 91. Train Loss: 6.7371e-06, Test Loss : 0.0024871244
Epoch 92. Train Loss: 6.340904e-06, Test Loss : 0.0023339286
Epoch 93. Train Loss: 6.2266013e-06, Test Loss : 0.0026296931
Epoch 94. Train Loss: 6.1020314e-06, Test Loss : 0.0025065304
Epoch 95. Train Loss: 5.94201e-06, Test Loss : 0.0026804241
Epoch 96. Train Loss: 5.890624e-06, Test Loss : 0.0023863213
Epoch 97. Train Loss: 5.68883e-06, Test Loss : 0.0023624052
Epoch 98. Train Loss: 5.4881425e-06, Test Loss : 0.0024

Epoch 173. Train Loss: 2.1838339e-06, Test Loss : 0.0023485278
Epoch 174. Train Loss: 2.1711367e-06, Test Loss : 0.0023103869
Epoch 175. Train Loss: 2.1425649e-06, Test Loss : 0.0022776288
Epoch 176. Train Loss: 2.1329097e-06, Test Loss : 0.0023074658
Epoch 177. Train Loss: 2.1482826e-06, Test Loss : 0.0024132668
Epoch 178. Train Loss: 2.1081619e-06, Test Loss : 0.0023862284
Epoch 179. Train Loss: 2.092818e-06, Test Loss : 0.002298784
[92m☑[0m 9+821 = 830(830) 1/0 1
[92m☑[0m 981+7 = 988(988) 1/0 1
[91m☒[0m 3+5 = 87(8) 1/0 0
[92m☑[0m 519+6 = 525(525) 1/0 1
[92m☑[0m 86+50 = 136(136) 1/0 1
[92m☑[0m 13+61 = 74(74) 1/0 1
[92m☑[0m 8+544 = 552(552) 1/0 1
[92m☑[0m 195+83 = 278(278) 1/0 1
[92m☑[0m 5+41 = 46(46) 1/0 1
[91m☒[0m 89+4 = 133(93) 1/0 0
Epoch 180. Train Loss: 2.0457828e-06, Test Loss : 0.0023088977
Epoch 181. Train Loss: 2.0648488e-06, Test Loss : 0.002313013
Epoch 182. Train Loss: 2.057208e-06, Test Loss : 0.0023098933
Epoch 183. Train Loss: 2.008566e-06, Test Los