In [212]:
import os
import random

import numpy as np
from keras import layers
from keras.models import Sequential
from keras.utils import to_categorical

class colors:
    """ANSI escape sequences used to colour the validation report."""

    # Bright green foreground: prefixes the mark for a correct prediction.
    ok = '\033[92m'
    # Bright red foreground: prefixes the mark for a wrong prediction.
    fail = '\033[91m'
    # Reset sequence: restores the terminal's default attributes.
    close = '\033[0m'

In [190]:
class CharacterTable(object):
    """One-hot codec for arithmetic questions and their answers.

    Two alphabets are kept side by side:

    * ``chars`` / ``charToIndex`` / ``indexToChar`` -- symbols that may
      appear in a question (by default the ten digits plus ``+`` and ``-``).
    * ``ansChars`` / ``ansCharsToIndex`` / ``indexToAnsChars`` -- symbols
      that may appear in an answer (the ten digits only).

    Every ``encode``/``decode`` call selects the alphabet through its
    ``type`` argument, which must be ``"expr"`` or ``"ans"``.
    """

    def __init__(self, chars="0123456789+-"):
        """Build the lookup tables.

        Args:
            chars: iterable of characters allowed in a question. Duplicates
                are dropped and the remainder is sorted, so a character's
                index is its rank in sorted order.
        """
        self.chars = sorted(set(chars))
        self.ansChars = sorted(set('0123456789'))

        # correspondence tables between a char and the index of its encoding
        self.charToIndex = {c: i for i, c in enumerate(self.chars)}
        self.indexToChar = {i: c for i, c in enumerate(self.chars)}
        self.ansCharsToIndex = {c: i for i, c in enumerate(self.ansChars)}
        self.indexToAnsChars = {i: c for i, c in enumerate(self.ansChars)}

    def _tables(self, type):
        """Return ``(alphabet, charToIndex, indexToChar)`` for ``type``.

        Raises:
            ValueError: if ``type`` is neither ``"expr"`` nor ``"ans"``.
        """
        if type == "expr":
            return self.chars, self.charToIndex, self.indexToChar
        if type == "ans":
            return self.ansChars, self.ansCharsToIndex, self.indexToAnsChars
        raise ValueError(
            "type must be 'expr' or 'ans', got {!r}".format(type))

    def encode(self, expr, rowsNum, type):
        """One-hot encode ``expr`` into a ``(rowsNum, alphabet size)`` array.

        Args:
            expr: string to encode, one character per row.
            rowsNum: number of rows of the result. Rows past ``len(expr)``
                stay all-zero.
            type: ``"expr"`` for the question alphabet, ``"ans"`` for the
                answer alphabet.

        Returns:
            A float array with a single 1 in each row that has a character.

        Raises:
            ValueError: if ``type`` is not a known alphabet.
            KeyError: if ``expr`` contains a character outside the alphabet.
            IndexError: if ``expr`` is longer than ``rowsNum``.
        """
        alphabet, toIndex, _ = self._tables(type)
        x = np.zeros(shape=(rowsNum, len(alphabet)))
        for i, c in enumerate(expr):
            x[i, toIndex[c]] = 1
        return x

    def decode(self, x, type, argmax=True):
        """Turn an encoding back into its string.

        Args:
            x: when ``argmax`` is true, a 2-D array whose rows are scores
                over the alphabet; otherwise an iterable of alphabet indices.
            type: ``"expr"`` or ``"ans"``, selecting the alphabet.
            argmax: whether to reduce each row of ``x`` to the index of its
                largest value before the lookup.

        Returns:
            The decoded string, one character per row/index.

        Raises:
            ValueError: if ``type`` is not a known alphabet.
        """
        _, _, toChar = self._tables(type)
        if argmax:
            # index of the largest value (the 1 of a one-hot row) in each row
            x = x.argmax(axis=1)
        return "".join(toChar[i] for i in x)

# Data Generation

In [256]:
def genData(quesSize=150000, digits=3, seed=None):
    """Generate unique random addition/subtraction questions and answers.

    Each question is ``AAA+BBB`` or ``AAA-BBB`` with both operands
    zero-padded to ``digits`` characters. Each answer is ``digits + 2``
    characters long: a leading sign digit (``0`` for non-negative, ``1``
    for negative) followed by the magnitude zero-padded to ``digits + 1``
    characters, which is wide enough for the largest possible sum.

    An operand pair is used at most once regardless of order or operator,
    so ``a+b``, ``b+a``, ``a-b`` and ``b-a`` never appear together.

    Args:
        quesSize: number of questions to generate.
        digits: maximum number of digits of each operand (at least 1).
        seed: optional integer. When given, both NumPy's global generator
            and the ``random`` module are seeded with it so the dataset is
            reproducible. ``None`` leaves the generators untouched.

    Returns:
        ``(questions, expected)``: two lists of strings of equal length.

    Raises:
        ValueError: if ``digits`` is smaller than 1, or if ``quesSize`` is
            larger than the number of distinct unordered operand pairs
            (the generation loop could never finish).
    """
    if digits < 1:
        raise ValueError("digits must be at least 1, got {}".format(digits))

    # Unordered pairs (with repetition) of the 10**digits possible operands.
    operandCount = 10 ** digits
    maxUnique = operandCount * (operandCount + 1) // 2
    if quesSize > maxUnique:
        raise ValueError(
            "quesSize={} exceeds the {} unique operand pairs available "
            "with digits={}".format(quesSize, maxUnique, digits))

    if seed is not None:
        # Both generators are used below, so both have to be seeded.
        np.random.seed(seed)
        random.seed(seed)

    questions = []
    expected = []
    seen = set()

    print('Generating data...')

    digitChars = list('0123456789')
    ansWidth = digits + 1  # a sum of two `digits`-digit numbers needs one more digit

    def randomOperand():
        # Pick a length first, then that many random digits; leading zeros
        # are allowed and simply produce a smaller number.
        length = np.random.randint(1, digits + 1)
        return int(''.join(np.random.choice(digitChars) for _ in range(length)))

    while len(questions) < quesSize:
        a, b = randomOperand(), randomOperand()
        key = tuple(sorted((a, b)))
        if key in seen:
            continue
        seen.add(key)

        # Fair coin flip between subtraction and addition.
        if random.randint(0, 99) % 2 == 0:
            op, value = '-', a - b
        else:
            op, value = '+', a + b

        sign = '1' if value < 0 else '0'
        questions.append(str(a).zfill(digits) + op + str(b).zfill(digits))
        expected.append(sign + str(abs(value)).zfill(ansWidth))

    print('Generating data complete.')
    print('Total questions:', len(questions))
    return questions, expected

# Data Representation
* One-hot encoding
    * May use `to_categorical` method
    * Here, however, I encode the data myself using a modified version of the character table from the example
* I modified the example to support subtraction, but the result was not very good
    * The extra encoding dimension used for the whitespace (padding) character may influence the result

In [193]:
def Vectorize(questions, expected, ctable, digits=3):
    """One-hot encode every question and every answer.

    Args:
        questions: sequence of question strings.
        expected: sequence of answer strings.
        ctable: codec exposing ``chars``, ``ansChars`` and
            ``encode(text, rows, kind)``.
        digits: operand width; questions get ``2 * digits + 1`` rows and
            answers get ``digits + 2`` rows.

    Returns:
        ``(x, y)``: ``x`` has shape
        ``(len(questions), 2 * digits + 1, len(ctable.chars))`` and ``y`` has
        shape ``(len(expected), digits + 2, len(ctable.ansChars))``.
    """
    print('Vectorization begin...')

    exprRows = 2 * digits + 1
    ansRows = digits + 2
    x = np.zeros((len(questions), exprRows, len(ctable.chars)))
    y = np.zeros((len(expected), ansRows, len(ctable.ansChars)))

    # Fill each pre-allocated tensor, one encoded sentence per slot.
    jobs = (
        (x, questions, exprRows, "expr"),
        (y, expected, ansRows, "ans"),
    )
    for target, sentences, rows, kind in jobs:
        for slot, text in enumerate(sentences):
            target[slot] = ctable.encode(text, rows, kind)

    print('Vectorization complete')
    return x, y

In [258]:
# Generate the dataset with genData's default arguments, then preview the
# first five questions.
questions, expected = genData()
questions[:5]

Generating data...
Generating data complete.
Total questions: 150000


['066-023', '812+071', '046-006', '264-306', '034-158']

* Encoding of questions and expected answer are **different**
* **Use first bit as a sign bit** in expected answer

In [259]:
# Preview the answers that belong to the first five questions.
expected[:5]

['00043', '00883', '00040', '10042', '10124']

In [191]:
# Build the codec with its default alphabet and inspect the
# question-character -> index mapping.
ctable = CharacterTable()
ctable.charToIndex

{'+': 0,
 '-': 1,
 '0': 2,
 '1': 3,
 '2': 4,
 '3': 5,
 '4': 6,
 '5': 7,
 '6': 8,
 '7': 9,
 '8': 10,
 '9': 11}

In [260]:
# One-hot encode the whole dataset, then show the encoding of the first two
# questions.
x, y = Vectorize(questions, expected, ctable)
x[:2]

Vectorization begin...
Vectorization complete


array([[[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]],

       [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.]]])

* As a further example (not one of the samples printed above), `045-365` is encoded by looking up each character in the `charToIndex` table and setting the corresponding index to 1:
    - 0   \[0 0 1 0 0 0 0 0 0 0 0 0\]
    - 4   \[0 0 0 0 0 0 1 0 0 0 0 0\]
    - 5   \[0 0 0 0 0 0 0 1 0 0 0 0\]
    - \-    \[0 1 0 0 0 0 0 0 0 0 0 0\]
    - 3   \[0 0 0 0 0 1 0 0 0 0 0 0\]
    - 6   \[0 0 0 0 0 0 0 0 1 0 0 0\]
    - 5   \[0 0 0 0 0 0 0 1 0 0 0 0\]
* Same for 299 + 396

In [261]:
# Show the one-hot encoding of the first two answers.
y[:2]

array([[[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]],

       [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
        [0., 0., 0., 1., 0., 0., 0., 0., 0., 0.]]])

* 藉由一個 digit 來表示正負號
    * 045 - 365 = 10320 第一位數 1 表示 sign bit `-`, encoding 中第一位數的 index1 會被設為 1
    * 299 + 396 = 00695 第一位數 0 表示 sign bit `+`, encoding 中第一位數的 index0 會被設為 1

In [257]:
def splitData(x, y, splitAt=120000):
    """Cut the dataset into a leading training part and a trailing test part.

    No separate validation set is carved out here.

    Args:
        x: sliceable collection of inputs.
        y: sliceable collection of targets, aligned with ``x``.
        splitAt: index of the first element that goes to the test part.

    Returns:
        ``((train_x, train_y), (test_x, test_y))``.
    """
    head = slice(None, splitAt)
    tail = slice(splitAt, None)

    training = (x[head], y[head])
    testing = (x[tail], y[tail])
    return (training, testing)

In [262]:
# Split with splitData's default cut-off, then report the shape of each part
# (inputs first, targets second).
(train_x, train_y), (test_x, test_y) = splitData(x, y)

for title, inputs, targets in (('Training Data:', train_x, train_y),
                               ('Testing Data:', test_x, test_y)):
    print(title)
    print(inputs.shape)
    print(targets.shape)

Training Data:
(120000, 7, 12)
(120000, 5, 10)
Testing Data:
(30000, 7, 12)
(30000, 5, 10)


In [263]:
# Sanity check: decode the first encoded question and answer back to strings.
ctable.decode(x[0], type="expr"), ctable.decode(y[0], type="ans")

('066-023', '00043')

# Build model
* I train two models: one predicts the sign of the answer and the other predicts its numeric part

In [269]:
def _denseClassifier(inputDim, hiddenSizes, outputDim):
    """Build and compile a fully connected network.

    The network has one ReLU ``Dense`` layer per entry of ``hiddenSizes``
    followed by a sigmoid ``Dense`` layer of ``outputDim`` units, and is
    compiled with Adam, categorical cross-entropy and the accuracy metric.
    """
    model = Sequential()
    for depth, units in enumerate(hiddenSizes):
        if depth == 0:
            # Only the first layer declares the input shape.
            model.add(layers.Dense(units, input_shape=(inputDim,), activation='relu'))
        else:
            model.add(layers.Dense(units, activation='relu'))
    model.add(layers.Dense(outputDim, activation='sigmoid'))
    # NOTE(review): categorical_crossentropy is normally paired with a softmax
    # over one set of mutually exclusive classes. Here it is applied to sigmoid
    # outputs (and, for the digit model, to several concatenated one-hot
    # groups). Kept unchanged because the recorded results were produced with
    # this setup; consider binary_crossentropy or a per-digit softmax.
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model


def buildModel(train_x, train_y):
    """Train one network for the answer digits and one for the answer sign.

    Args:
        train_x: array of shape ``(samples, rows, alphabet)`` holding the
            one-hot encoded questions; it is flattened per sample.
        train_y: array of shape ``(samples, rows, alphabet)`` holding the
            one-hot encoded answers. Row 0 is the sign digit, of which only
            the first two columns (``0`` and ``1``) are used; the remaining
            rows are the magnitude digits.

    Returns:
        ``(modelOfNums, modelOfSign)``: the fitted Keras models. The first
        outputs the flattened magnitude rows, the second two sign scores.
    """
    print('Build model...')
    BATCH_SIZE = 128

    # flatten each sample, e.g. 7 * 12 -> 1 * 84
    train_x = train_x.reshape(len(train_x), -1)
    print(train_x.shape)

    train_y_sign = train_y[:, 0, :2].reshape(len(train_y), -1)
    # Every row after the sign row. (The slice used to be bounded by the
    # sample count, which only worked because slices clamp.)
    train_y_num = train_y[:, 1:, :].reshape(len(train_y), -1)
    print(train_y_num.shape)
    print(train_y_sign.shape)

    # Layer widths are derived from the data instead of being hard-coded.
    inputDim = train_x.shape[1]

    modelOfNums = _denseClassifier(inputDim, (250, 250, 150, 100), train_y_num.shape[1])
    modelOfNums.fit(train_x, train_y_num,
                    batch_size=BATCH_SIZE,
                    validation_split=0.2,
                    shuffle=True, verbose=1, epochs=100)

    modelOfSign = _denseClassifier(inputDim, (250, 250, 150, 50), train_y_sign.shape[1])
    modelOfSign.fit(train_x, train_y_sign,
                    batch_size=BATCH_SIZE,
                    validation_split=0.2,
                    shuffle=True, verbose=1, epochs=80)

    return modelOfNums, modelOfSign

* Use `transform()` to convert the encoded answer string into an int value for validation

In [265]:
def transform(ans):
    """Convert an encoded answer string into a signed integer.

    A leading ``'1'`` is the sign digit for a negative number and is replaced
    by ``-`` before parsing; any other string is parsed as it stands.
    """
    isNegative = ans[0] == '1'
    return int("-" + ans[1:]) if isNegative else int(ans)

# Validation

In [268]:
print('Validation with testing data...')
right = 0

# Train both networks, then run each of them once over the flattened test set.
modelOfNums, modelOfSign = buildModel(train_x, train_y)
flatTestX = test_x.reshape(len(test_x), -1)

# Digit scores come back flat; restore one row of ten scores per digit.
predNums = modelOfNums.predict(flatTestX, verbose=0).reshape((len(test_y), 4, 10))
print(predNums.shape)

# Index of the stronger sign output: 0 means "+", 1 means "-".
predSign = np.argmax(modelOfSign.predict(flatTestX, verbose=0), axis=1)
print(predSign.shape)

total = len(predSign)
for i in range(total):
    q = ctable.decode(test_x[i], type="expr")
    correct = transform(ctable.decode(test_y[i], type="ans"))

    # Combine the two predictions into one signed integer.
    guessNum = ctable.decode(predNums[i], type="ans")
    guessSign = "-" if predSign[i] == 1 else "+"
    guess = int(guessSign + guessNum)

    isRight = correct == guess
    if isRight:
        right += 1

    # Only the last ten samples are printed.
    if i >= total - 10:
        mark = (colors.ok + '☑' if isRight else colors.fail + '☒') + colors.close
        print("{} {}\t{} {}".format(q, correct, mark, guess))

print("Accuracy : {0:3.2f} % ({1} / {2})".format(
    (right / len(predNums)) * 100, right, len(predNums)))

Validation with testing data...
Build model...
(120000, 84)
(120000, 40)
(120000, 2)
Train on 96000 samples, validate on 24000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100


Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100
Train on 96000 samples, validate on 24000 samples
Epoch 1/80
Epoch 2/80
Epoch 3/80
Epoch 4/80
Epoch 5/80
Epoch 6/80
Epoch 7/80
Epoch 8/80
Epoch 9/80
Epoch 10/80
Epoch 11/80
Epoch 12/80
Epoch 13/80
Epoch 14/80
Epoch 15/80
Epoch 16/80
Epoch 17/80


Epoch 18/80
Epoch 19/80
Epoch 20/80
Epoch 21/80
Epoch 22/80
Epoch 23/80
Epoch 24/80
Epoch 25/80
Epoch 26/80
Epoch 27/80
Epoch 28/80
Epoch 29/80
Epoch 30/80
Epoch 31/80
Epoch 32/80
Epoch 33/80
Epoch 34/80
Epoch 35/80
Epoch 36/80
Epoch 37/80
Epoch 38/80
Epoch 39/80
Epoch 40/80
Epoch 41/80
Epoch 42/80
Epoch 43/80
Epoch 44/80
Epoch 45/80
Epoch 46/80
Epoch 47/80
Epoch 48/80
Epoch 49/80
Epoch 50/80
Epoch 51/80
Epoch 52/80
Epoch 53/80
Epoch 54/80
Epoch 55/80
Epoch 56/80
Epoch 57/80
Epoch 58/80
Epoch 59/80
Epoch 60/80
Epoch 61/80
Epoch 62/80
Epoch 63/80
Epoch 64/80
Epoch 65/80
Epoch 66/80
Epoch 67/80
Epoch 68/80
Epoch 69/80
Epoch 70/80
Epoch 71/80
Epoch 72/80
Epoch 73/80
Epoch 74/80


Epoch 75/80
Epoch 76/80
Epoch 77/80
Epoch 78/80
Epoch 79/80
Epoch 80/80
(30000, 4, 10)
(30000,)
424-136 288	[92m☑[0m 288
470-228 242	[92m☑[0m 242
481-240 241	[92m☑[0m 241
712-894 -182	[92m☑[0m -182
302+841 1143	[92m☑[0m 1143
555+902 1457	[92m☑[0m 1457
160+136 296	[92m☑[0m 296
390+678 1068	[92m☑[0m 1068
872+418 1290	[92m☑[0m 1290
620-883 -263	[92m☑[0m -263
Accuracy : 99.32 % (29795 / 30000)


In [272]:
modelDir = "./model/"

# Create the target directory so saving does not depend on it having been
# made by hand beforehand.
os.makedirs(modelDir, exist_ok=True)

# Save through the public `save()` of the Sequential models. The `.model`
# attribute used previously was an internal detail of old Keras releases and
# is deprecated/absent in later ones.
# NOTE(review): recent Keras versions may require a `.keras` or `.h5` file
# extension on these paths -- confirm against the installed version.
modelOfNums.save(modelDir + "nums_model")
modelOfSign.save(modelDir + "sign_model")