In [1]:
import numpy as np

In [2]:
def softmax(x):
    if x.ndim == 2:
        x = x - x.max(axis=1, keepdims=True)
        x = np.exp(x)
        x /= x.sum(axis=1, keepdims=True)
    elif x.ndim == 1:
        x = x - np.max(x)
        x = np.exp(x) / np.sum(np.exp(x))

    return x

In [3]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [4]:
class Embedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx):
        W, = self.params
        self.idx = idx
        out = W[idx]
        return out

    def backward(self, dout):
        dW, = self.grads
        dW[...] = 0
        np.add.at(dW, self.idx, dout)
        return None

In [5]:
class TimeEmbedding:
    def __init__(self, W):
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.layers = None
        self.W = W

    def forward(self, xs):
        N, T = xs.shape
        V, D = self.W.shape

        out = np.empty((N, T, D), dtype='f')
        self.layers = []

        for t in range(T):
            layer = Embedding(self.W)
            out[:, t, :] = layer.forward(xs[:, t])
            self.layers.append(layer)

        return out

    def backward(self, dout):
        N, T, D = dout.shape

        grad = 0
        for t in range(T):
            layer = self.layers[t]
            layer.backward(dout[:, t, :])
            grad += layer.grads[0]

        self.grads[0][...] = grad
        return None

In [6]:
class LSTM:
    def __init__(self, wx, wh, b):
        self.params = [wx, wh, b]
        self.grads = [np.zeros_like(wx), np.zeros_like(wh), np.zeros_like(b)]
        self.cache = None
        
    def forward(self, x, h_prev, c_prev):
        wx, wh, b = self.params
        N, H = h_prev.shape
        
        A = np.dot(x, wx) + np.dot(h_prev, wh) + b
        
        # slice
        f = A[:, :H]
        g = A[:, H:2*H]
        i = A[:, 2*H:3*H]
        o = A[:, 3*H:]
        
        f = sigmoid(f)
        g = np.tanh(g)
        i = sigmoid(i)
        o = sigmoid(o)
        
        c_next = f * c_prev + g * i
        h_next = o * np.tanh(c_next)
        
        self.cache = (x, h_prev, c_prev, i, f, g, o, c_next)
        return h_next, c_next
    
    def backward(self, dh_next, dc_next):
        Wx, Wh, b = self.params
        x, h_prev, c_prev, i, f, g, o, c_next = self.cache

        tanh_c_next = np.tanh(c_next)

        ds = dc_next + (dh_next * o) * (1 - tanh_c_next ** 2)

        dc_prev = ds * f

        di = ds * g
        df = ds * c_prev
        do = dh_next * tanh_c_next
        dg = ds * i

        di *= i * (1 - i)
        df *= f * (1 - f)
        do *= o * (1 - o)
        dg *= (1 - g ** 2)

        dA = np.hstack((df, dg, di, do))

        dWh = np.dot(h_prev.T, dA)
        dWx = np.dot(x.T, dA)
        db = dA.sum(axis=0)

        self.grads[0][...] = dWx
        self.grads[1][...] = dWh
        self.grads[2][...] = db

        dx = np.dot(dA, Wx.T)
        dh_prev = np.dot(dA, Wh.T)

        return dx, dh_prev, dc_prev

In [7]:
class TimeLSTM:
    def __init__(self, wx, wh, b, stateful = False):
        self.params = [wx, wh, b]
        self.grads = [np.zeros_like(wx), np.zeros_like(wh), np.zeros_like(b)]
        self.layers = None
        
        self.h, self.c = None, None
        self.dh = None
        self.stateful = stateful
        
    def forward(self, xs):
        wx, wh, b = self. params
        N, T, D = xs.shape
        H = wh.shape[0]
        
        self.layers = []
        hs = np.empty((N, T, H), dtype='f')
        
        if not self.stateful or self.h is None:
            self.h = np.zeros((N, H), dtype='f')
        if not self.stateful or self.c is None:
            self.c = np.zeros((N, H), dtype='f')
        
        for t in range(T):
            layer = LSTM(*self.params)
            self.h, self.c = layer.forward(xs[:, t, :], self.h, self.c)
            hs[:, t, :] = self.h
            
            self.layers.append(layer)
            
        return hs
    
    def backward(self, dhs):
        wx, wh, b = self.params
        N, T, H = dhs.shape
        D = wx.shape[0]
        
        dxs = np.empty((N, T, D), dtype='f')
        dh, dc = 0, 0
        
        grads = [0, 0, 0]
        for t in reversed(range(T)):
            layer = self.layers[t]
            dx, dh, dc = layer.backward(dhs[:, t, :] + dh, dc)
            dxs[:, t, :] = dx
            for i, grad in enumerate(layer.grads):
                grads[i] += grad
                
        for i, grad in enumerate(grads):
            self.grads[i][...] = grad
        self.dh = dh
        return dxs
    
    def set_state(self, h, c = None):
        self.h, self.c = h, c
        
    def reset_state(self):
        self.h, self.c = None, None
        

In [8]:
class TimeAffine:
    def __init__(self, W, b):
        self.params = [W, b]
        self.grads = [np.zeros_like(W), np.zeros_like(b)]
        self.x = None

    def forward(self, x):
        N, T, D = x.shape
        W, b = self.params

        rx = x.reshape(N*T, -1)
        out = np.dot(rx, W) + b
        self.x = x
        return out.reshape(N, T, -1)

    def backward(self, dout):
        x = self.x
        N, T, D = x.shape
        W, b = self.params

        dout = dout.reshape(N*T, -1)
        rx = x.reshape(N*T, -1)

        db = np.sum(dout, axis=0)
        dW = np.dot(rx.T, dout)
        dx = np.dot(dout, W.T)
        dx = dx.reshape(*x.shape)

        self.grads[0][...] = dW
        self.grads[1][...] = db

        return dx

In [9]:
class TimeSoftmaxWithLoss:
    def __init__(self):
        self.params, self.grads = [], []
        self.cache = None
        self.ignore_label = -1

    def forward(self, xs, ts):
        N, T, V = xs.shape

        if ts.ndim == 3:  # 教師ラベルがone-hotベクトルの場合
            ts = ts.argmax(axis=2)

        mask = (ts != self.ignore_label)

        # バッチ分と時系列分をまとめる（reshape）
        xs = xs.reshape(N * T, V)
        ts = ts.reshape(N * T)
        mask = mask.reshape(N * T)

        ys = softmax(xs)
        ls = np.log(ys[np.arange(N * T), ts])
        ls *= mask  # ignore_labelに該当するデータは損失を0にする
        loss = -np.sum(ls)
        loss /= mask.sum()

        self.cache = (ts, ys, mask, (N, T, V))
        return loss

    def backward(self, dout=1):
        ts, ys, mask, (N, T, V) = self.cache

        dx = ys
        dx[np.arange(N * T), ts] -= 1
        dx *= dout
        dx /= mask.sum()
        dx *= mask[:, np.newaxis]  # ignore_labelに該当するデータは勾配を0にする

        dx = dx.reshape((N, T, V))

        return dx

In [10]:
class Rnnlm:
    def __init__(self, vocab_size = 10000, wordvec_size = 100, hidden_size = 100):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.rand
        
        # 重みの初期化
        embed_w = (rn(V, D) / 100).astype('f')
        lstm_wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4* H).astype('f')
        affine_w = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')
        
        # レイヤの生成
        self.layers = [
            TimeEmbedding(embed_w),
            TimeLSTM(lstm_wx, lstm_wh, lstm_b, stateful = True),
            TimeAffine(affine_w, affine_b)
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layer = self.layers[1]
        
        # すべての重みと勾配を配列にまとめる
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads
            
    def predict(self, xs):
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs
    
    def forward(self, xs, ts):
        score = self.predict(xs)
        loss = self.loss_layer.forward(score, ts)
        return loss
    
    def backward(self, dout = 1):
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout
    
    def reset_state(self):
        self.lstm_layer.reset_state()
        
    def save_params(self, file_name="Rnnlm.pkl"):
        with open(file_name, 'wb') as f:
            pickle.dump(self.params, f)
            
    def load_params(self, file_name="Rnnlm.pkl"):
        with open(file_name, 'rb') as f:
            self.params = pickle.load(f)
            


In [11]:
class RnnlmGen(Rnnlm):
    def generate(self, start_id, skip_ids = None, sample_size = 50):
        word_ids = [start_id]
        
        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            score = self.predict(x)
            p = softmax(score.flatten())
            
            sampled = np.random.choice(len(p), size = 1, p = p)
            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(int(x))
                
        return word_ids
                

In [12]:
import sys, os
sys.path.append(os.pardir)
try:
    import urllib.request
except ImportError:
    raise ImportError('Use Python3!')
import pickle
import numpy as np


url_base = 'https://raw.githubusercontent.com/tomsercu/lstm/master/data/'
key_file = {
    'train':'ptb.train.txt',
    'test':'ptb.test.txt',
    'valid':'ptb.valid.txt'
}
save_file = {
    'train':'ptb.train.npy',
    'test':'ptb.test.npy',
    'valid':'ptb.valid.npy'
}
vocab_file = 'ptb.vocab.pkl'

dataset_dir = '.' # os.path.dirname(os.path.abspath(__file__))


def _download(file_name):
    file_path = dataset_dir + '/' + file_name
    if os.path.exists(file_path):
        return

    print('Downloading ' + file_name + ' ... ')

    try:
        urllib.request.urlretrieve(url_base + file_name, file_path)
    except urllib.error.URLError:
        import ssl
        ssl._create_default_https_context = ssl._create_unverified_context
        urllib.request.urlretrieve(url_base + file_name, file_path)

    print('Done')


def load_vocab():
    vocab_path = dataset_dir + '/' + vocab_file

    if os.path.exists(vocab_path):
        with open(vocab_path, 'rb') as f:
            word_to_id, id_to_word = pickle.load(f)
        return word_to_id, id_to_word

    word_to_id = {}
    id_to_word = {}
    data_type = 'train'
    file_name = key_file[data_type]
    file_path = dataset_dir + '/' + file_name

    _download(file_name)

    words = open(file_path).read().replace('\n', '<eos>').strip().split()

    for i, word in enumerate(words):
        if word not in word_to_id:
            tmp_id = len(word_to_id)
            word_to_id[word] = tmp_id
            id_to_word[tmp_id] = word

    with open(vocab_path, 'wb') as f:
        pickle.dump((word_to_id, id_to_word), f)

    return word_to_id, id_to_word


def load_data(data_type='train'):
    '''
        :param data_type: データの種類：'train' or 'test' or 'valid (val)'
        :return:
    '''
    if data_type == 'val': data_type = 'valid'
    save_path = dataset_dir + '/' + save_file[data_type]

    word_to_id, id_to_word = load_vocab()

    if os.path.exists(save_path):
        corpus = np.load(save_path)
        if False:
            corpus = to_gpu(corpus)
        return corpus, word_to_id, id_to_word

    file_name = key_file[data_type]
    file_path = dataset_dir + '/' + file_name
    _download(file_name)

    words = open(file_path).read().replace('\n', '<eos>').strip().split()
    corpus = np.array([word_to_id[w] for w in words])

    np.save(save_path, corpus)

    if False:
        corpus = to_gpu(corpus)

    return corpus, word_to_id, id_to_word

In [21]:
corpus, word_to_id, id_to_word = load_data('train')
vocab_size = len(word_to_id)
corpus_size = len(corpus)

model = RnnlmGen()
model.load_params('Rnnlm.pkl')

# start文字とskip文字の設定
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[w] for w in skip_words]

# 文章生成
word_ids = model.generate(start_id, skip_ids)
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.¥n')
print(txt)

you gap cray-3 perspective musicians projections nicaraguan suing crumbling recessions filters insure pigs those max spoke corresponding troubles cutbacks pair charitable kraft salt permanently disarray owner borrow insurer newhouse nippon ore. enfield defaulted cash ton bono performer fully norwegian rewarding setbacks spark cemetery outlets social bolstered four-year phelan prestige fetch


In [28]:
class BetterRnnlmGen(BetterRnnlm):
    def generate(self, start_id, skip_ids=None, sample_size=100):
        word_ids = [start_id]

        x = start_id
        while len(word_ids) < sample_size:
            x = np.array(x).reshape(1, 1)
            score = self.predict(x).flatten()
            p = softmax(score).flatten()

            sampled = np.random.choice(len(p), size=1, p=p)
            if (skip_ids is None) or (sampled not in skip_ids):
                x = sampled
                word_ids.append(int(x))

        return word_ids

    def get_state(self):
        states = []
        for layer in self.lstm_layers:
            states.append((layer.h, layer.c))
        return states

    def set_state(self, states):
        for layer, state in zip(self.lstm_layers, states):
            layer.set_state(*state)

In [27]:
class BetterRnnlm:
    def __init__(self, vocab_size = 10000, wordvec_size = 650, 
                 hidden_size = 650, dropout_ratio = 0.5):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn
        
        # 重みの初期化
        embed_w = (rn(V, D) / 100).astype('f')
        lstm_wx1 = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_wh1 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b1 = np.zeros(4* H).astype('f')
        lstm_wx2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_wh2 = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b2 = np.zeros(4* H).astype('f')
        affine_b = np.zeros(V).astype('f')
        
        # レイヤの生成
        self.layers = [
            TimeEmbedding(embed_w),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_wx1, lstm_wh1, lstm_b1, stateful = True),
            TimeDropout(dropout_ratio),
            TimeLSTM(lstm_wx2, lstm_wh2, lstm_b2, stateful = True),
            TimeDropout(dropout_ratio),
            TimeAffine(embed_w.T, affine_b)
        ]
        self.loss_layer = TimeSoftmaxWithLoss()
        self.lstm_layers = [self.layers[2], self.layers[4]]
        self.dropout_layers = [self.layers[1], self.layers[3], self.layers[5]]
        
        # すべての重みと勾配を配列にまとめる
        self.params, self.grads = [], []
        for layer in self.layers:
            self.params += layer.params
            self.grads += layer.grads
            
    def predict(self, xs, train_flg = False):
        for layer in self.dropout_layers:
            layer.train_flg = train_flg
            
        for layer in self.layers:
            xs = layer.forward(xs)
        return xs
    
    def forward(self, xs, ts, train_flg = True):
        score = self.predict(xs, train_flg)
        loss = self.loss_layer.forward(score, ts)
        return loss
    
    def backward(self, dout = 1):
        dout = self.loss_layer.backward(dout)
        for layer in reversed(self.layers):
            dout = layer.backward(dout)
        return dout
    
    def reset_state(self):
        for layer in self.lstm_layers:
            layer.reset_state()
        
    def save_params(self, file_name="Rnnlm.pkl"):
        with open(file_name, 'wb') as f:
            pickle.dump(self.params, f)
            
    def load_params(self, file_name="Rnnlm.pkl"):
        with open(file_name, 'rb') as f:
            self.params = pickle.load(f)
            


In [13]:
class TimeDropout:
    def __init__(self, dropout_ratio = 0.5):
        self.params, self.grads = [], []
        self.dropout_ratio = dropout_ratio
        self.mask = None
        self.train_flg = True

    def forward(self, xs):
        if self.train_flg:
            flg = np.random.rand(*xs.shape) > self.dropout_ratio
            scale = 1 / (1.0 - self.dropout_ratio)
            self.mask = flg.astype(np.float32) * scale

            return xs * self.mask
        else:
            return xs

    def backward(self, dout):
        return dout * self.mask

In [14]:
class SGD:
    '''
    確率的勾配降下法（Stochastic Gradient Descent）
    '''
    def __init__(self, lr=0.01):
        self.lr = lr
        
    def update(self, params, grads):
        for i in range(len(params)):
            params[i] -= self.lr * grads[i]

In [15]:
def remove_duplicate(params, grads):
    '''
    パラメータ配列中の重複する重みをひとつに集約し、
    その重みに対応する勾配を加算する
    '''
    params, grads = params[:], grads[:]  # copy list

    while True:
        find_flg = False
        L = len(params)

        for i in range(0, L - 1):
            for j in range(i + 1, L):
                # 重みを共有する場合
                if params[i] is params[j]:
                    grads[i] += grads[j]  # 勾配の加算
                    find_flg = True
                    params.pop(j)
                    grads.pop(j)
                # 転置行列として重みを共有する場合（weight tying）
                elif params[i].ndim == 2 and params[j].ndim == 2 and \
                     params[i].T.shape == params[j].shape and np.all(params[i].T == params[j]):
                    grads[i] += grads[j].T
                    find_flg = True
                    params.pop(j)
                    grads.pop(j)

                if find_flg: break
            if find_flg: break

        if not find_flg: break

    return params, grads

In [31]:
corpus, word_to_id, id_to_word = load_data('train')
vocab_size = len(word_to_id)
corpus_size = len(corpus)


model = BetterRnnlmGen()
model.load_params('Rnnlm.pkl')

# start文字とskip文字の設定
start_word = 'you'
start_id = word_to_id[start_word]
skip_words = ['N', '<unk>', '$']
skip_ids = [word_to_id[w] for w in skip_words]
# 文章生成
word_ids = model.generate(start_id, skip_ids)
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')

print(txt)


model.reset_state()

start_words = 'the meaning of life is'
start_ids = [word_to_id[w] for w in start_words.split(' ')]

for x in start_ids[:-1]:
    x = np.array(x).reshape(1, 1)
    model.predict(x)

word_ids = model.generate(start_ids[-1], skip_ids)
word_ids = start_ids[:-1] + word_ids
txt = ' '.join([id_to_word[i] for i in word_ids])
txt = txt.replace(' <eos>', '.\n')
print('-' * 50)
print(txt)

you imports tire jail array belts prospective h. scripts narrowing handy respectable passage salon denver aeronautics performer region trading damage axa par dollar sim wrong permanently nations equipment sagged head sleep movement sandinistas tens offset registration tons depository song salt troop hammack cowboys channels bat quickly marks elliott unprofitable volumes albert plunge speaking cycling accumulated method competing mo. respectively afghanistan friendship pennzoil proper slew areas riskier finger improving vote rare economically fractionally howard wildlife surgical bare-faced franklin request manitoba prime neal short-lived homeowners manpower authors copyright june reserve continent doubts shadow specific partially retrieve peck mancuso firmer stereo indications invariably
--------------------------------------------------
the meaning of life is guess roman demanded corry mercantile midnight andrew wanting uncovered junk-bond sources unfairly home-equity editions existed

In [16]:
import numpy


id_to_char = {}
char_to_id = {}


def _update_vocab(txt):
    chars = list(txt)

    for i, char in enumerate(chars):
        if char not in char_to_id:
            tmp_id = len(char_to_id)
            char_to_id[char] = tmp_id
            id_to_char[tmp_id] = char


def sequence_load_data(file_name='addition.txt', seed=1984):
    file_path = file_name

    if not os.path.exists(file_path):
        print('No file: %s' % (file_name) )
        return None

    questions, answers = [], []

    for line in open(file_path, 'r'):
        idx = line.find('_')
        questions.append(line[:idx])
        answers.append(line[idx:-1])

    # create vocab dict
    for i in range(len(questions)):
        q, a = questions[i], answers[i]
        _update_vocab(q)
        _update_vocab(a)

    # create numpy array
    x = numpy.zeros((len(questions), len(questions[0])), dtype=numpy.int)
    t = numpy.zeros((len(questions), len(answers[0])), dtype=numpy.int)

    for i, sentence in enumerate(questions):
        x[i] = [char_to_id[c] for c in list(sentence)]
    for i, sentence in enumerate(answers):
        t[i] = [char_to_id[c] for c in list(sentence)]

    # shuffle
    indices = numpy.arange(len(x))
    if seed is not None:
        numpy.random.seed(seed)
    numpy.random.shuffle(indices)
    x = x[indices]
    t = t[indices]

    # 10% for validation set
    split_at = len(x) - len(x) // 10
    (x_train, x_val) = x[:split_at], x[split_at:]
    (t_train, t_val) = t[:split_at], t[split_at:]

    return (x_train, t_train), (x_val, t_val)


def get_vocab():
    return char_to_id, id_to_char

In [36]:
(x_train, t_train), (x_val, t_val) = sequence_load_data('addition.txt')
char_to_id, id_to_char = get_vocab()

print(x_train.shape, t_train.shape)
print(x_val.shape, t_val.shape)

(45000, 7) (45000, 5)
(5000, 7) (5000, 5)


In [38]:
print(x_train[0])
print(t_train[0])

[ 3  0  2  0  0 11  5]
[ 6  0 11  7  5]


In [39]:
print("".join([id_to_char[c] for c in x_train[0]]))
print("".join([id_to_char[c] for c in t_train[0]]))


71+118 
_189 


In [17]:
class Encoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn
        
        embed_w = (rn(V, D) / 100).astype('f')
        lstm_wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        
        self.embed = TimeEmbedding(embed_w)
        self.lstm = TimeLSTM(lstm_wx, lstm_wh, lstm_b, stateful = False)
        
        self.params = self.embed.params + self.lstm.params
        self.grads = self.embed.grads + self.lstm.grads
        self.hs = None
        
    def forward(self, xs):
        xs = self.embed.forward(xs)
        hs = self.lstm.forward(xs)
        self.hs = hs
        return hs[:, -1, :]
    
    def backward(self, dh):
        dhs = np.zeros_like(self.hs)
        dhs[:, -1, :] = dh
        
        dout = self.lstm.backward(dhs)
        dout = self.embed.backward(dout)
        return dout

In [18]:
class Decoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn
        
        embed_w = (rn(V, D) / 100).astype('f')
        lstm_wx = (rn(D, 4 * H) / np.sqrt(D)).astype('f')
        lstm_wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_w = (rn(H, V) / np.sqrt(H)).astype('f')
        affine_b = np.zeros(V).astype('f')
        
        self.embed = TimeEmbedding(embed_w)
        self.lstm = TimeLSTM(lstm_wx, lstm_wh, lstm_b, stateful = True)
        self.affine = TimeAffine(affine_w, affine_b)
        
        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads
            
    def forward(self, xs, h):
        self.lstm.set_state(h)
        
        out = self.embed.forward(xs)
        out = self.lstm.forward(out)
        score = self.affine.forward(out)
        return score
    
    def backward(self, dscore):
        dout = self.affine.backward(dscore)
        dout = self.lstm.backward(dout)
        dout = self.embed.backward(dout)
        dh = self.lstm.dh
        return dh
    
    def generate(self, h, start_id, sample_size):
        sampled = []
        sample_id = start_id
        self.lstm.set_state(h)
        
        for _ in range(sample_size):
            x = np.array(sample_id).reshape((1, 1))
            out = self.embed.forward(x)
            out = self.lstm.forward(out)
            score = self.affine.forward(out)
            
            sample_id = np.argmax(score.flatten())
            sampled.append(int(sample_id))
            
        return sampled

In [19]:
class BaseModel:
    def __init__(self):
        self.params, self.grads = None, None

    def forward(self, *args):
        raise NotImplementedError

    def backward(self, *args):
        raise NotImplementedError

    def save_params(self, file_name=None):
        if file_name is None:
            file_name = self.__class__.__name__ + '.pkl'

        params = [p.astype(np.float16) for p in self.params]

        with open(file_name, 'wb') as f:
            pickle.dump(params, f)

    def load_params(self, file_name=None):
        if file_name is None:
            file_name = self.__class__.__name__ + '.pkl'

        if not os.path.exists(file_name):
            raise IOError('No file: '' + file_name + ''')

        with open(file_name, 'rb') as f:
            params = pickle.load(f)

        params = [p.astype('f') for p in params]

        for i, param in enumerate(self.params):
            param[...] = params[i]

In [20]:
class Seq2seq(BaseModel):
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        self.encoder = Encoder(V, D, H)
        self.decoder = Decoder(V, D, H)
        self.softmax = TimeSoftmaxWithLoss()
        
        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads
        
    def forward(self, xs, ts):
        decoder_xs, decoder_ts = ts[:, :-1], ts[:, 1:]
        
        h = self.encoder.forward(xs)
        score = self.decoder.forward(decoder_xs, h)
        loss = self.softmax.forward(score, decoder_ts)
        return loss
    
    def backward(self, dout = 1):
        dout = self.softmax.backward(dout)
        dh = self.decoder.backward(dout)
        dout = self.encoder.backward(dh)
        return dout
    
    def generate(self, xs, start_id, sample_size):
        h = self.encoder.forward(xs)
        sampled = self.decoder.generate(h, start_id, sample_size)
        return sampled

In [21]:
import matplotlib.pyplot as plt

In [22]:
def eval_seq2seq(model, question, correct, id_to_char, verbos=False, is_reverse=False):
    correct = correct.flatten()
    # 頭の区切り文字
    start_id = correct[0]
    correct = correct[1:]
    guess = model.generate(question, start_id, len(correct))

    # 文字列へ変換
    question = ''.join([id_to_char[int(c)] for c in question.flatten()])
    correct = ''.join([id_to_char[int(c)] for c in correct])
    guess = ''.join([id_to_char[int(c)] for c in guess])

    if verbos:
        if is_reverse:
            question = question[::-1]

        colors = {'ok': '\033[92m', 'fail': '\033[91m', 'close': '\033[0m'}
        print('Q', question)
        print('T', correct)
        if correct == guess:
            print(colors['ok'] + '☑' + colors['close'] + ' ' + guess)
        else:
            print(colors['fail'] + '☒' + colors['close'] + ' ' + guess)
        print('---')

    return 1 if guess == correct else 0


In [23]:
class Adam:
    '''
    Adam (http://arxiv.org/abs/1412.6980v8)
    '''
    def __init__(self, lr=0.001, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.beta1 = beta1
        self.beta2 = beta2
        self.iter = 0
        self.m = None
        self.v = None
        
    def update(self, params, grads):
        if self.m is None:
            self.m, self.v = [], []
            for param in params:
                self.m.append(np.zeros_like(param))
                self.v.append(np.zeros_like(param))
        
        self.iter += 1
        lr_t = self.lr * np.sqrt(1.0 - self.beta2**self.iter) / (1.0 - self.beta1**self.iter)

        for i in range(len(params)):
            self.m[i] += (1 - self.beta1) * (grads[i] - self.m[i])
            self.v[i] += (1 - self.beta2) * (grads[i]**2 - self.v[i])
            
            params[i] -= lr_t * self.m[i] / (np.sqrt(self.v[i]) + 1e-7)

In [24]:
# データセットの読み込み
(x_train, t_train), (x_val, t_val) = sequence_load_data('addition.txt')
x_train, x_val = x_train[:, ::-1], x_val[:, ::-1]
char_to_id, id_to_char = get_vocab()

# ハイパーパラメータの設定
vocab_size = len(char_to_id)
wordvec_size = 16
hidden_size = 128
batch_size = 128
max_epoch = 25
max_grad = 5.0


In [25]:
import time
class Trainer:
    def __init__(self, model, optimizer):
        self.model = model
        self.optimizer = optimizer
        self.loss_list = []
        self.eval_interval = None
        self.current_epoch = 0

    def fit(self, x, t, max_epoch=10, batch_size=32, max_grad=None, eval_interval=20):
        data_size = len(x)
        max_iters = data_size // batch_size
        self.eval_interval = eval_interval
        model, optimizer = self.model, self.optimizer
        total_loss = 0
        loss_count = 0

        start_time = time.time()
        for epoch in range(max_epoch):
            # シャッフル
            idx = numpy.random.permutation(numpy.arange(data_size))
            x = x[idx]
            t = t[idx]

            for iters in range(max_iters):
                batch_x = x[iters*batch_size:(iters+1)*batch_size]
                batch_t = t[iters*batch_size:(iters+1)*batch_size]

                # 勾配を求め、パラメータを更新
                loss = model.forward(batch_x, batch_t)
                model.backward()
                params, grads = remove_duplicate(model.params, model.grads)  # 共有された重みを1つに集約
                if max_grad is not None:
                    clip_grads(grads, max_grad)
                optimizer.update(params, grads)
                total_loss += loss
                loss_count += 1

                # 評価
                if (eval_interval is not None) and (iters % eval_interval) == 0:
                    avg_loss = total_loss / loss_count
                    elapsed_time = time.time() - start_time
                    print('| epoch %d |  iter %d / %d | time %d[s] | loss %.2f'
                          % (self.current_epoch + 1, iters + 1, max_iters, elapsed_time, avg_loss))
                    self.loss_list.append(float(avg_loss))
                    total_loss, loss_count = 0, 0

            self.current_epoch += 1

    def plot(self, ylim=None):
        x = numpy.arange(len(self.loss_list))
        if ylim is not None:
            plt.ylim(*ylim)
        plt.plot(x, self.loss_list, label='train')
        plt.xlabel('iterations (x' + str(self.eval_interval) + ')')
        plt.ylabel('loss')
        plt.show()


In [26]:
def remove_duplicate(params, grads):
    '''
    パラメータ配列中の重複する重みをひとつに集約し、
    その重みに対応する勾配を加算する
    '''
    params, grads = params[:], grads[:]  # copy list

    while True:
        find_flg = False
        L = len(params)

        for i in range(0, L - 1):
            for j in range(i + 1, L):
                # 重みを共有する場合
                if params[i] is params[j]:
                    grads[i] += grads[j]  # 勾配の加算
                    find_flg = True
                    params.pop(j)
                    grads.pop(j)
                # 転置行列として重みを共有する場合（weight tying）
                elif params[i].ndim == 2 and params[j].ndim == 2 and \
                     params[i].T.shape == params[j].shape and np.all(params[i].T == params[j]):
                    grads[i] += grads[j].T
                    find_flg = True
                    params.pop(j)
                    grads.pop(j)

                if find_flg: break
            if find_flg: break

        if not find_flg: break

    return params, grads

In [27]:
def clip_grads(grads, max_norm):
    total_norm = 0
    for grad in grads:
        total_norm += np.sum(grad ** 2)
    total_norm = np.sqrt(total_norm)

    rate = max_norm / (total_norm + 1e-6)
    if rate < 1:
        for grad in grads:
            grad *= rate

In [60]:
# モデル/オプティマイザ/トレーナーの作成
model = Seq2seq(vocab_size, wordvec_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

acc_list = []
for epoch in range(max_epoch):
    trainer.fit(x_train, t_train, max_epoch = 1,
               batch_size = batch_size, max_grad = max_grad)
    
    correct_num = 0
    for i in range(len(x_val)):
        question, correct = x_val[[i]], t_val[[i]]
        verbose = i < 10
        correct_num += eval_seq2seq(model, question, correct, id_to_char, verbose)
    acc = float(correct_num) / len(x_val)
    acc_list.append(acc)
    print('val acc %.3f%%' % (acc * 100))

| epoch 1 |  iter 1 / 351 | time 0[s] | loss 2.56
| epoch 1 |  iter 21 / 351 | time 3[s] | loss 2.52
| epoch 1 |  iter 41 / 351 | time 7[s] | loss 2.17
| epoch 1 |  iter 61 / 351 | time 11[s] | loss 1.96
| epoch 1 |  iter 81 / 351 | time 15[s] | loss 1.91
| epoch 1 |  iter 101 / 351 | time 19[s] | loss 1.87
| epoch 1 |  iter 121 / 351 | time 24[s] | loss 1.86
| epoch 1 |  iter 141 / 351 | time 28[s] | loss 1.84
| epoch 1 |  iter 161 / 351 | time 32[s] | loss 1.80
| epoch 1 |  iter 181 / 351 | time 36[s] | loss 1.78
| epoch 1 |  iter 201 / 351 | time 40[s] | loss 1.77
| epoch 1 |  iter 221 / 351 | time 45[s] | loss 1.77
| epoch 1 |  iter 241 / 351 | time 49[s] | loss 1.76
| epoch 1 |  iter 261 / 351 | time 53[s] | loss 1.75
| epoch 1 |  iter 281 / 351 | time 57[s] | loss 1.74
| epoch 1 |  iter 301 / 351 | time 61[s] | loss 1.74
| epoch 1 |  iter 321 / 351 | time 65[s] | loss 1.74
| epoch 1 |  iter 341 / 351 | time 69[s] | loss 1.73
Q   58+77
T 162 
[91m☒[0m 100 
---
Q 461+579
T 1139


| epoch 7 |  iter 101 / 351 | time 21[s] | loss 0.68
| epoch 7 |  iter 121 / 351 | time 26[s] | loss 0.67
| epoch 7 |  iter 141 / 351 | time 30[s] | loss 0.67
| epoch 7 |  iter 161 / 351 | time 34[s] | loss 0.67
| epoch 7 |  iter 181 / 351 | time 38[s] | loss 0.66
| epoch 7 |  iter 201 / 351 | time 42[s] | loss 0.66
| epoch 7 |  iter 221 / 351 | time 46[s] | loss 0.66
| epoch 7 |  iter 241 / 351 | time 51[s] | loss 0.64
| epoch 7 |  iter 261 / 351 | time 55[s] | loss 0.65
| epoch 7 |  iter 281 / 351 | time 59[s] | loss 0.64
| epoch 7 |  iter 301 / 351 | time 63[s] | loss 0.63
| epoch 7 |  iter 321 / 351 | time 67[s] | loss 0.63
| epoch 7 |  iter 341 / 351 | time 72[s] | loss 0.62
Q   58+77
T 162 
[92m☑[0m 162 
---
Q 461+579
T 1139
[91m☒[0m 1142
---
Q  48+285
T 666 
[92m☑[0m 666 
---
Q   551+8
T 163 
[91m☒[0m 162 
---
Q  55+763
T 422 
[92m☑[0m 422 
---
Q 752+006
T 857 
[91m☒[0m 859 
---
Q 292+167
T 1053
[91m☒[0m 1144
---
Q 795+038
T 1427
[91m☒[0m 1431
---
Q  838+62
T 864

| epoch 14 |  iter 61 / 351 | time 13[s] | loss 0.40
| epoch 14 |  iter 81 / 351 | time 16[s] | loss 0.40
| epoch 14 |  iter 101 / 351 | time 20[s] | loss 0.41
| epoch 14 |  iter 121 / 351 | time 25[s] | loss 0.39
| epoch 14 |  iter 141 / 351 | time 29[s] | loss 0.39
| epoch 14 |  iter 161 / 351 | time 33[s] | loss 0.38
| epoch 14 |  iter 181 / 351 | time 38[s] | loss 0.38
| epoch 14 |  iter 201 / 351 | time 42[s] | loss 0.38
| epoch 14 |  iter 221 / 351 | time 46[s] | loss 0.38
| epoch 14 |  iter 241 / 351 | time 50[s] | loss 0.39
| epoch 14 |  iter 261 / 351 | time 54[s] | loss 0.40
| epoch 14 |  iter 281 / 351 | time 58[s] | loss 0.41
| epoch 14 |  iter 301 / 351 | time 62[s] | loss 0.39
| epoch 14 |  iter 321 / 351 | time 67[s] | loss 0.39
| epoch 14 |  iter 341 / 351 | time 71[s] | loss 0.39
Q   58+77
T 162 
[92m☑[0m 162 
---
Q 461+579
T 1139
[91m☒[0m 1137
---
Q  48+285
T 666 
[91m☒[0m 667 
---
Q   551+8
T 163 
[92m☑[0m 163 
---
Q  55+763
T 422 
[92m☑[0m 422 
---
Q 752+0

| epoch 20 |  iter 101 / 351 | time 26[s] | loss 0.31
| epoch 20 |  iter 121 / 351 | time 31[s] | loss 0.33
| epoch 20 |  iter 141 / 351 | time 36[s] | loss 0.32
| epoch 20 |  iter 161 / 351 | time 40[s] | loss 0.33
| epoch 20 |  iter 181 / 351 | time 45[s] | loss 0.31
| epoch 20 |  iter 201 / 351 | time 49[s] | loss 0.30
| epoch 20 |  iter 221 / 351 | time 53[s] | loss 0.32
| epoch 20 |  iter 241 / 351 | time 57[s] | loss 0.33
| epoch 20 |  iter 261 / 351 | time 61[s] | loss 0.35
| epoch 20 |  iter 281 / 351 | time 66[s] | loss 0.36
| epoch 20 |  iter 301 / 351 | time 70[s] | loss 0.34
| epoch 20 |  iter 321 / 351 | time 74[s] | loss 0.32
| epoch 20 |  iter 341 / 351 | time 78[s] | loss 0.32
Q   58+77
T 162 
[92m☑[0m 162 
---
Q 461+579
T 1139
[92m☑[0m 1139
---
Q  48+285
T 666 
[91m☒[0m 665 
---
Q   551+8
T 163 
[91m☒[0m 162 
---
Q  55+763
T 422 
[91m☒[0m 421 
---
Q 752+006
T 857 
[91m☒[0m 859 
---
Q 292+167
T 1053
[91m☒[0m 1051
---
Q 795+038
T 1427
[91m☒[0m 1428
---
Q 

In [29]:
class PeekyDecoder:
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        V, D, H = vocab_size, wordvec_size, hidden_size
        rn = np.random.randn
        
        embed_w = (rn(V, D) / 100).astype('f')
        lstm_wx = (rn(H + D, 4 * H)).astype('f')
        lstm_wh = (rn(H, 4 * H) / np.sqrt(H)).astype('f')
        lstm_b = np.zeros(4 * H).astype('f')
        affine_w = (rn(H + H, V) / np.sqrt(H + H)).astype('f')
        affine_b = np.zeros(V).astype('f')
        
        self.embed = TimeEmbedding(embed_w)
        self.lstm = TimeLSTM(lstm_wx, lstm_wh, lstm_b, stateful = True)
        self.affine = TimeAffine(affine_w, affine_b)
        
        self.params, self.grads = [], []
        for layer in (self.embed, self.lstm, self.affine):
            self.params += layer.params
            self.grads += layer.grads
        self.cache = None
        
    def forward(self, xs, h):
        N, T = xs.shape
        N, H = h.shape
        
        self.lstm.set_state(h)
        
        out = self.embed.forward(xs)
        hs = np.repeat(h, T, axis = 0).reshape(N, T, H)
        out = np.concatenate((hs, out), axis = 2)
        
        out = self.lstm.forward(out)
        out = np.concatenate((hs, out), axis = 2)
        
        score = self.affine.forward(out)
        self.cache = H
        return score
    
    def backward(self, dscore):
        H = self.cache

        dout = self.affine.backward(dscore)
        dout, dhs0 = dout[:, :, H:], dout[:, :, :H]
        dout = self.lstm.backward(dout)
        dembed, dhs1 = dout[:, :, H:], dout[:, :, :H]
        self.embed.backward(dembed)

        dhs = dhs0 + dhs1
        dh = self.lstm.dh + np.sum(dhs, axis=1)
        return dh

    def generate(self, h, start_id, sample_size):
        sampled = []
        char_id = start_id
        self.lstm.set_state(h)

        H = h.shape[1]
        peeky_h = h.reshape(1, 1, H)
        for _ in range(sample_size):
            x = np.array([char_id]).reshape((1, 1))
            out = self.embed.forward(x)

            out = np.concatenate((peeky_h, out), axis=2)
            out = self.lstm.forward(out)
            out = np.concatenate((peeky_h, out), axis=2)
            score = self.affine.forward(out)

            char_id = np.argmax(score.flatten())
            sampled.append(char_id)

        return sampled

In [33]:
class PeekySeq2seq(Seq2seq):
    def __init__(self, vocab_size, wordvec_size, hidden_size):
        self.encoder = Encoder(vocab_size, wordvec_size, hidden_size)
        self.decoder = PeekyDecoder(vocab_size, wordvec_size, hidden_size)
        self.softmax = TimeSoftmaxWithLoss()
        
        self.params = self.encoder.params + self.decoder.params
        self.grads = self.encoder.grads + self.decoder.grads
        

In [34]:
# モデル/オプティマイザ/トレーナーの作成
model = PeekySeq2seq(vocab_size, wordvec_size, hidden_size)
optimizer = Adam()
trainer = Trainer(model, optimizer)

acc_list = []
for epoch in range(max_epoch):
    trainer.fit(x_train, t_train, max_epoch = 1,
               batch_size = batch_size, max_grad = max_grad)
    
    correct_num = 0
    for i in range(len(x_val)):
        question, correct = x_val[[i]], t_val[[i]]
        verbose = i < 10
        correct_num += eval_seq2seq(model, question, correct, id_to_char, verbose)
    acc = float(correct_num) / len(x_val)
    acc_list.append(acc)
    print('val acc %.3f%%' % (acc * 100))

| epoch 1 |  iter 1 / 351 | time 0[s] | loss 2.56
| epoch 1 |  iter 21 / 351 | time 1[s] | loss 2.38
| epoch 1 |  iter 41 / 351 | time 3[s] | loss 2.09
| epoch 1 |  iter 61 / 351 | time 5[s] | loss 1.88
| epoch 1 |  iter 81 / 351 | time 7[s] | loss 1.81
| epoch 1 |  iter 101 / 351 | time 9[s] | loss 1.80
| epoch 1 |  iter 121 / 351 | time 11[s] | loss 1.78
| epoch 1 |  iter 141 / 351 | time 13[s] | loss 1.77
| epoch 1 |  iter 161 / 351 | time 15[s] | loss 1.76
| epoch 1 |  iter 181 / 351 | time 17[s] | loss 1.76
| epoch 1 |  iter 201 / 351 | time 19[s] | loss 1.75
| epoch 1 |  iter 221 / 351 | time 21[s] | loss 1.75
| epoch 1 |  iter 241 / 351 | time 24[s] | loss 1.75
| epoch 1 |  iter 261 / 351 | time 26[s] | loss 1.74
| epoch 1 |  iter 281 / 351 | time 28[s] | loss 1.74
| epoch 1 |  iter 301 / 351 | time 30[s] | loss 1.72
| epoch 1 |  iter 321 / 351 | time 33[s] | loss 1.72
| epoch 1 |  iter 341 / 351 | time 35[s] | loss 1.71
Q   58+77
T 162 
[91m☒[0m 100 
---
Q 461+579
T 1139
[91

| epoch 7 |  iter 101 / 351 | time 12[s] | loss 0.72
| epoch 7 |  iter 121 / 351 | time 14[s] | loss 0.71
| epoch 7 |  iter 141 / 351 | time 16[s] | loss 0.71
| epoch 7 |  iter 161 / 351 | time 18[s] | loss 0.69
| epoch 7 |  iter 181 / 351 | time 20[s] | loss 0.69
| epoch 7 |  iter 201 / 351 | time 22[s] | loss 0.69
| epoch 7 |  iter 221 / 351 | time 24[s] | loss 0.68
| epoch 7 |  iter 241 / 351 | time 26[s] | loss 0.67
| epoch 7 |  iter 261 / 351 | time 28[s] | loss 0.66
| epoch 7 |  iter 281 / 351 | time 30[s] | loss 0.66
| epoch 7 |  iter 301 / 351 | time 32[s] | loss 0.64
| epoch 7 |  iter 321 / 351 | time 34[s] | loss 0.63
| epoch 7 |  iter 341 / 351 | time 36[s] | loss 0.63
Q   58+77
T 162 
[92m☑[0m 162 
---
Q 461+579
T 1139
[91m☒[0m 1143
---
Q  48+285
T 666 
[92m☑[0m 666 
---
Q   551+8
T 163 
[91m☒[0m 153 
---
Q  55+763
T 422 
[91m☒[0m 423 
---
Q 752+006
T 857 
[91m☒[0m 850 
---
Q 292+167
T 1053
[92m☑[0m 1053
---
Q 795+038
T 1427
[91m☒[0m 1421
---
Q  838+62
T 864

| epoch 13 |  iter 161 / 351 | time 16[s] | loss 0.13
| epoch 13 |  iter 181 / 351 | time 18[s] | loss 0.13
| epoch 13 |  iter 201 / 351 | time 20[s] | loss 0.13
| epoch 13 |  iter 221 / 351 | time 22[s] | loss 0.13
| epoch 13 |  iter 241 / 351 | time 24[s] | loss 0.12
| epoch 13 |  iter 261 / 351 | time 26[s] | loss 0.12
| epoch 13 |  iter 281 / 351 | time 28[s] | loss 0.12
| epoch 13 |  iter 301 / 351 | time 30[s] | loss 0.12
| epoch 13 |  iter 321 / 351 | time 32[s] | loss 0.12
| epoch 13 |  iter 341 / 351 | time 34[s] | loss 0.11
Q   58+77
T 162 
[92m☑[0m 162 
---
Q 461+579
T 1139
[92m☑[0m 1139
---
Q  48+285
T 666 
[92m☑[0m 666 
---
Q   551+8
T 163 
[92m☑[0m 163 
---
Q  55+763
T 422 
[92m☑[0m 422 
---
Q 752+006
T 857 
[92m☑[0m 857 
---
Q 292+167
T 1053
[92m☑[0m 1053
---
Q 795+038
T 1427
[92m☑[0m 1427
---
Q  838+62
T 864 
[92m☑[0m 864 
---
Q  39+341
T 236 
[92m☑[0m 236 
---
val acc 90.960%
| epoch 14 |  iter 1 / 351 | time 0[s] | loss 0.12
| epoch 14 |  iter 21 /

| epoch 19 |  iter 221 / 351 | time 22[s] | loss 0.04
| epoch 19 |  iter 241 / 351 | time 24[s] | loss 0.04
| epoch 19 |  iter 261 / 351 | time 26[s] | loss 0.04
| epoch 19 |  iter 281 / 351 | time 28[s] | loss 0.04
| epoch 19 |  iter 301 / 351 | time 30[s] | loss 0.06
| epoch 19 |  iter 321 / 351 | time 32[s] | loss 0.05
| epoch 19 |  iter 341 / 351 | time 34[s] | loss 0.05
Q   58+77
T 162 
[92m☑[0m 162 
---
Q 461+579
T 1139
[92m☑[0m 1139
---
Q  48+285
T 666 
[92m☑[0m 666 
---
Q   551+8
T 163 
[92m☑[0m 163 
---
Q  55+763
T 422 
[92m☑[0m 422 
---
Q 752+006
T 857 
[92m☑[0m 857 
---
Q 292+167
T 1053
[92m☑[0m 1053
---
Q 795+038
T 1427
[92m☑[0m 1427
---
Q  838+62
T 864 
[92m☑[0m 864 
---
Q  39+341
T 236 
[92m☑[0m 236 
---
val acc 93.640%
| epoch 20 |  iter 1 / 351 | time 0[s] | loss 0.05
| epoch 20 |  iter 21 / 351 | time 2[s] | loss 0.05
| epoch 20 |  iter 41 / 351 | time 4[s] | loss 0.05
| epoch 20 |  iter 61 / 351 | time 6[s] | loss 0.04
| epoch 20 |  iter 81 / 351 |

| epoch 25 |  iter 281 / 351 | time 28[s] | loss 0.02
| epoch 25 |  iter 301 / 351 | time 30[s] | loss 0.02
| epoch 25 |  iter 321 / 351 | time 32[s] | loss 0.02
| epoch 25 |  iter 341 / 351 | time 34[s] | loss 0.02
Q   58+77
T 162 
[92m☑[0m 162 
---
Q 461+579
T 1139
[92m☑[0m 1139
---
Q  48+285
T 666 
[92m☑[0m 666 
---
Q   551+8
T 163 
[92m☑[0m 163 
---
Q  55+763
T 422 
[92m☑[0m 422 
---
Q 752+006
T 857 
[92m☑[0m 857 
---
Q 292+167
T 1053
[92m☑[0m 1053
---
Q 795+038
T 1427
[92m☑[0m 1427
---
Q  838+62
T 864 
[92m☑[0m 864 
---
Q  39+341
T 236 
[92m☑[0m 236 
---
val acc 96.800%
