In [1]:
import logging
import h5py
import torch
import torch._utils

try:
    torch._utils._rebuild_tensor_v2
except AttributeError:
    def _rebuild_tensor_v2(storage, storage_offset, size, stride, requires_grad, backward_hooks):
        tensor = torch._utils._rebuild_tensor(storage, storage_offset, size, stride)
        tensor.requires_grad = requires_grad
        tensor._backward_hooks = backward_hooks
        return tensor
    torch._utils._rebuild_tensor_v2 = _rebuild_tensor_v2

import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import torch.optim as optim
import time
import numpy as np
import sys
from models.baseline_snli import encoder
from models.baseline_snli import binary_label_atten
import argparse
from models.snli_data import snli_data
from models.snli_data import w2v
from random import shuffle
from models.baseline_snli import SeqAttnMatch

In [2]:
train_file = '/homes/rpujari/scratch/parikh_nli/preprocess/decomp-attn/data/st_bin-train.hdf5'
dev_file = '/homes/rpujari/scratch/parikh_nli/preprocess/decomp-attn/data/st_bin-val.hdf5'
test_file = '/homes/rpujari/scratch/parikh_nli/preprocess/decomp-attn/data/st_bin-test.hdf5'
w2v_file = '/homes/rpujari/scratch/parikh_nli/preprocess/decomp-attn/data/glove.hdf5'
log_dir = '/homes/rpujari/scratch/parikh_nli/code/trained_model/'
log_fname = 'st_bin_0.log'
gpu_id = 1
embedding_size = 300
epoch = 250
dev_interval = 1
optimizer ='Adagrad'
Adagrad_init = 0.
lr = 0.05
hidden_size = 300
max_length = -1
display_interval = 1000
max_grad_norm = 5
para_init = 0.1
weight_decay = 1e-5
model_path = '/homes/rpujari/scratch/parikh_nli/code/trained_model/'
trained_encoder_saved = '/homes/rpujari/scratch/parikh_nli/code/saved_model/snli_bin_1_epoch-241_dev-acc-0.781_input-encoder.pt'
trained_attn_saved = '/homes/rpujari/scratch/parikh_nli/code/saved_model/snli_bin_1_epoch-241_dev-acc-0.781_inter-atten.pt'
seq_attn_saved = '/homes/rpujari/scratch/parikh_nli/code/saved_model/snli_bin_1_epoch-241_dev-acc-0.781_seq-atten.pt'
input_optimizer_saved = '/homes/rpujari/scratch/parikh_nli/code/saved_model/snli_bin_1_epoch-241_dev-acc-0.781_input-optimizer.pt'
inter_atten_optimizer_saved = '/homes/rpujari/scratch/parikh_nli/code/saved_model/snli_bin_1_epoch-241_dev-acc-0.781_inter-atten-optimizer.pt'
seq_atten_optimizer_saved = '/homes/rpujari/scratch/parikh_nli/code/saved_model/snli_bin_1_epoch-241_dev-acc-0.781_seq-atten-optimizer.pt' 
resume = True

In [3]:
if max_length < 0:
    max_length = 9999

# initialize the logger
# create logger
logger_name = "mylog"
logger = logging.getLogger(logger_name)
logger.setLevel(logging.INFO)

# file handler
fh = logging.FileHandler(log_dir + log_fname)
fh.setLevel(logging.INFO)
logger.addHandler(fh)

# stream handler
console = logging.StreamHandler()
console.setLevel(logging.INFO)
logger.addHandler(console)

torch.cuda.set_device(gpu_id)

In [4]:
# load train/dev/test data
# train data
logger.info('loading data...')
train_data = snli_data(train_file, max_length)
train_batches = train_data.batches
train_lbl_size = 2
dev_data = snli_data(dev_file, max_length)
dev_batches = dev_data.batches
test_data = snli_data(test_file, max_length)
test_batches = test_data.batches
logger.info('train size # sent ' + str(train_data.size))
logger.info('dev size # sent ' + str(dev_data.size))
logger.info('test size # sent ' + str(test_data.size))

loading data...
train size # sent 37765
dev size # sent 15988
test size # sent 9802


In [5]:
# get input embeddings
logger.info('loading input embeddings...')
word_vecs = w2v(w2v_file).word_vecs 

# build the model
input_encoder = encoder(word_vecs.size()[0], embedding_size, hidden_size, para_init)
input_encoder.embedding.weight.data.copy_(word_vecs)
input_encoder.embedding.weight.requires_grad = False
seq_atten = SeqAttnMatch(hidden_size, para_init)
inter_atten = binary_label_atten(hidden_size, train_lbl_size, para_init)

input_encoder.cuda(gpu_id)
seq_atten.cuda(gpu_id)  
inter_atten.cuda(gpu_id)

loading input embeddings...


binary_label_atten (
  (mlp_f): Sequential (
    (0): Dropout (p = 0.2)
    (1): Linear (300 -> 300)
    (2): ReLU ()
    (3): Dropout (p = 0.2)
    (4): Linear (300 -> 300)
    (5): ReLU ()
  )
  (mlp_g): Sequential (
    (0): Dropout (p = 0.2)
    (1): Linear (600 -> 300)
    (2): ReLU ()
    (3): Dropout (p = 0.2)
    (4): Linear (300 -> 300)
    (5): ReLU ()
  )
  (mlp_h): Sequential (
    (0): Dropout (p = 0.2)
    (1): Linear (600 -> 300)
    (2): ReLU ()
    (3): Dropout (p = 0.2)
    (4): Linear (300 -> 300)
    (5): ReLU ()
  )
  (entail_linear): Linear (300 -> 2)
  (contradict_linear): Linear (300 -> 2)
  (log_prob): LogSoftmax ()
)

In [6]:
if resume == True:
    logger.info('loading trained model.')    
    input_encoder.load_state_dict(torch.load(trained_encoder_saved, map_location={'cuda:0':'cuda:1'}))
    inter_atten.load_state_dict(torch.load(trained_attn_saved, map_location={'cuda:0':'cuda:1'}))
    seq_atten.load_state_dict(torch.load(seq_attn_saved, map_location={'cuda:0':'cuda:1'})) 

loading trained model.


In [7]:
with open('/homes/rpujari/scratch/parikh_nli/preprocess/decomp-attn/data/snli.word.dict', 'r') as infile:
    vocab_dict = {}
    flines = infile.read().split('\n')
    for line in flines:
        cols = line.split()
        if len(cols) == 2:
            vocab_dict[int(cols[1])] = cols[0]

In [13]:
def test_acc(inp_data, debug=False):
    #test before training starts
    input_encoder.eval()
    seq_atten.eval()
    inter_atten.eval()

    correct = 0.
    total = 0.
    
    ent_c = 0.
    ent_tot = 0.
    
    contr_c = 0.
    contr_tot = 0.
    
    neu_c = 0.
    neu_tot = 0.
    
    ent_pr = 0.
    contr_pr = 0.
    neu_pr = 0.

    have_ques = inp_data.have_ques
    #have_ques = 0
    
    logger.info('test before training starts')
    for i in range(len(inp_data.batches)):
        test_src_batch, test_tgt_batch, test_ques_batch, test_lbl_batch = inp_data.batches[i]

        test_src_batch = Variable(test_src_batch.cuda(gpu_id))
        test_tgt_batch = Variable(test_tgt_batch.cuda(gpu_id))
        test_ques_batch = Variable(test_ques_batch.cuda(gpu_id))
        test_lbl_batch = Variable(test_lbl_batch.cuda(gpu_id))

        test_src_linear, test_tgt_linear, test_ques_linear = input_encoder( 
           test_src_batch, test_tgt_batch, test_ques_batch)

        if have_ques == 1:
            #Prepare masks
            test_ques_mask = Variable(torch.from_numpy(np.zeros(test_ques_linear.data.shape[:2])).byte().cuda(gpu_id))
            test_src_linear = seq_atten.forward(test_src_linear, test_ques_linear, test_ques_mask)
            test_tgt_linear = seq_atten.forward(test_tgt_linear, test_ques_linear, test_ques_mask)

        ent_probs, contr_probs = inter_atten(test_src_linear, test_tgt_linear)  

        ent_prob, ent_pred = ent_probs.data.max(dim=1)
        contr_prob, contr_pred = contr_probs.data.max(dim=1)
        total += test_lbl_batch.data.size()[0]

        for eg_num in range(len(ent_pred)):
            #Debugging info
            if debug:
                sent = []
                for idx in range(test_src_batch.data[eg_num].size()[0]):
                    sent.append(vocab_dict[test_src_batch.data[eg_num][idx] + 1])
                t_sent = []
                for idx in range(test_tgt_batch.data[eg_num].size()[0]):
                    t_sent.append(vocab_dict[test_tgt_batch.data[eg_num][idx] + 1])
                q_sent = []
                for idx in range(test_ques_batch.data[eg_num].size()[0]):
                    q_sent.append(vocab_dict[test_ques_batch.data[eg_num][idx] + 1])
                print(' '.join(q_sent))
                print(' '.join(sent))
                print(' '.join(t_sent))
                print(ent_pred[eg_num], contr_pred[eg_num], test_lbl_batch.data[eg_num][0], test_lbl_batch.data[eg_num][1])
                print('\n')
            
            #Prediction and book-keeping
            if (ent_pred[eg_num] == test_lbl_batch.data[eg_num][0] or test_lbl_batch.data[eg_num][0] == -1) and \
                (contr_pred[eg_num] == test_lbl_batch.data[eg_num][1] or test_lbl_batch.data[eg_num][1] == -1):
                correct += 1.0
                if ent_pred[eg_num] == 1 and contr_pred[eg_num] == 0:
                    ent_c += 1
                elif ent_pred[eg_num] == 0 and contr_pred[eg_num] == 1:
                    contr_c += 1
                elif ent_pred[eg_num] == 0 and contr_pred[eg_num] == 0:
                    neu_c += 1
            
            if ent_pred[eg_num] == 1 and contr_pred[eg_num] == 0:
                ent_pr += 1
            elif ent_pred[eg_num] == 0 and contr_pred[eg_num] == 1:
                contr_pr += 1
            elif ent_pred[eg_num] == 0 and contr_pred[eg_num] == 0:
                neu_pr += 1
            else:
                if ent_prob[eg_num] > contr_prob[eg_num]:
                    ent_pr += 1
                else:
                    contr_pr += 1
                        
            if test_lbl_batch.data[eg_num][0] == 1 and test_lbl_batch.data[eg_num][1] <= 0:
                ent_tot += 1
            elif test_lbl_batch.data[eg_num][0] <= 0 and test_lbl_batch.data[eg_num][1] == 1:
                contr_tot += 1
            elif test_lbl_batch.data[eg_num][0] <= 0 and test_lbl_batch.data[eg_num][1] <= 0:
                neu_tot += 1
                    
    return correct/total, ent_c, contr_c, neu_c, ent_pr, contr_pr, neu_pr, ent_tot, contr_tot, neu_tot

In [12]:
print test_acc(test_data)

test before training starts


(0.3526471488319902, 1857.0, 1600.0, 0.0, 5592.0, 2794.0, 1417.0, 3652.0, 6151.0, 0.0)


In [14]:
best_dev = []   # (epoch, dev_acc)

input_encoder.train()
seq_atten.train()
inter_atten.train()

para1 = filter(lambda p: p.requires_grad, input_encoder.parameters())
para2 = inter_atten.parameters()
para3 = seq_atten.parameters()

if optimizer == 'Adagrad':
    input_optimizer = optim.Adagrad(para1, lr=lr, weight_decay=weight_decay)
    inter_atten_optimizer = optim.Adagrad(para2, lr=lr, weight_decay=weight_decay)
    seq_atten_optimizer = optim.Adagrad(para3, lr=lr, weight_decay=weight_decay)
elif optimizer == 'Adadelta':
    input_optimizer = optim.Adadelta(para1, lr=lr)
    inter_atten_optimizer = optim.Adadelta(para2, lr=lr)
    seq_atten_optimizer = optim.Adadelta(para3, lr=lr, weight_decay=weight_decay)
else:
    logger.info('No Optimizer.')
    sys.exit()

if resume:
    input_optimizer.load_state_dict(torch.load(input_optimizer_saved, map_location={'cuda:0':'cuda:1'}))
    inter_atten_optimizer.load_state_dict(torch.load(inter_atten_optimizer_saved, map_location={'cuda:0':'cuda:1'}))
    if not seq_atten_optimizer_saved == 'none':
        seq_atten_optimizer.load_state_dict(torch.load(seq_atten_optimizer_saved, map_location={'cuda:0':'cuda:1'}))

class_weights = torch.FloatTensor([1.0, 3.0]).cuda(gpu_id)        
criterion = nn.NLLLoss(size_average=True, weight=class_weights)
# criterion = nn.CrossEntropyLoss()

In [None]:
logger.info('start to train...')
for k in range(epoch):

    total = 0.
    correct = 0.
    loss_data = 0.
    train_sents = 0.

    shuffle(train_batches)
    timer = time.time()

    # initialize the optimizer
    if k == 0 and optimizer == 'Adagrad' and (not resume):
        logger.info('Initializing optimizer')
        for group in input_optimizer.param_groups:
            for p in group['params']:
                state = input_optimizer.state[p]
                state['sum'] += Adagrad_init
        for group in inter_atten_optimizer.param_groups:
            for p in group['params']:
                state = inter_atten_optimizer.state[p]
                state['sum'] += Adagrad_init
        for group in seq_atten_optimizer.param_groups:
            for p in group['params']:
                state = seq_atten_optimizer.state[p]
                state['sum'] += Adagrad_init
    elif k == 0 and optimizer == 'Adagrad' and seq_atten_optimizer == 'none':
        for group in seq_atten_optimizer.param_groups:
            for p in group['params']:
                state = seq_atten_optimizer.state[p]
                state['sum'] += Adagrad_init


    for i in range(len(train_batches)):
        train_src_batch, train_tgt_batch, train_ques_batch, train_lbl_batch = train_batches[i]

        train_src_batch = Variable(train_src_batch.cuda(gpu_id))
        train_tgt_batch = Variable(train_tgt_batch.cuda(gpu_id))
        train_ques_batch = Variable(train_ques_batch.cuda(gpu_id))
        train_lbl_batch = Variable(train_lbl_batch.cuda(gpu_id))

        batch_size = train_src_batch.size(0)
        train_sents += batch_size

        input_optimizer.zero_grad()
        inter_atten_optimizer.zero_grad()
        seq_atten_optimizer.zero_grad()

        train_src_linear, train_tgt_linear, train_ques_linear = input_encoder(
            train_src_batch, train_tgt_batch, train_ques_batch)

        if train_data.have_ques == 1:
            #Prepare masks
            train_ques_mask = Variable(torch.from_numpy(np.zeros(train_ques_linear.data.shape[:2])).byte().cuda(gpu_id))
            train_src_linear = seq_atten.forward(train_src_linear, train_ques_linear, train_ques_mask)
            train_tgt_linear = seq_atten.forward(train_tgt_linear, train_ques_linear, train_ques_mask)

        ent_prob, contr_prob = inter_atten(train_src_linear, train_tgt_linear)

        loss = 0
        for eg_num in range(contr_prob.size()[0]):
            #print(eg_num)
            if int(train_lbl_batch[eg_num][0].data[0]) >= 0 and int(train_lbl_batch[eg_num][1].data[0]) >= 0:
                loss += criterion(contr_prob[eg_num].view(-1, 2), train_lbl_batch[eg_num][1].view(-1)) + \
                       criterion(ent_prob[eg_num].view(-1, 2), train_lbl_batch[eg_num][0].view(-1))
            elif int(train_lbl_batch[eg_num][0].data[0]) >= 0:
                loss += criterion(ent_prob[eg_num].view(-1, 2), train_lbl_batch[eg_num][0].view(-1))
            elif int(train_lbl_batch[eg_num][1].data[0]) >= 0:
                loss += criterion(contr_prob[eg_num].view(-1, 2), train_lbl_batch[eg_num][1].view(-1))

        loss.backward()

        grad_norm = 0.
        para_norm = 0.

        for m in input_encoder.modules():
            if isinstance(m, nn.Linear):
                grad_norm += m.weight.grad.data.norm() ** 2
                para_norm += m.weight.data.norm() ** 2
                if m.bias:
                    grad_norm += m.bias.grad.data.norm() ** 2
                    para_norm += m.bias.data.norm() ** 2

        for m in inter_atten.modules():
            if isinstance(m, nn.Linear):
                if m.weight.grad is not None:
                    grad_norm += m.weight.grad.data.norm() ** 2
                    para_norm += m.weight.data.norm() ** 2
                    if int(m.bias.data[0]):
                        grad_norm += m.bias.grad.data.norm() ** 2
                        para_norm += m.bias.data.norm() ** 2

        if train_data.have_ques == 1:
            for m in seq_atten.modules():
                if isinstance(m, nn.Linear):
                    grad_norm += m.weight.grad.data.norm() ** 2
                    para_norm += m.weight.data.norm() ** 2
                    if int(m.bias.data[0]):
                        grad_norm += m.bias.grad.data.norm() ** 2
                        para_norm += m.bias.data.norm() ** 2


        grad_norm = grad_norm ** 0.5
        para_norm = para_norm ** 0.5

        shrinkage = max_grad_norm / (grad_norm + 0.01)
        if shrinkage < 1 :
            for m in input_encoder.modules():
                # print m
                if isinstance(m, nn.Linear):
                    m.weight.grad.data = m.weight.grad.data * shrinkage
            for m in inter_atten.modules():
                #print m
                if isinstance(m, nn.Linear):
                    if m.weight.grad is not None:
                        m.weight.grad.data = m.weight.grad.data * shrinkage
                        m.bias.grad.data = m.bias.grad.data * shrinkage
            if train_data.have_ques == 1:
                for m in seq_atten.modules():
                    # print m
                    if isinstance(m, nn.Linear):
                        m.weight.grad.data = m.weight.grad.data * shrinkage
                        m.bias.grad.data = m.bias.grad.data * shrinkage


        input_optimizer.step()
        inter_atten_optimizer.step()
        if train_data.have_ques == 1:
            seq_atten_optimizer.step()

        _, ent_pred = ent_prob.data.max(dim=1)
        _, contr_pred = contr_prob.data.max(dim=1)
        total += train_lbl_batch.data.size()[0]
        for eg_num in range(len(ent_pred)):
            if ent_pred[eg_num] == train_lbl_batch.data[eg_num][0] and contr_pred[eg_num] == train_lbl_batch.data[eg_num][1]:
                correct += 1.0
        
        loss_data += (loss.data[0] * batch_size)  # / train_lbl_batch.data.size()[0])

        if (i + 1) % display_interval == 0:
            logger.info('epoch %d, batches %d|%d, train-acc %.3f, loss %.3f, para-norm %.3f, grad-norm %.3f, time %.2fs, ' %
                        (k, i + 1, len(train_batches), correct / total,
                         loss_data / train_sents, para_norm, grad_norm, time.time() - timer))
            train_sents = 0.
            timer = time.time()
            loss_data = 0.
            correct = 0.
            total = 0.
        if i == len(train_batches) - 1:
            logger.info('epoch %d, batches %d|%d, train-acc %.3f, loss %.3f, para-norm %.3f, grad-norm %.3f, time %.2fs, ' %
                        (k, i + 1, len(train_batches), correct / total,
                         loss_data / train_sents, para_norm, grad_norm, time.time() - timer))
            train_sents = 0.
            timer = time.time()
            loss_data = 0.
            correct = 0.
            total = 0.           

    # evaluate
    if (k + 1) % dev_interval == 0:
        res = test_acc(dev_data)

        dev_acc = res[0]
        print(res)
        logger.info('dev-acc %.3f' % (dev_acc))

        if (k + 1) / dev_interval == 1:
            model_fname = '%s%s_epoch-%d_dev-acc-%.3f' %(model_path, log_fname.split('.')[0], k, dev_acc)
            torch.save(input_encoder.state_dict(), model_fname + '_input-encoder.pt')
            torch.save(inter_atten.state_dict(), model_fname + '_inter-atten.pt')
            torch.save(seq_atten.state_dict(), model_fname + '_seq-atten.pt')
            torch.save(input_optimizer.state_dict(), model_fname + '_input-optimizer.pt')
            torch.save(inter_atten_optimizer.state_dict(), model_fname + '_inter-atten-optimizer.pt')
            torch.save(seq_atten_optimizer.state_dict(), model_fname + '_seq-atten-optimizer.pt')
            best_dev.append((k, dev_acc, model_fname))
            logger.info('current best-dev:')
            for t in best_dev:
                logger.info('\t%d %.3f' %(t[0], t[1]))
            logger.info('save model!') 
        else:
            if dev_acc > best_dev[-1][1]:
                model_fname = '%s%s_epoch-%d_dev-acc-%.3f' %(model_path, log_fname.split('.')[0], k, dev_acc)
                torch.save(input_encoder.state_dict(), model_fname + '_input-encoder.pt')
                torch.save(inter_atten.state_dict(), model_fname + '_inter-atten.pt')
                torch.save(seq_atten.state_dict(), model_fname + '_seq-atten.pt')
                torch.save(input_optimizer.state_dict(), model_fname + '_input-optimizer.pt')
                torch.save(inter_atten_optimizer.state_dict(), model_fname + '_inter-atten-optimizer.pt')
                torch.save(seq_atten_optimizer.state_dict(), model_fname + '_seq-atten-optimizer.pt') 
                best_dev.append((k, dev_acc, model_fname))
                logger.info('current best-dev:')
                for t in best_dev:
                    logger.info('\t%d %.3f' %(t[0], t[1]))
                logger.info('save model!') 

        input_encoder.train()
        inter_atten.train()
        seq_atten.train()

logger.info('training end!')

start to train...
epoch 0, batches 1000|7692, train-acc 0.463, loss 30.947, para-norm 81.764, grad-norm 272.292, time 18.69s, 
epoch 0, batches 2000|7692, train-acc 0.582, loss 24.754, para-norm 82.192, grad-norm 28.045, time 20.72s, 
epoch 0, batches 3000|7692, train-acc 0.609, loss 21.282, para-norm 82.397, grad-norm 68.757, time 19.60s, 
epoch 0, batches 4000|7692, train-acc 0.636, loss 21.324, para-norm 82.502, grad-norm 8.833, time 18.07s, 
epoch 0, batches 5000|7692, train-acc 0.626, loss 22.280, para-norm 82.446, grad-norm 16.150, time 19.01s, 
epoch 0, batches 6000|7692, train-acc 0.631, loss 21.944, para-norm 82.459, grad-norm 46.975, time 20.48s, 
epoch 0, batches 7000|7692, train-acc 0.640, loss 22.418, para-norm 82.507, grad-norm 0.427, time 20.74s, 
epoch 0, batches 7692|7692, train-acc 0.647, loss 23.039, para-norm 82.483, grad-norm 47.610, time 14.77s, 
test before training starts
dev-acc 0.655
current best-dev:
	0 0.655
save model!


(0.6552004503095878, 1116.0, 9360.0, 0.0, 1520.0, 14359.0, 110.0, 6183.0, 9806.0, 0.0)


epoch 1, batches 1000|7692, train-acc 0.662, loss 19.803, para-norm 82.423, grad-norm 49.021, time 21.70s, 
epoch 1, batches 2000|7692, train-acc 0.668, loss 21.927, para-norm 82.383, grad-norm 305.303, time 20.51s, 
epoch 1, batches 3000|7692, train-acc 0.643, loss 20.866, para-norm 82.399, grad-norm 241.836, time 19.68s, 
epoch 1, batches 4000|7692, train-acc 0.644, loss 23.003, para-norm 82.384, grad-norm 13.370, time 17.39s, 
epoch 1, batches 5000|7692, train-acc 0.648, loss 21.102, para-norm 82.336, grad-norm 106.239, time 20.23s, 
epoch 1, batches 6000|7692, train-acc 0.645, loss 21.341, para-norm 82.306, grad-norm 12.698, time 17.10s, 
epoch 1, batches 7000|7692, train-acc 0.652, loss 22.302, para-norm 82.250, grad-norm 81.806, time 17.01s, 
epoch 1, batches 7692|7692, train-acc 0.665, loss 20.992, para-norm 82.202, grad-norm 8.029, time 11.82s, 
test before training starts
dev-acc 0.661
current best-dev:
	0 0.655
	1 0.661
save model!


(0.6612671211457878, 1394.0, 9179.0, 0.0, 1965.0, 13887.0, 137.0, 6183.0, 9806.0, 0.0)


epoch 2, batches 1000|7692, train-acc 0.659, loss 18.863, para-norm 82.177, grad-norm 21.380, time 16.13s, 
epoch 2, batches 2000|7692, train-acc 0.659, loss 23.063, para-norm 82.162, grad-norm 33.715, time 17.72s, 
epoch 2, batches 3000|7692, train-acc 0.660, loss 21.978, para-norm 82.084, grad-norm 41.253, time 17.37s, 
epoch 2, batches 4000|7692, train-acc 0.654, loss 22.833, para-norm 82.031, grad-norm 20.561, time 19.35s, 
epoch 2, batches 5000|7692, train-acc 0.664, loss 21.053, para-norm 81.946, grad-norm 74.331, time 18.67s, 
epoch 2, batches 6000|7692, train-acc 0.657, loss 20.347, para-norm 81.884, grad-norm 23.460, time 18.41s, 
epoch 2, batches 7000|7692, train-acc 0.664, loss 20.151, para-norm 81.806, grad-norm 16.758, time 17.19s, 
epoch 2, batches 7692|7692, train-acc 0.670, loss 21.419, para-norm 81.784, grad-norm 25.973, time 13.11s, 
test before training starts
dev-acc 0.658


(0.6582025142285322, 1639.0, 8885.0, 0.0, 2414.0, 13332.0, 243.0, 6183.0, 9806.0, 0.0)


epoch 3, batches 1000|7692, train-acc 0.661, loss 20.152, para-norm 81.736, grad-norm 23.620, time 16.49s, 
epoch 3, batches 2000|7692, train-acc 0.665, loss 21.334, para-norm 81.690, grad-norm 0.024, time 17.93s, 
epoch 3, batches 3000|7692, train-acc 0.653, loss 20.776, para-norm 81.660, grad-norm 30.274, time 19.28s, 
epoch 3, batches 4000|7692, train-acc 0.674, loss 20.010, para-norm 81.637, grad-norm 0.011, time 18.14s, 
epoch 3, batches 5000|7692, train-acc 0.656, loss 21.741, para-norm 81.592, grad-norm 21.053, time 17.92s, 
epoch 3, batches 6000|7692, train-acc 0.673, loss 22.103, para-norm 81.541, grad-norm 4.425, time 18.68s, 
epoch 3, batches 7000|7692, train-acc 0.667, loss 22.147, para-norm 81.489, grad-norm 22.696, time 16.51s, 
epoch 3, batches 7692|7692, train-acc 0.666, loss 20.839, para-norm 81.460, grad-norm 6.370, time 11.89s, 
test before training starts
dev-acc 0.661


(0.6612045781474764, 1316.0, 9256.0, 0.0, 1799.0, 14034.0, 156.0, 6183.0, 9806.0, 0.0)


epoch 4, batches 1000|7692, train-acc 0.675, loss 19.819, para-norm 81.414, grad-norm 24.905, time 17.75s, 
epoch 4, batches 2000|7692, train-acc 0.670, loss 21.034, para-norm 81.352, grad-norm 21.241, time 17.25s, 
epoch 4, batches 3000|7692, train-acc 0.659, loss 21.541, para-norm 81.284, grad-norm 29.402, time 18.26s, 
epoch 4, batches 4000|7692, train-acc 0.678, loss 20.675, para-norm 81.243, grad-norm 0.653, time 18.02s, 
epoch 4, batches 5000|7692, train-acc 0.672, loss 21.193, para-norm 81.248, grad-norm 32.238, time 18.81s, 
epoch 4, batches 6000|7692, train-acc 0.663, loss 22.156, para-norm 81.187, grad-norm 44.091, time 17.59s, 
epoch 4, batches 7000|7692, train-acc 0.661, loss 21.181, para-norm 81.151, grad-norm 3.293, time 19.04s, 
epoch 4, batches 7692|7692, train-acc 0.657, loss 21.048, para-norm 81.150, grad-norm 0.720, time 12.48s, 
test before training starts
dev-acc 0.664
current best-dev:
	0 0.655
	1 0.661
	4 0.664
save model!


(0.6635812120833072, 1408.0, 9202.0, 0.0, 1970.0, 13895.0, 124.0, 6183.0, 9806.0, 0.0)


epoch 5, batches 1000|7692, train-acc 0.658, loss 22.306, para-norm 81.121, grad-norm 61.024, time 17.89s, 
epoch 5, batches 2000|7692, train-acc 0.678, loss 21.013, para-norm 81.069, grad-norm 8.519, time 17.73s, 
epoch 5, batches 3000|7692, train-acc 0.671, loss 19.817, para-norm 81.019, grad-norm 40.661, time 17.54s, 
epoch 5, batches 4000|7692, train-acc 0.666, loss 20.351, para-norm 80.960, grad-norm 13.750, time 16.32s, 
epoch 5, batches 5000|7692, train-acc 0.678, loss 19.955, para-norm 80.959, grad-norm 1.357, time 17.59s, 
epoch 5, batches 6000|7692, train-acc 0.661, loss 22.125, para-norm 80.928, grad-norm 9.678, time 19.55s, 
epoch 5, batches 7000|7692, train-acc 0.659, loss 21.556, para-norm 80.910, grad-norm 8.824, time 19.26s, 
epoch 5, batches 7692|7692, train-acc 0.661, loss 20.569, para-norm 80.908, grad-norm 9.494, time 12.73s, 
test before training starts
dev-acc 0.664
current best-dev:
	0 0.655
	1 0.661
	4 0.664
	5 0.664
save model!


(0.6644568140596661, 1426.0, 9198.0, 0.0, 1990.0, 13904.0, 95.0, 6183.0, 9806.0, 0.0)


epoch 6, batches 1000|7692, train-acc 0.670, loss 22.126, para-norm 80.876, grad-norm 160.493, time 18.26s, 
epoch 6, batches 2000|7692, train-acc 0.667, loss 21.793, para-norm 80.837, grad-norm 2.268, time 18.73s, 
epoch 6, batches 3000|7692, train-acc 0.668, loss 20.808, para-norm 80.798, grad-norm 41.952, time 18.88s, 
epoch 6, batches 4000|7692, train-acc 0.674, loss 21.859, para-norm 80.744, grad-norm 1.493, time 18.51s, 
epoch 6, batches 5000|7692, train-acc 0.660, loss 19.463, para-norm 80.692, grad-norm 27.426, time 17.30s, 
epoch 6, batches 6000|7692, train-acc 0.668, loss 21.119, para-norm 80.645, grad-norm 37.084, time 17.38s, 
epoch 6, batches 7000|7692, train-acc 0.673, loss 20.340, para-norm 80.616, grad-norm 48.597, time 18.48s, 
epoch 6, batches 7692|7692, train-acc 0.665, loss 20.359, para-norm 80.613, grad-norm 0.047, time 12.93s, 
test before training starts
dev-acc 0.667
current best-dev:
	0 0.655
	1 0.661
	4 0.664
	5 0.664
	6 0.667
save model!


(0.6674588779786103, 1702.0, 8970.0, 0.0, 2497.0, 13401.0, 91.0, 6183.0, 9806.0, 0.0)


epoch 7, batches 1000|7692, train-acc 0.675, loss 20.280, para-norm 80.598, grad-norm 29.182, time 17.32s, 
epoch 7, batches 2000|7692, train-acc 0.673, loss 21.194, para-norm 80.602, grad-norm 31.378, time 16.75s, 
epoch 7, batches 3000|7692, train-acc 0.677, loss 21.038, para-norm 80.580, grad-norm 14.700, time 17.62s, 
epoch 7, batches 4000|7692, train-acc 0.672, loss 20.379, para-norm 80.549, grad-norm 73.892, time 18.09s, 
epoch 7, batches 5000|7692, train-acc 0.665, loss 22.515, para-norm 80.533, grad-norm 44.219, time 17.72s, 
epoch 7, batches 6000|7692, train-acc 0.663, loss 20.698, para-norm 80.520, grad-norm 111.584, time 16.99s, 
epoch 7, batches 7000|7692, train-acc 0.672, loss 20.760, para-norm 80.483, grad-norm 32.453, time 18.14s, 
epoch 7, batches 7692|7692, train-acc 0.663, loss 20.194, para-norm 80.439, grad-norm 2.696, time 12.40s, 
test before training starts
dev-acc 0.662


(0.6623928951153918, 1417.0, 9174.0, 0.0, 2000.0, 13839.0, 150.0, 6183.0, 9806.0, 0.0)


epoch 8, batches 1000|7692, train-acc 0.665, loss 21.994, para-norm 80.430, grad-norm 26.103, time 18.37s, 
epoch 8, batches 2000|7692, train-acc 0.670, loss 20.780, para-norm 80.416, grad-norm 76.237, time 17.28s, 
epoch 8, batches 3000|7692, train-acc 0.667, loss 21.418, para-norm 80.389, grad-norm 8.172, time 19.08s, 
epoch 8, batches 4000|7692, train-acc 0.673, loss 20.454, para-norm 80.354, grad-norm 0.864, time 18.77s, 
epoch 8, batches 5000|7692, train-acc 0.658, loss 20.290, para-norm 80.331, grad-norm 24.741, time 17.99s, 
epoch 8, batches 6000|7692, train-acc 0.672, loss 20.866, para-norm 80.316, grad-norm 1.305, time 17.62s, 
epoch 8, batches 7000|7692, train-acc 0.671, loss 20.154, para-norm 80.282, grad-norm 0.875, time 17.62s, 
epoch 8, batches 7692|7692, train-acc 0.681, loss 21.429, para-norm 80.281, grad-norm 8.749, time 13.95s, 
test before training starts
dev-acc 0.666


(0.6658327600225155, 1536.0, 9110.0, 0.0, 2164.0, 13727.0, 98.0, 6183.0, 9806.0, 0.0)


epoch 9, batches 1000|7692, train-acc 0.674, loss 22.119, para-norm 80.305, grad-norm 16.369, time 18.82s, 
epoch 9, batches 2000|7692, train-acc 0.668, loss 19.825, para-norm 80.266, grad-norm 70.447, time 17.25s, 
epoch 9, batches 3000|7692, train-acc 0.685, loss 21.218, para-norm 80.214, grad-norm 3.463, time 18.95s, 
epoch 9, batches 4000|7692, train-acc 0.667, loss 21.804, para-norm 80.213, grad-norm 0.811, time 17.33s, 
epoch 9, batches 5000|7692, train-acc 0.670, loss 19.302, para-norm 80.192, grad-norm 52.503, time 18.05s, 
epoch 9, batches 6000|7692, train-acc 0.663, loss 21.194, para-norm 80.163, grad-norm 25.508, time 17.79s, 
epoch 9, batches 7000|7692, train-acc 0.665, loss 21.384, para-norm 80.138, grad-norm 56.717, time 19.17s, 
epoch 9, batches 7692|7692, train-acc 0.676, loss 20.174, para-norm 80.124, grad-norm 76.245, time 12.52s, 
test before training starts
dev-acc 0.667


(0.6669585339921196, 1568.0, 9096.0, 0.0, 2236.0, 13658.0, 95.0, 6183.0, 9806.0, 0.0)


epoch 10, batches 1000|7692, train-acc 0.664, loss 20.192, para-norm 80.090, grad-norm 18.076, time 17.50s, 
epoch 10, batches 2000|7692, train-acc 0.667, loss 21.138, para-norm 80.087, grad-norm 6.240, time 18.07s, 
epoch 10, batches 3000|7692, train-acc 0.657, loss 20.772, para-norm 80.077, grad-norm 11.960, time 18.39s, 
epoch 10, batches 4000|7692, train-acc 0.680, loss 20.712, para-norm 80.067, grad-norm 1.296, time 18.01s, 
epoch 10, batches 5000|7692, train-acc 0.687, loss 20.630, para-norm 80.052, grad-norm 39.892, time 18.03s, 
epoch 10, batches 6000|7692, train-acc 0.677, loss 22.820, para-norm 80.042, grad-norm 9.826, time 18.85s, 
epoch 10, batches 7000|7692, train-acc 0.679, loss 19.622, para-norm 80.057, grad-norm 46.429, time 18.72s, 
epoch 10, batches 7692|7692, train-acc 0.670, loss 21.096, para-norm 80.036, grad-norm 50.370, time 11.70s, 
test before training starts
dev-acc 0.666


(0.6661454750140722, 1779.0, 8872.0, 0.0, 2670.0, 13196.0, 123.0, 6183.0, 9806.0, 0.0)


epoch 11, batches 1000|7692, train-acc 0.665, loss 21.115, para-norm 80.020, grad-norm 39.277, time 17.81s, 
epoch 11, batches 2000|7692, train-acc 0.667, loss 21.793, para-norm 80.004, grad-norm 27.633, time 18.70s, 
epoch 11, batches 3000|7692, train-acc 0.671, loss 21.724, para-norm 79.971, grad-norm 71.649, time 18.45s, 
epoch 11, batches 4000|7692, train-acc 0.674, loss 18.659, para-norm 79.947, grad-norm 14.226, time 17.62s, 
epoch 11, batches 5000|7692, train-acc 0.680, loss 20.453, para-norm 79.923, grad-norm 25.926, time 18.39s, 
epoch 11, batches 6000|7692, train-acc 0.663, loss 20.182, para-norm 79.906, grad-norm 48.792, time 17.37s, 
epoch 11, batches 7000|7692, train-acc 0.676, loss 21.809, para-norm 79.893, grad-norm 7.742, time 17.01s, 
epoch 11, batches 7692|7692, train-acc 0.672, loss 20.963, para-norm 79.873, grad-norm 0.000, time 12.41s, 
test before training starts
dev-acc 0.667


(0.6670210769904309, 1633.0, 9032.0, 0.0, 2355.0, 13544.0, 90.0, 6183.0, 9806.0, 0.0)


epoch 12, batches 1000|7692, train-acc 0.684, loss 21.041, para-norm 79.872, grad-norm 0.211, time 18.94s, 
epoch 12, batches 2000|7692, train-acc 0.683, loss 21.618, para-norm 79.843, grad-norm 16.123, time 18.28s, 
epoch 12, batches 3000|7692, train-acc 0.683, loss 20.031, para-norm 79.844, grad-norm 22.889, time 19.41s, 
epoch 12, batches 4000|7692, train-acc 0.679, loss 20.128, para-norm 79.833, grad-norm 53.232, time 19.02s, 
epoch 12, batches 5000|7692, train-acc 0.670, loss 19.776, para-norm 79.799, grad-norm 52.999, time 17.15s, 
epoch 12, batches 6000|7692, train-acc 0.663, loss 20.268, para-norm 79.769, grad-norm 30.068, time 17.26s, 
epoch 12, batches 7000|7692, train-acc 0.674, loss 22.493, para-norm 79.756, grad-norm 0.366, time 18.63s, 
epoch 12, batches 7692|7692, train-acc 0.656, loss 21.712, para-norm 79.758, grad-norm 61.373, time 12.01s, 
test before training starts
dev-acc 0.669


(0.6689599099380824, 1741.0, 8955.0, 0.0, 2549.0, 13341.0, 99.0, 6183.0, 9806.0, 0.0)


current best-dev:
	0 0.655
	1 0.661
	4 0.664
	5 0.664
	6 0.667
	12 0.669
save model!
epoch 13, batches 1000|7692, train-acc 0.686, loss 20.782, para-norm 79.748, grad-norm 38.251, time 18.56s, 
epoch 13, batches 2000|7692, train-acc 0.687, loss 20.592, para-norm 79.747, grad-norm 11.921, time 17.55s, 
epoch 13, batches 3000|7692, train-acc 0.649, loss 20.866, para-norm 79.734, grad-norm 18.547, time 17.83s, 
epoch 13, batches 4000|7692, train-acc 0.662, loss 20.534, para-norm 79.750, grad-norm 61.217, time 17.71s, 
epoch 13, batches 5000|7692, train-acc 0.682, loss 21.385, para-norm 79.751, grad-norm 62.774, time 18.48s, 
epoch 13, batches 6000|7692, train-acc 0.671, loss 19.398, para-norm 79.738, grad-norm 112.378, time 16.98s, 
epoch 13, batches 7000|7692, train-acc 0.683, loss 22.204, para-norm 79.747, grad-norm 70.105, time 19.33s, 
epoch 13, batches 7692|7692, train-acc 0.672, loss 21.336, para-norm 79.745, grad-norm 54.960, time 13.80s, 
test before training starts
dev-acc 0.668


(0.667959221965101, 1644.0, 9036.0, 0.0, 2367.0, 13530.0, 92.0, 6183.0, 9806.0, 0.0)


epoch 14, batches 1000|7692, train-acc 0.676, loss 21.032, para-norm 79.729, grad-norm 61.930, time 18.34s, 
epoch 14, batches 2000|7692, train-acc 0.679, loss 22.347, para-norm 79.727, grad-norm 5.013, time 18.37s, 
epoch 14, batches 3000|7692, train-acc 0.681, loss 19.630, para-norm 79.725, grad-norm 0.264, time 18.30s, 
epoch 14, batches 4000|7692, train-acc 0.673, loss 20.911, para-norm 79.717, grad-norm 10.289, time 18.40s, 
epoch 14, batches 5000|7692, train-acc 0.666, loss 21.711, para-norm 79.693, grad-norm 32.435, time 17.76s, 
epoch 14, batches 6000|7692, train-acc 0.687, loss 20.215, para-norm 79.682, grad-norm 2.142, time 17.24s, 
epoch 14, batches 7000|7692, train-acc 0.667, loss 18.829, para-norm 79.682, grad-norm 19.596, time 18.70s, 
epoch 14, batches 7692|7692, train-acc 0.654, loss 21.962, para-norm 79.688, grad-norm 26.220, time 12.97s, 
test before training starts
dev-acc 0.669
current best-dev:
	0 0.655
	1 0.661
	4 0.664
	5 0.664
	6 0.667
	12 0.669
	14 0.669
save m

(0.6690224529363937, 1701.0, 8996.0, 0.0, 2454.0, 13435.0, 100.0, 6183.0, 9806.0, 0.0)


epoch 15, batches 1000|7692, train-acc 0.688, loss 21.482, para-norm 79.684, grad-norm 80.803, time 17.85s, 
epoch 15, batches 2000|7692, train-acc 0.667, loss 21.121, para-norm 79.675, grad-norm 136.538, time 18.19s, 
epoch 15, batches 3000|7692, train-acc 0.687, loss 20.713, para-norm 79.683, grad-norm 84.350, time 18.70s, 
epoch 15, batches 4000|7692, train-acc 0.687, loss 20.946, para-norm 79.681, grad-norm 23.851, time 18.05s, 
epoch 15, batches 5000|7692, train-acc 0.677, loss 19.786, para-norm 79.656, grad-norm 36.852, time 19.34s, 
epoch 15, batches 6000|7692, train-acc 0.672, loss 21.955, para-norm 79.648, grad-norm 14.255, time 16.25s, 
epoch 15, batches 7000|7692, train-acc 0.670, loss 19.949, para-norm 79.644, grad-norm 2.663, time 17.76s, 
epoch 15, batches 7692|7692, train-acc 0.668, loss 20.676, para-norm 79.641, grad-norm 1.105, time 11.42s, 
test before training starts
dev-acc 0.665


(0.6647695290512227, 1658.0, 8971.0, 0.0, 2451.0, 13458.0, 80.0, 6183.0, 9806.0, 0.0)


epoch 16, batches 1000|7692, train-acc 0.688, loss 20.639, para-norm 79.644, grad-norm 131.923, time 18.37s, 
epoch 16, batches 2000|7692, train-acc 0.678, loss 20.710, para-norm 79.633, grad-norm 10.786, time 20.00s, 
epoch 16, batches 3000|7692, train-acc 0.672, loss 20.670, para-norm 79.618, grad-norm 0.033, time 17.93s, 
epoch 16, batches 4000|7692, train-acc 0.670, loss 20.585, para-norm 79.626, grad-norm 0.041, time 17.48s, 
epoch 16, batches 5000|7692, train-acc 0.672, loss 22.476, para-norm 79.635, grad-norm 29.027, time 18.01s, 
epoch 16, batches 6000|7692, train-acc 0.697, loss 20.306, para-norm 79.622, grad-norm 28.050, time 17.93s, 
epoch 16, batches 7000|7692, train-acc 0.681, loss 21.162, para-norm 79.650, grad-norm 56.872, time 18.85s, 
epoch 16, batches 7692|7692, train-acc 0.675, loss 19.610, para-norm 79.648, grad-norm 16.075, time 12.27s, 
test before training starts
dev-acc 0.667


(0.6668334479954969, 1680.0, 8982.0, 0.0, 2450.0, 13436.0, 103.0, 6183.0, 9806.0, 0.0)


epoch 17, batches 1000|7692, train-acc 0.668, loss 22.681, para-norm 79.676, grad-norm 117.731, time 18.50s, 
epoch 17, batches 2000|7692, train-acc 0.688, loss 20.426, para-norm 79.668, grad-norm 39.815, time 18.22s, 
epoch 17, batches 3000|7692, train-acc 0.682, loss 20.867, para-norm 79.637, grad-norm 132.699, time 17.61s, 
epoch 17, batches 4000|7692, train-acc 0.682, loss 20.292, para-norm 79.640, grad-norm 113.334, time 18.22s, 
epoch 17, batches 5000|7692, train-acc 0.689, loss 20.400, para-norm 79.640, grad-norm 11.298, time 18.27s, 
epoch 17, batches 6000|7692, train-acc 0.680, loss 21.363, para-norm 79.624, grad-norm 26.352, time 16.70s, 
epoch 17, batches 7000|7692, train-acc 0.678, loss 20.203, para-norm 79.609, grad-norm 106.954, time 18.41s, 
epoch 17, batches 7692|7692, train-acc 0.681, loss 19.679, para-norm 79.605, grad-norm 39.201, time 13.15s, 
test before training starts
dev-acc 0.666


(0.6659578460191382, 1814.0, 8834.0, 0.0, 2713.0, 13160.0, 116.0, 6183.0, 9806.0, 0.0)


epoch 18, batches 1000|7692, train-acc 0.666, loss 22.556, para-norm 79.615, grad-norm 33.501, time 19.38s, 
epoch 18, batches 2000|7692, train-acc 0.683, loss 20.199, para-norm 79.610, grad-norm 45.795, time 17.58s, 
epoch 18, batches 3000|7692, train-acc 0.684, loss 20.427, para-norm 79.602, grad-norm 139.884, time 17.49s, 
epoch 18, batches 4000|7692, train-acc 0.690, loss 19.863, para-norm 79.597, grad-norm 16.323, time 16.77s, 
epoch 18, batches 5000|7692, train-acc 0.690, loss 20.514, para-norm 79.600, grad-norm 1.589, time 18.61s, 
epoch 18, batches 6000|7692, train-acc 0.683, loss 20.598, para-norm 79.597, grad-norm 30.636, time 18.70s, 
epoch 18, batches 7000|7692, train-acc 0.683, loss 20.489, para-norm 79.609, grad-norm 4.420, time 17.00s, 
epoch 18, batches 7692|7692, train-acc 0.680, loss 21.360, para-norm 79.602, grad-norm 72.610, time 12.75s, 
test before training starts
dev-acc 0.666


(0.6662080180123835, 1733.0, 8919.0, 0.0, 2572.0, 13330.0, 87.0, 6183.0, 9806.0, 0.0)


epoch 19, batches 1000|7692, train-acc 0.691, loss 21.710, para-norm 79.616, grad-norm 0.352, time 17.79s, 
epoch 19, batches 2000|7692, train-acc 0.674, loss 21.191, para-norm 79.605, grad-norm 25.365, time 19.12s, 
epoch 19, batches 3000|7692, train-acc 0.679, loss 20.832, para-norm 79.600, grad-norm 0.803, time 18.82s, 
epoch 19, batches 4000|7692, train-acc 0.692, loss 19.745, para-norm 79.596, grad-norm 80.502, time 16.58s, 
epoch 19, batches 5000|7692, train-acc 0.677, loss 20.328, para-norm 79.585, grad-norm 42.692, time 17.83s, 
epoch 19, batches 6000|7692, train-acc 0.679, loss 20.953, para-norm 79.579, grad-norm 5.282, time 18.38s, 
epoch 19, batches 7000|7692, train-acc 0.680, loss 21.329, para-norm 79.578, grad-norm 9.441, time 17.03s, 
epoch 19, batches 7692|7692, train-acc 0.684, loss 18.763, para-norm 79.573, grad-norm 72.584, time 12.16s, 
test before training starts
dev-acc 0.666


(0.6657702170242041, 1679.0, 8966.0, 0.0, 2466.0, 13390.0, 133.0, 6183.0, 9806.0, 0.0)


epoch 20, batches 1000|7692, train-acc 0.693, loss 22.736, para-norm 79.590, grad-norm 0.000, time 17.69s, 
epoch 20, batches 2000|7692, train-acc 0.675, loss 21.290, para-norm 79.577, grad-norm 12.866, time 18.24s, 
epoch 20, batches 3000|7692, train-acc 0.696, loss 18.978, para-norm 79.576, grad-norm 29.924, time 17.68s, 
epoch 20, batches 4000|7692, train-acc 0.679, loss 20.568, para-norm 79.577, grad-norm 31.269, time 18.85s, 
epoch 20, batches 5000|7692, train-acc 0.675, loss 20.828, para-norm 79.564, grad-norm 29.685, time 17.70s, 
epoch 20, batches 6000|7692, train-acc 0.686, loss 19.228, para-norm 79.547, grad-norm 35.719, time 16.38s, 
epoch 20, batches 7000|7692, train-acc 0.676, loss 21.591, para-norm 79.548, grad-norm 0.088, time 17.81s, 
epoch 20, batches 7692|7692, train-acc 0.678, loss 19.715, para-norm 79.558, grad-norm 41.390, time 12.06s, 
test before training starts
dev-acc 0.668


(0.667771592970167, 1756.0, 8921.0, 0.0, 2579.0, 13299.0, 111.0, 6183.0, 9806.0, 0.0)


epoch 21, batches 1000|7692, train-acc 0.688, loss 19.589, para-norm 79.565, grad-norm 73.085, time 17.64s, 
epoch 21, batches 2000|7692, train-acc 0.680, loss 20.746, para-norm 79.572, grad-norm 33.215, time 17.61s, 
epoch 21, batches 3000|7692, train-acc 0.680, loss 22.077, para-norm 79.554, grad-norm 11.988, time 18.22s, 
epoch 21, batches 4000|7692, train-acc 0.692, loss 18.685, para-norm 79.552, grad-norm 0.259, time 18.63s, 
epoch 21, batches 5000|7692, train-acc 0.678, loss 21.625, para-norm 79.550, grad-norm 68.316, time 17.86s, 
epoch 21, batches 6000|7692, train-acc 0.675, loss 21.012, para-norm 79.553, grad-norm 99.940, time 16.16s, 
epoch 21, batches 7000|7692, train-acc 0.692, loss 21.138, para-norm 79.567, grad-norm 24.534, time 19.10s, 
epoch 21, batches 7692|7692, train-acc 0.682, loss 20.834, para-norm 79.554, grad-norm 102.686, time 11.95s, 
test before training starts
dev-acc 0.668


(0.668146850960035, 1909.0, 8774.0, 0.0, 2872.0, 13017.0, 100.0, 6183.0, 9806.0, 0.0)


epoch 22, batches 1000|7692, train-acc 0.688, loss 19.726, para-norm 79.552, grad-norm 5.777, time 18.96s, 
epoch 22, batches 2000|7692, train-acc 0.687, loss 20.000, para-norm 79.554, grad-norm 59.791, time 18.89s, 
epoch 22, batches 3000|7692, train-acc 0.690, loss 21.373, para-norm 79.546, grad-norm 32.093, time 18.70s, 
epoch 22, batches 4000|7692, train-acc 0.683, loss 20.349, para-norm 79.533, grad-norm 266.205, time 17.37s, 
epoch 22, batches 5000|7692, train-acc 0.685, loss 20.733, para-norm 79.538, grad-norm 9.876, time 17.97s, 
epoch 22, batches 6000|7692, train-acc 0.684, loss 20.757, para-norm 79.543, grad-norm 17.436, time 18.08s, 
epoch 22, batches 7000|7692, train-acc 0.688, loss 21.833, para-norm 79.535, grad-norm 0.036, time 18.55s, 
epoch 22, batches 7692|7692, train-acc 0.679, loss 20.758, para-norm 79.545, grad-norm 23.348, time 12.88s, 
test before training starts
dev-acc 0.666


(0.6663956470073176, 1842.0, 8813.0, 0.0, 2792.0, 13096.0, 101.0, 6183.0, 9806.0, 0.0)


epoch 23, batches 1000|7692, train-acc 0.679, loss 21.363, para-norm 79.558, grad-norm 0.296, time 20.54s, 
epoch 23, batches 2000|7692, train-acc 0.685, loss 22.000, para-norm 79.550, grad-norm 16.398, time 20.05s, 
epoch 23, batches 3000|7692, train-acc 0.705, loss 20.268, para-norm 79.550, grad-norm 103.346, time 18.42s, 
epoch 23, batches 4000|7692, train-acc 0.697, loss 21.270, para-norm 79.554, grad-norm 0.272, time 18.64s, 
epoch 23, batches 5000|7692, train-acc 0.676, loss 20.626, para-norm 79.545, grad-norm 39.852, time 17.97s, 
epoch 23, batches 6000|7692, train-acc 0.688, loss 18.841, para-norm 79.551, grad-norm 20.718, time 18.67s, 
epoch 23, batches 7000|7692, train-acc 0.690, loss 20.280, para-norm 79.547, grad-norm 36.791, time 18.22s, 
epoch 23, batches 7692|7692, train-acc 0.705, loss 19.198, para-norm 79.556, grad-norm 0.350, time 12.35s, 
test before training starts
dev-acc 0.667


(0.6673337919819876, 1740.0, 8930.0, 0.0, 2577.0, 13337.0, 75.0, 6183.0, 9806.0, 0.0)


epoch 24, batches 1000|7692, train-acc 0.703, loss 20.486, para-norm 79.572, grad-norm 99.164, time 18.31s, 
epoch 24, batches 2000|7692, train-acc 0.679, loss 19.894, para-norm 79.575, grad-norm 26.676, time 18.22s, 
epoch 24, batches 3000|7692, train-acc 0.700, loss 19.926, para-norm 79.571, grad-norm 0.000, time 19.83s, 
epoch 24, batches 4000|7692, train-acc 0.691, loss 21.183, para-norm 79.570, grad-norm 100.763, time 18.65s, 
epoch 24, batches 5000|7692, train-acc 0.686, loss 19.659, para-norm 79.579, grad-norm 10.192, time 17.66s, 
epoch 24, batches 6000|7692, train-acc 0.690, loss 21.822, para-norm 79.581, grad-norm 1.329, time 19.32s, 
epoch 24, batches 7000|7692, train-acc 0.692, loss 21.303, para-norm 79.576, grad-norm 0.585, time 18.86s, 
epoch 24, batches 7692|7692, train-acc 0.672, loss 20.827, para-norm 79.566, grad-norm 104.917, time 13.19s, 
test before training starts
dev-acc 0.665


(0.6646444430546, 1958.0, 8669.0, 0.0, 3031.0, 12824.0, 134.0, 6183.0, 9806.0, 0.0)


epoch 25, batches 1000|7692, train-acc 0.694, loss 18.733, para-norm 79.573, grad-norm 37.131, time 17.43s, 
epoch 25, batches 2000|7692, train-acc 0.695, loss 21.212, para-norm 79.582, grad-norm 100.925, time 18.45s, 
epoch 25, batches 3000|7692, train-acc 0.688, loss 21.854, para-norm 79.589, grad-norm 90.948, time 17.43s, 
epoch 25, batches 4000|7692, train-acc 0.701, loss 19.687, para-norm 79.588, grad-norm 39.206, time 16.87s, 
epoch 25, batches 5000|7692, train-acc 0.684, loss 21.965, para-norm 79.587, grad-norm 27.392, time 18.64s, 
epoch 25, batches 6000|7692, train-acc 0.697, loss 20.156, para-norm 79.581, grad-norm 140.441, time 17.29s, 
epoch 25, batches 7000|7692, train-acc 0.695, loss 20.905, para-norm 79.573, grad-norm 143.784, time 17.44s, 
epoch 25, batches 7692|7692, train-acc 0.685, loss 19.994, para-norm 79.585, grad-norm 2.619, time 11.86s, 
test before training starts
dev-acc 0.668


(0.6678341359684783, 1896.0, 8782.0, 0.0, 2860.0, 13037.0, 92.0, 6183.0, 9806.0, 0.0)


epoch 26, batches 1000|7692, train-acc 0.702, loss 18.925, para-norm 79.593, grad-norm 39.944, time 19.11s, 
epoch 26, batches 2000|7692, train-acc 0.682, loss 21.143, para-norm 79.573, grad-norm 14.276, time 17.86s, 
epoch 26, batches 3000|7692, train-acc 0.703, loss 20.116, para-norm 79.589, grad-norm 0.288, time 18.65s, 
epoch 26, batches 4000|7692, train-acc 0.678, loss 21.090, para-norm 79.588, grad-norm 64.352, time 16.58s, 
epoch 26, batches 5000|7692, train-acc 0.702, loss 21.172, para-norm 79.598, grad-norm 6.174, time 18.47s, 
epoch 26, batches 6000|7692, train-acc 0.702, loss 20.739, para-norm 79.601, grad-norm 112.353, time 18.39s, 
epoch 26, batches 7000|7692, train-acc 0.680, loss 20.854, para-norm 79.596, grad-norm 29.257, time 18.09s, 
epoch 26, batches 7692|7692, train-acc 0.686, loss 20.197, para-norm 79.593, grad-norm 0.000, time 12.33s, 
test before training starts
dev-acc 0.664


(0.6638939270748639, 2033.0, 8582.0, 0.0, 3215.0, 12689.0, 85.0, 6183.0, 9806.0, 0.0)


epoch 27, batches 1000|7692, train-acc 0.702, loss 20.736, para-norm 79.607, grad-norm 106.413, time 18.47s, 
epoch 27, batches 2000|7692, train-acc 0.687, loss 21.505, para-norm 79.613, grad-norm 16.622, time 19.48s, 
epoch 27, batches 3000|7692, train-acc 0.695, loss 20.073, para-norm 79.613, grad-norm 0.027, time 20.95s, 
epoch 27, batches 4000|7692, train-acc 0.695, loss 20.312, para-norm 79.615, grad-norm 85.386, time 18.46s, 
epoch 27, batches 5000|7692, train-acc 0.690, loss 21.279, para-norm 79.610, grad-norm 0.003, time 17.93s, 
epoch 27, batches 6000|7692, train-acc 0.687, loss 19.418, para-norm 79.597, grad-norm 91.471, time 17.30s, 
epoch 27, batches 7000|7692, train-acc 0.718, loss 19.643, para-norm 79.589, grad-norm 38.598, time 18.53s, 
epoch 27, batches 7692|7692, train-acc 0.684, loss 20.953, para-norm 79.606, grad-norm 0.628, time 11.73s, 
test before training starts
dev-acc 0.667


(0.6667709049971856, 1974.0, 8687.0, 0.0, 3028.0, 12836.0, 125.0, 6183.0, 9806.0, 0.0)


epoch 28, batches 1000|7692, train-acc 0.701, loss 19.732, para-norm 79.611, grad-norm 0.044, time 17.26s, 
epoch 28, batches 2000|7692, train-acc 0.681, loss 20.326, para-norm 79.605, grad-norm 21.907, time 17.86s, 
epoch 28, batches 3000|7692, train-acc 0.691, loss 22.479, para-norm 79.602, grad-norm 49.224, time 19.70s, 
epoch 28, batches 4000|7692, train-acc 0.695, loss 18.950, para-norm 79.610, grad-norm 206.980, time 19.07s, 
epoch 28, batches 5000|7692, train-acc 0.706, loss 20.536, para-norm 79.615, grad-norm 2.998, time 17.17s, 
epoch 28, batches 6000|7692, train-acc 0.702, loss 21.177, para-norm 79.617, grad-norm 50.701, time 18.46s, 
epoch 28, batches 7000|7692, train-acc 0.694, loss 20.377, para-norm 79.595, grad-norm 71.767, time 17.39s, 
epoch 28, batches 7692|7692, train-acc 0.680, loss 21.082, para-norm 79.599, grad-norm 12.576, time 11.80s, 
test before training starts
dev-acc 0.664


(0.664269185064732, 2086.0, 8535.0, 0.0, 3282.0, 12597.0, 110.0, 6183.0, 9806.0, 0.0)


epoch 29, batches 1000|7692, train-acc 0.692, loss 19.844, para-norm 79.611, grad-norm 27.742, time 18.51s, 
epoch 29, batches 2000|7692, train-acc 0.693, loss 19.253, para-norm 79.611, grad-norm 4.131, time 17.68s, 
epoch 29, batches 3000|7692, train-acc 0.696, loss 20.481, para-norm 79.626, grad-norm 87.389, time 17.93s, 
epoch 29, batches 4000|7692, train-acc 0.695, loss 19.416, para-norm 79.618, grad-norm 0.641, time 19.18s, 
epoch 29, batches 5000|7692, train-acc 0.702, loss 20.790, para-norm 79.622, grad-norm 6.581, time 18.69s, 
epoch 29, batches 6000|7692, train-acc 0.704, loss 20.645, para-norm 79.625, grad-norm 8.874, time 17.91s, 
epoch 29, batches 7000|7692, train-acc 0.696, loss 22.135, para-norm 79.636, grad-norm 76.878, time 18.19s, 
epoch 29, batches 7692|7692, train-acc 0.696, loss 21.637, para-norm 79.634, grad-norm 0.131, time 12.19s, 
test before training starts
dev-acc 0.666


(0.6664581900056289, 1881.0, 8775.0, 0.0, 2862.0, 13030.0, 97.0, 6183.0, 9806.0, 0.0)


epoch 30, batches 1000|7692, train-acc 0.696, loss 19.466, para-norm 79.636, grad-norm 147.083, time 17.42s, 
epoch 30, batches 2000|7692, train-acc 0.701, loss 21.907, para-norm 79.651, grad-norm 131.422, time 17.08s, 
epoch 30, batches 3000|7692, train-acc 0.701, loss 21.566, para-norm 79.653, grad-norm 0.002, time 18.02s, 
epoch 30, batches 4000|7692, train-acc 0.696, loss 19.833, para-norm 79.655, grad-norm 37.518, time 18.32s, 
epoch 30, batches 5000|7692, train-acc 0.711, loss 19.161, para-norm 79.652, grad-norm 33.415, time 17.45s, 
epoch 30, batches 6000|7692, train-acc 0.691, loss 20.969, para-norm 79.649, grad-norm 76.113, time 17.90s, 
epoch 30, batches 7000|7692, train-acc 0.698, loss 19.334, para-norm 79.655, grad-norm 63.549, time 17.42s, 
epoch 30, batches 7692|7692, train-acc 0.691, loss 21.362, para-norm 79.652, grad-norm 59.531, time 13.19s, 
test before training starts
dev-acc 0.666


(0.6658953030208268, 1867.0, 8780.0, 0.0, 2825.0, 13060.0, 104.0, 6183.0, 9806.0, 0.0)


epoch 31, batches 1000|7692, train-acc 0.703, loss 21.829, para-norm 79.655, grad-norm 64.548, time 17.86s, 
epoch 31, batches 2000|7692, train-acc 0.698, loss 19.973, para-norm 79.646, grad-norm 0.273, time 17.63s, 
epoch 31, batches 3000|7692, train-acc 0.701, loss 21.559, para-norm 79.646, grad-norm 0.368, time 18.80s, 
epoch 31, batches 4000|7692, train-acc 0.699, loss 21.032, para-norm 79.643, grad-norm 13.510, time 18.25s, 
epoch 31, batches 5000|7692, train-acc 0.720, loss 18.441, para-norm 79.640, grad-norm 76.087, time 17.41s, 
epoch 31, batches 6000|7692, train-acc 0.692, loss 19.929, para-norm 79.650, grad-norm 2.415, time 18.11s, 
epoch 31, batches 7000|7692, train-acc 0.685, loss 20.474, para-norm 79.649, grad-norm 2.277, time 17.67s, 
epoch 31, batches 7692|7692, train-acc 0.687, loss 19.509, para-norm 79.643, grad-norm 54.782, time 13.17s, 
test before training starts
dev-acc 0.660


(0.6598911751829383, 2149.0, 8402.0, 0.0, 3467.0, 12398.0, 124.0, 6183.0, 9806.0, 0.0)


epoch 32, batches 1000|7692, train-acc 0.694, loss 20.877, para-norm 79.647, grad-norm 72.936, time 17.89s, 
epoch 32, batches 2000|7692, train-acc 0.699, loss 20.683, para-norm 79.641, grad-norm 13.205, time 18.28s, 
epoch 32, batches 3000|7692, train-acc 0.701, loss 20.915, para-norm 79.652, grad-norm 20.832, time 17.30s, 
epoch 32, batches 4000|7692, train-acc 0.712, loss 18.946, para-norm 79.665, grad-norm 6.080, time 17.13s, 
epoch 32, batches 5000|7692, train-acc 0.691, loss 20.019, para-norm 79.666, grad-norm 4.921, time 19.04s, 
epoch 32, batches 6000|7692, train-acc 0.699, loss 19.857, para-norm 79.669, grad-norm 92.779, time 17.66s, 
epoch 32, batches 7000|7692, train-acc 0.709, loss 21.776, para-norm 79.671, grad-norm 0.641, time 18.16s, 
epoch 32, batches 7692|7692, train-acc 0.708, loss 20.053, para-norm 79.678, grad-norm 33.391, time 10.97s, 
test before training starts
dev-acc 0.665


(0.6647069860529113, 2120.0, 8508.0, 0.0, 3351.0, 12518.0, 120.0, 6183.0, 9806.0, 0.0)


epoch 33, batches 1000|7692, train-acc 0.693, loss 21.576, para-norm 79.668, grad-norm 26.249, time 18.91s, 
epoch 33, batches 2000|7692, train-acc 0.712, loss 19.865, para-norm 79.699, grad-norm 46.489, time 17.62s, 
epoch 33, batches 3000|7692, train-acc 0.710, loss 19.220, para-norm 79.705, grad-norm 9.879, time 16.96s, 
epoch 33, batches 4000|7692, train-acc 0.706, loss 19.343, para-norm 79.712, grad-norm 45.843, time 17.28s, 
epoch 33, batches 5000|7692, train-acc 0.702, loss 21.571, para-norm 79.707, grad-norm 122.587, time 17.99s, 
epoch 33, batches 6000|7692, train-acc 0.714, loss 20.046, para-norm 79.703, grad-norm 56.509, time 18.74s, 
epoch 33, batches 7000|7692, train-acc 0.701, loss 20.410, para-norm 79.703, grad-norm 11.746, time 19.12s, 
epoch 33, batches 7692|7692, train-acc 0.702, loss 21.319, para-norm 79.709, grad-norm 17.708, time 13.23s, 
test before training starts
dev-acc 0.663


(0.6630808680968165, 2224.0, 8378.0, 0.0, 3585.0, 12290.0, 114.0, 6183.0, 9806.0, 0.0)


epoch 34, batches 1000|7692, train-acc 0.704, loss 20.534, para-norm 79.705, grad-norm 36.717, time 18.89s, 
epoch 34, batches 2000|7692, train-acc 0.716, loss 18.873, para-norm 79.714, grad-norm 68.121, time 18.45s, 
epoch 34, batches 3000|7692, train-acc 0.711, loss 20.145, para-norm 79.725, grad-norm 0.760, time 19.61s, 
epoch 34, batches 4000|7692, train-acc 0.706, loss 20.054, para-norm 79.728, grad-norm 71.580, time 17.31s, 
epoch 34, batches 5000|7692, train-acc 0.697, loss 20.974, para-norm 79.734, grad-norm 37.403, time 18.44s, 
epoch 34, batches 6000|7692, train-acc 0.695, loss 20.087, para-norm 79.730, grad-norm 3.191, time 19.45s, 
epoch 34, batches 7000|7692, train-acc 0.714, loss 20.050, para-norm 79.734, grad-norm 2.609, time 18.90s, 
epoch 34, batches 7692|7692, train-acc 0.694, loss 22.410, para-norm 79.734, grad-norm 0.000, time 11.98s, 
test before training starts
dev-acc 0.664


(0.664081556069798, 2232.0, 8386.0, 0.0, 3588.0, 12301.0, 100.0, 6183.0, 9806.0, 0.0)


epoch 35, batches 1000|7692, train-acc 0.717, loss 19.940, para-norm 79.746, grad-norm 23.261, time 16.63s, 
epoch 35, batches 2000|7692, train-acc 0.700, loss 20.369, para-norm 79.753, grad-norm 55.937, time 18.38s, 
epoch 35, batches 3000|7692, train-acc 0.711, loss 20.062, para-norm 79.762, grad-norm 124.740, time 17.89s, 
epoch 35, batches 4000|7692, train-acc 0.708, loss 20.003, para-norm 79.756, grad-norm 58.804, time 17.86s, 
epoch 35, batches 5000|7692, train-acc 0.713, loss 20.771, para-norm 79.762, grad-norm 0.488, time 19.28s, 
epoch 35, batches 6000|7692, train-acc 0.717, loss 20.586, para-norm 79.770, grad-norm 3.716, time 18.42s, 
epoch 35, batches 7000|7692, train-acc 0.696, loss 20.078, para-norm 79.760, grad-norm 0.003, time 18.47s, 
epoch 35, batches 7692|7692, train-acc 0.705, loss 20.469, para-norm 79.753, grad-norm 0.436, time 12.05s, 
test before training starts
dev-acc 0.667


(0.6671461629870536, 2052.0, 8615.0, 0.0, 3185.0, 12710.0, 94.0, 6183.0, 9806.0, 0.0)


epoch 36, batches 1000|7692, train-acc 0.692, loss 20.418, para-norm 79.767, grad-norm 0.121, time 20.06s, 
epoch 36, batches 2000|7692, train-acc 0.711, loss 21.704, para-norm 79.785, grad-norm 155.604, time 17.64s, 
epoch 36, batches 3000|7692, train-acc 0.707, loss 20.925, para-norm 79.784, grad-norm 58.791, time 19.18s, 
epoch 36, batches 4000|7692, train-acc 0.720, loss 20.070, para-norm 79.781, grad-norm 50.610, time 18.15s, 
epoch 36, batches 5000|7692, train-acc 0.727, loss 19.698, para-norm 79.786, grad-norm 15.957, time 18.83s, 
epoch 36, batches 6000|7692, train-acc 0.702, loss 18.646, para-norm 79.784, grad-norm 69.789, time 16.98s, 
epoch 36, batches 7000|7692, train-acc 0.706, loss 20.402, para-norm 79.782, grad-norm 71.194, time 16.52s, 
epoch 36, batches 7692|7692, train-acc 0.714, loss 20.021, para-norm 79.776, grad-norm 59.036, time 11.98s, 
test before training starts
dev-acc 0.664


(0.6640190130714867, 2223.0, 8394.0, 0.0, 3576.0, 12321.0, 92.0, 6183.0, 9806.0, 0.0)


epoch 37, batches 1000|7692, train-acc 0.711, loss 20.393, para-norm 79.783, grad-norm 88.565, time 19.29s, 
epoch 37, batches 2000|7692, train-acc 0.718, loss 19.763, para-norm 79.785, grad-norm 12.713, time 17.74s, 
epoch 37, batches 3000|7692, train-acc 0.719, loss 20.132, para-norm 79.785, grad-norm 71.338, time 17.41s, 
epoch 37, batches 4000|7692, train-acc 0.707, loss 19.787, para-norm 79.784, grad-norm 215.941, time 19.54s, 
epoch 37, batches 5000|7692, train-acc 0.718, loss 20.798, para-norm 79.778, grad-norm 110.863, time 18.42s, 
epoch 37, batches 6000|7692, train-acc 0.703, loss 19.989, para-norm 79.777, grad-norm 85.437, time 18.33s, 
epoch 37, batches 7000|7692, train-acc 0.706, loss 21.736, para-norm 79.778, grad-norm 22.195, time 17.21s, 
epoch 37, batches 7692|7692, train-acc 0.713, loss 18.557, para-norm 79.781, grad-norm 74.946, time 11.35s, 
test before training starts
dev-acc 0.666


(0.6660829320157609, 2070.0, 8580.0, 0.0, 3266.0, 12650.0, 73.0, 6183.0, 9806.0, 0.0)


epoch 38, batches 1000|7692, train-acc 0.713, loss 19.316, para-norm 79.773, grad-norm 48.704, time 17.43s, 
epoch 38, batches 2000|7692, train-acc 0.718, loss 19.804, para-norm 79.792, grad-norm 0.193, time 18.35s, 
epoch 38, batches 3000|7692, train-acc 0.707, loss 20.143, para-norm 79.803, grad-norm 0.022, time 17.19s, 
epoch 38, batches 4000|7692, train-acc 0.698, loss 21.354, para-norm 79.810, grad-norm 57.019, time 18.23s, 


In [None]:
# test
best_model_fname = best_dev[-1][2]
input_encoder.load_state_dict(torch.load(best_model_fname + '_input-encoder.pt'))
inter_atten.load_state_dict(torch.load(best_model_fname + '_inter-atten.pt'))
seq_atten.load_state_dict(torch.load(best_model_fname + '_seq-atten.pt'))

input_encoder.eval()
inter_atten.eval()
seq_atten.eval()

test_acc(test_data)