In [1]:
# WIP: inspired by https://github.com/markdtw/meta-learning-lstm-pytorch
# modifying it to make it simpler
#import libraries

from __future__ import division, print_function, absolute_import
import os
import re
import pdb
import copy
import glob
import pickle
import numpy as np

import torch
import torch
import torch.nn as nn
import torch.utils.data as data
import torchvision.datasets as datasets
import torchvision.transforms as transforms
import PIL.Image as PILI

from tqdm import tqdm
from collections import OrderedDict
import random
import logging

#### Step 1: Data Loader 

In [2]:
class EpisodeDataset(data.Dataset):
    """Dataset whose items are per-class batches, used to assemble episodes.

    Item ``idx`` is the first batch drawn from a freshly shuffled DataLoader
    over the images of class ``idx``; the batch size is ``n_shot + n_eval``.
    """

    def __init__(self, root, phase='train', n_shot=5, n_eval=15, transform=None):
        """Args:
            root (str): path to data
            phase (str): train, val or test (sub-directory of ``root``)
            n_shot (int): how many examples per class for training (k/n_support)
            n_eval (int): how many examples per class for evaluation
                - n_shot + n_eval = batch_size for data.DataLoader of ClassDataset
            transform (torchvision.transforms): data augmentation
        """
        root = os.path.join(root, phase)
        # Only sub-directories are classes. A stray file in the phase folder
        # would otherwise become a class with zero images, whose loader cannot
        # produce a batch.
        self.labels = sorted(
            entry for entry in os.listdir(root)
            if os.path.isdir(os.path.join(root, entry)))
        images = [glob.glob(os.path.join(root, label, '*')) for label in self.labels]

        # One shuffled loader per class; the class index doubles as its label.
        self.episode_loader = [data.DataLoader(
            ClassDataset(images=class_images, label=idx, transform=transform),
            batch_size=n_shot+n_eval, shuffle=True, num_workers=0)
            for idx, class_images in enumerate(images)]

    def __getitem__(self, idx):
        # A new iterator is created on every call, so each access reshuffles.
        return next(iter(self.episode_loader[idx]))

    def __len__(self):
        return len(self.labels)

In [3]:
class ClassDataset(data.Dataset):
    """Images that all belong to one class and therefore share one label."""

    def __init__(self, images, label, transform=None):
        """Args:
            images (list of str): paths to the images of this class
            label (int): label returned alongside every image
            transform (callable, optional): applied to the RGB image if given
        """
        self.transform = transform
        self.label = label
        self.images = images

    def __len__(self):
        return len(self.images)

    def __getitem__(self, idx):
        # Always decode to 3-channel RGB before any transform is applied.
        rgb = PILI.open(self.images[idx]).convert('RGB')
        if self.transform is None:
            return rgb, self.label
        return self.transform(rgb), self.label

In [4]:
class EpisodicSampler(data.Sampler):
    """Batch sampler yielding, per episode, a random subset of class indices."""

    def __init__(self, total_classes, n_class, n_episode):
        """Args:
            total_classes (int): number of classes to draw from
            n_class (int): number of class indices per episode
            n_episode (int): number of episodes produced per iteration
        """
        self.n_episode = n_episode
        self.n_class = n_class
        self.total_classes = total_classes

    def __len__(self):
        return self.n_episode

    def __iter__(self):
        produced = 0
        while produced < self.n_episode:
            # First n_class entries of a random permutation: distinct indices.
            shuffled = torch.randperm(self.total_classes)
            yield shuffled[:self.n_class]
            produced += 1

In [5]:
def prepare_data(args):
    """Build the episodic train/val/test DataLoaders described by ``args``.

    Reads ``data_root``, ``image_size``, ``n_shot``, ``n_eval``, ``n_class``,
    ``episode`` and ``episode_val`` from ``args``.

    Returns:
        (train_loader, val_loader, test_loader)
    """
    image_size = args['image_size']
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

    def eval_transform():
        # Deterministic pipeline shared by val and test: resize to 8/7 of the
        # target size, then center-crop to the target size.
        return transforms.Compose([
            transforms.Resize(image_size * 8 // 7),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            normalize])

    def episode_set(phase, transform):
        return EpisodeDataset(args['data_root'], phase, args['n_shot'], args['n_eval'],
            transform=transform)

    def episode_loader(dataset, n_episode, workers):
        sampler = EpisodicSampler(len(dataset), args['n_class'], n_episode)
        return data.DataLoader(dataset, num_workers=workers, pin_memory=True,
            batch_sampler=sampler)

    # Training uses random crop / flip / colour jitter augmentation.
    train_transform = transforms.Compose([
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(
            brightness=0.4,
            contrast=0.4,
            saturation=0.4,
            hue=0.2),
        transforms.ToTensor(),
        normalize])

    train_set = episode_set('train', train_transform)
    val_set = episode_set('val', eval_transform())
    test_set = episode_set('test', eval_transform())

    train_loader = episode_loader(train_set, args['episode'], 4)
    val_loader = episode_loader(val_set, args['episode_val'], 2)
    test_loader = episode_loader(test_set, args['episode_val'], 2)

    return train_loader, val_loader, test_loader


#### Step 2: Learner 

In [6]:
class Learner(nn.Module):
    """Small conv net: four conv/batch-norm/ReLU/max-pool blocks plus a linear classifier.

    Besides the usual forward pass, it exposes helpers to read and overwrite
    its parameters as one flat vector.
    """

    def __init__(self, image_size, bn_eps, bn_momentum, n_classes):
        """Args:
            image_size (int): side length of the square input images
            bn_eps (float): eps of every BatchNorm2d layer
            bn_momentum (float): momentum of every BatchNorm2d layer
            n_classes (int): number of output logits
        """
        super(Learner, self).__init__()

        named_layers = []
        in_channels = 3
        for block in range(1, 5):
            named_layers.extend([
                ('conv{}'.format(block), nn.Conv2d(in_channels, 32, 3, padding=1)),
                ('norm{}'.format(block), nn.BatchNorm2d(32, bn_eps, bn_momentum)),
                ('relu{}'.format(block), nn.ReLU(inplace=False)),
                ('pool{}'.format(block), nn.MaxPool2d(2)),
            ])
            in_channels = 32
        self.model = nn.ModuleDict({'features': nn.Sequential(OrderedDict(named_layers))})

        # Four 2x2 poolings shrink each spatial side by a factor of 2**4.
        clr_in = image_size // 2**4
        self.model.update({'cls': nn.Linear(32 * clr_in * clr_in, n_classes)})
        self.criterion = nn.CrossEntropyLoss()

    def forward(self, x):
        features = self.model.features(x)
        flattened = torch.reshape(features, [features.size(0), -1])
        return self.model.cls(flattened)

    def get_flat_params(self):
        """Return every parameter of ``self.model`` concatenated into one 1-D tensor."""
        pieces = [param.view(-1) for param in self.model.parameters()]
        return torch.cat(pieces, 0)

    def copy_flat_params(self, cI):
        """Overwrite the parameter values in place with consecutive slices of ``cI``."""
        offset = 0
        for param in self.model.parameters():
            count = param.view(-1).size(0)
            param.data.copy_(cI[offset: offset + count].view_as(param))
            offset += count

    def transfer_params(self, learner_w_grad, cI):
        """Copy buffers from ``learner_w_grad`` and swap the parameters for slices of ``cI``."""
        # load_state_dict is used only to bring over the batch-norm running
        # mean/var; the parameter values it copies are replaced right below.
        self.load_state_dict(learner_w_grad.state_dict())

        # The replacements are plain tensors cloned from cI, not nn.Parameters.
        offset = 0
        for module in self.model.modules():
            if not isinstance(module, (nn.Conv2d, nn.BatchNorm2d, nn.Linear)):
                continue

            weight = module._parameters['weight']
            n_weight = weight.view(-1).size(0)
            module._parameters['weight'] = cI[offset: offset + n_weight].view_as(weight).clone()
            offset += n_weight

            bias = module._parameters['bias']
            if bias is None:
                continue
            n_bias = bias.view(-1).size(0)
            module._parameters['bias'] = cI[offset: offset + n_bias].view_as(bias).clone()
            offset += n_bias

    def reset_batch_stats(self):
        """Reset the running statistics of every BatchNorm2d layer."""
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                module.reset_running_stats()

#### Step 3: Meta Learner

In [7]:
class MetaLSTMCell(nn.Module):
    r"""LSTM-style cell whose cell state is the learner's flat parameter vector.

    The update implemented by ``forward`` is
    ``c_next = sigmoid(f_next) * c_prev - sigmoid(i_next) * grad``.
    """

    def __init__(self, input_size, hidden_size, n_learner_params):
        """Args:
            input_size (int): cell input size, default = 20
            hidden_size (int): should be 1
            n_learner_params (int): number of learner's parameters
        """
        super(MetaLSTMCell, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_learner_params = n_learner_params

        # Each gate sees the cell input plus two extra columns:
        # the previous cell state and the previous value of that gate.
        gate_in = input_size + 2
        self.WF = nn.Parameter(torch.Tensor(gate_in, hidden_size))
        self.WI = nn.Parameter(torch.Tensor(gate_in, hidden_size))
        self.cI = nn.Parameter(torch.Tensor(n_learner_params, 1))
        self.bI = nn.Parameter(torch.Tensor(1, hidden_size))
        self.bF = nn.Parameter(torch.Tensor(1, hidden_size))

        self.reset_parameters()

    def reset_parameters(self):
        """Initialise all parameters to small values, then bias the gates."""
        for param in self.parameters():
            nn.init.uniform_(param, -0.01, 0.01)

        # High forget bias and low input bias, so that the model starts out
        # close to plain gradient descent.
        nn.init.uniform_(self.bF, 4, 6)
        nn.init.uniform_(self.bI, -5, -4)

    def init_cI(self, flat_params):
        """Copy a 1-D parameter vector into the initial cell state ``cI``."""
        self.cI.data.copy_(flat_params.unsqueeze(1))

    def forward(self, inputs, hx=None):
        """Args:
            inputs = [x_all, grad]:
                x_all (torch.Tensor of size [n_learner_params, input_size]): outputs from previous LSTM
                grad (torch.Tensor of size [n_learner_params]): gradients from learner
            hx = [f_prev, i_prev, c_prev]:
                f (torch.Tensor of size [n_learner_params, 1]): forget gate
                i (torch.Tensor of size [n_learner_params, 1]): input gate
                c (torch.Tensor of size [n_learner_params, 1]): flattened learner parameters

        Returns:
            (c_next, [f_next, i_next, c_next]); the gates are returned before the sigmoid.
        """
        lstm_out, grad = inputs
        n_rows, _ = lstm_out.size()

        if hx is None:
            # First step: zero gates, cell state starts at the learned cI.
            hx = [
                torch.zeros((n_rows, self.hidden_size)).to(self.WF.device),
                torch.zeros((n_rows, self.hidden_size)).to(self.WI.device),
                self.cI,
            ]
        f_prev, i_prev, c_prev = hx

        # f_t = sigmoid(W_f * [grad_t, loss_t, theta_{t-1}, f_{t-1}] + b_f)
        forget_in = torch.cat((lstm_out, c_prev, f_prev), 1)
        f_next = torch.mm(forget_in, self.WF) + self.bF.expand_as(f_prev)

        # i_t = sigmoid(W_i * [grad_t, loss_t, theta_{t-1}, i_{t-1}] + b_i)
        input_in = torch.cat((lstm_out, c_prev, i_prev), 1)
        i_next = torch.mm(input_in, self.WI) + self.bI.expand_as(i_prev)

        # next cell state, i.e. the next learner parameters
        keep = torch.sigmoid(f_next)
        step = torch.sigmoid(i_next)
        c_next = keep.mul(c_prev) - step.mul(grad)

        return c_next, [f_next, i_next, c_next]

    def extra_repr(self):
        return '{}, {}, {}'.format(self.input_size, self.hidden_size, self.n_learner_params)


class MetaLearner(nn.Module):
    """Two-layer meta-learner: a standard LSTMCell feeding a MetaLSTMCell."""

    def __init__(self, input_size, hidden_size, n_learner_params):
        """Args:
            input_size (int): for the first LSTM layer, default = 4
            hidden_size (int): for the first LSTM layer, default = 20
            n_learner_params (int): number of learner's parameters
        """
        super(MetaLearner, self).__init__()
        self.lstm = nn.LSTMCell(input_size=input_size, hidden_size=hidden_size)
        self.metalstm = MetaLSTMCell(input_size=hidden_size, hidden_size=1, n_learner_params=n_learner_params)

    def forward(self, inputs, hs=None):
        """Args:
            inputs = [loss, grad_prep, grad]
                loss (torch.Tensor of size [1, 2])
                grad_prep (torch.Tensor of size [n_learner_params, 2])
                grad (torch.Tensor of size [n_learner_params])
            hs = [(lstm_hn, lstm_cn), [metalstm_fn, metalstm_in, metalstm_cn]]

        Returns:
            (squeezed new cell state of the MetaLSTMCell, updated ``hs``)
        """
        loss, grad_prep, grad = inputs

        # Repeat the loss features on every row, next to the gradient features.
        lstm_in = torch.cat((loss.expand_as(grad_prep), grad_prep), 1)   # [n_learner_params, 4]

        prev_lstm, prev_meta = (None, None) if hs is None else (hs[0], hs[1])

        lstmhx, lstmcx = self.lstm(lstm_in, prev_lstm)
        new_params, metalstm_hs = self.metalstm([lstmhx, grad], prev_meta)

        return new_params.squeeze(), [(lstmhx, lstmcx), metalstm_hs]

#### Step 4: utils 

In [8]:
class GOATLogger:
    """Configures the root logger and accumulates per-episode loss/accuracy."""

    def __init__(self, args):
        """Set up logging for ``args['mode']``.

        In 'train' mode messages go to ``<save_root>/console.log`` and to a
        console handler; in any other mode only ``logging.basicConfig`` is called.
        """
        self.mode = args['mode']
        self.save_root = "./" + '-{}'.format(2019)
        self.log_freq = 100

        record_format = '%(asctime)s.%(msecs)03d - %(message)s'
        date_format = '%b-%d %H:%M:%S'

        if self.mode != 'train':
            logging.basicConfig(level=logging.INFO,
                format=record_format,
                datefmt=date_format)
        else:
            if not os.path.exists(self.save_root):
                os.mkdir(self.save_root)
            filename = os.path.join(self.save_root, 'console.log')
            logging.basicConfig(level=logging.DEBUG,
                format=record_format,
                datefmt=date_format,
                filename=filename,
                filemode='w')

            # Additional handler on the root logger with a bare message format.
            console = logging.StreamHandler()
            console.setLevel(logging.INFO)
            console.setFormatter(logging.Formatter('%(message)s'))
            logging.getLogger('').addHandler(console)

            logging.info("Logger created at {}".format(filename))

        logging.info("Random Seed: {}".format(2019))
        self.reset_stats()

    def reset_stats(self):
        """Drop all accumulated statistics."""
        stats = {}
        if self.mode == 'train':
            stats['train'] = {'loss': [], 'acc': []}
        stats['eval'] = {'loss': [], 'acc': []}
        self.stats = stats

    def batch_info(self, **kwargs):
        """Record or summarise statistics depending on ``kwargs['phase']``.

        - 'train': store loss/acc; every ``log_freq`` episodes (except episode 0)
          log the current and running-mean values.
        - 'eval': store loss/acc.
        - 'evaldone': log mean/std of the stored eval values, reset all
          statistics and return the mean accuracy.

        Raises:
            ValueError: for any other phase.
        """
        phase = kwargs['phase']

        if phase == 'evaldone':
            eval_losses = self.stats['eval']['loss']
            eval_accs = self.stats['eval']['acc']
            loss_mean, loss_std = np.mean(eval_losses), np.std(eval_losses)
            acc_mean, acc_std = np.mean(eval_accs), np.std(eval_accs)
            self.loginfo("[{:5d}] Eval ({:3d} episode) - loss: {:6.4f} +- {:6.4f}, acc: {:6.3f} +- {:5.3f}%".format(
                kwargs['eps'], kwargs['totaleps'], loss_mean, loss_std, acc_mean, acc_std))
            self.reset_stats()
            return acc_mean

        if phase not in ('train', 'eval'):
            raise ValueError("phase {} not supported".format(phase))

        history = self.stats[phase]
        history['loss'].append(kwargs['loss'])
        history['acc'].append(kwargs['acc'])

        if phase == 'train' and kwargs['eps'] % self.log_freq == 0 and kwargs['eps'] != 0:
            loss_mean = np.mean(history['loss'])
            acc_mean = np.mean(history['acc'])
            self.loginfo("[{:5d}/{:5d}] loss: {:6.4f} ({:6.4f}), acc: {:6.3f}% ({:6.3f}%)".format(
                kwargs['eps'], kwargs['totaleps'], kwargs['loss'], loss_mean, kwargs['acc'], acc_mean))

    def logdebug(self, strout):
        logging.debug(strout)

    def loginfo(self, strout):
        logging.info(strout)


In [9]:
def accuracy(output, target, topk=(1,)):
    """Compute top-k accuracy in percent.

    Args:
        output (torch.Tensor): scores of size [batch, n_classes]
        target (torch.Tensor): class indices of size [batch]
        topk (tuple of int): the k values to evaluate

    Returns:
        float when a single k is requested, otherwise a list of floats in the
        order of ``topk``.
    """
    with torch.no_grad():
        maxk = max(topk)
        batch_size = target.size(0)

        # Indices of the maxk highest scores per sample, transposed to [maxk, batch].
        _, pred = output.topk(maxk, 1, True, True)
        pred = pred.t()
        correct = pred.eq(target.reshape(1, -1).expand_as(pred))

        res = []
        for k in topk:
            # reshape instead of view: ``correct`` inherits the transposed
            # layout of ``pred``, so for k > 1 the slice is not contiguous and
            # view(-1) raises a RuntimeError.
            correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True)
            res.append(correct_k.mul_(100.0 / batch_size))
        return res[0].item() if len(res) == 1 else [r.item() for r in res]

In [10]:
def save_ckpt(episode, metalearner, optim, save):
    """Write a checkpoint to ``<save>/ckpts/meta-learner-<episode>.pth.tar``.

    Args:
        episode (int): episode number, stored in the checkpoint and in the file name
        metalearner: object whose ``state_dict()`` is stored under 'metalearner'
        optim: object whose ``state_dict()`` is stored under 'optim'
        save (str): directory under which the ``ckpts`` folder is created
    """
    ckpt_dir = os.path.join(save, 'ckpts')
    # makedirs also creates ``save`` itself when it is missing and does not
    # fail if the directory already exists.
    os.makedirs(ckpt_dir, exist_ok=True)

    torch.save({
        'episode': episode,
        'metalearner': metalearner.state_dict(),
        'optim': optim.state_dict()
    }, os.path.join(ckpt_dir, 'meta-learner-{}.pth.tar'.format(episode)))

In [11]:
def resume_ckpt(metalearner, optim, resume, device):
    """Restore meta-learner and optimizer state from the checkpoint file ``resume``.

    Returns:
        (episode stored in the checkpoint, metalearner, optim) -- the two
        objects are the ones passed in, updated in place.
    """
    checkpoint = torch.load(resume, map_location=device)
    episode = checkpoint['episode']
    for target, key in ((metalearner, 'metalearner'), (optim, 'optim')):
        target.load_state_dict(checkpoint[key])
    return episode, metalearner, optim

In [12]:
def preprocess_grad_loss(x):
    """Map each entry of a 1-D tensor to a pair of scaled features.

    With p = 10, entries with |x| >= exp(-p) become (log(|x| + 1e-8) / p, sign(x));
    smaller entries become (-1, exp(p) * x).

    Returns:
        torch.Tensor: the two features stacked along dim 1.
    """
    p = 10
    magnitude = x.abs()
    is_large = (magnitude >= np.exp(-p)).to(torch.float32)
    is_small = 1 - is_large

    # first feature: scaled log-magnitude, or -1 for tiny entries
    log_feature = is_large * torch.log(magnitude + 1e-8) / p + is_small * -1
    # second feature: sign, or the rescaled value for tiny entries
    sign_feature = is_large * torch.sign(x) + is_small * np.exp(p) * x
    return torch.stack((log_feature, sign_feature), 1)

#### Step 5: Main

In [13]:
def meta_test(eps, eval_loader, learner_w_grad, learner_wo_grad, metalearner, args, logger):
    """Evaluate the meta-learner on every episode of ``eval_loader``.

    For each episode the learner is trained on the first ``n_shot`` samples of
    each class via ``train_learner`` and scored on the remaining samples.

    Returns:
        The value returned by ``logger.batch_info(..., phase='evaldone')``.
    """
    n_class, n_shot, n_eval = args['n_class'], args['n_shot'], args['n_eval']

    # Class c occupies rows [c * n, (c + 1) * n) after the reshape below, so
    # the targets are just each class index repeated n times.
    support_target = torch.LongTensor(np.repeat(range(n_class), n_shot))  # [n_class * n_shot]
    query_target = torch.LongTensor(np.repeat(range(n_class), n_eval))    # [n_class * n_eval]

    for episode_x, _ in tqdm(eval_loader, ascii=True):
        image_dims = episode_x.shape[-3:]
        support_input = episode_x[:, :n_shot].reshape(-1, *image_dims)  # [n_class * n_shot, :]
        query_input = episode_x[:, n_shot:].reshape(-1, *image_dims)    # [n_class * n_eval, :]

        # Train learner with metalearner
        for learner in (learner_w_grad, learner_wo_grad):
            learner.reset_batch_stats()
        learner_w_grad.train()
        learner_wo_grad.eval()
        cI = train_learner(learner_w_grad, metalearner, support_input, support_target, args)

        # Score the resulting parameters on the held-out samples.
        learner_wo_grad.transfer_params(learner_w_grad, cI)
        logits = learner_wo_grad(query_input)
        loss = learner_wo_grad.criterion(logits, query_target)
        acc = accuracy(logits, query_target)

        logger.batch_info(loss=loss.item(), acc=acc, phase='eval')

    return logger.batch_info(eps=eps, totaleps=args['episode_val'], phase='evaldone')


In [14]:
def train_learner(learner_w_grad, metalearner, train_input, train_target, args):
    """Run the inner loop: let the meta-learner update the learner's parameters.

    Starting from ``metalearner.metalstm.cI``, each mini-batch yields a loss and
    gradient from ``learner_w_grad``; the meta-learner turns them into the next
    flat parameter vector. Runs ``args['epoch']`` passes over ``train_input`` in
    chunks of ``args['batch_size']``.

    Returns:
        The flat parameter vector produced by the last meta-learner step.
    """
    batch_size = args['batch_size']
    flat_params = metalearner.metalstm.cI.data
    state = None  # meta-learner hidden state; None on the very first step

    for _ in range(args['epoch']):
        for start in range(0, len(train_input), batch_size):
            stop = start + batch_size
            x, y = train_input[start:stop], train_target[start:stop]

            # get the loss/grad at the current parameters
            learner_w_grad.copy_flat_params(flat_params)
            logits = learner_w_grad(x)
            loss = learner_w_grad.criterion(logits, y)
            acc = accuracy(logits, y)
            learner_w_grad.zero_grad()
            loss.backward()
            grad_pieces = [p.grad.data.view(-1) / batch_size for p in learner_w_grad.parameters()]
            grad = torch.cat(grad_pieces, 0)

            # preprocess grad & loss and metalearner forward
            grad_prep = preprocess_grad_loss(grad)  # [n_learner_params, 2]
            loss_prep = preprocess_grad_loss(loss.data.unsqueeze(0)) # [1, 2]
            flat_params, state = metalearner([loss_prep, grad_prep, grad.unsqueeze(1)], state)
            print("training loss: {:8.6f} acc: {:6.3f}, mean grad: {:8.6f}".format(loss, acc, torch.mean(grad)))

    return flat_params


In [15]:
def main(args):
    """Meta-train or meta-test the LSTM meta-learner, depending on ``args['mode']``.

    Seeds the RNGs, builds the data loaders, the two learner copies and the
    meta-learner, optionally restores a checkpoint from ``args['resume']``, and
    then either:
      - mode 'test': runs ``meta_test`` on the test loader and returns, or
      - otherwise: runs the meta-training loop, checkpointing and validating
        every 1000 episodes.

    Note that in test mode without ``args['resume']`` the meta-learner being
    evaluated is the freshly initialised one.
    """
    seed = 2019
    if seed is None:
        # randint needs integer bounds
        seed = random.randint(0, 1000)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    dev = torch.device('cpu')
    logger = GOATLogger(args)

    # Get data
    train_loader, val_loader, test_loader = prepare_data(args)

    # Set up learner, meta-learner
    learner_w_grad = Learner(args['image_size'], args['bn_eps'], args['bn_momentum'], args['n_class'])
    learner_wo_grad = copy.deepcopy(learner_w_grad)
    metalearner = MetaLearner(args['input_size'], args['hidden_size'], learner_w_grad.get_flat_params().size(0))
    metalearner.metalstm.init_cI(learner_w_grad.get_flat_params())

    # Set up loss, optimizer, learning rate scheduler
    optim = torch.optim.Adam(metalearner.parameters(), args['lr'])

    # Defined up front so test mode also works when no checkpoint is resumed.
    last_eps = 0
    if args['resume']:
        logger.loginfo("Initialized from: {}".format(args['resume']))
        last_eps, metalearner, optim = resume_ckpt(metalearner, optim, args['resume'], dev)

    if args['mode'] == 'test':
        _ = meta_test(last_eps, test_loader, learner_w_grad, learner_wo_grad, metalearner, args, logger)
        return

    best_acc = 0.0
    logger.loginfo("Start training")
    # Meta-training
    for eps, (episode_x, episode_y) in enumerate(train_loader):
        # episode_x.shape = [n_class, n_shot + n_eval, c, h, w]
        # episode_y.shape = [n_class, n_shot + n_eval] --> NEVER USED
        # Targets are the episode-local class indices 0..n_class-1, each
        # repeated once per sample of that class.
        train_input = episode_x[:, :args['n_shot']].reshape(-1, *episode_x.shape[-3:]) # [n_class * n_shot, :]
        train_target = torch.LongTensor(np.repeat(range(args['n_class']), args['n_shot'])) # [n_class * n_shot]
        test_input = episode_x[:, args['n_shot']:].reshape(-1, *episode_x.shape[-3:]) # [n_class * n_eval, :]
        test_target = torch.LongTensor(np.repeat(range(args['n_class']), args['n_eval'])) # [n_class * n_eval]

        # Train learner with metalearner
        learner_w_grad.reset_batch_stats()
        learner_wo_grad.reset_batch_stats()
        learner_w_grad.train()
        learner_wo_grad.train()
        cI = train_learner(learner_w_grad, metalearner, train_input, train_target, args)

        # Train meta-learner with validation loss
        learner_wo_grad.transfer_params(learner_w_grad, cI)
        output = learner_wo_grad(test_input)
        loss = learner_wo_grad.criterion(output, test_target)
        acc = accuracy(output, test_target)

        optim.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(metalearner.parameters(), args['grad_clip'])
        optim.step()

        logger.batch_info(eps=eps, totaleps=args['episode'], loss=loss.item(), acc=acc, phase='train')

        # Meta-validation
        if eps % 1000 == 0 and eps != 0:
            save_ckpt(eps, metalearner, optim, "./")
            acc = meta_test(eps, val_loader, learner_w_grad, learner_wo_grad, metalearner, args, logger)
            if acc > best_acc:
                best_acc = acc
                logger.loginfo("* Best accuracy so far *\n")

    logger.loginfo("Done")

In [16]:
if __name__ == '__main__':
    # Hyper-parameters for meta-training.
    args_train = {
        'mode': 'train',
        'n_shot': 5,
        'n_eval': 15,
        'n_class': 5,
        'input_size': 4,
        'hidden_size': 20,
        'lr': 1e-3,
        'episode': 500,
        'episode_val': 100,
        'epoch': 8,
        'batch_size': 16,
        'image_size': 84,
        'grad_clip': 0.25,
        'bn_momentum': 0.95,
        'bn_eps': 1e-3,
        'data': "miniimagenet",
        'data_root': "./data/miniImagenet/",
        'resume': None,
    }

    # Testing uses the same settings; only the mode differs.
    args_test = dict(args_train, mode='test')

    for banner, run_args in ((" BEGIN TRAINING: ", args_train), ("BEGIN TESTING", args_test)):
        print (banner)
        main(run_args)
    
    

Logger created at ./-2019/console.log
Random Seed: 2019
Start training


 BEGIN TRAINING: 
training loss: 1.818345 acc: 25.000, mean grad: 0.000053
training loss: 1.644813 acc:  0.000, mean grad: 0.000002
training loss: 1.790295 acc: 25.000, mean grad: 0.000063
training loss: 1.576223 acc:  0.000, mean grad: 0.000009
training loss: 1.766894 acc: 25.000, mean grad: 0.000054
training loss: 1.518992 acc: 22.222, mean grad: 0.000009
training loss: 1.745665 acc: 25.000, mean grad: 0.000053
training loss: 1.471501 acc: 22.222, mean grad: 0.000014
training loss: 1.726845 acc: 25.000, mean grad: 0.000048
training loss: 1.431588 acc: 33.333, mean grad: 0.000010
training loss: 1.710378 acc: 25.000, mean grad: 0.000043
training loss: 1.397009 acc: 44.444, mean grad: 0.000014
training loss: 1.695461 acc: 25.000, mean grad: 0.000042
training loss: 1.366491 acc: 55.556, mean grad: 0.000019
training loss: 1.681154 acc: 25.000, mean grad: 0.000040
training loss: 1.339876 acc: 55.556, mean grad: 0.000020
training loss: 1.605418 acc: 31.250, mean grad: -0.000042
training los

training loss: 1.548511 acc: 25.000, mean grad: 0.000006
training loss: 1.924238 acc:  0.000, mean grad: -0.000006
training loss: 1.540775 acc: 31.250, mean grad: 0.000001
training loss: 1.868076 acc:  0.000, mean grad: -0.000001
training loss: 1.533399 acc: 31.250, mean grad: 0.000006
training loss: 1.816997 acc: 11.111, mean grad: -0.000008
training loss: 1.526887 acc: 31.250, mean grad: 0.000005
training loss: 1.771737 acc: 22.222, mean grad: -0.000013
training loss: 1.521333 acc: 31.250, mean grad: 0.000010
training loss: 1.730765 acc: 22.222, mean grad: -0.000021
training loss: 1.516629 acc: 31.250, mean grad: 0.000009
training loss: 1.693472 acc: 22.222, mean grad: -0.000027
training loss: 1.512358 acc: 31.250, mean grad: 0.000007
training loss: 1.659215 acc: 22.222, mean grad: -0.000028
training loss: 1.508586 acc: 31.250, mean grad: 0.000003
training loss: 1.628090 acc: 22.222, mean grad: -0.000023
training loss: 1.536752 acc: 31.250, mean grad: -0.000009
training loss: 1.89329

training loss: 1.677855 acc: 25.000, mean grad: -0.000033
training loss: 1.615753 acc: 11.111, mean grad: -0.000014
training loss: 1.667161 acc: 25.000, mean grad: -0.000042
training loss: 1.576966 acc: 22.222, mean grad: -0.000014
training loss: 1.654901 acc: 25.000, mean grad: -0.000042
training loss: 1.541796 acc: 22.222, mean grad: -0.000017
training loss: 1.644382 acc: 12.500, mean grad: -0.000042
training loss: 1.510861 acc: 22.222, mean grad: -0.000020
training loss: 1.634868 acc: 12.500, mean grad: -0.000038
training loss: 1.482659 acc: 33.333, mean grad: -0.000018
training loss: 1.626018 acc: 12.500, mean grad: -0.000036
training loss: 1.457137 acc: 33.333, mean grad: -0.000016
training loss: 1.617258 acc: 18.750, mean grad: -0.000039
training loss: 1.433913 acc: 33.333, mean grad: -0.000015
training loss: 1.609069 acc: 18.750, mean grad: -0.000038
training loss: 1.413026 acc: 33.333, mean grad: -0.000013
training loss: 1.666476 acc: 25.000, mean grad: 0.000043
training loss: 

training loss: 1.545787 acc: 31.250, mean grad: -0.000031
training loss: 1.965320 acc:  0.000, mean grad: 0.000019
training loss: 1.547970 acc: 31.250, mean grad: -0.000031
training loss: 1.913669 acc: 11.111, mean grad: 0.000008
training loss: 1.548909 acc: 31.250, mean grad: -0.000032
training loss: 1.866336 acc: 11.111, mean grad: 0.000007
training loss: 1.549880 acc: 31.250, mean grad: -0.000023
training loss: 1.823274 acc: 11.111, mean grad: 0.000014
training loss: 1.550919 acc: 31.250, mean grad: -0.000025
training loss: 1.783915 acc: 11.111, mean grad: 0.000012
training loss: 1.552038 acc: 31.250, mean grad: -0.000016
training loss: 1.747998 acc: 11.111, mean grad: 0.000011
training loss: 1.553068 acc: 31.250, mean grad: -0.000014
training loss: 1.715202 acc: 11.111, mean grad: 0.000009
training loss: 1.554101 acc: 31.250, mean grad: -0.000013
training loss: 1.685208 acc: 11.111, mean grad: 0.000006
training loss: 1.515662 acc: 31.250, mean grad: 0.000003
training loss: 1.842015

training loss: 1.553959 acc: 12.500, mean grad: 0.000002
training loss: 1.912628 acc:  0.000, mean grad: -0.000007
training loss: 1.548029 acc: 12.500, mean grad: 0.000003
training loss: 1.865874 acc:  0.000, mean grad: -0.000012
training loss: 1.540588 acc: 12.500, mean grad: 0.000006
training loss: 1.822937 acc:  0.000, mean grad: -0.000011
training loss: 1.534034 acc: 12.500, mean grad: 0.000012
training loss: 1.783912 acc:  0.000, mean grad: -0.000015
training loss: 1.528636 acc: 12.500, mean grad: 0.000018
training loss: 1.748560 acc:  0.000, mean grad: -0.000017
training loss: 1.524016 acc: 18.750, mean grad: 0.000016
training loss: 1.716529 acc: 11.111, mean grad: -0.000017
training loss: 1.520083 acc: 18.750, mean grad: 0.000015
training loss: 1.687382 acc: 11.111, mean grad: -0.000020
training loss: 1.516654 acc: 18.750, mean grad: 0.000018
training loss: 1.660815 acc: 22.222, mean grad: -0.000021
training loss: 1.569333 acc:  6.250, mean grad: 0.000033
training loss: 1.875655

training loss: 1.533787 acc: 37.500, mean grad: 0.000018
training loss: 1.721704 acc:  0.000, mean grad: 0.000010
training loss: 1.535343 acc: 37.500, mean grad: 0.000017
training loss: 1.693513 acc:  0.000, mean grad: 0.000007
training loss: 1.534893 acc: 37.500, mean grad: 0.000015
training loss: 1.667233 acc: 11.111, mean grad: 0.000004
training loss: 1.534522 acc: 37.500, mean grad: 0.000013
training loss: 1.643312 acc: 11.111, mean grad: -0.000001
training loss: 1.534050 acc: 37.500, mean grad: 0.000012
training loss: 1.621789 acc: 22.222, mean grad: -0.000002
training loss: 1.533635 acc: 37.500, mean grad: 0.000011
training loss: 1.602411 acc: 22.222, mean grad: -0.000003
training loss: 1.533277 acc: 37.500, mean grad: 0.000008
training loss: 1.584947 acc: 33.333, mean grad: -0.000003
training loss: 1.533029 acc: 37.500, mean grad: 0.000006
training loss: 1.569191 acc: 33.333, mean grad: -0.000004
training loss: 1.600837 acc: 18.750, mean grad: -0.000010
training loss: 1.776595 a

training loss: 1.497232 acc: 31.250, mean grad: -0.000005
training loss: 1.829999 acc:  0.000, mean grad: 0.000019
training loss: 1.505761 acc: 31.250, mean grad: -0.000005
training loss: 1.780103 acc:  0.000, mean grad: 0.000019
training loss: 1.512410 acc: 31.250, mean grad: -0.000005
training loss: 1.732516 acc: 11.111, mean grad: 0.000018
training loss: 1.518696 acc: 31.250, mean grad: -0.000002
training loss: 1.688681 acc: 11.111, mean grad: 0.000016
training loss: 1.524488 acc: 31.250, mean grad: -0.000001
training loss: 1.649122 acc: 11.111, mean grad: 0.000017
training loss: 1.529767 acc: 31.250, mean grad: -0.000000
training loss: 1.613954 acc: 33.333, mean grad: 0.000015
training loss: 1.534633 acc: 31.250, mean grad: -0.000001
training loss: 1.582788 acc: 33.333, mean grad: 0.000012
training loss: 1.539141 acc: 31.250, mean grad: -0.000001
training loss: 1.555204 acc: 33.333, mean grad: 0.000011
training loss: 1.571529 acc: 31.250, mean grad: 0.000036
training loss: 1.901347

training loss: 1.486312 acc: 31.250, mean grad: 0.000006
training loss: 1.761653 acc: 11.111, mean grad: 0.000006
training loss: 1.494596 acc: 25.000, mean grad: 0.000004
training loss: 1.726277 acc: 11.111, mean grad: 0.000006
training loss: 1.501164 acc: 25.000, mean grad: 0.000003
training loss: 1.690986 acc: 22.222, mean grad: 0.000007
training loss: 1.507230 acc: 25.000, mean grad: 0.000001
training loss: 1.657847 acc: 22.222, mean grad: 0.000009
training loss: 1.512753 acc: 25.000, mean grad: 0.000002
training loss: 1.627978 acc: 22.222, mean grad: 0.000010
training loss: 1.517846 acc: 31.250, mean grad: 0.000002
training loss: 1.601890 acc: 22.222, mean grad: 0.000011
training loss: 1.522535 acc: 31.250, mean grad: 0.000001
training loss: 1.579172 acc: 22.222, mean grad: 0.000011
training loss: 1.526761 acc: 25.000, mean grad: 0.000000
training loss: 1.559462 acc: 44.444, mean grad: 0.000014
training loss: 1.591564 acc: 25.000, mean grad: -0.000051
training loss: 1.816568 acc: 1

training loss: 1.591355 acc: 18.750, mean grad: -0.000011
training loss: 1.727767 acc: 11.111, mean grad: -0.000036
training loss: 1.592482 acc: 18.750, mean grad: -0.000012
training loss: 1.696572 acc: 11.111, mean grad: -0.000038
training loss: 1.590141 acc: 18.750, mean grad: -0.000017
training loss: 1.663350 acc: 11.111, mean grad: -0.000027
training loss: 1.587325 acc: 18.750, mean grad: -0.000016
training loss: 1.631685 acc: 11.111, mean grad: -0.000026
training loss: 1.584625 acc: 18.750, mean grad: -0.000017
training loss: 1.603289 acc: 22.222, mean grad: -0.000025
training loss: 1.582257 acc: 18.750, mean grad: -0.000015
training loss: 1.578854 acc: 22.222, mean grad: -0.000021
training loss: 1.580306 acc: 18.750, mean grad: -0.000015
training loss: 1.558269 acc: 33.333, mean grad: -0.000019
training loss: 1.578660 acc: 18.750, mean grad: -0.000016
training loss: 1.541076 acc: 33.333, mean grad: -0.000016
training loss: 1.447617 acc: 43.750, mean grad: -0.000001
training loss:

training loss: 1.610139 acc: 18.750, mean grad: 0.000013
training loss: 1.785828 acc: 22.222, mean grad: -0.000010
training loss: 1.611970 acc: 12.500, mean grad: 0.000010
training loss: 1.723380 acc: 33.333, mean grad: -0.000011
training loss: 1.610051 acc: 12.500, mean grad: 0.000009
training loss: 1.655398 acc: 44.444, mean grad: -0.000009
training loss: 1.608013 acc: 12.500, mean grad: 0.000006
training loss: 1.592206 acc: 44.444, mean grad: -0.000012
training loss: 1.606465 acc:  6.250, mean grad: 0.000004
training loss: 1.540340 acc: 44.444, mean grad: -0.000013
training loss: 1.605395 acc: 12.500, mean grad: 0.000003
training loss: 1.499936 acc: 44.444, mean grad: -0.000015
training loss: 1.604607 acc:  6.250, mean grad: 0.000002
training loss: 1.469120 acc: 55.556, mean grad: -0.000015
training loss: 1.603959 acc: 12.500, mean grad: 0.000002
training loss: 1.445957 acc: 55.556, mean grad: -0.000015
training loss: 1.482372 acc: 31.250, mean grad: -0.000014
training loss: 1.92340

training loss: 1.532235 acc:  6.250, mean grad: 0.000006
training loss: 1.982100 acc:  0.000, mean grad: -0.000010
training loss: 1.541743 acc:  6.250, mean grad: 0.000006
training loss: 1.901576 acc:  0.000, mean grad: -0.000012
training loss: 1.550262 acc:  6.250, mean grad: 0.000007
training loss: 1.807729 acc:  0.000, mean grad: -0.000009
training loss: 1.558597 acc: 12.500, mean grad: 0.000007
training loss: 1.722669 acc:  0.000, mean grad: -0.000011
training loss: 1.566021 acc: 12.500, mean grad: 0.000006
training loss: 1.657039 acc:  0.000, mean grad: -0.000011
training loss: 1.572283 acc: 12.500, mean grad: 0.000007
training loss: 1.608618 acc:  0.000, mean grad: -0.000011
training loss: 1.577447 acc: 12.500, mean grad: 0.000006
training loss: 1.573421 acc: 22.222, mean grad: -0.000011
training loss: 1.581637 acc: 18.750, mean grad: 0.000006
training loss: 1.548010 acc: 44.444, mean grad: -0.000010
training loss: 1.424664 acc: 37.500, mean grad: 0.000005
training loss: 1.993947

training loss: 1.522079 acc: 31.250, mean grad: -0.000002
training loss: 2.153321 acc:  0.000, mean grad: -0.000026
training loss: 1.527368 acc: 25.000, mean grad: -0.000003
training loss: 2.055964 acc:  0.000, mean grad: -0.000022
training loss: 1.532482 acc: 25.000, mean grad: -0.000003
training loss: 1.939831 acc:  0.000, mean grad: -0.000021
training loss: 1.539938 acc: 25.000, mean grad: -0.000003
training loss: 1.836980 acc:  0.000, mean grad: -0.000022
training loss: 1.548025 acc: 25.000, mean grad: -0.000005
training loss: 1.758992 acc:  0.000, mean grad: -0.000023
training loss: 1.555808 acc: 31.250, mean grad: -0.000006
training loss: 1.701873 acc:  0.000, mean grad: -0.000023
training loss: 1.562608 acc: 31.250, mean grad: -0.000007
training loss: 1.660173 acc:  0.000, mean grad: -0.000024
training loss: 1.568331 acc: 25.000, mean grad: -0.000009
training loss: 1.629761 acc:  0.000, mean grad: -0.000021
training loss: 1.498671 acc: 25.000, mean grad: 0.000009
training loss: 

[  100/  500] loss: 1.5990 (1.6155), acc: 33.333% (22.086%)


training loss: 1.473080 acc: 50.000, mean grad: -0.000010
training loss: 2.128849 acc:  0.000, mean grad: -0.000040
training loss: 1.473708 acc: 50.000, mean grad: -0.000011
training loss: 2.027520 acc:  0.000, mean grad: -0.000039
training loss: 1.473706 acc: 50.000, mean grad: -0.000011
training loss: 1.907136 acc:  0.000, mean grad: -0.000038
training loss: 1.478385 acc: 43.750, mean grad: -0.000013
training loss: 1.800863 acc:  0.000, mean grad: -0.000033
training loss: 1.486657 acc: 43.750, mean grad: -0.000013
training loss: 1.720634 acc:  0.000, mean grad: -0.000034
training loss: 1.496267 acc: 43.750, mean grad: -0.000014
training loss: 1.662460 acc: 11.111, mean grad: -0.000033
training loss: 1.505770 acc: 37.500, mean grad: -0.000013
training loss: 1.620565 acc: 11.111, mean grad: -0.000033
training loss: 1.514810 acc: 43.750, mean grad: -0.000013
training loss: 1.590402 acc: 33.333, mean grad: -0.000033
training loss: 1.373509 acc: 31.250, mean grad: -0.000007
training loss:

training loss: 1.547016 acc: 12.500, mean grad: 0.000017
training loss: 2.229977 acc:  0.000, mean grad: 0.000031
training loss: 1.549480 acc: 12.500, mean grad: 0.000013
training loss: 2.080208 acc:  0.000, mean grad: 0.000028
training loss: 1.552416 acc: 12.500, mean grad: 0.000011
training loss: 1.899441 acc:  0.000, mean grad: 0.000020
training loss: 1.559313 acc: 12.500, mean grad: 0.000006
training loss: 1.750306 acc:  0.000, mean grad: 0.000017
training loss: 1.567112 acc: 18.750, mean grad: 0.000002
training loss: 1.648193 acc: 22.222, mean grad: 0.000010
training loss: 1.574076 acc: 18.750, mean grad: -0.000003
training loss: 1.580710 acc: 55.556, mean grad: 0.000006
training loss: 1.579781 acc: 12.500, mean grad: -0.000005
training loss: 1.536412 acc: 66.667, mean grad: 0.000004
training loss: 1.584124 acc: 18.750, mean grad: -0.000008
training loss: 1.507735 acc: 66.667, mean grad: 0.000000
training loss: 1.397527 acc: 43.750, mean grad: 0.000007
training loss: 2.338806 acc:

training loss: 1.406884 acc: 25.000, mean grad: -0.000014
training loss: 2.400234 acc:  0.000, mean grad: 0.000013
training loss: 1.416957 acc: 31.250, mean grad: -0.000015
training loss: 2.215071 acc:  0.000, mean grad: 0.000011
training loss: 1.435520 acc: 43.750, mean grad: -0.000019
training loss: 1.980842 acc:  0.000, mean grad: 0.000006
training loss: 1.460387 acc: 56.250, mean grad: -0.000018
training loss: 1.798095 acc:  0.000, mean grad: 0.000002
training loss: 1.483756 acc: 56.250, mean grad: -0.000018
training loss: 1.679187 acc: 11.111, mean grad: -0.000000
training loss: 1.503561 acc: 50.000, mean grad: -0.000020
training loss: 1.604052 acc: 22.222, mean grad: -0.000002
training loss: 1.519984 acc: 50.000, mean grad: -0.000020
training loss: 1.556821 acc: 22.222, mean grad: -0.000001
training loss: 1.533533 acc: 37.500, mean grad: -0.000020
training loss: 1.527767 acc: 44.444, mean grad: -0.000002
training loss: 1.466853 acc: 12.500, mean grad: 0.000021
training loss: 2.37

training loss: 1.406147 acc: 25.000, mean grad: -0.000003
training loss: 2.445801 acc:  0.000, mean grad: 0.000006
training loss: 1.405290 acc: 37.500, mean grad: -0.000002
training loss: 2.229115 acc:  0.000, mean grad: 0.000001
training loss: 1.409923 acc: 56.250, mean grad: 0.000001
training loss: 1.959930 acc:  0.000, mean grad: -0.000007
training loss: 1.428122 acc: 68.750, mean grad: 0.000002
training loss: 1.763112 acc:  0.000, mean grad: -0.000013
training loss: 1.449936 acc: 68.750, mean grad: 0.000003
training loss: 1.642172 acc: 22.222, mean grad: -0.000019
training loss: 1.469644 acc: 62.500, mean grad: 0.000003
training loss: 1.569057 acc: 33.333, mean grad: -0.000025
training loss: 1.486700 acc: 56.250, mean grad: 0.000003
training loss: 1.524744 acc: 44.444, mean grad: -0.000026
training loss: 1.501315 acc: 56.250, mean grad: 0.000002
training loss: 1.498800 acc: 44.444, mean grad: -0.000025
training loss: 1.476137 acc: 12.500, mean grad: 0.000017
training loss: 2.391137

training loss: 1.588018 acc:  6.250, mean grad: 0.000019
training loss: 2.665292 acc:  0.000, mean grad: 0.000009
training loss: 1.558011 acc:  0.000, mean grad: 0.000017
training loss: 2.385941 acc:  0.000, mean grad: 0.000006
training loss: 1.520133 acc:  0.000, mean grad: 0.000016
training loss: 2.060694 acc:  0.000, mean grad: 0.000002
training loss: 1.511334 acc:  6.250, mean grad: 0.000010
training loss: 1.841950 acc:  0.000, mean grad: -0.000001
training loss: 1.517405 acc: 18.750, mean grad: 0.000006
training loss: 1.712693 acc:  0.000, mean grad: -0.000004
training loss: 1.526611 acc: 50.000, mean grad: 0.000004
training loss: 1.635880 acc: 11.111, mean grad: -0.000004
training loss: 1.535395 acc: 50.000, mean grad: 0.000002
training loss: 1.589340 acc: 44.444, mean grad: -0.000005
training loss: 1.543261 acc: 43.750, mean grad: 0.000000
training loss: 1.561487 acc: 44.444, mean grad: -0.000003
training loss: 1.402900 acc: 31.250, mean grad: 0.000003
training loss: 2.858279 ac

training loss: 1.403172 acc: 25.000, mean grad: -0.000007
training loss: 2.980939 acc:  0.000, mean grad: 0.000016
training loss: 1.402938 acc: 25.000, mean grad: -0.000006
training loss: 2.637930 acc:  0.000, mean grad: 0.000019
training loss: 1.416363 acc: 31.250, mean grad: -0.000008
training loss: 2.192138 acc: 11.111, mean grad: 0.000019
training loss: 1.452586 acc: 37.500, mean grad: -0.000008
training loss: 1.872547 acc: 11.111, mean grad: 0.000021
training loss: 1.490016 acc: 37.500, mean grad: -0.000007
training loss: 1.681942 acc: 11.111, mean grad: 0.000022
training loss: 1.518232 acc: 25.000, mean grad: -0.000005
training loss: 1.569527 acc: 22.222, mean grad: 0.000023
training loss: 1.537352 acc: 25.000, mean grad: -0.000003
training loss: 1.502456 acc: 44.444, mean grad: 0.000023
training loss: 1.550048 acc: 18.750, mean grad: -0.000002
training loss: 1.462709 acc: 55.556, mean grad: 0.000021
training loss: 1.427338 acc: 25.000, mean grad: -0.000022
training loss: 3.03636

training loss: 1.549265 acc: 18.750, mean grad: 0.000026
training loss: 3.086335 acc:  0.000, mean grad: 0.000047
training loss: 1.516106 acc: 31.250, mean grad: 0.000022
training loss: 2.722183 acc:  0.000, mean grad: 0.000035
training loss: 1.483240 acc: 31.250, mean grad: 0.000015
training loss: 2.258080 acc:  0.000, mean grad: 0.000020
training loss: 1.491593 acc: 31.250, mean grad: 0.000008
training loss: 1.922149 acc: 11.111, mean grad: 0.000008
training loss: 1.518305 acc: 31.250, mean grad: 0.000003
training loss: 1.716406 acc: 22.222, mean grad: 0.000006
training loss: 1.545050 acc: 25.000, mean grad: 0.000000
training loss: 1.594480 acc: 22.222, mean grad: 0.000004
training loss: 1.565626 acc: 25.000, mean grad: -0.000001
training loss: 1.521824 acc: 55.556, mean grad: -0.000003
training loss: 1.580047 acc: 12.500, mean grad: -0.000001
training loss: 1.478067 acc: 66.667, mean grad: -0.000004
training loss: 1.336753 acc: 31.250, mean grad: 0.000008
training loss: 3.323504 acc

training loss: 1.389965 acc: 50.000, mean grad: 0.000050
training loss: 3.349915 acc:  0.000, mean grad: 0.000025
training loss: 1.351480 acc: 50.000, mean grad: 0.000046
training loss: 2.826499 acc:  0.000, mean grad: 0.000015
training loss: 1.328123 acc: 50.000, mean grad: 0.000034
training loss: 2.153151 acc:  0.000, mean grad: 0.000005
training loss: 1.367449 acc: 62.500, mean grad: 0.000020
training loss: 1.688140 acc: 11.111, mean grad: 0.000003
training loss: 1.422997 acc: 56.250, mean grad: 0.000015
training loss: 1.434087 acc: 44.444, mean grad: -0.000002
training loss: 1.464557 acc: 43.750, mean grad: 0.000012
training loss: 1.296852 acc: 44.444, mean grad: -0.000003
training loss: 1.489868 acc: 37.500, mean grad: 0.000008
training loss: 1.221005 acc: 88.889, mean grad: -0.000003
training loss: 1.505768 acc: 37.500, mean grad: 0.000006
training loss: 1.178867 acc: 100.000, mean grad: -0.000006
training loss: 1.365376 acc: 31.250, mean grad: 0.000028
training loss: 3.282909 ac

training loss: 1.451885 acc: 25.000, mean grad: -0.000023
training loss: 3.781022 acc:  0.000, mean grad: 0.000003
training loss: 1.388539 acc: 18.750, mean grad: -0.000021
training loss: 3.231961 acc:  0.000, mean grad: -0.000001
training loss: 1.316175 acc: 18.750, mean grad: -0.000020
training loss: 2.532265 acc:  0.000, mean grad: -0.000005
training loss: 1.316269 acc: 43.750, mean grad: -0.000020
training loss: 2.039227 acc:  0.000, mean grad: -0.000003
training loss: 1.350302 acc: 50.000, mean grad: -0.000023
training loss: 1.749289 acc: 11.111, mean grad: -0.000007
training loss: 1.388495 acc: 62.500, mean grad: -0.000025
training loss: 1.581099 acc: 11.111, mean grad: -0.000005
training loss: 1.419435 acc: 56.250, mean grad: -0.000026
training loss: 1.482603 acc: 33.333, mean grad: -0.000008
training loss: 1.442812 acc: 56.250, mean grad: -0.000026
training loss: 1.423997 acc: 66.667, mean grad: -0.000008
training loss: 1.410143 acc: 31.250, mean grad: 0.000018
training loss: 3

training loss: 1.294435 acc: 31.250, mean grad: 0.000021
training loss: 4.318246 acc:  0.000, mean grad: 0.000072
training loss: 1.271008 acc: 31.250, mean grad: 0.000014
training loss: 3.763346 acc:  0.000, mean grad: 0.000063
training loss: 1.242667 acc: 43.750, mean grad: -0.000002
training loss: 2.989637 acc:  0.000, mean grad: 0.000050
training loss: 1.250430 acc: 56.250, mean grad: -0.000006
training loss: 2.396495 acc:  0.000, mean grad: 0.000030
training loss: 1.290722 acc: 56.250, mean grad: -0.000010
training loss: 2.021424 acc:  0.000, mean grad: 0.000017
training loss: 1.339007 acc: 56.250, mean grad: -0.000013
training loss: 1.793485 acc:  0.000, mean grad: 0.000010
training loss: 1.382505 acc: 56.250, mean grad: -0.000015
training loss: 1.654942 acc:  0.000, mean grad: 0.000002
training loss: 1.417350 acc: 50.000, mean grad: -0.000015
training loss: 1.569428 acc: 11.111, mean grad: -0.000002
training loss: 1.636535 acc: 18.750, mean grad: 0.000058
training loss: 4.410750 

training loss: 1.293408 acc: 31.250, mean grad: 0.000016
training loss: 4.312894 acc:  0.000, mean grad: 0.000059
training loss: 1.260348 acc: 31.250, mean grad: 0.000015
training loss: 3.720060 acc:  0.000, mean grad: 0.000053
training loss: 1.211076 acc: 37.500, mean grad: 0.000008
training loss: 2.914840 acc:  0.000, mean grad: 0.000040
training loss: 1.209865 acc: 37.500, mean grad: -0.000001
training loss: 2.326945 acc:  0.000, mean grad: 0.000019
training loss: 1.246050 acc: 56.250, mean grad: -0.000007
training loss: 1.967332 acc:  0.000, mean grad: 0.000008
training loss: 1.291623 acc: 62.500, mean grad: -0.000006
training loss: 1.751325 acc: 11.111, mean grad: -0.000001
training loss: 1.333526 acc: 62.500, mean grad: -0.000007
training loss: 1.619833 acc: 22.222, mean grad: -0.000006
training loss: 1.368864 acc: 56.250, mean grad: -0.000006
training loss: 1.538172 acc: 33.333, mean grad: -0.000009
training loss: 1.283694 acc: 50.000, mean grad: -0.000011
training loss: 4.15608

training loss: 1.513526 acc: 25.000, mean grad: 0.000043
training loss: 4.403507 acc:  0.000, mean grad: 0.000018
training loss: 1.456786 acc: 25.000, mean grad: 0.000037
training loss: 3.749807 acc:  0.000, mean grad: 0.000016
training loss: 1.376466 acc: 31.250, mean grad: 0.000023
training loss: 2.855262 acc:  0.000, mean grad: 0.000004
training loss: 1.368292 acc: 37.500, mean grad: 0.000011
training loss: 2.202459 acc:  0.000, mean grad: -0.000011
training loss: 1.407878 acc: 31.250, mean grad: 0.000005
training loss: 1.812854 acc:  0.000, mean grad: -0.000019
training loss: 1.455032 acc: 25.000, mean grad: 0.000001
training loss: 1.591144 acc: 11.111, mean grad: -0.000020
training loss: 1.492543 acc: 31.250, mean grad: -0.000003
training loss: 1.464581 acc: 44.444, mean grad: -0.000024
training loss: 1.519831 acc: 25.000, mean grad: -0.000005
training loss: 1.391081 acc: 88.889, mean grad: -0.000025


[  200/  500] loss: 1.6160 (1.6043), acc: 33.333% (23.410%)


training loss: 1.327469 acc: 37.500, mean grad: 0.000034
training loss: 4.517234 acc:  0.000, mean grad: 0.000083
training loss: 1.280630 acc: 37.500, mean grad: 0.000023
training loss: 3.809595 acc:  0.000, mean grad: 0.000061
training loss: 1.233581 acc: 37.500, mean grad: 0.000005
training loss: 2.890921 acc:  0.000, mean grad: 0.000038
training loss: 1.242794 acc: 37.500, mean grad: -0.000010
training loss: 2.258616 acc:  0.000, mean grad: 0.000010
training loss: 1.285854 acc: 37.500, mean grad: -0.000018
training loss: 1.886380 acc:  0.000, mean grad: -0.000015
training loss: 1.340682 acc: 37.500, mean grad: -0.000011
training loss: 1.669206 acc: 11.111, mean grad: -0.000020
training loss: 1.387596 acc: 50.000, mean grad: -0.000012
training loss: 1.535256 acc: 33.333, mean grad: -0.000026
training loss: 1.421999 acc: 43.750, mean grad: -0.000010
training loss: 1.448107 acc: 44.444, mean grad: -0.000033
training loss: 1.525599 acc: 31.250, mean grad: 0.000025
training loss: 5.10293

training loss: 1.587500 acc: 25.000, mean grad: 0.000083
training loss: 4.603962 acc:  0.000, mean grad: 0.000050
training loss: 1.491851 acc: 25.000, mean grad: 0.000064
training loss: 3.780923 acc:  0.000, mean grad: 0.000042
training loss: 1.367273 acc: 31.250, mean grad: 0.000042
training loss: 2.692118 acc:  0.000, mean grad: 0.000023
training loss: 1.338830 acc: 37.500, mean grad: 0.000023
training loss: 1.945354 acc:  0.000, mean grad: -0.000002
training loss: 1.377036 acc: 43.750, mean grad: 0.000019
training loss: 1.568284 acc: 33.333, mean grad: -0.000010
training loss: 1.414522 acc: 25.000, mean grad: -0.000007
training loss: 1.386450 acc: 55.556, mean grad: -0.000001
training loss: 1.438939 acc: 31.250, mean grad: -0.000004
training loss: 1.301220 acc: 66.667, mean grad: -0.000003
training loss: 1.452508 acc: 25.000, mean grad: -0.000009
training loss: 1.255824 acc: 77.778, mean grad: -0.000003
training loss: 1.415954 acc: 18.750, mean grad: -0.000017
training loss: 4.52309

training loss: 1.298726 acc: 50.000, mean grad: 0.000061
training loss: 4.554849 acc:  0.000, mean grad: 0.000215
training loss: 1.209955 acc: 50.000, mean grad: 0.000018
training loss: 3.613500 acc:  0.000, mean grad: 0.000066
training loss: 1.186527 acc: 56.250, mean grad: 0.000015
training loss: 2.624664 acc:  0.000, mean grad: 0.000029
training loss: 1.234597 acc: 68.750, mean grad: 0.000012
training loss: 1.990861 acc:  0.000, mean grad: 0.000007
training loss: 1.306048 acc: 93.750, mean grad: 0.000013
training loss: 1.663482 acc:  0.000, mean grad: -0.000005
training loss: 1.361376 acc: 93.750, mean grad: 0.000014
training loss: 1.501121 acc: 44.444, mean grad: -0.000010
training loss: 1.397456 acc: 75.000, mean grad: 0.000013
training loss: 1.416961 acc: 55.556, mean grad: -0.000012
training loss: 1.421760 acc: 75.000, mean grad: 0.000014
training loss: 1.372232 acc: 55.556, mean grad: -0.000014
training loss: 1.495074 acc: 18.750, mean grad: 0.000040
training loss: 4.610948 acc

Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/sjadon/anaconda3/envs/mysite/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/Users/sjadon/anaconda3/envs/mysite/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])
  File "/Users/sjadon/anaconda3/envs/mysite/lib/python3.6/multiprocessing/connection.py", line 404, in _send_bytes
    self._send(header + buf)
  File "/Users/sjadon/anaconda3/envs/mysite/lib/python3.6/multiprocessing/connection.py", line 368, in _send
    n = write(self._handle, buf)
BrokenPipeError: [Errno 32] Broken pipe
  File "/Users/sjadon/anaconda3/envs/mysite/lib/python3.6/multiprocessing/queues.py", line 240, in _feed
    send_bytes(obj)
  File "/Users/sjadon/anaconda3/envs/mysite/lib/python3.6/multiprocessing/connection.py", line 200, in send_bytes
    self._send_bytes(m[offset:offset + size])

KeyboardInterrupt: 