In [1]:
import os
import sys
# os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda-9.0/lib64/'
sys.path.append('/usr/local/cuda-9.0/lib64/')

In [11]:
%load_ext autoreload
%autoreload 2

import random
import argparse
import torch.nn as nn
from torch.optim import Adam
import torch.optim as optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

import cupy as cp

import math
import chainer.functions as F
import chainer.links as L

import sys, os, time, argparse
import numpy as np
import chainer
import chainer.functions as F
from chainer import Variable, optimizers, cuda, serializers

from chainer_func.source.chainer_functions import loss
from chainer_func.source.data import Data
from chainer_func.source.utils import mkdir_p, load_npz_as_dict
from chainer_func.models import CNN, MLP

from ExpUtils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [60]:
def parse_args():
    """Build the hard-coded experiment configuration and seed the RNGs.

    No command line is parsed: every option is assigned directly on an
    ``argparse.Namespace``. Besides building the namespace, this function
    has side effects: it sets ``CUDA_DEVICE_ORDER`` and
    ``CUDA_VISIBLE_DEVICES``, enables
    ``chainer.global_config.cudnn_deterministic``, selects the GPU when
    ``args.gpu > -1`` and seeds ``random``, NumPy and CuPy with ``args.seed``.

    Returns:
        argparse.Namespace: the populated configuration.
    """
    args = argparse.Namespace()
    args.dataset = "mnist"
    args.trainer = "VATReg"
    args.lr = 0.001  # single definition; the former second assignment was redundant
    args.arch = "mlp"
    args.iterations = 1000
    args.seed = 1
    args.size = 100
    args.no_cuda = False

    args.xi = 10
    args.eps = 1
    args.k = 1
    args.use_entmin = False
    args.alpha = 1

    args.gpu = -1
    # The data directory is derived from the dataset name. expanduser("~")
    # resolves to $HOME when it is set, but does not raise KeyError when the
    # variable is missing (as os.environ['HOME'] would).
    args.data_dir = os.path.join(os.path.expanduser("~"), "project/data/dataset/%s" % args.dataset)
    args.log_dir = "log"
    args.n_categories = 10
    args.eval_freq = 5
    args.snapshot_freq = 20
    args.aug_flip = False
    args.aug_trans = False
    args.validation = False
    args.dataset_seed = 1
    args.batchsize = 100
    args.batchsize_eval = 100
    args.num_epochs = 100
    args.num_iter_per_epoch = 1
    args.epoch_decay_start = 80
    args.mom1 = 0.9
    args.mom2 = 0.5
    args.method = "vat"
    args.epsilon = 3.5
    args.extra_lamb = 1
    args.dropout_rate = 0.5
    args.top_bn = True

    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ['CUDA_VISIBLE_DEVICES'] = "1"

    chainer.global_config.cudnn_deterministic = True
    random.seed(args.seed)
    if int(args.gpu) > -1:
        # get_device() is deprecated since Chainer v2; use the explicit id variant.
        chainer.cuda.get_device_from_id(args.gpu).use()
    np.random.seed(args.seed)
    cp.random.seed(args.seed)
    return args

args = parse_args()

# chainer code

In [25]:
from chainer_func import *

In [54]:
set_framework_seed(1)
# Take the labeled-set size from the config (args.size) instead of repeating the
# literal 100, so this cell cannot drift from the training cell further down.
train_l, train_ul, test = load_dataset(args.data_dir, valid=args.validation, dataset_seed=args.dataset_seed, size=args.size)
print("N_train_labeled:{}, N_train_unlabeled:{}".format(train_l.N, train_ul.N))
print(train_l.data.sum(), train_l.data.mean(), train_l.data.max())

N_train_labeled:100, N_train_unlabeled:60000
10491.246 0.13381691 0.99609375


In [55]:
train_l.label

array([1, 9, 3, 9, 6, 1, 2, 8, 5, 3, 3, 6, 7, 9, 4, 2, 6, 2, 7, 1, 1, 9,
       9, 0, 5, 4, 0, 1, 5, 4, 8, 7, 4, 4, 3, 0, 0, 6, 1, 0, 4, 8, 2, 2,
       0, 7, 4, 7, 3, 9, 9, 2, 6, 0, 2, 3, 1, 0, 3, 8, 2, 5, 0, 7, 7, 7,
       4, 6, 7, 6, 5, 5, 8, 8, 9, 1, 8, 5, 2, 6, 9, 5, 3, 2, 4, 6, 3, 8,
       1, 9, 3, 6, 5, 5, 7, 4, 8, 1, 0, 8], dtype=int32)

In [57]:
set_framework_seed(1)
x, t = train_l.get(args.batchsize, gpu=args.gpu, aug_trans=args.aug_trans, aug_flip=args.aug_flip)
print(t)

[9 4 4 5 5 2 0 3 6 7 5 0 1 6 2 4 7 0 2 3 3 8 1 0 1 0 7 1 4 1 6 8 2 4 1 4 3
 8 0 3 3 0 5 2 2 8 7 6 0 0 2 3 5 6 3 9 9 8 9 9 8 4 5 2 7 9 5 2 1 3 0 9 5 7
 8 8 9 7 6 4 4 5 6 7 1 9 4 2 5 8 9 6 7 6 1 1 3 8 7 6]


In [14]:
def call_bn(bn, x, test=False, update_batch_stats=True):
    """Apply the batch-normalisation link ``bn`` to ``x`` in one of three modes.

    Args:
        bn: callable link exposing ``gamma``, ``beta``, ``avg_mean`` and
            ``avg_var`` attributes.
        x: input passed to the normalisation.
        test: when truthy, normalise with the link's stored ``avg_mean`` /
            ``avg_var`` via ``F.fixed_batch_normalization``.
        update_batch_stats: when falsy (and ``test`` is falsy), call
            ``F.batch_normalization`` with only ``gamma`` and ``beta``, i.e.
            without handing the link's running averages to the function.

    Returns:
        The normalised output; in the default mode this is simply ``bn(x)``.
    """
    # Default training path: let the link do everything itself.
    if not test and update_batch_stats:
        return bn(x)
    if test:
        return F.fixed_batch_normalization(x, bn.gamma, bn.beta, bn.avg_mean, bn.avg_var)
    return F.batch_normalization(x, bn.gamma, bn.beta)

In [12]:
class MLP(chainer.Chain):
    """Fully connected classifier: two 1200-unit hidden layers with batch norm and ReLU.

    Args:
        n_ch: number of input channels.
        n_res: input resolution; the first layer expects ``n_ch * n_res * n_res`` features.
        n_outputs: number of output logits.
        dropout_rate: stored on the instance; not used by the forward pass in this class.
        top_bn: when true, an extra batch-norm link ``bn_cl`` is applied to the logits.
    """

    def __init__(self, n_ch=1, n_res=28, n_outputs=10, dropout_rate=0.5, top_bn=False):
        self.dropout_rate = dropout_rate
        self.top_bn = top_bn
        n_inputs = n_ch * n_res * n_res
        n_hidden = 1200
        w_init = chainer.initializers.HeUniform(1.0)
        # Links are created in the same order as before (linear layers first,
        # then batch norms) so seeded weight initialisation is unchanged.
        super(MLP, self).__init__(
            l_c1=L.Linear(n_inputs, n_hidden, initialW=w_init),
            l_c2=L.Linear(n_hidden, n_hidden, initialW=w_init),
            l_c3=L.Linear(n_hidden, n_outputs, initialW=w_init),
            bn1=L.BatchNormalization(n_hidden),
            bn2=L.BatchNormalization(n_hidden),
        )
        if top_bn:
            self.add_link('bn_cl', L.BatchNormalization(n_outputs))

    def __call__(self, x, train=True, update_batch_stats=True):
        """Return the logits for ``x``; ``train=False`` switches batch norm to its stored statistics."""
        bn_mode = dict(test=not train, update_batch_stats=update_batch_stats)
        h = x
        for linear, bn in ((self.l_c1, self.bn1), (self.l_c2, self.bn2)):
            h = F.relu(call_bn(bn, linear(h), **bn_mode))
        logit = self.l_c3(h)
        if not self.top_bn:
            return logit
        return call_bn(self.bn_cl, logit, **bn_mode)

In [13]:
set_framework_seed(1)
enc = MLP(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)

variable gamma([1., 1., 1., ..., 1., 1., 1.])

# match MLP results

In [58]:
# Sanity check: rebuild the MLP from a fixed seed and print the sum of its
# outputs on the current batch so it can be compared against another run.
set_framework_seed(1)
enc = MLP(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
# enc = CNN(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
if args.gpu > -1:
    print("gpu")
    # get_device() is deprecated since Chainer v2; use the explicit id variant.
    chainer.cuda.get_device_from_id(args.gpu).use()
    enc.to_gpu()
print(x.sum())
print(t)
set_framework_seed(1)
te = enc(Variable(x))
print(te.data.sum())

10491.246
[9 4 4 5 5 2 0 3 6 7 5 0 1 6 2 4 7 0 2 3 3 8 1 0 1 0 7 1 4 1 6 8 2 4 1 4 3
 8 0 3 3 0 5 2 2 8 7 6 0 0 2 3 5 6 3 9 9 8 9 9 8 4 5 2 7 9 5 2 1 3 0 9 5 7
 8 8 9 7 6 4 4 5 6 7 1 9 4 2 5 8 9 6 7 6 1 1 3 8 7 6]
-139.04709


In [44]:
def loss_labeled(forward, x, t):
    """Return the softmax cross-entropy of ``forward(x)`` against labels ``t``.

    ``forward`` is called with ``update_batch_stats=True``.
    """
    logits = forward(x, update_batch_stats=True)
    return F.softmax_cross_entropy(logits, t)

def loss_test(forward, x, t):
    """Evaluate ``forward`` with ``train=False``.

    Returns:
        tuple: ``(loss, accuracy)`` as the raw ``.data`` arrays of the softmax
        cross-entropy and of ``F.accuracy`` for the predicted logits.
    """
    logits = forward(x, train=False)
    nll = F.softmax_cross_entropy(logits, t).data
    accuracy = F.accuracy(logits, t).data
    return nll, accuracy

In [12]:
loss = loss_labeled(enc, Variable(x), t)
loss.backward()

# Training and evaluation process

In [61]:
def evaluate(epoch, enc, test, args, train_losses=None):
    """Compute test accuracy for ``enc`` and log it on selected epochs.

    The test set returned by ``test.get()`` is processed in slices of
    ``args.batchsize_eval``; per-batch accuracies are weighted by batch size.
    The training loss and test accuracy are logged for the first five epochs
    and for every tenth epoch.

    Args:
        epoch: index of the epoch that just finished.
        enc: model passed on to ``loss_test``.
        test: dataset object providing ``get()`` -> ``(inputs, labels)``.
        args: configuration; ``batchsize_eval`` and ``gpu`` are read.
        train_losses: optional per-epoch training losses indexed by ``epoch``.
            When omitted, the notebook-level ``cl_losses`` array is used, which
            keeps existing four-argument calls working.

    Returns:
        The size-weighted mean test accuracy.
    """
    with chainer.using_config("train", False):
        acc_test_sum = 0
        test_x, test_t = test.get()
        N_test = test_x.shape[0]
        for i in range(0, N_test, args.batchsize_eval):
            x = test_x[i:i + args.batchsize_eval]
            t = test_t[i:i + args.batchsize_eval]
            if args.gpu > -1:
                x, t = cuda.to_gpu(x, device=args.gpu), cuda.to_gpu(t, device=args.gpu)
            _, acc = loss_test(enc, Variable(x), Variable(t))
            acc_test_sum += acc * x.shape[0]
        accs_test = acc_test_sum / N_test
        if epoch < 5 or epoch % 10 == 0:
            # Resolve the global only when a log line is actually written, so
            # the function is usable without a training loop on other epochs.
            if train_losses is None:
                train_losses = cl_losses
            wlog("Epoch:{}, nll loss:{}".format(epoch, train_losses[epoch]))
            wlog("test acc:{}".format(accs_test))
    return accs_test

In [62]:
# Supervised training of the MLP on the labeled subset, evaluating after every epoch.
train_l, train_ul, test = load_dataset(args.data_dir, valid=args.validation, dataset_seed=args.dataset_seed, size=args.size)
wlog("N_train_labeled:{}, N_train_unlabeled:{}".format(train_l.N, train_ul.N))
set_framework_seed(1)
enc = MLP(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
# enc = CNN(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
if args.gpu > -1:
    print("gpu")
    # get_device() is deprecated since Chainer v2; use the explicit id variant.
    chainer.cuda.get_device_from_id(args.gpu).use()
    enc.to_gpu()

optimizer = optimizers.Adam(alpha=args.lr)
optimizer.setup(enc)
# optimizer.use_cleargrads() was dropped: it is deprecated since Chainer v2 and
# gradients are cleared explicitly with enc.cleargrads() below.

cl_losses = np.zeros(args.num_epochs)
for epoch in range(args.num_epochs):
    sum_loss_l = 0.0
    for it in range(args.num_iter_per_epoch):
        with chainer.using_config("train", True):
            x, t = train_l.get(args.batchsize, gpu=args.gpu, aug_trans=args.aug_trans, aug_flip=args.aug_flip)
            loss_total = loss_labeled(enc, Variable(x), t)
            enc.cleargrads()
            loss_total.backward()
            optimizer.update()
            # Accumulate a plain Python float so the value can be stored in the
            # NumPy array regardless of which array module produced the loss.
            sum_loss_l += float(loss_total.data)
    # Record the epoch mean once, after all iterations of the epoch are done.
    cl_losses[epoch] = sum_loss_l / args.num_iter_per_epoch
    evaluate(epoch, enc, test, args)

2019-03-06 16:23:03,625 - <ipython-input-62-239c4b580332>[line:2]: N_train_labeled:100, N_train_unlabeled:60000
2019-03-06 16:23:04,938 - <ipython-input-61-497a3700c69f>[line:15]: Epoch:0, nll loss:2.476536750793457
2019-03-06 16:23:04,939 - <ipython-input-61-497a3700c69f>[line:16]: test acc:0.46589999765157697
2019-03-06 16:23:06,299 - <ipython-input-61-497a3700c69f>[line:15]: Epoch:1, nll loss:0.2439270168542862
2019-03-06 16:23:06,301 - <ipython-input-61-497a3700c69f>[line:16]: test acc:0.5460999998450279
2019-03-06 16:23:07,580 - <ipython-input-61-497a3700c69f>[line:15]: Epoch:2, nll loss:0.0213171374052763
2019-03-06 16:23:07,581 - <ipython-input-61-497a3700c69f>[line:16]: test acc:0.5950999960303307
2019-03-06 16:23:08,835 - <ipython-input-61-497a3700c69f>[line:15]: Epoch:3, nll loss:0.004524578806012869
2019-03-06 16:23:08,837 - <ipython-input-61-497a3700c69f>[line:16]: test acc:0.6248999983072281
2019-03-06 16:23:10,134 - <ipython-input-61-497a3700c69f>[line:15]: Epoch:4, nll l

In [14]:
class CNN(chainer.Chain):
    """Nine-layer convolutional classifier with batch norm and leaky ReLU (slope 0.1).

    Layout: three 3x3 convolutions, 2x2 max-pool, three 3x3 convolutions,
    2x2 max-pool, then a 3x3 (no padding) and two 1x1 convolutions, global
    average pooling and a linear output layer. The first convolution expects
    3 input channels.

    Args:
        n_outputs: number of output logits.
        dropout_rate: ratio passed to ``F.dropout`` after each max-pool.
        top_bn: when true, an extra batch-norm link ``bn_cl`` is applied to the logits.
        dropout: enables the two dropout layers.
    """

    # (in_channels, out_channels, ksize, pad) for c1..c9; bnK has out_channels of cK.
    _CONV_SPECS = (
        (3, 128, 3, 1),
        (128, 128, 3, 1),
        (128, 128, 3, 1),
        (128, 256, 3, 1),
        (256, 256, 3, 1),
        (256, 256, 3, 1),
        (256, 512, 3, 0),
        (512, 256, 1, 0),
        (256, 128, 1, 0),
    )

    def __init__(self, n_outputs=10, dropout_rate=0.5, top_bn=False, dropout=False):
        self.dropout_rate = dropout_rate
        self.top_bn = top_bn
        self.dropout = dropout
        initializer = chainer.initializers.HeUniform(1.0)

        # Build the links in the original order (c1..c9, l_cl, bn1..bn9) so that
        # seeded weight initialisation and registration order are unchanged.
        links = {}
        for idx, (c_in, c_out, ksize, pad) in enumerate(self._CONV_SPECS, start=1):
            links['c%d' % idx] = L.Convolution2D(c_in, c_out, ksize=ksize, stride=1, pad=pad, initialW=initializer)
        links['l_cl'] = L.Linear(128, n_outputs, initialW=initializer)
        for idx, spec in enumerate(self._CONV_SPECS, start=1):
            links['bn%d' % idx] = L.BatchNormalization(spec[1])
        super(CNN, self).__init__(**links)

        if top_bn:
            self.add_link('bn_cl', L.BatchNormalization(n_outputs))

    def __call__(self, x, train=True, update_batch_stats=True):
        """Return the logits for ``x``; ``train=False`` switches batch norm to its stored statistics."""
        bn_mode = dict(test=not train, update_batch_stats=update_batch_stats)

        def conv_bn_act(h, idx):
            # convolution -> batch norm -> leaky ReLU for layer number `idx`
            conv = getattr(self, 'c%d' % idx)
            bn = getattr(self, 'bn%d' % idx)
            return F.leaky_relu(call_bn(bn, conv(h), **bn_mode), slope=0.1)

        h = x
        # Two pooled stages of three convolutions each.
        for stage in ((1, 2, 3), (4, 5, 6)):
            for idx in stage:
                h = conv_bn_act(h, idx)
            h = F.max_pooling_2d(h, ksize=2, stride=2)
            if self.dropout:
                h = F.dropout(h, ratio=self.dropout_rate)

        # Final stage without pooling, followed by global average pooling.
        for idx in (7, 8, 9):
            h = conv_bn_act(h, idx)
        h = F.average_pooling_2d(h, ksize=h.data.shape[2])

        logit = self.l_cl(h)
        if not self.top_bn:
            return logit
        return call_bn(self.bn_cl, logit, **bn_mode)

In [16]:
# Sanity check: rebuild the CNN from a fixed seed and print input/output sums
# on the current batch so they can be compared against another run.
set_framework_seed(1)
# enc = MLP(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
enc = CNN(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False, dropout=False)
if args.gpu > -1:
    print("gpu")
    # get_device() is deprecated since Chainer v2; use the explicit id variant.
    chainer.cuda.get_device_from_id(args.gpu).use()
    enc.to_gpu()
set_framework_seed(1)
out = enc(Variable(x), update_batch_stats=True)
print(x.sum(), out.data.sum())

-562.97064 40.848587


In [17]:
loss = loss_labeled(enc, Variable(x), t)
loss.backward()

In [16]:
def loss_test(forward, x, t):
    """Evaluate ``forward`` with ``train=False`` (same definition as the earlier ``loss_test`` cell).

    Returns:
        tuple: ``(loss, accuracy)`` as the raw ``.data`` arrays of the softmax
        cross-entropy and of ``F.accuracy`` for the predicted logits.
    """
    logits = forward(x, train=False)
    nll = F.softmax_cross_entropy(logits, t).data
    accuracy = F.accuracy(logits, t).data
    return nll, accuracy

# CNN without dropout

The results are very close; the remaining difference is caused by floating-point rounding error/precision.

In [26]:
# Ten supervised Adam steps on the CNN without dropout, printing the batch sum
# and loss at every step for comparison with another implementation.
set_framework_seed(1)

enc = CNN(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False, dropout=False)
if args.gpu > -1:
    print("gpu")
    # get_device() is deprecated since Chainer v2; use the explicit id variant.
    chainer.cuda.get_device_from_id(args.gpu).use()
    enc.to_gpu()
optimizer = optimizers.Adam(alpha=args.lr, beta1=args.mom1)
optimizer.setup(enc)
# optimizer.use_cleargrads() was dropped: it is deprecated since Chainer v2 and
# gradients are cleared explicitly with enc.cleargrads() below.
set_framework_seed(1)
train_l.reseed()
for it in range(10):
    with chainer.using_config("train", True):
        x, t = train_l.get(args.batchsize, gpu=args.gpu, aug_trans=args.aug_trans, aug_flip=args.aug_flip)

        loss_l = loss_labeled(enc, Variable(x), Variable(t))
        print(x.sum(), loss_l)
        enc.cleargrads()
        loss_l.backward()
        optimizer.update()


-562.97064 variable(2.260621)
-239.57237 variable(2.737069)
2.0764809 variable(2.6666842)
126.50333 variable(2.240642)
31.462202 variable(2.0930262)
-653.71985 variable(2.2371545)
-206.56343 variable(2.0542524)
-199.04095 variable(2.359859)
173.35344 variable(2.3512278)
-129.28891 variable(2.3139074)


In [23]:
# Test accuracy of the current `enc`. This repeats the batching loop of
# evaluate() but prints the result instead of logging it.
# NOTE: the loop rebinds the notebook-level names `x` and `t` to the last test batch.
with chainer.using_config("train", False):
    acc_test_sum = 0
    test_x, test_t = test.get()
    N_test = test_x.shape[0]
    for i in range(0, N_test, args.batchsize_eval):
        x = test_x[i:i + args.batchsize_eval]
        t = test_t[i:i + args.batchsize_eval]
        if args.gpu > -1:
            x, t = cuda.to_gpu(x, device=args.gpu), cuda.to_gpu(t, device=args.gpu)
        _, acc = loss_test(enc, Variable(x), Variable(t))
        # weight each batch accuracy by its size so a short last batch is handled
        acc_test_sum += acc * x.shape[0]
    print(acc_test_sum / N_test)

0.10249999959021806


In [24]:
# Ten supervised Adam steps on the CNN with dropout enabled. The framework seed
# is reset at every step, so the random draws of step `it` are tied to `it`.
set_framework_seed(1)
# enc = MLP(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
enc = CNN(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False, dropout=True)
if args.gpu > -1:
    print("gpu")
    # get_device() is deprecated since Chainer v2; use the explicit id variant.
    chainer.cuda.get_device_from_id(args.gpu).use()
    enc.to_gpu()
optimizer = optimizers.Adam(alpha=args.lr, beta1=args.mom1)
optimizer.setup(enc)
# optimizer.use_cleargrads() was dropped: it is deprecated since Chainer v2 and
# gradients are cleared explicitly with enc.cleargrads() below.
set_framework_seed(1)
train_l.reseed()
for it in range(10):
    set_framework_seed(it % 10000)
    with chainer.using_config("train", True):
        x, t = train_l.get(args.batchsize, gpu=args.gpu, aug_trans=args.aug_trans, aug_flip=args.aug_flip)

        loss_l = loss_labeled(enc, Variable(x), Variable(t))
        print(x.sum(), loss_l)
        enc.cleargrads()
        loss_l.backward()
        optimizer.update()
        

-562.97064 variable(2.270143)
-239.57237 variable(2.5744624)
2.0764809 variable(2.8344507)
126.50333 variable(2.2541957)
31.462202 variable(2.2864416)
-653.71985 variable(2.2899199)
-206.56343 variable(2.2821083)
-199.04095 variable(2.5041566)
173.35344 variable(2.545939)
-129.28891 variable(2.350461)


In [25]:
# Test accuracy of the current `enc`. This repeats the batching loop of
# evaluate() but prints the result instead of logging it.
# NOTE: the loop rebinds the notebook-level names `x` and `t` to the last test batch.
with chainer.using_config("train", False):
    acc_test_sum = 0
    test_x, test_t = test.get()
    N_test = test_x.shape[0]
    for i in range(0, N_test, args.batchsize_eval):
        x = test_x[i:i + args.batchsize_eval]
        t = test_t[i:i + args.batchsize_eval]
        if args.gpu > -1:
            x, t = cuda.to_gpu(x, device=args.gpu), cuda.to_gpu(t, device=args.gpu)
        _, acc = loss_test(enc, Variable(x), Variable(t))
        # weight each batch accuracy by its size so a short last batch is handled
        acc_test_sum += acc * x.shape[0]
    print(acc_test_sum / N_test)

0.10439999988302588


# large margin

In [18]:
# Fresh seeded MLP used by the large-margin experiments below; prints the batch
# sum, the labels and the sum of the logits as a reproducibility check.
set_framework_seed(1)
enc = MLP(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
# enc = CNN(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
if args.gpu > -1:
    print("gpu")
    # get_device() is deprecated since Chainer v2; use the explicit id variant.
    chainer.cuda.get_device_from_id(args.gpu).use()
    enc.to_gpu()
print(x.sum())
print(t)
set_framework_seed(1)
te = enc(Variable(x))
print(te.data.sum())

-562.97064
[0 2 1 5 2 6 7 2 3 2 0 8 2 4 3 5 1 6 3 3 3 2 8 1 7 0 0 6 6 3 3 1]
-5.4525633


In [85]:
# Smallest absolute gap between any two distinct logits of each sample, summed
# over the batch. Keeping `x_variable` lets later cells read d(loss)/d(input).
x_variable = Variable(x)
te = enc(x_variable)
print(te.data.sum())
logits = te

# Take the sizes from the logits instead of hard-coding 32 samples / 10 classes,
# so the cell works for any batch size and number of categories.
n_batch, n_classes = logits.shape
new_data = F.repeat(logits.reshape(n_batch, n_classes, 1), n_classes, axis=2)
new_data_t = F.transpose(new_data, axes=(0, 2, 1))
dif = F.absolute(new_data - new_data_t)  # dif[b, i, j] = |logit_i - logit_j|
# Mask the diagonal (i == j, always 0) with a large value so it never wins the min.
# The index array is built with the array module that owns the data.
xp = cuda.get_array_module(dif.data)
diag = xp.arange(n_classes)
dif.data[:, diag, diag] = 10000
minimum_dif = F.min(dif, axis=(1, 2))
loss = F.sum(minimum_dif)

-5.4525633


In [95]:
loss.backward()
d = x_variable.grad

In [110]:
d = x_variable.grad

In [96]:
d.shape

(32, 3, 32, 32)

In [97]:
F.batch_l2_norm_squared(d)

variable([43.8882  , 32.518997, 42.099136, 42.29032 , 35.56052 , 32.028145,
          44.87703 , 35.137177, 42.562088, 49.06534 , 37.460518, 56.70897 ,
          48.79129 , 58.22379 , 58.045277, 53.70053 , 44.11779 , 36.081795,
          57.63233 , 31.73954 , 48.41804 , 42.846928, 40.61837 , 42.383553,
          47.325497, 34.945408, 45.08779 , 55.141594, 45.598877, 52.274117,
          45.350273, 36.295547])

In [99]:
# `xp` is otherwise only defined in a later cell; bind it here so the notebook
# runs top to bottom on a fresh kernel.
xp = cuda.get_array_module(d)
# Per-sample squared L2 norm of the input gradient (sum over all non-batch axes).
xp.sum(d ** 2, axis=tuple(range(1, d.ndim)))

array([43.8882  , 32.518997, 42.099136, 42.29032 , 35.56052 , 32.028145,
       44.87703 , 35.137177, 42.562088, 49.06534 , 37.460518, 56.70897 ,
       48.79129 , 58.22379 , 58.045277, 53.70053 , 44.11779 , 36.081795,
       57.63233 , 31.73954 , 48.41804 , 42.846928, 40.61837 , 42.383553,
       47.325497, 34.945408, 45.08779 , 55.141594, 45.598877, 52.274117,
       45.350273, 36.295547], dtype=float32)

In [88]:
xp = cuda.get_array_module(x_variable.data)

In [105]:
d = d / xp.sqrt(xp.sum(d ** 2, axis =tuple(range(1, len(d.shape))), keepdims=True))

In [106]:
(d ** 2).sum()

32.0

In [107]:
d.shape

(32, 3, 32, 32)

In [93]:
minimum_dif.shape

(32,)

In [112]:
minimum_dif

variable([0.03196675, 0.01531726, 0.03652483, 0.01392016, 0.01174247,
          0.0463711 , 0.04788411, 0.01914483, 0.02834719, 0.05728129,
          0.00792599, 0.01005088, 0.00395525, 0.06017971, 0.01658297,
          0.05855221, 0.03900647, 0.00564754, 0.00618389, 0.0116004 ,
          0.03603446, 0.03971651, 0.01833946, 0.01108512, 0.00718176,
          0.01502152, 0.17572033, 0.00577849, 0.00456291, 0.0354563 ,
          0.01088476, 0.10022384])

In [111]:
# Divide by the norm of the raw input gradient. By this point `d` has been
# rescaled to unit norm per sample, so using `d` here would divide by ~1.
grad = x_variable.grad
minimum_dif / xp.sqrt(xp.sum(grad ** 2, axis=tuple(range(1, grad.ndim))))

variable([0.0048253 , 0.00268604, 0.00562926, 0.00214054, 0.00196914,
          0.00819373, 0.00714791, 0.00322974, 0.00434509, 0.00817759,
          0.00129499, 0.00133468, 0.00056624, 0.00788678, 0.0021766 ,
          0.00799013, 0.00587259, 0.00094019, 0.00081457, 0.00205908,
          0.00517863, 0.00606752, 0.00287757, 0.00170271, 0.00104396,
          0.00254108, 0.02616932, 0.00077817, 0.00067572, 0.004904  ,
          0.00161633, 0.01663583])

In [21]:
class_prob = F.softmax(logits)

In [24]:
class_prob[:3]

variable([[1.54169023e-01, 5.03710285e-02, 8.62801373e-02, 5.62610142e-02,
           7.01920614e-02, 4.87862900e-02, 1.48522973e-01, 1.27890825e-01,
           1.61214247e-01, 9.63124484e-02],
          [1.34237275e-01, 5.56553602e-02, 1.00720234e-01, 1.13789409e-01,
           1.36309236e-01, 9.80082080e-02, 1.54791638e-01, 6.56634644e-02,
           1.04473017e-01, 3.63521986e-02],
          [1.61101922e-01, 2.13813096e-01, 6.77430443e-03, 3.02409520e-03,
           7.02630868e-03, 9.21606726e-04, 9.70342234e-02, 5.04525900e-01,
           4.32566099e-04, 5.34597319e-03]])

In [22]:
correct_class_prob = class_prob[np.arange(class_prob.shape[0]), t]

In [23]:
correct_class_prob

variable([0.15416902, 0.10072023, 0.2138131 , 0.08669429, 0.32581642,
          0.0384522 , 0.23383127, 0.10946563, 0.01286336, 0.10806721,
          0.03154936, 0.0925095 , 0.07900738, 0.02592808, 0.04358358,
          0.09468471, 0.2182813 , 0.15935653, 0.01923206, 0.09014764,
          0.08973202, 0.15906514, 0.25026926, 0.0619849 , 0.08292997,
          0.09619284, 0.05878942, 0.04833115, 0.19643797, 0.08501525,
          0.03976822, 0.28050393])

In [None]:
def large_margin(_sentinel=None, logits=None, one_hot_labels=None, layers_list=None, gamma=10000, alpha_factor=2, top_k=1, dist_norm=2,
                 epsilon=1e-8, use_approximation=True, loss_type="all_top_k", loss_collection=tf.GraphKeys.LOSSES):
    """Large-margin loss written against the TensorFlow 1.x API (unexecuted reference cell).

    The visible body computes, per sample, the difference between the
    correct-class probability and the top-k other-class probabilities, divides
    it by the (optionally gradient-stopped) norm of its gradient plus
    ``epsilon``, clamps the result between ``gamma * (1 - alpha_factor)`` and
    ``gamma``, averages it and registers it via ``tf.losses.add_loss``.

    NOTE(review): as shown, this cell cannot run in this notebook. ``tf`` is
    not imported in the visible source, and ``class_prob``, ``layer``,
    ``norm_fn`` and ``maximum_with_relu`` are not defined inside the function.
    ``_sentinel``, ``logits``, ``layers_list`` and ``dist_norm`` are accepted
    but not used in the visible body, and nothing is returned. This looks like
    an abridged paste of a larger implementation; confirm against the original.
    """
    # Pick the correct class probability.
    correct_class_prob = tf.reduce_sum(class_prob * one_hot_labels, axis=1, keepdims=True)

    # Class probabilities except the correct.
    other_class_prob = class_prob * (1. - one_hot_labels)
    if top_k > 1:
        # Pick the top k class probabilities other than the correct.
        top_k_class_prob, _ = tf.nn.top_k(other_class_prob, k=top_k)
    else:
        top_k_class_prob = tf.reduce_max(other_class_prob, axis=1, keepdims=True)

    # Difference between correct class probabilities and top_k probabilities.
    difference_prob = correct_class_prob - top_k_class_prob
    losses_list = []

    # Gradient of each of the top_k differences with respect to `layer`, flattened.
    difference_prob_grad = [
        tf.layers.flatten(tf.gradients(difference_prob[:, i], layer)[0])
        for i in range(top_k)
    ]

    # One norm per sample and per top-k entry, concatenated along axis 1.
    difference_prob_gradnorm = tf.concat([
        tf.map_fn(norm_fn, difference_prob_grad[i])[:, tf.newaxis]
        for i in range(top_k)
    ], axis=1)

    if use_approximation:
        # Treat the gradient norm as a constant during back-propagation.
        difference_prob_gradnorm = tf.stop_gradient(difference_prob_gradnorm)

    # epsilon guards against division by a zero gradient norm.
    distance_to_boundary = difference_prob / (
                    difference_prob_gradnorm + epsilon)

    if loss_type == "worst_top_k":
        # Only consider worst distance to boundary.
        distance_to_boundary = tf.reduce_min(distance_to_boundary, axis=1)

    elif loss_type == "average_top_k":
        # Only consider average distance to boundary.
        distance_to_boundary = tf.reduce_mean(distance_to_boundary, axis=1)

    # Distances to consider between distance_upper and distance_lower bounds
    distance_upper = gamma
    distance_lower = gamma * (1 - alpha_factor)

    # Enforce lower bound.
    loss_layer = maximum_with_relu(distance_to_boundary, distance_lower)

    # Enforce upper bound.
    loss_layer = maximum_with_relu(
        0, distance_upper - loss_layer) - distance_upper

    losses_list.append(tf.reduce_mean(loss_layer))

    loss = tf.reduce_mean(losses_list)
    # Add loss to loss_collection.
    tf.losses.add_loss(loss, loss_collection)