In [1]:
import os
import sys
os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda-9.0/lib64/'
sys.path.append('/usr/local/cuda-9.0/lib64/')
import numpy as np
import tensorflow as tf
import tensorflow.contrib.eager as tfe

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as nfunc
from torch.nn.parameter import Parameter

gpu = ""

os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = gpu

device = torch.device("cuda" if gpu else "cpu")

tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True
tf_config.gpu_options.per_process_gpu_memory_fraction = 0.3

tf.enable_eager_execution(tf_config)

In [2]:
%load_ext autoreload
%autoreload 2

from tf_func import data_loader
from tf_func import mnist_model


class ConfigDict(object):
    """MNIST configration."""

    def __init__(self):
        self.num_classes = 10

        # List of tuples specify (kernel_size, number of filters) for each layer.
        self.filter_sizes_conv_layers = [(5, 32), (5, 64)]
        # Dictionary of pooling type ("max"/"average", size and stride).
        self.pool_params = {"type": "max", "size": 2, "stride": 2}
        self.num_units_fc_layers = [512]
        self.dropout_rate = 0
        self.batch_norm = True
        self.activation = None
        self.regularizer = None
        
        
config = ConfigDict()

In [3]:
dataset = data_loader.MNIST(
    data_dir="./data/mnist",
    subset="train",
    batch_size=128,
    is_training=False)

test_dataset = data_loader.MNIST(
    data_dir="./data/mnist",
    subset="test",
    batch_size=128,
    is_training=False)

In [4]:
images, labels, num_examples, num_classes = (dataset.images, dataset.labels, dataset.num_examples, dataset.num_classes)
images, labels = dataset.get_next()
images.numpy().sum()

12790.145

# chainer

In [17]:
%load_ext autoreload
%autoreload 2

import random
import argparse
import torch.nn as nn
from torch.optim import Adam
import torch.optim as optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

import cupy as cp

import sys, os, time, argparse
import numpy as np
import chainer
import chainer.functions as F
import chainer.links as L
from chainer import Variable, optimizers, cuda, serializers

# from source.chainer_functions import loss
# from source.data import Data
# from source.utils import mkdir_p, load_npz_as_dict
# from models.cnn import CNN

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [7]:
def parse_args():
    args = argparse.Namespace()
    args.dataset = "mnist"
    args.trainer = "VATReg"
    args.lr = 0.002
    args.arch = "mlp"
    args.iterations = 1000
    args.seed = 1
    args.size = 100
    args.no_cuda = False
    
    args.xi = 10
    args.eps = 1
    args.k = 1
    args.use_entmin = False
    args.alpha = 1
    
    args.gpu = -1
    args.data_dir = "./dataset/svhn/"
    args.log_dir = "log"
    args.n_categories = 10
    args.eval_freq = 5
    args.snapshot_freq = 20
    args.aug_flip = False
    args.aug_trans = False
    args.validation = False
    args.dataset_seed = 1
    args.batchsize = 100
    args.batchsize_ul = 250
    args.batchsize_eval = 100
    args.num_epochs = 120
    args.num_iter_per_epoch = 500
    args.epoch_decay_start = 0
    args.method = "vat"
    args.epsilon = 0.3
    args.extra_lamb = 1
    args.dropout_rate = 0.5
    args.top_bn = True
    
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ['CUDA_VISIBLE_DEVICES'] = "2"
    args.data_dir = os.path.join("./dataset/%s" % args.dataset)

    chainer.global_config.cudnn_deterministic = True
    random.seed(args.seed)
    if int(args.gpu) > -1:
        chainer.cuda.get_device(args.gpu).use()
    np.random.seed(args.seed)
    cp.random.seed(args.seed)
    return args

args = parse_args()

# chainer code

In [53]:
def loss_labeled(forward, x, t):
    y = forward(x, update_batch_stats=True)
    L = F.softmax_cross_entropy(y, t)
    return L

def loss_test(forward, x, t):
    logit = forward(x, train=False)
    L, acc = F.softmax_cross_entropy(logit, t).data, F.accuracy(logit, t).data
    return L, acc

def set_framework_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    try:
        import torch
        torch.manual_seed(seed)
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    except ImportError:
        pass
    try:
        import cupy as cp
        cp.random.seed(seed)
    except ImportError:
        pass

In [103]:
def loss_test(forward, x, t):
    logit = forward(x, train=False)
    L, acc = F.softmax_cross_entropy(logit, t).data, F.accuracy(logit, t).data
    return L, acc


def evaluate_classifier(model, test_iter):
    total_acc = 0
    total_loss = 0
    size = 0
    
    criterion = nn.CrossEntropyLoss()
    with chainer.using_config("train", False):
        for images, labels in test_iter:
            size += images.numpy().shape[0]
            images = images.numpy()
            labels = labels.numpy()
            loss, acc = loss_test(model, images, labels)
            total_loss += loss * images.shape[0]
            
            total_acc += acc * images.shape[0]
    
    return total_acc / size, total_loss / size

In [40]:
def call_bn(bn, x, test=False, update_batch_stats=True):
    if test:
        return F.fixed_batch_normalization(x, bn.gamma, bn.beta, bn.avg_mean, bn.avg_var)
    elif not update_batch_stats:
        return F.batch_normalization(x, bn.gamma, bn.beta)
    else:
        return bn(x)
    
class MLP(chainer.Chain):
    def __init__(self, n_outputs=10, dropout_rate=0.5, top_bn=False, dropout=False):
        self.dropout_rate = dropout_rate
        self.top_bn = top_bn
        self.dropout = dropout
        initializer = chainer.initializers.HeUniform(1)
        super(MLP, self).__init__(
            c1=L.Linear(784, 1200, initialW=initializer),
            c2=L.Linear(1200, 1200, initialW=initializer),
            l_cl=L.Linear(1200, n_outputs, initialW=initializer),
            bn1=L.BatchNormalization(1200),
            bn2=L.BatchNormalization(1200),
        )
        if top_bn:
            self.add_link('bn_cl', L.BatchNormalization(n_outputs))

    def __call__(self, x, train=True, update_batch_stats=True):
        h = x
        h = self.c1(h)
        h = F.relu(call_bn(self.bn1, h, test=not train, update_batch_stats=update_batch_stats))
        h = self.c2(h)
        h = F.relu(call_bn(self.bn2, h, test=not train, update_batch_stats=update_batch_stats))
        logit = self.l_cl(h)
        if self.top_bn:
            logit = call_bn(self.bn_cl, logit, test=not train, update_batch_stats=update_batch_stats)
        return logit

In [102]:
def loss_labeled(forward, x, t):
    y = forward(x, update_batch_stats=True)
    print("logits", (y.data ** 2).sum())
    L = F.softmax_cross_entropy(y, t)
    return L

set_framework_seed(1)
debug = False
enc = MLP(n_outputs=args.n_categories, dropout_rate=0, top_bn=True)
if args.gpu > -1:
    print("gpu")
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
set_framework_seed(1)
optimizer = optimizers.Adam(alpha=args.lr)
optimizer.setup(enc)
optimizer.use_cleargrads()
iterator = dataset.dataset.make_one_shot_iterator()

for it in range(5):
    images, labels = iterator.get_next()
    with chainer.using_config("train", True):
        x = images.numpy()
        t = labels.numpy()
        loss_l = loss_labeled(enc, Variable(x), Variable(t))
        x_u = x
        loss_ul, _ = vat_loss(enc, kl_categorical, Variable(x_u), None, epsilon=args.epsilon, xi=1e-6, p_logit=logit.data)
        loss_total = loss_l + loss_ul
        print("it", it, "ce loss", loss_l, "vat loss", loss_ul)
        enc.cleargrads()
#         loss_l.backward()
#         optimizer.update()

acc, loss = evaluate_classifier(enc, test_dataset.dataset.make_one_shot_iterator())
print("test loss", loss, "test acc", acc)
        

logits 1279.9625
True
it 0 ce loss variable(2.6372051) vat loss variable(0.3949525)
logits 1279.9597
True
it 1 ce loss variable(2.447002) vat loss variable(0.59222496)
logits 1279.9594
True
it 2 ce loss variable(2.3731635) vat loss variable(0.60971195)
logits 1279.9573
True
it 3 ce loss variable(2.6826663) vat loss variable(0.62293935)
logits 1279.9609
True
it 4 ce loss variable(2.6122522) vat loss variable(0.6236712)
test loss 2.315154103132395 test acc 0.11057692307692307


In [100]:
set_framework_seed(1)
debug = False
enc = MLP(n_outputs=args.n_categories, dropout_rate=0, top_bn=True)
if args.gpu > -1:
    print("gpu")
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
set_framework_seed(1)
optimizer = optimizers.Adam(alpha=args.lr)
optimizer.setup(enc)
optimizer.use_cleargrads()
iterator = dataset.dataset.make_one_shot_iterator()

for it in range(5):
    images, labels = iterator.get_next()
    with chainer.using_config("train", True):
        x = images.numpy()
        t = labels.numpy()
        loss_l = loss_labeled(enc, Variable(x), Variable(t))
        x_u = x
        loss_ul = 0
        #loss_ul, _ = vat_loss(enc, kl_categorical, Variable(x_u), None, epsilon=args.epsilon, xi=1e-6, p_logit=logit.data)
        loss_total = loss_l + loss_ul
        print("it", it, "ce loss", loss_l, "vat loss", loss_ul)
        enc.cleargrads()
        loss_l.backward()
        optimizer.update()
   
acc, loss = evaluate_classifier(enc, test_dataset.dataset.make_one_shot_iterator())
print("test loss", loss, "test acc", acc)
        

logits 6.67572e-06
it 0 ce loss variable(2.6372051) vat loss 0
logits -6.389618e-05
it 1 ce loss variable(1.0207927) vat loss 0
logits 0.0028562546
it 2 ce loss variable(0.9196422) vat loss 0
logits 0.026527405
it 3 ce loss variable(0.9217887) vat loss 0
logits 0.05603504
it 4 ce loss variable(0.94327796) vat loss 0
test loss 0.9446233698190787 test acc 0.7655248397435898


In [80]:
set_framework_seed(1)
debug = False
enc = MLP(n_outputs=args.n_categories, dropout_rate=0, top_bn=True)
if args.gpu > -1:
    print("gpu")
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
set_framework_seed(1)
optimizer = optimizers.Adam(alpha=args.lr)
optimizer.setup(enc)
optimizer.use_cleargrads()
iterator = dataset.dataset.make_one_shot_iterator()
debug = True
def vat_loss(forward, distance, x, y=None, train=True, epsilon=8.0,
             xi=1e-6, num_iter=1, p_logit=None):
    if p_logit is None:
        p_logit = forward(x, train=train, update_batch_stats=False).data  # unchain
    else:
        assert not isinstance(p_logit, Variable)

    xp = cuda.get_array_module(x.data)
    d = np.random.random(size=x.shape)
    d = get_normalized_vector(d, xp)
    if debug:
        print("d", d.sum(), (d ** 2).sum())
        
    for ip in range(num_iter):
        x_d = Variable(x.data + xi * d.astype(xp.float32))
        if debug:
            print("input", x_d.data.sum())
        p_d_logit = forward(x_d, train=train, update_batch_stats=True)
        if debug:
            print("output", p_d_logit.data.sum())
        kl_loss = distance(p_logit, p_d_logit)
        if debug:
            print("loss", kl_loss)
        kl_loss.backward()
        d = x_d.grad
        d = d / xp.sqrt(xp.sum(d ** 2, axis=tuple(range(1, len(d.shape))), keepdims=True))
        if debug:
            print("d", d.sum())
    x_adv = x + epsilon * d

    p_adv_logit = forward(x_adv, train=train, update_batch_stats=False)
    pos_cost = distance(p_logit, p_adv_logit)
    if debug:
        print("post cost", pos_cost)
    return pos_cost, p_d_logit

for it in range(3):
    images, labels = iterator.get_next()
    with chainer.using_config("train", True):
        x = images.numpy()
        t = labels.numpy()
        loss_l = loss_labeled(enc, Variable(x), Variable(t))
        x_u = x
        loss_ul, _ = vat_loss(enc, kl_categorical, Variable(x_u), None, epsilon=args.epsilon, xi=1e-6, p_logit=logit.data)
        loss_total = loss_l + loss_ul
        print("it", it, "ce loss", loss_l, "vat loss", loss_ul)
#         enc.cleargrads()
#         loss_l.backward()
#         optimizer.update()
   
acc, loss = evaluate_classifier(enc, test_dataset.dataset.make_one_shot_iterator())
print("test loss", loss, "test acc", acc)
        

d 3101.001574004157 127.99999951050698
input 12790.146
output 1.7166138e-05
loss variable(-1.1133411e-08)
d -0.9175085
post cost variable(0.18286765)
it 0 ce loss variable(2.6372051) vat loss variable(0.18286765)
d 3104.2961100598077 127.99999951136697
input 12953.986
output 1.9073486e-06
loss variable(0.77622306)
d -0.025210619
post cost variable(1.2601826)
it 1 ce loss variable(2.447002) vat loss variable(1.2601826)
d 3103.1908931663115 127.99999951057141
input 12442.994
output -1.4781952e-05
loss variable(0.8542405)
d 9.233387
post cost variable(1.3496399)
it 2 ce loss variable(2.3731635) vat loss variable(1.3496399)
test loss 2.316648061458881 test acc 0.1141826923076923


In [96]:
def call_bn(bn, x, test=False, update_batch_stats=True):
    if test:
        return F.fixed_batch_normalization(x, bn.gamma, bn.beta, bn.avg_mean, bn.avg_var)
    elif not update_batch_stats:
        return F.batch_normalization(x, bn.gamma, bn.beta)
    else:
        return bn(x)
    
class MLP(chainer.Chain):
    def __init__(self, n_outputs=10, dropout_rate=0.5, top_bn=False, dropout=False):
        self.dropout_rate = dropout_rate
        self.top_bn = top_bn
        self.dropout = dropout
        initializer = chainer.initializers.HeUniform(1)
        super(MLP, self).__init__(
            c1=L.Linear(784, 1200, initialW=initializer),
            c2=L.Linear(1200, 1200, initialW=initializer),
            l_cl=L.Linear(1200, n_outputs, initialW=initializer),
            bn1=L.BatchNormalization(1200),
            bn2=L.BatchNormalization(1200),
        )
        if top_bn:
            self.add_link('bn_cl', L.BatchNormalization(n_outputs))

    def __call__(self, x, train=True, update_batch_stats=True):
        h = x
        h = self.c1(h)
        h = F.relu(call_bn(self.bn1, h, test=not train, update_batch_stats=update_batch_stats))
        h = self.c2(h)
        h = F.relu(call_bn(self.bn2, h, test=not train, update_batch_stats=update_batch_stats))
        logit = self.l_cl(h)
        if self.top_bn:
            logit = call_bn(self.bn_cl, logit, test=not train, update_batch_stats=update_batch_stats)
        return logit

In [95]:
set_framework_seed(1)
debug = False
enc = MLP(n_outputs=args.n_categories, dropout_rate=0, top_bn=True)

model = enc
print(images.numpy().sum())
x = images.numpy()
p = model(x, train=True, update_batch_stats=True)
print(p.data.sum())
p = model(x, train=True, update_batch_stats=False)
print(p.data.sum())
p = model(x, train=False, update_batch_stats=True)
print(p.data.sum())

p = model(x, train=False, update_batch_stats=False)
print(p.data.sum())

12442.993
2.4318695e-05
2.4318695e-05
68.56414
68.56414


In [85]:
set_framework_seed(1)
debug = False
enc = MLP(n_outputs=args.n_categories, dropout_rate=0, top_bn=True)
if args.gpu > -1:
    print("gpu")
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
set_framework_seed(1)
optimizer = optimizers.Adam(alpha=args.lr)
optimizer.setup(enc)
optimizer.use_cleargrads()
iterator = dataset.dataset.make_one_shot_iterator()
debug = True

def vat_loss(forward, distance, x, y=None, train=True, epsilon=8.0,
             xi=1e-6, num_iter=1, p_logit=None):
    if p_logit is None:
        p_logit = forward(x, train=train, update_batch_stats=False).data  # unchain
    else:
        assert not isinstance(p_logit, Variable)

    xp = cuda.get_array_module(x.data)
    d = np.random.random(size=x.shape)
    d = get_normalized_vector(d, xp)
    if debug:
        print("d", d.sum(), (d ** 2).sum())
    print(train)
    for ip in range(num_iter):
        x_d = Variable(x.data + xi * d.astype(xp.float32))
        if debug:
            print("input", x_d.data.sum())
        p_d_logit = forward(x_d, train=False, update_batch_stats=False)
        if debug:
            print("output", p_d_logit.data.sum())
        kl_loss = distance(p_logit, p_d_logit)
        if debug:
            print("loss", kl_loss)
        kl_loss.backward()
        d = x_d.grad
        if debug:
            print("grad d", d.sum())
        d = d / xp.sqrt(xp.sum(d ** 2, axis=tuple(range(1, len(d.shape))), keepdims=True))
        if debug:
            print("d", d.sum())
    x_adv = x + epsilon * d

    p_adv_logit = forward(x_adv, train=False, update_batch_stats=False)
    pos_cost = distance(p_logit, p_adv_logit)
    if debug:
        print("post cost", pos_cost)
    return pos_cost, p_d_logit

for it in range(3):
    images, labels = iterator.get_next()
    with chainer.using_config("train", True):
        x = images.numpy()
        t = labels.numpy()
        loss_l = loss_labeled(enc, Variable(x), Variable(t))
        x_u = x
        loss_ul, _ = vat_loss(enc, kl_categorical, Variable(x_u), None, epsilon=args.epsilon, xi=1e-6, p_logit=logit.data)
        loss_total = loss_l + loss_ul
        print("it", it, "ce loss", loss_l, "vat loss", loss_ul)
#         enc.cleargrads()
#         loss_l.backward()
#         optimizer.update()
   
acc, loss = evaluate_classifier(enc, test_dataset.dataset.make_one_shot_iterator())
print("test loss", loss, "test acc", acc)
        

d 3101.001574004157 127.99999951050698
True
input 12790.146
output 60.197327
loss variable(0.27454972)
grad d 0.16488276
d 51.43709
post cost variable(0.3949525)
it 0 ce loss variable(2.6372051) vat loss variable(0.3949525)
d 3104.2961100598077 127.99999951136697
True
input 12953.986
output 68.14215
loss variable(0.43572783)
grad d 0.07118307
d 14.237335
post cost variable(0.59222496)
it 1 ce loss variable(2.447002) vat loss variable(0.59222496)
d 3103.1908931663115 127.99999951057141
True
input 12442.994
output 47.62139
loss variable(0.43799925)
grad d 0.0547363
d 14.975924
post cost variable(0.60971195)
it 2 ce loss variable(2.3731635) vat loss variable(0.60971195)
test loss 2.3108466466267905 test acc 0.11207932692307693


In [12]:
def kl_categorical(p_logit, q_logit):
    if isinstance(p_logit, chainer.Variable):
        xp = cuda.get_array_module(p_logit.data)
    else:
        xp = cuda.get_array_module(p_logit)
    p = F.softmax(p_logit)
    # print(p_logit.sum())
    # print(q_logit.data.sum())
    _kl = F.sum(p * (F.log_softmax(p_logit) - F.log_softmax(q_logit)), 1)
    # print(_kl)
    return F.sum(_kl) / xp.prod(xp.array(_kl.shape))

In [66]:
set_framework_seed(1)
debug = False
enc = MLP(n_outputs=args.n_categories, dropout_rate=0, top_bn=True)
if args.gpu > -1:
    print("gpu")
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
set_framework_seed(1)
optimizer = optimizers.Adam(alpha=args.lr)
optimizer.setup(enc)
optimizer.use_cleargrads()
iterator = dataset.dataset.make_one_shot_iterator()

for it in range(5):
    images, labels = iterator.get_next()
    with chainer.using_config("train", True):
        x = images.numpy()
        t = labels.numpy()
        loss_l = loss_labeled(enc, Variable(x), Variable(t))
        x_u = x
        loss_ul, _ = vat_loss(enc, kl_categorical, Variable(x_u), None, epsilon=args.epsilon, xi=1e-6, p_logit=logit.data)
        loss_total = loss_l + loss_ul
        print("it", it, "ce loss", loss_l, "vat loss", loss_ul)
#         enc.cleargrads()
#         loss_l.backward()
#         optimizer.update()
    if (it+1) % 10 == 0:
        
        acc, loss = evaluate_classifier(enc, test_dataset.dataset.make_one_shot_iterator())
        print("test loss", loss, "test acc", acc)
        

it 0 ce loss variable(2.6372051) vat loss variable(0.18520069)
it 1 ce loss variable(2.447002) vat loss variable(1.2601826)
it 2 ce loss variable(2.3731635) vat loss variable(1.3496399)
it 3 ce loss variable(2.6826663) vat loss variable(1.3082993)
it 4 ce loss variable(2.6122522) vat loss variable(1.2677813)


In [61]:
set_framework_seed(1)
debug = False
enc = MLP(n_outputs=args.n_categories, dropout_rate=0, top_bn=True)
if args.gpu > -1:
    print("gpu")
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
set_framework_seed(1)
optimizer = optimizers.Adam(alpha=args.lr)
optimizer.setup(enc)
optimizer.use_cleargrads()
iterator = dataset.dataset.make_one_shot_iterator()
images, labels = iterator.get_next()

for it in range(100):
    
    with chainer.using_config("train", True):
        x = images.numpy()
        t = labels.numpy()
        loss_l = loss_labeled(enc, Variable(x), Variable(t))
        x_u = x
        loss_ul, _ = vat_loss(enc, kl_categorical, Variable(x_u), None, epsilon=args.epsilon, xi=1e-6, p_logit=logit.data)
        loss_total = loss_l + loss_ul
        
        enc.cleargrads()
        loss_l.backward()
        optimizer.update()
    if (it+1) % 10 == 0:
        print("it", it, "ce loss", loss_l, "vat loss", loss_ul)
        acc, loss = evaluate_classifier(enc, test_dataset.dataset.make_one_shot_iterator())
        print("test loss", loss, "test acc", acc)
        

it 9 ce loss variable(0.34254467) vat loss variable(1.3673117)
test loss 1.2030614102498078 test acc 0.7214543269230769
it 19 ce loss variable(0.2949698) vat loss variable(1.4227189)
test loss 1.136353354423474 test acc 0.7433894230769231
it 29 ce loss variable(0.27102554) vat loss variable(1.4771286)
test loss 1.1068920233310797 test acc 0.7440905448717948
it 39 ce loss variable(0.25327155) vat loss variable(1.5224514)
test loss 1.1016949224166381 test acc 0.7450921474358975
it 49 ce loss variable(0.23831564) vat loss variable(1.565478)
test loss 1.1002999788675554 test acc 0.7425881410256411
it 59 ce loss variable(0.22491816) vat loss variable(1.6105677)
test loss 1.0964220601778765 test acc 0.7416866987179487
it 69 ce loss variable(0.21271741) vat loss variable(1.6542144)
test loss 1.0899064196990087 test acc 0.7415865384615384
it 79 ce loss variable(0.20149495) vat loss variable(1.6965376)
test loss 1.0825705895057092 test acc 0.7407852564102564
it 89 ce loss variable(0.19111884) v

In [51]:
set_framework_seed(1)
set_framework_seed(1)
enc = MLP(n_outputs=args.n_categories, dropout_rate=0, top_bn=True)
if args.gpu > -1:
    print("gpu")
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
set_framework_seed(1)
optimizer = optimizers.Adam(alpha=args.lr)
optimizer.setup(enc)
optimizer.use_cleargrads()
iterator = dataset.dataset.make_one_shot_iterator()

for it in range(100):
    images, labels = iterator.get_next()
    with chainer.using_config("train", True):
        x = images.numpy()
        t = labels.numpy()
        loss_l = loss_labeled(enc, Variable(x), Variable(t))
        x_u = x
        loss_ul, _ = vat_loss(enc, kl_categorical, Variable(x_u), None, epsilon=args.epsilon, xi=1e-6, p_logit=logit.data)
        loss_total = loss_l + loss_ul
        print("it", it, "ce loss", loss_l, "vat loss", loss_ul)
        enc.cleargrads()
        loss_l.backward()
        optimizer.update()

d 18.809333635120456
loss variable(-1.0035267e-08)
d 2.8182023
variable(0.18520069)
it 0 ce loss variable(2.6372051) vat loss variable(0.18520069)
d 13.625716652791654
loss variable(1.0147533)
d -10.247404
variable(1.2203554)
it 1 ce loss variable(1.0207927) vat loss variable(1.2203554)
d -6.073275889670836
loss variable(1.0948114)
d 2.128046
variable(1.2472886)
it 2 ce loss variable(0.9196422) vat loss variable(1.2472886)
d -5.657291669940053
loss variable(1.1145282)
d -7.6926003
variable(1.2436831)
it 3 ce loss variable(0.9217887) vat loss variable(1.2436831)
d -0.31942830946959505
loss variable(1.0268389)
d 17.752928
variable(1.159715)
it 4 ce loss variable(0.94327796) vat loss variable(1.159715)
d 15.941850424374195
loss variable(1.0886321)
d -1.6491866
variable(1.1983008)
it 5 ce loss variable(0.84185266) vat loss variable(1.1983008)
d -1.8059021547236176
loss variable(1.1284835)
d 12.763174
variable(1.232116)
it 6 ce loss variable(0.8465243) vat loss variable(1.232116)
d 12.15066

loss variable(1.3605834)
d -5.204693
variable(1.4866124)
it 58 ce loss variable(0.4668119) vat loss variable(1.4866124)
d 5.930736741602792
loss variable(1.4217685)
d -2.96503
variable(1.5363104)
it 59 ce loss variable(0.48291457) vat loss variable(1.5363104)
d -2.0687157650696317
loss variable(1.3750725)
d -20.187246
variable(1.5088575)
it 60 ce loss variable(0.4801722) vat loss variable(1.5088575)
d 1.8124146704867552
loss variable(1.4250214)
d -31.683117
variable(1.5315392)
it 61 ce loss variable(0.50071263) vat loss variable(1.5315392)
d -2.850726191424384
loss variable(1.4569461)
d -34.39248
variable(1.5637801)
it 62 ce loss variable(0.42511448) vat loss variable(1.5637801)
d -11.751381348113794
loss variable(1.3885074)
d -24.86853
variable(1.5128889)
it 63 ce loss variable(0.49216902) vat loss variable(1.5128889)
d 6.378625198477101
loss variable(1.4517913)
d -32.447678
variable(1.5701138)
it 64 ce loss variable(0.5427566) vat loss variable(1.5701138)
d 2.7153277303448657
loss va

In [None]:
np.random.seed(args.seed)
enc = CNN(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=args.top_bn)
if args.gpu:
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
    
with chainer.using_config("train", False):
    logits = enc(Variable(x), train=False, update_batch_stats=False)
    print(logits.shape)
    print(logits.data.sum())

In [None]:
def vat_loss(forward, distance, x, train=True, epsilon=8.0, xi=1e-6, Ip=1, p_logit=None):
    if p_logit is None:
        p_logit = forward(x, train=train, update_batch_stats=False).data  # unchain
    else:
        assert not isinstance(p_logit, Variable)

    xp = cuda.get_array_module(x.data)
    d = xp.random.normal(size=x.shape)
    d = get_normalized_vector(d, xp)
    for ip in range(Ip):
        x_d = Variable(x.data + xi * d.astype(xp.float32))
        p_d_logit = forward(x_d, train=train, update_batch_stats=False)
        kl_loss = distance(p_logit, p_d_logit)
        kl_loss.backward()
        d = x_d.grad
        d = d / xp.sqrt(xp.sum(d ** 2, axis=tuple(range(1, len(d.shape))), keepdims=True))
    x_adv = x + epsilon * d 
    p_adv_logit = forward(x_adv, train=train, update_batch_stats=False)
    return distance(p_logit, p_adv_logit)

In [None]:
set_framework_seed(args.seed)
device = args.device

train_all, test, shape, num_classes = data_set_name(args.dataset)
if args.size == 0:
    args.size = len(train_all)
test_loader = DataLoader(test, 1000, num_workers=3)

train_l = SubsetDataset(train_all, list(range(args.size)))
train_ul = SubsetDataset(train_all, list(range(len(train_all) - 1000)))

print(len(train_l), len(train_ul))

batch_size_l = 32
batch_size_ul = 128

Arch = getattr(models, args.arch)
api_criterion = None

if args.trainer != "none":
    api_criterion = getattr(SemiMode, args.trainer)(args)

set_framework_seed(args.seed)
l_train_iter = iter(DataLoader(train_l, batch_size_l, num_workers=0, sampler=InfiniteSampler(len(train_l))))
ul_train_iter = iter(DataLoader(train_ul, batch_size_ul, num_workers=0, sampler=InfiniteSampler(len(train_ul))))

l_x, l_y = next(l_train_iter)
print("l_y", l_y[:5])
ul_x, ul_y = next(ul_train_iter)

print('Training...')

print("ul_y", ul_y[:5])

In [21]:
class CNN(chainer.Chain):
    def __init__(self, n_outputs=10, dropout_rate=0.5, top_bn=False, dropout=False):
        self.dropout_rate = dropout_rate
        self.top_bn = top_bn
        self.dropout = dropout
        initializer = chainer.initializers.HeUniform(1)
        super(CNN, self).__init__(
            c1=L.Convolution2D(3, 128, ksize=3, stride=1, pad=1, initialW=initializer),
            c2=L.Convolution2D(128, 128, ksize=3, stride=1, pad=1, initialW=initializer),
            c3=L.Convolution2D(128, 128, ksize=3, stride=1, pad=1, initialW=initializer),
            c4=L.Convolution2D(128, 256, ksize=3, stride=1, pad=1, initialW=initializer),
            c5=L.Convolution2D(256, 256, ksize=3, stride=1, pad=1, initialW=initializer),
            c6=L.Convolution2D(256, 256, ksize=3, stride=1, pad=1, initialW=initializer),
            c7=L.Convolution2D(256, 512, ksize=3, stride=1, pad=0, initialW=initializer),
            c8=L.Convolution2D(512, 256, ksize=1, stride=1, pad=0, initialW=initializer),
            c9=L.Convolution2D(256, 128, ksize=1, stride=1, pad=0, initialW=initializer),
            l_cl=L.Linear(128, n_outputs, initialW=initializer),
            bn1=L.BatchNormalization(128),
            bn2=L.BatchNormalization(128),
            bn3=L.BatchNormalization(128),
            bn4=L.BatchNormalization(256),
            bn5=L.BatchNormalization(256),
            bn6=L.BatchNormalization(256),
            bn7=L.BatchNormalization(512),
            bn8=L.BatchNormalization(256),
            bn9=L.BatchNormalization(128),
        )
        if top_bn:
            self.add_link('bn_cl', L.BatchNormalization(n_outputs))

    def __call__(self, x, train=True, update_batch_stats=True):
        h = x
        h = self.c1(h)
        h = F.leaky_relu(call_bn(self.bn1, h, test=not train, update_batch_stats=update_batch_stats), slope=0.1)
        h = self.c2(h)
        h = F.leaky_relu(call_bn(self.bn2, h, test=not train, update_batch_stats=update_batch_stats), slope=0.1)
        h = self.c3(h)
        h = F.leaky_relu(call_bn(self.bn3, h, test=not train, update_batch_stats=update_batch_stats), slope=0.1)
        h = F.max_pooling_2d(h, ksize=2, stride=2)
        if self.dropout:
            h = F.dropout(h, ratio=self.dropout_rate)

        h = self.c4(h)
        h = F.leaky_relu(call_bn(self.bn4, h, test=not train, update_batch_stats=update_batch_stats), slope=0.1)
        h = self.c5(h)
        h = F.leaky_relu(call_bn(self.bn5, h, test=not train, update_batch_stats=update_batch_stats), slope=0.1)
        h = self.c6(h)
        h = F.leaky_relu(call_bn(self.bn6, h, test=not train, update_batch_stats=update_batch_stats), slope=0.1)
        h = F.max_pooling_2d(h, ksize=2, stride=2)
        if self.dropout:
            h = F.dropout(h, ratio=self.dropout_rate)

        h = self.c7(h)
        h = F.leaky_relu(call_bn(self.bn7, h, test=not train, update_batch_stats=update_batch_stats), slope=0.1)
        h = self.c8(h)
        h = F.leaky_relu(call_bn(self.bn8, h, test=not train, update_batch_stats=update_batch_stats), slope=0.1)
        h = self.c9(h)
        h = F.leaky_relu(call_bn(self.bn9, h, test=not train, update_batch_stats=update_batch_stats), slope=0.1)
        h = F.average_pooling_2d(h, ksize=h.data.shape[2])
        logit = self.l_cl(h)
        if self.top_bn:
            logit = call_bn(self.bn_cl, logit, test=not train, update_batch_stats=update_batch_stats)
        return logit

In [38]:
set_framework_seed(1)
# enc = MLP(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
enc = CNN(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False, dropout=False)
if args.gpu > -1:
    print("gpu")
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
set_framework_seed(1)
out = enc(Variable(x), update_batch_stats=True)
print(x.sum(), out.data.sum())
optimizer = optimizers.Adam(alpha=args.lr, beta1=args.mom1)
optimizer.setup(enc)
optimizer.use_cleargrads()

-562.97064 40.848587
