In [1]:
import os
import sys
# Disabled alternative: export the CUDA 9.0 library directory through LD_LIBRARY_PATH.
# os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda-9.0/lib64/'
# NOTE(review): sys.path is the Python *module* search path, so this append only
# influences `import` lookups — confirm that is the intent for a CUDA lib64 directory.
sys.path.append('/usr/local/cuda-9.0/lib64/')

In [1]:
%load_ext autoreload
%autoreload 2

import random
import argparse
import torch.nn as nn
from torch.optim import Adam
import torch.optim as optim
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt

import cupy as cp

import sys, os, time, argparse
import numpy as np
import chainer
import chainer.functions as F
from chainer import Variable, optimizers, cuda, serializers

from chainer_func.source.chainer_functions import loss
from chainer_func.source.data import Data
from chainer_func.source.utils import mkdir_p, load_npz_as_dict
from chainer_func.models import CNN, MLP

from ExpUtils import *

ImportError: No module named torch.nn

In [3]:
def parse_args():
    """Build the experiment configuration and seed the random number generators.

    Despite the name, nothing is read from the command line: every option is a
    hard-coded attribute on an ``argparse.Namespace``.

    Side effects:
        * sets ``CUDA_DEVICE_ORDER`` and ``CUDA_VISIBLE_DEVICES`` in ``os.environ``;
        * sets ``chainer.global_config.cudnn_deterministic = True``;
        * selects the Chainer CUDA device when ``args.gpu`` is non-negative;
        * seeds ``random``, ``numpy.random`` and ``cupy.random`` with ``args.seed``.

    Returns:
        argparse.Namespace: the populated configuration.
    """
    args = argparse.Namespace()
    args.dataset = "cifar10"
    args.trainer = "VATReg"
    args.lr = 0.001
    args.arch = "mlp"
    args.iterations = 1000
    args.seed = 1
    args.size = 100
    args.no_cuda = False

    args.xi = 10
    args.eps = 1
    args.k = 1
    args.use_entmin = False
    args.alpha = 1

    args.gpu = -1
    # The dataset lives under the user's home directory.  expanduser("~") yields
    # the same path as $HOME when that variable is set, but does not raise
    # KeyError on systems where it is missing.
    args.data_dir = os.path.join(os.path.expanduser("~"), "project/data/dataset/%s" % args.dataset)
    args.log_dir = "log"
    args.n_categories = 10
    args.eval_freq = 5
    args.snapshot_freq = 20
    args.aug_flip = False
    args.aug_trans = False
    args.validation = False
    args.dataset_seed = 1
    args.batchsize = 32
    args.batchsize_ul = 128
    args.batchsize_eval = 100
    args.num_epochs = 120
    args.num_iter_per_epoch = 400
    args.epoch_decay_start = 80
    args.mom1 = 0.9
    args.mom2 = 0.5
    args.method = "vat"
    args.epsilon = 3.5
    args.extra_lamb = 1
    args.dropout_rate = 0.5
    args.top_bn = True

    # Restrict which physical GPU is visible to this process.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ['CUDA_VISIBLE_DEVICES'] = "1"

    # Reproducibility: deterministic cuDNN plus one shared seed for every RNG.
    chainer.global_config.cudnn_deterministic = True
    random.seed(args.seed)
    if int(args.gpu) > -1:
        chainer.cuda.get_device(args.gpu).use()
    np.random.seed(args.seed)
    cp.random.seed(args.seed)
    return args

args = parse_args()

# chainer code

In [4]:
from chainer_func import *

In [5]:
def load_dataset(dirpath, valid=False, dataset_seed=1):
    """Load the labeled, unlabeled and test splits stored under ``dirpath``.

    The three ``.npz`` archives are read from ``<dirpath>/seed<dataset_seed>/``;
    when ``valid`` is true the ``*_valid.npz`` variants are read instead.
    Every image array is reshaped to ``(N, 3, 32, 32)`` and cast to float32,
    labels are cast to int32.

    Returns:
        tuple: ``(train_l, train_ul, test)``, each wrapped in ``Data``.
    """
    seed_dir = 'seed' + str(dataset_seed)
    if valid:
        filenames = ('labeled_train_valid.npz', 'unlabeled_train_valid.npz', 'test_valid.npz')
    else:
        filenames = ('labeled_train.npz', 'unlabeled_train.npz', 'test.npz')
    train_l, train_ul, test = [
        load_npz_as_dict(os.path.join(dirpath, seed_dir, filename))
        for filename in filenames
    ]

    if 'mnist' in dirpath:
        # NOTE(review): the returned sets are never used below — confirm whether
        # this call is still needed.
        train_set, test_set = load_mnist_dataset()

    print(train_ul['images'].shape)
    print(train_l['labels'][:20])

    splits = (train_l, train_ul, test)
    for split in splits:
        n_samples = split['images'].shape[0]
        split['images'] = split['images'].reshape(n_samples, 3, 32, 32).astype(np.float32)

    labeled, unlabeled, held_out = [
        Data(split['images'], split['labels'].astype(np.int32)) for split in splits
    ]
    return labeled, unlabeled, held_out


In [6]:
# Load the three splits from the directory / dataset seed configured in `args`.
train_l, train_ul, test = load_dataset(args.data_dir, valid=args.validation, dataset_seed=args.dataset_seed)
# Print the split sizes, then the sum / mean / max of the labeled image array.
print("N_train_labeled:{}, N_train_unlabeled:{}".format(train_l.N, train_ul.N))
print(train_l.data.sum(), train_l.data.mean(), train_l.data.max())

(50000, 3072)
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
N_train_labeled:4000, N_train_unlabeled:50000
-2591.2783 -0.00021087877 24.7316


In [7]:
# Display the label array of the labeled training split.
train_l.label

array([0, 0, 0, ..., 9, 9, 9], dtype=int32)

In [8]:
# Reseed first (set_framework_seed is defined outside this cell) so the batches
# drawn below are repeatable.
set_framework_seed(1)
# One labeled mini-batch (images + labels) ...
x, t = train_l.get(args.batchsize, gpu=args.gpu, aug_trans=args.aug_trans, aug_flip=args.aug_flip)
# ... and one unlabeled mini-batch; its labels are discarded.
x_u, _ = train_ul.get(args.batchsize_ul, gpu=args.gpu, aug_trans=args.aug_trans, aug_flip=args.aug_flip)
print(t)

[0 2 1 5 2 6 7 2 3 2 0 8 2 4 3 5 1 6 3 3 3 2 8 1 7 0 0 6 6 3 3 1]


In [9]:
import math
import chainer.functions as F
import chainer.links as L

class MLP(chainer.Chain):
    """Fully connected classifier with batch normalization after each hidden layer.

    Layout: Linear(3072 -> 1200) -> BN -> ReLU -> Linear(1200 -> 1200) -> BN -> ReLU
    -> Linear(1200 -> n_outputs), followed by one more BN when ``top_bn`` is set.
    ``dropout_rate`` is stored on the instance but not used by the forward pass.
    """

    def __init__(self, n_outputs=10, dropout_rate=0.5, top_bn=False):
        self.dropout_rate = dropout_rate
        self.top_bn = top_bn
        n_inputs, n_hidden = 3072, 1200
        he_uniform = chainer.initializers.HeUniform(1.0)
        # Links are constructed in the same order as before so that seeded
        # weight initialisation consumes the RNG identically.
        super(MLP, self).__init__(
            l_c1=L.Linear(n_inputs, n_hidden, initialW=he_uniform),
            l_c2=L.Linear(n_hidden, n_hidden, initialW=he_uniform),
            l_c3=L.Linear(n_hidden, n_outputs, initialW=he_uniform),
            bn1=L.BatchNormalization(n_hidden),
            bn2=L.BatchNormalization(n_hidden),
        )
        if top_bn:
            self.add_link('bn_cl', L.BatchNormalization(n_outputs))

    def __call__(self, x, train=True, update_batch_stats=True):
        """Return the logits for ``x``.

        ``train`` and ``update_batch_stats`` are forwarded to ``call_bn`` (as
        ``test=not train``) to select the batch-normalization mode.
        """
        bn_mode = dict(test=not train, update_batch_stats=update_batch_stats)

        hidden = self.l_c1(x)
        hidden = F.relu(call_bn(self.bn1, hidden, **bn_mode))
        hidden = self.l_c2(hidden)
        hidden = F.relu(call_bn(self.bn2, hidden, **bn_mode))

        logit = self.l_c3(hidden)
        if not self.top_bn:
            return logit
        return call_bn(self.bn_cl, logit, **bn_mode)

In [10]:
# Reseed, build an MLP without the top batch-norm layer, and display the scale
# parameter (gamma) of its first batch-normalization link.
set_framework_seed(1)
enc = MLP(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
enc.bn1.gamma

variable gamma([1., 1., 1., ..., 1., 1., 1.])

In [11]:
def call_bn(bn, x, test=False, update_batch_stats=True):
    """Apply the batch-normalization link ``bn`` to ``x`` in one of three modes.

    * default (``test`` false, ``update_batch_stats`` true): call the link itself;
    * ``test`` true: ``F.fixed_batch_normalization`` with the link's ``gamma``,
      ``beta``, ``avg_mean`` and ``avg_var``;
    * otherwise: ``F.batch_normalization`` with only the link's ``gamma`` and
      ``beta``, bypassing the link call.

    ``test`` takes precedence over ``update_batch_stats``.
    """
    if not test and update_batch_stats:
        return bn(x)
    if test:
        return F.fixed_batch_normalization(x, bn.gamma, bn.beta, bn.avg_mean, bn.avg_var)
    return F.batch_normalization(x, bn.gamma, bn.beta)

# match MLP results

In [12]:
# Reseed and build a fresh MLP so its initial weights are repeatable.
set_framework_seed(1)
enc = MLP(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
# enc = CNN(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
# Move the model to the configured GPU when one is selected (args.gpu >= 0).
if args.gpu > -1:
    print("gpu")
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
# Print the batch sum and labels; `x` and `t` come from an earlier cell.
print(x.sum())
print(t)
# Reseed again, run one forward pass and print the sum of the output values.
set_framework_seed(1)
te = enc(Variable(x))
print(te.data.sum())

-562.97064
[0 2 1 5 2 6 7 2 3 2 0 8 2 4 3 5 1 6 3 3 3 2 8 1 7 0 0 6 6 3 3 1]
-5.4525614


In [13]:
def loss_labeled(forward, x, t):
    """Supervised loss: softmax cross-entropy of ``forward(x)`` against ``t``.

    ``forward`` is invoked with ``update_batch_stats=True``.
    """
    logits = forward(x, update_batch_stats=True)
    return F.softmax_cross_entropy(logits, t)

In [14]:
class CNN(chainer.Chain):
    """Nine-layer convolutional classifier with batch norm and leaky ReLU.

    Three stages of three convolutions each (128, 256, then 512/256/128 output
    channels).  The first two stages end with 2x2 max pooling and, when
    ``dropout`` is true, dropout with ratio ``dropout_rate``.  The last stage is
    followed by average pooling over the full feature map and a linear layer to
    ``n_outputs`` logits, plus one more batch norm when ``top_bn`` is set.
    """

    def __init__(self, n_outputs=10, dropout_rate=0.5, top_bn=False, dropout=False):
        self.dropout_rate = dropout_rate
        self.top_bn = top_bn
        self.dropout = dropout
        he_uniform = chainer.initializers.HeUniform(1.0)

        def conv3x3(n_in, n_out, pad=1):
            return L.Convolution2D(n_in, n_out, ksize=3, stride=1, pad=pad, initialW=he_uniform)

        def conv1x1(n_in, n_out):
            return L.Convolution2D(n_in, n_out, ksize=1, stride=1, pad=0, initialW=he_uniform)

        # Links are constructed in the same order as before so that seeded
        # weight initialisation consumes the RNG identically.
        super(CNN, self).__init__(
            c1=conv3x3(3, 128),
            c2=conv3x3(128, 128),
            c3=conv3x3(128, 128),
            c4=conv3x3(128, 256),
            c5=conv3x3(256, 256),
            c6=conv3x3(256, 256),
            c7=conv3x3(256, 512, pad=0),
            c8=conv1x1(512, 256),
            c9=conv1x1(256, 128),
            l_cl=L.Linear(128, n_outputs, initialW=he_uniform),
            bn1=L.BatchNormalization(128),
            bn2=L.BatchNormalization(128),
            bn3=L.BatchNormalization(128),
            bn4=L.BatchNormalization(256),
            bn5=L.BatchNormalization(256),
            bn6=L.BatchNormalization(256),
            bn7=L.BatchNormalization(512),
            bn8=L.BatchNormalization(256),
            bn9=L.BatchNormalization(128),
        )
        if top_bn:
            self.add_link('bn_cl', L.BatchNormalization(n_outputs))

    def __call__(self, x, train=True, update_batch_stats=True):
        """Return the logits for ``x``.

        ``train`` and ``update_batch_stats`` are forwarded to ``call_bn`` (as
        ``test=not train``) to select the batch-normalization mode.
        """
        is_test = not train

        def run_stage(h, layers):
            # conv -> batch norm -> leaky ReLU for each (conv, bn) pair, in order.
            for conv, bn in layers:
                h = conv(h)
                normed = call_bn(bn, h, test=is_test, update_batch_stats=update_batch_stats)
                h = F.leaky_relu(normed, slope=0.1)
            return h

        def pool_then_dropout(h):
            h = F.max_pooling_2d(h, ksize=2, stride=2)
            if self.dropout:
                h = F.dropout(h, ratio=self.dropout_rate)
            return h

        h = run_stage(x, ((self.c1, self.bn1), (self.c2, self.bn2), (self.c3, self.bn3)))
        h = pool_then_dropout(h)

        h = run_stage(h, ((self.c4, self.bn4), (self.c5, self.bn5), (self.c6, self.bn6)))
        h = pool_then_dropout(h)

        h = run_stage(h, ((self.c7, self.bn7), (self.c8, self.bn8), (self.c9, self.bn9)))
        # Pooling window equals the size of axis 2 of the feature map.
        h = F.average_pooling_2d(h, ksize=h.data.shape[2])

        logit = self.l_cl(h)
        if not self.top_bn:
            return logit
        return call_bn(self.bn_cl, logit, test=is_test, update_batch_stats=update_batch_stats)

In [15]:
# Reseed and build a fresh CNN (dropout disabled) so its initial weights are repeatable.
set_framework_seed(1)
# enc = MLP(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
enc = CNN(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False, dropout=False)
# Move the model to the configured GPU when one is selected (args.gpu >= 0).
if args.gpu > -1:
    print("gpu")
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
# Reseed again, run one forward pass on `x` (from an earlier cell) and print
# the input sum next to the output sum.
set_framework_seed(1)
out = enc( Variable(x), update_batch_stats=True)
print(x.sum(), out.data.sum())

-562.97064 40.848587


In [16]:
def loss_test(forward, x, t):
    """Evaluate ``forward`` on ``x`` with ``train=False``.

    Returns:
        tuple: ``(loss, accuracy)`` — the ``.data`` of the softmax cross-entropy
        and of the accuracy of the logits against ``t``.
    """
    logit = forward(x, train=False)
    cross_entropy = F.softmax_cross_entropy(logit, t)
    accuracy = F.accuracy(logit, t)
    return cross_entropy.data, accuracy.data

# CNN without dropout

The results are very close; the remaining difference is caused by numerical error/precision.

In [26]:
# Reseed before building the model so weight initialisation is repeatable.
set_framework_seed(1)

enc = CNN(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False, dropout=False)
# Move the model to the configured GPU when one is selected (args.gpu >= 0).
if args.gpu > -1:
    print("gpu")
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
# Adam with learning rate args.lr and first-moment decay args.mom1.
optimizer = optimizers.Adam(alpha=args.lr, beta1=args.mom1)
optimizer.setup(enc)
optimizer.use_cleargrads()
# Reseed the framework and the labeled data source before drawing batches.
set_framework_seed(1)
train_l.reseed()
# Ten supervised updates; each prints the batch sum and the loss.
for it in range(10):
    with chainer.using_config("train", True):
        x, t = train_l.get(args.batchsize, gpu=args.gpu, aug_trans=args.aug_trans, aug_flip=args.aug_flip)
        
        loss_l = loss_labeled(enc, Variable(x), Variable(t))
        print(x.sum(), loss_l)
        # Clear old gradients, backpropagate, then apply the optimizer step.
        enc.cleargrads()
        loss_l.backward()
        optimizer.update()


-562.97064 variable(2.260621)
-239.57237 variable(2.737069)
2.0764809 variable(2.6666842)
126.50333 variable(2.240642)
31.462202 variable(2.0930262)
-653.71985 variable(2.2371545)
-206.56343 variable(2.0542524)
-199.04095 variable(2.359859)
173.35344 variable(2.3512278)
-129.28891 variable(2.3139074)


In [23]:
# Evaluate `enc` on the whole test split with Chainer's train flag off.
with chainer.using_config("train", False):
    acc_test_sum = 0
    test_x, test_t = test.get()
    N_test = test_x.shape[0]
    # Walk the test set in chunks of args.batchsize_eval samples.
    for i in range(0, N_test, args.batchsize_eval):
        x = test_x[i:i + args.batchsize_eval]
        t = test_t[i:i + args.batchsize_eval]
        if args.gpu > -1:
            x, t = cuda.to_gpu(x, device=args.gpu), cuda.to_gpu(t, device=args.gpu)
        _, acc = loss_test(enc, Variable(x), Variable(t))
        # Weight each chunk's accuracy by its size so a short final chunk is handled correctly.
        acc_test_sum += acc * x.shape[0]
    # Overall accuracy across all test samples.
    print(acc_test_sum / N_test)

0.10249999959021806


In [24]:
# Reseed before building the model so weight initialisation is repeatable.
set_framework_seed(1)
# enc = MLP(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False)
enc = CNN(n_outputs=args.n_categories, dropout_rate=args.dropout_rate, top_bn=False, dropout=True)
# Move the model to the configured GPU when one is selected (args.gpu >= 0).
if args.gpu > -1:
    print("gpu")
    chainer.cuda.get_device(args.gpu).use()
    enc.to_gpu()
# Adam with learning rate args.lr and first-moment decay args.mom1.
optimizer = optimizers.Adam(alpha=args.lr, beta1=args.mom1)
optimizer.setup(enc)
optimizer.use_cleargrads()
# Reseed the framework and the labeled data source before drawing batches.
set_framework_seed(1)
train_l.reseed()
# Ten supervised updates with dropout enabled; each prints the batch sum and the loss.
for it in range(10):
    # Reseed at the start of every iteration, using the iteration index as the seed.
    set_framework_seed(it % 10000)
    with chainer.using_config("train", True):
        x, t = train_l.get(args.batchsize, gpu=args.gpu, aug_trans=args.aug_trans, aug_flip=args.aug_flip)
        
        loss_l = loss_labeled(enc, Variable(x), Variable(t))
        print(x.sum(), loss_l)
        # Clear old gradients, backpropagate, then apply the optimizer step.
        enc.cleargrads()
        loss_l.backward()
        optimizer.update()
        

-562.97064 variable(2.270143)
-239.57237 variable(2.5744624)
2.0764809 variable(2.8344507)
126.50333 variable(2.2541957)
31.462202 variable(2.2864416)
-653.71985 variable(2.2899199)
-206.56343 variable(2.2821083)
-199.04095 variable(2.5041566)
173.35344 variable(2.545939)
-129.28891 variable(2.350461)


In [25]:
# Evaluate `enc` on the whole test split with Chainer's train flag off.
with chainer.using_config("train", False):
    acc_test_sum = 0
    test_x, test_t = test.get()
    N_test = test_x.shape[0]
    # Walk the test set in chunks of args.batchsize_eval samples.
    for i in range(0, N_test, args.batchsize_eval):
        x = test_x[i:i + args.batchsize_eval]
        t = test_t[i:i + args.batchsize_eval]
        if args.gpu > -1:
            x, t = cuda.to_gpu(x, device=args.gpu), cuda.to_gpu(t, device=args.gpu)
        _, acc = loss_test(enc, Variable(x), Variable(t))
        # Weight each chunk's accuracy by its size so a short final chunk is handled correctly.
        acc_test_sum += acc * x.shape[0]
    # Overall accuracy across all test samples.
    print(acc_test_sum / N_test)

0.10439999988302588
