In [1]:
import os
import sys

# Alternative kept for reference (disabled):
# os.environ['LD_LIBRARY_PATH'] = '/usr/local/cuda-9.0/lib64/'
# Some frameworks sometimes require the CUDA 9.0 library directory to be reachable.
# NOTE(review): this appends to sys.path, i.e. Python's module search path; the path is
# hard-coded and machine-specific — confirm it is still needed on the target machine.
sys.path.append('/usr/local/cuda-9.0/lib64/')

In [2]:
%load_ext autoreload
%autoreload 2

import os
import sys
import time
import torch
import argparse
import numpy as np

import torch.nn as nn
import torch.optim as optim
from torch.optim import Adam
import matplotlib.pyplot as plt
import torch.nn.functional as nfunc
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset

import torch_func.models as models

from torch_func.Subset import SubsetDataset
from torch_func.sampler import InfiniteSampler
from torch_func.TrainerUtils import weights_init
from torch_func.evaluate import evaluate_classifier
from torch_func.data_loader import load_dataset_semi, load_data_set
from ExpUtils import *

In [3]:
def parse_args():
    """Build the experiment configuration as an ``argparse.Namespace``.

    No command line is parsed: every option is hard-coded below so the notebook
    can be run without CLI arguments.

    Side effects:
        * Sets ``CUDA_DEVICE_ORDER`` and ``CUDA_VISIBLE_DEVICES`` in ``os.environ``.
        * Sets the process-wide cuDNN flags ``deterministic=True`` and
          ``benchmark=False``.

    Returns:
        argparse.Namespace: the options, plus the derived fields ``data_dir``,
        ``cuda`` and ``device``.
    """
    args = argparse.Namespace()
    args.dataset = "cifar10"
    args.lr = 0.001
    args.arch = "mlp"
    args.iterations = 1000
    args.seed = 1
    args.size = 4000
    args.no_cuda = True

    args.xi = 1e-6
    args.eps = 1
    args.k = 1
    args.use_entmin = False
    args.alpha = 1

    args.gpu_id = 1
    # Derived from the dataset name (an earlier hard-coded SVHN path was always overwritten).
    args.data_dir = "./dataset/%s" % args.dataset
    args.log_dir = "log"
    args.n_categories = 10
    args.eval_freq = 5
    args.snapshot_freq = 20
    args.aug_flip = False
    args.aug_trans = False
    args.validation = False
    args.dataset_seed = 1
    args.batchsize = 32
    args.batchsize_ul = 128
    args.batchsize_eval = 100
    args.num_epochs = 120
    args.num_iter_per_epoch = 400
    args.epoch_decay_start = 80
    args.mom1 = 0.9
    args.mom2 = 0.5
    args.method = "vat"
    args.epsilon = 3.5
    args.dropout_rate = 0.5
    args.top_bn = True

    # Expose only the selected GPU, numbered in PCI bus order.
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu_id)
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Very important for deterministic results (slower). Benchmark mode must be
    # off: with it on, cuDNN auto-tunes and may pick a different convolution
    # algorithm from run to run, which defeats the deterministic flag.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    args.device = torch.device("cuda" if args.cuda else "cpu")
    return args

# PyTorch code

## Load data

The datasets can be downloaded with the script in the `dataset` directory.

In [4]:
# Build the hard-coded configuration and seed the frameworks before loading any data.
args = parse_args()
set_framework_seed(args.seed)
device = args.device

# train_set_o, test_set_o, shape, num_classes = load_data_set(args.dataset)
# load_dataset_semi returns five values; the second one is discarded here
# (presumably the unlabeled split — TODO confirm against torch_func.data_loader).
train_set, _, test, shape, num_classes = load_dataset_semi(args.dataset)
# Evaluation loader: batches of 256, three worker processes.
test_loader = DataLoader(test, 256, num_workers=3)
# Keep only the first args.size indices as the labeled training set.
train_set = SubsetDataset(train_set, list(range(args.size)))

# Sanity checks: labeled-set size, and a checksum of the first tensor of
# train_set.dataset (presumably the wrapped, un-subsetted dataset — TODO confirm).
print("N_train_labeled:{}".format(len(train_set)))
print(train_set.dataset.tensors[0].sum())

N_train_labeled:4000
tensor(-2591.2605)


In [5]:
# NOTE(review): duplicates args.batchsize (also 32) set in parse_args.
batch_size = 32
# Re-seed so the first sampled batch is reproducible.
set_framework_seed(1)
# Iterator over labeled batches, drawn through InfiniteSampler and loaded in the main process.
train_iter = iter(DataLoader(train_set, batch_size, num_workers=0, sampler=InfiniteSampler(len(train_set))))
# Draw one batch; x and y are reused by later cells.
x, y = next(train_iter)

In [6]:
y

tensor([0, 2, 1, 5, 2, 6, 7, 2, 3, 2, 0, 8, 2, 4, 3, 5, 1, 6, 3, 3, 3, 2, 8, 1,
        7, 0, 0, 6, 6, 3, 3, 1])

## Initialize the model

PyTorch uses its own initialization library, which differs from NumPy's.

So I prefer to initialize the weights with a NumPy random function.

In [7]:
def call_bn(bn, x, update_batch_stats=True):
    """Apply the batch-norm layer ``bn`` to ``x``, optionally bypassing its running statistics.

    Args:
        bn: a batch-norm module exposing ``training``, ``weight``, ``bias``,
            ``momentum`` and ``eps``.
        x: input tensor passed to the normalisation.
        update_batch_stats: when falsy and ``bn`` is in training mode, the input
            is normalised with the functional ``batch_norm`` without handing it
            the layer's running-mean/variance buffers.

    Returns:
        The normalised tensor.
    """
    # The module's own forward is used in eval mode and whenever stats may be updated.
    if bn.training is False or update_batch_stats:
        return bn(x)
    # Training mode with stats frozen: batch statistics only, no running buffers.
    return nfunc.batch_norm(
        x,
        running_mean=None,
        running_var=None,
        weight=bn.weight,
        bias=bn.bias,
        training=True,
        momentum=bn.momentum,
        eps=bn.eps,
    )


class FullyNet(nn.Module):
    """Fully connected classifier: two hidden layers of 1200 units, each followed by batch norm and ReLU.

    Args:
        n_class: number of output logits.
        n_ch: number of input channels.
        res: input resolution; the flattened input length is ``n_ch * res * res``.
    """

    def __init__(self, n_class, n_ch, res):
        super().__init__()
        hidden_units = 1200
        self.input_len = n_ch * res * res

        # Linear layers first, then the batch-norm layers (registration order is kept).
        self.fc1 = nn.Linear(self.input_len, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, n_class)

        self.bn_fc1 = nn.BatchNorm1d(hidden_units, eps=2e-5)
        self.bn_fc2 = nn.BatchNorm1d(hidden_units, eps=2e-5)

    def forward(self, x, update_batch_stats=True):
        """Return the logits for ``x``; ``update_batch_stats`` is forwarded to ``call_bn``."""
        flat = x.view(-1, self.input_len)

        hidden = call_bn(self.bn_fc1, self.fc1(flat), update_batch_stats)
        hidden = nfunc.relu(hidden)

        hidden = call_bn(self.bn_fc2, self.fc2(hidden), update_batch_stats)
        hidden = nfunc.relu(hidden)

        return self.fc3(hidden)

In [8]:
# Rebuild the configuration (this rebinds the notebook-global `args`).
args = parse_args()
print(args.arch)
# NOTE(review): `model` is looked up but not used in the cells visible here.
model = getattr(models, args.arch)
# Seed before constructing the network so its initial weights are reproducible.
set_framework_seed(args.seed)
# FullyNet(n_class, n_ch, res): shape[0] is passed as the channel count, shape[1] as the resolution.
cls = FullyNet(num_classes, shape[0], shape[1]).to(device)
# Re-initialise every sub-module with the project's weights_init.
cls.apply(weights_init)
# NOTE(review): second .to(device); the model was already moved two lines above.
cls = cls.to(device)
cls.train()

mlp


FullyNet(
  (fc1): Linear(in_features=3072, out_features=1200, bias=True)
  (fc2): Linear(in_features=1200, out_features=1200, bias=True)
  (fc3): Linear(in_features=1200, out_features=10, bias=True)
  (bn_fc1): BatchNorm1d(1200, eps=2e-05, momentum=0.1, affine=True, track_running_stats=True)
  (bn_fc2): BatchNorm1d(1200, eps=2e-05, momentum=0.1, affine=True, track_running_stats=True)
)

In [9]:
# Forward the batch sampled earlier (x, y) through the freshly initialised MLP and
# print checksums of the input, the labels and the output.
set_framework_seed(1)
print(x.sum())
print(y)
output = cls(x.to(args.device))
print(output.sum())

tensor(-562.9708)
tensor([0, 2, 1, 5, 2, 6, 7, 2, 3, 2, 0, 8, 2, 4, 3, 5, 1, 6, 3, 3, 3, 2, 8, 1,
        7, 0, 0, 6, 6, 3, 3, 1])
tensor(-5.4526, grad_fn=<SumBackward0>)


## CNN model with/without dropout

Verify the outputs of the CNN model with and without dropout layers.

1. Without dropout: there is a small gap between PyTorch and Chainer, probably caused by numerical precision (my guess).
2. With dropout: an obvious difference...

In [10]:
class CNN(nn.Module):
    """Nine-convolution classifier with batch norm and leaky ReLU (slope 0.1) after every convolution.

    Args:
        input_shape: input shape tuple; only ``input_shape[0]`` (the channel
            count) is read.
        num_conv: base number of convolution channels. The last convolution
            always outputs 128 channels, matching the 128-input linear head.
        top_bn: if True, the 10 output logits are passed through a
            ``BatchNorm1d(10)``.
        dropout: if True, ``Dropout2d`` (default probability) is applied after
            each of the two max-pooling stages.
    """

    def __init__(self, input_shape=(3, 32, 32), num_conv=128, top_bn=False, dropout=False):
        super(CNN, self).__init__()
        self.dropout = dropout
        self.c1 = nn.Conv2d(input_shape[0], num_conv, 3, 1, 1)
        self.c2 = nn.Conv2d(num_conv, num_conv, 3, 1, 1)
        self.c3 = nn.Conv2d(num_conv, num_conv, 3, 1, 1)
        self.c4 = nn.Conv2d(num_conv, num_conv * 2, 3, 1, 1)
        self.c5 = nn.Conv2d(num_conv * 2, num_conv * 2, 3, 1, 1)
        self.c6 = nn.Conv2d(num_conv * 2, num_conv * 2, 3, 1, 1)
        self.c7 = nn.Conv2d(num_conv * 2, num_conv * 4, 3, 1, 0)
        self.c8 = nn.Conv2d(num_conv * 4, num_conv * 2, 1, 1, 0)
        self.c9 = nn.Conv2d(num_conv * 2, 128, 1, 1, 0)
        self.bn1 = nn.BatchNorm2d(num_conv, eps=2e-5)
        self.bn2 = nn.BatchNorm2d(num_conv, eps=2e-5)
        self.bn3 = nn.BatchNorm2d(num_conv, eps=2e-5)
        self.bn4 = nn.BatchNorm2d(num_conv * 2, eps=2e-5)
        self.bn5 = nn.BatchNorm2d(num_conv * 2, eps=2e-5)
        self.bn6 = nn.BatchNorm2d(num_conv * 2, eps=2e-5)
        self.bn7 = nn.BatchNorm2d(num_conv * 4, eps=2e-5)
        self.bn8 = nn.BatchNorm2d(num_conv * 2, eps=2e-5)
        # c9 always emits 128 channels, so bn9 must be sized 128 rather than
        # num_conv (the two only coincide for the default num_conv=128).
        self.bn9 = nn.BatchNorm2d(128, eps=2e-5)
        # Not used by forward(); kept registered so the parameter order and
        # state_dict keys stay the same as before.
        self.bnf = nn.BatchNorm2d(128, eps=2e-5)
        self.mp1 = nn.MaxPool2d(2, 2)
        self.mp2 = nn.MaxPool2d(2, 2)
        self.drop1 = nn.Dropout2d()
        self.drop2 = nn.Dropout2d()
        self.aap = nn.AdaptiveAvgPool2d((1, 1))
        self.linear = nn.Linear(128, 10)

        self.top_bn = top_bn
        if top_bn:
            # Batch norm over the 10 logits produced by self.linear.
            self.bn = nn.BatchNorm1d(10, eps=2e-5)

    def _conv_bn_lrelu(self, conv, bn, h, update_batch_stats):
        """Apply ``conv``, then ``bn`` through ``call_bn``, then leaky ReLU with slope 0.1."""
        h = call_bn(bn, conv(h), update_batch_stats=update_batch_stats)
        return nfunc.leaky_relu(h, negative_slope=0.1)

    def forward(self, x, update_batch_stats=True):
        """Return the logits for ``x``; ``update_batch_stats`` is forwarded to every ``call_bn``."""
        h = x
        # Stage 1: three 3x3 convolutions, max-pool, optional dropout.
        h = self._conv_bn_lrelu(self.c1, self.bn1, h, update_batch_stats)
        h = self._conv_bn_lrelu(self.c2, self.bn2, h, update_batch_stats)
        h = self._conv_bn_lrelu(self.c3, self.bn3, h, update_batch_stats)
        h = self.mp1(h)
        if self.dropout:
            h = self.drop1(h)

        # Stage 2: three 3x3 convolutions, max-pool, optional dropout.
        h = self._conv_bn_lrelu(self.c4, self.bn4, h, update_batch_stats)
        h = self._conv_bn_lrelu(self.c5, self.bn5, h, update_batch_stats)
        h = self._conv_bn_lrelu(self.c6, self.bn6, h, update_batch_stats)
        h = self.mp2(h)
        if self.dropout:
            h = self.drop2(h)

        # Stage 3: one unpadded 3x3 and two 1x1 convolutions, then global average pooling.
        h = self._conv_bn_lrelu(self.c7, self.bn7, h, update_batch_stats)
        h = self._conv_bn_lrelu(self.c8, self.bn8, h, update_batch_stats)
        h = self._conv_bn_lrelu(self.c9, self.bn9, h, update_batch_stats)
        h = self.aap(h)
        output = self.linear(h.view(-1, 128))
        if self.top_bn:
            # The logits are 2-D (N, 10): normalise them with the BatchNorm1d(10)
            # created for this purpose, not the 2-D, 128-feature self.bnf.
            output = call_bn(self.bn, output, update_batch_stats=update_batch_stats)
        return output

### without dropout

In [22]:
# Fresh, seeded batch iterator so this run sees the same batches as the dropout run.
set_framework_seed(1)
train_iter = iter(DataLoader(train_set, batch_size, num_workers=0, sampler=InfiniteSampler(len(train_set))))

# CNN without dropout; re-seed before weights_init so the initial weights are reproducible.
cls = CNN(dropout=False)
set_framework_seed(1)
cls.apply(weights_init)
cls = cls.to(args.device)
optimizer = Adam(list(cls.parameters()), lr=args.lr)

cls.train()
set_framework_seed(1)
# Ten supervised steps; print the input checksum and the loss of each step.
for it in range(10):
    # batch norm needs a fixed seed (author's note; the seed is set just above the loop)
    x, y = next(train_iter)
    # NOTE(review): the loss module is re-created on every iteration; it could be built once before the loop.
    criterion = nn.CrossEntropyLoss()
    logit = cls(x.to(args.device))
    # NOTE(review): uses the notebook-global `device` here but `args.device` above.
    loss = criterion(logit, y.to(device))
    print(x.sum(), loss.item())
    optimizer.zero_grad()
    loss.backward() 
    optimizer.step()

tensor(-562.9708) 2.2606215476989746
tensor(-239.5726) 2.7371633052825928
tensor(2.0767) 2.6680123805999756
tensor(126.5036) 2.248326063156128
tensor(31.4626) 2.090791940689087
tensor(-653.7198) 2.224613666534424
tensor(-206.5633) 2.0617425441741943
tensor(-199.0407) 2.3420562744140625
tensor(173.3533) 2.3304195404052734
tensor(-129.2892) 2.2965545654296875


In [23]:
# Evaluate the model trained in the previous cell on the test loader; the displayed
# tuple is whatever evaluate_classifier returns (meaning of its fields: TODO confirm).
evaluate_classifier(cls, test_loader, args.device)

(8990, 10.803810766601563)

In [26]:
# Same procedure as the no-dropout run above, but with dropout=True.
# NOTE(review): this cell is a copy of the previous training cell; a shared helper
# taking `dropout` as a parameter would remove the duplication.
set_framework_seed(1)
train_iter = iter(DataLoader(train_set, batch_size, num_workers=0, sampler=InfiniteSampler(len(train_set))))

# CNN with dropout; re-seed before weights_init so the initial weights are reproducible.
cls = CNN(dropout=True)
set_framework_seed(1)
cls.apply(weights_init)
cls = cls.to(args.device)
optimizer = Adam(list(cls.parameters()), lr=args.lr)

cls.train()
set_framework_seed(1)
# Ten supervised steps; print the input checksum and the loss of each step.
for it in range(10):
    # batch norm needs a fixed seed (author's note; the seed is set just above the loop)
    x, y = next(train_iter)
    # NOTE(review): the loss module is re-created on every iteration; it could be built once before the loop.
    criterion = nn.CrossEntropyLoss()
    logit = cls(x.to(args.device))
    # NOTE(review): uses the notebook-global `device` here but `args.device` above.
    loss = criterion(logit, y.to(device))
    print(x.sum(), loss.item())
    optimizer.zero_grad()
    loss.backward() 
    optimizer.step()

tensor(-562.9708) 2.4522294998168945
tensor(-239.5726) 2.507972478866577
tensor(2.0767) 2.7254555225372314
tensor(126.5036) 2.5243442058563232
tensor(31.4626) 2.5457208156585693
tensor(-653.7198) 2.2350082397460938
tensor(-206.5633) 2.480289936065674
tensor(-199.0407) 2.640108823776245
tensor(173.3533) 2.641848087310791
tensor(-129.2892) 2.6940395832061768


In [25]:
# Evaluate the dropout model on the test loader.
# NOTE(review): this cell's execution count (25) is lower than the training cell's (26),
# so the stored output may predate the latest training run — re-run the cells in order.
evaluate_classifier(cls, test_loader, args.device)

(8973, 19.94179396057129)