# Load data with TensorFlow

Import PyTorch and TensorFlow.

Set GPU visibility and TensorFlow's GPU memory usage.

In [1]:
import os

import numpy as np
import tensorflow as tf
import tensorflow.contrib.eager as tfe

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as nfunc
from torch.nn.parameter import Parameter

# GPU id(s) exposed to both frameworks through CUDA_VISIBLE_DEVICES.
# The empty string exposes no GPU, so everything below runs on the CPU.
gpu = ""

# Environment variables are written before either framework is asked to use
# CUDA. NOTE(review): torch and tensorflow are already imported at this point;
# this presumably works because CUDA is initialised lazily -- confirm.
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = gpu

# An empty `gpu` string is falsy, so PyTorch falls back to "cpu".
device = torch.device("cuda" if gpu else "cpu")

# TensorFlow GPU options: grow allocations on demand and cap the process at a
# 0.3 fraction of GPU memory, leaving room for PyTorch in the same process.
tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True
tf_config.gpu_options.per_process_gpu_memory_fraction = 0.3

# Eager mode is what lets later cells call `.numpy()` on TF tensors.
tf.enable_eager_execution(tf_config)

In [2]:
%load_ext autoreload
%autoreload 2

from tf_func import data_loader
from tf_func import mnist_model


class ConfigDict(object):
    """MNIST configuration.

    Every option has a default and can be overridden by keyword at
    construction time, e.g. ``ConfigDict(batch_norm=False, dropout_rate=0.3)``.
    Building a fresh instance per experiment avoids mutating one shared
    object across notebook cells.

    :param overrides: option names mapped to replacement values.
    :raises TypeError: if an override names an option that does not exist.
    """

    def __init__(self, **overrides):
        self.num_classes = 10

        # List of (kernel_size, number of filters) tuples, one per conv layer.
        self.filter_sizes_conv_layers = [(5, 32), (5, 64)]
        # Pooling parameters: type ("max"/"average"), size and stride.
        self.pool_params = {"type": "max", "size": 2, "stride": 2}
        self.num_units_fc_layers = [512]
        self.dropout_rate = 0
        self.batch_norm = True
        self.activation = None
        self.regularizer = None

        # Apply overrides last and reject typos instead of silently creating
        # a new, unused attribute.
        for name, value in overrides.items():
            if not hasattr(self, name):
                raise TypeError("ConfigDict got an unknown option %r" % name)
            setattr(self, name, value)


config = ConfigDict()

In [3]:
# Build the train and test MNIST loaders from the same settings; only the
# subset name differs. Both use is_training=False.
# NOTE(review): presumably this keeps the batch order deterministic -- confirm
# against tf_func.data_loader.
dataset, test_dataset = (
    data_loader.MNIST(
        data_dir="./data/mnist",
        subset=subset_name,
        batch_size=128,
        is_training=False)
    for subset_name in ("train", "test")
)

In [4]:
# Dataset-level attributes exposed by the loader.
images = dataset.images
labels = dataset.labels
num_examples = dataset.num_examples
num_classes = dataset.num_classes

# `images` / `labels` are rebound here to the batch returned by get_next();
# the sum of that batch's pixels is displayed as a quick checksum.
images, labels = dataset.get_next()
images.numpy().sum()

12790.145

# PyTorch

In [5]:
def weights_init(m):
    """
    Re-initialise a Conv*/Linear module with Kaiming (He) uniform weights
    and set its bias to 0.

    Meant to be used as ``model.apply(weights_init)``. Weights are drawn with
    NumPy from U(-s, s), s = sqrt(6 / fan_in), so seeding ``np.random`` makes
    the result reproducible. ``fan_in`` is the product of every weight
    dimension except the first, which covers Linear and Conv1d/2d/3d alike.
    The values are copied into the existing parameter, so its device, dtype
    and identity are preserved. Modules whose class name contains neither
    "Conv" nor "Linear" are left untouched.

    When a module-level ``debug`` flag exists and is truthy, the weight shape
    and the sum of the sampled values are printed.

    :param m: the module visited by ``nn.Module.apply``
    :return: None
    """
    class_name = m.__class__.__name__
    if "Conv" not in class_name and "Linear" not in class_name:
        return

    shape = tuple(m.weight.shape)
    fan_in = int(np.prod(shape[1:]))
    if not fan_in:
        return

    s = 1.0 * np.sqrt(6.0 / fan_in)
    values = np.random.uniform(-s, s, shape).astype("float32")
    # Look the flag up lazily so the function also works when the notebook
    # never defined `debug`.
    if globals().get("debug", False):
        print(m.weight.data.shape, values.sum())
    with torch.no_grad():
        m.weight.copy_(torch.from_numpy(values))
    if m.bias is not None:
        m.bias.data.zero_()

In [6]:
def evaluate_classifier(model, test_iter, device):
    """Compute accuracy and mean cross-entropy loss of `model` over `test_iter`.

    The model is switched to eval mode for the duration of the call and put
    back into whatever mode it was in before, even if evaluation raises.

    :param model: module whose forward returns ``(logits, endpoints)``
    :param test_iter: iterable of ``(images, labels)`` pairs whose items expose
        ``.numpy()``; images are permuted with (0, 3, 1, 2) before the forward
        pass (assumed channels-last input -- confirm against the data loader)
    :param device: torch device the batches are moved to
    :return: ``(accuracy, mean_loss)``, both averaged over all examples
    :raises ValueError: if `test_iter` yields no examples
    """
    was_training = model.training
    criterion = nn.CrossEntropyLoss()
    total_acc = 0
    total_loss = 0
    size = 0

    model.eval()
    try:
        with torch.no_grad():
            for images, labels in test_iter:
                batch = torch.FloatTensor(images.numpy()).permute(0, 3, 1, 2).to(device)
                targets = torch.LongTensor(labels.numpy()).to(device)
                batch_size = batch.shape[0]
                logits, _ = model(batch)
                # criterion returns the batch mean; re-weight by batch size so
                # a smaller final batch does not skew the overall average.
                total_loss += criterion(logits, targets).item() * batch_size
                pred_y = torch.max(logits, dim=1)[1]
                total_acc += (pred_y == targets).sum().item()
                size += batch_size
    finally:
        model.train(was_training)

    if size == 0:
        raise ValueError("evaluate_classifier received no examples")
    return total_acc / size, total_loss / size

In [7]:
# Epsilon passed to every BatchNorm layer built in this block.
EPS = 1e-5
# Decay of the running statistics; the PyTorch layers below receive 1 - MOMENTUM.
MOMENTUM = 0.9


def _debug_enabled():
    """Return the module-level ``debug`` flag, or False if it was never defined."""
    return bool(globals().get("debug", False))


class CNN3(nn.Module):
    """Two conv layers followed by two fully connected layers.

    The layer sizes are fixed in code (1 -> 32 -> 64 channels, 5x5 kernels,
    64*7*7 -> 512 -> 10). From `config` only ``batch_norm`` and
    ``dropout_rate`` are read, once, at construction time.

    Each conv stage applies conv -> ReLU -> 2x2 max-pool -> optional dropout
    -> optional batch norm. Conv layers carry a bias only when batch norm is
    disabled. The ``bn*`` / ``drop_*`` sub-modules exist only when the
    corresponding option is enabled.

    When a module-level ``debug`` flag is truthy, forward() prints checksums
    of the intermediate activations.
    """

    def __init__(self, config):
        super(CNN3, self).__init__()
        self.config = config
        self.batch_norm = config.batch_norm
        self.dropout_rate = config.dropout_rate

        use_conv_bias = not self.batch_norm
        self.conv1 = nn.Conv2d(1, 32, kernel_size=5, stride=(1, 1),
                               padding=2, bias=use_conv_bias)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=5, stride=(1, 1),
                               padding=2, bias=use_conv_bias)
        self.fc1 = nn.Linear(64 * 7 * 7, 512)
        self.fc2 = nn.Linear(512, 10)

        if self.config.dropout_rate > 0:
            self.drop_conv1 = nn.Dropout2d(self.config.dropout_rate)
            self.drop_conv2 = nn.Dropout2d(self.config.dropout_rate)
            # NOTE(review): fixed 0.1 rate regardless of config.dropout_rate --
            # confirm this is intended.
            self.drop_fc1 = nn.Dropout(0.1)

        if self.batch_norm:
            self.bn1 = nn.BatchNorm2d(32, eps=EPS, momentum=1-MOMENTUM, affine=False)
            self.bn2 = nn.BatchNorm2d(64, eps=EPS, momentum=1-MOMENTUM, affine=False)

    @staticmethod
    def _apply_batch_norm(bn_layer, x, verbose):
        """Run `bn_layer` on `x`, printing sum(x**2) before and after if `verbose`."""
        if verbose:
            print("before batech norm %.4f" % (x ** 2).sum().item())
        x = bn_layer(x)
        if verbose:
            print("after batech norm %.4f" % (x ** 2).sum().item())
        return x

    def forward(self, images):
        """Return ``(logits, endpoints)`` for a batch of images.

        :param images: tensor fed straight into conv1 (which expects one
            input channel); fc1 expects the second pooled map to flatten to
            64 * 7 * 7 values per example
        :return: the logits and a dict with the activations stored under
            "conv_layer0", "conv_layer1", "fc_layer0" and "logits"
        """
        verbose = _debug_enabled()
        endpoints = {}
        x = images

        # Conv Layer 1
        x = nfunc.relu(self.conv1(x))
        x = nfunc.max_pool2d(x, 2, stride=2)
        if verbose:
            print("after pool", "%.4f" % x.sum().item())
        if self.dropout_rate > 0:
            x = self.drop_conv1(x)
        if self.batch_norm:
            x = self._apply_batch_norm(self.bn1, x, verbose)
        endpoints["conv_layer0"] = x

        # Conv Layer 2
        x = nfunc.max_pool2d(nfunc.relu(self.conv2(x)), 2, stride=2)
        if self.dropout_rate > 0:
            x = self.drop_conv2(x)
        if self.batch_norm:
            x = self._apply_batch_norm(self.bn2, x, verbose)
        if verbose:
            print("After two conv %.4f" % (x ** 2).sum().item())
        endpoints["conv_layer1"] = x

        # Flatten with the channel axis moved last, so the flattened feature
        # order is (height, width, channel).
        x = x.permute(0, 2, 3, 1).contiguous().view(x.shape[0], -1)

        # fully connect layer 1
        x = nfunc.relu(self.fc1(x))
        if self.dropout_rate > 0:
            x = self.drop_fc1(x)
        if verbose:
            print("logits %.4f" % (x ** 2).sum().item())
        endpoints["fc_layer0"] = x

        # fully connect layer logit
        x = self.fc2(x)
        if verbose:
            print("logits %.4f" % (x ** 2).sum().item())
        endpoints["logits"] = x
        return x, endpoints

# CrossEntropy

Plain cross-entropy training works fine for both TF and PyTorch.

This run uses neither BatchNorm nor Dropout.

In [8]:
print("pytorch, cross entropy")

# Baseline run: neither batch norm nor dropout. Both options are set
# explicitly so the result does not depend on what other cells left in
# `config`.
config.batch_norm = False
config.dropout_rate = 0

# `debug` is on only while the weights are initialised, so weights_init
# prints one checksum per layer.
debug = True
model = CNN3(config)
np.random.seed(1)
model.apply(weights_init)
model = model.to(device)
model.train()
debug = False

max_iters = 30
lr = 0.01
momentum = 0.9
torch_optimizer = optim.SGD(model.parameters(), lr, momentum=momentum, nesterov=True)

# Per-example losses are kept and averaged explicitly below.
criterion = nn.CrossEntropyLoss(reduction='none')
iterator = dataset.dataset.make_one_shot_iterator()

for i in range(max_iters):
    images, labels = iterator.get_next()
    print("data %.5f" % images.numpy().sum())
    images = torch.FloatTensor(images.numpy()).permute(0, 3, 1, 2).to(device)
    labels = torch.LongTensor(labels.numpy()).to(device)

    # Build model.
    logits, endpoints = model(images)
    loss_list = criterion(logits, labels)
    xent_loss = torch.mean(loss_list)
    total_loss = xent_loss

    print("iter %d,  train loss %.5f\n" % (i, total_loss.item()))
    torch_optimizer.zero_grad()
    total_loss.backward()
    torch_optimizer.step()

acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator(), device)
print("test acc %.5f,  loss %.5f" % (acc, loss))

pytorch, cross entropy
torch.Size([32, 1, 5, 5]) 4.01563
torch.Size([64, 32, 5, 5]) -4.576685
torch.Size([512, 3136]) -19.584167
torch.Size([10, 512]) 1.3087051
data 12790.14453
iter 0,  train loss 2.61532

data 12953.98438
iter 1,  train loss 2.45566

data 12442.99316
iter 2,  train loss 2.16393

data 12292.56152
iter 3,  train loss 1.94132

data 12402.40137
iter 4,  train loss 1.68689

data 13600.25098
iter 5,  train loss 1.51943

data 13450.91211
iter 6,  train loss 1.19969

data 12739.25195
iter 7,  train loss 1.10128

data 11608.33594
iter 8,  train loss 1.11012

data 14801.60449
iter 9,  train loss 0.83975

data 15733.70703
iter 10,  train loss 0.82863

data 12621.97754
iter 11,  train loss 0.66821

data 12366.39551
iter 12,  train loss 0.54461

data 12358.09375
iter 13,  train loss 0.44116

data 12827.04590
iter 14,  train loss 0.43031

data 14613.22363
iter 15,  train loss 0.49216

data 13533.24316
iter 16,  train loss 0.40719

data 13346.56543
iter 17,  train loss 0.39415

dat

# CrossEntropy

with Batch Norm

In [9]:
iterator = dataset.dataset.make_one_shot_iterator()
images, labels = iterator.get_next()

print("input data %.5f" % images.numpy().sum())
with torch.no_grad():
    images = torch.FloatTensor(images.numpy()).permute(0, 3, 1, 2).to(device)

    # PyTorch's `momentum` is the weight given to the NEW batch statistic
    # (running = (1 - momentum) * running + momentum * batch), i.e. the
    # complement of a decay of MOMENTUM. Use 1 - MOMENTUM here, the same
    # value CNN3 passes to its BatchNorm layers. The training-mode output
    # printed below does not depend on this value.
    bn_layer = nn.BatchNorm2d(1, eps=EPS, momentum=1 - MOMENTUM).to(device)
    p = bn_layer(images)
    print("sum of output**2\n", (p.cpu().numpy() ** 2).sum())

    # To match TF batch norm, set affine=False: the output is then the plain
    # normalised tensor, with no learnable scale and shift applied.
    bn_layer = nn.BatchNorm2d(1, eps=EPS, momentum=1 - MOMENTUM, affine=False).to(device)
    p = bn_layer(images)
    print("sum of output**2\n", (p.cpu().numpy() ** 2).sum())

input data 12790.14453
sum of output**2
 49262.445
sum of output**2
 100341.16


## A model


In [10]:
print("pytorch, cross entropy, batch norm")
# torch.backends.cudnn.deterministic = True
device = torch.device("cpu")

# Batch norm only. Dropout is switched off explicitly so this check does not
# depend on what other cells left in `config`.
config.batch_norm = True
config.dropout_rate = 0

model = CNN3(config)
np.random.seed(1)
model.apply(weights_init)
model = model.to(device)
model.train()

# Enabled after initialisation so that only forward() prints its checksums.
# NOTE: `debug` stays True when this cell finishes.
debug = True

iterator = dataset.dataset.make_one_shot_iterator()

images, labels = iterator.get_next()
print("data %.5f" % images.numpy().sum())
images = torch.FloatTensor(images.numpy()).permute(0, 3, 1, 2).to(device)
labels = torch.LongTensor(labels.numpy()).to(device)

# Build model.
logits, endpoints = model(images)
print(logits.sum().item())

pytorch, cross entropy, batch norm
data 12790.14453
after pool 145856.7188
before batech norm 142664.3438
after batech norm 802661.1250
before batech norm 907971.3125
after batech norm 401403.5000
After two conv 401403.5000
logits 62439.2461
logits 2373.4065
-203.366943359375


# Cross entropy with a model that contains batch norm

In [11]:
print("pytorch, cross entropy, batch norm")

device = torch.device("cpu")

# Batch norm only. Dropout is switched off explicitly so the run does not
# depend on what other cells left in `config`.
config.batch_norm = True
config.dropout_rate = 0

# `debug` is on only while the weights are initialised, so weights_init
# prints one checksum per layer regardless of the flag's previous value.
debug = True
model = CNN3(config)
np.random.seed(1)
model.apply(weights_init)
model = model.to(device)
model.train()
debug = False

# Accuracy and loss of the freshly initialised model.
acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator(), device)
print("test acc %.5f,  loss %.5f" % (acc, loss))

lr = 0.01
momentum = 0.9
torch_optimizer = optim.SGD(model.parameters(), lr, momentum=momentum, nesterov=True)
max_iters = 30

# Per-example losses are kept and averaged explicitly below.
criterion = nn.CrossEntropyLoss(reduction='none')
iterator = dataset.dataset.make_one_shot_iterator()

for i in range(max_iters):
    images, labels = iterator.get_next()
    print("data %.5f" % images.numpy().sum())
    images = torch.FloatTensor(images.numpy()).permute(0, 3, 1, 2).to(device)
    labels = torch.LongTensor(labels.numpy()).to(device)

    # Build model.
    logits, endpoints = model(images)
    loss_list = criterion(logits, labels)
    xent_loss = torch.mean(loss_list)
    total_loss = xent_loss

    print("iter %d,  train loss %.5f\n" % (i, total_loss.item()))
    torch_optimizer.zero_grad()
    total_loss.backward()
    torch_optimizer.step()

acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator(), device)
print("test acc %.5f,  loss %.5f" % (acc, loss))

pytorch, cross entropy, batch norm
torch.Size([32, 1, 5, 5]) 4.01563
torch.Size([64, 32, 5, 5]) -4.576685
torch.Size([512, 3136]) -19.584167
torch.Size([10, 512]) 1.3087051
test acc 0.12790,  loss 2.59772
data 12790.14453
iter 0,  train loss 3.03514

data 12953.98438
iter 1,  train loss 2.31430

data 12442.99316
iter 2,  train loss 1.12174

data 12292.56152
iter 3,  train loss 0.81504

data 12402.40137
iter 4,  train loss 0.87731

data 13600.25098
iter 5,  train loss 0.68717

data 13450.91211
iter 6,  train loss 0.56344

data 12739.25195
iter 7,  train loss 0.48809

data 11608.33594
iter 8,  train loss 0.60272

data 14801.60449
iter 9,  train loss 0.37551

data 15733.70703
iter 10,  train loss 0.46466

data 12621.97754
iter 11,  train loss 0.24003

data 12366.39551
iter 12,  train loss 0.26288

data 12358.09375
iter 13,  train loss 0.28607

data 12827.04590
iter 14,  train loss 0.15964

data 14613.22363
iter 15,  train loss 0.26158

data 13533.24316
iter 16,  train loss 0.23480

data 1

# Dropout

single dropout layer

In [19]:
# Fetch the first training batch and move its last axis to position 1
# (the channels-first layout used by the PyTorch layers in this notebook).
iterator = dataset.dataset.make_one_shot_iterator()
images, labels = iterator.get_next()
print("data %.5f" % images.numpy().sum())
images = torch.FloatTensor(images.numpy()).permute(0, 3, 1, 2).to(device)

# Module form of 2-D dropout.
torch.manual_seed(1)
drop_layer = nn.Dropout2d(p=0.5)
p = drop_layer(images)
print("sum of output**2\n", np.square(p.cpu().numpy()).sum())

# Functional form, re-seeded with the same value so both variants can be
# compared on identical random state.
torch.manual_seed(1)
p = nfunc.dropout2d(images, p=0.5)
print("sum of output**2\n", np.square(p.cpu().numpy()).sum())

data 12790.14453
sum of output**2
 22012.12
sum of output**2
 22012.12


In [12]:
print("pytorch, cross entropy, dropout")

device = torch.device("cpu")

config.batch_norm = False
config.dropout_rate = 0.3

# Keep the per-layer debug prints off for the whole cell, whatever the flag
# was before.
debug = False

model = CNN3(config)
# NumPy drives the weight initialisation; torch drives the dropout masks.
# Seed both so the training losses below are reproducible.
np.random.seed(1)
torch.manual_seed(1)
model.apply(weights_init)
model = model.to(device)
model.train()

# Accuracy and loss of the freshly initialised model.
acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator(), device)
print("test acc %.5f,  loss %.5f" % (acc, loss))

lr = 0.01
momentum = 0.9
torch_optimizer = optim.SGD(model.parameters(), lr, momentum=momentum, nesterov=True)
max_iters = 30

# Per-example losses are kept and averaged explicitly below.
criterion = nn.CrossEntropyLoss(reduction='none')
iterator = dataset.dataset.make_one_shot_iterator()

for i in range(max_iters):
    images, labels = iterator.get_next()
    print("data %.5f" % images.numpy().sum())
    images = torch.FloatTensor(images.numpy()).permute(0, 3, 1, 2).to(device)
    labels = torch.LongTensor(labels.numpy()).to(device)

    # Build model.
    logits, endpoints = model(images)
    loss_list = criterion(logits, labels)
    xent_loss = torch.mean(loss_list)
    total_loss = xent_loss

    print("iter %d,  train loss %.5f\n" % (i, total_loss.item()))
    torch_optimizer.zero_grad()
    total_loss.backward()
    torch_optimizer.step()

acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator(), device)
print("test acc %.5f,  loss %.5f" % (acc, loss))

pytorch, cross entropy, dropout
test acc 0.12790,  loss 2.59773
data 12790.14453
iter 0,  train loss 3.05590

data 12953.98438
iter 1,  train loss 2.85969

data 12442.99316
iter 2,  train loss 2.33658

data 12292.56152
iter 3,  train loss 2.31932

data 12402.40137
iter 4,  train loss 2.06880

data 13600.25098
iter 5,  train loss 1.89253

data 13450.91211
iter 6,  train loss 1.69860

data 12739.25195
iter 7,  train loss 1.63169

data 11608.33594
iter 8,  train loss 1.53965

data 14801.60449
iter 9,  train loss 1.38021

data 15733.70703
iter 10,  train loss 1.36266

data 12621.97754
iter 11,  train loss 1.18539

data 12366.39551
iter 12,  train loss 1.02797

data 12358.09375
iter 13,  train loss 0.82358

data 12827.04590
iter 14,  train loss 0.79694

data 14613.22363
iter 15,  train loss 0.86224

data 13533.24316
iter 16,  train loss 0.70383

data 13346.56543
iter 17,  train loss 0.63417

data 12810.09473
iter 18,  train loss 0.70275

data 13308.19238
iter 19,  train loss 0.62492

data 1

In [13]:
print("pytorch, cross entropy, batch norm and dropout")

device = torch.device("cpu")

config.batch_norm = True
config.dropout_rate = 0.3

# Keep the per-layer debug prints off for the whole cell, whatever the flag
# was before.
debug = False

model = CNN3(config)
# NumPy drives the weight initialisation; torch drives the dropout masks.
# Seed both so the training losses below are reproducible.
np.random.seed(1)
torch.manual_seed(1)
model.apply(weights_init)
model = model.to(device)
model.train()

# Accuracy and loss of the freshly initialised model.
acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator(), device)
print("test acc %.5f,  loss %.5f" % (acc, loss))

lr = 0.01
momentum = 0.9
torch_optimizer = optim.SGD(model.parameters(), lr, momentum=momentum, nesterov=True)
max_iters = 30

# Per-example losses are kept and averaged explicitly below.
criterion = nn.CrossEntropyLoss(reduction='none')
iterator = dataset.dataset.make_one_shot_iterator()

for i in range(max_iters):
    images, labels = iterator.get_next()
    print("data %.5f" % images.numpy().sum())
    images = torch.FloatTensor(images.numpy()).permute(0, 3, 1, 2).to(device)
    labels = torch.LongTensor(labels.numpy()).to(device)

    # Build model.
    logits, endpoints = model(images)
    loss_list = criterion(logits, labels)
    xent_loss = torch.mean(loss_list)
    total_loss = xent_loss

    print("iter %d,  train loss %.5f\n" % (i, total_loss.item()))
    torch_optimizer.zero_grad()
    total_loss.backward()
    torch_optimizer.step()

acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator(), device)
print("test acc %.5f,  loss %.5f" % (acc, loss))

pytorch, cross entropy, batch norm and dropout
test acc 0.12790,  loss 2.59772
data 12790.14453
iter 0,  train loss 3.23084

data 12953.98438
iter 1,  train loss 2.54534

data 12442.99316
iter 2,  train loss 2.26509

data 12292.56152
iter 3,  train loss 1.87439

data 12402.40137
iter 4,  train loss 1.86594

data 13600.25098
iter 5,  train loss 1.56637

data 13450.91211
iter 6,  train loss 1.40208

data 12739.25195
iter 7,  train loss 1.21994

data 11608.33594
iter 8,  train loss 1.15521

data 14801.60449
iter 9,  train loss 0.96120

data 15733.70703
iter 10,  train loss 1.05848

data 12621.97754
iter 11,  train loss 0.67963

data 12366.39551
iter 12,  train loss 0.68146

data 12358.09375
iter 13,  train loss 0.56572

data 12827.04590
iter 14,  train loss 0.55992

data 14613.22363
iter 15,  train loss 0.57170

data 13533.24316
iter 16,  train loss 0.48861

data 13346.56543
iter 17,  train loss 0.49517

data 12810.09473
iter 18,  train loss 0.56066

data 13308.19238
iter 19,  train loss 