# Load data by tensorflow

import torch and tensorflow

set memory usage

In [1]:
import os

import numpy as np
import tensorflow as tf

gpu = ""

os.environ["TF_CPP_MIN_LOG_LEVEL"] = '2'
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ['CUDA_VISIBLE_DEVICES'] = gpu

tf_config = tf.ConfigProto()
tf_config.gpu_options.allow_growth = True
tf_config.gpu_options.per_process_gpu_memory_fraction = 0.3

tf.enable_eager_execution(tf_config)

In [2]:
%load_ext autoreload
%autoreload 2

from tf_func import data_loader
from tf_func import mnist_model


class ConfigDict(object):
    """MNIST configration."""

    def __init__(self):
        self.num_classes = 10

        # List of tuples specify (kernel_size, number of filters) for each layer.
        self.filter_sizes_conv_layers = [(5, 32), (5, 64)]
        # Dictionary of pooling type ("max"/"average", size and stride).
        self.pool_params = {"type": "max", "size": 2, "stride": 2}
        self.num_units_fc_layers = [512]
        self.dropout_rate = 0
        self.batch_norm = True
        self.activation = None
        self.regularizer = None
        
        
config = ConfigDict()

In [3]:
dataset = data_loader.MNIST(
    data_dir="./data/mnist",
    subset="train",
    batch_size=128,
    is_training=False)

test_dataset = data_loader.MNIST(
    data_dir="./data/mnist",
    subset="test",
    batch_size=128,
    is_training=False)

In [4]:
images, labels, num_examples, num_classes = (dataset.images, dataset.labels, dataset.num_examples, dataset.num_classes)
images, labels = dataset.get_next()
images.numpy().sum()

12790.145

# tensorflow (eager)

In [5]:
def weight_init_tf(shape):
    fan_in = 0
    if len(shape) == 4:
        fan_in = shape[1] * shape[2] * shape[3]
    if len(shape) == 2:
        fan_in = shape[1]
    if fan_in:
        s = 1.0 * np.sqrt(6.0 / fan_in)
        transpose = np.random.uniform(-s, s, shape).astype("float32")
    if len(shape) == 2:
        transpose = transpose.T
    if len(shape) == 4:
        transpose = np.transpose(transpose, axes=(2, 3, 1, 0))
    if debug:
        print(shape, transpose.sum())
    return transpose

In [6]:
def evaluate_classifier(model, test_iter):
    total_acc = 0
    total_loss = 0
    size = 0
    for images, labels in test_iter:
        size += images.numpy().shape[0]
        logits, _ = model(images, is_training=False)
        total_loss += tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)).numpy()
        top1_op = tf.nn.in_top_k(logits, labels, 1)
        total_acc += tf.reduce_sum(tf.cast(top1_op, dtype=tf.float32)).numpy()
    return total_acc / size, total_loss / size

In [7]:
EPS = 1e-5
MOMENTUM = 0.9
def pool2d_layer(inputs, pool_type, pool_size=2, pool_stride=2):
    """Pooling layer.

    Args:
      inputs: Tensor of size [batch, H, W, channels].
      pool_type: String ("max", or "average"), specifying pooling type.
      pool_size: Integer > 1 pooling size.
      pool_stride: Integer > 1 pooling stride.

    Returns:
      Pooling result.
    """
    if pool_type == "max":
        # Max pooling layer
        return tf.layers.max_pooling2d(
            inputs, pool_size=[pool_size] * 2, strides=pool_stride)

    elif pool_type == "average":
        # Average pooling layer
        return tf.layers.average_pooling2d(
            inputs, pool_size=[pool_size] * 2, strides=pool_stride)
    

class MNISTNetwork(tf.keras.Model):
    """MNIST model. """

    def __init__(self, config):
        super(MNISTNetwork, self).__init__()
        self.num_classes = config.num_classes
        self.var_list = []
        self.init_ops = None
        self.regularizer = config.regularizer
        self.activation = config.activation
        self.filter_sizes_conv_layers = config.filter_sizes_conv_layers
        self.num_units_fc_layers = config.num_units_fc_layers
        self.pool_params = config.pool_params
        self.dropout_rate = config.dropout_rate
        self.batch_norm = config.batch_norm
        self.conv_layers = []
        self.bn_layers = []
        self.drop_layers = []
        in_channel = 1
        for i, filter_size in enumerate(self.filter_sizes_conv_layers):
            f_size = filter_size[0]
            conv_layer = tf.layers.Conv2D(kernel_size=filter_size[0], filters=filter_size[1], 
                                          strides=(1, 1), padding="same",
                                          use_bias=not self.batch_norm,
                                          kernel_initializer=tf.constant_initializer(
                                              (weight_init_tf((filter_size[1], in_channel, f_size, f_size)))))
            self.conv_layers.append(conv_layer)
            
            batch_norm_layer = tf.layers.BatchNormalization(momentum=MOMENTUM, epsilon=EPS)
            self.bn_layers.append(batch_norm_layer)
            in_channel = filter_size[1]
            if self.dropout_rate > 0:
                drop_layer = tf.layers.Dropout(self.dropout_rate)
                self.drop_layers.append(drop_layer)
            
        self.fc_layers = []
        in_shape = 64 * 7 * 7
        for i, num_units in enumerate(self.num_units_fc_layers):
            fc_layer = tf.layers.Dense(num_units,
                                       kernel_initializer=tf.constant_initializer((weight_init_tf((num_units, in_shape)))),)
            self.fc_layers.append(fc_layer)
            in_shape = num_units
            if self.dropout_rate > 0:
                self.drop_layers.append(tf.layers.Dropout(0.1))
        self.output_layer = tf.layers.Dense(self.num_classes, activation=None,
                                            kernel_initializer=tf.constant_initializer((weight_init_tf((self.num_classes, in_shape)))),)

    def __call__(self, images, is_training=False):
        """Builds model."""
        endpoints = {}
        net = images
        for i in range(len(self.filter_sizes_conv_layers)):
            layer_suffix = "layer%d" % i
            net = self.conv_layers[i](net)
            net = tf.nn.relu(net)
            if self.pool_params:
                net = pool2d_layer(net, pool_type=self.pool_params["type"], pool_size=self.pool_params["size"]
                                   , pool_stride=self.pool_params["stride"])
            if debug: print("after pool", "%.4f" % net.numpy().sum())
            if self.dropout_rate > 0:
                net = tf.layers.dropout(net, rate=self.dropout_rate, training=is_training)
                
            if self.batch_norm:
                if debug:
                    print("before batech norm %.4f" % (net.numpy() ** 2).sum())
                # net = tf.layers.batch_normalization(net, training=is_training, epsilon=EPS, momentum=MOMENTUM)
                net = self.bn_layers[i](net, training=is_training)
                if debug:
                    print("after batech norm %.4f" % (net.numpy() ** 2).sum())

            endpoints["conv_" + layer_suffix] = net
        if debug:
            print("After two conv %.4f" % (net ** 2).numpy().sum())
        net = tf.layers.flatten(net)

        for i in range(len(self.num_units_fc_layers)):
            layer_suffix = "layer%d" % i
            net = self.fc_layers[i](net)
            net = tf.nn.relu(net)
            endpoints["fc_" + layer_suffix] = net

        logits = self.output_layer(net)
        endpoints["logits"] = logits
        if is_training and debug:
            print("logits %.4f" % (logits.numpy() ** 2).sum())
        return logits, endpoints


# CrossEntropy

It works fine for TF and PyTorch

without BatchNorm and Dropout

In [8]:
print("tf, cross entropy, without batch norm")
debug = True
config.batch_norm = False
config.dropout_rate = 0
np.random.seed(1)
model = MNISTNetwork(config)

lr = 0.01
momentum = 0.9
debug = False
max_iters = 30

iterator = dataset.dataset.make_one_shot_iterator()
optimizer = tf.train.MomentumOptimizer(lr, momentum=momentum, use_nesterov=True)

for i in range(max_iters):
    images, labels = iterator.get_next()
    print("data %.5f" % images.numpy().sum())
    with tf.GradientTape(persistent=True) as tp:
        # Build model.
        logits, endpoints = model(images, is_training=True)
        loss_list = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
        xent_loss = tf.reduce_mean(loss_list)
        total_loss = xent_loss
    grads = tp.gradient(total_loss, model.variables)
    optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())
    
    print("iter %d, train loss %.5f\n" % (i, total_loss))
    
debug = False
acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator())
print("test acc %.5f,  loss %.5f" % (acc, loss))

tf, cross entropy, without batch norm
(32, 1, 5, 5) 4.01563
(64, 32, 5, 5) -4.576685
(512, 3136) -19.584167
(10, 512) 1.3087051
data 12790.14453
iter 0, train loss 2.61532

data 12953.98438
iter 1, train loss 2.45566

data 12442.99316
iter 2, train loss 2.16393

data 12292.56152
iter 3, train loss 1.94132

data 12402.40137
iter 4, train loss 1.68689

data 13600.25098
iter 5, train loss 1.51943

data 13450.91211
iter 6, train loss 1.19969

data 12739.25195
iter 7, train loss 1.10128

data 11608.33594
iter 8, train loss 1.11012

data 14801.60449
iter 9, train loss 0.83975

data 15733.70703
iter 10, train loss 0.82863

data 12621.97754
iter 11, train loss 0.66821

data 12366.39551
iter 12, train loss 0.54460

data 12358.09375
iter 13, train loss 0.44114

data 12827.04590
iter 14, train loss 0.43033

data 14613.22363
iter 15, train loss 0.49218

data 13533.24316
iter 16, train loss 0.40717

data 13346.56543
iter 17, train loss 0.39431

data 12810.09473
iter 18, train loss 0.48483

data 133

# CrossEntropy

It works fine for TF and PyTorch

with BatchNorm

## single layer batch norm

data input

In [9]:
iterator = dataset.dataset.make_one_shot_iterator()
images, labels = iterator.get_next()

print("data %.5f" % images.numpy().sum())
bn_layer = tf.layers.BatchNormalization(axis=-1, momentum=MOMENTUM, epsilon=EPS)
print("first images\n", images[0].numpy().sum())

p = bn_layer(images, training=True)
print("sum of output**2\n", (p.cpu().numpy() ** 2).sum())

p = tf.layers.batch_normalization(images, training=True, momentum=MOMENTUM, epsilon=EPS)
print("sum of output**2\n", (p.cpu().numpy() ** 2).sum())


data 12790.14453
first images
 107.94118
sum of output**2
 100341.695
sum of output**2
 100341.695


## A model


In [11]:
config.batch_norm = True
np.random.seed(1)
model = MNISTNetwork(config)

iterator = dataset.dataset.make_one_shot_iterator()
global_step = tf.train.get_or_create_global_step()

images, labels = iterator.get_next()
print("data %.5f" % images.numpy().sum())
debug = True
# Build model.

with tf.GradientTape(persistent=True) as tp:
    tp.watch(images)
    logits, endpoints = model(images, is_training=True)
    m = tf.reduce_sum(logits)
print(logits.numpy().sum())

grads = tp.gradient(m, model.variables)
print(len(grads))
print([e.numpy().sum() for e in grads if e is not None])
grads = tp.gradient(m, images)
print(grads.numpy().shape, grads.numpy().sum())

data 12790.14453
after pool 145856.7031
before batech norm 142664.3594
after batech norm 802636.1250
after pool 404656.3438
before batech norm 907941.3125
after batech norm 401403.4688
After two conv 401403.4688
logits 2373.4243
-203.36768
14
[-1253.7639, -39865.3, -0.0032806396, 62.726154, -203.36748, -241.92593, -9245.387, -173.44833, 359090.5, 1280.0]
(128, 28, 28, 1) -41.07355


# Cross Entropy with model contains batch norm

In [13]:
print("tf, cross entropy, batch norm")

config.batch_norm = True
np.random.seed(1)
model = MNISTNetwork(config)

lr = 0.01
momentum = 0.9

debug = False
max_iters = 30
acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator())
print("test acc %.5f,  loss %.5f" % (acc, loss))

iterator = dataset.dataset.make_one_shot_iterator()
optimizer = tf.train.MomentumOptimizer(lr, momentum=momentum, use_nesterov=True)

for i in range(max_iters):
    images, labels = iterator.get_next()
    print("data %.5f" % images.numpy().sum())
    with tf.GradientTape() as tp:
        # Build model.
        logits, endpoints = model(images, is_training=True)
        loss_list = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
        xent_loss = tf.reduce_mean(loss_list)
        total_loss = xent_loss
    grads = tp.gradient(total_loss, model.variables)
    optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())
    
    print("iter %d, train loss %.5f\n" % (i, total_loss))
        
debug = False
acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator())
print("test acc %.5f,  loss %.5f" % (acc, loss))

tf, cross entropy, batch norm
test acc 0.12790,  loss 2.59772
data 12790.14453
iter 0, train loss 3.03516

data 12953.98438
iter 1, train loss 2.31661

data 12442.99316
iter 2, train loss 1.12291

data 12292.56152
iter 3, train loss 0.81643

data 12402.40137
iter 4, train loss 0.87677

data 13600.25098
iter 5, train loss 0.68687

data 13450.91211
iter 6, train loss 0.56323

data 12739.25195
iter 7, train loss 0.48896

data 11608.33594
iter 8, train loss 0.60257

data 14801.60449
iter 9, train loss 0.37499

data 15733.70703
iter 10, train loss 0.46338

data 12621.97754
iter 11, train loss 0.24045

data 12366.39551
iter 12, train loss 0.26338

data 12358.09375
iter 13, train loss 0.28679

data 12827.04590
iter 14, train loss 0.15868

data 14613.22363
iter 15, train loss 0.26080

data 13533.24316
iter 16, train loss 0.23541

data 13346.56543
iter 17, train loss 0.19799

data 12810.09473
iter 18, train loss 0.30681

data 13308.19238
iter 19, train loss 0.15206

data 13648.76465
iter 20, tr

# Dropout

single dropout layer

In [28]:
iterator = dataset.dataset.make_one_shot_iterator()
images, labels = iterator.get_next()

print("data %.5f" % images.numpy().sum())

tf.set_random_seed(0)
drop_layer = tf.layers.Dropout(rate=0.5)

p = drop_layer(images, training=True)
print("sum of output**2\n", (p.cpu().numpy() ** 2).sum())

tf.set_random_seed(0)
p = tf.layers.dropout(images, rate=0.5, training=True)
print("sum of output**2\n", (p.cpu().numpy() ** 2).sum())

data 12790.14453
sum of output**2
 21480.93
sum of output**2
 21480.93


In [17]:
print("tf, cross entropy, batch norm")

config.batch_norm = False
config.dropout_rate = 0.3
np.random.seed(1)
model = MNISTNetwork(config)

lr = 0.01
momentum = 0.9

debug = False
max_iters = 30
acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator())
print("test acc %.5f,  loss %.5f" % (acc, loss))

iterator = dataset.dataset.make_one_shot_iterator()
optimizer = tf.train.MomentumOptimizer(lr, momentum=momentum, use_nesterov=True)

for i in range(max_iters):
    images, labels = iterator.get_next()
    print("data %.5f" % images.numpy().sum())
    with tf.GradientTape() as tp:
        # Build model.
        logits, endpoints = model(images, is_training=True)
        loss_list = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
        xent_loss = tf.reduce_mean(loss_list)
        total_loss = xent_loss
    grads = tp.gradient(total_loss, model.variables)
    optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())
    
    print("iter %d, train loss %.5f\n" % (i, total_loss))
        
debug = False
acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator())
print("test acc %.5f,  loss %.5f" % (acc, loss))

tf, cross entropy, batch norm
test acc 0.12790,  loss 2.59773
data 12790.14453
iter 0, train loss 3.08468

data 12953.98438
iter 1, train loss 3.24825

data 12442.99316
iter 2, train loss 2.94694

data 12292.56152
iter 3, train loss 2.90814

data 12402.40137
iter 4, train loss 2.30756

data 13600.25098
iter 5, train loss 1.95330

data 13450.91211
iter 6, train loss 1.74396

data 12739.25195
iter 7, train loss 1.60965

data 11608.33594
iter 8, train loss 1.65559

data 14801.60449
iter 9, train loss 1.57181

data 15733.70703
iter 10, train loss 1.53753

data 12621.97754
iter 11, train loss 1.34168

data 12366.39551
iter 12, train loss 1.17563

data 12358.09375
iter 13, train loss 0.94621

data 12827.04590
iter 14, train loss 0.92047

data 14613.22363
iter 15, train loss 0.99282

data 13533.24316
iter 16, train loss 0.75663

data 13346.56543
iter 17, train loss 0.72970

data 12810.09473
iter 18, train loss 0.75371

data 13308.19238
iter 19, train loss 0.64677

data 13648.76465
iter 20, tr

In [15]:
print("tf, cross entropy, batch norm")

config.batch_norm = True
config.dropout_rate = 0.7
np.random.seed(1)
model = MNISTNetwork(config)

lr = 0.01
momentum = 0.9

debug = False
max_iters = 30
acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator())
print("test acc %.5f,  loss %.5f" % (acc, loss))

iterator = dataset.dataset.make_one_shot_iterator()
optimizer = tf.train.MomentumOptimizer(lr, momentum=momentum, use_nesterov=True)

for i in range(max_iters):
    images, labels = iterator.get_next()
    print("data %.5f" % images.numpy().sum())
    with tf.GradientTape() as tp:
        # Build model.
        logits, endpoints = model(images, is_training=True)
        loss_list = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=labels)
        xent_loss = tf.reduce_mean(loss_list)
        total_loss = xent_loss
    grads = tp.gradient(total_loss, model.variables)
    optimizer.apply_gradients(zip(grads, model.variables), global_step=tf.train.get_or_create_global_step())
    
    print("iter %d, train loss %.5f\n" % (i, total_loss))
        
debug = False
acc, loss = evaluate_classifier(model, test_dataset.dataset.make_one_shot_iterator())
print("test acc %.5f,  loss %.5f" % (acc, loss))

tf, cross entropy, batch norm
test acc 0.12790,  loss 2.59772
data 12790.14453
iter 0, train loss 3.23462

data 12953.98438
iter 1, train loss 2.21660

data 12442.99316
iter 2, train loss 2.09295

data 12292.56152
iter 3, train loss 1.86393

data 12402.40137
iter 4, train loss 1.70674

data 13600.25098
iter 5, train loss 1.34170

data 13450.91211
iter 6, train loss 1.11725

data 12739.25195
iter 7, train loss 1.02028

data 11608.33594
iter 8, train loss 1.07337

data 14801.60449
iter 9, train loss 0.92933

data 15733.70703
iter 10, train loss 0.88584

data 12621.97754
iter 11, train loss 0.68412

data 12366.39551
iter 12, train loss 0.54143

data 12358.09375
iter 13, train loss 0.47160

data 12827.04590
iter 14, train loss 0.40853

data 14613.22363
iter 15, train loss 0.52695

data 13533.24316
iter 16, train loss 0.33151

data 13346.56543
iter 17, train loss 0.48184

data 12810.09473
iter 18, train loss 0.53470

data 13308.19238
iter 19, train loss 0.36797

data 13648.76465
iter 20, tr

# A bug in TF 0.12.1 and 0.10.1

[issue report](https://github.com/tensorflow/tensorflow/issues/27043)

In Eager mode, batch norm doesn't support Second order derivative

In [7]:
config = ConfigDict()
# enable/disable batch norm
config.batch_norm = True

model = MNISTNetwork(config)

# Build model.

images = np.random.uniform(0, 1, (3, 28, 28, 1))
images = tf.convert_to_tensor(images, dtype=np.float32)
# images = tf.Variable(images)
print("data %.5f" % images.numpy().sum())

with tf.GradientTape(persistent=True) as t:
    with tf.GradientTape(persistent=True) as t2:
        logits = model(images, is_training=True)
        m = tf.reduce_sum(logits)
        print(logits.numpy().sum())
        dp_dx = t2.gradient(m, model.variables)
    print("first", dp_dx[0].numpy().sum())
    d2y_dx2 = t.gradient(dp_dx[0], model.variables)
    print("second order", d2y_dx2[0].numpy().sum())

data 1182.03564
-5.686387
first -66.77962


RuntimeError: tf.gradients is not supported when eager execution is enabled. Use tf.GradientTape instead.