## Max-Norm Regularization

The idea behind Max-Norm is that it constrains the weights $w$ of the incoming connections of each neuron such that $\lVert w \rVert_2 \le r$, where $r$ is the max-norm hyperparameter.

We can implement this regularizer by computing $\lVert w \rVert_2$ after each training step and *clipping* $w$ if needed: $w \leftarrow w \dfrac{r}{\lVert w \rVert_2}$

Reducing $r$ increases the amount of regularization which, in turn, reduces overfitting.

If we're not using Batch Normalization, Max-Norm can also help alleviate the vanishing/exploding gradients problems.

In [1]:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data

# Layer sizes of the fully connected network built in the following cells.
n_inputs = 784
n_hidden1 = 300
n_hidden2 = 100
n_outputs = 10

# Inputs are fed at run time; the leading dimension is left open so that
# any batch size can be fed.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# `(None)` is just `None` (a fully unknown shape); `(None,)` declares the
# intended 1-D vector of integer class labels, one per example.
y = tf.placeholder(tf.int64, shape=(None,), name="y")

In [2]:
# Two ReLU hidden layers followed by an output layer with no activation,
# whose raw outputs are used as the logits.
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(
        inputs=X,
        units=n_hidden1,
        activation=tf.nn.relu,
        name="hidden1",
    )
    hidden2 = tf.layers.dense(
        inputs=hidden1,
        units=n_hidden2,
        activation=tf.nn.relu,
        name="hidden2",
    )
    logits = tf.layers.dense(inputs=hidden2, units=n_outputs, name="outputs")

In [3]:
# Per-example sparse softmax cross-entropy, averaged into a scalar loss.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits,
        labels=y,
    )
    loss = tf.reduce_mean(input_tensor=xentropy, name="loss")

In [4]:
# Momentum optimizer whose learning rate decays exponentially with the
# number of training steps taken.
with tf.name_scope("train"):
    initial_learning_rate = 0.1
    decay_steps = 10000
    decay_rate = 1 / 10
    # Step counter; excluded from training and incremented by minimize().
    global_step = tf.Variable(0, trainable=False, name="global_step")
    learning_rate = tf.train.exponential_decay(
        learning_rate=initial_learning_rate,
        global_step=global_step,
        decay_steps=decay_steps,
        decay_rate=decay_rate,
    )
    optimizer = tf.train.MomentumOptimizer(
        learning_rate=learning_rate,
        momentum=0.9,
    )
    training_op = optimizer.minimize(loss, global_step=global_step)

In [5]:
# Accuracy: fraction of examples whose label is the top-1 prediction.
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(tf.cast(x=correct, dtype=tf.float32))

In [6]:
# Add one more dense layer on top of hidden2 inside its own variable scope.
with tf.variable_scope("hidden3", reuse=tf.AUTO_REUSE):
    hidden3 = tf.layers.dense(inputs=hidden2, units=100, activation=tf.nn.relu)

# Print the name of every global variable currently in the graph.
for var_name in (var.name for var in tf.global_variables()):
    print(var_name)

# Print only the global variables whose names fall under the "hidden2" scope.
hidden2_variables = tf.get_collection(
    key=tf.GraphKeys.GLOBAL_VARIABLES,
    scope="hidden2",
)
for var_name in (var.name for var in hidden2_variables):
    print(var_name)

hidden1/kernel:0
hidden1/bias:0
hidden2/kernel:0
hidden2/bias:0
outputs/kernel:0
outputs/bias:0
train/global_step:0
hidden1/kernel/Momentum:0
hidden1/bias/Momentum:0
hidden2/kernel/Momentum:0
hidden2/bias/Momentum:0
outputs/kernel/Momentum:0
outputs/bias/Momentum:0
hidden3/dense/kernel:0
hidden3/dense/bias:0
hidden2/kernel:0
hidden2/bias:0
hidden2/kernel/Momentum:0
hidden2/bias/Momentum:0


In [7]:
# Training-loop settings for the first model.
n_epochs, batch_size = 10, 100

# Op that initializes every global variable defined in the graph so far.
init = tf.global_variables_initializer()

# Load the MNIST data sets from /tmp/data/.
mnist = input_data.read_data_sets("/tmp/data/")

Extracting /tmp/data/train-images-idx3-ubyte.gz
Extracting /tmp/data/train-labels-idx1-ubyte.gz
Extracting /tmp/data/t10k-images-idx3-ubyte.gz
Extracting /tmp/data/t10k-labels-idx1-ubyte.gz


In [8]:
# Max-norm threshold: the value passed as `clip_norm` to tf.clip_by_norm
# (the hyperparameter r in the notes at the top).
threshold = 1.0

In [9]:
# Look up the kernel (weight) tensors of the two hidden layers by name.
graph = tf.get_default_graph()
w1 = graph.get_tensor_by_name("hidden1/kernel:0")
w2 = graph.get_tensor_by_name("hidden2/kernel:0")

print(w1, w2, sep="\n")

Tensor("hidden1/kernel:0", shape=(784, 300), dtype=float32_ref)
Tensor("hidden2/kernel:0", shape=(300, 100), dtype=float32_ref)


In [10]:
# Messy approach to implement max-norm:
# for each kernel, build a clipped copy and an assign op that writes the
# clipped values back into the variable.
# NOTE: `w1` / `w2` are rebound from the kernels to the assign ops, so
# evaluating them later is what actually performs the clipping.
# NOTE(review): axes=1 takes the L2 norm along dimension 1 of each kernel
# (printed shapes (784, 300) and (300, 100)); confirm this is the intended
# axis for constraining each unit's incoming weights.
clipped_w1 = tf.clip_by_norm(t=w1, clip_norm=threshold, axes=1)
w1 = tf.assign(ref=w1, value=clipped_w1)
print(clipped_w1, w1, sep="\n")

clipped_w2 = tf.clip_by_norm(t=w2, clip_norm=threshold, axes=1)
w2 = tf.assign(ref=w2, value=clipped_w2)
print(clipped_w2, w2, sep="\n")

Tensor("clip_by_norm:0", shape=(784, 300), dtype=float32)
Tensor("Assign:0", shape=(784, 300), dtype=float32_ref)
Tensor("clip_by_norm_1:0", shape=(300, 100), dtype=float32)
Tensor("Assign_1:0", shape=(300, 100), dtype=float32_ref)


In [11]:
# Train the first model, re-clipping both hidden kernels after every step
# and reporting test accuracy once per epoch.
with tf.Session() as sess:
    sess.run(init)
    n_batches = mnist.train.num_examples // batch_size
    for epoch in range(n_epochs):
        for _ in range(n_batches):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            # w1 and w2 are the assign ops here: running them applies the clipping.
            sess.run([w1, w2])

        test_feed = {X: mnist.test.images, y: mnist.test.labels}
        acc_test = sess.run(accuracy, feed_dict=test_feed)
        print("Epoch:", epoch, "--", "Test Accuracy:", acc_test)

Epoch: 0 -- Test Accuracy: 0.9637
Epoch: 1 -- Test Accuracy: 0.9647
Epoch: 2 -- Test Accuracy: 0.9715
Epoch: 3 -- Test Accuracy: 0.9792
Epoch: 4 -- Test Accuracy: 0.9787
Epoch: 5 -- Test Accuracy: 0.9815
Epoch: 6 -- Test Accuracy: 0.9824
Epoch: 7 -- Test Accuracy: 0.9835
Epoch: 8 -- Test Accuracy: 0.9831
Epoch: 9 -- Test Accuracy: 0.9836


In [12]:
# Start from an empty default graph before building the second model.
tf.reset_default_graph()

# Layer sizes of the second network.
n_inputs = 28 * 28
n_hidden1 = 300
n_hidden2 = 50
n_outputs = 10

# Optimizer settings (a constant learning rate this time).
learning_rate = 0.01
momentum = 0.9

# Inputs are fed at run time; the leading dimension is left open so that
# any batch size can be fed.
X = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# `(None)` is just `None` (a fully unknown shape); `(None,)` declares the
# intended 1-D vector of integer class labels, one per example.
y = tf.placeholder(tf.int64, shape=(None,), name="y")

In [13]:
# Better approach
def max_norm_regularizer(threshold, axes=1, name="max_norm", collection="max_norm"):
    """Build a kernel "regularizer" that registers a max-norm clipping op.

    The returned function does not contribute a loss term. When called with
    a weight variable it creates an op that overwrites the variable with its
    norm-clipped value and stores that op in a graph collection, so the
    training loop can fetch and run it after each training step.

    Args:
        threshold: Maximum norm, passed as `clip_norm` to `tf.clip_by_norm`.
        axes: Dimension(s) over which `tf.clip_by_norm` computes the L2 norm.
            NOTE(review): with a kernel laid out as (n_inputs, n_units) the
            default of 1 takes the norm along each row; confirm this is the
            intended axis for constraining each unit's incoming weights.
        name: Name given to the created assign op.
        collection: Key of the graph collection the assign op is added to.

    Returns:
        A function taking a weight variable and returning None.
    """
    def max_norm(w):
        clipped_w = tf.clip_by_norm(w, clip_norm=threshold, axes=axes)
        # Name the op with `name` (previously accepted but silently ignored),
        # and keep `w` bound to the variable instead of shadowing it.
        clip_op = tf.assign(w, clipped_w, name=name)

        # Register the clipping op so it can be fetched later via
        # tf.get_collection(collection).
        tf.add_to_collection(collection, clip_op)

        # Max-norm regularization adds no term to the cost function, so
        # there is no regularization loss to return.
        return None
    return max_norm

In [14]:
# Build the regularizer used by the dense layers, with clip_norm = threshold.
# NOTE: this rebinds `max_norm_regularizer` from the factory to the closure it
# returns, so the factory is no longer reachable under this name once the
# line has run (re-running it would call the closure, not the factory).
max_norm_regularizer = max_norm_regularizer(threshold)

In [15]:
# Both hidden layers hand their kernels to max_norm_regularizer; the output
# layer is built without it.
with tf.name_scope("dnn"):
    hidden1 = tf.layers.dense(
        inputs=X,
        units=n_hidden1,
        activation=tf.nn.relu,
        kernel_regularizer=max_norm_regularizer,
        name="hidden1",
    )
    hidden2 = tf.layers.dense(
        inputs=hidden1,
        units=n_hidden2,
        activation=tf.nn.relu,
        kernel_regularizer=max_norm_regularizer,
        name="hidden2",
    )
    logits = tf.layers.dense(inputs=hidden2, units=n_outputs, name="outputs")

In [16]:
# Per-example sparse softmax cross-entropy, averaged into a scalar loss.
with tf.name_scope("loss"):
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=logits,
        labels=y,
    )
    loss = tf.reduce_mean(input_tensor=xentropy, name="loss")

# Plain momentum optimizer using the constant learning rate set earlier.
with tf.name_scope("train"):
    optimizer = tf.train.MomentumOptimizer(
        learning_rate=learning_rate,
        momentum=momentum,
    )
    training_op = optimizer.minimize(loss)

# Accuracy: fraction of examples whose label is the top-1 prediction.
with tf.name_scope("eval"):
    correct = tf.nn.in_top_k(predictions=logits, targets=y, k=1)
    accuracy = tf.reduce_mean(tf.cast(x=correct, dtype=tf.float32))

In [17]:
# Training-loop settings for the second model.
n_epochs, batch_size = 10, 50
init = tf.global_variables_initializer()

# Every clipping op that was registered under the "max_norm" collection.
clip_weights = tf.get_collection("max_norm")

# Train, running all clipping ops after each step, and report test accuracy
# once per epoch.
with tf.Session() as sess:
    sess.run(init)
    n_batches = mnist.train.num_examples // batch_size
    for epoch in range(n_epochs):
        for _ in range(n_batches):
            X_batch, y_batch = mnist.train.next_batch(batch_size)
            sess.run(training_op, feed_dict={X: X_batch, y: y_batch})
            sess.run(clip_weights)

        test_feed = {X: mnist.test.images, y: mnist.test.labels}
        acc_test = sess.run(accuracy, feed_dict=test_feed)
        print("Epoch:", epoch, "--", "Test Accuracy:", acc_test)

Epoch: 0 -- Test Accuracy: 0.9523
Epoch: 1 -- Test Accuracy: 0.9599
Epoch: 2 -- Test Accuracy: 0.9697
Epoch: 3 -- Test Accuracy: 0.9747
Epoch: 4 -- Test Accuracy: 0.9753
Epoch: 5 -- Test Accuracy: 0.9785
Epoch: 6 -- Test Accuracy: 0.9773
Epoch: 7 -- Test Accuracy: 0.9783
Epoch: 8 -- Test Accuracy: 0.9784
Epoch: 9 -- Test Accuracy: 0.9807


Default DNN configuration:
- Initialization: He Initialization
- Activation function: ELU
- Normalization: Batch normalization
- Regularization: Dropout
- Optimizer: Adam
- Learning rate schedule: None