## Dropout & Regularization

In this tutorial, we'll cover two new concepts: dropout and regularization. As before, this tutorial builds on the previous example, so only the new concepts will be explained. Also note that the function being modeled has changed to $X^3+Y^2 = Z$ to increase the difficulty.

In [None]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import math
import Utils as ut
from sklearn.preprocessing import MinMaxScaler

These are the functions that'll be used throughout, just like last time, but the main changes in this tutorial all happen in the make_layer function. First, notice the regularization term. Along with our normal losses, we'll be tracking the magnitude of each layer's weights using the L2 norm (this is also commonly done with the L1 norm). Later we'll incorporate this into our overall loss. Penalizing large weights keeps any single weight from growing too large and unbalancing the layer, which helps prevent overfitting.

We also have dropout a few lines later. Dropout randomly skips (zeroes out) nodes in a layer during training, removing overdependence on any one node and, once again, helping to prevent overfitting. The "keep_prob" parameter ranges from 0 to 1 and is the likelihood that a node will be kept. For example, if keep_prob is .90, each node has a 90% chance of being kept and a 10% chance of being skipped.

In [None]:
def make_layer(act_func, input_val, input_num, output_num, keep_prob):
    """Build one fully connected layer together with its L2 weight penalty.

    Args:
        act_func: Callable applied to the affine output ``input_val @ W + b``.
        input_val: Tensor fed into the layer; it is multiplied by a weight
            matrix of shape ``[input_num, output_num]``.
        input_num: Number of input features (rows of the weight matrix).
        output_num: Number of units in the layer (columns of the weight matrix).
        keep_prob: Value handed to ``tf.nn.dropout`` as the keep probability.

    Returns:
        A ``(layer, regularization)`` tuple: the activated (and possibly
        dropped-out) layer output, and ``tf.nn.l2_loss`` of the weight matrix.
    """
    weight_shape = [input_num, output_num]
    weights = tf.Variable(tf.random_normal(weight_shape, stddev=0.03), dtype=tf.float32)
    biases = tf.Variable(tf.random_normal([output_num], stddev=0.03), dtype=tf.float32)

    pre_activation = tf.matmul(input_val, weights) + biases
    activated = act_func(pre_activation)
    weight_penalty = tf.nn.l2_loss(weights)

    # Dropout is applied only when the layer is square (same number of inputs
    # and outputs); layers that change width are returned untouched.
    if input_num != output_num:
        return activated, weight_penalty
    return tf.nn.dropout(activated, keep_prob), weight_penalty

def linear(x):
    """Identity activation: return *x* unchanged."""
    return x

def make_r(y, pred):
    """Build the R-squared (coefficient of determination) tensor.

    Args:
        y: Target tensor whose second dimension is 1 or 2.
        pred: Prediction tensor, indexed the same way as ``y``.

    Returns:
        ``1 - unexplained_error / total_error``. With two columns, each
        column's sum of squares is computed separately (each around its own
        mean) and the two are added before dividing.

    Raises:
        TypeError: If the second dimension of ``y`` is neither 1 nor 2.
    """
    def sum_sq_diff(left, right):
        # Sum of squared element-wise differences between two tensors.
        return tf.reduce_sum(tf.square(tf.subtract(left, right)))

    n_outputs = y.shape[1]
    if n_outputs == 2:
        first, second = slice(0, 1), slice(1, 2)
        ss_total = (sum_sq_diff(y[:, first], tf.reduce_mean(y[:, first]))
                    + sum_sq_diff(y[:, second], tf.reduce_mean(y[:, second])))
        ss_residual = (sum_sq_diff(y[:, first], pred[:, first])
                       + sum_sq_diff(y[:, second], pred[:, second]))
    elif n_outputs == 1:
        ss_total = sum_sq_diff(y, tf.reduce_mean(y))
        ss_residual = sum_sq_diff(y, pred)
    else:
        raise TypeError("Weird Shape for R Squared")

    return tf.subtract(1.0, tf.divide(ss_residual, ss_total))

# Hyperparameters for the whole run.
# The training loop iterates over range(epochs+1).
epochs = 25
# Step size handed to the Adam optimizer.
learning_rate = 0.001
# Samples per axis; the 4-D meshgrid below yields num_points**4 rows in total.
num_points = 10
# Width of every hidden layer.
num_nodes = 256
# The training batch size is len(data_tr) // num_batch.
num_batch = 30
# Number of num_nodes x num_nodes layers stacked after the first layer.
num_layers = 5
# NOTE: despite its name, this value is fed to the keep_prob placeholder during
# training, i.e. it is the value passed to tf.nn.dropout as keep_prob.
drop_prob = 0.9
# Multiplier applied to the summed L2 weight penalty inside the loss.
beta = 0.0001

# Graph inputs: four feature columns (Re x, Im x, Re y, Im y) and two target
# columns (Re z, Im z); the batch dimension is left open.
input_tf = tf.placeholder(tf.float32, shape=[None, 4])
output_tf = tf.placeholder(tf.float32, shape=[None, 2])
np.random.seed()

# One evenly spaced axis per real/imaginary component.
x_r = np.linspace(0, 20, num_points)
x_i = np.linspace(-10, 0, num_points)
y_r = np.linspace(-30, 10, num_points)
y_i = np.linspace(20, 30, num_points)
# Keep handles on the 1-D real axes before they are replaced by the grids.
x_f, y_f = x_r, y_r
x_r, y_r, x_i, y_i = np.meshgrid(x_r, y_r, x_i, y_i)

# Flatten every grid into a single column so the components line up row by row.
x_r, x_i, y_r, y_i = (grid.reshape(-1, 1) for grid in (x_r, x_i, y_r, y_i))

# Target function z = x**3 + y**2 over the complex numbers.
z = (x_r + x_i * 1j) ** 3 + (y_r + y_i * 1j) ** 2
columns = (x_r, x_i, y_r, y_i, z.real, z.imag)
data = np.hstack(columns)
# Shuffle rows in place so the train/validation split is random.
np.random.shuffle(data)

# 70/30 split of the (already shuffled) rows into training and validation sets.
train_size = int(len(x_r) * 0.7)
data_tr, data_val = data[:train_size], data[train_size:]

# Scale every column to [-1, 1]. The scaler is fitted on the training rows
# only and then applied to both sets.
norm = MinMaxScaler(feature_range=(-1, 1), copy=True)
norm.fit(data_tr)
data_tr, data_val = norm.transform(data_tr), norm.transform(data_val)

Here our layers are created a little differently. We define keep_prob as a placeholder so that we can turn on and off whether nodes are skipped when actually feeding the data in. This allows us to turn off dropout when getting our predictions out.

We also have the regularizer parameter that sums up the L2 norm of the weights of each layer.

In [None]:
# Fed at run time so dropout can be switched off (value 1) outside training.
keep_prob = tf.placeholder(tf.float32)

# First layer: 4 input features -> num_nodes units. It also starts the running
# sum of per-layer L2 penalties.
# NOTE: the name `input` shadows the Python builtin of the same name.
input, regularizer = make_layer(tf.nn.leaky_relu, input_tf, 4, num_nodes, keep_prob)

# Stack of num_layers square hidden layers, each adding its own L2 penalty.
hidden = input
for _ in range(num_layers):
    hidden, layer_penalty = make_layer(tf.nn.leaky_relu, hidden, num_nodes, num_nodes, keep_prob)
    regularizer = regularizer + layer_penalty

# Output layer: num_nodes units -> 2 outputs with an identity activation.
output, layer_penalty = make_layer(linear, hidden, num_nodes, 2, keep_prob)
regularizer = regularizer + layer_penalty

Here we add together loss_real, loss_imag, and our regularizer value multiplied by beta (defined above, to scale the penalty to the size we need), and take the mean of the result to get our overall loss.

In [None]:
# Mean squared error of each output column, tracked separately so they can be
# reported without the regularization term.
loss_real = tf.losses.mean_squared_error(labels=output_tf[:, 0:1], predictions=output[:, 0:1])
loss_imag = tf.losses.mean_squared_error(labels=output_tf[:, 1:2], predictions=output[:, 1:2])

# Training objective: both data losses plus the beta-scaled L2 weight penalty.
weight_penalty = regularizer * beta
loss = tf.reduce_mean(loss_real + loss_imag + weight_penalty)
optimizer = tf.train.AdamOptimizer(learning_rate)
train = optimizer.minimize(loss)

# R^2 of the predictions against the targets.
r = make_r(output_tf, output)

init = tf.global_variables_initializer()

Here our session runs the same as before, only this time with keep_prob as part of the feed_dict. Notice it'll have value drop_prob as defined above when training, but when calculating losses, $R^2$, and our prediction it'll have a value of 1.

In [None]:
# Rows per mini-batch. Clamped to at least 1 so the batch range below never
# gets a zero step when the training set has fewer rows than num_batch.
batch_size = max(1, len(data_tr) // num_batch)
with tf.Session() as sess:
    sess.run(init)
    plot_epoch = []
    plot_loss_tr = []
    plot_loss_val = []
    plot_r_tr = []
    plot_r_val = []
    count = 0

    # Evaluation feeds are identical every epoch, so build them once.
    # keep_prob is 1 here, which turns dropout off for metrics and predictions.
    feed_tr = {input_tf: data_tr[:,0:4], output_tf: data_tr[:,4:6], keep_prob: 1}
    feed_val = {input_tf: data_val[:,0:4], output_tf: data_val[:,4:6], keep_prob: 1}
    metrics = [loss_real, loss_imag, r]

    for i in range(epochs+1):
        # One pass over the training rows in mini-batches, with dropout active.
        for j in range(0, len(data_tr), batch_size):
            batch = data_tr[j:j+batch_size]
            sess.run(train, feed_dict={input_tf: batch[:,0:4], output_tf: batch[:,4:6], keep_prob: drop_prob})

        # Fetch both losses and R^2 in a single run per data set instead of
        # one forward pass per metric.
        real_tr, imag_tr, r_tr = sess.run(metrics, feed_dict=feed_tr)
        real_val, imag_val, r_val = sess.run(metrics, feed_dict=feed_val)

        plot_epoch.append(i)
        # Reported losses exclude the regularization term.
        plot_loss_tr.append(real_tr + imag_tr)
        plot_loss_val.append(real_val + imag_val)
        plot_r_tr.append(r_tr)
        plot_r_val.append(r_val)

        # Stop early once the training loss has changed by less than 1e-5 for
        # two consecutive epochs.
        if len(plot_loss_tr) > 1 and abs(plot_loss_tr[-1]-plot_loss_tr[-2]) < 0.00001:
            count += 1
        else:
            count = 0
        if count >= 2:
            break

        # Raise the modulus to print less often than every epoch.
        if i % 1 == 0:
            print("Epoch: %s \t TLoss: %s \t VLoss: %s" % (i, plot_loss_tr[-1], plot_loss_val[-1]))

    # Final predictions, with dropout disabled.
    pred_tr = output.eval(feed_dict=feed_tr)
    pred_val = output.eval(feed_dict=feed_val)


epoch_span = len(plot_epoch)

# Top-left: loss per epoch for both data sets.
ax1 = plt.subplot(2, 2, 1)
for history, fmt, name in ((plot_loss_tr, 'b-', "Training Set"),
                           (plot_loss_val, 'r-', "Validation Set")):
    ax1.plot(plot_epoch, history, fmt, lw=0.5, label=name)
ax1.axis([0, epoch_span, 0, .1])
ax1.legend(loc="upper right")
ax1.set_title("MSE")

# Top-right: R^2 per epoch (blue = training, red = validation).
ax2 = plt.subplot(2, 2, 2)
for history, fmt in ((plot_r_tr, 'b-'), (plot_r_val, 'r-')):
    ax2.plot(plot_epoch, history, fmt, lw=0.5)
ax2.axis([0, epoch_span, 0.9, 1.0001])
ax2.set_title("R Squared")

# Bottom row: 3-D scatter plots of the predicted real and imaginary parts.
ax3 = plt.subplot(2, 2, 3, projection='3d')
ax3.set_title("Real Part")
ax4 = plt.subplot(2, 2, 4, projection='3d')
ax4.set_title("Imaginary Part")

# Replace the target columns with the predictions, then undo the scaling so
# the plots are in the original units.
# NOTE: this overwrites data_tr / data_val with de-normalised values, so the
# data cells must be re-run before training or plotting again.
data_tr[:, 4:6] = np.array(pred_tr)
data_tr = norm.inverse_transform(data_tr)
data_val[:, 4:6] = np.array(pred_val)
data_val = norm.inverse_transform(data_val)

# Real parts go on ax3 and imaginary parts on ax4; training points are drawn
# first (blue), validation points second (red).
for points, colour in ((data_tr, 'b'), (data_val, 'r')):
    ax3.scatter3D(points[:, 0:1], points[:, 2:3], points[:, 4:5], marker=".", c=colour)
    ax4.scatter3D(points[:, 1:2], points[:, 3:4], points[:, 5:6], marker=".", c=colour)

plt.show()