# A Super-Simple Neural Network Demo

In [None]:
import math
import tensorflow as tf
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline

### Create some 2-dimensional data

In [None]:
def ground_truth(x, y):
    """Return True where the point (x, y) belongs to the inner class.

    The inner class is the region (1.3*x - .5)^2 + (y - .5)^2 < .05.
    Plain numbers give a single boolean; NumPy arrays are handled
    element-wise and give a boolean array.
    """
    x_offset = 1.3 * x - .5
    y_offset = y - .5
    return x_offset * x_offset + y_offset * y_offset < .05

def createSamples(N, xl, xr, yu, yo, ground_truth, rnd=True):
    """
    Produce labelled sample points in the rectangle [xl, xr] x [yu, yo].

    Parameters:
        N             requested number of samples
        xl, xr        left / right bound of the x coordinates
        yu, yo        lower / upper bound of the y coordinates
        ground_truth  function that calculates the "true" label, given coordinates x and y
        rnd           True:  draw N uniformly distributed points. Note that this reseeds
                             NumPy's global random generator with 1234 on every call,
                             so repeated calls return the same points.
                      False: lay out a regular k x k grid with k = int(sqrt(N)), i.e.
                             k*k <= N points. The grid starts at (xl, yu) and stops one
                             step short of xr and yo. x varies slowest, y fastest.

    Returns the three arrays x, y, c where c is the ground-truth label multiplied by 1.0.
    With rnd=False and N < 1 three empty arrays are returned.
    """
    if rnd:
        np.random.seed(1234)
        x = np.random.uniform(xl, xr, N)
        y = np.random.uniform(yu, yo, N)
    else:
        side = int(math.sqrt(N))
        if side == 0:
            # No grid cell fits: return empty coordinates instead of dividing by zero below.
            x, y = np.empty(0), np.empty(0)
        else:
            dx = (xr - xl) / side
            dy = (yo - yu) / side
            steps = np.arange(side)
            # Same ordering as a nested loop "for xs ... for ys ...":
            # every x value is repeated `side` times while the y values cycle.
            x = np.repeat(xl + dx * steps, side)
            y = np.tile(yu + dy * steps, side)

    c = ground_truth(x, y) * 1.0
    return x, y, c

In [None]:
# Before rerunning, close the previous session. Ignore error the very first time:
# on the first run `sess` is not defined yet, so looking it up raises NameError.
try: 
    sess.close()
except NameError:
    print("Don't worry. Need to ignore this error once")
# Start a fresh interactive session; the sess.run(...) calls in the cells below use it.
sess = tf.InteractiveSession()
# NOTE(review): FLAGS is not referenced in any of the visible cells -- presumably a leftover; confirm before removing.
FLAGS=lambda: None

### The 2-dimensional input data, classes are represented by colors

In [None]:
# Training data: a regular grid over the unit square (rnd=False), labelled by ground_truth.
sx, sy, sc = createSamples(N=10000, xl=0, xr=1, yu=0, yo=1, ground_truth=ground_truth, rnd=False)
points = np.array([sx, sy])          # one row of x values, one row of y values
tr_samples = np.transpose(points)    # one (x, y) row per sample, as fed to the X placeholder

In [None]:
# Square 8x8 inch figure; each sample is a one-point dot coloured by its class via the "bwr" colormap.
matplotlib.rcParams['figure.figsize'] = (8, 8)
plt.scatter(sx, sy, s=1, c=sc, marker='.', cmap="bwr")

### The Neural Network
We'll be creating a simple feed-forward network with two hidden layers.

![NN](images/NN_2x3x3x2_small.png)

Our neural network will be defined as

$ f(x) = \Theta^{(3)} \cdot \sigma(\Theta^{(2)} \cdot \sigma(\Theta^{(1)} \cdot x + b^{(1)} ) + b^{(2)}) + b^{(3)}$

Note that we omit the final non-linearity (the softmax) at this point. That is for purely technical reasons, since the softmax is applied separately further down, and it doesn't change the story.

Below you see the neural network in code, featuring some illustrative initial values.

As you can see, we have 2 input nodes, 3 nodes in each of the two hidden layers, and again 2 nodes in the output layer.

### Weight matrices and biases initialized to some values

In [None]:
# Illustrative starting values. Each weight matrix has one row per node of the layer it
# feeds into and one column per node of the layer it reads from; biases are column vectors.

# 3x2 weight matrix towards the first hidden layer
Theta1 = np.array([
    [1.6, 4],
    [1.6, -1.2],
    [-3.6, 1.6],
])
# bias of the first hidden layer
b1 = np.array([[-1], [1], [6]])

# 3x3 weight matrix towards the second hidden layer
Theta2 = np.array([
    [1, 2, -3],
    [.5, .2, -3],
    [2, 1, -.2],
])
# bias of the second hidden layer
b2 = np.array([[.2], [.1], [-.4]])

# 2x3 weight matrix towards the output layer
Theta3 = np.array([
    [.5, 2, -.03],
    [.2, 1, -.2],
])
# bias of the output layer
b3 = np.array([[.2], [.3]])

In [None]:
#
# NOTE: The weight initializers receive the TRANSPOSE of each Theta matrix.
#       tf.layers.Dense stores its kernel as (inputs, units), whereas the Theta
#       matrices above are written as (units, inputs); without the transpose TF
#       mixes up rows and columns.
#       With the usual random initialization this would not matter.
#
W1_i = tf.constant_initializer(np.transpose(Theta1))
b1_i = tf.constant_initializer(b1)

W2_i = tf.constant_initializer(np.transpose(Theta2))
b2_i = tf.constant_initializer(b2)

W3_i = tf.constant_initializer(np.transpose(Theta3))
b3_i = tf.constant_initializer(b3)

def feed_forward(x):
    """Stack three dense layers on top of `x` and return every layer's output.

    The two hidden layers have 3 units each and a sigmoid activation; the final
    layer has 2 units and no activation, so its output is the raw logits.
    Kernels and biases start from the module-level initializers W1_i/b1_i,
    W2_i/b2_i and W3_i/b3_i.

    Returns the tuple (first hidden output, second hidden output, logits).
    """
    sigmoid = tf.nn.sigmoid
    first_hidden = tf.layers.Dense(
        3, activation=sigmoid, kernel_initializer=W1_i, bias_initializer=b1_i)
    second_hidden = tf.layers.Dense(
        3, activation=sigmoid, kernel_initializer=W2_i, bias_initializer=b2_i)
    readout = tf.layers.Dense(
        2, kernel_initializer=W3_i, bias_initializer=b3_i)

    act1 = first_hidden(x)
    act2 = second_hidden(act1)
    logits = readout(act2)
    return act1, act2, logits

### Placeholder for the different data sets

In [None]:
# Input coordinates: any number of rows, two float64 columns.
X = tf.placeholder(tf.float64, shape=(None, 2))
# Class labels: one int64 per row of X.
L = tf.placeholder(tf.int64, shape=(None,))

### Construct the computational graph for the optimization

In [None]:
# Labels as one-hot vectors, matching the two output nodes.
classes = tf.one_hot(L, depth=2)
# Network outputs; `output` holds the raw logits, the hidden activations are kept for plotting.
hidden1, hidden2, output = feed_forward(X)
# Class probabilities: the softmax that the network itself leaves out.
probs = tf.nn.softmax(output)
# Loss: cross-entropy between logits and one-hot labels, averaged over all fed samples.
objective = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits_v2(labels=classes, logits=output)
)

In [None]:
# One Adam step on the cross-entropy objective per evaluation of `train`.
optimizer = tf.train.AdamOptimizer(learning_rate=0.01)
train = optimizer.minimize(objective)
# Predicted class = index of the larger of the two probabilities.
preds = tf.argmax(probs, axis=1)
# Fraction of fed samples whose prediction equals the label.
accuracy = tf.reduce_mean(
    tf.cast(tf.equal(preds, L), dtype=tf.float64)
)

### Setting up the training

In [None]:
# Give all variables their initial values (the constant initializers defined above).
init = tf.global_variables_initializer()
sess.run(init)

# Histories filled by single_batch: one entry per call.
losses, accies = [], []
# Number of optimisation steps performed by one single_batch call.
n_batch = 50

def single_batch(n_batch):
    """Run `n_batch` consecutive training steps on the complete training set.

    Every step feeds all of `tr_samples` together with the labels `sc`.
    After the last step, the loss and accuracy fetched in that step are printed
    and appended to the module-level lists `losses` and `accies`.

    Returns the network output (logits) fetched in the last step.
    Raises ValueError if `n_batch` is smaller than 1, because then no step
    runs and there is nothing to report.
    """
    if n_batch < 1:
        raise ValueError("n_batch must be at least 1, got %s" % (n_batch,))

    # The fed data does not change between steps, so build it once.
    feed = {X: tr_samples, L: sc.astype(int)}
    fetches = [train, output, objective, accuracy]
    for _ in range(n_batch):
        _, _all_output, _objective, _accuracy = sess.run(fetches, feed_dict=feed)

    print("Loss: %s - Accuracy: %s" % (_objective, _accuracy))
    losses.append(_objective)
    accies.append(_accuracy)
    return _all_output

### First, let's look at the hidden layers before the training

In [None]:
# Evaluate predictions and both hidden layers on the training samples (no training step here).
y, h1, h2 = sess.run(
    [preds, hidden1, hidden2],
    feed_dict={X: tr_samples, L: sc.astype(int)},
)
# Transpose so that each row holds one hidden unit's activations, ready for scatter(*h).
h1, h2 = h1.T, h2.T

### The first hidden layer before the training

In [None]:
matplotlib.rcParams['figure.figsize'] = (12, 12)
# Scatter on the 3-D axes object itself: plt.scatter(x, y, s, ...) would treat the third
# row of h1 as the marker size and draw every point flat at z=0.
ax = plt.axes(projection='3d', elev=30, azim=110)
ax.scatter(*h1, c=sc, cmap='bwr', marker='.')

### The second hidden layer before the training

In [None]:
# Scatter on the 3-D axes object itself: plt.scatter(x, y, s, ...) would treat the third
# row of h2 as the marker size and draw every point flat at z=0.
ax = plt.axes(projection='3d', elev=40, azim=240)
ax.scatter(*h2, c=sc, cmap='bwr', marker='.')

### Do 25 Training Rounds of 50 Steps Each

In [None]:
# 25 rounds of n_batch training steps each; single_batch prints one progress line per round.
# all_output keeps the network output returned by the most recent round.
for _ in range(25):
    all_output = single_batch(n_batch)

### Learning to Linearly Separate
The network learned to tweak the manifold of the last hidden layer such that a hyperplane can separate red from blue points

In [None]:
# Re-evaluate predictions and both hidden layers on the training samples, now with the trained weights.
y, h1, h2 = sess.run(
    [preds, hidden1, hidden2],
    feed_dict={X: tr_samples, L: sc.astype(int)},
)
# Transpose so that each row holds one hidden unit's activations, ready for scatter(*h).
h1, h2 = h1.T, h2.T

### The First Hidden Layer After Training

In [None]:
# Scatter on the 3-D axes object itself: plt.scatter(x, y, s, ...) would treat the third
# row of h1 as the marker size and draw every point flat at z=0.
ax = plt.axes(projection='3d', elev=20, azim=70)
ax.scatter(*h1, c=sc, cmap='bwr', marker='.')

### The Second Hidden Layer After the Training

In [None]:
# Scatter on the 3-D axes object itself: plt.scatter(x, y, s, ...) would treat the third
# row of h2 as the marker size and draw every point flat at z=0.
ax = plt.axes(projection='3d', elev=40, azim=240)
ax.scatter(*h2, c=sc, cmap='bwr', marker='.')

### After some period of stagnation, the network learned to predict the colors from the coordinates

In [None]:
# Training history collected by single_batch, one point per call:
# the accuracy curve is drawn first, the loss curve second.
plt.plot(accies)
plt.plot(losses)

### Inferring the classes (colors) of some given test coordinates

In [None]:
# Draw a small random test set (rnd defaults to True) from the same unit square.
# NOTE: this rebinds sx, sy, sc and points, which held the training grid until now.
#       The training and hidden-layer cells above read sc, so re-create the training
#       data before re-running them.
N = 20
sx, sy, sc = createSamples(N, 0, 1, 0, 1, ground_truth)
points = np.array([sx, sy])
plt.scatter(sx, sy, c=sc, marker='.', cmap="bwr")

test_samples = np.transpose(points)   # one (x, y) row per test point
test_labels = sc.astype(int)

### The network correctly infers (most of) the classes of the given test coordinates

In [None]:
# Predicted class for every test point, using the current weights.
test_infered = sess.run(
    preds,
    feed_dict={X: test_samples, L: test_labels},
)

# Print labels and predictions underneath each other for a quick visual comparison.
print("true classes   : %s" % test_labels)
print('infered classes: %s' % test_infered)