# Neural Networks

In this tutorial we learn how the most basic neural networks work.

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd

# Training a neural network

## Generate data
Create toy dataset, define class for weights intialization etc.

In [18]:
# set the true weights, use them for data generation later:
w_1 = np.array(np.transpose([[2, 1], [1, 0], [1, -1]]))
b_1 = np.array(np.expand_dims([1, 0, 2], axis=0))
w_2 = np.array([[1,], [1,], [-2,]])
b_2 = 1

w_1.shape, b_1.shape, w_2.shape

# Print the ouput nicely
print("WEIGHTS OF THE MODEL \n")
print("Hidden layer weights: \n{} \n".format(w_1))
print("Hidden layer bias: \n{} \n".format(b_1))
print("Output layer weights: \n{} \n".format(w_2))
print("Hidden layer bias: \n{} \n".format(b_2))

WEIGHTS OF THE MODEL 

Hidden layer weights: 
[[ 2  1  1]
 [ 1  0 -1]] 

Hidden layer bias: 
[[1 0 2]] 

Output layer weights: 
[[ 1]
 [ 1]
 [-2]] 

Hidden layer bias: 
1 



In [19]:
# define class for initialization of weights - fix seed to use the same numbers always
class MyInit(tf.keras.initializers.Initializer):

  def __init__(self, mean, std):
    self.mean = mean
    self.std = std

  def __call__(self, shape, dtype=None, **kwargs):
    tf.random.set_seed(11)
    return tf.cast(tf.cast(tf.random.normal(
        shape, mean=self.mean, stddev=self.std, dtype=dtype), tf.int32), tf.float32)

  def get_config(self):  # To support serialization
    return {"mean": self.mean, "std": self.std}

In [20]:
# define forward propagation
def forward_propagation(x, w_1, b_1, w_2, b_2):
    h = np.matmul(x, w_1) + b_1
    h_relu = np.where(h < 0, 0, h)
    y = np.matmul(h_relu, w_2) + b_2
    
    return {'hidden': h, 'hidden_relu': h_relu, 'prediction': y}

In [21]:
# generate training data using the true weights
np.random.seed(seed=73)
x_1 = [-4] + [int(x) for x in np.random.uniform(-2, 4, 100)]
x_2 = [1] + [int(x) for x in np.random.uniform(-2, 4, 100)]
x = np.transpose(np.array([x_1, x_2]))

y = forward_propagation(x, w_1, b_1, w_2, b_2)['prediction']

In [22]:
# convert training data to tf.data.Dataset with batch size
batch_size = 1
train_dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size)

## Initialize model

In [23]:
# create a simple NN with the pre-defined intialization:
inputs = tf.keras.Input(shape=(2,), name="input_values")

x1 = tf.keras.layers.Dense(3,
                           activation="relu",
                           name="hidden",
                           kernel_initializer=MyInit(1, 1),
                           bias_initializer=MyInit(2, 1))(inputs)

outputs = tf.keras.layers.Dense(1,
                                name="predictions",
                                kernel_initializer=MyInit(1, 3),
                                bias_initializer=MyInit(2, 0))(x1)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# set the loss and the optimizer
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
loss_fn = tf.keras.losses.MSE

In [24]:
# Look at the initialized weights
w1 = model.layers[1].weights[0].numpy()
b1 = model.layers[1].weights[1].numpy()

w2 = model.layers[2].weights[0].numpy()
b2 = model.layers[2].weights[1].numpy()

# Print the ouput nicely
print("WEIGHTS OF THE MODEL \n")
print("Hidden layer weights: \n{} \n".format(w1))
print("Hidden layer bias: \n{} \n".format(b1))
print("Output layer weights: \n{} \n".format(w2))
print("Hidden layer bias: \n{} \n".format(b2))

WEIGHTS OF THE MODEL 

Hidden layer weights: 
[[0. 1. 0.]
 [2. 1. 1.]] 

Hidden layer bias: 
[0. 2. 1.] 

Output layer weights: 
[[-3.]
 [ 3.]
 [ 0.]] 

Hidden layer bias: 
[2.] 



In [25]:
# check the forward propagation
fp = forward_propagation(x[0:1], w1, b1, w2, b2)

# Print the ouput nicely
print("PROPAGATED VALUES \n")
print("Input vector \n{} \n".format(x[0]))
print("Hidden layer before activation \n{} \n".format(fp["hidden"]))
print("Hidden layer after relu \n{} \n".format(fp["hidden_relu"]))
print("Prediction of output layer \n{} \n".format(fp["prediction"]))

PROPAGATED VALUES 

Input vector 
[-4  1] 

Hidden layer before activation 
[[ 2. -1.  2.]] 

Hidden layer after relu 
[[2. 0. 2.]] 

Prediction of output layer 
[[-4.]] 



In [26]:
# take the first batch
for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
    break

x_batch_train.numpy(), y_batch_train.numpy()

(array([[-4,  1]]), array([[1]]))

In [27]:
# Calculate the gradients on the batch:
with tf.GradientTape() as tape:
    prediction = model(x_batch_train, training=True)
    loss_value = loss_fn(y_batch_train, prediction)


grads = tape.gradient(loss_value, model.trainable_weights)

# Print the ouput nicely
print("Prediction:{}, Target:{}, Loss:{}".format(prediction, y[0], loss_value))
print("\n>>> GRADIENTS:")
print("Hidden layer weights \n{} \n".format(grads[0].numpy()))
print("Hidden layer bias \n{} \n".format(grads[1].numpy()))
print("Output layer weights \n{} \n".format(grads[2].numpy()))
print("Output layer bias \n{} \n".format(grads[3].numpy()))

Prediction:[[-4.]], Target:[1], Loss:[25.]

>>> GRADIENTS:
Hidden layer weights 
[[-120.    0.    0.]
 [  30.   -0.   -0.]] 

Hidden layer bias 
[30. -0. -0.] 

Output layer weights 
[[-20.]
 [ -0.]
 [-20.]] 

Output layer bias 
[-10.] 



In [28]:
# apply one SGD update
optimizer.apply_gradients(zip(grads, model.trainable_weights))

<tf.Variable 'UnreadVariable' shape=() dtype=int64, numpy=1>

In [17]:
# check the weights after the update:
w1 = model.layers[1].weights[0].numpy()
b1 = model.layers[1].weights[1].numpy()

w2 = model.layers[2].weights[0].numpy()
b2 = model.layers[2].weights[1].numpy()

# Print the ouput nicely
print("UPDATED WEIGHTS OF THE MODEL \n")
print("Hidden layer weights: \n{} \n".format(w1))
print("Hidden layer bias: \n{} \n".format(b1))
print("Output layer weights: \n{} \n".format(w2))
print("Hidden layer bias: \n{} \n".format(b2))


UPDATED WEIGHTS OF THE MODEL 

Hidden layer weights: 
[[1.1999999 1.        0.       ]
 [1.7       1.        1.       ]] 

Hidden layer bias: 
[-0.29999998  2.          1.        ] 

Output layer weights: 
[[-2.8       ]
 [ 3.        ]
 [ 0.19999999]] 

Hidden layer bias: 
[2.1] 



In [29]:
# check the prediction
prediction = model(x_batch_train, training=True)
loss_value = loss_fn(y_batch_train, prediction)

print("Prediction:{}, Target:{}, Loss:{}".format(prediction, y[0], loss_value))

Prediction:[[2.5]], Target:[1], Loss:[2.25]


### Run the training loop

In [30]:
epochs = 16
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        with tf.GradientTape() as tape:

            pred = model(x_batch_train, training=True)
            loss_value = loss_fn(y_batch_train, pred)

        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        
        mse = np.mean((model(x).numpy() - y)**2)
        
    print('>>> Loss {:.3f}'.format(mse))


Start of epoch 0
>>> Loss 0.720

Start of epoch 1
>>> Loss 0.361

Start of epoch 2
>>> Loss 0.311

Start of epoch 3
>>> Loss 0.272

Start of epoch 4
>>> Loss 0.225

Start of epoch 5
>>> Loss 0.213

Start of epoch 6
>>> Loss 0.156

Start of epoch 7
>>> Loss 0.113

Start of epoch 8
>>> Loss 0.086

Start of epoch 9
>>> Loss 0.045

Start of epoch 10
>>> Loss 0.035

Start of epoch 11
>>> Loss 0.036

Start of epoch 12
>>> Loss 0.047

Start of epoch 13
>>> Loss 0.049

Start of epoch 14
>>> Loss 0.049

Start of epoch 15
>>> Loss 0.017


In [31]:
w1 = model.layers[1].weights[0].numpy()
b1 = model.layers[1].weights[1].numpy()

w2 = model.layers[2].weights[0].numpy()
b2 = model.layers[2].weights[1].numpy()

print("Hidden layer weights: \n{} \n".format(w1))
print("Hidden layer bias: \n{} \n".format(b1))
print("Output layer weights: \n{} \n".format(w2))
print("Hidden layer bias: \n{} \n".format(b2))

Hidden layer weights: 
[[ 0.8380496   1.8128631  -0.25347403]
 [-0.95671076  0.66604346  0.6694228 ]] 

Hidden layer bias: 
[1.8954308  0.5246789  0.66958815] 

Output layer weights: 
[[-1.9003438 ]
 [ 1.4643725 ]
 [ 0.35568428]] 

Hidden layer bias: 
[0.5498195] 



In [32]:
np.mean((model(x).numpy() - y)**2)

0.01748505669685138

### Playing with optimizer and learning rates

http://2.bp.blogspot.com/-q6l20Vs4P_w/VPmIC7sEhnI/AAAAAAAACC4/g3UOUX2r_yA/s400/

In [33]:
batch_size = 10
train_dataset = tf.data.Dataset.from_tensor_slices((x, y)).batch(batch_size)

In [34]:
inputs = tf.keras.Input(shape=(2,), name="input_values")

x1 = tf.keras.layers.Dense(3,
                           activation="relu",
                           name="hidden",
                           kernel_initializer=MyInit(1, 1),
                           bias_initializer=MyInit(2, 1))(inputs)

outputs = tf.keras.layers.Dense(1,
                                name="predictions",
                                kernel_initializer=MyInit(1, 3),
                                bias_initializer=MyInit(2, 0))(x1)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
loss_fn = tf.keras.losses.MSE

In [35]:
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
optimizer = tf.keras.optimizers.RMSprop(learning_rate=0.001)
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)

epochs = 128
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        with tf.GradientTape() as tape:

            pred = model(x_batch_train, training=True)
            loss_value = loss_fn(y_batch_train, pred)

        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        
        mse = np.mean((model(x).numpy() - y)**2)
        
    print('>>> Loss {:.4f}'.format(mse))


Start of epoch 0
>>> Loss 10.0086

Start of epoch 1
>>> Loss 6.3836

Start of epoch 2
>>> Loss 4.1899

Start of epoch 3
>>> Loss 2.4007

Start of epoch 4
>>> Loss 1.8415

Start of epoch 5
>>> Loss 1.3954

Start of epoch 6
>>> Loss 1.1239

Start of epoch 7
>>> Loss 0.9543

Start of epoch 8
>>> Loss 0.8465

Start of epoch 9
>>> Loss 0.7631

Start of epoch 10
>>> Loss 0.6984

Start of epoch 11
>>> Loss 0.6488

Start of epoch 12
>>> Loss 0.6150

Start of epoch 13
>>> Loss 0.5858

Start of epoch 14
>>> Loss 0.5604

Start of epoch 15
>>> Loss 0.5381

Start of epoch 16
>>> Loss 0.5183

Start of epoch 17
>>> Loss 0.5006

Start of epoch 18
>>> Loss 0.4847

Start of epoch 19
>>> Loss 0.4703

Start of epoch 20
>>> Loss 0.4572

Start of epoch 21
>>> Loss 0.4452

Start of epoch 22
>>> Loss 0.4343

Start of epoch 23
>>> Loss 0.4242

Start of epoch 24
>>> Loss 0.4150

Start of epoch 25
>>> Loss 0.4065

Start of epoch 26
>>> Loss 0.3987

Start of epoch 27
>>> Loss 0.3915

Start of epoch 28
>>> Loss 0

In [36]:
w1 = model.layers[1].weights[0].numpy()
b1 = model.layers[1].weights[1].numpy()

w2 = model.layers[2].weights[0].numpy()
b2 = model.layers[2].weights[1].numpy()

print("Hidden layer weights: \n{} \n".format(w1))
print("Hidden layer bias: \n{} \n".format(b1))
print("Output layer weights: \n{} \n".format(w2))
print("Hidden layer bias: \n{} \n".format(b2))

Hidden layer weights: 
[[ 0.68046284  1.3424093   0.07904674]
 [-0.70974535  0.487307    0.7084906 ]] 

Hidden layer bias: 
[1.459933   0.46041295 1.1783967 ] 

Output layer weights: 
[[-2.6617184 ]
 [ 2.0708191 ]
 [ 0.14849895]] 

Hidden layer bias: 
[0.79769206] 



<span style="color:red">**TO DO:** Try different optimizers with different lerning rate, how does the convergence look? Does it converge? How fast? Which one is the best?</span>

## Vanishing gradient

In [37]:
# create a simple NN with the pre-defined intialization:
init = tf.keras.initializers.RandomUniform(minval=0, maxval=1)

inputs = tf.keras.Input(shape=2, name="input_values")

h = tf.keras.layers.Dense(3, activation="tanh")(inputs)
h = tf.keras.layers.Dense(3, activation="tanh", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(3, activation="tanh", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(3, activation="tanh", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(3, activation="tanh", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(3, activation="tanh", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(3, activation="tanh", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(3, activation="tanh", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(3, activation="tanh", kernel_initializer=init)(h)
outputs = tf.keras.layers.Dense(1)(h)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

# set the loss and the optimizer
optimizer = tf.keras.optimizers.Adam()
loss_fn = tf.keras.losses.MSE

model.summary()

Model: "model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_values (InputLayer)   [(None, 2)]               0         
                                                                 
 dense (Dense)               (None, 3)                 9         
                                                                 
 dense_1 (Dense)             (None, 3)                 12        
                                                                 
 dense_2 (Dense)             (None, 3)                 12        
                                                                 
 dense_3 (Dense)             (None, 3)                 12        
                                                                 
 dense_4 (Dense)             (None, 3)                 12        
                                                                 
 dense_5 (Dense)             (None, 3)                 12  

In [38]:
# take the first batch
for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
    break

x_batch_train.numpy(), y_batch_train.numpy()

(array([[-4,  1],
        [ 1,  0],
        [ 1, -1],
        [ 1,  2],
        [ 1,  0],
        [ 0, -1],
        [ 0,  0],
        [ 0,  0],
        [ 1,  0],
        [ 2,  1]]),
 array([[ 1],
        [-1],
        [-4],
        [ 5],
        [-1],
        [-5],
        [-2],
        [-2],
        [-1],
        [ 3]]))

In [39]:
# Calculate the gradients on the batch:
with tf.GradientTape() as tape:
    prediction = model(x_batch_train, training=True)
    loss_value = loss_fn(y_batch_train, prediction)

grads = tape.gradient(loss_value, model.trainable_weights)

In [40]:
# Print the ouput nicely
print("Prediction:{}, Target:{}, Loss:{}".format(prediction, y[0], loss_value))
print("\n>>> GRADIENTS:")

for i in range(0, 18, 2):
    l = int(i / 2 + 1)
    print('-' * 100)
    print("# {} layer weights \n{} \n".format(l, grads[i].numpy()))
    print("# {} layer bias \n{} \n".format(l, grads[i+1].numpy()))

Prediction:[[-0.47241157]
 [ 0.47160947]
 [ 0.45250666]
 [ 0.47169375]
 [ 0.47160947]
 [-0.47030455]
 [ 0.        ]
 [ 0.        ]
 [ 0.47160947]
 [ 0.4719543 ]], Target:[1], Loss:[ 2.167996   2.1656344 19.824814  20.505556   2.1656344 20.51814
  4.         4.         2.1656344  6.391015 ]

>>> GRADIENTS:
----------------------------------------------------------------------------------------------------
# 1 layer weights 
[[-0.59448403 -0.93147856 -0.8353819 ]
 [ 0.6401543   0.94288766  0.86660784]] 

# 1 layer bias 
[-22.293747 -31.642372 -30.468355] 

----------------------------------------------------------------------------------------------------
# 2 layer weights 
[[ 0.18893619  0.1239714   0.21618688]
 [-0.07571094 -0.03798299 -0.07319543]
 [ 0.10330427  0.08833829  0.14312653]] 

# 2 layer bias 
[-19.435312 -12.74421  -23.430132] 

----------------------------------------------------------------------------------------------------
# 3 layer weights 
[[0.12783748 0.04943584 0.

In [41]:
epochs = 100
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))

    for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
        with tf.GradientTape() as tape:

            pred = model(x_batch_train, training=True)
            loss_value = loss_fn(y_batch_train, pred)

        grads = tape.gradient(loss_value, model.trainable_weights)
        optimizer.apply_gradients(zip(grads, model.trainable_weights))
        
        mse = np.mean((model(x).numpy() - y)**2)
        
    print('>>> Loss {:.3f}'.format(mse))


Start of epoch 0
>>> Loss 12.566

Start of epoch 1
>>> Loss 12.381

Start of epoch 2
>>> Loss 12.107

Start of epoch 3
>>> Loss 11.637

Start of epoch 4
>>> Loss 11.428

Start of epoch 5
>>> Loss 11.274

Start of epoch 6
>>> Loss 11.134

Start of epoch 7
>>> Loss 11.004

Start of epoch 8
>>> Loss 10.879

Start of epoch 9
>>> Loss 10.761

Start of epoch 10
>>> Loss 10.648

Start of epoch 11
>>> Loss 10.540

Start of epoch 12
>>> Loss 10.436

Start of epoch 13
>>> Loss 10.336

Start of epoch 14
>>> Loss 10.240

Start of epoch 15
>>> Loss 10.148

Start of epoch 16
>>> Loss 10.059

Start of epoch 17
>>> Loss 9.973

Start of epoch 18
>>> Loss 9.891

Start of epoch 19
>>> Loss 9.811

Start of epoch 20
>>> Loss 9.735

Start of epoch 21
>>> Loss 9.661

Start of epoch 22
>>> Loss 9.590

Start of epoch 23
>>> Loss 9.521

Start of epoch 24
>>> Loss 9.455

Start of epoch 25
>>> Loss 9.392

Start of epoch 26
>>> Loss 9.331

Start of epoch 27
>>> Loss 9.272

Start of epoch 28
>>> Loss 9.215

Start 

In [42]:
# Check the gradients after some time of training
print("Prediction:{}, Target:{}, Loss:{}".format(prediction, y[0], loss_value))
print("\n>>> GRADIENTS:")

for i in range(0, 18, 2):
    l = int(i / 2 + 1)
    print('-' * 100)
    print("# {} layer weights \n{} \n".format(l, grads[i].numpy()))
    print("# {} layer bias \n{} \n".format(l, grads[i+1].numpy()))

Prediction:[[-0.47241157]
 [ 0.47160947]
 [ 0.45250666]
 [ 0.47169375]
 [ 0.47160947]
 [-0.47030455]
 [ 0.        ]
 [ 0.        ]
 [ 0.47160947]
 [ 0.4719543 ]], Target:[1], Loss:[1.9294467]

>>> GRADIENTS:
----------------------------------------------------------------------------------------------------
# 1 layer weights 
[[-1.8176455e-04 -8.3771029e-06 -1.2794870e-05]
 [-0.0000000e+00 -0.0000000e+00 -0.0000000e+00]] 

# 1 layer bias 
[-6.0588183e-05 -2.7923675e-06 -4.2649567e-06] 

----------------------------------------------------------------------------------------------------
# 2 layer weights 
[[-6.5348046e-05 -3.0091678e-05 -5.0795934e-05]
 [ 1.1727618e-04  5.4003714e-05  9.1160386e-05]
 [ 1.1654372e-04  5.3666427e-05  9.0591035e-05]] 

# 2 layer bias 
[-1.1837532e-04 -5.4509848e-05 -9.2014765e-05] 

----------------------------------------------------------------------------------------------------
# 3 layer weights 
[[9.0691057e-05 4.8606715e-05 4.2017586e-05]
 [1.4713530

## Two circles example
see https://machinelearningmastery.com/how-to-fix-vanishing-gradients-using-the-rectified-linear-activation-function/

In [43]:
from tensorflow import keras
from sklearn.datasets import make_circles
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.initializers import RandomUniform
from matplotlib import pyplot

In [None]:
# generate 2d classification dataset
X, y = make_circles(n_samples=1000, noise=0.1, random_state=1)
scaler = MinMaxScaler(feature_range=(-1, 1))
X = scaler.fit_transform(X)
# split into train and test
n_train = 500
trainX, testX = X[:n_train, :], X[n_train:, :]
trainy, testy = y[:n_train], y[n_train:]

In [None]:
for i in range(2):
    samples_ix = np.where(y == i)
    pyplot.scatter(X[samples_ix, 0], X[samples_ix, 1], label=str(i))
pyplot.legend()
pyplot.show()

In [None]:
# Define model run for various model specifications
def run(model, iterations):
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
    loss_fn = tf.keras.losses.BinaryCrossentropy()

    model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

    acc_train = []
    acc_test = []
    for r in range(iterations):
        print("After {} episodes:".format((r + 1)* 10))
        history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=10, batch_size=100, verbose=0)
        acc_train.extend(history.history['accuracy'])
        acc_test.extend(history.history['val_accuracy'])

        prediction = model.predict(trainX)
        prediction = np.where(model.predict(trainX) > 0.5, 1, 0)

        for i in range(2):
            samples_ix = np.where(prediction == i)
            pyplot.scatter(X[samples_ix, 0], X[samples_ix, 1], label=str(i))
        pyplot.legend()
        pyplot.show()

    pyplot.plot(acc_train, label='train')
    pyplot.plot(acc_test, label='test')
    pyplot.legend()
    pyplot.show()
    
    print('Evaluation')
    model.evaluate(testX, testy)

### Models with ReLU

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.GlorotNormal()

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(2, activation="relu", kernel_initializer=init)(inputs)
outputs = tf.keras.layers.Dense(1)(h)
outputs = tf.keras.activations.sigmoid(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model, 20)

In [None]:
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
loss_fn = tf.keras.losses.BinaryCrossentropy()

model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

acc_train = []
acc_test = []

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.GlorotNormal()

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(10, activation="relu", kernel_initializer=init)(inputs)
outputs = tf.keras.layers.Dense(1)(h)
outputs = tf.keras.activations.sigmoid(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model, 20)

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.RandomUniform(minval=0, maxval=1)

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(inputs)
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(h)
outputs = tf.keras.layers.Dense(1)(h)
outputs = tf.keras.activations.sigmoid(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model, 20)

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.GlorotNormal()

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(inputs)
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(h)
outputs = tf.keras.layers.Dense(1)(h)
outputs = tf.keras.activations.sigmoid(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model, 70)

### Models with tanh

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.GlorotNormal()

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(2, activation="tanh", kernel_initializer=init)(inputs)
outputs = tf.keras.layers.Dense(1)(h)
outputs = tf.keras.activations.sigmoid(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model, 20)

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.GlorotNormal()

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(10, activation="tanh", kernel_initializer=init)(inputs)
outputs = tf.keras.layers.Dense(1)(h)
outputs = tf.keras.activations.sigmoid(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model, 20)

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.GlorotNormal()

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(5, activation="tanh", kernel_initializer=init)(inputs)
h = tf.keras.layers.Dense(5, activation="tanh", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation="tanh", kernel_initializer=init)(h)
outputs = tf.keras.layers.Dense(1)(h)
outputs = tf.keras.activations.sigmoid(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model, 20)

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.RandomUniform(minval=0, maxval=1)

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(5, activation="tanh", kernel_initializer=init)(inputs)
h = tf.keras.layers.Dense(5, activation="tanh", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation="tanh", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation="tanh", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation="tanh", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation="tanh", kernel_initializer=init)(h)
outputs = tf.keras.layers.Dense(1)(h)
outputs = tf.keras.activations.sigmoid(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model, 20)

In [None]:
model.layers

In [None]:
# Check the gradients
with tf.GradientTape() as tape:
    prediction = model(trainX, training=True)
    loss_value = loss_fn(testy, prediction)

grads = tape.gradient(loss_value, model.trainable_weights)

print("\n>>> GRADIENTS:")

for i in range(len(model.layers)):
    l = int((i + 1) / 2)
    print('-' * 100)
    print("# {} layer weights \n{} \n".format(l, grads[i].numpy()))
    print("# {} layer bias \n{} \n".format(l, grads[i+1].numpy()))

### Models without activations

In [None]:
init = tf.keras.initializers.RandomUniform(minval=0, maxval=1)
init = tf.keras.initializers.GlorotNormal()

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(5, activation=None, kernel_initializer=init)(inputs)
h = tf.keras.layers.Dense(5, activation=None, kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation=None, kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation=None, kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation=None, kernel_initializer=init)(h)
outputs = tf.keras.layers.Dense(1)(h)
outputs = tf.keras.activations.sigmoid(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model, 20)

<span style="color:red">**TO DO:** Similar to the accuracy, store and plot also the train and test loss. Inspect the loss evolution.</span>

### Multiclass classification

In [None]:
from sklearn.datasets import make_classification

In [None]:
# generate 2d classification dataset
X, y = make_classification(n_samples=1000, n_features=2, n_informative=2, 
                           n_redundant=0, n_classes=4, random_state=1, n_clusters_per_class=1,
                           class_sep=2)
scaler = MinMaxScaler(feature_range=(-1, 1))
X = scaler.fit_transform(X)
# split into train and test
n_train = 500

trainX, testX = X[:n_train, :], X[n_train:, :]
trainy, testy = y[:n_train], y[n_train:]

for i in range(4):
    samples_ix = np.where(y == i)
    pyplot.scatter(X[samples_ix, 0], X[samples_ix, 1], label=str(i))
pyplot.legend()
pyplot.show()

#### ReLU models

In [None]:
# Define model run for various model specifications
def run(model):
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

    model.compile(loss=loss_fn, optimizer=optimizer, metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

    acc_train = []
    acc_test = []
    for r in range(20):
        print("After {} episodes:".format((r + 1)* 10))
        history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=10, verbose=0)
        acc_train.extend(history.history['sparse_categorical_accuracy'])
        acc_test.extend(history.history['val_sparse_categorical_accuracy'])

        prediction = model.predict(trainX)
        prediction = np.argmax(model.predict(trainX), axis=1)

        for i in range(4):
            samples_ix = np.where(prediction == i)
            pyplot.scatter(X[samples_ix, 0], X[samples_ix, 1], label=str(i))
        pyplot.legend()
        pyplot.show()

    pyplot.plot(acc_train, label='train')
    pyplot.plot(acc_test, label='test')
    pyplot.legend()
    pyplot.show()

    print('Evaluation')
    model.evaluate(testX, testy)

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.RandomUniform(minval=0, maxval=1)

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(inputs)
outputs = tf.keras.layers.Dense(4)(h)
outputs = tf.keras.activations.softmax(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model)

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.GlorotNormal()

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(10, activation="relu", kernel_initializer=init)(inputs)
h = tf.keras.layers.Dense(10, activation="relu", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(10, activation="relu", kernel_initializer=init)(h)
outputs = tf.keras.layers.Dense(4)(h)
outputs = tf.keras.activations.softmax(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model)

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.RandomUniform(minval=0, maxval=1)

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(inputs)
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(h)
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(h)
outputs = tf.keras.layers.Dense(4)(h)
outputs = tf.keras.activations.softmax(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model)

### Complex multiclass

In [None]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

In [None]:
# Define model run for various model specifications
def run(model):
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy()

    model.compile(loss=loss_fn, optimizer=optimizer, metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

    acc_train = []
    acc_test = []
    for r in range(20):
        print("After {} episodes:".format((r + 1)* 10))
        history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=10, verbose=0)
        acc_train.extend(history.history['sparse_categorical_accuracy'])
        acc_test.extend(history.history['val_sparse_categorical_accuracy'])

        prediction = model.predict(trainX)
        prediction = np.argmax(model.predict(trainX), axis=1)

        for i in range(4):
            samples_ix = np.where(prediction == i)
            pyplot.scatter(trainX[samples_ix, 0], trainX[samples_ix, 1], label=str(i))
        pyplot.legend()
        pyplot.show()

    pyplot.plot(acc_train, label='train')
    pyplot.plot(acc_test, label='test')
    pyplot.legend()
    pyplot.show()

    print('Evaluation')
    model.evaluate(testX, testy)

In [None]:
# generate 2d classification dataset
X_1, y_1 = make_moons(n_samples=1000, random_state=42, noise=0.2)
scaler = MinMaxScaler(feature_range=(-1, 1))
X_1 = scaler.fit_transform(X_1)

X_2, y_2 = make_moons(n_samples=1000, random_state=42, noise=0.2)
scaler = MinMaxScaler(feature_range=(-1, 1))
X_2 = scaler.fit_transform(X_1)

X = np.append(X_1, X_2 * np.array([1.2, 0.8]) + np.array([-1, -1]), axis=0)
y = np.append(y_1, y_2 + 2, axis=0)

trainX, testX, trainy, testy = train_test_split(X, y, random_state=3)

for i in range(4):
    samples_ix = np.where(y == i)
    pyplot.scatter(X[samples_ix, 0], X[samples_ix, 1], label=str(i))
pyplot.legend()
pyplot.show()

#### Model with ReLU

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.RandomUniform(minval=0, maxval=1)

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(5, activation="relu", kernel_initializer=init)(inputs)

outputs = tf.keras.layers.Dense(4)(h)
outputs = tf.keras.activations.softmax(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model)

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.RandomUniform(minval=0, maxval=1)

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(100, activation="relu", kernel_initializer=init)(inputs)

outputs = tf.keras.layers.Dense(4)(h)
outputs = tf.keras.activations.softmax(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model)

In [None]:
tf.random.set_seed(42)

init = tf.keras.initializers.RandomUniform(minval=0, maxval=1)

inputs = tf.keras.Input(shape=(2,), name="input_values")
h = tf.keras.layers.Dense(50, activation="relu", kernel_initializer=init)(inputs)
h = tf.keras.layers.Dense(50, activation="relu", kernel_initializer=init)(inputs)
h = tf.keras.layers.Dense(50, activation="relu", kernel_initializer=init)(inputs)

outputs = tf.keras.layers.Dense(4)(h)
outputs = tf.keras.activations.softmax(outputs)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

run(model)

In [None]:
# def run(model, iterations):
#     optimizer = tf.keras.optimizers.SGD(learning_rate=0.01, momentum=0.9)
#     loss_fn = tf.keras.losses.BinaryCrossentropy()

#     model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

#     acc_train = []
#     acc_test = []
#     loss_train = []
#     loss_test = []

#     for r in range(iterations):
#         print("After {} episodes:".format((r + 1) * 10))
#         history = model.fit(trainX, trainy, validation_data=(testX, testy), epochs=10, batch_size=100, verbose=0)
#         acc_train.extend(history.history['accuracy'])
#         acc_test.extend(history.history['val_accuracy'])
#         loss_train.extend(history.history['loss'])
#         loss_test.extend(history.history['val_loss'])

#         prediction = model.predict(trainX)
#         prediction = np.where(model.predict(trainX) > 0.5, 1, 0)

#         for i in range(2):
#             samples_ix = np.where(prediction == i)
#             pyplot.scatter(X[samples_ix, 0], X[samples_ix, 1], label=str(i))
#         pyplot.legend()
#         pyplot.show()

#     pyplot.plot(acc_train, label='train')
#     pyplot.plot(acc_test, label='test')
#     pyplot.legend()
#     pyplot.show()

#     pyplot.plot(loss_train, label='train')
#     pyplot.plot(loss_test, label='test')
#     pyplot.legend()
#     pyplot.show()

#     print('Evaluation')
#     model.evaluate(testX, testy)