# Preparations

In [None]:
import numpy as np

In [None]:
# Problem dimensions used throughout the notebook.
batch_size = 8            # number of samples in the toy data set
feature_size = 5          # number of input features per sample
hidden_layer_size_1 = 10  # width of the single hidden layer
output_size = 1           # number of output units

In [None]:
def generate_X(batch_size, feature_size):
    """Return a random ``(batch_size, feature_size)`` feature matrix.

    The first ``batch_size // 2`` rows are drawn uniformly from ``[2, 7)``
    and the remaining rows uniformly from ``[-2, 3)``.  The row split is the
    same one ``generate_Y`` uses, so row ``i`` of both results belongs to the
    same group.

    An odd ``batch_size`` puts the extra row in the second group, so the
    result always has exactly ``batch_size`` rows.
    """
    first_count = batch_size // 2
    # Use the remainder rather than a second floor division so that no row is
    # lost when batch_size is odd.
    second_count = batch_size - first_count
    first_group = 5 * np.random.rand(first_count, feature_size) + 2
    second_group = 5 * np.random.rand(second_count, feature_size) - 2
    return np.concatenate((first_group, second_group))


def generate_Y(batch_size, output_size):
    """Return the ``(batch_size, output_size)`` label matrix.

    The first ``batch_size // 2`` rows are ones and the remaining rows are
    zeros, matching the row split used by ``generate_X``.  An odd
    ``batch_size`` puts the extra row in the zero-labelled group.
    """
    first_count = batch_size // 2
    second_count = batch_size - first_count
    return np.concatenate(
        (np.ones((first_count, output_size)), np.zeros((second_count, output_size)))
    )


In [None]:
# Build the toy data set: feature matrix x and its matching label matrix y.
x = generate_X(batch_size=batch_size, feature_size=feature_size)
y = generate_Y(batch_size=batch_size, output_size=output_size)

In [None]:
# Draw both weight matrices from a standard normal distribution.
# w1 maps the input features to the hidden layer; w2 maps the hidden layer
# to the output.
w1 = np.random.standard_normal((feature_size, hidden_layer_size_1))
w2 = np.random.standard_normal((hidden_layer_size_1, output_size))

In [None]:
def sigmoid(x):
    """Return the element-wise logistic function ``1 / (1 + exp(-x))``."""
    denominator = 1 + np.exp(-x)
    return 1. / denominator

def relu(x):
    """Return the element-wise rectified linear unit, ``max(x, 0)``."""
    floor = 0
    return np.maximum(x, floor)

def sigmoid_derivative(x):
    """Return the derivative of the sigmoid evaluated at ``x``.

    Uses the identity ``sigmoid'(x) = sigmoid(x) * (1 - sigmoid(x))``.
    """
    activation = sigmoid(x)
    return activation * (1 - activation)

def relu_derivative(x):
    """Return the element-wise derivative of ReLU evaluated at ``x``.

    The derivative is ``1.0`` where ``x > 0`` and ``0.0`` elsewhere (the
    value at exactly zero is taken to be 0).  ``x`` must be the input that
    was fed to ReLU in the forward pass, not an upstream gradient.
    """
    # ReLU is the identity for positive inputs and constant for the rest, so
    # its slope is an indicator of positivity, not relu(x) itself.
    return (np.asarray(x) > 0).astype(float)

# Forward propagation

In [None]:
# Step size for the single manual gradient-descent update below.
learning_rate = 0.000001

In [None]:
# Forward pass: input -> linear -> ReLU -> linear -> sigmoid.
h1 = np.dot(x, w1)              # hidden-layer pre-activation
h1_relu = relu(h1)              # hidden-layer activation
h2 = np.dot(h1_relu, w2)        # output-layer pre-activation
y_pred = sigmoid(h2)            # predicted value for each sample
y_pred

In [None]:
def log_loss(y_pred, y, eps=1e-15):
    """Return the element-wise binary log loss (cross-entropy).

    Computes ``-y * log(p) - (1 - y) * log(1 - p)`` where ``p`` is ``y_pred``
    clipped to ``[eps, 1 - eps]``.

    Parameters
    ----------
    y_pred : array_like
        Predicted probabilities.
    y : array_like
        Target labels, broadcastable against ``y_pred``.
    eps : float, optional
        Clipping margin that keeps the logarithms finite.

    Returns
    -------
    numpy.ndarray
        Per-element loss; the caller decides how to reduce it.
    """
    # Without clipping, a saturated prediction (exactly 0 or 1) evaluates
    # 0 * log(0) and yields NaN, or log(0) and yields inf.
    p = np.clip(np.asarray(y_pred, dtype=float), eps, 1 - eps)
    return -y * np.log(p) - (1 - y) * np.log(1 - p)

def grad_log_loss(y_pred, y):
    """Return the residual ``y_pred - y``.

    When ``y_pred = sigmoid(z)``, this residual is the gradient of the log
    loss with respect to the pre-sigmoid value ``z`` (not with respect to
    ``y_pred`` itself), because the sigmoid derivative cancels against the
    derivative of the log loss.
    """
    residual = y_pred - y
    return residual

In [None]:
# Total log loss over the batch (sum of the per-sample losses).
loss = np.sum(log_loss(y_pred, y))
loss

# Backpropagation

In [None]:
# Backprop: compute the gradients of the loss with respect to w1 and w2.

# grad_log_loss returns y_pred - y, which is already the gradient of the
# summed log loss with respect to the pre-sigmoid value h2 (the sigmoid
# derivative cancels against the log-loss derivative).  No additional
# sigmoid_derivative factor must be applied.
grad_h2 = grad_log_loss(y_pred, y)

# h2 = h1_relu.dot(w2), so dL/dw2 = h1_relu^T . dL/dh2
grad_w2 = h1_relu.T.dot(grad_h2)

# Propagate back through the second layer, then through the ReLU.  ReLU lets
# gradient through only where its forward input h1 was positive, so the mask
# comes from h1, not from the incoming gradient.
grad_h1_relu = grad_h2.dot(w2.T)
grad_h1 = grad_h1_relu * (h1 > 0)

# h1 = x.dot(w1), so dL/dw1 = x^T . dL/dh1
grad_w1 = x.T.dot(grad_h1)

In [None]:
# Gradient-descent step: shift each weight matrix in place, against its gradient.
w1 += -learning_rate * grad_w1
w2 += -learning_rate * grad_w2

In [None]:
# Forward pass with the updated weights: linear -> ReLU -> linear -> sigmoid.
h1 = np.dot(x, w1)              # hidden-layer pre-activation
h1_relu = relu(h1)              # hidden-layer activation
h2 = np.dot(h1_relu, w2)        # output-layer pre-activation
y_pred = sigmoid(h2)            # predicted value for each sample
y_pred

In [None]:
# Total log loss over the batch after the weight update.
loss = np.sum(log_loss(y_pred, y))
loss

# Let's add a training loop

In [None]:
# Train with plain gradient descent for a fixed number of iterations.
learning_rate = 1e-5
for t in range(500):
    # Forward pass: linear -> ReLU -> linear -> sigmoid.
    h1 = x.dot(w1)
    h1_relu = relu(h1)
    h2 = h1_relu.dot(w2)
    y_pred = sigmoid(h2)

    # Compute the summed loss and report it every 100 iterations.
    loss = log_loss(y_pred, y).sum()
    if t % 100 == 0:
        print(t, "\t", loss)

    # Backprop: gradients of the loss with respect to w1 and w2.
    # grad_log_loss returns y_pred - y, which is already the gradient with
    # respect to the pre-sigmoid value h2 (the sigmoid derivative cancels
    # against the log-loss derivative), so no sigmoid_derivative factor is
    # applied.
    grad_h2 = grad_log_loss(y_pred, y)

    # h2 = h1_relu.dot(w2), so dL/dw2 = h1_relu^T . dL/dh2
    grad_w2 = h1_relu.T.dot(grad_h2)

    # Back through the second layer, then through the ReLU.  The ReLU mask
    # comes from the forward input h1, not from the incoming gradient.
    grad_h1_relu = grad_h2.dot(w2.T)
    grad_h1 = grad_h1_relu * (h1 > 0)

    # h1 = x.dot(w1), so dL/dw1 = x^T . dL/dh1
    grad_w1 = x.T.dot(grad_h1)

    # Update weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

In [None]:
# Final forward pass with the trained weights: linear -> ReLU -> linear -> sigmoid.
h1 = np.dot(x, w1)              # hidden-layer pre-activation
h1_relu = relu(h1)              # hidden-layer activation
h2 = np.dot(h1_relu, w2)        # output-layer pre-activation
y_pred = sigmoid(h2)            # predicted value for each sample
y_pred