# Preparations

In [1]:
import numpy as np

In [2]:
batch_size, feature_size, output_size = 8, 5, 1

In [3]:
def generate_X(batch_size, feature_size):
    return np.concatenate((5 * np.random.rand(batch_size // 2, feature_size) + 2, 5 * np.random.rand(batch_size//2, feature_size) - 2))

def generate_Y(batch_size, output_size):
    return np.concatenate((np.ones((batch_size//2, output_size)), np.zeros((batch_size//2, output_size))))


In [4]:
# Create input and output data
x = generate_X(batch_size, feature_size)
x0 = np.ones((batch_size, 1))
y = generate_Y(batch_size, output_size)

In [5]:
# Randomly initialize weights
w1 = np.random.randn(feature_size, output_size)
b1 = np.random.randn(1, output_size)

In [6]:
def sigmoid(x):
    return 1. / (1 + np.exp(-x))

def sigmoid_derivative(x):
    return sigmoid(x)*(1 - sigmoid(x))

# Forward propagation

In [7]:
learning_rate = 1e-5

In [8]:
# Forward pass: compute predicted y
h1 = x.dot(w1) + b1
y_pred = sigmoid(h1)
y_pred

array([[0.20214854],
       [0.06403529],
       [0.05969032],
       [0.94659729],
       [0.07808954],
       [0.1134001 ],
       [0.37302664],
       [0.07176207]])

In [9]:
def log_loss(y_pred, y):
    return (-y * np.log(y_pred) - (1 - y) * np.log(1 - y_pred))

def grad_log_loss(y_pred, y):
    return y_pred - y

In [10]:
# Compute and print loss
loss = log_loss(y_pred, y)
loss.sum()

7.9635275682126565

# Backpropagation

In [11]:
# Backprop to compute gradients of w1 with respect to loss
grad_y_pred = grad_log_loss(y_pred, y)

grad_sigmoid = sigmoid_derivative(grad_y_pred) * grad_y_pred

grad_w1 = x.T.dot(grad_sigmoid)
grad_b1 = x0.T.dot(grad_sigmoid)

In [12]:
# Update weights
w1 -= learning_rate * grad_w1
b1 -= learning_rate * grad_b1

In [13]:
# Forward pass: compute predicted y
h1 = x.dot(w1) + b1
y_pred = sigmoid(h1)
y_pred

array([[0.2022189 ],
       [0.0640735 ],
       [0.05972626],
       [0.94663026],
       [0.07809277],
       [0.11341119],
       [0.37304639],
       [0.0717628 ]])

In [14]:
# Compute and print loss
loss = log_loss(y_pred, y)
loss.sum()

7.961994473123256

# Let's add loop

In [15]:
learning_rate = 1e-5
for t in range(500):
    # Forward pass: compute predicted y
    h1 = x.dot(w1) + b1
    y_pred = sigmoid(h1)

    # Compute and print loss
    loss = log_loss(y_pred, y)
    if t % 100 == 0: print(t, "\t", loss.sum())

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = grad_log_loss(y_pred, y)
    grad_sigmoid = sigmoid_derivative(grad_y_pred) * grad_y_pred

    # calculate gradient for w1
    grad_w1 = x.T.dot(grad_sigmoid)
    grad_b1 = x0.T.dot(grad_sigmoid)

    # Update weights
    w1 -= learning_rate * grad_w1
    b1 -= learning_rate * grad_b1

0 	 7.961994473123256
100 	 7.8096078154666095
200 	 7.6590581651886644
300 	 7.510361855539488
400 	 7.363537748962605


In [16]:
# Forward pass: compute predicted y
h1 = x.dot(w1) + b1
y_pred = sigmoid(h1)
y_pred

array([[0.23915826],
       [0.08571688],
       [0.08013961],
       [0.96073853],
       [0.0797036 ],
       [0.11899488],
       [0.38283717],
       [0.07212657]])