In [1]:
import numpy as np
import random

random.seed(0)
np.random.seed(0)


# Our sample dataset
def create_data(n, k):
    X = np.zeros((n*k, 2))  # data matrix (each row = single example)
    y = np.zeros(n*k, dtype='uint8')  # class labels
    for j in range(k):
        ix = range(n*j, n*(j+1))
        r = np.linspace(0.0, 1, n)  # radius
        t = np.linspace(j*4, (j+1)*4, n) + np.random.randn(n)*0.2  # theta
        X[ix] = np.c_[r*np.sin(t*2.5), r*np.cos(t*2.5)]
        y[ix] = j
    return X, y


# Dense layer
class Layer_Dense:

    # Layer initialization
    def __init__(self, inputs, neurons):
        # Initialize weights and biases
        self.weights = 0.01 * np.random.randn(inputs, neurons)
        self.biases = np.zeros((1, neurons))

    # Forward pass
    def forward(self, inputs):
        # Remember input values
        self.inputs = inputs
        # Calculate output values from input ones, weights and biases
        self.output = np.dot(inputs, self.weights) + self.biases

    # Backward pass
    def backward(self, dvalues):
        # Gradients on parameters
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Gradient on values
        self.dvalues = np.dot(dvalues, self.weights.T)


# ReLU activation
class Activation_ReLU:

    # Forward pass
    def forward(self, inputs):
        # Remember input values
        self.inputs = inputs
        # Calculate output values from input ones
        self.output = np.maximum(0, inputs)

    # Backward pass
    def backward(self, dvalues):
        # Since we need to modify original variable, 
        # let's make a copy of values first
        self.dvalues = dvalues.copy()

        # Zero gradient where input values were negative 
        self.dvalues[self.inputs <= 0] = 0 


# Softmax activation
class Activation_Softmax:

    # Forward pass
    def forward(self, inputs):
        # Remember input values
        self.inputs = inputs

        # get unnormalized probabilities
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        # normalize them for each sample
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)

        self.output = probabilities

    # Backward pass
    def backward(self, dvalues):
        self.dvalues = dvalues.copy()


# Cross-entropy loss
class Loss_CategoricalCrossentropy:

    # Forward pass
    def forward(self, y_pred, y_true):

        # Number of samples in a batch
        samples = y_pred.shape[0]

        # Probabilities for target values - only if categorical labels
        if len(y_true.shape) == 1:
            y_pred = y_pred[range(samples), y_true]

        # Losses
        negative_log_likelihoods = -np.log(y_pred)

        # Mask values - only for one-hot encoded labels
        if len(y_true.shape) == 2:
            negative_log_likelihoods *= y_true

        # Overall loss
        data_loss = np.sum(negative_log_likelihoods) / samples
        return data_loss

    # Backward pass
    def backward(self, dvalues, y_true):

        samples = dvalues.shape[0]

        self.dvalues = dvalues.copy()  # Copy so we can safely modify
        self.dvalues[range(samples), y_true] -= 1
        self.dvalues = self.dvalues / samples
class Optimizer_Adam:

    # Initialize optimizer - set settings
    def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7, beta_1=0.9, beta_2=0.999):
        self.learning_rate = learning_rate
        self.current_learning_rate = learning_rate
        self.decay = decay
        self.iterations = 0
        self.epsilon = epsilon
        self.beta_1 = beta_1
        self.beta_2 = beta_2

    # Call once before any parameter updates
    def pre_update_params(self):
        if self.decay:
            self.current_learning_rate = self.current_learning_rate * (1. / (1. + self.decay * self.iterations))

    # Update parameters
    def update_params(self, layer):

        # If layer does not contain cache arrays,
        # create them filled with zeros
        if not hasattr(layer, 'weight_cache'):
            layer.weight_momentums = np.zeros_like(layer.weights)
            layer.weight_cache = np.zeros_like(layer.weights)
            layer.bias_momentums = np.zeros_like(layer.biases)
            layer.bias_cache = np.zeros_like(layer.biases)

        # Update momentum  with current gradients
        layer.weight_momentums = (self.beta_1 * layer.weight_momentums +
                                 (1 - self.beta_1) * layer.dweights)
        layer.bias_momentums = (self.beta_1 * layer.bias_momentums +
                               (1 - self.beta_1) * layer.dbiases)
        # Get corrected momentum
        # self.iteration is 0 at first pass
        # and we need to start with 1 here
        weight_momentums_corrected = (layer.weight_momentums /
            (1 - self.beta_1 ** (self.iterations + 1)))
        bias_momentums_corrected = (layer.bias_momentums /
            (1 - self.beta_1 ** (self.iterations + 1)))
        # Update cache with squared current gradients
        layer.weight_cache = (self.beta_2 * layer.weight_cache +
            (1 - self.beta_2) * layer.dweights**2)
        layer.bias_cache = (self.beta_2 * layer.bias_cache +
            (1 - self.beta_2) * layer.dbiases**2)
        # Get corrected cachebias
        weight_cache_corrected = (layer.weight_cache /
            (1 - self.beta_2 ** (self.iterations + 1)))
        bias_cache_corrected = (layer.bias_cache /
            (1 - self.beta_2 ** (self.iterations + 1)))

        # Vanilla SGD parameter update + normalization with square rooted cache
        layer.weights += (-self.current_learning_rate *
                         weight_momentums_corrected /
                         (np.sqrt(weight_cache_corrected) +
                             self.epsilon))
        layer.biases += (-self.current_learning_rate *
                         bias_momentums_corrected /
                         (np.sqrt(bias_cache_corrected) +
                             self.epsilon))

    # Call once after any parameter updates
    def post_update_params(self):
        self.iterations += 1


# Create dataset
X, y = create_data(100, 3)

# Create Dense layer with 2 input features and 3 output values
dense1 = Layer_Dense(2, 64)  # first dense layer, 2 inputs (each sample has 2 features), 3 outputs

# Create ReLU activation (to be used with Dense layer):
activation1 = Activation_ReLU()

# Create second Dense layer with 3 input features (as we take output of previous layer here) and 3 output values (output values)
dense2 = Layer_Dense(64, 3)  # second dense layer, 3 inputs, 3 outputs

# Create Softmax activation (to be used with Dense layer):
activation2 = Activation_Softmax()

# Create loss function
loss_function = Loss_CategoricalCrossentropy()

# Create optimizer
optimizer = Optimizer_Adam(learning_rate=0.0575, decay=1e-8)

# Train in loop
for epoch in range(10001):

    # Make a forward pass of our training data thru this layer
    dense1.forward(X)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation1.forward(dense1.output)

    # Make a forward pass thru second Dense layer - it takes outputs of activation function of first layer as inputs
    dense2.forward(activation1.output)

    # Make a forward pass thru activation function - we take output of previous layer here
    activation2.forward(dense2.output)

    # Calculate loss from output of activation2 so softmax activation
    loss = loss_function.forward(activation2.output, y)

    # Calculate accuracy from output of activation2 and targets
    predictions = np.argmax(activation2.output, axis=1)  # calculate values along first axis
    accuracy = np.mean(predictions==y)

    if not epoch % 100:
        print(f'epoch: {epoch}, acc: {accuracy:.3f}, loss: {loss:.3f}, lr: {optimizer.current_learning_rate}')

    # Backward pass
    loss_function.backward(activation2.output, y)
    activation2.backward(loss_function.dvalues)
    dense2.backward(activation2.dvalues)
    activation1.backward(dense2.dvalues)
    dense1.backward(activation1.dvalues)

    # Update weights
    optimizer.pre_update_params()
    optimizer.update_params(dense1)
    optimizer.update_params(dense2)
    optimizer.post_update_params()

        
        

epoch: 0, acc: 0.360, loss: 1.099, lr: 0.0575
epoch: 100, acc: 0.753, loss: 0.607, lr: 0.05749715382138749
epoch: 200, acc: 0.787, loss: 0.479, lr: 0.057488558646061005
epoch: 300, acc: 0.863, loss: 0.353, lr: 0.0574742170579969
epoch: 400, acc: 0.893, loss: 0.293, lr: 0.05745413336430066
epoch: 500, acc: 0.900, loss: 0.260, lr: 0.05742831359305091
epoch: 600, acc: 0.920, loss: 0.230, lr: 0.05739676549028411
epoch: 700, acc: 0.920, loss: 0.208, lr: 0.05735949851612153
epoch: 800, acc: 0.930, loss: 0.201, lr: 0.05731652384004162
epoch: 900, acc: 0.920, loss: 0.200, lr: 0.05726785433530062
epoch: 1000, acc: 0.937, loss: 0.179, lr: 0.05721350457250595
epoch: 1100, acc: 0.943, loss: 0.172, lr: 0.057153490812346115
epoch: 1200, acc: 0.950, loss: 0.167, lr: 0.05708783099748289
epoch: 1300, acc: 0.953, loss: 0.160, lr: 0.05701654474361081
epoch: 1400, acc: 0.957, loss: 0.155, lr: 0.05693965332969104
epoch: 1500, acc: 0.957, loss: 0.150, lr: 0.05685717968736526
epoch: 1600, acc: 0.960, loss: 0