In [1]:
import numpy as np
import matplotlib.pyplot as plt
import math

# Dense Layer

**Sum Function**
$$
f(x, y) = x + y \\
\frac{\partial}{\partial x}f(x, y) = 1 \\
\frac{\partial}{\partial y}f(x, y) = 1
$$

**Multiplication**
$$
f(x,y) = x \cdot y \\
\frac{\partial}{\partial x}f(x, y) = y \\
\frac{\partial}{\partial y}f(x, y) = x
$$

In [17]:
class Layer_Dense:
    """Fully connected layer computing ``inputs . weights + biases``."""

    def __init__(self, n_inputs, n_neurons):
        """Create the layer parameters.

        Weights get shape (n_inputs, n_neurons) and are drawn from a standard
        normal distribution scaled by 0.01; biases are a (1, n_neurons) row
        of zeros.
        """
        weight_shape = (n_inputs, n_neurons)
        self.weights = 0.01 * np.random.randn(*weight_shape)
        self.biases = np.zeros((1, n_neurons))

    def forward(self, inputs):
        """Remember ``inputs`` and store the affine output in ``self.output``."""
        self.inputs = inputs
        weighted_sum = np.dot(inputs, self.weights)
        self.output = weighted_sum + self.biases

    def backward(self, dvalues):
        """Store gradients for the upstream gradient ``dvalues``.

        Sets ``self.dbiases`` (column sums of ``dvalues``), ``self.dweights``
        (``inputs.T . dvalues``) and ``self.dinputs`` (``dvalues . weights.T``).
        Uses the inputs saved by the last ``forward`` call.
        """
        # Sum rule: each bias contributes with derivative 1 for every sample.
        self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
        # Product rule: the derivative w.r.t. a weight is its input, and vice versa.
        self.dweights = np.dot(self.inputs.T, dvalues)
        self.dinputs = np.dot(dvalues, self.weights.T)

# Activation Functions

**ReLU**
$$
f(x) = \text{max}(x, 0) \\
\frac{d}{dx}f(x) = 1(x \gt 0)
$$

In [16]:
class Activation_ReLU:
    """Rectified linear unit: keeps positive values and zeroes the rest."""

    def forward(self, inputs):
        """Remember ``inputs`` and store ``max(0, inputs)`` in ``self.output``."""
        self.inputs = inputs
        self.output = np.maximum(0, inputs)

    def backward(self, dvalues):
        """Store in ``self.dinputs`` a copy of ``dvalues`` zeroed where the input was <= 0."""
        # The derivative is 1 for positive inputs and 0 otherwise, so the upstream
        # gradient passes through unchanged or is blocked entirely.
        blocked = self.inputs <= 0
        gradient = dvalues.copy()  # copy so the caller's array is not modified
        gradient[blocked] = 0
        self.dinputs = gradient

# Softmax Implementation

In [5]:
# Scratch: softmax by hand in plain Python
layer_out = [4.8, 1.21, 2.385]
E = 2.71828182846  # Euler's number, truncated
# Exponentiate every raw output so all values become positive
exp_values = [E ** out for out in layer_out]
print(exp_values)

# Divide by the total so the values sum to 1
norm_base = sum(exp_values)
norm_values = [value / norm_base for value in exp_values]
print(norm_values)
sum(norm_values)

# Numpy: the same computation, vectorised
layer_out = [4.8, 1.21, 2.385]
exp_values = np.exp(layer_out)
print(exp_values)

norm_values = exp_values / exp_values.sum()
print(norm_values)
norm_values.sum()

[121.51041751893969, 3.3534846525504487, 10.85906266492961]
[0.8952826639573506, 0.024708306782070668, 0.08000902926057876]
[121.51041752   3.35348465  10.85906266]
[0.89528266 0.02470831 0.08000903]


0.9999999999999999

In [6]:
class Activation_Softmax:
    """Softmax activation: turns each row of scores into a probability distribution."""

    def forward(self, inputs):
        """Compute the row-wise softmax of ``inputs`` and store it in ``self.output``.

        The reductions run along axis 1, so ``inputs`` must be 2-D with one
        row per sample.
        """
        self.inputs = inputs
        # Subtract each row's maximum before exponentiating: the largest exponent
        # becomes 0, so np.exp cannot overflow. The shift cancels out in the
        # normalisation below, leaving the softmax result unchanged.
        exp_values = np.exp(inputs - np.max(inputs, axis=1, keepdims=True))
        probabilities = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        self.output = probabilities

    def backward(self, dvalues):
        """Store the gradient w.r.t. the softmax inputs in ``self.dinputs``.

        ``dvalues`` is the upstream gradient, one row per sample, matching the
        rows of ``self.output`` from the last ``forward`` call.
        """
        # Uninitialised buffer with the shape and dtype of dvalues; every row
        # is overwritten in the loop below.
        self.dinputs = np.empty_like(dvalues)
        for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
            # Column vector so the outer product below is (n, n).
            single_output = single_output.reshape(-1, 1)
            # Softmax Jacobian for one sample: diag(s) - s . s^T
            jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
            # Chain rule: apply the Jacobian to this sample's upstream gradient.
            self.dinputs[index] = np.dot(jacobian_matrix, single_dvalues)

# Categorical Cross-Entropy
$L_i = -\sum_j y_{i,j}\log(\hat{y}_{i,j})$
Where $L_i$ denotes sample loss value, $i$ is the $i$-th sample in the set, $j$ is the label/output index, $y$
denotes the target values, and $\hat y$ denotes the predicted values.

When the targets are one-hot encoded, every term except the one for the true class is multiplied by zero, so the sum simplifies to $-\log(\text{correct\_class\_confidence})$:
$L_i = -\log(\hat{y}_{i,k})$, where $k$ is the index of the true class for sample $i$.

This is also a special case of the cross-entropy calculation, called categorical cross-entropy

In [7]:
softmax_output = [0.7, 0.1, 0.2]
target_output = [1, 0, 0]  # one-hot encoded
# Full categorical cross-entropy: sum over every class of log(prediction) * target
loss = -sum(
    math.log(confidence) * target
    for confidence, target in zip(softmax_output, target_output)
)
print(loss)

# Every term multiplied by a zero target vanishes, so only the log of the
# confidence at the one-hot ("hot") index is needed
loss = -math.log(softmax_output[target_output.index(1)])
print(loss)

0.35667494393873245
0.35667494393873245


In [8]:
# Batch data: one predicted distribution per row
softmax_outputs = np.array([
    [0.7, 0.1, 0.2],
    [0.1, 0.5, 0.4],
    [0.02, 0.9, 0.08],
])
class_targets = np.array([0, 1, 1])  # dog, cat, cat

# Confidence assigned to the correct class, one sample at a time
for sample_idx, target_idx in enumerate(class_targets):
    print(softmax_outputs[sample_idx][target_idx])

# The same lookup for the whole batch using paired row/column indices
print(softmax_outputs[range(len(softmax_outputs)), class_targets])

0.7
0.5
0.9
[0.7 0.5 0.9]


In [9]:
class Loss:
    """Common base for loss functions; subclasses are expected to supply ``forward``."""

    def __init__(self):
        """Nothing to set up in the base class."""

    def calculate(self, output, y):
        """Return the mean of the per-sample losses from ``self.forward(output, y)``."""
        return np.mean(self.forward(output, y))

In [10]:
class Loss_CategoricalCrossEntropy(Loss):
    """Categorical cross-entropy loss for sparse or one-hot class labels."""

    def __init__(self):
        super().__init__()

    # Forward pass
    def forward(self, y_pred, y_true):
        """Return the per-sample negative log-likelihoods.

        y_pred: predicted values, one row per sample.
        y_true: a 1-D array of class indices, or a 2-D one-hot array with the
            same layout as ``y_pred``.

        Raises ValueError if ``y_true`` is neither 1-D nor 2-D.
        """
        # Number of samples in a batch
        samples = len(y_pred)

        # Clip to keep predictions away from 0 so that log(0) = -inf cannot occur.
        # The upper bound is clipped by the same amount so the mean is not
        # dragged towards either side.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        # Probabilities for target values - only if categorical labels
        if len(y_true.shape) == 1:
            correct_confidences = y_pred_clipped[range(samples), y_true]

        # Mask values - only for one-hot encoded labels
        elif len(y_true.shape) == 2:
            correct_confidences = np.sum(y_pred_clipped * y_true, axis=1)

        else:
            raise ValueError(
                "y_true must be 1-D class indices or 2-D one-hot labels, "
                f"got {len(y_true.shape)} dimensions"
            )

        # Losses
        negative_log_likelihoods = -np.log(correct_confidences)
        return negative_log_likelihoods

    def backward(self, y_pred, y_true):
        """Store the gradient of the mean loss w.r.t. ``y_pred`` in ``self.dinputs``.

        y_pred: predicted values, one row per sample.
        y_true: 1-D class indices (expanded to one-hot here) or 2-D one-hot labels.
        """
        samples = len(y_pred)
        labels = len(y_pred[0])
        # Sparse labels -> one-hot rows, so the division below is element-wise.
        if len(y_true.shape) == 1:
            y_true = np.eye(labels)[y_true]

        # Use the same clipping as forward(): a prediction of exactly 0 would
        # otherwise give -1/0 = -inf (or 0/0 = nan) and poison every gradient
        # upstream of this loss.
        y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)

        self.dinputs = -y_true / y_pred_clipped
        # Normalise by batch size so the gradient matches the mean loss.
        self.dinputs = self.dinputs / samples

In [11]:
# Softmax classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy:
    """Softmax activation and categorical cross-entropy loss combined.

    Combining the two allows ``backward`` to use the simplified gradient
    ``y_pred - y_true`` rather than building a Jacobian per sample.
    """

    # Creates activation and loss function objects
    def __init__(self):
        self.activation = Activation_Softmax()
        # The loss class in this notebook is spelled with a capital "E".
        self.loss = Loss_CategoricalCrossEntropy()

    # Forward pass
    def forward(self, inputs, y_true):
        """Run the activation on ``inputs``, store its output, and return the loss.

        The activation's output is kept in ``self.output``; the return value
        is ``self.loss.calculate(self.output, y_true)``.
        """
        # Output layer's activation function
        self.activation.forward(inputs)
        # Set the output
        self.output = self.activation.output
        # Calculate and return loss value
        return self.loss.calculate(self.output, y_true)

    # Backward pass
    def backward(self, y_pred, y_true):
        """Store the combined gradient in ``self.dinputs``.

        y_pred: array of predictions, one row per sample (copied, not modified).
        y_true: 1-D class indices, or 2-D one-hot labels (reduced with argmax).
        """
        # Number of samples
        samples = len(y_pred)

        # If labels are one-hot encoded,
        # turn them into discrete values
        if len(y_true.shape) == 2:
            y_true = np.argmax(y_true, axis=1)

        # Copy so we can safely modify
        self.dinputs = y_pred.copy()
        # Calculate gradient: subtract 1 at each sample's true class
        self.dinputs[range(samples), y_true] -= 1
        # Normalize gradient
        self.dinputs = self.dinputs / samples


# Accuracy

In [12]:
softmax_outputs = np.array([
    [0.2, 0.7, 0.1],
    [0.5, 0.1, 0.4],
    [0.02, 0.9, 0.08],
])
class_targets = np.array([0, 1, 1])

# Predicted class = index of the highest confidence in each row
predictions = softmax_outputs.argmax(axis=1)
# One-hot (2-D) targets are reduced to class indices before comparing
if class_targets.ndim == 2:
    class_targets = class_targets.argmax(axis=1)
# Fraction of samples whose predicted class matches the target
accuracy = (predictions == class_targets).mean()
print(accuracy)

0.3333333333333333


(200, 1)