# Numerical Gradient Checking

We highly recommend reading `neural_networks.utils.grad_check.check_gradients` and making sure you understand how numerical gradient checking is carried out. This function is used throughout this notebook to check the gradients of the neural network layers you write. Be sure to check the gradient of each layer as soon as you finish implementing it.

The function returns the relative error of the numerical gradient (approximated using finite differences) with respect to the analytical gradient (computed via backpropagation). Correct implementations should get very small errors, usually less than `1e-8` for 64-bit float matrices (the default).

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
from neural_networks.utils.grad_check import check_gradients
from neural_networks.layers import FullyConnected, Elman, Conv2D
from neural_networks.activations import Linear, Sigmoid, TanH, ReLU, SoftMax

## Gradient Checks for Activation Functions

### Linear Activation

In [None]:
# random input and the gradient that is handed to backward() below
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize the linear activation
# and perform a forward and backward pass
linear_activation = Linear()
_ = linear_activation.forward(X)
grad = linear_activation.backward(X, dLdY)

# check the gradient w.r.t. the input X
print(
    "Relative error for linear activation:",
    check_gradients(
        fn=linear_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # the same gradient that was passed to backward()
    )
)

### Sigmoid Activation

In [None]:
# random input and the gradient that is handed to backward() below
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize the sigmoid activation
# and perform a forward and backward pass
sigmoid_activation = Sigmoid()
_ = sigmoid_activation.forward(X)
grad = sigmoid_activation.backward(X, dLdY)

# check the gradient w.r.t. the input X
print(
    "Relative error for sigmoid activation:",
    check_gradients(
        fn=sigmoid_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # the same gradient that was passed to backward()
    )
)

### Tanh Activation

In [None]:
# random input and the gradient that is handed to backward() below
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize the tanh activation
# and perform a forward and backward pass
tanh_activation = TanH()
_ = tanh_activation.forward(X)
grad = tanh_activation.backward(X, dLdY)

# check the gradient w.r.t. the input X
print(
    "Relative error for tanh activation:",
    check_gradients(
        fn=tanh_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # the same gradient that was passed to backward()
    )
)

### ReLU Activation

In [None]:
# random input and the gradient that is handed to backward() below
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize the ReLU activation
# and perform a forward and backward pass
relu_activation = ReLU()
# NOTE(review): `out` is not used in this cell; the name is kept in case
# another cell reads it -- confirm before replacing it with `_`.
out = relu_activation.forward(X)
grad = relu_activation.backward(X, dLdY)

# check the gradient w.r.t. the input X
print(
    "Relative error for relu activation:",
    check_gradients(
        fn=relu_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # the same gradient that was passed to backward()
    )
)

### Softmax Activation

In [None]:
# random input and the gradient that is handed to backward() below
X = np.random.randn(2, 3)
dLdY = np.random.randn(2, 3)

# initialize the softmax activation
# and perform a forward and backward pass
softmax_activation = SoftMax()
_ = softmax_activation.forward(X)
grad = softmax_activation.backward(X, dLdY)

# check the gradient w.r.t. the input X
print(
    "Relative error for softmax activation:",
    check_gradients(
        fn=softmax_activation.forward,  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=X,        # the variable w.r.t. which we are taking the gradient
        dLdf=dLdY,  # the same gradient that was passed to backward()
    )
)

## Gradient Checks for Full Layers (Linear Activations)

### Fully Connected Layer

In [None]:
# Gradient check for a fully connected layer with a linear activation.
# X is drawn before dLdY, so the random draws happen in the same order.
X, dLdY = np.random.randn(2, 3), np.random.randn(2, 4)

# build the layer, then run one forward and one backward pass;
# the loop below reads `fc_layer.gradients` afterwards
fc_layer = FullyConnected(n_out=4, activation="linear")
_ = fc_layer.forward(X)
_ = fc_layer.backward(dLdY)

# compare the analytical and numerical gradient for every parameter
for param in fc_layer.parameters:
    relative_error = check_gradients(
        # function of this one parameter, built by the layer for input X
        fn=fc_layer.forward_with_param(param, X),
        # analytically computed gradient for this parameter
        grad=fc_layer.gradients[param],
        # the parameter we differentiate with respect to
        x=fc_layer.parameters[param],
        # the same gradient that was passed to backward()
        dLdf=dLdY,
    )
    print(f"Relative error for {param}:", relative_error)

### Elman Recurrent Layer

In [None]:
# Gradient check for an Elman recurrent layer with a linear activation.
# X is drawn before dLdY, so the random draws happen in the same order.
X, dLdY = np.random.randn(2, 3, 4), np.random.randn(2, 5)

# build the recurrent layer, then run one forward and one backward pass;
# the loop below reads `elman_layer.gradients` afterwards
elman_layer = Elman(n_out=5, activation="linear")
_ = elman_layer.forward(X)
_ = elman_layer.backward(dLdY)

# compare the analytical and numerical gradient for every parameter
for param in elman_layer.parameters:
    relative_error = check_gradients(
        # function of this one parameter, built by the layer for input X
        fn=elman_layer.forward_with_param(param, X),
        # analytically computed gradient for this parameter
        grad=elman_layer.gradients[param],
        # the parameter we differentiate with respect to
        x=elman_layer.parameters[param],
        # the same gradient that was passed to backward()
        dLdf=dLdY,
    )
    print(f"Relative error for {param}:", relative_error)

### Conv Layer

In [None]:
# Gradient check for a 2D convolutional layer with a linear activation.
# X is drawn before dLdY, so the random draws happen in the same order.
X, dLdY = np.random.randn(2, 5, 6, 7), np.random.randn(2, 5, 6, 4)

# build the convolutional layer, then run one forward and one backward pass;
# the loop below reads `conv_layer.gradients` afterwards
conv_layer = Conv2D(
    n_out=4,
    kernel_shape=(3, 3),
    activation="linear",
    weight_init="uniform",
    pad="same",
)
_ = conv_layer.forward(X)
_ = conv_layer.backward(dLdY)

# compare the analytical and numerical gradient for every parameter
for param in conv_layer.parameters:
    relative_error = check_gradients(
        # function of this one parameter, built by the layer for input X
        fn=conv_layer.forward_with_param(param, X),
        # analytically computed gradient for this parameter
        grad=conv_layer.gradients[param],
        # the parameter we differentiate with respect to
        x=conv_layer.parameters[param],
        # the same gradient that was passed to backward()
        dLdf=dLdY,
    )
    print(f"Relative error for {param}:", relative_error)

In [None]:
# Gradient check for the cross-entropy loss.
from neural_networks.losses import CrossEntropy

num_pts = 5
num_classes = 6

# one-hot encoded y: exactly one entry per row is set to 1
y_idxs = np.random.randint(0, num_classes, (num_pts,))
y = np.zeros((num_pts, num_classes))
y[range(num_pts), y_idxs] = 1

# normalized predictions: every row of y_hat sums to 1
scores = np.random.uniform(0, 1, size=(num_pts, num_classes))
y_hat = scores / scores.sum(axis=1, keepdims=True)

cross_entropy_loss = CrossEntropy("cross_entropy")


def forward_fn(Y, Y_hat):
    """Return a one-argument loss function with the labels `Y` held fixed.

    `check_gradients` is given a function of the single variable being
    differentiated, so the labels are captured in a closure here.

    Parameters
    ----------
    Y : labels that are captured by the returned function
    Y_hat : unused; it is shadowed by the argument of the returned function
        and is kept only so existing two-argument calls keep working

    Returns
    -------
    A function mapping predictions `Y_hat` to
    `cross_entropy_loss.forward(Y, Y_hat)`.
    """
    def inner_forward(Y_hat):
        return cross_entropy_loss.forward(Y, Y_hat)
    return inner_forward


# NOTE(review): `loss` is not used below; the forward call is kept in case
# backward() relies on it or another cell reads `loss` -- confirm.
loss = cross_entropy_loss.forward(y, y_hat)
grad = cross_entropy_loss.backward(y, y_hat)

# check the gradient w.r.t. the predictions y_hat
print(
    "Relative error for cross entropy loss:",
    check_gradients(
        fn=forward_fn(y, y_hat),  # the function we are checking
        grad=grad,  # the analytically computed gradient
        x=y_hat,        # the variable w.r.t. which we are taking the gradient
        dLdf=1,  # passed as the constant 1 for the loss
    )
)