Book: https://fulyankin.github.io/deep_learning_masha_book/problem_set_03_backprop/problem_05.html

In [59]:
import numpy as np
import math
from abc import ABC, abstractmethod
import random
random.seed(42)  # stdlib RNG: used for the random index picks in the gradient check
np.random.seed(42)  # NumPy global RNG: used for weight initialisation and the synthetic data

In [60]:
def sigmoid(x, dx = 0):
    """Logistic sigmoid ``1 / (1 + exp(-x))`` or, when ``dx`` is truthy, its derivative.

    Args:
        x: Scalar or NumPy array; the function is applied element-wise.
        dx: If truthy, return the derivative ``s * (1 - s)`` with
            ``s = sigmoid(x)`` instead of the function value.

    Returns:
        The sigmoid (or its derivative) evaluated at ``x``.
    """
    if dx:
        y = sigmoid(x)
        return y * (1 - y)
    # log(1 + exp(-x)) is evaluated through logaddexp, which never forms
    # exp(-x) on its own, so large negative inputs do not overflow (and warn)
    # on their way to 0.
    return np.exp(-np.logaddexp(0.0, -x))

def mse_loss(y, y_h):
    """Mean squared error and its gradient with respect to the predictions.

    Args:
        y: Target array.
        y_h: Prediction array of the same shape as ``y``.

    Returns:
        ``(loss, dL_dY)``: the mean of the squared element-wise differences and
        the array of partial derivatives of that mean with respect to ``y_h``.
    """
    diff = y_h - y
    loss = (diff ** 2).mean()
    # The mean runs over every element, so the gradient is scaled by the
    # element count. Dividing by the number of rows is only correct for
    # single-column targets (where both counts coincide).
    dL_dY = 2 * diff / diff.size  # d wrt y_h
    return loss, dL_dY

In [61]:
class Layer(ABC):
    """Abstract base for the building blocks of the network.

    A concrete subclass has to override both ``forward`` and ``backward``
    before it can be instantiated.
    """

    @abstractmethod
    def forward(self):
        """Forward pass; must be overridden by subclasses."""

    @abstractmethod
    def backward(self):
        """Backward pass; must be overridden by subclasses."""

class Linear(Layer):
    """Bias-free fully connected layer computing ``X @ W``."""

    def __init__(self, in_dim, out_dim):
        # Weights of shape (in_dim, out_dim): np.random.rand draws scaled by 0.1.
        self.W = 0.1 * np.random.rand(in_dim, out_dim)

    def forward(self, X):
        """Remember the input for the backward pass and return ``X @ W``."""
        self.X = X
        return np.matmul(X, self.W)

    def backward(self, dY):
        """Store the weight gradient in ``self.dW`` and return the input gradient.

        ``dY`` is the gradient with respect to this layer's output; the input
        cached by the latest ``forward`` call is used to form ``self.dW``.
        """
        self.dW = np.matmul(self.X.T, dY)  # d wrt W
        grad_input = np.matmul(dY, self.W.T)  # d wrt X
        return grad_input

class Sigmoid(Layer):
    """Element-wise sigmoid activation layer."""

    def forward(self, X):
        """Apply ``sigmoid`` to ``X`` and cache the result for ``backward``."""
        activated = sigmoid(X)
        self.Y = activated
        return activated

    def backward(self, dY):
        """Scale ``dY`` by ``Y * (1 - Y)``, computed from the cached output ``Y``."""
        local_grad = self.Y * (1 - self.Y)
        return dY * local_grad

In [62]:
class Model:
    """Fixed network: two (Linear(2, 2) -> Sigmoid) stages followed by Linear(2, 1)."""

    def __init__(self):
        stack = []
        for _ in range(2):
            stack += [Linear(2, 2), Sigmoid()]
        stack.append(Linear(2, 1))
        self.layers = stack

    def predict(self, X):
        """Feed ``X`` through every layer in order and return the final output."""
        out = X
        for layer in self.layers:
            out = layer.forward(out)
        return out

    def SGD(self, dL_dY, lr=0.1):
        """Backpropagate ``dL_dY`` and take one gradient step on each Linear layer.

        Args:
            dL_dY: Gradient of the loss with respect to the model output.
            lr: Step size; each Linear layer's ``W`` is decreased by ``lr * dW``.

        The running gradient is kept in ``self.dX``; after the call it holds the
        value returned by the first layer's ``backward``.
        """
        self.dX = dL_dY  # gradient coming from the loss
        # Chain rule: walk the layers from the output back to the input.
        for layer in reversed(self.layers):
            self.dX = layer.backward(self.dX)
            if isinstance(layer, Linear):
                layer.W -= lr * layer.dW

In [63]:
model = Model()

n = 5  # number of observations
X = np.random.randn(2, n)  # one row per feature, one column per observation
# Linear target in the two features, shaped as an (n, 1) column.
Y = (0.05 * X[0] + 0.1 * X[1] + 0.34)[:, np.newaxis]

for step in range(2):  # two gradient-descent steps on the full data set
    y_hat = model.predict(X.T)
    loss, dL_dY = mse_loss(Y, y_hat)  # loss value and its gradient wrt y_hat
    model.SGD(dL_dY, lr=0.1)

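Final check: compare the analytic gradients with a central finite difference for a small $\varepsilon$:

$$\frac{\partial L}{\partial w} \approx \frac{L(w+\varepsilon) - L(w-\varepsilon)}{2\varepsilon}$$

Mean rel_error ~ 3e-13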

In [64]:
# Seed NumPy's global RNG.
# NOTE(review): in the visible cells the model weights and X are drawn from
# np.random *before* this line, and the gradient check below picks indices with
# the stdlib `random` module - confirm this seed is placed where it was intended.
np.random.seed(24)

In [65]:
def check(layer, out = 0, eps = 1e-5, floor = 1.0):
    """Gradient-check one randomly chosen weight of ``layer``.

    The analytic gradient ``layer.dW[i, j]`` from backpropagation is compared
    with the central difference ``(L(w + eps) - L(w - eps)) / (2 * eps)``.
    Uses the module-level ``model``, ``X`` and ``Y``.

    Args:
        layer: A layer of ``model`` exposing ``W`` and, after a backward pass, ``dW``.
        out: If truthy, print the analytic value, the numeric value and the error.
        eps: Size of the weight perturbation.
        floor: Lower bound of the denominator ``max(floor, |analytic|, |numeric|)``.
            With the default of 1.0 the result is the *absolute* difference
            whenever both gradients are smaller than 1 in magnitude; pass a
            tiny value (or 0) to get a purely relative error.

    Returns:
        ``|analytic - numeric| / max(floor, |analytic|, |numeric|)``, or 0.0 if
        that denominator is zero.
    """
    i = random.randint(0, layer.W.shape[0] - 1)
    j = random.randint(0, layer.W.shape[1] - 1)

    # Forward + backward pass; lr=0 fills every dW without moving the weights.
    y_hat = model.predict(X.T)
    _, dL_dY = mse_loss(Y, y_hat)
    model.SGD(dL_dY, lr=0)

    grad = layer.dW[i, j]
    val = layer.W[i, j]

    try:
        # L(w + eps)
        layer.W[i, j] = val + eps
        loss_plus, _ = mse_loss(Y, model.predict(X.T))

        # L(w - eps)
        layer.W[i, j] = val - eps
        loss_minus, _ = mse_loss(Y, model.predict(X.T))
    finally:
        # Put the original weight back even if a forward pass raised.
        layer.W[i, j] = val

    num = (loss_plus - loss_minus) / (2 * eps)
    denom = max(floor, abs(grad), abs(num))
    # denom can only be 0 when floor <= 0 and both gradients are exactly 0.
    rel_error = abs(grad - num) / denom if denom else 0.0

    if out:
        print("Analytic:", grad)
        print("Numeric:", num)
        print("Rel. err:", rel_error)
    return rel_error


In [66]:
# Linear layers from the output back to the input, selected by type so the
# list does not depend on the exact number of layers in the model.
layers = [layer for layer in reversed(model.layers) if isinstance(layer, Linear)]

for i in layers:  # one verbose gradient check per Linear layer
    check(i, out=1)


Analytic: -0.29547590450288164
Numeric: -0.29547590450276773
Rel. err: 1.1390888232654106e-13
Analytic: -0.005963739059503266
Numeric: -0.005963739060227401
Rel. err: 7.241351615561165e-13
Analytic: 0.00013458151073060678
Numeric: 0.00013458150982526007
Rel. err: 9.053467068913246e-13


In [67]:
# Error of five randomly chosen weights per Linear layer, averaged over all of them.
res = [check(i) for i in layers for _ in range(5)]

print(np.mean(res))

3.320476633812223e-13
