In [54]:
import torch
import torch.nn.functional as F
import numpy as np

### Simple functions

$$ y = f(x) = \sum{(x^2 + 2 \cdot x)} $$

In [2]:
x = torch.arange(10, dtype=torch.float, requires_grad=True)

y = torch.sum(x ** 2 + 2 * x)

In [3]:
dy_dx_analytic = 2 * x + 2 

In [4]:
y.backward(retain_graph=True)  # calculates gradient w.r.t. graph nodes

In [5]:
dy_dx_numeric = x.grad.clone()

In [6]:
bool(torch.all(dy_dx_numeric == dy_dx_analytic))

True

$$ y = W_{hy} h $$
$$ p = softmax(y) $$
$$ loss = -log(p) $$

In [7]:
n = 10
m = 20

w = torch.randn(n, m, requires_grad=True)
h = torch.randint(3, (20, 1), dtype=torch.float)
y = torch.matmul(w, h)
p = F.softmax(y, dim=0)

label = torch.zeros_like(p)
label[5] = 1.

loss = -torch.sum(label * torch.log(p))

In [8]:
loss

tensor(10.4249, grad_fn=<NegBackward>)

In [9]:
loss.backward()

In [10]:
w_analytic_grad = torch.matmul((p - label) , h.view(1, -1))

In [11]:
bool(torch.all(w_analytic_grad == w.grad.data))

True

In [12]:
torch.equal(w_analytic_grad,  w.grad.data)

True

In [184]:
w_analytic_grad.detach().numpy()

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         2.67595407e-02,  0.00000000e+00,  5.35190813e-02,
         2.67595407e-02,  2.67595407e-02,  5.35190813e-02,
         2.67595407e-02,  5.35190813e-02,  2.67595407e-02,
         2.67595407e-02,  5.35190813e-02,  2.67595407e-02,
         2.67595407e-02,  5.35190813e-02,  5.35190813e-02,
         2.67595407e-02,  2.67595407e-02],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         3.22196365e-06,  0.00000000e+00,  6.44392730e-06,
         3.22196365e-06,  3.22196365e-06,  6.44392730e-06,
         3.22196365e-06,  6.44392730e-06,  3.22196365e-06,
         3.22196365e-06,  6.44392730e-06,  3.22196365e-06,
         3.22196365e-06,  6.44392730e-06,  6.44392730e-06,
         3.22196365e-06,  3.22196365e-06],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00,
         1.12921158e-02,  0.00000000e+00,  2.25842316e-02,
         1.12921158e-02,  1.12921158e-02,  2.25842316e-02,
         1.12921158e-02,  2.2

### Simple NN

**Forward pass:**

$$ 
h_1 = w_1 \cdot x \\
z_1 = \sigma(h_1)  \\ 
h_2 = w_2 \cdot z_1 \\
z_2 = softmax (h_2)
$$

**Loss - Cross Entropy:**

$$ J = -label \cdot \log(z_2) $$

**Backward pass:**

$$
\frac {\partial J} {\partial w_2} = 
\frac {\partial J} {\partial h_2} 
\frac {\partial h_2} {\partial w_2} = 
(z_2 - label) \cdot z_1^T
$$

$$
\frac {\partial J} {\partial w_1} = 
\frac {\partial J} {\partial h_1} 
\frac {\partial h_1} {\partial w_1} = 
\frac {\partial J} {\partial h_1}  \cdot x^T
$$


$$
\frac {\partial J} {\partial h_1} = 
\frac {\partial J} {\partial h_2} 
\frac {\partial h_2} {\partial z_1}
\frac {\partial z_1} {\partial h_1}
= 
\big (w_1^T \cdot (z_2 - label) \big) z_1(1 - z_1)
$$

In [254]:
class SimpleNN:
    
    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        self.w_1 = torch.randn(hidden_size, input_size, dtype=torch.float)
        self.w_2 = torch.randn(output_size, hidden_size, dtype=torch.float)
        
        self.cache = {}
    
    def forward(self, x: torch.tensor):
        h_1 = torch.mm(self.w_1, x)
        z_1 = torch.sigmoid(h_1)
        h_2 = torch.mm(self.w_2, z_1)
        z_2 = F.softmax(h_2, dim=0)
        
        self.cache['z_1'] = z_1
        self.cache['z_2'] = z_2
        return z_2
    
    def loss(self, x: torch.tensor, label: torch.tensor):
        pred = self.forward(x)
        return -torch.sum(label * torch.log(pred))
    
    def backward(self, x: torch.tensor, label: torch.tensor):
        self.forward(x)
        
        z_1, z_2 = self.cache['z_1'], self.cache['z_2']

        dh_2 = z_2 - label
        dw_2 = torch.mm(dh_2, z_1.t())
        dh_1 = torch.mm(self.w_2.t(), dh_2) * (z_1 * (1 - z_1))
        dw_1 = torch.mm(dh_1, x.t())
        return dw_1, dw_2 
    
    
    def numerical_gradients(self, x: torch.tensor, label: torch.tensor, epsilon: float):
        d_params = (torch.zeros_like(self.w_1, dtype=torch.float), torch.zeros_like(self.w_2, dtype=torch.float))
        params = (self.w_1, self.w_2)

        # calculating numerical gradients for each parameter
        for d_param, param in zip(d_params, params):

            # iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
            it = np.nditer(param.numpy(), flags=['multi_index'], op_flags=['readwrite'])
            while not it.finished:
                ix = it.multi_index

                # keeping the original value so we can reset it later
                original_value = param[ix]
                print(original_value.item())

                # estimating numeric gradients

                # x + epsilon
                param[ix] = original_value.item() + epsilon
                print(param[ix].item())
                
                loss_plus = self.loss(x, label)

                # x - epsilon
                param[ix] = original_value - epsilon
                loss_minus = self.loss(x, label)

                # numeric_gradient = (f(x + delta) - f(x - delta)) / (2 * delta)
                d_param[ix] = ((loss_plus - loss_minus) / (2 * epsilon)).item()

                # resetting parameter to original value
                param[ix] = original_value
                print(param[ix].item())
                print()

                it.iternext()

        return d_params

        
    def gradient_check(self, 
                       x: torch.tensor,
                       label: torch.tensor,
                       epsilon: float = 1e-1,
                       threshold: float = 1e-5):
        """
        Performs gradient checking for model parameters:
         - computes the analytic gradients using our back-propagation implementation
         - computes the numerical gradients using the two-sided epsilon method
         - computes the relative difference between numerical and analytical gradients
         - checks that the relative difference is less than threshold
         - if the last check is failed, then raises an error
        """
        
        def check_relative_difference(a: torch.tensor, b: torch.tensor, threshold: float) -> bool:
            """Returns True if (|a - b| / (|a| + |b|)) > threshold else False."""
            a, b = a.numpy(), b.numpy()
            return bool(np.all(np.abs(a - b) > threshold * (np.abs(a) + np.abs(b))))
            return bool(torch.all(torch.abs(a - b) > threshold * (torch.abs(a) + torch.abs(b))))
        
        params = ('w_1', 'w_2')

        # calculating the gradients using backpropagation, aka analytic gradients
        self.cache = {}
        analytic_gradients = self.backward(x, label)

        # calculating numerical gradients
        self.cache = {}
        numeric_gradients = self.numerical_gradients(x, label, epsilon)

        # gradient check for each parameter
        for p_name, d_analytic, d_numeric in zip(params, analytic_gradients, numeric_gradients):
            print(f"\nPerforming gradient check for parameter {p_name} "
                  f"with size = {np.prod(d_analytic.shape)}.")
            
            
            if (not d_analytic.shape == d_numeric.shape or
                    check_relative_difference(d_analytic, d_numeric, threshold)):
                print(d_analytic != d_numeric)
               
                raise ValueError(f'Gradient check for {p_name} is failed.')

            print(f"Gradient check for parameter {p_name} is passed.")
        

In [255]:
print('Testing implementation.')

nn = SimpleNN(10, 20, 3)

x = torch.arange(10, dtype=torch.float).view(10, 1)
label = torch.tensor([0, 0, 1.]).reshape(3, 1)

pred = nn.forward(x)
print(sum(pred))

assert pred.shape == label.shape
assert bool(abs(sum(pred) - 1.) < 1e-6)

loss = nn.loss(x, label)
assert torch.equal(loss, -torch.log(pred[2, 0]))

dw_1, dw_2 = nn.backward(x, label)
assert dw_1.shape == nn.w_1.shape
assert dw_2.shape == nn.w_2.shape


nn.gradient_check(x, label, epsilon=1e-3, threshold=1e-3)

Testing implementation.
tensor([1.0000])
-0.2275388389825821
-0.22653883695602417
-0.2275388389825821

-0.5861692428588867
-0.58516925573349
-0.5861692428588867

-0.7420188784599304
-0.7410188913345337
-0.7420188784599304

-0.3866904377937317
-0.38569045066833496
-0.3866904377937317

-0.34474894404411316
-0.34374895691871643
-0.34474894404411316

-0.6654117703437805
-0.6644117832183838
-0.6654117703437805

-0.5662697553634644
-0.5652697682380676
-0.5662697553634644

-0.49711188673973083
-0.4961118996143341
-0.49711188673973083

-1.9876224994659424
-1.9866224527359009
-1.9876224994659424

-0.31582775712013245
-0.3148277699947357
-0.31582775712013245

2.596698760986328
2.59769868850708
2.596698760986328

-0.3325255215167999
-0.3315255343914032
-0.3325255215167999

0.382114440202713
0.38311442732810974
0.382114440202713

-1.2076480388641357
-1.2066479921340942
-1.2076480388641357

-0.3860839903354645
-0.38508400321006775
-0.3860839903354645

-0.28397709131240845
-0.2829771041870117
-0.283

0.9828513264656067
0.9838513135910034
0.9828513264656067

0.9583219885826111
0.9593219757080078
0.9583219885826111

0.02476472593843937
0.025764726102352142
0.02476472593843937

-0.16062599420547485
-0.15962599217891693
-0.16062599420547485

-1.7757437229156494
-1.774743676185608
-1.7757437229156494

0.13141369819641113
0.13241370022296906
0.13141369819641113

-0.514609694480896
-0.5136097073554993
-0.514609694480896

-0.33514273166656494
-0.3341427445411682
-0.33514273166656494

-0.16588203608989716
-0.16488203406333923
-0.16588203608989716

0.17887331545352936
0.17987331748008728
0.17887331545352936

0.42456331849098206
0.4255633056163788
0.42456331849098206

0.045173294842243195
0.04617329314351082
0.045173294842243195

-0.7398527264595032
-0.7388527393341064
-0.7398527264595032

-0.04572797939181328
-0.044727981090545654
-0.04572797939181328

0.3701740801334381
0.37117406725883484
0.3701740801334381

-0.750196635723114
-0.7491966485977173
-0.750196635723114

0.8481701612472534
0.84

ValueError: Gradient check for w_2 is failed.

In [204]:
class SimpleNN:
    
    def __init__(self, input_size: int, hidden_size: int, output_size: int):
        self.w_1 = np.random.randn(hidden_size, input_size)
        self.w_2 = np.random.randn(output_size, hidden_size)
        
        self.cache = {}
    
    def forward(self, x: torch.tensor):
        h_1 = np.dot(self.w_1, x)
        z_1 = 1. / (1 + np.exp(-h_1))
        h_2 = np.dot(self.w_2, z_1)
        z_2 = np.exp(h_2) / np.exp(h_2).sum()
        
        self.cache['z_1'] = z_1
        self.cache['z_2'] = z_2
        return z_2
    
    def loss(self, x: torch.tensor, label: torch.tensor):
        pred = self.forward(x)
        return -np.sum(label * np.log(pred))
    
    def backward(self, x: torch.tensor, label: torch.tensor):
        self.forward(x)
        
        z_1, z_2 = self.cache['z_1'], self.cache['z_2']

        dh_2 = z_2 - label
        dw_2 = np.dot(dh_2, z_1.T)
        dh_1 = np.dot(self.w_2.T, dh_2) * (z_1 * (1 - z_1))
        dw_1 = np.dot(dh_1, x.T)
        return dw_1, dw_2 
    
    
    def numerical_gradients(self, x: torch.tensor, label: torch.tensor, epsilon: float):
        d_params = (np.zeros_like(self.w_1), np.zeros_like(self.w_2))
        params = (self.w_1, self.w_2)

        # calculating numerical gradients for each parameter
        for d_param, param in zip(d_params, params):

            # iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
            it = np.nditer(param, flags=['multi_index'], op_flags=['readwrite'])
            while not it.finished:
                ix = it.multi_index

                # keeping the original value so we can reset it later
                original_value = param[ix]

                # estimating numeric gradients

                # x + epsilon
                param[ix] = original_value + epsilon
                loss_plus = self.loss(x, label)

                # x - epsilon
                param[ix] = original_value - epsilon
                loss_minus = self.loss(x, label)

                # numeric_gradient = (f(x + delta) - f(x - delta)) / (2 * delta)
                d_param[ix] = (loss_plus - loss_minus) / (2 * epsilon)

                # resetting parameter to original value
                param[ix] = original_value

                it.iternext()

        return d_params

        
    def gradient_check(self, 
                       x: torch.tensor,
                       label: torch.tensor,
                       epsilon: float = 1e-3,
                       threshold: float = 1e-5):
        """
        Performs gradient checking for model parameters:
         - computes the analytic gradients using our back-propagation implementation
         - computes the numerical gradients using the two-sided epsilon method
         - computes the relative difference between numerical and analytical gradients
         - checks that the relative difference is less than threshold
         - if the last check is failed, then raises an error
        """
        
        def check_relative_difference(a: torch.tensor, b: torch.tensor, threshold: float) -> bool:
            """Returns True if (|a - b| / (|a| + |b|)) > threshold else False."""
            return bool(np.all(np.abs(a - b) > threshold * (np.abs(a) + np.abs(b))))
        
        params = ('w_1', 'w_2')

        # calculating the gradients using backpropagation, aka analytic gradients
        self.cache = {}
        analytic_gradients = self.backward(x, label)
#         analytic_gradients = (x.clone() for x in analytic_gradients)

        # calculating numerical gradients
        self.cache = {}
        numeric_gradients = self.numerical_gradients(x, label, epsilon)

        # gradient check for each parameter
        for p_name, d_analytic, d_numeric in zip(params, analytic_gradients, numeric_gradients):
            print(f"\nPerforming gradient check for parameter {p_name} "
                  f"with size = {np.prod(d_analytic.shape)}.")
            
            
            print(d_numeric.shape == d_analytic.shape)
            if (not d_analytic.shape == d_numeric.shape or
                    check_relative_difference(d_analytic, d_numeric, threshold)):
                print(d_analytic)
                print(d_numeric)
               
                raise ValueError(f'Gradient check for {p_name} is failed.')

            print(f"Gradient check for parameter {p_name} is passed.")
        

In [205]:
print('Testing implementation.')

nn = SimpleNN(10, 20, 3)

x = np.arange(10).reshape(10, 1) * 1.
label = np.array([0, 0, 1.]).reshape(3, 1)

pred = nn.forward(x)
print(sum(pred))

assert pred.shape == label.shape
assert bool(abs(sum(pred) - 1.) < 1e-6)

loss = nn.loss(x, label)
assert np.equal(loss, -np.log(pred[2, 0]))

dw_1, dw_2 = nn.backward(x, label)
assert dw_1.shape == nn.w_1.shape
assert dw_2.shape == nn.w_2.shape


nn.gradient_check(x, label, epsilon=1e-5, threshold=1e-10)

Testing implementation.
[1.]

Performing gradient check for parameter w_1 with size = 200.
True
Gradient check for parameter w_1 is passed.

Performing gradient check for parameter w_2 with size = 60.
True
Gradient check for parameter w_2 is passed.


In [139]:
x.n

tensor([[0., 1., 2., 3., 4., 5., 6., 7., 8., 9.]])

In [35]:
pred

tensor([[1.7634e-04],
        [2.8016e-02],
        [9.7181e-01]])

In [177]:
label

array([[0.],
       [0.],
       [1.]])

In [36]:
from sklearn.datasets import load_breast_cancer

In [37]:
data = load_breast_cancer()

In [40]:
list(data.keys())

['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename']

In [44]:
data['data'].shape

(569, 30)