In [18]:
import torch
import torch.nn.functional as F
import numpy as np

from dataset import get_mnist_data

### Simple functions

$$ y = f(x) = \sum{(x^2 + 2 \cdot x)} $$

In [19]:
x = torch.arange(10, dtype=torch.float, requires_grad=True)

y = torch.sum(x ** 2 + 2 * x)

In [20]:
dy_dx_analytic = 2 * x + 2 

In [21]:
y.backward(retain_graph=True)  # calculates gradient w.r.t. graph nodes

In [22]:
dy_dx_numeric = x.grad.clone()

In [23]:
bool(torch.all(dy_dx_numeric == dy_dx_analytic))

True

$$ y = W_{hy} h $$
$$ p = softmax(y) $$
$$ loss = -log(p) $$

In [24]:
n = 10
m = 20

w = torch.randn(n, m, requires_grad=True)
h = torch.randint(3, (20, 1), dtype=torch.float)
y = torch.matmul(w, h)
p = F.softmax(y, dim=0)

label = torch.zeros_like(p)
label[5] = 1.

loss = -torch.sum(label * torch.log(p))

In [25]:
loss

tensor(19.7621, grad_fn=<NegBackward>)

In [26]:
loss.backward()

In [27]:
w_analytic_grad = torch.matmul((p - label) , h.view(1, -1))

In [28]:
torch.allclose(w_analytic_grad,  w.grad.data)

True

### Simple NN

N, K, H, O - batch_size, input_size, hidden_size, output_size


* $x$: input, shape: (N, K)

* $w_1$: hidden layer weights, shape: (K, H)

* $w_2$: output layer weights, shape: (H, O)

* $z_2$: output, shape (N, O)



**Forward pass:**

$$ 
h_1 = x \cdot w_{1}\\
z_1 = \sigma(h_1)  \\ 
h_2 =  z_1 \cdot w_{2} \\
z_2 = softmax (h_2)
$$

**Loss - Cross Entropy:**

$$ J = -label \cdot \log(z_2) $$

**Backward pass:**

$$
\frac {\partial J} {\partial w_2} = 
\frac {\partial J} {\partial h_2} 
\frac {\partial h_2} {\partial w_2} = 
z_1^T \cdot  (z_2 - label)
$$

$$
\frac {\partial J} {\partial w_1} = 
\frac {\partial J} {\partial h_1} 
\frac {\partial h_1} {\partial w_1} = 
x^T \cdot \frac {\partial J} {\partial h_1} 
$$


$$
\frac {\partial J} {\partial h_1} = 
\frac {\partial J} {\partial h_2} 
\frac {\partial h_2} {\partial z_1}
\frac {\partial z_1} {\partial h_1}
= 
\big ((z_2 - label) \cdot w_1^T \big) z_1(1 - z_1)
$$

Here is the implementation of 2 layer simple neural network, which does the backpropogation mannually.

In [29]:
def check_relative_difference(a: torch.tensor, b: torch.tensor, threshold: float) -> bool:
    """Returns True if (|a - b| / (|a| + |b|)) > threshold else False."""
    numeratore = torch.abs(a - b)
    denominatore = torch.abs(a) + torch.abs(b)
    result = numeratore / denominatore
    result[torch.isnan(result)] = 0
    return bool(torch.any(result > threshold))

In [30]:
class NeuralNetwork:
    """
    Simple Neural Network with one hidden layer for classification.
    The backpropagation is implemented manually.

    It uses sigmoid as an activation function for hidden layer and log_softmax for output layer.
    Loss function is a cross entropy loss.
    """

    def __init__(self, input_size: int, hidden_size: int, output_size: int, dtype: torch.dtype):

        self.w_1 = torch.randn(input_size, hidden_size, dtype=dtype) * 0.01
        self.w_2 = torch.randn(hidden_size, output_size, dtype=dtype) * 0.01

        self.dtype = dtype
        self.cache = {}

    def forward(self, x: torch.Tensor):
        """
        Forward pass function.

        x shape: (batch_size, input_size)
        Returns log prediction.
        """

        h_1 = torch.matmul(x, self.w_1)
        z_1 = torch.sigmoid(h_1)

        h_2 = torch.matmul(z_1, self.w_2)
        z_2 = F.log_softmax(h_2, dim=1)

        self.cache['z_1'] = z_1
        self.cache['z_2'] = z_2
        return z_2

    def loss(self, x: torch.Tensor, label: torch.Tensor) -> torch.Tensor:
        """
        Cross entropy loss function.

        x shape: (batch_size, input_size)
        label shape: (batch_size, output_size)
        """
        log_prediction = self.forward(x)
        return -torch.sum(label * log_prediction)

    def backward(self, x: torch.Tensor, label: torch.Tensor):
        """
        Performs backpropagation, aka calculates loss gradient w.r.t. network weights.

        x shape: (batch_size, input_size)
        label shape: (batch_size, output_size)
        """
        self.forward(x)

        z_1, z_2 = self.cache['z_1'], self.cache['z_2']

        dh_2 = torch.exp(z_2) - label
        dw_2 = torch.matmul(z_1.t(), dh_2)
        dh_1 = torch.matmul(dh_2, self.w_2.t()) * (z_1 * (1 - z_1))
        dw_1 = torch.matmul(x.t(), dh_1)
        return dw_1, dw_2

    def sgd_step(self, x: torch.Tensor, label: torch.Tensor, lr: float):
        """Performs simple stochastic gradient descent step."""
        dw_1, dw_2 = self.backward(x, label)
        self.w_1 -= lr * dw_1
        self.w_2 -= lr * dw_2

    def numerical_gradients(self, x: torch.Tensor, label: torch.Tensor, epsilon: float):
        """Numerically calculates gradients."""
        d_params = (
            torch.zeros_like(self.w_1, dtype=self.dtype),
            torch.zeros_like(self.w_2, dtype=self.dtype)
        )
        params = (self.w_1, self.w_2)

        # calculating numerical gradients for each parameter
        for d_param, param in zip(d_params, params):

            # iterate over each element of the parameter matrix, e.g. (0,0), (0,1), ...
            it = np.nditer(param, flags=['multi_index'], op_flags=['readwrite'])
            while not it.finished:
                ix = it.multi_index

                # keeping the original value so we can reset it later
                original_value = param[ix].item()

                # estimating numeric gradients

                # x + epsilon
                param[ix] = original_value + epsilon
                loss_plus = self.loss(x, label)

                # x - epsilon
                param[ix] = original_value - epsilon
                loss_minus = self.loss(x, label)

                # numeric_gradient = (f(x + epsilon) - f(x - epsilon)) / (2 * epsilon)
                d_param[ix] = ((loss_plus - loss_minus) / (2 * epsilon)).item()

                # resetting parameter to original value
                param[ix] = original_value
                it.iternext()

        return d_params

    def gradient_check(self,
                       x: torch.Tensor,
                       label: torch.Tensor,
                       epsilon: float = 1e-1,
                       threshold: float = 1e-5):
        """
        Performs gradient checking for model parameters:
         - computes the analytic gradients using our back-propagation implementation
         - computes the numerical gradients using the two-sided epsilon method
         - computes the relative difference between numerical and analytical gradients
         - checks that the relative difference is less than threshold
         - if the last check is failed, then raises an error
        """
        params = ('w_1', 'w_2')

        # calculating the gradients using backpropagation, aka analytic gradients
        self.cache = {}
        analytic_gradients = self.backward(x, label)

        # calculating numerical gradients
        self.cache = {}
        numeric_gradients = self.numerical_gradients(x, label, epsilon)

        # gradient check for each parameter
        for p_name, d_analytic, d_numeric in zip(params, analytic_gradients, numeric_gradients):
            print(f"\nPerforming gradient check for parameter {p_name} "
                  f"with size = {np.prod(d_analytic.shape)}.")

            if (not d_analytic.shape == d_numeric.shape or
                    check_relative_difference(d_analytic, d_numeric, threshold)):
                raise ValueError(f'Gradient check for {p_name} is failed.')

            print(f"Gradient check for parameter {p_name} is passed.")

In [31]:
threshold = 1e-4

print('Testing implementation.')

batch_size, input_size, hidden_size, output_size = 64, 784, 20, 10
data, _ = get_mnist_data(batch_size=batch_size)
x, label = next(data)

model = NeuralNetwork(input_size, hidden_size, output_size, x.dtype)

log_pred = model.forward(x)
pred = torch.exp(log_pred)

assert pred.shape == label.shape == (batch_size, output_size)
assert abs(torch.sum(pred[0]).item() - 1.) < threshold

diff = abs(torch.sum(pred).item() - batch_size)
try:
    assert diff < threshold
except AssertionError:
    print(diff)

loss = model.loss(x, label)
_, indexes = np.where(label > 0.)
diff = abs(loss.item() + log_pred[torch.arange(batch_size), indexes].sum().item())
try:
    assert diff < threshold
except AssertionError:
    print(diff)

dw_1, dw_2 = model.backward(x, label)
assert dw_1.shape == model.w_1.shape == (input_size, hidden_size)
assert dw_2.shape == model.w_2.shape == (hidden_size, output_size)

print('\nShapes are correct.')
dtype = torch.float64

x = torch.arange(10, dtype=dtype).view(1, 10)
label = torch.tensor([0, 0, 1.], dtype=dtype).reshape(1, 3)
model = NeuralNetwork(10, 20, 3, dtype)
model.gradient_check(x, label, epsilon=1e-3, threshold=1e-4)


Testing implementation.

Shapes are correct.

Performing gradient check for parameter w_1 with size = 200.
Gradient check for parameter w_1 is passed.

Performing gradient check for parameter w_2 with size = 60.
Gradient check for parameter w_2 is passed.


Now lets apply the network to simple data set.

In [55]:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split


data = load_breast_cancer()
X = data['data']
label = data['target'] 

print(f'Data shape: {X.shape}')
print(f'Targets shape: {label.shape}')

Data shape: (569, 30)
Targets shape: (569,)


In [56]:
X = X.reshape(X.shape[0], X.shape[1])

one_hot_encoder = OneHotEncoder(categories='auto', sparse=False)
label = one_hot_encoder.fit_transform(label.reshape(-1, 1))
label = label.reshape(label.shape[0], label.shape[1])

In [57]:
X_train, X_test, label_train, label_test = train_test_split(X, label, test_size=0.25)

In [58]:
X_train.shape, label_train.shape, X_test.shape, label_test.shape

((426, 30), (426, 2), (143, 30), (143, 2))

In [59]:
X_train = torch.from_numpy(X_train)
X_test = torch.from_numpy(X_test)
label_train = torch.from_numpy(label_train)
label_test = torch.from_numpy(label_test)

In [69]:
nn = NeuralNetwork(30, 50, 2, torch.float64)

epochs = 300

train_loss, test_loss = [], []
for epoch in range(epochs):
    
    train_loss.append(nn.loss(X_train, label_train).item())
    test_loss.append(nn.loss(X_test, label_test).item())
    nn.sgd_step(X_train, label_train, 1e-5)
    
    print('epoch: {}, train loss: {:.5f}, test loss: {:.5f}'.format(epoch, train_loss[-1], test_loss[-1]))
    


epoch: 0, train loss: 299.11671, test loss: 100.44311
epoch: 1, train loss: 295.02428, test loss: 99.07286
epoch: 2, train loss: 292.12758, test loss: 98.13322
epoch: 3, train loss: 290.39829, test loss: 97.56727
epoch: 4, train loss: 289.05726, test loss: 97.11508
epoch: 5, train loss: 287.88123, test loss: 96.74006
epoch: 6, train loss: 286.80731, test loss: 96.36627
epoch: 7, train loss: 285.82486, test loss: 96.06173
epoch: 8, train loss: 284.92359, test loss: 95.72295
epoch: 9, train loss: 284.24779, test loss: 95.51300
epoch: 10, train loss: 283.44286, test loss: 95.24892
epoch: 11, train loss: 282.62906, test loss: 95.00211
epoch: 12, train loss: 282.47804, test loss: 94.90637
epoch: 13, train loss: 281.90954, test loss: 94.73262
epoch: 14, train loss: 281.23807, test loss: 94.54917
epoch: 15, train loss: 280.11915, test loss: 94.14836
epoch: 16, train loss: 280.07979, test loss: 94.18333
epoch: 17, train loss: 279.93053, test loss: 94.05089
epoch: 18, train loss: 280.10840, tes

epoch: 186, train loss: 210.92705, test loss: 73.09375
epoch: 187, train loss: 226.73112, test loss: 83.61175
epoch: 188, train loss: 264.67274, test loss: 89.40726
epoch: 189, train loss: 219.77296, test loss: 77.05337
epoch: 190, train loss: 207.58955, test loss: 73.10495
epoch: 191, train loss: 204.67041, test loss: 72.63364
epoch: 192, train loss: 205.66408, test loss: 72.40909
epoch: 193, train loss: 209.66482, test loss: 75.00938
epoch: 194, train loss: 208.86509, test loss: 72.57415
epoch: 195, train loss: 203.29318, test loss: 73.09055
epoch: 196, train loss: 205.20915, test loss: 71.65828
epoch: 197, train loss: 210.79451, test loss: 76.55531
epoch: 198, train loss: 212.33250, test loss: 73.87820
epoch: 199, train loss: 209.27862, test loss: 74.51210
epoch: 200, train loss: 198.10413, test loss: 69.61563
epoch: 201, train loss: 204.96777, test loss: 74.74300
epoch: 202, train loss: 210.36660, test loss: 73.27634
epoch: 203, train loss: 211.83925, test loss: 76.78741
epoch: 204