# Практическое задание

## Данные о студенте

1. **ФИО**: Денисов Даниил Михайлович
2. **Факультет**: Механико-математический
3. **Курс**: 2 (магистратура)
4. **Группа**: М2

## Замечания

* Заполненный ноутбук необходимо сдать боту
* Соблюдаем кодекс чести (по нулям и списавшему, и давшему списать)
* Можно (и нужно!) применять для реализации только библиотеку **Numpy**
* Ничего, крому Numpy, нельзя использовать для реализации 
* **Keras** используется только для тестирования Вашей реализации
* Если какой-то из классов не проходит приведенные тесты, то соответствующее задание не оценивается
* Возможно использование дополнительных (приватных) тестов
 

## Реализация собственного нейросетевого пакета для запуска и обучения нейронных сетей

Задание состоит из трёх частей:
1. Реализация прямого вывода нейронной сети (5 баллов)
2. Реализация градиентов по входу и распространения градиента по сети (5 баллов)
3. Реализация градиентов по параметрам и метода обратного распространения ошибки с обновлением парметров сети (10 баллов)

Дополнительные баллы можно получить при реализации обучения сети со свёрточными слоями (10 баллов), с транспонированной свёрткой (10 баллов), дополнительного оптимизатора (5 баллов). 

###  1. Реализация вывода собственной нейронной сети

1.1 Внимательно ознакомьтесь с интерфейсом слоя. Любой слой должен содержать как минимум три метода:
- конструктор
- прямой вывод 
- обратный вывод, производные по входу и по параметрам

In [1]:
class Layer(object):
    def __init__(self):
        self.name = 'Layer'
    
    def forward(self, input_data):
        pass

    def backward(self, input_data):
        return [self.grad_x(input_data), self.grad_param(input_data)]
    
    def grad_x(self, input_data):
        pass

    def grad_param(self, input_data):
        return []
    
    def update_param(self, grads, learning_rate):
        pass

1.2 Ниже предствален интерфейс класса  Network. Обратите внимание на реализацию метода predict, который последовательно обрабатывает входные данные слой за слоем.

In [2]:
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm

class Network(object):
    def __init__(self, layers, loss=None):
        self.name = 'Network'
        self.layers = layers
        self.loss = loss
    
    def forward(self, input_data):
        return self.predict(input_data)
    
    def grad_x(self, input_data, labels):
        # Intermediary gradients (forward pass)
        grads_inter = []
        current_input = input_data
        for layer in self.layers:
            grads_inter.append(layer.grad_x(current_input))
            current_input = layer.forward(current_input)

        # Target gradient (backward pass)
        grad = self.loss.grad_x(current_input, labels)
        for layer_grad in reversed(grads_inter):
            grad = np.einsum('bi,bij->bj', grad, layer_grad)

        return grad
    
    def grad_param(self, input_data, labels):
        # Intermediary gradients (forward pass)
        grads_inter = []
        current_input = input_data
        for layer in self.layers:
            grads_inter.append(layer.backward(current_input))
            current_input = layer.forward(current_input)
        
        # Target gradients (backward pass)
        grads_targ = []
        grad = self.loss.grad_x(current_input, labels)
        for layer_grad_x, layer_grad_param in reversed(grads_inter):
            grads_targ.append([np.einsum('bi,bij->bj', grad, grad_param) 
                               for grad_param in layer_grad_param])
            grad = np.einsum('bi,bij->bj', grad, layer_grad_x)

        return grads_targ[::-1]

    def update(self, grads, learning_rate):
        for layer, grad in zip(self.layers, grads):
            layer.update_param(grad, learning_rate)
    
    def predict(self, input_data):
        current_input = input_data
        for layer in self.layers:
            current_input = layer.forward(current_input)
        return current_input
    
    def calculate_loss(self, input_data, labels):
        return self.loss.forward(self.predict(input_data), labels)
    
    def train_step(self, input_data, labels, learning_rate=0.001):
        grads = self.grad_param(input_data, labels)
        self.update(grads, learning_rate)
    
    def fit(self, trainX, trainY, validation_split=0.25, 
            batch_size=1, nb_epoch=1, learning_rate=0.01):
        
        train_x, val_x, train_y, val_y = train_test_split(trainX, trainY, 
                                                          test_size=validation_split,
                                                          random_state=42)
        
        for epoch in range(nb_epoch):
            for i in tqdm(range(int(len(train_x)/batch_size))):
                batch_x = train_x[i*batch_size: (i+1)*batch_size]
                batch_y = train_y[i*batch_size: (i+1)*batch_size]
                self.train_step(batch_x, batch_y, learning_rate)
            
            print('%d epoch: val %.2f' %(epoch + 1, self.evaluate(val_x, val_y)))
            
    def evaluate(self, testX, testY):
        y_pred = np.argmax(self.predict(testX), axis=1)
        y_true = np.argmax(testY, axis=1)
        return np.sum(y_pred == y_true) / len(y_true)

#### 1.1 Необходимо реализовать метод forward для вычисления следующих слоёв:

- DenseLayer
- ReLU
- Softmax
- FlattenLayer

In [3]:
import numpy as np

In [4]:
class DenseLayer(Layer):
    def __init__(self, input_dim, output_dim, W_init=None, b_init=None):
        self.name = 'Dense'
        self.input_dim = input_dim
        self.output_dim = output_dim

        # Use LeCun initialization by default
        self.W = W_init if W_init is not None \
                 else np.sqrt(3 / input_dim) * (2 * np.random.random((input_dim, output_dim)) - 1)
        self.b = b_init if b_init is not None \
                 else np.zeros(output_dim, 'float32')
    
    def forward(self, input_data):
        return np.einsum('bi,ij->bj', input_data, self.W) + self.b

    def grad_x(self, input_data):
        # dy/dx = W^T
        return np.tile(self.W.T, reps=(len(input_data), 1, 1))

    def grad_W(self, input_data):
        # dy/dW = (x_1 * I ... x_n * I)
        W_rows, W_cols = self.W.shape
        grad = np.zeros((len(input_data), W_rows, W_cols, W_cols))
        diag = np.einsum('bijj->bij', grad)
        diag[:] = input_data[..., None]
        return grad.transpose(0, 2, 1, 3).reshape(len(input_data), W_cols, -1)

    def grad_b(self, input_data):
        # dy/db = I
        return np.tile(np.eye(len(self.b)), reps=(len(input_data), 1, 1))

    def grad_param(self, input_data):
        return [self.grad_W(input_data), self.grad_b(input_data)]
    
    def update_W(self, grad, learning_rate):
        self.W -= learning_rate * np.mean(grad, axis=0).reshape(self.W.shape)
    
    def update_b(self, grad, learning_rate):
        self.b -= learning_rate * np.mean(grad, axis=0)
        
    def update_param(self, params_grad, learning_rate):
        self.update_W(params_grad[0], learning_rate)
        self.update_b(params_grad[1], learning_rate)

In [5]:
class ReLU(Layer):
    def __init__(self):
        self.name = 'ReLU'
    
    def forward(self, input_data):
        return np.clip(input_data, 0, None)

    def grad_x(self, input_data):
        batch, *dims = input_data.shape
        size = np.prod(dims)
        grad = np.zeros((batch, size, size))
        diag = np.einsum('bii->bi', grad)
        diag[:] = np.ceil(np.clip(input_data, 0, 1)).reshape(batch, -1)
        return grad

In [6]:
class Softmax(Layer):
    def __init__(self):
        self.name = 'Softmax'
    
    def forward(self, input_data):
        exps = np.e ** input_data
        return exps / np.sum(exps, axis=1, keepdims=True)
    
    def grad_x(self, input_data):
        # dy/dx = diag(e^x/S) - (e^x/S)^T * (e^x/S)
        forward = self.forward(input_data)
        grad = -np.einsum('bi,bj->bij', forward, forward)
        diag = np.einsum('bii->bi', grad)
        diag[:] = forward + diag
        return grad

In [7]:
class FlattenLayer(Layer):
    def __init__(self):
        self.name = 'Flatten'
        
    def forward(self, input_data):
        return input_data.reshape(len(input_data), -1)
    
    def grad_x(self, input_data):
        batch, *dims = input_data.shape
        size = np.prod(dims)
        return np.tile(np.eye(size), reps=(batch, 1, 1))

#### 1.2 Реализуйте теперь свёрточный слой и транспонированную свёртку  (опционально)

In [8]:
b, c_in, c_out, h_in, w_in = 1, 2, 2, 4, 4
input_data = np.arange(b*c_in*h_in*w_in).reshape((b, c_in, h_in, w_in)).astype('float32')
input_data

array([[[[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [12., 13., 14., 15.]],

        [[16., 17., 18., 19.],
         [20., 21., 22., 23.],
         [24., 25., 26., 27.],
         [28., 29., 30., 31.]]]], dtype=float32)

In [9]:
pad_l, pad_r = 0, 0
input_data = np.pad(input_data, ((0, 0), (0, 0), (pad_l, pad_r), (pad_l, pad_r)))
input_data

array([[[[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [12., 13., 14., 15.]],

        [[16., 17., 18., 19.],
         [20., 21., 22., 23.],
         [24., 25., 26., 27.],
         [28., 29., 30., 31.]]]], dtype=float32)

In [10]:
pad = pad_l + pad_r
h_in += pad
w_in += pad

In [11]:
k, s = 3, 1
kernel = np.tile(np.eye(k), reps=(c_in, 1, 1)) * np.array([[[1]], [[2]]])
kernel = np.tile(kernel, reps=(c_out, 1, 1, 1))
kernel = kernel.transpose(2, 3, 1, 0)
kernel

array([[[[1., 1.],
         [2., 2.]],

        [[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]]],


       [[[0., 0.],
         [0., 0.]],

        [[1., 1.],
         [2., 2.]],

        [[0., 0.],
         [0., 0.]]],


       [[[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]],

        [[1., 1.],
         [2., 2.]]]])

In [12]:
h_out, w_out = (h_in - k) // s + 1, (w_in - k) // s + 1
h_out, w_out

(2, 2)

In [13]:
idh_in = np.arange(k) + np.expand_dims(s * np.arange(h_out), axis=1).repeat(k, axis=1)
idh_in = np.expand_dims(idh_in.repeat(w_out, axis=0), axis=2)
idh_in

array([[[0],
        [1],
        [2]],

       [[0],
        [1],
        [2]],

       [[1],
        [2],
        [3]],

       [[1],
        [2],
        [3]]])

In [14]:
idw_in = np.arange(k) + (s * np.arange(w_out))[:, None].repeat(k, axis=1)
idw_in = np.expand_dims(np.tile(idw_in, reps=(h_out, 1)), axis=1)
idw_in

array([[[0, 1, 2]],

       [[1, 2, 3]],

       [[0, 1, 2]],

       [[1, 2, 3]]])

In [15]:
patches = input_data[:, :, idh_in, idw_in]
patches

array([[[[[ 0.,  1.,  2.],
          [ 4.,  5.,  6.],
          [ 8.,  9., 10.]],

         [[ 1.,  2.,  3.],
          [ 5.,  6.,  7.],
          [ 9., 10., 11.]],

         [[ 4.,  5.,  6.],
          [ 8.,  9., 10.],
          [12., 13., 14.]],

         [[ 5.,  6.,  7.],
          [ 9., 10., 11.],
          [13., 14., 15.]]],


        [[[16., 17., 18.],
          [20., 21., 22.],
          [24., 25., 26.]],

         [[17., 18., 19.],
          [21., 22., 23.],
          [25., 26., 27.]],

         [[20., 21., 22.],
          [24., 25., 26.],
          [28., 29., 30.]],

         [[21., 22., 23.],
          [25., 26., 27.],
          [29., 30., 31.]]]]], dtype=float32)

In [16]:
bias = np.array([1])
conv = np.tensordot(patches, kernel, axes=([1, 3, 4], [2, 0, 1]))
conv = conv.transpose(0, 2, 1).reshape(b, c_out, h_out, w_out) + np.expand_dims(bias, axis=(0, 2, 3))
conv

array([[[[142., 151.],
         [178., 187.]],

        [[142., 151.],
         [178., 187.]]]])

In [17]:
h_in -= pad
w_in -= pad
grad = np.arange(b*c_out*h_out*w_out*c_in*h_in*w_in).reshape((b, c_out, h_out, w_out, c_in, h_in, w_in))
grad

array([[[[[[[  0,   1,   2,   3],
            [  4,   5,   6,   7],
            [  8,   9,  10,  11],
            [ 12,  13,  14,  15]],

           [[ 16,  17,  18,  19],
            [ 20,  21,  22,  23],
            [ 24,  25,  26,  27],
            [ 28,  29,  30,  31]]],


          [[[ 32,  33,  34,  35],
            [ 36,  37,  38,  39],
            [ 40,  41,  42,  43],
            [ 44,  45,  46,  47]],

           [[ 48,  49,  50,  51],
            [ 52,  53,  54,  55],
            [ 56,  57,  58,  59],
            [ 60,  61,  62,  63]]]],



         [[[[ 64,  65,  66,  67],
            [ 68,  69,  70,  71],
            [ 72,  73,  74,  75],
            [ 76,  77,  78,  79]],

           [[ 80,  81,  82,  83],
            [ 84,  85,  86,  87],
            [ 88,  89,  90,  91],
            [ 92,  93,  94,  95]]],


          [[[ 96,  97,  98,  99],
            [100, 101, 102, 103],
            [104, 105, 106, 107],
            [108, 109, 110, 111]],

           [[112, 113, 114

In [18]:
h_in += pad
w_in += pad
grad = np.pad(grad, ((0, 0), (0, 0), (0, 0), (0, 0), (0, 0), (pad_l, pad_r), (pad_l, pad_r)))
grad

array([[[[[[[  0,   1,   2,   3],
            [  4,   5,   6,   7],
            [  8,   9,  10,  11],
            [ 12,  13,  14,  15]],

           [[ 16,  17,  18,  19],
            [ 20,  21,  22,  23],
            [ 24,  25,  26,  27],
            [ 28,  29,  30,  31]]],


          [[[ 32,  33,  34,  35],
            [ 36,  37,  38,  39],
            [ 40,  41,  42,  43],
            [ 44,  45,  46,  47]],

           [[ 48,  49,  50,  51],
            [ 52,  53,  54,  55],
            [ 56,  57,  58,  59],
            [ 60,  61,  62,  63]]]],



         [[[[ 64,  65,  66,  67],
            [ 68,  69,  70,  71],
            [ 72,  73,  74,  75],
            [ 76,  77,  78,  79]],

           [[ 80,  81,  82,  83],
            [ 84,  85,  86,  87],
            [ 88,  89,  90,  91],
            [ 92,  93,  94,  95]]],


          [[[ 96,  97,  98,  99],
            [100, 101, 102, 103],
            [104, 105, 106, 107],
            [108, 109, 110, 111]],

           [[112, 113, 114

In [19]:
idh_out = np.expand_dims(np.arange(h_out), axis=(1, 2, 3, 4))
idw_out = np.expand_dims(np.arange(w_out), axis=(0, 2, 3, 4))
idc_in = np.expand_dims(np.arange(c_in), axis=(0, 1, 3, 4))

idh_in = np.arange(k) + np.expand_dims(s * np.arange(h_out), axis=1).repeat(k, axis=1)
idh_in = np.expand_dims(idh_in, axis=(1, 2, 4))

idw_in = np.arange(k) + np.expand_dims(s * np.arange(w_out), axis=1).repeat(k, axis=1)
idw_in = np.expand_dims(idw_in, axis=(0, 2, 3))

idh_out.shape, idw_out.shape, idc_in.shape, idh_in.shape, idw_in.shape

((2, 1, 1, 1, 1),
 (1, 2, 1, 1, 1),
 (1, 1, 2, 1, 1),
 (2, 1, 1, 3, 1),
 (1, 2, 1, 1, 3))

In [20]:
np.expand_dims(kernel.transpose(3, 2, 0, 1), axis=(0, 2, 3)).shape

(1, 2, 1, 1, 2, 3, 3)

In [21]:
grad[:, :, idh_out, idw_out, idc_in, idh_in, idw_in]

array([[[[[[[  0,   1,   2],
            [  4,   5,   6],
            [  8,   9,  10]],

           [[ 16,  17,  18],
            [ 20,  21,  22],
            [ 24,  25,  26]]],


          [[[ 33,  34,  35],
            [ 37,  38,  39],
            [ 41,  42,  43]],

           [[ 49,  50,  51],
            [ 53,  54,  55],
            [ 57,  58,  59]]]],



         [[[[ 68,  69,  70],
            [ 72,  73,  74],
            [ 76,  77,  78]],

           [[ 84,  85,  86],
            [ 88,  89,  90],
            [ 92,  93,  94]]],


          [[[101, 102, 103],
            [105, 106, 107],
            [109, 110, 111]],

           [[117, 118, 119],
            [121, 122, 123],
            [125, 126, 127]]]]],




        [[[[[128, 129, 130],
            [132, 133, 134],
            [136, 137, 138]],

           [[144, 145, 146],
            [148, 149, 150],
            [152, 153, 154]]],


          [[[161, 162, 163],
            [165, 166, 167],
            [169, 170, 171]],

     

In [22]:
grad_copy = np.zeros(grad.shape)
grad_copy[:, :, idh_out, idw_out, idc_in, idh_in, idw_in] = np.expand_dims(kernel.transpose(3, 2, 0, 1), axis=(0, 2, 3))
grad_copy[:, :, :, :, :, pad_l:h_in-pad_r, pad_l:w_in-pad_r]

array([[[[[[[1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 0., 1., 0.],
            [0., 0., 0., 0.]],

           [[2., 0., 0., 0.],
            [0., 2., 0., 0.],
            [0., 0., 2., 0.],
            [0., 0., 0., 0.]]],


          [[[0., 1., 0., 0.],
            [0., 0., 1., 0.],
            [0., 0., 0., 1.],
            [0., 0., 0., 0.]],

           [[0., 2., 0., 0.],
            [0., 0., 2., 0.],
            [0., 0., 0., 2.],
            [0., 0., 0., 0.]]]],



         [[[[0., 0., 0., 0.],
            [1., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 0., 1., 0.]],

           [[0., 0., 0., 0.],
            [2., 0., 0., 0.],
            [0., 2., 0., 0.],
            [0., 0., 2., 0.]]],


          [[[0., 0., 0., 0.],
            [0., 1., 0., 0.],
            [0., 0., 1., 0.],
            [0., 0., 0., 1.]],

           [[0., 0., 0., 0.],
            [0., 2., 0., 0.],
            [0., 0., 2., 0.],
            [0., 0., 0., 2.]]]]],




        [[

In [23]:
grad_copy.reshape(b, c_out*h_out*w_out, c_in*h_in*w_in)

array([[[1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         2., 0., 0., 0., 0., 2., 0., 0., 0., 0., 2., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 2., 0., 0., 0., 0., 2., 0., 0., 0., 0., 2., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 0., 2., 0., 0., 0., 0., 2., 0., 0., 0., 0., 2., 0.],
        [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
         0., 0., 0., 0., 0., 2., 0., 0., 0., 0., 2., 0., 0., 0., 0., 2.],
        [1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
         2., 0., 0., 0., 0., 2., 0., 0., 0., 0., 2., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
         0., 2., 0., 0., 0., 0., 2., 0., 0., 0., 0., 2., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.,
         0., 0., 0., 0., 2., 0., 0., 0., 0., 

In [24]:
patches_tr = patches.transpose(0, 2, 1, 3, 4)
patches_tr

array([[[[[ 0.,  1.,  2.],
          [ 4.,  5.,  6.],
          [ 8.,  9., 10.]],

         [[16., 17., 18.],
          [20., 21., 22.],
          [24., 25., 26.]]],


        [[[ 1.,  2.,  3.],
          [ 5.,  6.,  7.],
          [ 9., 10., 11.]],

         [[17., 18., 19.],
          [21., 22., 23.],
          [25., 26., 27.]]],


        [[[ 4.,  5.,  6.],
          [ 8.,  9., 10.],
          [12., 13., 14.]],

         [[20., 21., 22.],
          [24., 25., 26.],
          [28., 29., 30.]]],


        [[[ 5.,  6.,  7.],
          [ 9., 10., 11.],
          [13., 14., 15.]],

         [[21., 22., 23.],
          [25., 26., 27.],
          [29., 30., 31.]]]]], dtype=float32)

In [25]:
ind = np.eye(c_out)
ind = np.expand_dims(ind, axis=(0, 2, 4, 5, 6))
ind = np.tile(ind, reps=(b, 1, h_out*w_out, 1, c_in, k, k))
ind

array([[[[[[[1., 1., 1.],
            [1., 1., 1.],
            [1., 1., 1.]],

           [[1., 1., 1.],
            [1., 1., 1.],
            [1., 1., 1.]]],


          [[[0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.]],

           [[0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.]]]],



         [[[[1., 1., 1.],
            [1., 1., 1.],
            [1., 1., 1.]],

           [[1., 1., 1.],
            [1., 1., 1.],
            [1., 1., 1.]]],


          [[[0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.]],

           [[0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.]]]],



         [[[[1., 1., 1.],
            [1., 1., 1.],
            [1., 1., 1.]],

           [[1., 1., 1.],
            [1., 1., 1.],
            [1., 1., 1.]]],


          [[[0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.]],

           [[0., 0., 0.],
            [0., 0., 0.],
            [0., 0., 0.]]]],



         [[[[1., 1., 1

In [26]:
patches_tr = np.expand_dims(patches_tr, axis=(1, 3))
ind.shape, patches_tr.shape

((1, 2, 4, 2, 2, 3, 3), (1, 1, 4, 1, 2, 3, 3))

In [27]:
patches_tr = patches_tr * ind
patches_tr

array([[[[[[[ 0.,  1.,  2.],
            [ 4.,  5.,  6.],
            [ 8.,  9., 10.]],

           [[16., 17., 18.],
            [20., 21., 22.],
            [24., 25., 26.]]],


          [[[ 0.,  0.,  0.],
            [ 0.,  0.,  0.],
            [ 0.,  0.,  0.]],

           [[ 0.,  0.,  0.],
            [ 0.,  0.,  0.],
            [ 0.,  0.,  0.]]]],



         [[[[ 1.,  2.,  3.],
            [ 5.,  6.,  7.],
            [ 9., 10., 11.]],

           [[17., 18., 19.],
            [21., 22., 23.],
            [25., 26., 27.]]],


          [[[ 0.,  0.,  0.],
            [ 0.,  0.,  0.],
            [ 0.,  0.,  0.]],

           [[ 0.,  0.,  0.],
            [ 0.,  0.,  0.],
            [ 0.,  0.,  0.]]]],



         [[[[ 4.,  5.,  6.],
            [ 8.,  9., 10.],
            [12., 13., 14.]],

           [[20., 21., 22.],
            [24., 25., 26.],
            [28., 29., 30.]]],


          [[[ 0.,  0.,  0.],
            [ 0.,  0.,  0.],
            [ 0.,  0.,  0.]],

       

In [28]:
patches_tr = patches_tr.transpose(0, 1, 2, 5, 6, 4, 3)
patches_tr

array([[[[[[[ 0.,  0.],
            [16.,  0.]],

           [[ 1.,  0.],
            [17.,  0.]],

           [[ 2.,  0.],
            [18.,  0.]]],


          [[[ 4.,  0.],
            [20.,  0.]],

           [[ 5.,  0.],
            [21.,  0.]],

           [[ 6.,  0.],
            [22.,  0.]]],


          [[[ 8.,  0.],
            [24.,  0.]],

           [[ 9.,  0.],
            [25.,  0.]],

           [[10.,  0.],
            [26.,  0.]]]],



         [[[[ 1.,  0.],
            [17.,  0.]],

           [[ 2.,  0.],
            [18.,  0.]],

           [[ 3.,  0.],
            [19.,  0.]]],


          [[[ 5.,  0.],
            [21.,  0.]],

           [[ 6.,  0.],
            [22.,  0.]],

           [[ 7.,  0.],
            [23.,  0.]]],


          [[[ 9.,  0.],
            [25.,  0.]],

           [[10.,  0.],
            [26.,  0.]],

           [[11.,  0.],
            [27.,  0.]]]],



         [[[[ 4.,  0.],
            [20.,  0.]],

           [[ 5.,  0.],
          

In [29]:
patches_rs = patches_tr.reshape(b, c_out*h_out*w_out, k*k*c_in*c_out)
patches_rs

array([[[ 0.,  0., 16.,  0.,  1.,  0., 17.,  0.,  2.,  0., 18.,  0.,
          4.,  0., 20.,  0.,  5.,  0., 21.,  0.,  6.,  0., 22.,  0.,
          8.,  0., 24.,  0.,  9.,  0., 25.,  0., 10.,  0., 26.,  0.],
        [ 1.,  0., 17.,  0.,  2.,  0., 18.,  0.,  3.,  0., 19.,  0.,
          5.,  0., 21.,  0.,  6.,  0., 22.,  0.,  7.,  0., 23.,  0.,
          9.,  0., 25.,  0., 10.,  0., 26.,  0., 11.,  0., 27.,  0.],
        [ 4.,  0., 20.,  0.,  5.,  0., 21.,  0.,  6.,  0., 22.,  0.,
          8.,  0., 24.,  0.,  9.,  0., 25.,  0., 10.,  0., 26.,  0.,
         12.,  0., 28.,  0., 13.,  0., 29.,  0., 14.,  0., 30.,  0.],
        [ 5.,  0., 21.,  0.,  6.,  0., 22.,  0.,  7.,  0., 23.,  0.,
          9.,  0., 25.,  0., 10.,  0., 26.,  0., 11.,  0., 27.,  0.,
         13.,  0., 29.,  0., 14.,  0., 30.,  0., 15.,  0., 31.,  0.],
        [ 0.,  0.,  0., 16.,  0.,  1.,  0., 17.,  0.,  2.,  0., 18.,
          0.,  4.,  0., 20.,  0.,  5.,  0., 21.,  0.,  6.,  0., 22.,
          0.,  8.,  0., 24.,  

In [30]:
np.tile(patches_rs, reps=(1, 2, 1, 1, 1, 1, 1))

array([[[[[[[ 0.,  0., 16.,  0.,  1.,  0., 17.,  0.,  2.,  0., 18.,
              0.,  4.,  0., 20.,  0.,  5.,  0., 21.,  0.,  6.,  0.,
             22.,  0.,  8.,  0., 24.,  0.,  9.,  0., 25.,  0., 10.,
              0., 26.,  0.],
            [ 1.,  0., 17.,  0.,  2.,  0., 18.,  0.,  3.,  0., 19.,
              0.,  5.,  0., 21.,  0.,  6.,  0., 22.,  0.,  7.,  0.,
             23.,  0.,  9.,  0., 25.,  0., 10.,  0., 26.,  0., 11.,
              0., 27.,  0.],
            [ 4.,  0., 20.,  0.,  5.,  0., 21.,  0.,  6.,  0., 22.,
              0.,  8.,  0., 24.,  0.,  9.,  0., 25.,  0., 10.,  0.,
             26.,  0., 12.,  0., 28.,  0., 13.,  0., 29.,  0., 14.,
              0., 30.,  0.],
            [ 5.,  0., 21.,  0.,  6.,  0., 22.,  0.,  7.,  0., 23.,
              0.,  9.,  0., 25.,  0., 10.,  0., 26.,  0., 11.,  0.,
             27.,  0., 13.,  0., 29.,  0., 14.,  0., 30.,  0., 15.,
              0., 31.,  0.],
            [ 0.,  0.,  0., 16.,  0.,  1.,  0., 17.,  0.,  2.,  0.,


In [31]:
patches.transpose(0, 2, 3, 4, 1)

array([[[[[ 0., 16.],
          [ 1., 17.],
          [ 2., 18.]],

         [[ 4., 20.],
          [ 5., 21.],
          [ 6., 22.]],

         [[ 8., 24.],
          [ 9., 25.],
          [10., 26.]]],


        [[[ 1., 17.],
          [ 2., 18.],
          [ 3., 19.]],

         [[ 5., 21.],
          [ 6., 22.],
          [ 7., 23.]],

         [[ 9., 25.],
          [10., 26.],
          [11., 27.]]],


        [[[ 4., 20.],
          [ 5., 21.],
          [ 6., 22.]],

         [[ 8., 24.],
          [ 9., 25.],
          [10., 26.]],

         [[12., 28.],
          [13., 29.],
          [14., 30.]]],


        [[[ 5., 21.],
          [ 6., 22.],
          [ 7., 23.]],

         [[ 9., 25.],
          [10., 26.],
          [11., 27.]],

         [[13., 29.],
          [14., 30.],
          [15., 31.]]]]], dtype=float32)

In [32]:
patches_new = patches.transpose(0, 2, 3, 4, 1)
patches_new

array([[[[[ 0., 16.],
          [ 1., 17.],
          [ 2., 18.]],

         [[ 4., 20.],
          [ 5., 21.],
          [ 6., 22.]],

         [[ 8., 24.],
          [ 9., 25.],
          [10., 26.]]],


        [[[ 1., 17.],
          [ 2., 18.],
          [ 3., 19.]],

         [[ 5., 21.],
          [ 6., 22.],
          [ 7., 23.]],

         [[ 9., 25.],
          [10., 26.],
          [11., 27.]]],


        [[[ 4., 20.],
          [ 5., 21.],
          [ 6., 22.]],

         [[ 8., 24.],
          [ 9., 25.],
          [10., 26.]],

         [[12., 28.],
          [13., 29.],
          [14., 30.]]],


        [[[ 5., 21.],
          [ 6., 22.],
          [ 7., 23.]],

         [[ 9., 25.],
          [10., 26.],
          [11., 27.]],

         [[13., 29.],
          [14., 30.],
          [15., 31.]]]]], dtype=float32)

In [33]:
grad = np.tile(np.eye(c_out), reps=(b, h_out*w_out, k, k, c_in, 1, 1))
diag = np.einsum('bijklmm->bijklm', grad)
diag[:] = patches_new[..., None]
grad.transpose(0, 5, 1, 2, 3, 4, 6).reshape(b, c_out*h_out*w_out, k*k*c_in*c_out)

array([[[ 0.,  0., 16.,  0.,  1.,  0., 17.,  0.,  2.,  0., 18.,  0.,
          4.,  0., 20.,  0.,  5.,  0., 21.,  0.,  6.,  0., 22.,  0.,
          8.,  0., 24.,  0.,  9.,  0., 25.,  0., 10.,  0., 26.,  0.],
        [ 1.,  0., 17.,  0.,  2.,  0., 18.,  0.,  3.,  0., 19.,  0.,
          5.,  0., 21.,  0.,  6.,  0., 22.,  0.,  7.,  0., 23.,  0.,
          9.,  0., 25.,  0., 10.,  0., 26.,  0., 11.,  0., 27.,  0.],
        [ 4.,  0., 20.,  0.,  5.,  0., 21.,  0.,  6.,  0., 22.,  0.,
          8.,  0., 24.,  0.,  9.,  0., 25.,  0., 10.,  0., 26.,  0.,
         12.,  0., 28.,  0., 13.,  0., 29.,  0., 14.,  0., 30.,  0.],
        [ 5.,  0., 21.,  0.,  6.,  0., 22.,  0.,  7.,  0., 23.,  0.,
          9.,  0., 25.,  0., 10.,  0., 26.,  0., 11.,  0., 27.,  0.,
         13.,  0., 29.,  0., 14.,  0., 30.,  0., 15.,  0., 31.,  0.],
        [ 0.,  0.,  0., 16.,  0.,  1.,  0., 17.,  0.,  2.,  0., 18.,
          0.,  4.,  0., 20.,  0.,  5.,  0., 21.,  0.,  6.,  0., 22.,
          0.,  8.,  0., 24.,  

In [34]:
class Conv2DLayer(Layer):
    def __init__(self, kernel_size=3, input_channels=2, output_channels=3, 
                 padding='same', stride=1, K_init=None, b_init=None):
        # padding: 'same' или 'valid'
        # Работаем с квадратными ядрами, поэтому kernel_size - одно число
        # Работаем с единообразным сдвигом, поэтому stride - одно число
        # Фильтр размерности [kernel_size, kernel_size, input_channels, output_channels]
        self.name = 'Conv2D'
        self.kernel_size = kernel_size
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.kernel = K_init if K_init is not None \
                      else np.random.random((kernel_size, kernel_size, 
                                             input_channels, output_channels))
        self.bias = b_init if b_init is not None \
                    else np.zeros(output_channels)
        self.padding = padding        
        self.stride = stride
    
    def forward(self, input_data):
        # На входе - четырехмерный тензор вида [batch, input_channels, height, width]
        # Вначале нужно проверить на согласование размерностей входных данных и ядра!
        # Нужно заполнить Numpy-тензор out

        # Consistency check
        self.check_input(input_data)

        # Apply padding if needed
        if self.padding == 'same':
            input_data = self.pad(input_data, self.get_pads())

        b, c_in, h_in, w_in = input_data.shape
        c_out, h_out, w_out = self.output_channels, *self.get_output_shape((h_in, w_in))
        
        # Get patches
        patches = self.get_patches(input_data, (h_out, w_out))

        # Apply convolution w/ bias
        conv = np.tensordot(patches, self.kernel, axes=([2, 3, 4], [0, 1, 2])) + self.bias

        return conv.transpose(0, 2, 1).reshape(b, c_out, h_out, w_out)

    def grad_x(self, input_data):
        # Consistency check
        self.check_input(input_data)

        # Apply padding if needed
        if self.padding == 'same':
            pad_l, pad_r = self.get_pads()  # used later
            input_data = self.pad(input_data, (pad_l, pad_r))

        k, s = self.kernel_size, self.stride
        b, c_in, h_in, w_in = input_data.shape
        c_out, h_out, w_out = self.output_channels, *self.get_output_shape((h_in, w_in))
        
        # Input height indexer
        indh_in = np.arange(k) + np.expand_dims(s * np.arange(h_out), axis=1).repeat(k, axis=1)
        indh_in = np.expand_dims(indh_in, axis=(1, 2, 4))

        # Input width indexer
        indw_in = np.arange(k) + np.expand_dims(s * np.arange(w_out), axis=1).repeat(k, axis=1)
        indw_in = np.expand_dims(indw_in, axis=(0, 2, 3))

        # Input channels indexer
        indc_in = np.expand_dims(np.arange(c_in), axis=(0, 1, 3, 4))

        # Output height/width indexers
        indh_out = np.expand_dims(np.arange(h_out), axis=(1, 2, 3, 4))
        indw_out = np.expand_dims(np.arange(w_out), axis=(0, 2, 3, 4))

        # Prepare kernel
        kernel = np.expand_dims(self.kernel.transpose(3, 2, 0, 1), axis=(0, 2, 3))

        # Get the gradient in the form of feature maps
        grad = np.zeros((b, c_out, h_out, w_out, c_in, h_in, w_in))
        grad[..., indh_out, indw_out, indc_in, indh_in, indw_in] = kernel

        # Remove padding if needed
        if self.padding == 'same':
            pad = pad_l + pad_r
            h_in -= pad
            w_in -= pad
            grad = grad[..., pad_l:h_in+pad_l, pad_l:w_in+pad_l]

        # Reshape to a batch of matrices
        grad = grad.reshape(b, c_out * h_out * w_out, c_in * h_in * w_in)

        return grad
    
    def grad_kernel(self, input_data):
        self.check_input(input_data)

        # Apply padding if needed
        if self.padding == 'same':
            input_data = self.pad(input_data, self.get_pads())

        k = self.kernel_size
        b, c_in, h_in, w_in = input_data.shape
        c_out, h_out, w_out = self.output_channels, *self.get_output_shape((h_in, w_in))
        
        # Get patches
        patches = self.get_patches(input_data, (h_out, w_out))

        # Get the gradient
        grad = np.tile(np.eye(c_out), reps=(b, h_out*w_out, k, k, c_in, 1, 1))
        diag = np.einsum('bijklmm->bijklm', grad)
        diag[:] = patches[..., None]

        return grad.transpose(0, 5, 1, 2, 3, 4, 6).reshape(b, c_out*h_out*w_out, k*k*c_in*c_out)

    def grad_param(self, input_data):
        return [self.grad_kernel(input_data)]

    def update_kernel(self, grad, learning_rate):
        self.kernel -= learning_rate * np.mean(grad, axis=0).reshape(self.kernel.shape)
        
    def update_param(self, params_grad, learning_rate):
        self.update_kernel(params_grad[0], learning_rate)

    def check_input(self, input_data):
        b, c_in, h_in, w_in = input_data.shape
        if c_in != self.input_channels:
            raise ValueError(f"Input channels mismatch: \
                               got {c_in}, expected {self.input_channels}")
        if h_in < self.kernel_size or w_in < self.kernel_size:
            raise ValueError(f"Dimensions mismatch: \
                               got {h_in, w_in}, expected at least {self.kernel_size, self.kernel_size}")

    def get_output_shape(self, input_shape):
        return (np.array(input_shape) - self.kernel_size) // self.stride + 1

    def get_pads(self):
        pad = self.kernel_size - 1
        pad_l = pad // 2
        pad_r = pad - pad_l
        return pad_l, pad_r

    def pad(self, data, pads):
        return np.pad(data, ((0, 0), (0, 0), pads, pads))

    def get_patches(self, input_data, output_shape):
        k, s = self.kernel_size, self.stride
        h_out, w_out = output_shape

        # Height indexer
        indh = np.arange(k) + np.expand_dims(s * np.arange(h_out), axis=1).repeat(k, axis=1)
        indh = indh.repeat(w_out, axis=0)
        indh = np.expand_dims(indh, axis=2)

        # Width indexer
        indw = np.arange(k) + np.expand_dims(s * np.arange(w_out), axis=1).repeat(k, axis=1)
        indw = np.tile(indw, reps=(h_out, 1))
        indw = np.expand_dims(indw, axis=1)

        # Extract patches; keep batch and channels
        patches = input_data[..., indh, indw]

        return patches.transpose(0, 2, 3, 4, 1)

In [35]:
b, c_in, c_out, h_in, w_in = 1, 2, 2, 4, 4
input_data = np.arange(b*c_in*h_in*w_in).reshape((b, c_in, h_in, w_in)).astype('float32')
input_data

array([[[[ 0.,  1.,  2.,  3.],
         [ 4.,  5.,  6.,  7.],
         [ 8.,  9., 10., 11.],
         [12., 13., 14., 15.]],

        [[16., 17., 18., 19.],
         [20., 21., 22., 23.],
         [24., 25., 26., 27.],
         [28., 29., 30., 31.]]]], dtype=float32)

In [36]:
k, s = 3, 1
kernel = np.tile(np.eye(k), reps=(c_in, 1, 1)) * np.array([[[1]], [[2]]])
kernel = np.tile(kernel, reps=(c_out, 1, 1, 1))
kernel = kernel.transpose(2, 3, 1, 0)
kernel

array([[[[1., 1.],
         [2., 2.]],

        [[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]]],


       [[[0., 0.],
         [0., 0.]],

        [[1., 1.],
         [2., 2.]],

        [[0., 0.],
         [0., 0.]]],


       [[[0., 0.],
         [0., 0.]],

        [[0., 0.],
         [0., 0.]],

        [[1., 1.],
         [2., 2.]]]])

In [37]:
layer = Conv2DLayer(k, c_in, c_out, 'valid', s, kernel)
layer.forward(input_data)

array([[[[141., 150.],
         [177., 186.]],

        [[141., 150.],
         [177., 186.]]]])

In [38]:
layer = Conv2DLayer(k, c_in, c_out, 'valid', s, kernel, b_init=np.array([1, 2]))
layer.forward(input_data)

array([[[[142., 151.],
         [178., 187.]],

        [[143., 152.],
         [179., 188.]]]])

In [39]:
class Conv2DTrLayer(Layer):
    def __init__(self, kernel_size=3, input_channels=2, output_channels=3, 
                 padding=0, stride=1, K_init=None, b_init=None):      
        # padding: число (сколько отрезать от модифицированной входной карты)
        # Работаем с квадратными ядрами, поэтому kernel_size - одно число
        # stride - одно число (коэффициент расширения)
        # Фильтр размерности [kernel_size, kernel_size, input_channels, output_channels]
        self.name = 'Conv2DTr'
        self.kernel_size = kernel_size
        self.input_channels = input_channels
        self.output_channels = output_channels
        self.kernel = K_init
        self.bias = b_init
        self.padding = padding
        self.stride = stride
    def forward(self, input_data):
        # На входе - четырехмерный тензор вида [batch, input_channels, height, width]
        # Вначале нужно проверить на согласование размерностей входных данных и ядра!
        # Нужно заполнить Numpy-тензор out 
        out = np.empty([])
        return out

    def forward(self, input_data):
        pass
    def grad_x(self):
        pass
    def grad_kernel(self):
        pass

#### 1.4 Теперь настало время теста. 
#### Если вы всё сделали правильно, то запустив следующие ячейки у вас должна появиться надпись: Test PASSED

Переходить к дальнейшим заданиям не имеем никакого смысла, пока вы не добьётесь прохождение теста
    

#### Чтение данных

In [40]:
import numpy as np
from keras.utils import np_utils
from keras.datasets import mnist

np.random.seed(123)
 
(X_train, y_train), (X_test, y_test) = mnist.load_data()

X_train = X_train.reshape(X_train.shape[0], 1, 28, 28).astype('float32') / 255
X_test = X_test.reshape(X_test.shape[0], 1, 28, 28).astype('float32') / 255

Y_train = np_utils.to_categorical(y_train, 10)
Y_test = np_utils.to_categorical(y_test, 10)

print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


(60000, 1, 28, 28) (60000, 10) (10000, 1, 28, 28) (10000, 10)


#### Подготовка моделей

In [41]:
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Flatten, Input
from keras.layers import Convolution2D, Conv2D, MaxPooling2D
from keras.optimizers import Adam, SGD

print(keras.__version__)

def get_keras_model():
    input_image = Input(shape=(1, 28, 28))
    flatten = Flatten()(input_image)
    dense = Dense(10, activation='softmax')(flatten)
    model = Model(inputs=input_image, outputs=dense)

    sgd = SGD(lr=0.01, momentum=0.9, nesterov=True)
    model.compile(loss='categorical_crossentropy',
                  optimizer=sgd,
                  metrics=['accuracy'])

    history = model.fit(X_train, Y_train, validation_split=0.25, 
                        batch_size=32, nb_epoch=2, verbose=1)
    return model

2.2.4


In [42]:
def get_our_model(keras_model):
    W_keras, b_keras = keras_model.get_weights()
    flatten = FlattenLayer()
    dense = DenseLayer(784, 10, W_init=W_keras, b_init=b_keras)
    softmax = Softmax()
    return Network([flatten, dense, softmax])

In [43]:
keras_model = get_keras_model()
our_model = get_our_model(keras_model)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 45000 samples, validate on 15000 samples
Epoch 1/2




Epoch 2/2


In [44]:
keras_pred = keras_model.predict(X_test)
our_pred = our_model.predict(X_test)
if np.sum(np.abs(keras_pred - our_pred)) < 0.01:
    print('Test PASSED')
else:
    print('Something went wrong!')

Test PASSED


### 2. Вычисление производных по входу для слоёв нейронной сети

В данном задании запрещено использовать численные формулы для вычисления производных.

#### 2.1  Реализуйте метод forward для класса CrossEntropy
Напоминание: $$ crossentropy = L(p, y) =  - \sum\limits_i y_i log p_i, $$
где вектор $(p_1, ..., p_k) $ -  выход классификационного алгоритма, а $(y_1,..., y_k)$ - правильные метки класса в унарной кодировке (one-hot encoding)

In [45]:
class CrossEntropy(object):
    def __init__(self):
        self.name = 'CrossEntropy'
    
    def forward(self, input_data, labels):
        return -np.sum(labels * np.log(input_data), axis=1)
    
    def grad_x(self, input_data, labels):
        return -labels / input_data

#### 2.2  Реализуйте метод grad_x класса CrossEntropy, который возвращает $\frac{\partial L}{\partial p}$

Проверить работоспособность кода поможет следующий тест:

In [46]:
def numerical_diff_loss(loss, x, labels):
    eps = 0.00001
    right_answer = []
    for i in range(len(x[0])):
        delta = np.zeros(len(x[0]))
        delta[i] = eps
        diff = (loss.forward(x + delta, labels) - loss.forward(x - delta, labels)) / (2 * eps)
        right_answer.append(diff)
    return np.array(right_answer).T

def test_loss(loss):
    x = np.array([[0.3, 0.2, 0.5], [0.3, 0.2, 0.5]])
    labels = np.array([[1, 2, 3], [2, 3, 4]])
    
    num_grad = numerical_diff_loss(loss, x, labels)
    grad = loss.grad_x(x, labels)
    if np.sum(np.abs(num_grad - grad)) < 0.01:
        print('Test PASSED')
    else:
        print('Something went wrong!')
        print('Numerical grad is')
        print(num_grad)
        print('Your grad is')
        print(grad)

In [47]:
loss = CrossEntropy()
test_loss(loss)

Test PASSED


#### 2.3  Реализуйте метод grad_x класса Softmax, который возвращает $\frac{\partial Softmax}{\partial x}$

Проверить работоспособность кода поможет следующий тест:

In [48]:
def numerical_diff_layer(layer, x):
    eps = 0.00001
    right_answer = []
    for i in range(len(x[0])):
        delta = np.zeros(len(x[0]))
        delta[i] = eps
        diff = (layer.forward(x + delta) - layer.forward(x - delta)) / (2 * eps)
        right_answer.append(diff.T)
    return np.array(right_answer).T

def test_layer(layer):
    x = np.array([[1, 2, 3], [2, -3, 4]])
    
    num_grad = numerical_diff_layer(layer, x)
    grad = layer.grad_x(x)
    if np.sum(np.abs(num_grad - grad)) < 0.01:
        print('Test PASSED')
    else:
        print('Something went wrong!')
        print('Numerical grad is')
        print(num_grad)
        print('Your grad is')
        print(grad)

In [49]:
layer = Softmax()
test_layer(layer)

Test PASSED


#### 2.4  Реализуйте метод grad_x для классов ReLU и DenseLayer

In [50]:
layer = ReLU()
test_layer(layer)

Test PASSED


In [51]:
layer = DenseLayer(3, 4)
test_layer(layer)

Test PASSED


In [52]:
import itertools

def numerical_diff_conv(layer, x):
    eps = 0.00001
    right_answer = []
    for i, j, k in itertools.product(*map(range, x[0].shape)):
        delta = np.zeros(x[0].shape)
        delta[i, j, k] = eps
        diff = (layer.forward(x + delta) - layer.forward(x - delta)) / (2 * eps)
        right_answer.append(diff.reshape(len(x), -1).T)
    return np.array(right_answer).T

def test_conv(layer, b, c, h, w):
    x = np.arange(b*c*h*w).reshape(b, c, h, w)
    
    num_grad = numerical_diff_conv(layer, x)
    grad = layer.grad_x(x)
    if np.sum(np.abs(num_grad - grad)) < 0.01:
        print('Test PASSED')
    else:
        print('Something went wrong!')
        print('Numerical grad is')
        print(num_grad)
        print('Your grad is')
        print(grad)

In [53]:
np.random.seed(42)

b, c_in, h_in, w_in, c_out = 5, 5, 5, 5, 5
k, s = 3, 2
conv = Conv2DLayer(k, c_in, c_out, 'same', s)
test_conv(conv, b, c_in, h_in, w_in)

Test PASSED


#### 2.5 (4 балла) Для класса Network реализуйте метод grad_x, который должен реализовывать взятие производной от лосса по входу

In [54]:
def numerical_diff_net(net, x, labels):
    eps = 0.00001
    right_answer = []
    for i in range(len(x[0])):
        delta = np.zeros(len(x[0]))
        delta[i] = eps
        diff = (net.calculate_loss(x + delta, labels) - net.calculate_loss(x - delta, labels)) / (2 * eps)
        right_answer.append(diff)
    return np.array(right_answer).T

def test_net(net):
    x = np.array([[0.3, 0.2, 0.5], [0.3, 0.2, 0.5]])
    labels = np.array([[1, 2, 3], [2, 3, 4]])
    
    num_grad = numerical_diff_net(net, x, labels)
    grad = net.grad_x(x, labels)
    if np.sum(np.abs(num_grad - grad)) < 0.01:
        print('Test PASSED')
    else:
        print('Something went wrong!')
        print('Numerical grad is')
        print(num_grad)
        print('Your grad is')
        print(grad)

In [55]:
net = Network([DenseLayer(3, 10), ReLU(), DenseLayer(10, 3), Softmax()], loss=CrossEntropy())
test_net(net)

Test PASSED


### 3. Реализация градиентов по параметрам и метода обратного распространения ошибки с обновлением парметров сети

#### 3.1  Реализуйте функции grad_b и grad_W. При подготовке теста grad_W предполагается, что W является отномерным вектором.

In [56]:
def numerical_grad_b(input_size, output_size, W, b, x):
    eps = 0.00001
    right_answer = []
    for i in range(len(b)):
        delta = np.zeros(b.shape)
        delta[i] = eps
        dense1 = DenseLayer(input_size, output_size, W_init=W, b_init=b+delta)
        dense2 = DenseLayer(input_size, output_size, W_init=W, b_init=b-delta)
        diff = (dense1.forward(x) - dense2.forward(x)) / (2 * eps)
        right_answer.append(diff.T)
    return np.array(right_answer).T

def test_grad_b():
    input_size, output_size = 3, 4 
    W_init = np.random.random((input_size, output_size))
    b_init = np.random.random((output_size,))
    x = np.random.random((2, input_size))

    num_grad = numerical_grad_b(input_size, output_size, W_init, b_init, x)
    dense = DenseLayer(input_size, output_size, W_init, b_init)
    grad = dense.grad_b(x)
    if np.sum(np.abs(num_grad - grad)) < 0.01:
        print('Test PASSED')
    else:
        print('Something went wrong!')
        print('Numerical grad is')
        print(num_grad)
        print('Your grad is')
        print(grad)

In [57]:
test_grad_b()

Test PASSED


In [58]:
def numerical_grad_W(input_size, output_size, W, b, x):
    eps = 0.00001
    right_answer = []
    for i in range(W.shape[0]):
        for j in range(W.shape[1]):
            delta = np.zeros(W.shape)
            delta[i, j] = eps
            dense1 = DenseLayer(input_size, output_size, W_init=W+delta, b_init=b)
            dense2 = DenseLayer(input_size, output_size, W_init=W-delta, b_init=b)
            diff = (dense1.forward(x) - dense2.forward(x)) / (2 * eps)
            right_answer.append(diff.T)
    return np.array(right_answer).T

def test_grad_W():
    input_size, output_size = 3, 4
    W_init = np.random.random((input_size, output_size))
    b_init = np.random.random((4,))
    x = np.random.random((2, input_size))
    
    num_grad = numerical_grad_W(input_size, output_size, W_init, b_init, x)
    dense = DenseLayer(input_size, output_size, W_init, b_init)
    grad = dense.grad_W(x)
    if np.sum(np.abs(num_grad - grad)) < 0.01:
        print('Test PASSED')
    else:
        print('Something went wrong!')
        print('Numerical grad is')
        print(num_grad)
        print('Your grad is')
        print(grad)

In [59]:
test_grad_W()

Test PASSED


In [60]:
import itertools

np.random.seed(42)

def numerical_grad_kernel(kernel_size, input_channels, output_channels, 
                          padding, stride, K_init, b_init, x):
    eps = 0.00001
    right_answer = []
    for i, j, k, l in itertools.product(*map(range, K_init.shape)):
        delta = np.zeros(K_init.shape)
        delta[i, j, k, l] = eps
        conv1 = Conv2DLayer(kernel_size, input_channels, output_channels, 
                            padding, stride, K_init+delta, b_init)
        conv2 = Conv2DLayer(kernel_size, input_channels, output_channels, 
                            padding, stride, K_init-delta, b_init)
        diff = (conv1.forward(x) - conv2.forward(x)) / (2 * eps)
        right_answer.append(diff.reshape(len(x), -1).T)
            
    return np.array(right_answer).T

def test_grad_kernel():
    b, c_in, h_in, w_in, c_out = 5, 5, 5, 5, 5
    k, s = 3, 2
    
    K_init = np.random.random((k, k, c_in, c_out))
    b_init = np.random.random(c_out)

    x = np.arange(b*c_in*h_in*w_in).reshape(b, c_in, h_in, w_in)
    
    num_grad = numerical_grad_kernel(k, c_in, c_out, 'same', s, K_init, b_init, x)
    conv = Conv2DLayer(k, c_in, c_out, 'same', s, K_init, b_init)
    grad = conv.grad_kernel(x)
    if np.sum(np.abs(num_grad - grad)) < 0.01:
        print('Test PASSED')
    else:
        print('Something went wrong!')
        print('Numerical grad is')
        print(num_grad)
        print('Your grad is')
        print(grad)

In [61]:
test_grad_kernel()

Test PASSED


#### 3.2 Полностью реализуйте метод обратного распространения ошибки в функции train_step класса Network


Рекомендуем реализовать сначала функцию Network.grad_param(), которая возвращает список длиной в количество слоёв и элементом которого является список градиентов по параметрам.
После чего, имея список градиентов, написать функцию обновления параметров для каждого слоя. 

Совет: рекомендуем написать тест для кода подсчета градиента по параметрам, чтобы быть уверенным в том, что градиент через всю сеть считается правильно
    

In [62]:
def numerical_grad_param(net, x, labels):
    eps = 0.00001
    right_answer = []
    W = net.layers[0].W
    for i in range(W.shape[0]):
        for j in range(W.shape[1]):
            delta = np.zeros(W.shape)
            delta[i, j] = eps
            layer1 = DenseLayer(W.shape[0], W.shape[1], W_init=W+delta)
            layer2 = DenseLayer(W.shape[0], W.shape[1], W_init=W-delta)
            net1 = Network([layer1] + net.layers[1:], loss=net.loss)
            net2 = Network([layer2] + net.layers[1:], loss=net.loss)
            diff = (net1.calculate_loss(x, labels) - net2.calculate_loss(x, labels)) / (2 * eps)
            right_answer.append(diff)
    return np.array(right_answer).T.reshape(x.shape[0], W.shape[0], W.shape[1])

def test_grad_param():
    net = Network([DenseLayer(768, 20), ReLU(), DenseLayer(20, 3), Softmax()], loss=CrossEntropy())
    x = np.random.random((2, 768))
    labels = np.array([[1, 2, 3], [2, 3, 4]])

    num_grad = numerical_grad_param(net, x, labels)
    grad = net.grad_param(x, labels)[0][0].reshape((x.shape[0],) + net.layers[0].W.shape)
    if np.sum(np.abs(num_grad - grad)) < 0.01:
        print('Test PASSED')
    else:
        print('Something went wrong!')
        print('Numerical grad is')
        print(num_grad)
        print('Your grad is')
        print(grad)

In [63]:
test_grad_param()

Test PASSED


#### 3.3 Ознакомьтесь с реализацией функции fit класса Network. Запустите обучение модели. Если всё работает правильно, то точность на валидации должна будет возрастать

In [62]:
net = Network([DenseLayer(784, 10), Softmax()], loss=CrossEntropy())
trainX = X_train.reshape(len(X_train), -1)
net.fit(trainX[::3], Y_train[::3], validation_split=0.25, 
        batch_size=16, nb_epoch=5, learning_rate=0.01)

100%|██████████| 937/937 [00:08<00:00, 113.92it/s]


1 epoch: val 0.85


100%|██████████| 937/937 [00:08<00:00, 114.45it/s]


2 epoch: val 0.87


100%|██████████| 937/937 [00:08<00:00, 114.83it/s]


3 epoch: val 0.88


100%|██████████| 937/937 [00:08<00:00, 114.95it/s]


4 epoch: val 0.89


100%|██████████| 937/937 [00:08<00:00, 114.09it/s]


5 epoch: val 0.89


In [63]:
net = Network([DenseLayer(784, 20), ReLU(), DenseLayer(20, 10), Softmax()], loss=CrossEntropy())
trainX = X_train.reshape(len(X_train), -1)
net.fit(trainX[::6], Y_train[::6], validation_split=0.25, 
        batch_size=16, nb_epoch=5, learning_rate=0.001)

100%|██████████| 468/468 [00:15<00:00, 29.40it/s]


1 epoch: val 0.27


100%|██████████| 468/468 [00:17<00:00, 27.06it/s]


2 epoch: val 0.41


100%|██████████| 468/468 [00:16<00:00, 29.01it/s]


3 epoch: val 0.49


100%|██████████| 468/468 [00:15<00:00, 29.68it/s]


4 epoch: val 0.57


100%|██████████| 468/468 [00:15<00:00, 29.51it/s]


5 epoch: val 0.65


In [None]:
convs = [Conv2DLayer(input_channels=1, output_channels=1)]
denses = [DenseLayer(784, 10), ReLU()]
net = Network(convs + [FlattenLayer()] + denses + [Softmax()], loss=CrossEntropy())
net.fit(X_train[::6], Y_train[::6], validation_split=0.25, 
        batch_size=16, nb_epoch=5, learning_rate=0.001)

#### 3.5 Продемонстрируйте, что ваша реализация позволяет обучать более глубокие нейронные сети 

In [64]:
convs = [Conv2DLayer(kernel_size=3, input_channels=1, output_channels=1, padding='same', stride=1),
         Conv2DLayer(kernel_size=2, input_channels=1, output_channels=1, padding='valid', stride=2), 
         Conv2DLayer(kernel_size=3, input_channels=1, output_channels=1, padding='same', stride=1), 
         Conv2DLayer(kernel_size=2, input_channels=1, output_channels=1, padding='valid', stride=2)]
denses = [DenseLayer(49, 20), ReLU(), 
          DenseLayer(20, 10), ReLU()]
net = Network(convs + [FlattenLayer()] + denses + [Softmax()], loss=CrossEntropy())
net.fit(X_train[::10], Y_train[::10], validation_split=0.25, 
        batch_size=16, nb_epoch=5, learning_rate=0.001)

100%|██████████| 281/281 [00:30<00:00,  9.32it/s]


1 epoch: val 0.35


100%|██████████| 281/281 [00:30<00:00,  9.28it/s]


2 epoch: val 0.43


100%|██████████| 281/281 [00:30<00:00,  9.29it/s]


3 epoch: val 0.48


100%|██████████| 281/281 [00:30<00:00,  9.26it/s]


4 epoch: val 0.50


100%|██████████| 281/281 [00:30<00:00,  9.27it/s]


5 epoch: val 0.51


In [67]:
convs = [Conv2DLayer(kernel_size=3, input_channels=1, output_channels=1, padding='same', stride=1),
         Conv2DLayer(kernel_size=2, input_channels=1, output_channels=1, padding='valid', stride=2), 
         Conv2DLayer(kernel_size=3, input_channels=1, output_channels=1, padding='same', stride=1), 
         Conv2DLayer(kernel_size=2, input_channels=1, output_channels=1, padding='valid', stride=2)]
denses = [DenseLayer(49, 20), ReLU(),  
          DenseLayer(20, 10), ReLU()]
net = Network(convs + [FlattenLayer()] + denses + [Softmax()], loss=CrossEntropy())
net.fit(X_train[::6], Y_train[::6], validation_split=0.25, 
        batch_size=16, nb_epoch=5, learning_rate=0.001)

100%|██████████| 468/468 [00:53<00:00,  8.77it/s]


1 epoch: val 0.39


100%|██████████| 468/468 [00:52<00:00,  8.85it/s]


2 epoch: val 0.44


100%|██████████| 468/468 [00:52<00:00,  8.85it/s]


3 epoch: val 0.49


100%|██████████| 468/468 [00:53<00:00,  8.83it/s]


4 epoch: val 0.51


100%|██████████| 468/468 [00:53<00:00,  8.78it/s]


5 epoch: val 0.54
