In [48]:
import numpy as np
from scipy.special import erf

**Module** is an abstract class which defines the fundamental methods necessary for training a neural network. You do not need to change anything here, just read the comments.

In [49]:
class Module:
    """
    Base building block of the network: a black box that turns `input`
    data into `output` data.

    Applying the module is called the forward pass:

        output = module.forward(input)

    A module must also support the backward pass, i.e. differentiate its
    `forward` function. Modules are chained, so the backward pass receives
    the gradient produced by the previous step of the chain rule:

        gradInput = module.backward(input, gradOutput)
    """

    def __init__(self):
        # Result of the most recent forward pass.
        self.output = None
        # Gradient w.r.t. the input from the most recent backward pass.
        self.gradInput = None
        # True in training mode, False in evaluation mode.
        self.training = True

    def forward(self, input):
        """
        Compute and return the output of the module for `input`.

        Simply delegates to `updateOutput`.
        """
        result = self.updateOutput(input)
        return result

    def backward(self, input, gradOutput):
        """
        Run one backpropagation step through the module for the given `input`.

        Two things happen, in this order:
         - the gradient w.r.t. `input` is computed (needed to keep backpropagating),
         - the gradient w.r.t. the parameters is accumulated (needed by the optimizer).

        Returns the `gradInput` field.
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    def updateOutput(self, input):
        """
        Compute the output from the current parameters and `input`.

        Implementations must both store the result in the `output` field
        and return it. The simplest possible implementation is:

            self.output = input
            return self.output

        The base class does nothing and returns None.
        """
        return None

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient of the module with respect to its own input.

        The shape of `gradInput` is always the same as the shape of `input`.
        Implementations must both store the gradient in the `gradInput` field
        and return it. The simplest possible implementation is:

            self.gradInput = gradOutput
            return self.gradInput

        The base class does nothing and returns None.
        """
        return None

    def accGradParameters(self, input, gradOutput):
        """
        Compute the gradient of the module with respect to its own parameters.

        Modules without parameters (e.g. ReLU) do not need to override this.
        """
        return None

    def zeroGradParameters(self):
        """
        Reset the parameter gradients to zero, if the module has parameters.
        """
        return None

    def getParameters(self):
        """
        Return the list of parameters; empty when the module has none.
        """
        return []

    def getGradParameters(self):
        """
        Return the list of gradients w.r.t. the parameters; empty when the
        module has none.
        """
        return []

    def train(self):
        """
        Switch the module to training mode.

        Training and evaluation behaviour differ for Dropout and BatchNorm.
        """
        self.training = True

    def evaluate(self):
        """
        Switch the module to evaluation mode.

        Training and evaluation behaviour differ for Dropout and BatchNorm.
        """
        self.training = False

    def __repr__(self):
        """
        Pretty printing. Override in every module that should have a
        readable description.
        """
        return "Module"

# Sequential container

**Define** the forward and backward pass procedures.

In [92]:
class Sequential(Module):
    """
    Container that processes `input` data sequentially.

    `input` is passed through each module (layer) in `self.modules` in order;
    the result of the last module is the container's `output`.
    """

    def __init__(self):
        super(Sequential, self).__init__()
        self.modules = []
        # Cache filled by the forward pass: element i is the input that
        # module i received, and the last element is the final output.
        self.output_modules = []

    def add(self, module):
        """
        Append a module to the end of the container.
        """
        self.modules.append(module)

    def updateOutput(self, input):
        """
        Forward pass:

            y_0    = module[0].forward(input)
            y_1    = module[1].forward(y_0)
            ...
            output = module[n-1].forward(y_{n-2})

        Every intermediate value is cached in `self.output_modules` so that
        `backward` can hand each module the input it actually saw.
        An empty container acts as the identity and returns `input`.
        """
        output = input
        self.output_modules = [output]
        for module in self.modules:
            output = module.forward(output)
            self.output_modules.append(output)
        # Assigned after the loop so that a container without modules still
        # stores and returns its input instead of a stale/None output.
        self.output = output
        return self.output

    def backward(self, input, gradOutput):
        """
        Backward pass:

            g_{n-1} = module[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2} = module[n-2].backward(y_{n-3}, g_{n-1})
            ...
            g_1 = module[1].backward(y_0, g_2)
            gradInput = module[0].backward(input, g_1)

        Each module receives the input it saw during the last forward pass
        (taken from the cache), NOT the `input` of this container.

        Raises RuntimeError when the cache does not match the current list of
        modules, i.e. `forward` was not called or modules were added since.
        """
        if len(self.output_modules) != len(self.modules) + 1:
            raise RuntimeError(
                "Sequential.backward requires a preceding forward pass "
                "over the current list of modules")

        # Drop the final output: only the per-module inputs are needed.
        module_inputs = self.output_modules[:-1]

        gradInput = gradOutput
        for module, module_input in zip(reversed(self.modules),
                                        reversed(module_inputs)):
            gradInput = module.backward(module_input, gradInput)
        self.gradInput = gradInput
        return self.gradInput

    def zeroGradParameters(self):
        """
        Reset the parameter gradients of every contained module.
        """
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """
        Gather the parameters of all modules: one inner list per module.
        """
        return [module.getParameters() for module in self.modules]

    def getGradParameters(self):
        """
        Gather the parameter gradients of all modules: one inner list per module.
        """
        return [module.getGradParameters() for module in self.modules]

    def __repr__(self):
        # One line per contained module.
        return "".join(str(module) + '\n' for module in self.modules)

    def __getitem__(self, x):
        return self.modules.__getitem__(x)

    def train(self):
        """
        Switch the container and all contained modules to training mode.
        """
        self.training = True
        for module in self.modules:
            module.train()

    def evaluate(self):
        """
        Switch the container and all contained modules to evaluation mode.
        """
        self.training = False
        for module in self.modules:
            module.evaluate()

# Layers

## 1 (0.2). Linear transform layer
Also known as dense layer, fully-connected layer, FC-layer, InnerProductLayer (in caffe), affine transform
- input:   **`batch_size x n_feats1`**
- output: **`batch_size x n_feats2`**

In [80]:
class Linear(Module):
    """
    Affine transformation y = x W^T + b.

    Commonly called a fully-connected layer (InnerProductLayer in caffe).
    The module works with 2D input of shape (n_samples, n_feature).
    """
    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()

        # Uniform initialisation in [-1/sqrt(n_in), 1/sqrt(n_in)].
        bound = 1. / np.sqrt(n_in)
        self.W = np.random.uniform(-bound, bound, size=(n_out, n_in))  # weights
        self.b = np.random.uniform(-bound, bound, size=n_out)          # biases

        # Gradient accumulators, same shapes as the parameters.
        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, input):
        """
        Compute output = input @ W^T + b, store it and return it.
        """
        projected = np.matmul(input, self.W.T)
        self.output = projected + self.b
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute dL/dx = dL/dy @ W, store it and return it.
        """
        self.gradInput = np.matmul(gradOutput, self.W)
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Accumulate the parameter gradients:
        dL/dW = (dL/dy)^T @ x and dL/db = sum of dL/dy over axis 0.
        """
        self.gradW += np.matmul(gradOutput.T, input)
        self.gradb += np.sum(gradOutput, axis=0)

    def zeroGradParameters(self):
        """
        Reset both gradient accumulators to zero in place.
        """
        for accumulator in (self.gradW, self.gradb):
            accumulator.fill(0)

    def getParameters(self):
        """
        Return [W, b].
        """
        return [self.W, self.b]

    def getGradParameters(self):
        """
        Return [gradW, gradb].
        """
        return [self.gradW, self.gradb]

    def __repr__(self):
        # W has shape (n_out, n_in); print it as "in -> out".
        shape = self.W.shape
        return 'Linear %d -> %d' % (shape[1], shape[0])

## 2. (0.2) SoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{softmax}(x)_i = \frac{\exp x_i} {\sum_j \exp x_j}$

Recall that $\text{softmax}(x) == \text{softmax}(x - \text{const})$. It makes possible to avoid computing exp() from large argument.

In [52]:
class SoftMax(Module):
    """
    Row-wise softmax: softmax(x)_i = exp(x_i) / sum_j exp(x_j).

    Works on input that is reduced along axis 1 (one row per sample).
    """
    def __init__(self):
        super(SoftMax, self).__init__()

    def updateOutput(self, input):
        """
        Compute the softmax of every row, store it in `output` and return it.
        """
        # softmax(x) == softmax(x - const): subtracting the row maximum keeps
        # exp() away from large arguments.
        shifted = np.subtract(input, input.max(axis=1, keepdims=True))
        exp_shifted = np.exp(shifted)
        self.output = exp_shifted / np.sum(exp_shifted, axis=1, keepdims=True)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient w.r.t. the input from the cached `output`.

        For one row with probabilities s the Jacobian is diag(s) - s s^T, so
        J @ g = s * (g - <s, g>). Using this closed form for all rows at once
        avoids building an n x n Jacobian per sample.
        """
        probs = self.output
        # <s, g> for every row, kept as a column so it broadcasts over features.
        weighted_sum = np.sum(gradOutput * probs, axis=1, keepdims=True)
        self.gradInput = probs * (gradOutput - weighted_sum)
        return self.gradInput

    def __repr__(self):
        return "SoftMax"

## 3. (0.2) LogSoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{logsoftmax}(x)_i = \log\text{softmax}(x)_i = x_i - \log {\sum_j \exp x_j}$

The main goal of this layer is to be used in computation of log-likelihood loss.

In [53]:
class LogSoftMax(Module):
    """
    Row-wise log-softmax: logsoftmax(x)_i = x_i - log(sum_j exp(x_j)).
    """
    def __init__(self):
        super(LogSoftMax, self).__init__()

    def updateOutput(self, input):
        """
        Compute the log-softmax of every row, store it in `output` and return it.
        """
        # Subtract the row maximum first for numerical stability.
        shifted = input - np.max(input, axis=1, keepdims=True)

        # Log of the softmax denominator, one value per row.
        log_normalizer = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        self.output = shifted - log_normalizer
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient w.r.t. the input from the cached `output`:
        gradInput = gradOutput - softmax * (row sum of gradOutput).
        """
        # exp(log-softmax) recovers the softmax probabilities.
        probs = np.exp(self.output)
        grad_row_sum = np.sum(gradOutput, axis=1, keepdims=True)
        self.gradInput = gradOutput - probs * grad_row_sum
        return self.gradInput

    def __repr__(self):
        return "LogSoftMax"

## 4. (0.3) Batch normalization
One of the most significant recent ideas that impacted NNs a lot is [**Batch normalization**](http://arxiv.org/abs/1502.03167). The idea is simple, yet effective: the features should be whitened ($mean = 0$, $std = 1$) all the way through the NN. This improves convergence for deep models, letting them train in days rather than weeks. **You are** to implement the first part of the layer: feature normalization. The second part (`ChannelwiseScaling` layer) is implemented below.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

The layer should work as follows. While training (`self.training == True`) it transforms input as $$y = \frac{x - \mu}  {\sqrt{\sigma + \epsilon}}$$
where $\mu$ and $\sigma$ are the mean and variance of the feature values in the **batch** and $\epsilon$ is just a small number for numerical stability. Also during training, the layer should maintain exponential moving average values for the mean and variance:
```
    self.moving_mean = self.moving_mean * alpha + batch_mean * (1 - alpha)
    self.moving_variance = self.moving_variance * alpha + batch_variance * (1 - alpha)
```
During testing (`self.training == False`) the layer normalizes input using moving_mean and moving_variance.

Note that decomposition of batch normalization on normalization itself and channelwise scaling here is just a common **implementation** choice. In general "batch normalization" always assumes normalization + scaling.

In [54]:
class BatchNormalization(Module):
    """
    Feature normalization part of batch normalization (no scaling/shift).

    In training mode features are normalized with the statistics of the
    current batch and exponential moving averages of those statistics are
    maintained; in evaluation mode the moving averages are used instead.
    """
    EPS = 1e-3

    def __init__(self, alpha = 0.):
        super(BatchNormalization, self).__init__()
        # Weight of the previous moving value in the moving-average update.
        self.alpha = alpha
        # Moving averages; created from the first training batch.
        self.moving_mean = None
        self.moving_variance = None

    def updateOutput(self, input):
        """
        Normalize `input` over axis 0, store the result in `output` and return it.
        """
        if not self.training:
            # Evaluation mode: normalize with the moving statistics.
            self.output = (input - self.moving_mean) / np.sqrt(self.moving_variance + self.EPS)
            return self.output

        # Training mode: statistics of the current batch, kept 2D for broadcasting.
        batch_mean = input.mean(axis=0, keepdims=True)
        batch_var = input.var(axis=0, keepdims=True)
        self.output = (input - batch_mean) / np.sqrt(batch_var + self.EPS)

        if self.moving_mean is None:
            # First batch seen: start the moving averages from its statistics.
            self.moving_mean = batch_mean.copy()
            self.moving_variance = batch_var.copy()
        else:
            keep, fresh = self.alpha, 1 - self.alpha
            self.moving_mean = self.moving_mean * keep + batch_mean * fresh
            self.moving_variance = self.moving_variance * keep + batch_var * fresh

        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient w.r.t. `input`, store it in `gradInput` and return it.
        """
        if not self.training:
            # The moving statistics are constants here, so only the scale matters.
            self.gradInput = gradOutput / np.sqrt(self.moving_variance + self.EPS)
            return self.gradInput

        n_samples = input.shape[0]
        centered = input - input.mean(axis=0, keepdims=True)
        std = np.sqrt(input.var(axis=0, keepdims=True) + self.EPS)

        # Per-feature reductions of the incoming gradient.
        grad_sum = np.sum(gradOutput, axis=0, keepdims=True)
        grad_dot_centered = np.sum(gradOutput * centered, axis=0, keepdims=True)

        # The batch mean and variance both depend on every sample, which
        # yields the two correction terms subtracted below.
        numerator = (n_samples * gradOutput - grad_sum
                     - centered / (std ** 2) * grad_dot_centered)
        self.gradInput = numerator / (n_samples * std)
        return self.gradInput

    def __repr__(self):
        return "BatchNormalization"

In [55]:
class ChannelwiseScaling(Module):
    r"""
       Implements linear transform of input y = \gamma * x + \beta
       where \gamma, \beta - learnable vectors of length x.shape[-1]
    """
    def __init__(self, n_out):
        super(ChannelwiseScaling, self).__init__()

        stdv = 1./np.sqrt(n_out)
        self.gamma = np.random.uniform(-stdv, stdv, size=n_out)  # per-feature scale
        self.beta = np.random.uniform(-stdv, stdv, size=n_out)   # per-feature shift

        # Gradient accumulators, same shapes as the parameters.
        self.gradGamma = np.zeros_like(self.gamma)
        self.gradBeta = np.zeros_like(self.beta)

    def updateOutput(self, input):
        """
        Compute output = input * gamma + beta, store it and return it.
        """
        self.output = input * self.gamma + self.beta
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute gradInput = gradOutput * gamma, store it and return it.
        """
        self.gradInput = gradOutput * self.gamma
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Accumulate the parameter gradients (sums over axis 0).

        The update is done in place with `+=`, like the other parametrized
        layers: gradients add up until `zeroGradParameters` is called, and
        arrays previously returned by `getGradParameters` stay valid.
        """
        self.gradBeta += np.sum(gradOutput, axis=0)
        self.gradGamma += np.sum(gradOutput * input, axis=0)

    def zeroGradParameters(self):
        """
        Reset both gradient accumulators to zero in place.
        """
        self.gradGamma.fill(0)
        self.gradBeta.fill(0)

    def getParameters(self):
        """
        Return [gamma, beta].
        """
        return [self.gamma, self.beta]

    def getGradParameters(self):
        """
        Return [gradGamma, gradBeta].
        """
        return [self.gradGamma, self.gradBeta]

    def __repr__(self):
        return "ChannelwiseScaling"

Practical notes. If BatchNormalization is placed after a linear transformation layer (including dense layer, convolutions, channelwise scaling) that implements a function like `y = weight * x + bias`, then adding the bias becomes useless and can be omitted, since its effect is discarded by the batch mean subtraction. If BatchNormalization (followed by `ChannelwiseScaling`) is placed before a layer that propagates scale (including ReLU, LeakyReLU) followed by any linear transformation layer, then the parameter `gamma` in `ChannelwiseScaling` can be frozen, since it can be absorbed into the linear transformation layer.

## 5. (0.3) Dropout
Implement [**dropout**](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf). The idea and implementation are really simple: just multiply the input by a $Bernoulli(p)$ mask. Here $p$ is the probability of an element being zeroed.

This has proven to be an effective technique for regularization and preventing the co-adaptation of neurons.

While training (`self.training == True`) it should sample a mask on each iteration (for every batch), zero out elements and multiply elements by $1 / (1 - p)$. The latter is needed for keeping mean values of features close to mean values which will be in test mode. When testing this module should implement identity transform i.e. `self.output = input`.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [89]:
class Dropout(Module):
    """
    Inverted dropout.

    In training mode every element is zeroed independently with probability
    `p` and the survivors are multiplied by 1 / (1 - p), so the expected
    value of each feature matches evaluation mode. In evaluation mode the
    layer is the identity.

    NumPy arrays are handled with NumPy only (the mask is drawn from
    `np.random`, so `np.random.seed` makes it reproducible). Any other input
    is treated as a torch tensor; torch is imported lazily for that path only.
    """
    def __init__(self, p=0.5):
        super(Dropout, self).__init__()

        self.p = p  # probability of zeroing an element
        # Scaled keep-mask sampled by the last training-mode forward pass.
        self.mask = None

    def _keep_scale(self):
        """
        Return the factor applied to surviving elements, 1 / (1 - p).

        Returns 0 when nothing survives (p >= 1) instead of dividing by zero.
        """
        keep_prob = 1.0 - self.p
        return 1.0 / keep_prob if keep_prob > 0 else 0.0

    def updateOutput(self, input):
        """
        Apply dropout to `input`, store the result in `output` and return it.

        A fresh mask is sampled on every training-mode call.
        """
        if not self.training:
            # Evaluation mode: identity transform.
            self.output = input
            return self.output

        if isinstance(input, np.ndarray):
            # random_sample() is uniform on [0, 1), so `>= p` keeps an element
            # with probability exactly 1 - p (always for p == 0).
            keep = np.random.random_sample(input.shape) >= self.p
            # Float mask that does not widen float32 inputs.
            mask_dtype = np.result_type(input.dtype, np.float32)
            self.mask = (keep * self._keep_scale()).astype(mask_dtype)
        else:
            import torch  # local import: only the tensor path depends on torch
            keep = torch.rand_like(input) >= self.p
            self.mask = keep.float() * self._keep_scale()

        self.output = input * self.mask
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Propagate `gradOutput` through the mask of the last forward pass.

        The stored mask is converted to the array type of `gradOutput` when
        the two differ.
        """
        if not self.training:
            # Evaluation mode: identity transform, gradient passes unchanged.
            self.gradInput = gradOutput
            return self.gradInput

        mask = self.mask
        if isinstance(gradOutput, np.ndarray):
            if not isinstance(mask, np.ndarray):
                mask = mask.numpy()
            self.gradInput = gradOutput * mask
        else:
            if isinstance(mask, np.ndarray):
                import torch  # local import: only the tensor path depends on torch
                mask = torch.from_numpy(mask)
            self.gradInput = gradOutput * mask.to(gradOutput.device)

        return self.gradInput

    def __repr__(self):
        return "Dropout"

# 6. (2.0) Conv2d
Implement [**Conv2d**](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html). Use only this list of parameters: (in_channels, out_channels, kernel_size, stride, padding, bias, padding_mode) and fix dilation=1 and groups=1.

In [93]:
class Conv2d(Module):
    """
    2D convolution (cross-correlation) with dilation = 1 and groups = 1.

    Input is (batch, in_channels, H, W), output is
    (batch, out_channels, H_out, W_out).

    Parameters:
        in_channels, out_channels: number of input / output channels.
        kernel_size: int or (k_h, k_w).
        stride: int or (s_h, s_w).
        padding: int, (p_h, p_w), 'valid' (no padding) or 'same'
            (k - 1 rows/columns in total per axis, the extra one at the end;
            the output has the input size when stride is 1).
        bias: whether a learnable per-channel bias is added.
        padding_mode: 'zeros', 'reflect', 'replicate' or 'circular'.

    All three passes iterate over the k_h * k_w kernel taps; for one tap the
    contributing input pixels of all output positions form a strided slice of
    the padded input, so the batch/channel/spatial work is vectorised.
    """

    # padding_mode name -> numpy.pad mode name.
    _NUMPY_PAD_MODES = {
        'zeros': 'constant',
        'reflect': 'reflect',
        'replicate': 'edge',
        'circular': 'wrap',
    }

    def __init__(self, in_channels, out_channels, kernel_size,
                 stride=1, padding=0, bias=True, padding_mode='zeros'):
        super(Conv2d, self).__init__()

        self.in_channels = in_channels
        self.out_channels = out_channels
        self.kernel_size = self._as_pair(kernel_size)
        self.stride = self._as_pair(stride)
        self.padding = padding
        self.padding_mode = padding_mode

        # Uniform initialisation in [-1/sqrt(fan_in), 1/sqrt(fan_in)].
        fan_in = in_channels * self.kernel_size[0] * self.kernel_size[1]
        stdv = 1 / np.sqrt(fan_in)
        self.weight = np.random.uniform(
            -stdv, stdv,
            size=(out_channels, in_channels, self.kernel_size[0], self.kernel_size[1]))
        self.gradWeight = np.zeros_like(self.weight)

        if bias:
            self.bias = np.random.uniform(-stdv, stdv, size=out_channels)
            self.gradBias = np.zeros_like(self.bias)
        else:
            self.bias = None
            self.gradBias = None

    @staticmethod
    def _as_pair(value):
        """Return `value` as a 2-tuple; an int is used for both axes."""
        if isinstance(value, (int, np.integer)):
            return (int(value), int(value))
        first, second = value
        return (first, second)

    def _numpy_pad_mode(self):
        """Translate `padding_mode` to a numpy.pad mode; ValueError if unknown."""
        try:
            return self._NUMPY_PAD_MODES[self.padding_mode]
        except KeyError:
            raise ValueError(f"Unsupported padding mode: {self.padding_mode}") from None

    def _padding_amounts(self):
        """Return ((top, bottom), (left, right)) padding in pixels."""
        padding = self.padding
        if isinstance(padding, str):
            if padding == 'valid':
                return (0, 0), (0, 0)
            if padding == 'same':
                amounts = []
                for k in self.kernel_size:
                    total = k - 1
                    before = total // 2
                    # For even kernels the extra pixel goes after the data.
                    amounts.append((before, total - before))
                return amounts[0], amounts[1]
            raise ValueError(f"Unsupported padding: {padding!r}")
        pad_h, pad_w = self._as_pair(padding)
        return (pad_h, pad_h), (pad_w, pad_w)

    def _calculate_output_size(self, H_in, W_in):
        """Return (H_out, W_out) for an input of spatial size (H_in, W_in)."""
        (top, bottom), (left, right) = self._padding_amounts()
        H_out = (H_in + top + bottom - self.kernel_size[0]) // self.stride[0] + 1
        W_out = (W_in + left + right - self.kernel_size[1]) // self.stride[1] + 1
        return (H_out, W_out)

    def _pad_input(self, input):
        """Pad the two spatial axes of `input` according to the configuration."""
        pad_h, pad_w = self._padding_amounts()
        if not any(pad_h + pad_w):
            return input
        return np.pad(input, ((0, 0), (0, 0), pad_h, pad_w),
                      mode=self._numpy_pad_mode())

    def _taps(self, H_out, W_out):
        """
        Yield (kh, kw, rows, cols) for every kernel tap.

        `rows`/`cols` are slices into the padded input selecting, for each
        output position, the pixel that tap (kh, kw) multiplies.
        """
        s_h, s_w = self.stride
        for kh in range(self.kernel_size[0]):
            rows = slice(kh, kh + s_h * (H_out - 1) + 1, s_h)
            for kw in range(self.kernel_size[1]):
                cols = slice(kw, kw + s_w * (W_out - 1) + 1, s_w)
                yield kh, kw, rows, cols

    def _unpad_gradient(self, padded_grad, H_in, W_in):
        """
        Map the gradient w.r.t. the padded input back to the unpadded input.

        Zero padding contributes nothing, so the border is simply cropped.
        For the other modes every padded pixel is a copy of some input pixel
        and its gradient has to be added to that pixel.
        """
        (top, bottom), (left, right) = self._padding_amounts()
        if not (top or bottom or left or right):
            return padded_grad
        if self.padding_mode == 'zeros':
            return padded_grad[:, :, top:top + H_in, left:left + W_in]

        # Padding an index range reveals which source row/column each padded
        # row/column was copied from; padding is applied axis by axis, so the
        # two axes can be folded independently.
        mode = self._numpy_pad_mode()
        source_rows = np.pad(np.arange(H_in), (top, bottom), mode=mode)
        source_cols = np.pad(np.arange(W_in), (left, right), mode=mode)

        lead = padded_grad.shape[:2]
        rows_folded = np.zeros(lead + (H_in, padded_grad.shape[3]))
        for padded_row, source_row in enumerate(source_rows):
            rows_folded[:, :, source_row, :] += padded_grad[:, :, padded_row, :]

        folded = np.zeros(lead + (H_in, W_in))
        for padded_col, source_col in enumerate(source_cols):
            folded[:, :, :, source_col] += rows_folded[:, :, :, padded_col]
        return folded

    def updateOutput(self, input):
        """
        Convolve `input`, store the result in `output` and return it.
        """
        batch_size = input.shape[0]
        H_out, W_out = self._calculate_output_size(input.shape[2], input.shape[3])
        padded_input = self._pad_input(input)

        output = np.zeros((batch_size, self.out_channels, H_out, W_out))
        for kh, kw, rows, cols in self._taps(H_out, W_out):
            # (B, C_in, H_out, W_out) x (C_out, C_in) -> (B, C_out, H_out, W_out)
            output += np.einsum('bchw,oc->bohw',
                                padded_input[:, :, rows, cols],
                                self.weight[:, :, kh, kw])

        if self.bias is not None:
            output += np.reshape(self.bias, (1, -1, 1, 1))

        self.output = output
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient w.r.t. `input`, store it in `gradInput` and return it.
        """
        batch_size, in_channels, H_in, W_in = input.shape
        H_out, W_out = gradOutput.shape[2], gradOutput.shape[3]
        (top, bottom), (left, right) = self._padding_amounts()

        # Gradient w.r.t. the padded input; every tap scatters its share.
        padded_grad = np.zeros((batch_size, in_channels,
                                H_in + top + bottom, W_in + left + right))
        for kh, kw, rows, cols in self._taps(H_out, W_out):
            # (B, C_out, H_out, W_out) x (C_out, C_in) -> (B, C_in, H_out, W_out)
            padded_grad[:, :, rows, cols] += np.einsum(
                'bohw,oc->bchw', gradOutput, self.weight[:, :, kh, kw])

        self.gradInput = self._unpad_gradient(padded_grad, H_in, W_in)
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Accumulate the gradients w.r.t. the weight and (if present) the bias.
        """
        H_out, W_out = gradOutput.shape[2], gradOutput.shape[3]
        padded_input = self._pad_input(input)

        for kh, kw, rows, cols in self._taps(H_out, W_out):
            # Correlate the output gradient with the pixels this tap touched.
            self.gradWeight[:, :, kh, kw] += np.einsum(
                'bohw,bchw->oc', gradOutput, padded_input[:, :, rows, cols])

        if self.bias is not None:
            self.gradBias += gradOutput.sum(axis=(0, 2, 3))

    def zeroGradParameters(self):
        """
        Reset the gradient accumulators to zero in place.
        """
        self.gradWeight.fill(0)
        if self.bias is not None:
            self.gradBias.fill(0)

    def getParameters(self):
        """
        Return [weight, bias], or [weight] when there is no bias.
        """
        if self.bias is not None:
            return [self.weight, self.bias]
        return [self.weight]

    def getGradParameters(self):
        """
        Return [gradWeight, gradBias], or [gradWeight] when there is no bias.
        """
        if self.bias is not None:
            return [self.gradWeight, self.gradBias]
        return [self.gradWeight]

    def __repr__(self):
        return "Conv2d"

# 7 . (0.5) 
Implement [**MaxPool2d**](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html) and [**AvgPool2d**](https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html). Use only parameters like kernel_size, stride, padding (negative infinity for maxpool and zero for avgpool) and other parameters fixed as in framework.

In [73]:
class MaxPool2d(Module):
    """
    2D max pooling over (batch, channels, H, W) input.

    Parameters:
        kernel_size: int or (k_h, k_w).
        stride: int or (s_h, s_w); defaults to `kernel_size`.
        padding: int or (p_h, p_w); the border is padded with -inf so that it
            never wins the maximum.

    The forward pass remembers, for every output element, WHICH element of
    its window was the maximum (the first one on ties). The backward pass
    routes the gradient to exactly that element, which stays correct when
    windows overlap (stride < kernel_size).
    """
    def __init__(self, kernel_size, stride=None, padding=0):
        super(MaxPool2d, self).__init__()

        self.kernel_size = self._as_pair(kernel_size)
        self.stride = self.kernel_size if stride is None else stride
        self.padding = padding
        # Flat in-window index (kh * k_w + kw) of the maximum of every output
        # element; filled by the forward pass.
        self.max_indices = None
        # 0/1 array shaped like the padded input marking every element that
        # is the maximum of at least one window; kept for inspection.
        self.mask_maxx = None

    @staticmethod
    def _as_pair(value):
        """Return `value` as a 2-tuple; an int is used for both axes."""
        if isinstance(value, (int, np.integer)):
            return (int(value), int(value))
        first, second = value
        return (first, second)

    def _taps(self, H_out, W_out):
        """
        Yield (tap, rows, cols) for every position inside the pooling window.

        `tap` is the flat in-window index and `rows`/`cols` are slices into
        the padded input selecting that window element for every output
        position at once.
        """
        k_h, k_w = self.kernel_size
        s_h, s_w = self._as_pair(self.stride)
        for kh in range(k_h):
            rows = slice(kh, kh + s_h * (H_out - 1) + 1, s_h)
            for kw in range(k_w):
                cols = slice(kw, kw + s_w * (W_out - 1) + 1, s_w)
                yield kh * k_w + kw, rows, cols

    def updateOutput(self, input):
        """
        Pool `input`, store the result in `output` and return it.
        """
        batch_size, in_channels, H_in, W_in = input.shape
        k_h, k_w = self.kernel_size
        s_h, s_w = self._as_pair(self.stride)
        p_h, p_w = self._as_pair(self.padding)

        H_out = (H_in + 2 * p_h - k_h) // s_h + 1
        W_out = (W_in + 2 * p_w - k_w) // s_w + 1

        if p_h > 0 or p_w > 0:
            padded_input = np.full((batch_size, in_channels,
                                    H_in + 2 * p_h, W_in + 2 * p_w),
                                   -np.inf)
            padded_input[:, :, p_h:p_h + H_in, p_w:p_w + W_in] = input
        else:
            padded_input = input

        out_shape = (batch_size, in_channels, H_out, W_out)
        best = np.full(out_shape, -np.inf)
        winners = np.zeros(out_shape, dtype=np.intp)
        for tap, rows, cols in self._taps(H_out, W_out):
            candidate = padded_input[:, :, rows, cols]
            # Strict '>' keeps the first maximum on ties. A NaN takes over
            # once and is then never replaced, so it propagates to the output.
            better = (candidate > best) | (np.isnan(candidate) & ~np.isnan(best))
            best = np.where(better, candidate, best)
            winners[better] = tap

        mask = np.zeros(padded_input.shape)
        for tap, rows, cols in self._taps(H_out, W_out):
            # Basic slicing returns a view, so this writes into `mask`.
            mask[:, :, rows, cols][winners == tap] = 1.0

        self.max_indices = winners
        self.mask_maxx = mask
        self.output = best
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Route `gradOutput` to the maxima found by the last forward pass,
        store the result in `gradInput` and return it.
        """
        batch_size, in_channels, H_in, W_in = input.shape
        H_out, W_out = gradOutput.shape[2], gradOutput.shape[3]
        p_h, p_w = self._as_pair(self.padding)

        padded_grad = np.zeros((batch_size, in_channels,
                                H_in + 2 * p_h, W_in + 2 * p_w))
        for tap, rows, cols in self._taps(H_out, W_out):
            # Only outputs whose maximum sits at this tap contribute here;
            # overlapping windows add up through the in-place `+=`.
            padded_grad[:, :, rows, cols] += gradOutput * (self.max_indices == tap)

        # Drop the padded border.
        self.gradInput = padded_grad[:, :, p_h:p_h + H_in, p_w:p_w + W_in]
        return self.gradInput

    def __repr__(self):
        return "MaxPool2d"

class AvgPool2d(Module):
    """
    2-d average pooling over arrays indexed as (batch, channel, height, width).

    kernel_size -- int (square window) or a (height, width) pair
    stride      -- step between neighbouring windows, used for both axes
    padding     -- number of zero rows/columns added on every side; the
                   padded zeros take part in the average
    """
    def __init__(self, kernel_size, stride, padding):
        super(AvgPool2d, self).__init__()

        if isinstance(kernel_size, int):
            self.kernel_size = (kernel_size, kernel_size)
        else:
            self.kernel_size = kernel_size

        self.stride = stride
        self.padding = padding
        self.window_count = None  # number of elements in every window (set by updateOutput)

    def updateOutput(self, input):
        """
        Forward pass: each output cell is the mean of one pooling window.

        Stores the result in `self.output` and returns it. Also fills
        `self.window_count` with the element count of every window.
        """
        batch_size, in_channels, H_in, W_in = input.shape[:4]
        H_k, W_k = self.kernel_size[0], self.kernel_size[1]
        pad = self.padding
        stride = self.stride

        H_out = (H_in + 2 * pad - H_k) // stride + 1
        W_out = (W_in + 2 * pad - W_k) // stride + 1

        if pad > 0:
            padded_input = np.zeros((batch_size, in_channels,
                                     H_in + 2 * pad,
                                     W_in + 2 * pad))
            padded_input[:, :, pad:pad + H_in, pad:pad + W_in] = input
        else:
            padded_input = input

        self.output = np.zeros((batch_size, in_channels, H_out, W_out))
        # Every window lies fully inside the padded input (H_out/W_out are
        # floored), so all windows hold exactly H_k * W_k elements.
        self.window_count = np.full((batch_size, in_channels, H_out, W_out),
                                    float(H_k * W_k))

        for i in range(batch_size):
            for j in range(in_channels):
                for oh in range(H_out):
                    H_start = oh * stride
                    H_end = H_start + H_k
                    for ow in range(W_out):
                        W_start = ow * stride
                        W_end = W_start + W_k

                        window = padded_input[i, j, H_start:H_end, W_start:W_end]
                        self.output[i, j, oh, ow] = np.mean(window)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Backward pass: spread each output gradient evenly over its window.

        Every element of a window receives gradOutput / (H_k * W_k);
        contributions of overlapping windows are summed. Stores the result
        in `self.gradInput` (spatial size of `input`) and returns it.
        """
        batch_size, in_channels, H_in, W_in = input.shape[:4]
        H_k, W_k = self.kernel_size[0], self.kernel_size[1]
        pad = self.padding
        stride = self.stride

        # Constant window size; using it directly means the backward pass
        # does not depend on state cached by updateOutput.
        window_size = H_k * W_k

        if pad > 0:
            padded_grad = np.zeros((batch_size, in_channels,
                                    H_in + 2 * pad,
                                    W_in + 2 * pad))
        else:
            # Keep a floating input dtype, but never allocate an integer
            # buffer: the accumulation below adds floats in place and NumPy
            # refuses to cast them into an integer array.
            if np.issubdtype(input.dtype, np.inexact):
                grad_dtype = input.dtype
            else:
                grad_dtype = np.float64
            padded_grad = np.zeros(input.shape, dtype=grad_dtype)

        for i in range(batch_size):
            for j in range(in_channels):
                for oh in range(gradOutput.shape[2]):
                    H_start = oh * stride
                    H_end = H_start + H_k
                    for ow in range(gradOutput.shape[3]):
                        W_start = ow * stride
                        W_end = W_start + W_k

                        grad = gradOutput[i, j, oh, ow] / window_size
                        padded_grad[i, j, H_start:H_end, W_start:W_end] += grad

        if pad > 0:
            # Drop the padding border so the result matches the input size.
            self.gradInput = padded_grad[:, :, pad:pad + H_in, pad:pad + W_in]
        else:
            self.gradInput = padded_grad

        return self.gradInput

    def __repr__(self):
        return "AvgPool2d"

# 8. (0.3) 
Implement **GlobalMaxPool2d** and **GlobalAvgPool2d**. No tests are provided for these layers and their parameters are up to you, but they must aggregate information within each channel. Write your own test functions for these layers.

# 9. (0.2) 
Implement [**Flatten**](https://pytorch.org/docs/stable/generated/torch.flatten.html)

In [59]:
class Flatten(Module):
    """
    Merge the dimensions `start_dim` .. `end_dim` (inclusive) into one.

    start_dim -- first dimension to merge; negative values count from the end
    end_dim   -- last dimension to merge; negative values count from the end
    """
    def __init__(self, start_dim=0, end_dim=-1):
        super(Flatten, self).__init__()

        self.start_dim = start_dim
        self.end_dim = end_dim

        self.original_shape = None  # input shape remembered for the backward pass

    def updateOutput(self, input):
        """
        Forward pass: reshape `input` so that dimensions start..end become a
        single dimension; all other dimensions are kept as they are.
        """
        self.original_shape = input.shape
        ndim = len(self.original_shape)

        # Translate negative indices into positions counted from the front.
        start = self.start_dim if self.start_dim >= 0 else ndim + self.start_dim
        end = self.end_dim if self.end_dim >= 0 else ndim + self.end_dim

        # Compute the merged size explicitly instead of passing -1: reshape
        # cannot infer a -1 placeholder when another dimension is 0
        # (for example an empty batch).
        merged = int(np.prod(self.original_shape[start:end + 1], dtype=np.int64))
        new_shape = (*self.original_shape[:start], merged, *self.original_shape[end + 1:])

        self.output = input.reshape(new_shape)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Backward pass: flattening only rearranges the shape, so the gradient
        is reshaped back to the shape remembered in `updateOutput`.
        """
        # Pass the shape as one tuple so a 0-d original shape is still a
        # valid reshape argument.
        self.gradInput = gradOutput.reshape(self.original_shape)
        return self.gradInput

    def __repr__(self):
        return "Flatten"

# Activation functions

Here's the complete example for the **Rectified Linear Unit** non-linearity (aka **ReLU**):

In [60]:
class ReLU(Module):
    """Rectified Linear Unit: element-wise max(x, 0)."""

    def __init__(self):
        super(ReLU, self).__init__()

    def updateOutput(self, input):
        """Clamp every element of `input` from below at zero."""
        rectified = np.maximum(input, 0)
        self.output = rectified
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Let the gradient through only where the input was strictly positive."""
        passes_gradient = input > 0
        self.gradInput = np.multiply(gradOutput, passes_gradient)
        return self.gradInput

    def __repr__(self):
        return "ReLU"

## 10. (0.1) Leaky ReLU
Implement [**Leaky Rectified Linear Unit**](http://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29%23Leaky_ReLUs). Experiment with the slope.

In [61]:
class LeakyReLU(Module):
    """
    Leaky ReLU: f(x) = x for x > 0, slope * x otherwise.

    slope -- multiplier applied to non-positive inputs
    """

    def __init__(self, slope = 0.03):
        super(LeakyReLU, self).__init__()
        self.slope = slope

    def updateOutput(self, input):
        """Element-wise forward pass; works for an input of any shape."""
        is_positive = input > 0
        leaked = self.slope * input
        self.output = np.where(is_positive, input, leaked)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Scale the upstream gradient by f'(x): 1 for x > 0, slope otherwise."""
        is_positive = input > 0
        local_slope = np.where(is_positive, 1.0, self.slope)
        self.gradInput = gradOutput * local_slope
        return self.gradInput

    def __repr__(self):
        return "LeakyReLU"

## 11. (0.1) ELU
Implement [**Exponential Linear Units**](http://arxiv.org/abs/1511.07289) activations.

In [78]:
class ELU(Module):
    """
    Exponential Linear Unit: f(x) = x for x > 0, alpha * (exp(x) - 1) otherwise.

    alpha -- scale of the negative branch
    """
    def __init__(self, alpha = 1.0):
        super(ELU, self).__init__()

        self.alpha = alpha

    def updateOutput(self, input):
        """Element-wise forward pass; works for an input of any shape."""
        # np.where evaluates both branches for every element. Clamping the
        # exponent argument at 0 keeps exp from overflowing on large positive
        # inputs (whose negative-branch value is discarded anyway), and expm1
        # avoids the cancellation of exp(x) - 1 for x close to 0.
        negative_branch = self.alpha * np.expm1(np.minimum(input, 0))
        self.output = np.where(input > 0, input, negative_branch)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Scale the upstream gradient by f'(x): 1 for x > 0, alpha * exp(x) otherwise."""
        # Derived from `input` directly, so the result does not depend on
        # whatever `self.output` currently holds.
        negative_slope = self.alpha * np.exp(np.minimum(input, 0))
        der = np.where(input > 0, 1.0, negative_slope)
        self.gradInput = gradOutput * der
        return self.gradInput

    def __repr__(self):
        return "ELU"

## 12. (0.1) SoftPlus
Implement [**SoftPlus**](https://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29) activations. Notice how closely they resemble ReLU.

In [63]:
class SoftPlus(Module):
    """SoftPlus activation: f(x) = log(1 + exp(x)), a smooth version of ReLU."""
    def __init__(self):
        super(SoftPlus, self).__init__()

    def updateOutput(self, input):
        """Element-wise forward pass; works for an input of any shape."""
        # log(exp(0) + exp(x)) == log(1 + exp(x)); logaddexp evaluates it
        # without forming exp(x), so large inputs give ~x instead of inf.
        self.output = np.logaddexp(0, input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Scale the upstream gradient by f'(x) = sigmoid(x)."""
        # sigmoid(x) = exp(-log(1 + exp(-x))); written through logaddexp so
        # very negative inputs do not overflow exp(-x).
        sigm = np.exp(-np.logaddexp(0, -input))
        self.gradInput = gradOutput * sigm
        return self.gradInput

    def __repr__(self):
        return "SoftPlus"

# 13. (0.2) Gelu
Implement [**Gelu**](https://pytorch.org/docs/stable/generated/torch.nn.GELU.html) activations.

In [79]:
class Gelu(Module):
    """
    Gaussian Error Linear Unit: f(x) = x * Phi(x), where Phi is the
    cumulative distribution function of the standard normal distribution.
    """

    def __init__(self):
        super(Gelu, self).__init__()

    def updateOutput(self, input):
        """Element-wise forward pass using the exact erf form of Phi."""
        one_plus_erf = 1.0 + erf(input / np.sqrt(2.0))
        self.output = 0.5 * input * one_plus_erf
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Scale the upstream gradient by f'(x) = Phi(x) + x * pdf(x)."""
        # Standard normal CDF and density evaluated at the input.
        cdf = 0.5 * (1.0 + erf(input/np.sqrt(2.0)))
        pdf = np.exp(-0.5 * input**2)/np.sqrt(2.0 * np.pi)
        local_grad = cdf + input * pdf

        self.gradInput = gradOutput * local_grad
        return self.gradInput

    def __repr__(self):
        return "Gelu"

# Criterions

Criterions are used to score the model's answers.

In [65]:
class Criterion(object):
    """
    Base class for loss functions.

    Subclasses put their logic into `updateOutput` (loss value) and
    `updateGradInput` (gradient of the loss w.r.t. the input); `forward` and
    `backward` are thin entry points that delegate to them.
    """

    def __init__ (self):
        self.output = self.gradInput = None

    def forward(self, input, target):
        """
        Compute the loss for `input` against `target` and return it.

        Do not override this method; implement `updateOutput` instead.
        """
        loss = self.updateOutput(input, target)
        return loss

    def backward(self, input, target):
        """
        Compute the gradient of the loss w.r.t. `input` and return it.

        Do not override this method; implement `updateGradInput` instead.
        """
        grad = self.updateGradInput(input, target)
        return grad

    def updateOutput(self, input, target):
        """
        Hook to override. The base version returns the stored `output` as is.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Hook to override. The base version returns the stored `gradInput` as is.
        """
        return self.gradInput

    def __repr__(self):
        """
        Name used for pretty printing; override it in every subclass to get
        a readable description.
        """
        return "Criterion"

The **MSECriterion**, which is basic L2 norm usually used for regression, is implemented here for you.
- input:   **`batch_size x n_feats`**
- target: **`batch_size x n_feats`**
- output: **scalar**

In [66]:
class MSECriterion(Criterion):
    """Squared-error loss: sum of squared differences divided by the batch size."""

    def __init__(self):
        super(MSECriterion, self).__init__()

    def updateOutput(self, input, target):
        """Return sum((input - target)^2) / input.shape[0]."""
        residual = input - target
        n_rows = input.shape[0]
        self.output = np.sum(np.power(residual, 2)) / n_rows
        return self.output

    def updateGradInput(self, input, target):
        """Return the gradient 2 * (input - target) / input.shape[0]."""
        residual = input - target
        n_rows = input.shape[0]
        self.gradInput = residual * 2 / n_rows
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

## 14. (0.2) Negative LogLikelihood criterion (numerically unstable)
Your task is to implement the **ClassNLLCriterion**. It should implement [multiclass log loss](http://scikit-learn.org/stable/modules/model_evaluation.html#log-loss). Although there is a sum over `y` (target) in that formula,
remember that the targets are one-hot encoded, which simplifies the computation a lot. Note that criterions are the only place where you divide by the batch size. There is also a small hack: a small number is added to the probabilities to avoid computing log(0).
- input:   **`batch_size x n_feats`** - probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**



In [86]:
class ClassNLLCriterionUnstable(Criterion):
    """
    Negative log-likelihood on probabilities.

    `input` holds one row of class probabilities per sample; the true class
    of each row is taken as the argmax of the matching `target` row.
    """
    EPS = 1e-15

    def __init__(self):
        super(ClassNLLCriterionUnstable, self).__init__()

    def updateOutput(self, input, target):
        """Return the mean over rows of -log(probability of the true class)."""
        # Clamp probabilities away from 0 and 1 so the log stays finite.
        input_clamp = np.clip(input, self.EPS, 1 - self.EPS)

        row_ids = np.arange(len(target))
        true_class = np.argmax(target, axis=1)
        true_prob = input_clamp[row_ids, true_class]

        self.output = -np.mean(np.log(true_prob))
        return self.output

    def updateGradInput(self, input, target):
        """
        Return the gradient w.r.t. `input`: -1 / (batch * p) at the true
        class of each row, zero elsewhere.
        """
        # Same clamping as in the forward pass, so the division stays finite.
        input_clamp = np.clip(input, self.EPS, 1 - self.EPS)

        row_ids = np.arange(len(target))
        true_class = np.argmax(target, axis=1)

        grad = np.zeros_like(input)
        grad[row_ids, true_class] = -1.0 / (input.shape[0] * input_clamp[row_ids, true_class])
        self.gradInput = grad
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterionUnstable"

## 15. (0.3) Negative LogLikelihood criterion (numerically stable)
- input:   **`batch_size x n_feats`** - log probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**

The task is similar to the previous one, but now the criterion input is the output of a log-softmax layer. This decomposition lets us avoid numerical problems in the forward and backward computation of log().

In [87]:
class ClassNLLCriterion(Criterion):
    """
    Negative log-likelihood on log-probabilities.

    `input` holds one row of log-probabilities per sample; the true class of
    each row is taken as the argmax of the matching `target` row.
    """

    def __init__(self):
        super(ClassNLLCriterion, self).__init__()

    def updateOutput(self, input, target):
        """Return minus the mean log-probability of the true class."""
        row_ids = np.arange(len(target))
        true_class = np.argmax(target, axis=1)
        true_log_prob = input[row_ids, true_class]

        self.output = -np.mean(true_log_prob)
        return self.output

    def updateGradInput(self, input, target):
        """
        Return the gradient w.r.t. `input`: -1 / batch at the true class of
        each row, zero elsewhere.
        """
        row_ids = np.arange(len(target))
        true_class = np.argmax(target, axis=1)

        grad = np.zeros_like(input)
        grad[row_ids, true_class] = -1.0 / input.shape[0]
        self.gradInput = grad
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterion"

1-я часть задания: реализация слоев, лосей и функций активации - 5 баллов. \\
2-я часть задания: реализация моделей на своих классах. Что должно быть:
  1. Выберите оптимизатор и реализуйте его, чтобы он работал с вашими классами. - 1 балл.
  2. Модель для задачи мультирегрессии на выбранных вами данных. Использовать FCNN, dropout, batchnorm, MSE. Пробуйте различные функции активации. Для первой модели попробуйте большую, среднюю и маленькую модель. - 1 балл.
  3. Модель для задачи мультиклассификации на MNIST. Использовать свёртки, макспулы, флэттэны, софтмаксы - 1 балл.
  4. Автоэнкодер для выбранных вами данных. Должен быть на свёртках и полносвязных слоях, дропаутах, батчнормах и тд. - 2 балла. \\

Дополнительно в оценке каждой модели будет учитываться:
1. Наличие правильно выбранной метрики и лосс функции.
2. Отрисовка графиков лосей и метрик на трейне-валидации. Проверка качества модели на тесте.
3. Наличие шедулера для lr.
4. Наличие вормапа.
5. Наличие механизма ранней остановки и сохранение лучшей модели.
6. Свитч лося (метрики) и оптимайзера.