In [1]:
#conda update sympy

In [4]:
import numpy as np
import torch
from torch.autograd import Variable
import unittest

**Module** is an abstract class which defines the fundamental methods necessary for training a neural network. You do not need to change anything here, just read the comments.

In [5]:
class Module(object):
    """
    Abstract building block of a network.

    A module is a black box that maps `input` data to `output` data by
    applying a function called `forward`:

        output = module.forward(input)

    It must also be able to differentiate that function as one link of a
    chain (chain rule), given the gradient arriving from the next link:

        gradInput = module.backward(input, gradOutput)
    """
    def __init__(self):
        # Results of the latest forward / backward pass.
        self.output = None
        self.gradInput = None
        # Modules start in training mode.
        self.training = True

    def forward(self, input):
        """
        Run the module on `input` and return the resulting output
        (delegates to `updateOutput`).
        """
        return self.updateOutput(input)

    def backward(self, input, gradOutput):
        """
        One back-propagation step through the module for the given `input`.

        Two things happen, in this order:
         - the gradient w.r.t. `input` is computed (needed to keep back-propagating),
         - the gradient w.r.t. the parameters is computed (needed by the optimizer).

        Returns the `gradInput` field.
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    def updateOutput(self, input):
        """
        Compute the output from the current parameters and `input`.

        Implementations must store the result in the `output` field AND
        return it. The simplest possible implementation is

            self.output = input
            return self.output

        The base class does nothing and returns None.
        """
        return None

    def updateGradInput(self, input, gradOutput):
        """
        Compute the gradient of the module with respect to its own input.

        The shape of `gradInput` is always the shape of `input`.
        Implementations must store the result in the `gradInput` field AND
        return it. The simplest possible implementation is

            self.gradInput = gradOutput
            return self.gradInput

        The base class does nothing and returns None.
        """
        return None

    def accGradParameters(self, input, gradOutput):
        """
        Compute the gradient of the module with respect to its own parameters.
        Modules without parameters (e.g. ReLU) do not need to override this.
        """
        return None

    def zeroGradParameters(self):
        """
        Reset the parameter gradients to zero, if the module has parameters.
        """
        return None

    def getParameters(self):
        """
        Return the list of parameters; empty for parameter-free modules.
        """
        return []

    def getGradParameters(self):
        """
        Return the list of gradients w.r.t. the parameters; empty for
        parameter-free modules.
        """
        return []

    def train(self):
        """
        Switch the module to training mode.
        Dropout and BatchNorm behave differently in training and testing.
        """
        self.training = True

    def evaluate(self):
        """
        Switch the module to evaluation mode.
        Dropout and BatchNorm behave differently in training and testing.
        """
        self.training = False

    def __repr__(self):
        """
        Pretty printing. Override in every module that should have a
        readable description.
        """
        return "Module"

# Sequential container

**Define** the forward and backward pass procedures.

In [6]:
class Sequential(Module):
    """
    Container that processes `input` sequentially.

    `input` is passed through each module (layer) in `self.modules` in order;
    the output of the last module is the `output` of the container.
    """

    def __init__(self):
        super(Sequential, self).__init__()
        self.modules = []

    def add(self, module):
        """
        Appends a module to the end of the container.
        """
        self.modules.append(module)

    def updateOutput(self, input):
        """
        Forward pass:

            y_0    = module[0].forward(input)
            y_1    = module[1].forward(y_0)
            ...
            output = module[n-1].forward(y_{n-2})

        An empty container returns `input` unchanged.
        """
        self.output = input
        for module in self.modules:
            self.output = module.forward(self.output)
        return self.output

    def backward(self, input, gradOutput):
        """
        Backward pass:

            g_{n-1} = module[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2} = module[n-2].backward(y_{n-3}, g_{n-1})
            ...
            g_1 = module[1].backward(y_0, g_2)
            gradInput = module[0].backward(input, g_1)

        Each module receives the input it saw during the forward pass, i.e.
        the cached `output` of the previous module (and `input` for the first
        one), NOT the `input` of the container.

        The cached outputs are reused instead of re-running the layers, so
        stochastic or stateful layers (Dropout masks, BatchNorm moving
        statistics) are not evaluated a second time. A forward pass is run
        here only if some layer has never produced an output.
        An empty container returns `gradOutput` unchanged.
        """
        # Only the outputs feeding a following layer are needed.
        if any(module.output is None for module in self.modules[:-1]):
            self.updateOutput(input)

        # layer_inputs[i] is what modules[i] received during the forward pass.
        layer_inputs = [input] + [module.output for module in self.modules[:-1]]

        grad = gradOutput
        for module, layer_input in zip(reversed(self.modules), reversed(layer_inputs)):
            grad = module.backward(layer_input, grad)

        self.gradInput = grad
        return self.gradInput

    def zeroGradParameters(self):
        """
        Zeroes the parameter gradients of every contained module.
        """
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """
        Gathers the parameters of all modules: one list per module.
        """
        return [x.getParameters() for x in self.modules]

    def getGradParameters(self):
        """
        Gathers the parameter gradients of all modules: one list per module.
        """
        return [x.getGradParameters() for x in self.modules]

    def __repr__(self):
        string = "".join([str(x) + '\n' for x in self.modules])
        return string

    def __getitem__(self, x):
        return self.modules.__getitem__(x)

    def train(self):
        """
        Propagates training mode through all modules.
        """
        self.training = True
        for module in self.modules:
            module.train()

    def evaluate(self):
        """
        Propagates evaluation mode through all modules.
        """
        self.training = False
        for module in self.modules:
            module.evaluate()

# Layers

## 1 (0.2). Linear transform layer
Also known as dense layer, fully-connected layer, FC-layer, InnerProductLayer (in caffe), affine transform
- input:   **`batch_size x n_feats1`**
- output: **`batch_size x n_feats2`**

In [7]:
class Linear(Module):
    """
    Affine layer `y = x W^T + b`, also known as a fully-connected layer
    (InnerProductLayer in caffe).

    Works with 2D input of shape (n_samples, n_feature).
    """
    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()

        # Uniform initialisation scaled by the fan-in.
        bound = 1. / np.sqrt(n_in)
        self.W = np.random.uniform(-bound, bound, size=(n_out, n_in))
        self.b = np.random.uniform(-bound, bound, size=n_out)

        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, input):
        """Computes `input @ W.T + b`, stores it in `output` and returns it."""
        projected = np.matmul(input, self.W.T)
        self.output = projected + self.b
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Computes `gradOutput @ W`, stores it in `gradInput` and returns it."""
        self.gradInput = np.matmul(gradOutput, self.W)
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """Stores the gradients of the current batch w.r.t. `W` and `b`."""
        self.gradb = gradOutput.sum(axis=0)
        self.gradW = np.matmul(gradOutput.T, input)

    def zeroGradParameters(self):
        """Fills both gradient buffers with zeros in place."""
        for grad in (self.gradW, self.gradb):
            grad.fill(0)

    def getParameters(self):
        return [self.W, self.b]

    def getGradParameters(self):
        return [self.gradW, self.gradb]

    def __repr__(self):
        n_out, n_in = self.W.shape
        return 'Linear %d -> %d' % (n_in, n_out)

## 2. (0.2) SoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{softmax}(x)_i = \frac{\exp x_i} {\sum_j \exp x_j}$

Recall that $\text{softmax}(x) == \text{softmax}(x - \text{const})$. It makes possible to avoid computing exp() from large argument.

In [8]:
class SoftMax(Module):
    """
    Row-wise softmax: `softmax(x)_i = exp(x_i) / sum_j exp(x_j)`.

    Input and output have shape (batch_size, n_feats).
    """
    def __init__(self):
        super(SoftMax, self).__init__()

    def updateOutput(self, input):
        """
        Computes the softmax of every row of `input`.

        The row maximum is subtracted before exponentiating; softmax is
        invariant to that shift, and it keeps `exp` from overflowing.
        """
        shifted = input - input.max(axis=1, keepdims=True)
        exps = np.exp(shifted)
        self.output = exps / exps.sum(axis=1, keepdims=True)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Back-propagates through the softmax using the cached `output`.

        With s = softmax(x) the Jacobian is dy_j/dx_k = s_j (delta_jk - s_k),
        hence dL/dx_k = s_k * (g_k - sum_j g_j s_j) for every row.
        """
        weighted_sum = np.sum(gradOutput * self.output, axis=1, keepdims=True)
        self.gradInput = self.output * (gradOutput - weighted_sum)
        return self.gradInput

    def __repr__(self):
        return "SoftMax"

## 3. (0.2) LogSoftMax
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

$\text{logsoftmax}(x)_i = \log\text{softmax}(x)_i = x_i - \log {\sum_j \exp x_j}$

The main goal of this layer is to be used in computation of log-likelihood loss.

In [9]:
class LogSoftMax(Module):
    """
    Row-wise log-softmax: `logsoftmax(x)_i = x_i - log(sum_j exp(x_j))`.

    Input and output have shape (batch_size, n_feats).
    """
    def __init__(self):
        super(LogSoftMax, self).__init__()

    def updateOutput(self, input):
        """
        Computes the log-softmax of every row of `input`.

        The row maximum is subtracted first; log-softmax is invariant to
        that shift, and it keeps `exp` from overflowing.
        """
        shifted = input - input.max(axis=1, keepdims=True)
        log_norm = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        self.output = shifted - log_norm
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Back-propagates through the log-softmax using the cached `output`.

        The Jacobian is dy_j/dx_k = delta_jk - softmax(x)_k, hence
        dL/dx_k = g_k - softmax(x)_k * sum_j g_j for every row.
        """
        softmax = np.exp(self.output)
        self.gradInput = gradOutput - softmax * np.sum(gradOutput, axis=1, keepdims=True)
        return self.gradInput

    def __repr__(self):
        return "LogSoftMax"

## 4. (0.3) Batch normalization (градиент)
One of the most significant recent ideas that impacted NNs a lot is [**Batch normalization**](http://arxiv.org/abs/1502.03167). The idea is simple, yet effective: the features should be whitened ($mean = 0$, $std = 1$) all the way through NN. This improves the convergence for deep models letting it train them for days but not weeks. **You are** to implement the first part of the layer: features normalization. The second part (`ChannelwiseScaling` layer) is implemented below.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

The layer should work as follows. While training (`self.training == True`) it transforms input as $$y = \frac{x - \mu}  {\sqrt{\sigma + \epsilon}}$$
where $\mu$ and $\sigma$ are the mean and variance of the feature values in the **batch** and $\epsilon$ is just a small number for numerical stability. Also, during training the layer should maintain exponential moving average values for the mean and variance:
```
    self.moving_mean = self.moving_mean * alpha + batch_mean * (1 - alpha)
    self.moving_variance = self.moving_variance * alpha + batch_variance * (1 - alpha)
```
During testing (`self.training == False`) the layer normalizes input using moving_mean and moving_variance.

Note that decomposition of batch normalization on normalization itself and channelwise scaling here is just a common **implementation** choice. In general "batch normalization" always assumes normalization + scaling.

In [10]:
class BatchNormalization(Module):
    """
    Feature-wise normalization (the first half of batch normalization).

    Training mode: y = (x - batch_mean) / sqrt(batch_variance + EPS), while
    exponential moving averages of the batch statistics are maintained.
    Evaluation mode: the moving averages are used instead of batch statistics.

    Input and output have shape (batch_size, n_feats).
    """
    EPS = 1e-3

    def __init__(self, alpha=0.):
        super(BatchNormalization, self).__init__()
        # Weight of the previous moving value in the exponential average.
        self.alpha = alpha
        self.moving_mean = None
        self.moving_variance = None

    def updateOutput(self, input):
        """
        Normalizes `input` and, in training mode, updates the moving statistics.

        On the first training batch the moving statistics are initialised
        with the batch statistics.
        """
        self.batch_mean = input.mean(axis=0)
        self.batch_variance = input.var(axis=0)
        if self.training:
            self.output = (input - self.batch_mean) / np.sqrt(self.batch_variance + self.EPS)
            if self.moving_mean is None:
                self.moving_mean = self.batch_mean
                # The variance estimate must start from the batch variance,
                # not from the batch mean.
                self.moving_variance = self.batch_variance
            else:
                self.moving_mean = self.moving_mean * self.alpha + self.batch_mean * (1 - self.alpha)
                self.moving_variance = self.moving_variance * self.alpha + self.batch_variance * (1 - self.alpha)
        else:
            self.output = (input - self.moving_mean) / np.sqrt(self.moving_variance + self.EPS)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Back-propagates through the normalization.

        In evaluation mode the statistics are constants, so the gradient is a
        plain rescaling. In training mode the batch mean and variance cached by
        `updateOutput` depend on the input and contribute two extra terms.
        """
        if not self.training:
            self.gradInput = gradOutput / np.sqrt(self.moving_variance + self.EPS)
            return self.gradInput

        n_samples = input.shape[0]
        inv_std = 1. / np.sqrt(self.batch_variance + self.EPS)
        centered = input - self.batch_mean

        # Direct path: dL/dy * dy/dx with the statistics held fixed.
        grad_direct = gradOutput * inv_std

        # Path through the variance: dL/dvar, then dvar/dx = 2 (x - mean) / N.
        grad_variance = np.sum(gradOutput * centered, axis=0) * (-0.5) * inv_std ** 3
        grad_via_variance = grad_variance * 2 * centered / n_samples

        # Path through the mean: dmean/dx = 1 / N. The variance also depends on
        # the mean, but that term is proportional to sum(x - mean) = 0 and is
        # therefore omitted.
        grad_mean = -np.sum(grad_direct, axis=0)
        grad_via_mean = grad_mean / n_samples

        self.gradInput = grad_direct + grad_via_variance + grad_via_mean
        return self.gradInput

    def __repr__(self):
        return "BatchNormalization"

In [11]:
class ChannelwiseScaling(Module):
    """
    Per-feature linear transform `y = gamma * x + beta`, where `gamma` and
    `beta` are learnable vectors of length `x.shape[-1]`.
    """
    def __init__(self, n_out):
        super(ChannelwiseScaling, self).__init__()

        bound = 1. / np.sqrt(n_out)
        self.gamma = np.random.uniform(-bound, bound, size=n_out)
        self.beta = np.random.uniform(-bound, bound, size=n_out)

        self.gradGamma = np.zeros_like(self.gamma)
        self.gradBeta = np.zeros_like(self.beta)

    def updateOutput(self, input):
        """Scales and shifts every feature; stores and returns the result."""
        scaled = input * self.gamma
        self.output = scaled + self.beta
        return self.output

    def updateGradInput(self, input, gradOutput):
        """The input gradient is the output gradient scaled by `gamma`."""
        self.gradInput = gradOutput * self.gamma
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """Stores the gradients of the current batch w.r.t. `gamma` and `beta`."""
        self.gradBeta = np.sum(gradOutput, axis=0)
        self.gradGamma = np.sum(gradOutput * input, axis=0)

    def zeroGradParameters(self):
        """Fills both gradient buffers with zeros in place."""
        for grad in (self.gradGamma, self.gradBeta):
            grad.fill(0)

    def getParameters(self):
        return [self.gamma, self.beta]

    def getGradParameters(self):
        return [self.gradGamma, self.gradBeta]

    def __repr__(self):
        return "ChannelwiseScaling"

Practical notes. If BatchNormalization is placed after a linear transformation layer (including dense layer, convolutions, channelwise scaling) that implements a function like `y = weight * x + bias`, then adding the bias becomes useless and can be omitted, since its effect is discarded by the batch mean subtraction. If BatchNormalization (followed by `ChannelwiseScaling`) is placed before a layer that propagates scale (including ReLU, LeakyReLU) followed by any linear transformation layer, then the parameter `gamma` in `ChannelwiseScaling` can be frozen, since it can be absorbed into the linear transformation layer.

## 5. (0.3) Dropout
Implement [**dropout**](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf). The idea and implementation are really simple: just multiply the input by a $Bernoulli(1 - p)$ mask. Here $p$ is the probability of an element being zeroed.

This has proven to be an effective technique for regularization and preventing the co-adaptation of neurons.

While training (`self.training == True`) it should sample a mask on each iteration (for every batch), zero out elements and multiply elements by $1 / (1 - p)$. The latter is needed for keeping mean values of features close to mean values which will be in test mode. When testing this module should implement identity transform i.e. `self.output = input`.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [12]:
class Dropout(Module):
    """
    Inverted dropout.

    In training mode every element is zeroed with probability `p` and the
    survivors are scaled by `1 / (1 - p)`, so feature means stay close to
    those in evaluation mode. In evaluation mode the layer is the identity.
    """
    def __init__(self, p=0.5):
        """
        p: probability of an element being zeroed, in [0, 1].

        Raises ValueError if `p` is outside [0, 1].
        """
        super(Dropout, self).__init__()

        if not 0 <= p <= 1:
            raise ValueError("dropout probability has to be between 0 and 1, but got %s" % (p,))
        self.p = p
        self.mask = None

    def _apply_mask(self, values):
        """Zeroes the dropped elements of `values` and rescales the kept ones."""
        keep_prob = 1. - self.p
        masked = values * self.mask
        # With p == 1 the mask is all zeros; skipping the division avoids 0/0 = NaN.
        return masked / keep_prob if keep_prob > 0 else masked

    def updateOutput(self, input):
        """
        Samples a fresh mask and applies it (training), or returns `input`
        unchanged (evaluation).
        """
        if self.training:
            self.mask = np.random.binomial(1, 1. - self.p, input.shape)
            self.output = self._apply_mask(input)
        else:
            self.output = input
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Applies the mask sampled by the latest training forward pass to
        `gradOutput` (training), or returns `gradOutput` unchanged (evaluation).
        """
        if self.training:
            self.gradInput = self._apply_mask(gradOutput)
        else:
            self.gradInput = gradOutput
        return self.gradInput

    def __repr__(self):
        return "Dropout"

# 6. (2.0) Conv2d
Implement [**Conv2d**](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html). Use only this list of parameters: (in_channels, out_channels, kernel_size, stride, padding, bias, padding_mode) and fix dilation=1 and groups=1.

In [13]:
# class Conv2d(Module):
#     def __init__(self, in_channels, out_channels, kernel_size,
#                  stride=1, padding=0, bias=True, padding_mode='zeros'):
#         super(Conv2d, self).__init__()

#         self.in_channels = in_channels
#         self.out_channels = out_channels
#         self.kernel_size = kernel_size
#         self.stride = stride
#         self.padding = padding
#         self.bias = bias
#         self.padding_mode = padding_mode
        
#         self.weights = np.random.randn(out_channels, in_channels, kernel_size, kernel_size) / np.sqrt(in_channels * kernel_size * kernel_size)
#     def updateOutput(self, input):
        
#         # Your code goes here. ################################################
#         return  self.output

#     def updateGradInput(self, input, gradOutput):
#         # Your code goes here. ################################################
#         return self.gradInput

#     def __repr__(self):
#         return "Conv2d"

# 7. (0.5) Implement [**MaxPool2d**](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html) and [**AvgPool2d**](https://pytorch.org/docs/stable/generated/torch.nn.AvgPool2d.html). Use only parameters like kernel_size, stride, padding (negative infinity for maxpool and zero for avgpool) and other parameters fixed as in framework.

In [14]:
class MaxPool2d(Module):
    """
    2D max pooling over an input of shape (batch, channels, height, width).

    `kernel_size`, `stride` and `padding` may each be an int or a
    (height, width) tuple; `stride` defaults to `kernel_size`.
    Padding is implicit negative infinity, so padded cells never win the max.
    """
    def __init__(self, kernel_size, stride=None, padding=0):
        super(MaxPool2d, self).__init__()

        self.kernel_size = kernel_size if isinstance(kernel_size, tuple) else (kernel_size, kernel_size)
        if stride is None:
            self.stride = self.kernel_size
        else:
            self.stride = stride if isinstance(stride, tuple) else (stride, stride)
        self.padding = padding if isinstance(padding, tuple) else (padding, padding)

        # Cached by the forward pass for use in the backward pass.
        self.max_indices = None
        self.input_shape = None

    def updateOutput(self, input):
        """
        Computes the maximum of every pooling window.

        Windows are gathered into a (n_windows, kernel_h * kernel_w) matrix;
        the position of each row maximum is cached in `max_indices`.
        """
        self.input_shape = input.shape
        batch_size, channels, height, width = self.input_shape
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self.stride
        pad_h, pad_w = self.padding
        out_h = (height + 2 * pad_h - kernel_h) // stride_h + 1
        out_w = (width + 2 * pad_w - kernel_w) // stride_w + 1

        if np.any(self.padding):
            # Pad with -inf (not 0) so that a window of negative values touching
            # the border still returns its true maximum. -inf needs a float array.
            padded = np.pad(np.asarray(input, dtype=np.float64),
                            ((0, 0), (0, 0), (pad_h, pad_h), (pad_w, pad_w)),
                            mode='constant', constant_values=-np.inf)
        else:
            padded = input

        # col[:, :, y, x] holds, for every window, the element at offset (y, x).
        col = np.empty((batch_size, channels, kernel_h, kernel_w, out_h, out_w))
        for y in range(kernel_h):
            y_max = y + stride_h * out_h
            for x in range(kernel_w):
                x_max = x + stride_w * out_w
                col[:, :, y, x, :, :] = padded[:, :, y:y_max:stride_h, x:x_max:stride_w]

        # One row per window, one column per position inside the window.
        col = col.transpose(0, 1, 4, 5, 2, 3).reshape(-1, kernel_h * kernel_w)

        self.max_indices = np.argmax(col, axis=1)
        self.output = np.max(col, axis=1).reshape(batch_size, channels, out_h, out_w)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Routes each output gradient to the input cell that produced the
        maximum of its window (as cached by the latest forward pass).
        Overlapping windows accumulate.
        """
        batch_size, channels, out_h, out_w = gradOutput.shape
        kernel_h, kernel_w = self.kernel_size
        stride_h, stride_w = self.stride
        pad_h, pad_w = self.padding
        n_windows = len(self.max_indices)

        # Scatter every gradient to the winning position of its window.
        grad_col = np.zeros((n_windows, kernel_h * kernel_w))
        grad_col[np.arange(n_windows), self.max_indices] = gradOutput.ravel()

        # Back to (batch, channels, kernel_h, kernel_w, out_h, out_w).
        grad_col = grad_col.reshape(batch_size, channels, out_h, out_w, kernel_h, kernel_w)
        grad_col = grad_col.transpose(0, 1, 4, 5, 2, 3)

        # Accumulate into the (padded) input-shaped gradient.
        self.gradInput = np.zeros((batch_size, channels,
                                   self.input_shape[2] + 2 * pad_h,
                                   self.input_shape[3] + 2 * pad_w))
        for y in range(kernel_h):
            y_max = y + stride_h * out_h
            for x in range(kernel_w):
                x_max = x + stride_w * out_w
                self.gradInput[:, :, y:y_max:stride_h, x:x_max:stride_w] += grad_col[:, :, y, x, :, :]

        # Drop the gradient that fell on the padding.
        if np.any(self.padding):
            self.gradInput = self.gradInput[:, :,
                                            pad_h:self.input_shape[2] + pad_h,
                                            pad_w:self.input_shape[3] + pad_w]

        return self.gradInput

    def __repr__(self):
        return "MaxPool2d"

In [15]:

class AvgPool2d(Module):
    """
    2D average pooling over an input of shape (batch, channels, height, width).

    `kernel_size`, `stride` and `padding` may each be an int or a
    (height, width) tuple; `stride` defaults to `kernel_size`.
    Padding is zero and padded cells count towards the average.
    """
    def __init__(self, kernel_size, stride=None, padding=0):
        super(AvgPool2d, self).__init__()

        def as_pair(value):
            # Tuples pass through; scalars are duplicated for both axes.
            return value if isinstance(value, tuple) else (value, value)

        self.kernel_size = as_pair(kernel_size)
        self.stride = self.kernel_size if stride is None else as_pair(stride)
        self.padding = as_pair(padding)

        # Remembered by the forward pass for the backward pass.
        self.input_shape = None

    def updateOutput(self, input):
        """
        Averages every pooling window.

        Windows are gathered into a (n_windows, kernel_h * kernel_w) matrix
        whose row means are the outputs.
        """
        self.input_shape = input.shape
        n, c, in_h, in_w = input.shape
        k_h, k_w = self.kernel_size[0], self.kernel_size[1]
        s_h, s_w = self.stride[0], self.stride[1]
        p_h, p_w = self.padding[0], self.padding[1]
        out_h = (in_h + 2 * p_h - k_h) // s_h + 1
        out_w = (in_w + 2 * p_w - k_w) // s_w + 1

        padded = input
        if np.any(self.padding):
            padded = np.pad(input,
                            ((0, 0), (0, 0), (p_h, p_h), (p_w, p_w)),
                            mode='constant')

        # windows[:, :, dy, dx] holds, for every window, the element at offset (dy, dx).
        windows = np.zeros((n, c, k_h, k_w, out_h, out_w))
        for dy, dx in np.ndindex(k_h, k_w):
            rows = slice(dy, dy + s_h * out_h, s_h)
            cols = slice(dx, dx + s_w * out_w, s_w)
            windows[:, :, dy, dx, :, :] = padded[:, :, rows, cols]

        per_window = windows.transpose(0, 1, 4, 5, 2, 3).reshape(-1, k_h * k_w)
        self.output = per_window.mean(axis=1).reshape(n, c, out_h, out_w)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Spreads each output gradient evenly over the cells of its window.
        Overlapping windows accumulate; gradient on the padding is dropped.
        """
        n, c, out_h, out_w = gradOutput.shape
        k_h, k_w = self.kernel_size[0], self.kernel_size[1]
        s_h, s_w = self.stride[0], self.stride[1]
        p_h, p_w = self.padding[0], self.padding[1]
        in_h, in_w = self.input_shape[2], self.input_shape[3]

        # Every cell of a window receives the same share of its gradient.
        share = gradOutput * (1 / (k_h * k_w))

        accumulated = np.zeros((n, c, in_h + 2 * p_h, in_w + 2 * p_w))
        for dy, dx in np.ndindex(k_h, k_w):
            rows = slice(dy, dy + s_h * out_h, s_h)
            cols = slice(dx, dx + s_w * out_w, s_w)
            accumulated[:, :, rows, cols] += share

        if np.any(self.padding):
            accumulated = accumulated[:, :, p_h:in_h + p_h, p_w:in_w + p_w]

        self.gradInput = accumulated
        return self.gradInput

    def __repr__(self):
        return "AvgPool2d"

# 8. (0.3) Implement **GlobalMaxPool2d** and **GlobalAvgPool2d**. They do not have testing and parameters are up to you but they must aggregate information within channels. Write test functions for these layers on your own.

In [16]:
class GlobalMaxPool2d(Module):
    """
    Reduces every channel of a (batch, channels, height, width) input to
    its maximum, giving a (batch, channels) output.
    """
    def __init__(self):
        super(GlobalMaxPool2d, self).__init__()
        # Remembered by the forward pass for the backward pass.
        self.input_shape = None
        self.max_indices = None

    def updateOutput(self, input):
        """
        Takes the maximum over the spatial positions of each channel and
        caches the flat position of every maximum.
        """
        self.input_shape = input.shape
        n, c, _, _ = input.shape  # unpacking also rejects non-4D inputs

        per_channel = input.reshape(n, c, -1)
        self.max_indices = per_channel.argmax(axis=2)
        self.output = per_channel.max(axis=2)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Sends each channel's gradient to the position of its maximum;
        every other position receives zero.
        """
        n, c = gradOutput.shape

        self.gradInput = np.zeros(self.input_shape)
        # View onto gradInput, so writes below land in gradInput itself.
        per_channel = self.gradInput.reshape(n, c, -1)
        sample_idx, channel_idx = np.indices((n, c))
        per_channel[sample_idx, channel_idx, self.max_indices] = gradOutput

        return self.gradInput

    def __repr__(self):
        return "GlobalMaxPool2d"


class GlobalAvgPool2d(Module):
    """
    Reduces every channel of a (batch, channels, height, width) input to
    its mean, giving a (batch, channels) output.
    """
    def __init__(self):
        super(GlobalAvgPool2d, self).__init__()
        # Remembered by the forward pass for the backward pass.
        self.input_shape = None

    def updateOutput(self, input):
        """Averages over the spatial positions of each channel."""
        self.input_shape = input.shape
        n, c, rows, cols = input.shape  # unpacking also rejects non-4D inputs

        self.output = input.mean(axis=(2, 3))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Spreads each channel's gradient evenly over all of its spatial
        positions.
        """
        n, c = gradOutput.shape
        rows, cols = self.input_shape[2], self.input_shape[3]
        share = gradOutput * (1 / (rows * cols))

        self.gradInput = np.zeros(self.input_shape)
        # Broadcast every (sample, channel) share over its whole feature map.
        self.gradInput[:n, :c] = share[:, :, np.newaxis, np.newaxis]

        return self.gradInput

    def __repr__(self):
        return "GlobalAvgPool2d"





# 9. (0.2) Implement [**Flatten**](https://pytorch.org/docs/stable/generated/torch.flatten.html) 

In [15]:
class Flatten(Module):
    """
    Merges the dimensions `start_dim` .. `end_dim` (inclusive) of the input
    into a single dimension. Negative indices count from the last dimension.
    """
    def __init__(self, start_dim=0, end_dim=-1):
        super(Flatten, self).__init__()
        self.start_dim = start_dim
        self.end_dim = end_dim

    def _resolve_dims(self, ndim):
        """Returns (start, end) as non-negative indices for an `ndim`-D input."""
        start = self.start_dim + ndim if self.start_dim < 0 else self.start_dim
        end = self.end_dim + ndim if self.end_dim < 0 else self.end_dim
        return start, end

    def updateOutput(self, input):
        """
        Reshapes `input` with dimensions start_dim..end_dim merged into one.

        The configured `start_dim` / `end_dim` are resolved against the rank
        of each input and never overwritten, so one layer can be applied to
        inputs of different ranks. A 0-D input becomes a 1-element vector.

        Raises ValueError if `start_dim` comes after `end_dim`.
        """
        shape = tuple(input.shape)
        if len(shape) == 0:
            self.output = input.reshape(1)
            return self.output

        start, end = self._resolve_dims(len(shape))
        if start > end:
            raise ValueError("flatten() has invalid args: start_dim cannot come after end_dim")

        # Integer dtype keeps the product usable as a dimension size.
        merged = int(np.prod(shape[start:end + 1], dtype=np.int64))
        self.output = input.reshape(shape[:start] + (merged,) + shape[end + 1:])
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Flattening only rearranges elements: the gradient takes `input`'s shape."""
        self.gradInput = gradOutput.reshape(input.shape)
        return self.gradInput

    def __repr__(self):
        return "Flatten"


# Activation functions


Here's the complete example for the **Rectified Linear Unit** non-linearity (aka **ReLU**):

In [16]:
class ReLU(Module):
    """Rectified Linear Unit: `y = max(x, 0)`, applied element-wise."""
    def __init__(self):
        super(ReLU, self).__init__()

    def updateOutput(self, input):
        """Clamps negative elements of `input` to zero."""
        rectified = np.maximum(input, 0)
        self.output = rectified
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Passes the gradient only where `input` is strictly positive."""
        is_positive = input > 0
        self.gradInput = gradOutput * is_positive
        return self.gradInput

    def __repr__(self):
        return "ReLU"

## 10. (0.1) Leaky ReLU
Implement [**Leaky Rectified Linear Unit**](http://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29%23Leaky_ReLUs). Experiment with the slope.

In [17]:
class LeakyReLU(Module):
    """
    Leaky Rectified Linear Unit: `y = x` for `x > 0` and `y = slope * x`
    otherwise, applied element-wise.
    """
    def __init__(self, slope=0.03):
        super(LeakyReLU, self).__init__()

        # Multiplier applied to non-positive inputs.
        self.slope = slope

    def updateOutput(self, input):
        """Keeps positive elements and scales the others by `slope`."""
        self.output = np.where(input > 0, input, self.slope * input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        The derivative is 1 where `input > 0` and `slope` elsewhere.

        At exactly zero the derivative is `slope`, the same "strictly
        positive" convention ReLU uses in this file.
        """
        self.gradInput = np.where(input > 0, gradOutput, self.slope * gradOutput)
        return self.gradInput

    def __repr__(self):
        return "LeakyReLU"

## 11. (0.1) ELU
Implement [**Exponential Linear Units**](http://arxiv.org/abs/1511.07289) activations.

In [18]:
class ELU(Module):
    """
    Exponential Linear Unit.

        f(x) = x                      if x >= 0
        f(x) = alpha * (exp(x) - 1)   otherwise

    Args:
        alpha: scale of the exponential branch.
    """

    def __init__(self, alpha = 1.0):
        super(ELU, self).__init__()

        self.alpha = alpha

    def updateOutput(self, input):
        """Forward pass; the result is stored in `output` and returned."""
        # np.where evaluates both branches for every entry, so the argument of
        # the exponential is clamped to <= 0: large positive inputs would
        # otherwise overflow in a branch whose value is thrown away.
        non_positive = np.minimum(input, 0)
        # expm1 keeps precision for inputs close to zero, where exp(x) - 1
        # suffers from cancellation.
        self.output = np.where(input >= 0, input, self.alpha * np.expm1(non_positive))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Backward pass: the local derivative is 1 where `input >= 0` and
        `alpha * exp(input)` elsewhere. The result is stored in `gradInput`
        and returned.
        """
        # Same clamping as in the forward pass to keep exp() from overflowing.
        dydx = np.where(input >= 0, 1, self.alpha * np.exp(np.minimum(input, 0)))
        self.gradInput = dydx * gradOutput
        return self.gradInput

    def __repr__(self):
        return "ELU"

## 12. (0.1) SoftPlus
Implement [**SoftPlus**](https://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29) activations. Note how closely they resemble ReLU.

In [19]:
class SoftPlus(Module):
    """
    SoftPlus activation: f(x) = log(1 + exp(x)).

    Both passes are written so that exp() is never evaluated on a large
    positive argument; the naive formulas overflow to inf in the forward
    pass and produce inf / inf = nan in the backward pass.
    """

    def __init__(self):
        super(SoftPlus, self).__init__()

    def updateOutput(self, input):
        """Forward pass; the result is stored in `output` and returned."""
        # logaddexp(0, x) = log(exp(0) + exp(x)) = log(1 + exp(x)).
        self.output = np.logaddexp(0, input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Backward pass: the local derivative is the logistic sigmoid of
        `input`. The result is stored in `gradInput` and returned.
        """
        # sigmoid(x) = 1 / (1 + exp(-x)) = exp(-log(1 + exp(-x)))
        dydx = np.exp(-np.logaddexp(0, -input))
        self.gradInput = dydx * gradOutput
        return self.gradInput

    def __repr__(self):
        return "SoftPlus"

## 13. (0.2) Gelu
Implement [**Gelu**](https://pytorch.org/docs/stable/generated/torch.nn.GELU.html) activations.

In [20]:
class Gelu(Module):
    """
    Gaussian Error Linear Unit, computed with the tanh approximation

        gelu(x) = 0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x^3)))
    """

    # Coefficient of the cubic term inside the tanh argument.
    CUBIC_COEF = 0.044715

    def __init__(self):
        super(Gelu, self).__init__()

    def _tanh_term(self, input):
        """tanh(sqrt(2 / pi) * (input + CUBIC_COEF * input^3)), shared by both passes."""
        return np.tanh(np.sqrt(2 / np.pi) * (input + self.CUBIC_COEF * (input ** 3)))

    def updateOutput(self, input):
        """Forward pass; the result is stored in `output` and returned."""
        self.output = 0.5 * input * (1 + self._tanh_term(input))
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Backward pass: product rule applied to 0.5 * x * (1 + tanh(u(x))).
        The result is stored in `gradInput` and returned.
        """
        tanh_part = self._tanh_term(input)
        tanh_derivative = 1 - tanh_part ** 2

        # du/dx for u = sqrt(2 / pi) * (x + c * x^3). The cubic term
        # differentiates to 3 * c * x^2, i.e. 0.134145 * x^2; deriving it from
        # CUBIC_COEF keeps the two constants from drifting apart.
        inner_derivative = np.sqrt(2 / np.pi) * (1 + 3 * self.CUBIC_COEF * (input ** 2))

        grad = 0.5 * (1 + tanh_part) + 0.5 * input * tanh_derivative * inner_derivative

        self.gradInput = grad * gradOutput
        return self.gradInput

    def __repr__(self):
        return "Gelu"

# Criterions

Criterions are used to score the model's answers.

In [21]:
class Criterion(object):
    """
    Base class for loss functions.

    `forward` / `backward` are the public entry points; subclasses put
    their code into `updateOutput` / `updateGradInput`.
    """

    def __init__ (self):
        self.gradInput = None
        self.output = None

    def forward(self, input, target):
        """
            Compute the loss for `input` against `target`.

            Thin wrapper around `updateOutput`; subclasses should override
            `updateOutput`, not this method.
        """
        loss = self.updateOutput(input, target)
        return loss

    def backward(self, input, target):
        """
            Compute the gradient of the loss for `input` against `target`.

            Thin wrapper around `updateGradInput`; subclasses should override
            `updateGradInput`, not this method.
        """
        grad = self.updateGradInput(input, target)
        return grad

    def updateOutput(self, input, target):
        """
        Hook for subclasses. The base version returns the `output` field as is.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Hook for subclasses. The base version returns the `gradInput` field as is.
        """
        return self.gradInput

    def __repr__(self):
        """
        Human-readable name; override in subclasses to get a meaningful
        description when printing.
        """
        return "Criterion"

The **MSECriterion**, the basic L2 loss commonly used for regression, is implemented here for you.
- input:   **`batch_size x n_feats`**
- target: **`batch_size x n_feats`**
- output: **scalar**

In [22]:
class MSECriterion(Criterion):
    """
    Squared-error loss: the sum of squared differences between `input` and
    `target`, divided by `input.shape[0]`.
    """

    def __init__(self):
        super(MSECriterion, self).__init__()

    def updateOutput(self, input, target):
        """Compute the loss; the scalar is stored in `output` and returned."""
        # No extra 0.5 factor here: `updateGradInput` returns
        # 2 * (input - target) / N, which is the derivative of exactly this
        # expression. With a 0.5 factor the gradient would be twice the
        # derivative of the reported loss.
        self.output = np.sum(np.square(input - target)) / input.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """Gradient of the loss w.r.t. `input`; stored in `gradInput` and returned."""
        self.gradInput  = (input - target) * 2 / input.shape[0]
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

## 14. (0.2) Negative LogLikelihood criterion (numerically unstable)
Your task is to implement the **ClassNLLCriterion**. It should implement [multiclass log loss](http://scikit-learn.org/stable/modules/model_evaluation.html#log-loss). Although there is a sum over `y` (target) in that formula,
remember that targets are one-hot encoded. This fact simplifies the computations a lot. Note that criterions are the only place where you divide by the batch size. There is also a small hack: the probabilities are clipped away from 0 to avoid computing log(0).
- input:   **`batch_size x n_feats`** - probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**



In [23]:
class ClassNLLCriterionUnstable(Criterion):
    """
    Negative log-likelihood computed from probabilities.

    `input` is clipped to [EPS, 1 - EPS] before the logarithm (forward) and
    before the division (backward).
    """

    # Clipping margin that keeps log() and the division away from 0.
    EPS = 1e-15

    def __init__(self):
        super(ClassNLLCriterionUnstable, self).__init__()

    def updateOutput(self, input, target):
        """Loss value: -sum(target * log(clipped input)) / input.shape[0]."""
        probs = np.clip(input, self.EPS, 1 - self.EPS)
        batch_size = probs.shape[0]
        weighted_log = target * np.log(probs)
        self.output = (-1 / batch_size) * np.sum(weighted_log)
        return self.output

    def updateGradInput(self, input, target):
        """Gradient w.r.t. `input`: -(target / clipped input) / input.shape[0]."""
        probs = np.clip(input, self.EPS, 1 - self.EPS)
        batch_size = probs.shape[0]
        self.gradInput = (-1 / batch_size) * (target / probs)
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterionUnstable"

## 15. (0.3) Negative LogLikelihood criterion (numerically stable)
- input:   **`batch_size x n_feats`** - log probabilities
- target: **`batch_size x n_feats`** - one-hot representation of ground truth
- output: **scalar**

The task is similar to the previous one, but now the criterion input is the output of a log-softmax layer. This decomposition allows us to avoid the numerical problems of computing log() in the forward and backward passes.

In [24]:
class ClassNLLCriterion(Criterion):
    """
    Negative log-likelihood computed directly from `input`, with no
    logarithm or clipping applied inside the criterion.
    """

    def __init__(self):
        super(ClassNLLCriterion, self).__init__()

    def updateOutput(self, input, target):
        """Loss value: -sum(target * input) / input.shape[0]."""
        batch_size = input.shape[0]
        picked = target * input
        self.output = (-1 / batch_size) * np.sum(picked)
        return self.output

    def updateGradInput(self, input, target):
        """Gradient w.r.t. `input`: -target / input.shape[0]."""
        batch_size = input.shape[0]
        self.gradInput = -target / batch_size
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterion"
        

# Задание 

1-я часть задания: реализация слоев, лосей и функций активации 
2-я часть задания: реализация моделей на своих классах. Что должно быть:
  1. Выберите оптимизатор и реализуйте его, чтобы он работал с вашими классами.
  2. Модель для задачи мультирегрессии на выбранных вами данных. Использовать FCNN, dropout, batchnorm, MSE. Пробуйте различные функции активации. Для первой модели попробуйте большую, среднюю и маленькую модель.

Дополнительно в оценке каждой модели будет учитываться:
1. Наличие правильно выбранной метрики и лосс функции.
2. Отрисовка графиков лосей и метрик на трейне-валидации. Проверка качества модели на тесте.
3. Наличие шедулера для lr.
4. Наличие вормапа.
5. Наличие механизма ранней остановки и сохранение лучшей модели.
6. Свитч лося (метрики) и оптимайзера.

## вторая часть задания реализована в папке: часть 2(гусева)