In [1]:
import numpy as np
import copy

**Module** is an abstract class which defines the fundamental methods necessary for training a neural network. You do not need to change anything here, just read the comments.

In [44]:
class Module(object):
    """Abstract base class for every layer and container of the network.

    Subclasses implement `updateOutput` (forward computation) and, where
    relevant, `updateGradInput` / `accGradParameters` (backward computation).

    Attributes:
        output: result of the latest forward pass (None until one is run).
        gradInput: gradient w.r.t. the input from the latest backward pass.
        training: boolean mode flag, True for training, False for evaluation.
    """

    def __init__(self):
        self.output = None
        self.gradInput = None
        # This instance attribute shadows the class-level `training` method,
        # so `module.training` is always the boolean flag. Use `train()` to
        # switch back to training mode.
        self.training = True

    def forward(self, input):
        """Run the forward pass; delegates to `updateOutput`."""
        return self.updateOutput(input)

    def updateOutput(self, input):
        """Compute and return the output for `input`; must be overridden.

        Raises:
            NotImplementedError: always, in this base implementation.
        """
        # An explicit raise survives `python -O`, unlike `assert False`.
        raise NotImplementedError("Base version must not be called")

    def backward(self, input, gradOutput):
        """Run the backward pass and return `self.gradInput`.

        Calls `updateGradInput` first and `accGradParameters` second, both
        with the same `input` and `gradOutput`.
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    def updateGradInput(self, input, gradOutput):
        """Hook for the gradient w.r.t. the input; a no-op in the base class."""
        pass

    def accGradParameters(self, input, gradOutput):
        """Hook for the gradient w.r.t. the parameters; a no-op here."""
        pass

    def zeroGradParameters(self):
        """Hook for resetting parameter gradients; a no-op here."""
        pass

    def getParameters(self):
        """Return the list of parameters (empty for a parameterless module)."""
        return []

    def getGradParameters(self):
        """Return the list of parameter gradients (empty by default)."""
        return []

    def train(self):
        """Switch the module to training mode."""
        self.training = True

    def training(self):
        """Class-level alias of `train`.

        On instances this name resolves to the boolean flag set in
        `__init__`, so it is only reachable as `Module.training(module)`.
        """
        self.train()

    def evaluate(self):
        """Switch the module to evaluation mode."""
        self.training = False

    def __repr__(self):
        return "Module"

# Sequential container

**Define** the forward and backward pass procedures.

In [49]:
class Sequential(Module):
    """Container that feeds its input through a chain of modules in order.

    The output of module `i` is the input of module `i + 1`. The per-module
    outputs of the latest forward pass are kept in `self.outputs` because the
    backward pass needs them as the inputs of the following modules.
    An empty container behaves as the identity in both directions.
    """

    def __init__(self):
        super(Sequential, self).__init__()
        self.modules = []
        self.outputs = []

    def add(self, module):
        """Append `module` to the end of the chain."""
        self.modules.append(module)

    def updateOutput(self, input):
        """Forward `input` through every module and return the last output."""
        self.outputs = []
        current = input
        for module in self.modules:
            current = module.forward(current)
            self.outputs.append(current)
        # With no modules `current` is still the untouched input.
        self.output = current
        return self.output

    def backward(self, input, gradOutput):
        """Back-propagate `gradOutput` through the chain in reverse order.

        Requires that `updateOutput` was called with the same `input`, since
        the stored intermediate outputs are reused here.
        Returns the gradient w.r.t. the container input.
        """
        for i in reversed(range(len(self.modules))):
            # Module 0 saw the container input, module i saw output i - 1.
            layer_input = self.outputs[i - 1] if i > 0 else input
            gradOutput = self.modules[i].backward(layer_input, gradOutput)
        self.gradInput = gradOutput
        return self.gradInput

    def zeroGradParameters(self):
        """Reset the parameter gradients of every child module."""
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """Return one parameter list per child module."""
        return [module.getParameters() for module in self.modules]

    def getGradParameters(self):
        """Return one gradient list per child module."""
        return [module.getGradParameters() for module in self.modules]

    def train(self):
        """Switch the container and all of its children to training mode."""
        self.training = True
        for module in self.modules:
            # The boolean `training` flag set in Module.__init__ shadows the
            # method of the same name on instances, so look it up on the class.
            type(module).training(module)

    def training(self):
        """Class-level alias of `train` (shadowed by the flag on instances)."""
        self.train()

    def evaluate(self):
        """Switch the container and all of its children to evaluation mode."""
        self.training = False
        for module in self.modules:
            module.evaluate()

    def __repr__(self):
        return "".join(str(module) + '\n' for module in self.modules)

    def __getitem__(self, x):
        return self.modules[x]

# Layers

- input:   **`batch_size x n_feats1`**
- output: **`batch_size x n_feats2`**

In [70]:
class Linear(Module):
    """Fully connected layer computing `input . W^T + b`.

    `W` has shape `(n_out, n_in)` and `b` has shape `(n_out,)`; both are drawn
    uniformly from `[-1/sqrt(n_in), 1/sqrt(n_in)]`.
    """

    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()
        self.n_in = n_in
        self.n_out = n_out
        stdv = 1 / np.sqrt(n_in)
        self.W = np.random.uniform(-stdv, stdv, size=(n_out, n_in))
        self.b = np.random.uniform(-stdv, stdv, size=n_out)
        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, input):
        """Return the affine transform of a 2-D `input` with `n_in` columns."""
        assert len(input.shape) == 2
        assert input.shape[1] == self.n_in
        self.output = np.dot(input, self.W.T) + self.b[np.newaxis, :]
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return the gradient w.r.t. `input`, i.e. `gradOutput . W`."""
        self.gradInput = np.dot(gradOutput, self.W)
        assert self.gradInput.shape == input.shape
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """Store the gradients w.r.t. `W` and `b` for the given batch.

        NOTE(review): despite the "acc" prefix the stored gradients are
        overwritten, not added to, so results do not depend on a preceding
        `zeroGradParameters` call. Confirm this is intended before relying on
        gradient accumulation across batches.
        """
        self.gradW = np.dot(input.T, gradOutput).T
        self.gradb = np.sum(gradOutput, axis=0)

    def zeroGradParameters(self):
        """Fill both gradient arrays with zeros in place."""
        self.gradW.fill(0)
        self.gradb.fill(0)

    def getParameters(self):
        """Return `[W, b]`."""
        return [self.W, self.b]

    def getGradParameters(self):
        """Return `[gradW, gradb]`."""
        return [self.gradW, self.gradb]

    def __repr__(self):
        # W is stored as (n_out, n_in); print it in data-flow order.
        n_out, n_in = self.W.shape
        return 'Linear %d -> %d' % (n_in, n_out)

This one is probably the hardest, but like the others it takes only about 5 lines of code in total. 
- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [71]:
class SoftMax(Module):
    """Row-wise softmax over a 2-D input."""

    def __init__(self):
        super(SoftMax, self).__init__()

    def updateOutput(self, input):
        """Return the softmax of every row of `input`.

        The row maximum is subtracted before exponentiating so that the
        largest exponent is exactly zero.
        """
        shifted = np.subtract(input, input.max(axis=1, keepdims=True))
        exps = np.exp(shifted)
        self.output = exps / exps.sum(axis=1, keepdims=True)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return the gradient w.r.t. `input`, using the stored output.

        For each row: `gradOutput * y - y * sum(gradOutput * y)`, where `y`
        is the softmax output of that row.
        """
        weighted = np.multiply(gradOutput, self.output)
        row_totals = weighted.sum(axis=1, keepdims=True)
        self.gradInput = weighted - np.multiply(self.output, row_totals)
        return self.gradInput

    def __repr__(self):
        return "SoftMax"

One of the most significant recent ideas that impacted NNs a lot is [**Batch normalization**](http://arxiv.org/abs/1502.03167). The idea is simple, yet effective: the features should be whitened ($mean = 0$, $std = 1$) all the way through the NN. This improves the convergence of deep models, letting you train them in days rather than weeks. **You are** to implement a part of the layer: mean subtraction. That is, the module should calculate the mean value of every feature (every column) and subtract it.

Note that you need to estimate the mean over the dataset to be able to predict on test examples. The right way is to create a variable which holds the smoothed mean over batches (exponential smoothing works well) and use it when forwarding test examples.

When training, calculate the mean as follows: 
```
    mean_to_subtract = self.old_mean * alpha + batch_mean * (1 - alpha)
```
when evaluating (`self.training == False`) set $alpha = 1$.


- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [85]:
class BatchMeanSubtraction(Module):
    """Subtract an exponentially smoothed per-feature mean from the input.

    In training mode the running mean is updated on every forward pass as
    `alpha * old_mean + (1 - alpha) * batch_mean`. In evaluation mode the
    running mean is left untouched and simply subtracted.

    Attributes:
        train_alpha: smoothing factor used in training mode.
        alpha: smoothing factor currently in effect (1.0 after `evaluate`).
        old_mean: running per-feature mean, None until the first forward pass.
    """

    def __init__(self, alpha=0):
        super(BatchMeanSubtraction, self).__init__()
        self.train_alpha = alpha
        self.alpha = alpha
        self.old_mean = None
        # Weight the current batch mean had in the mean subtracted by the
        # latest forward pass; the backward pass needs it.
        self._batch_weight = 0.0

    def updateOutput(self, input):
        """Update the running mean if appropriate and return `input - mean`."""
        if self.old_mean is None:
            # No history yet: bootstrap the running mean from this batch.
            self._batch_weight = 1.0
            self.old_mean = np.mean(input, axis=0)
        else:
            # Evaluation must not move the running mean, whatever `alpha` is.
            alpha = self.alpha if self.training else 1.0
            self._batch_weight = 1.0 - alpha
            if self._batch_weight != 0:
                batch_mean = np.mean(input, axis=0)
                self.old_mean = alpha * self.old_mean + self._batch_weight * batch_mean
        self.output = input - self.old_mean[np.newaxis, :]
        return self.output

    def train(self):
        """Switch to training mode and restore the training smoothing factor."""
        self.training = True
        self.alpha = self.train_alpha

    def training(self):
        """Class-level alias of `train`.

        On instances this name is shadowed by the boolean flag set in
        `Module.__init__`.
        """
        self.train()

    def evaluate(self):
        """Switch to evaluation mode, freezing the running mean."""
        self.training = False
        self.alpha = 1.0

    def evalute(self, input=None):
        """Misspelled legacy name of `evaluate`; `input` is ignored."""
        self.evaluate()

    def updateGradInput(self, input, gradOutput):
        """Return the gradient w.r.t. `input`.

        Each input row contributes `w / batch_size` to the subtracted mean,
        where `w` is the batch-mean weight of the latest forward pass, so the
        gradient is `gradOutput - w * mean(gradOutput, axis=0)`. The previous
        running mean is treated as a constant.
        """
        grad = np.asarray(gradOutput)
        column_mean = np.mean(grad, axis=0)[np.newaxis, :]
        self.gradInput = grad - self._batch_weight * column_mean
        return self.gradInput

    def __repr__(self):
        return "BatchMeanNormalization"

Implement [**dropout**](https://www.cs.toronto.edu/~hinton/absps/JMLRdropout.pdf). The idea and implementation are really simple: just multiply the input by a $Bernoulli(p)$ mask. 

This is a very cool regularizer. In fact, when you see your net is overfitting try to add more dropout.

While training (`self.training == True`) it should sample a mask on each iteration (for every batch). When testing this module should implement identity transform i.e. `self.output = input`.

- input:   **`batch_size x n_feats`**
- output: **`batch_size x n_feats`**

In [74]:
class Dropout(Module):
    """Dropout layer; `p` is the probability of zeroing an element.

    In training mode a fresh 0/1 mask is sampled on every forward pass. In
    evaluation mode every element is scaled by the keep probability `1 - p`.
    """

    def __init__(self, p=0.5):
        super(Dropout, self).__init__()
        self.p = p
        self.mask = None

    def updateOutput(self, input):
        """Return `input` multiplied elementwise by the current mask."""
        keep_prob = 1 - self.p
        if not self.training:
            # Constant mask: scale by the expected value of the random mask.
            self.mask = np.full(input.shape, keep_prob)
        else:
            # 0 is drawn with probability p, 1 with probability 1 - p.
            self.mask = np.random.choice(2, input.shape, p=[self.p, keep_prob])
        self.output = np.multiply(input, self.mask)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return `gradOutput` multiplied by the mask of the forward pass."""
        masked_grad = np.multiply(self.mask, gradOutput)
        self.gradInput = masked_grad
        return self.gradInput

    def __repr__(self):
        return "Dropout"

# Activation functions

In [75]:
class Tanh(Module):
    """Elementwise hyperbolic tangent activation."""

    def __init__(self):
        super(Tanh, self).__init__()

    def updateOutput(self, input):
        """Return `tanh(input)` elementwise."""
        self.output = np.tanh(input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return the gradient w.r.t. `input`, using the stored output.

        The derivative of tanh is `1 - tanh(x) ** 2`, applied elementwise;
        the result has the same shape as `gradOutput`.
        """
        # Elementwise product, not a matrix product: the activation acts on
        # every element independently.
        self.gradInput = np.multiply(gradOutput, 1 - self.output ** 2)
        return self.gradInput

    def __repr__(self):
        return "Tanh"

Here's the complete example for the **Rectified Linear Unit** non-linearity (aka **ReLU**): 

In [55]:
class ReLU(Module):
    """Rectified linear unit: elementwise `max(x, 0)`."""

    def __init__(self):
        super(ReLU, self).__init__()

    def updateOutput(self, input):
        """Return `input` with every negative element replaced by zero."""
        rectified = np.maximum(input, 0)
        self.output = rectified
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return `gradOutput` where `input` is positive and zero elsewhere."""
        is_positive = input > 0
        self.gradInput = np.multiply(gradOutput, is_positive)
        return self.gradInput

    def __repr__(self):
        return "ReLU"

Implement [**Leaky Rectified Linear Unit**](http://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29%23Leaky_ReLUs). Experiment with the slope. 

In [56]:
class LeakyReLU(Module):
    """Leaky ReLU: negative elements are multiplied by `slope`."""

    def __init__(self, slope=0.03):
        super(LeakyReLU, self).__init__()
        self.slope = slope

    def updateOutput(self, input):
        """Return a copy of `input` with negative elements scaled by `slope`."""
        activated = np.array(input)  # copy, so the caller's array is untouched
        negative = activated < 0
        activated[negative] *= self.slope
        self.output = activated
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return a copy of `gradOutput` scaled by `slope` where `input < 0`."""
        grad = np.array(gradOutput)
        grad[input < 0] *= self.slope
        self.gradInput = grad
        return self.gradInput

    def __repr__(self):
        return "LeakyReLU"

Implement [**Exponential Linear Units**](http://arxiv.org/abs/1511.07289) activations.

In [57]:
class ELU(Module):
    """Exponential linear unit: `x` for `x >= 0`, `alpha * (exp(x) - 1)` below."""

    def __init__(self, alpha=1.0):
        super(ELU, self).__init__()
        self.alpha = alpha

    def updateOutput(self, input):
        """Return the ELU of a copy of `input`.

        The boolean mask of negative elements is kept in `self.mask` for the
        backward pass.
        """
        activated = np.array(input)
        negative = activated < 0
        activated[negative] = self.alpha * (np.exp(activated[negative]) - 1)
        self.mask = negative
        self.output = activated
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return the gradient w.r.t. `input`.

        Uses the mask stored by the latest forward pass: masked elements of
        `gradOutput` are scaled by `alpha * exp(input)`, the rest pass through.
        """
        grad = np.array(gradOutput)
        negative = self.mask
        grad[negative] *= self.alpha * np.exp(input[negative])
        self.gradInput = grad
        return self.gradInput

    def __repr__(self):
        return "ELU"

Implement [**SoftPlus**](https://en.wikipedia.org/wiki%2FRectifier_%28neural_networks%29) activations. Notice how closely they resemble ReLU.

In [58]:
class SoftPlus(Module):
    """SoftPlus activation: elementwise `log(1 + exp(x))`."""

    def __init__(self):
        super(SoftPlus, self).__init__()

    def updateOutput(self, input):
        """Return `log(1 + exp(input))` elementwise.

        Computed as `logaddexp(0, x)`, which stays finite for large `x`
        where a literal `exp(x)` would overflow to infinity.
        """
        self.output = np.logaddexp(0, input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """Return `gradOutput * sigmoid(input)` elementwise.

        The sigmoid is evaluated as `exp(-log(1 + exp(-x)))` so that very
        negative inputs underflow to zero instead of overflowing in `exp`.
        """
        sigmoid = np.exp(-np.logaddexp(0, -input))
        self.gradInput = np.multiply(gradOutput, sigmoid)
        return self.gradInput

    def __repr__(self):
        return "SoftPlus"

# Criterions

Criterions are used to score the model's answers. 

In [76]:
class Criterion(object):
    """Base class for loss functions.

    Attributes:
        output: loss value stored by the criterion (None initially).
        gradInput: gradient of the loss w.r.t. the input (None initially).
    """

    def __init__(self):
        self.output = None
        self.gradInput = None

    def forward(self, input, target):
        """
            Compute the loss for `input` against `target` and return it.

            Do not override this method; put the computation
            into `updateOutput`, which it delegates to.
        """
        loss = self.updateOutput(input, target)
        return loss

    def backward(self, input, target):
        """
            Compute the gradient of the loss with respect to `input`
            and return it.

            Do not override this method; put the computation
            into `updateGradInput`, which it delegates to.
        """
        grad = self.updateGradInput(input, target)
        return grad

    def updateOutput(self, input, target):
        """
        Hook to be overridden. The base version returns the stored `output`.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Hook to be overridden. The base version returns the stored `gradInput`.
        """
        return self.gradInput

    def __repr__(self):
        """
        Human-readable name. Override it in every subclass that should
        print a readable description.
        """
        return "Criterion"

The **MSECriterion**, which is the basic L2 loss usually used for regression, is implemented here for you.

In [83]:
class MSECriterion(Criterion):
    """Squared-error loss: sum of squared differences divided by batch size."""

    def __init__(self):
        super(MSECriterion, self).__init__()

    def updateOutput(self, input, target):
        """Return `sum((input - target) ** 2) / input.shape[0]`."""
        residual = input - target
        batch_size = input.shape[0]
        self.output = np.sum(np.power(residual, 2)) / batch_size
        return self.output

    def updateGradInput(self, input, target):
        """Return `2 * (input - target) / input.shape[0]`."""
        residual = input - target
        batch_size = input.shape[0]
        self.gradInput = 2 * residual / batch_size
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

Your task is to implement the **ClassNLLCriterion**. It should implement [multiclass log loss](https://www.kaggle.com/wiki/MultiClassLogLoss). Although there is a sum over `y` (target) in that formula, 
remember that targets are one-hot encoded. This fact simplifies the computations a lot. Note that criterions are the only places where you divide by the batch size. 

In [82]:
class ClassNLLCriterion(Criterion):
    """Multiclass log loss, averaged over the batch (first axis).

    `input` is clamped to `[EPS, 1 - EPS]` before use so that the logarithm
    and the division stay finite.
    """

    # Clamping bound applied to `input` in both passes.
    EPS = 1e-15

    def __init__(self):
        super(ClassNLLCriterion, self).__init__()

    def _clamp(self, input):
        """Return `input` limited elementwise to `[EPS, 1 - EPS]`."""
        return np.maximum(self.EPS, np.minimum(input, 1 - self.EPS))

    def updateOutput(self, input, target):
        """Return `sum(target * -log(clamped input)) / input.shape[0]`."""
        input_clamp = self._clamp(input)
        self.output = np.sum(np.multiply(target, -np.log(input_clamp))) / input.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """Return `-(target / clamped input) / input.shape[0]`."""
        input_clamp = self._clamp(input)
        self.gradInput = -np.divide(target, input_clamp) / input.shape[0]
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterion"