# In [None]:
import numpy as np

class Module(object):
    """
    Base building block of a network.

    A module is a black box that turns `input` data into `output` data
    through a function called `forward`:

        output = module.forward(input)

    A module must also support the backward pass, i.e. differentiate its
    `forward` function. Because modules are meant to be chained, the backward
    pass receives the gradient coming from the following step of the chain
    rule:

        gradInput = module.backward(input, gradOutput)
    """
    def __init__(self):
        self.training = True    # mode flag toggled by `train` / `evaluate`
        self.gradInput = None   # result of the last `updateGradInput`
        self.output = None      # result of the last `updateOutput`

    def forward(self, input):
        """
        Computes the output of the module for the given `input` by delegating
        to `updateOutput`, and returns whatever `updateOutput` returns.
        """
        result = self.updateOutput(input)
        return result

    def backward(self, input, gradOutput):
        """
        Runs one backpropagation step through the module for the given `input`.

        Two things happen, in this order:
         - `updateGradInput` computes the gradient w.r.t. `input`
           (needed to continue backpropagation),
         - `accGradParameters` computes the gradient w.r.t. the parameters
           (needed by the optimizer).

        Returns the `gradInput` field.
        """
        self.updateGradInput(input, gradOutput)
        self.accGradParameters(input, gradOutput)
        return self.gradInput

    def updateOutput(self, input):
        """
        Computes the output from `input` and the current parameters.

        Subclasses must both store the result in the `output` field and
        return it. The simplest possible override would be:

            self.output = input
            return self.output

        The base implementation does nothing and returns None.
        """
        return None

    def updateGradInput(self, input, gradOutput):
        """
        Computes the gradient of the module with respect to its own input.

        The shape of `gradInput` is always the same as the shape of `input`.
        Subclasses must both store the result in the `gradInput` field and
        return it. The simplest possible override would be:

            self.gradInput = gradOutput
            return self.gradInput

        The base implementation does nothing and returns None.
        """
        return None

    def accGradParameters(self, input, gradOutput):
        """
        Computes the gradient of the module with respect to its own
        parameters. Modules without parameters (e.g. ReLU) do not need to
        override it.
        """
        return None

    def zeroGradParameters(self):
        """
        Resets the parameter gradients to zero if the module has parameters.
        """
        return None

    def getParameters(self):
        """
        Returns a list with the parameters of the module
        (an empty list when there are none).
        """
        return list()

    def getGradParameters(self):
        """
        Returns a list with the gradients w.r.t. the parameters of the module
        (an empty list when there are none).
        """
        return list()

    def train(self):
        """
        Switches the module to training mode.
        Dropout and BatchNorm behave differently in training and testing.
        """
        self.training = True

    def evaluate(self):
        """
        Switches the module to evaluation mode.
        Dropout and BatchNorm behave differently in training and testing.
        """
        self.training = False

    def __repr__(self):
        """
        Pretty printing. Override in every module to get a readable
        description.
        """
        return "Module"

# In [None]:
class Sequential(Module):
    """
    Container that processes `input` data sequentially: every module is
    applied to the output of the previous one, in the order the modules
    were added.
    """

    def __init__(self):
        super(Sequential, self).__init__()
        self.modules = []

    def add(self, module):
        """
        Appends a module to the end of the chain.
        """
        self.modules.append(module)

    def updateOutput(self, input):
        """
        Feeds `input` through all modules in order:

            y_0    = module[0].forward(input)
            y_1    = module[1].forward(y_0)
            ...
            output = module[n-1].forward(y_{n-2})

        With no modules added, the input is returned unchanged.
        """
        current_input = input
        for module in self.modules:
            current_input = module.forward(current_input)
        self.output = current_input
        return self.output

    def backward(self, input, gradOutput):
        """
        Backpropagates `gradOutput` through the modules in reverse order:

            g_{n-1}   = module[n-1].backward(y_{n-2}, gradOutput)
            g_{n-2}   = module[n-2].backward(y_{n-3}, g_{n-1})
            ...
            gradInput = module[0].backward(input, g_1)

        The input of module `i` is the stored `output` of module `i-1`, so
        `forward` must have been called on the same `input` beforehand.
        Each child's `backward` is called (not just `updateGradInput`), so
        parameter gradients are accumulated as well.
        """
        current_grad = gradOutput
        for i in range(len(self.modules) - 1, -1, -1):
            # The first module consumed the container's own input; every
            # other module consumed the cached output of its predecessor.
            current_input = input if i == 0 else self.modules[i - 1].output
            current_grad = self.modules[i].backward(current_input, current_grad)
        self.gradInput = current_grad
        return self.gradInput

    def zeroGradParameters(self):
        """
        Zeroes the parameter gradients of every contained module.
        """
        for module in self.modules:
            module.zeroGradParameters()

    def getParameters(self):
        """
        Gathers the parameters of all modules: one list per module.
        """
        return [x.getParameters() for x in self.modules]

    def getGradParameters(self):
        """
        Gathers the gradients w.r.t. parameters of all modules:
        one list per module.
        """
        return [x.getGradParameters() for x in self.modules]

    def __repr__(self):
        string = "".join([str(x) + '\n' for x in self.modules])
        return string

    def __getitem__(self, x):
        return self.modules.__getitem__(x)

    def train(self):
        """
        Propagates training mode to all modules.
        """
        self.training = True
        for module in self.modules:
            module.train()

    def evaluate(self):
        """
        Propagates evaluation mode to all modules.
        """
        self.training = False
        for module in self.modules:
            module.evaluate()




# In [None]:
class Linear(Module):
    """
    Fully connected layer: output = input . W^T + b.

    `W` is stored with shape (n_out, n_in) and `b` with shape (n_out,).
    The input is expected to be 2D with one sample per row, i.e. of shape
    (batch, n_in); the output then has shape (batch, n_out).
    """

    def __init__(self, n_in, n_out):
        super(Linear, self).__init__()
        # Uniform initialization scaled by the fan-in.
        stdv = 1. / np.sqrt(n_in)
        # (n_out, n_in): this is the layout that the forward pass (`W.T`),
        # the backward pass, `gradW` and `__repr__` all rely on.
        self.W = np.random.uniform(-stdv, stdv, size=(n_out, n_in))
        self.b = np.random.uniform(-stdv, stdv, size=n_out)

        self.gradW = np.zeros_like(self.W)
        self.gradb = np.zeros_like(self.b)

    def updateOutput(self, input):
        """
        Computes and stores `input . W^T + b`.
        """
        self.output = np.dot(input, self.W.T) + self.b
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes and stores the gradient w.r.t. the input: `gradOutput . W`.
        """
        self.gradInput = np.dot(gradOutput, self.W)
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Accumulates the gradients w.r.t. `W` and `b` into `gradW` / `gradb`.
        Call `zeroGradParameters` to reset them.
        """
        self.gradW += np.dot(gradOutput.T, input)
        self.gradb += np.sum(gradOutput, axis=0)

    def zeroGradParameters(self):
        self.gradW.fill(0)
        self.gradb.fill(0)

    def getParameters(self):
        return [self.W, self.b]

    def getGradParameters(self):
        return [self.gradW, self.gradb]

    def __repr__(self):
        s = self.W.shape
        q = 'Linear %d -> %d' % (s[1], s[0])
        return q

# In [None]:
class SoftMax(Module):
    """
    Row-wise softmax: softmax(x)_i = exp(x_i) / sum_j exp(x_j).

    Normalization runs along axis 1, so the input is expected to be 2D
    with one sample per row.
    """

    def __init__(self):
        super(SoftMax, self).__init__()

    def updateOutput(self, input):
        """
        Computes and stores the softmax of every row of `input`.
        """
        # Subtracting the row maximum does not change the result but keeps
        # the exponentials from overflowing.
        shifted = np.subtract(input, input.max(axis=1, keepdims=True))
        exps = np.exp(shifted)
        self.output = exps / np.sum(exps, axis=1, keepdims=True)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes and stores the gradient w.r.t. the input, using the output
        cached by the last forward pass:

            gradInput_i = s_i * (gradOutput_i - sum_j gradOutput_j * s_j)
        """
        dot_product = np.sum(gradOutput * self.output, axis=1, keepdims=True)
        self.gradInput = self.output * (gradOutput - dot_product)
        return self.gradInput

    def __repr__(self):
        return "SoftMax"

class LogSoftMax(Module):
    """
    Row-wise log-softmax: logsoftmax(x)_i = x_i - log(sum_j exp(x_j)).

    Normalization runs along axis 1, so the input is expected to be 2D
    with one sample per row.
    """

    def __init__(self):
        super(LogSoftMax, self).__init__()

    def updateOutput(self, input):
        """
        Computes and stores the log-softmax of every row of `input`.
        """
        # Subtracting the row maximum does not change the result but keeps
        # the exponentials from overflowing.
        shifted = np.subtract(input, input.max(axis=1, keepdims=True))
        log_sum = np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
        self.output = shifted - log_sum
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes and stores the gradient w.r.t. the input, using the output
        cached by the last forward pass:

            gradInput_i = gradOutput_i - softmax_i * sum_j gradOutput_j
        """
        softmax_output = np.exp(self.output)
        sum_grad = np.sum(gradOutput, axis=1, keepdims=True)
        self.gradInput = gradOutput - softmax_output * sum_grad
        return self.gradInput

    def __repr__(self):
        return "LogSoftMax"

class BatchNormalization(Module):
    """
    Normalizes every feature (column) of the input over the batch axis 0.

    In training mode the statistics of the current batch are used and the
    moving statistics are updated as

        moving = moving * alpha + batch * (1 - alpha)

    In evaluation mode the stored moving statistics are used instead, so at
    least one training-mode forward pass must have happened before.
    This module has no learnable parameters.
    """
    EPS = 1e-3

    def __init__(self, alpha=0.):
        super(BatchNormalization, self).__init__()
        self.alpha = alpha
        self.moving_mean = None
        self.moving_variance = None

    def updateOutput(self, input):
        """
        Computes and stores the normalized input.
        """
        if self.training:
            self.batch_mean = np.mean(input, axis=0)
            self.batch_var = np.var(input, axis=0)
            # `std` and `normalized` are cached for the backward pass.
            self.std = np.sqrt(self.batch_var + self.EPS)
            self.normalized = (input - self.batch_mean) / self.std

            # `is None`, not `== None`: once the moving statistics are
            # arrays, `==` would compare element-wise.
            if self.moving_mean is None:
                self.moving_mean = self.batch_mean
                self.moving_variance = self.batch_var
            else:
                self.moving_mean = (self.moving_mean * self.alpha
                                    + self.batch_mean * (1 - self.alpha))
                self.moving_variance = (self.moving_variance * self.alpha
                                        + self.batch_var * (1 - self.alpha))

            self.output = self.normalized
        else:
            self.output = (input - self.moving_mean) / np.sqrt(self.moving_variance + self.EPS)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes and stores the gradient w.r.t. the input.

        In training mode the batch mean and variance depend on the input,
        which gives (with x_hat the normalized input cached by forward):

            gradInput = (g - mean(g) - x_hat * mean(g * x_hat)) / std

        In evaluation mode the statistics are constants, so the gradient is
        just `gradOutput` divided by the moving standard deviation.
        """
        if self.training:
            grad_input = gradOutput / self.std
            grad_input -= np.mean(gradOutput, axis=0) / self.std
            grad_input -= self.normalized * np.mean(gradOutput * self.normalized, axis=0) / self.std
            self.gradInput = grad_input
        else:
            self.gradInput = gradOutput / np.sqrt(self.moving_variance + self.EPS)

        return self.gradInput

    def __repr__(self):
        return "BatchNormalization"

# In [None]:
class ChannelwiseScaling(Module):
    r"""
       Implements linear transform of input y = \gamma * x + \beta
       where \gamma, \beta - learnable vectors of length x.shape[-1]
    """
    def __init__(self, n_out):
        super(ChannelwiseScaling, self).__init__()

        stdv = 1./np.sqrt(n_out)
        self.gamma = np.random.uniform(-stdv, stdv, size=n_out)
        self.beta = np.random.uniform(-stdv, stdv, size=n_out)

        self.gradGamma = np.zeros_like(self.gamma)
        self.gradBeta = np.zeros_like(self.beta)

    def updateOutput(self, input):
        """
        Computes and stores `input * gamma + beta`.
        """
        self.output = input * self.gamma + self.beta
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes and stores the gradient w.r.t. the input: `gradOutput * gamma`.
        """
        self.gradInput = gradOutput * self.gamma
        return self.gradInput

    def accGradParameters(self, input, gradOutput):
        """
        Accumulates the gradients w.r.t. `gamma` and `beta` (summed over
        axis 0) into `gradGamma` / `gradBeta`.
        Call `zeroGradParameters` to reset them.
        """
        self.gradBeta += np.sum(gradOutput, axis=0)
        self.gradGamma += np.sum(gradOutput*input, axis=0)

    def zeroGradParameters(self):
        self.gradGamma.fill(0)
        self.gradBeta.fill(0)

    def getParameters(self):
        return [self.gamma, self.beta]

    def getGradParameters(self):
        return [self.gradGamma, self.gradBeta]

    def __repr__(self):
        return "ChannelwiseScaling"

# In [None]:
class Dropout(Module):
    """
    Inverted dropout.

    In training mode every element of the input is zeroed independently with
    probability `p`, and the surviving elements are divided by `1 - p`.
    In evaluation mode the module is the identity.
    """

    def __init__(self, p=0.5):
        super(Dropout, self).__init__()
        self.p = p          # probability of dropping an element
        self.mask = None    # 0/1 mask drawn by the last training forward pass

    def updateOutput(self, input):
        """
        Computes and stores the output; draws a fresh mask in training mode.
        """
        if self.training:
            # 1 means "keep"; each element is kept with probability 1 - p.
            self.mask = np.random.binomial(1, 1 - self.p, size=input.shape)
            self.output = (input * self.mask) / (1 - self.p)
        else:
            self.output = input

        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes and stores the gradient w.r.t. the input. In training mode
        the mask drawn by the last forward pass is reused.
        """
        if self.training:
            self.gradInput = gradOutput * self.mask / (1 - self.p)
        else:
            self.gradInput = gradOutput

        return self.gradInput

    def __repr__(self):
        return "Dropout"

class ReLU(Module):
    """
    Rectified linear unit: element-wise max(x, 0).
    """
    def __init__(self):
        super(ReLU, self).__init__()

    def updateOutput(self, input):
        """
        Computes and stores the element-wise maximum of `input` and 0.
        """
        rectified = np.maximum(input, 0)
        self.output = rectified
        return rectified

    def updateGradInput(self, input, gradOutput):
        """
        Computes and stores the gradient w.r.t. the input: `gradOutput`
        where the input was strictly positive, zero elsewhere.
        """
        positive = input > 0
        self.gradInput = np.multiply(gradOutput, positive)
        return self.gradInput

    def __repr__(self):
        return "ReLU"

class LeakyReLu(Module):
    """
    Leaky rectified linear unit: x for non-negative x, `slope * x` otherwise.
    """

    def __init__(self, slope = 0.03):
        super(LeakyReLu, self).__init__()
        self.slope = slope

    def updateOutput(self, input):
        """
        Computes and stores the element-wise leaky ReLU of `input`.
        """
        scaled = self.slope * input
        self.output = np.where(input >= 0, input, scaled)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes and stores the gradient w.r.t. the input: `gradOutput`
        multiplied by 1 where the input is strictly positive and by `slope`
        elsewhere.
        """
        local_grad = np.where(input > 0, 1, self.slope)
        self.gradInput = gradOutput * local_grad
        return self.gradInput

    def __repr__(self):
        return "LeakyReLu"

class ELU(Module):
    """
    Exponential linear unit:

        f(x) = x                      for x > 0
        f(x) = alpha * (exp(x) - 1)   otherwise
    """

    def __init__(self, alpha = 1.0):
        super(ELU, self).__init__()
        self.alpha = alpha

    def updateOutput(self, input):
        """
        Computes and stores the element-wise ELU of `input`.
        """
        # np.where evaluates both branches for every element. Clamping the
        # exponent at 0 leaves the selected (x <= 0) values untouched while
        # avoiding overflow on large positive inputs.
        negative_branch = self.alpha * (np.exp(np.minimum(input, 0)) - 1)
        self.output = np.where(input > 0, input, negative_branch)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes and stores the gradient w.r.t. the input: `gradOutput`
        multiplied by 1 where the input is strictly positive and by
        `alpha * exp(x)` elsewhere.
        """
        negative_slope = self.alpha * np.exp(np.minimum(input, 0))
        self.gradInput = gradOutput * np.where(input > 0, 1, negative_slope)
        return self.gradInput

    def __repr__(self):
        return "ELU"

class SoftPlus(Module):
    """
    SoftPlus activation: f(x) = log(1 + exp(x)).
    """

    def __init__(self):
        super(SoftPlus, self).__init__()

    def updateOutput(self, input):
        """
        Computes and stores the element-wise softplus of `input`.
        """
        # log(exp(0) + exp(x)) == log(1 + exp(x)), but logaddexp does not
        # overflow for large x the way exp(x) does.
        self.output = np.logaddexp(0, input)
        return self.output

    def updateGradInput(self, input, gradOutput):
        """
        Computes and stores the gradient w.r.t. the input:
        `gradOutput * sigmoid(input)`.
        """
        # sigmoid(x) = exp(x) / (1 + exp(x)) = exp(x - softplus(x)).
        # The exponent is never positive, so this cannot overflow; the
        # direct ratio turns into inf / inf = nan for large x.
        sigmoid = np.exp(input - np.logaddexp(0, input))
        self.gradInput = gradOutput * sigmoid
        return self.gradInput

    def __repr__(self):
        return "SoftPlus"

class Criterion(object):
    """
    Base class for loss functions. Subclasses implement `updateOutput`
    (the loss value) and `updateGradInput` (its gradient w.r.t. the input).
    """
    def __init__(self):
        self.gradInput = None   # gradient produced by `updateGradInput`
        self.output = None      # loss value produced by `updateOutput`

    def forward(self, input, target):
        """
            Computes the loss for the given `input` and `target` by
            delegating to `updateOutput`, and returns the result.

            For consistency do not override this function;
            put all the code in `updateOutput`.
        """
        loss = self.updateOutput(input, target)
        return loss

    def backward(self, input, target):
        """
            Computes the gradient of the loss for the given `input` and
            `target` by delegating to `updateGradInput`, and returns
            the result.

            For consistency do not override this function;
            put all the code in `updateGradInput`.
        """
        grad = self.updateGradInput(input, target)
        return grad

    def updateOutput(self, input, target):
        """
        Function to override. The base version returns the `output` field.
        """
        return self.output

    def updateGradInput(self, input, target):
        """
        Function to override. The base version returns the `gradInput` field.
        """
        return self.gradInput

    def __repr__(self):
        """
        Pretty printing. Override in every criterion to get a readable
        description.
        """
        return "Criterion"

class MSECriterion(Criterion):
    """
    Squared error summed over all elements and divided by the size of the
    first axis of the input.
    """
    def __init__(self):
        super(MSECriterion, self).__init__()

    def updateOutput(self, input, target):
        """
        Computes and stores sum((input - target)^2) / input.shape[0].
        """
        residual = input - target
        n_rows = input.shape[0]
        self.output = np.sum(np.power(residual, 2)) / n_rows
        return self.output

    def updateGradInput(self, input, target):
        """
        Computes and stores the gradient of the loss w.r.t. `input`:
        2 * (input - target) / input.shape[0].
        """
        residual = input - target
        n_rows = input.shape[0]
        self.gradInput = residual * 2 / n_rows
        return self.gradInput

    def __repr__(self):
        return "MSECriterion"

class ClassNLLCriterionUnstable(Criterion):
    """
    Negative log-likelihood computed from probabilities.

    The loss is -sum(log(input) * target) / input.shape[0]. The input is
    clipped to [EPS, 1 - EPS] before the logarithm and the division, so it
    is presumably a matrix of probabilities, with `target` weighting the
    entries (e.g. one-hot labels) -- confirm against the caller.
    """
    EPS = 1e-15

    def __init__(self):
        super(ClassNLLCriterionUnstable, self).__init__()

    def updateOutput(self, input, target):
        """
        Computes and stores the loss.
        """
        # Clipping keeps log() away from log(0) = -inf.
        input_clamp = np.clip(input, self.EPS, 1 - self.EPS)
        log_probs = np.log(input_clamp)
        self.output = -np.sum(log_probs * target) / input.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """
        Computes and stores the gradient of the loss w.r.t. `input`:
        -target / (clipped input * input.shape[0]).
        """
        input_clamp = np.clip(input, self.EPS, 1 - self.EPS)
        self.gradInput = -target / (input_clamp * input.shape[0])
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterionUnstable"

class ClassNLLCriterion(Criterion):
    """
    Negative log-likelihood computed from values that are used as-is.

    The loss is -sum(input * target) / input.shape[0]; no logarithm is
    taken here, so the input is presumably already log-probabilities
    (e.g. the output of LogSoftMax) -- confirm against the caller.
    """
    def __init__(self):
        super(ClassNLLCriterion, self).__init__()

    def updateOutput(self, input, target):
        """
        Computes and stores the loss.
        """
        self.output = -np.sum(input * target) / input.shape[0]
        return self.output

    def updateGradInput(self, input, target):
        """
        Computes and stores the gradient of the loss w.r.t. `input`:
        -target / input.shape[0].
        """
        self.gradInput = -target / input.shape[0]
        return self.gradInput

    def __repr__(self):
        return "ClassNLLCriterion"

def sgd_momentum(variables, gradients, config, state):
    """
    Performs one step of SGD with momentum, updating `variables` in place.

    `variables` and `gradients` are nested: one inner sequence per layer,
    holding that layer's parameter arrays / gradient arrays. `config` must
    provide 'momentum' and 'learning_rate'. `state` is a dict owned by the
    caller; the per-parameter velocities live in
    `state['accumulated_grads']`, keyed by a flat running index.

    For every parameter:

        velocity = momentum * velocity + learning_rate * grad
        variable = variable - velocity
    """
    velocities = state.setdefault('accumulated_grads', {})

    slot = 0
    for layer_vars, layer_grads in zip(variables, gradients):
        for param, grad in zip(layer_vars, layer_grads):
            # A parameter seen for the first time starts with zero velocity.
            velocity = velocities.setdefault(slot, np.zeros_like(grad))

            # Written through `out=` so the array kept in `state` is updated.
            np.add(config['momentum'] * velocity,
                   config['learning_rate'] * grad,
                   out=velocity)

            param -= velocity
            slot += 1

def adam_optimizer(variables, gradients, config, state):
    """
    Performs one step of Adam, updating `variables` in place.

    `variables` and `gradients` are nested: one inner sequence per layer,
    holding that layer's parameter arrays / gradient arrays. `config` must
    provide 'learning_rate', 'beta1', 'beta2' and 'epsilon' (checked with
    an assert). `state` is a dict owned by the caller; it keeps the first
    moments in `state['m']`, the second moments in `state['v']` (both keyed
    by a flat running index) and the step counter in `state['t']`.
    """
    first_moments = state.setdefault('m', {})
    second_moments = state.setdefault('v', {})
    state.setdefault('t', 0)
    state['t'] += 1
    for k in ['learning_rate', 'beta1', 'beta2', 'epsilon']:
        assert k in config, config.keys()

    beta1 = config['beta1']
    beta2 = config['beta2']
    step = state['t']
    # Bias correction of both moments folded into the learning rate.
    lr_t = config['learning_rate'] * np.sqrt(1 - beta2**step) / (1 - beta1**step)

    slot = 0
    for layer_vars, layer_grads in zip(variables, gradients):
        for param, grad in zip(layer_vars, layer_grads):
            # A parameter seen for the first time starts with zero moments.
            m = first_moments.setdefault(slot, np.zeros_like(grad))
            v = second_moments.setdefault(slot, np.zeros_like(grad))

            # Written through `out=` so the arrays kept in `state` are updated.
            np.add(m * beta1, (1 - beta1) * grad, out=m)
            np.add(v * beta2, (1 - beta2) * grad * grad, out=v)

            param -= lr_t * m / (np.sqrt(v) + config['epsilon'])

            # The moments must still be the very objects stored in `state`.
            assert m is state['m'].get(slot)
            assert v is state['v'].get(slot)
            slot += 1

