In [1]:
%matplotlib inline
import copy
import functools
import os
import pickle as pkl
from collections import Counter
from collections import OrderedDict
from time import time, sleep

import matplotlib.pyplot as plt
import numpy as np
from IPython import display
from sklearn.datasets import fetch_mldata
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

def rel_error(x, y):
    """Return the largest element-wise relative error between ``x`` and ``y``.

    Each difference |x - y| is divided by |x| + |y|; that denominator is
    floored at 1e-8 so entries where both values are zero do not divide by zero.
    """
    abs_diff = np.abs(x - y)
    scale = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(abs_diff / scale)

<a id='toc'></a>
* [1. Modules](#modules)
    * [1.1 Base objects](#base)
        * [1.1.1 Layer](#layer)
        * [1.1.2 Sequential](#seq)
        * [1.1.3 Model](#model)
    * [1.2 Helpers](#helpers)
        * [1.2.1 Initializers](#initializers)
        * [1.2.2 Regularizers](#regularizers)
        * [1.2.3 Gradients checker](#grad_checker)
    * [1.3 Layers](#layers)
        * [1.3.1 Dense](#dense)
            * [1.3.1.1 Dense: forward](#dense_forward)
            * [1.3.1.2 Dense: backward](#dense_backward)
        * [1.3.2 Softmax](#softmax)
        * [1.3.3 Dropout](#dropout)
            * [1.3.3.1 Dropout: forward](#dropout_forward)
            * [1.3.3.2 Dropout: backward](#dropout_backward)
        * [1.3.4 BatchNormalization](#bn)
            * [1.3.4.1 BatchNormalization: forward](#bn_forward)
            * [1.3.4.2 BatchNormalization: backward](#bn_backward)
    * [1.4 Criterions](#criterions)        
    * [1.5 Optimizers](#optimizers)
    * [1.6 Solver](#solver)

<a id='layer'></a>
### 1.1.1 Layer [[toc]](#toc)

In [17]:
def dtype_conversion(fn):
    """Method decorator: cast every positional and keyword argument to ``self.dtype``.

    The owning object must expose a ``dtype`` attribute. Arguments are first
    turned into arrays, so plain lists and scalars are accepted as well as
    ndarrays. An ndarray that already has the requested dtype is passed through
    as the very same object (no copy), which lets callers keep mutating it in
    place between calls.

    The wrapper keeps the wrapped method's name and docstring.
    """
    @functools.wraps(fn)
    def fn_(self=None, *args, **kwargs):
        dtype = self.dtype

        def cast(value):
            # asanyarray leaves ndarrays (and subclasses) untouched;
            # astype(copy=False) only copies when the dtype really changes.
            return np.asanyarray(value).astype(dtype, copy=False)

        return fn(self,
                  *[cast(arg) for arg in args],
                  **{k: cast(v) for k, v in kwargs.items()})
    return fn_

def check_initialized(fn):
    """Method decorator: refuse to run the method until ``self.initialized`` is truthy.

    Raises AssertionError (naming the object) when the flag is falsy; otherwise
    the call is forwarded unchanged and its result returned.
    """
    def wrapper(self=None, *args, **kwargs):
        ready = self.initialized
        # The message is only formatted when the assertion actually fails.
        assert ready, 'Object {} must be initialized to call its methods.'.format(self)
        result = fn(self, *args, **kwargs)
        return result
    return wrapper

class A:
    """Toy class used to try out ``check_initialized`` and ``dtype_conversion``."""
    def __init__(self, dtype=np.float32, initialized=True):
        self.initialized = initialized
        self.dtype = dtype

    @check_initialized
    @dtype_conversion
    def sum(self, a, b):
        """Print the dtypes both operands arrive with and return their sum."""
        print(a.dtype, b.dtype)
        total = a + b
        return total

# Demo of the decorators: the instance is built with initialized=False, so the
# decorated call below is expected to trip the assertion in check_initialized.
np.random.seed(45)
a = np.random.randn(5)
b = np.random.randn(5)
s = A(np.float32, False)
s.sum(a, b)

AssertionError: Object <__main__.A object at 0x7fb25516dac8> must be initialized to call its methods.

In [18]:
# %load ../../ml/neural_network/sequential/layer.py

#import numpy as np
#from collections import OrderedDict

class Layer:
    """Base class of every network layer.

    A layer keeps its latest ``output`` (the next backward pass needs it) and
    its latest ``grad_input``. Subclasses implement ``update_output`` and
    ``update_grad_input`` (plus ``update_grad_param`` when they own trainable
    parameters). ``forward`` / ``backward`` wrap those hooks with the
    initialization guard, dtype conversion, optional NaN/Inf checks and the
    user-supplied enter/exit callbacks.
    """
    # Checks
    def _assert_nans(self, arr):
        assert not np.any(np.isnan(arr)), 'NaNs detected!'
    def _assert_infs(self, arr):
        assert not np.any(np.isinf(arr)), 'Infs detected!'
    def _check_arrays(self, *arrays):
        """Run the NaN/Inf assertions on every array, but only in debug mode."""
        if self.debug:
            for arr in arrays:
                self._assert_nans(arr)
                self._assert_infs(arr)
    # Public spellings kept for code that calls the checks without the underscore.
    def assert_nans(self, arr):
        self._assert_nans(arr)
    def assert_infs(self, arr):
        self._assert_infs(arr)

    def __init__(self):
        self.output = None       # Output of the layer is always kept for backpropagation
        self.grad_input = None   # Input gradient is saved just in case
        self.training = True     # Training mode by default; see train() / evaluate()

        # Callbacks default to no-ops so forward()/backward() can call them blindly.
        self.forward_enter_call  = lambda: None
        self.forward_exit_call   = lambda: None
        self.backward_enter_call = lambda: None
        self.backward_exit_call  = lambda: None

        self.dtype = None         # Must be set during initialization
        self.debug = False        # Must be set during initialization
        self.initialized = False  # Must be set to True after initialization

    def __repr__(self):
        return type(self).__name__

    # Setting callbacks
    def set_forward_enter_call(self, callback=lambda: None):
        self.forward_enter_call = callback
    def set_forward_exit_call(self, callback=lambda: None):
        self.forward_exit_call = callback
    def set_backward_enter_call(self, callback=lambda: None):
        self.backward_enter_call = callback
    def set_backward_exit_call(self, callback=lambda: None):
        self.backward_exit_call = callback
    def set_fcall(self, callback=lambda: None):
        """Legacy name: the callback runs after the forward computation."""
        self.set_forward_exit_call(callback)
    def set_bcall(self, callback=lambda: None):
        """Legacy name: the callback runs after the backward computation."""
        self.set_backward_exit_call(callback)

    @staticmethod
    def _run_callback(callback):
        """Invoke a callback; ``None`` is tolerated and means "nothing to do"."""
        if callback is not None:
            callback()

    # Initialization
    def initialize(self, params):
        """Called during the compilation process to initialize the layer.

        ``params`` is the dict threaded through all layers; it is updated in
        place and returned so the next layer can pick it up.
        """
        params = self._initialize(params)
        self.initialized = True
        return params
    def _initialize(self, params):
        """Layer-specific setup; subclasses call ``super()._initialize(params)`` first."""
        params = self._check_initialization_params(params)
        params = self._initialize_name(params)
        self.debug = params['debug']
        self.dtype = params['dtype']
        return params
    def _check_initialization_params(self, params):
        """Require 'input_shape' and fill in defaults for the optional keys."""
        assert 'input_shape' in params, 'Input shape must be provided.' # This is probably not critical
        params.setdefault('seed', 0)
        params.setdefault('names', {})
        params.setdefault('debug', False)
        params.setdefault('dtype', np.float64)
        return params
    def _initialize_name(self, params):
        """Name the layer '<ClassName><k>', k being the count of earlier layers of that class."""
        names = params['names']
        layer_type_name = type(self).__name__
        n_layers = names.setdefault(layer_type_name, 0)
        self.name = layer_type_name + str(n_layers)
        names[layer_type_name] += 1
        return params

    # Forward propagation
    @check_initialized
    @dtype_conversion
    def forward(self, input):
        """Compute and return ``self.output`` for ``input``."""
        self._run_callback(self.forward_enter_call)
        self._check_arrays(input)                    # Only active in debug mode
        self.update_output(input)                    # Finding output tensor
        self._run_callback(self.forward_exit_call)   # Callback after forward propagation
        return self.output
    def update_output(self, input):
        """Must update self.output"""
        pass

    # Backward propagation
    @check_initialized
    @dtype_conversion
    def backward(self, input, grad_output):
        """Compute and return ``self.grad_input``; also refreshes parameter gradients.

        Both arguments have already been cast to ``self.dtype`` by the decorator.
        """
        self._run_callback(self.backward_enter_call)
        self._check_arrays(input, grad_output)       # Only active in debug mode
        # Backprop
        self.update_grad_input(input, grad_output)   # This updates self.grad_input
        self.update_grad_param(input, grad_output)
        self._run_callback(self.backward_exit_call)  # Callback after backward propagation
        return self.grad_input
    def update_grad_input(self, input, grad_output):
        """Must update self.grad_input"""
        pass
    def update_grad_param(self, input, grad_output):
        """Must update the parameter gradients; nothing to do for parameter-free layers."""
        pass

    # Regularization
    @check_initialized
    def get_regularization_loss(self):
        return 0.0

    # Getting params and gradients
    @check_initialized
    def get_params(self):
        return OrderedDict()
    @check_initialized
    def get_grad_params(self):
        return OrderedDict()
    @check_initialized
    def set_params(self, new_params):
        """Copy the given values into the layer's existing parameter arrays."""
        params = self.get_params()
        for param_name in new_params:
            np.copyto(params[param_name], new_params[param_name])
    @check_initialized
    def zero_grad_params(self):
        """Reset every gradient array reported by get_grad_params() to zero, in place."""
        for grad in self.get_grad_params().values():
            grad.fill(0)

    # Changing network mode
    @check_initialized
    def train(self):
        self.training = True
    @check_initialized
    def evaluate(self):
        self.training = False

<a id='seq'></a>
### 1.1.2 Sequential [[toc]](#toc)

In [403]:
# %load ../../ml/neural_network/sequential/sequential.py

#import numpy as np
#import copy
#from collections import OrderedDict
#from .layer import Layer

class Sequential(Layer):
    """Container layer that applies its child layers one after another."""
    def __init__(self):
        super().__init__()
        self.layers = []
    def __repr__(self):
        return '->'.join([str(layer) for layer in self.layers])
    def __getitem__(self, n):
        return self.layers[n]

    def add(self, layer):
        """Append ``layer`` (must be a Layer) to the end of the chain."""
        assert isinstance(layer, Layer)
        self.layers.append(layer)

    # Initialization
    def compile(self, config):
        """
        Compilation stage for all layers in the network:
            sets random seeds
            sets dtypes
            sets names
            runs parameters initialization

        Not implemented yet: the method has no body besides this docstring and
        returns None. Use ``initialize`` to set the layers up.
        """

    def initialize(self, params):
        """Initialize every child layer in order, threading ``params`` through them.

        The container itself also records ``debug`` and ``dtype`` and is marked
        initialized, so its own forward/backward/train/evaluate become usable.
        """
        params = self._check_initialization_params(params)
        self.debug = params['debug']
        self.dtype = params['dtype']
        for layer in self.layers:
            params = layer.initialize(params)
        self.initialized = True
        return params

    # Forward propagation
    def update_output(self, input):
        """This function passes input through all layers and saves output.

        With no child layers the input is passed through unchanged.
        """
        output = input
        for layer in self.layers:
            output = layer.forward(output)
        self.output = output
        return self.output

    # Backward propagation
    def backward(self, input, grad_output):
        """Back-propagate ``grad_output`` through the layers in reverse order.

        Each layer receives as its input the stored output of the layer before
        it; the first layer receives ``input``. With no child layers the
        gradient is passed through unchanged.
        """
        for n_layer in range(len(self.layers) - 1, -1, -1):
            layer_input = input if n_layer == 0 else self.layers[n_layer - 1].output
            grad_output = self.layers[n_layer].backward(layer_input, grad_output)
        self.grad_input = grad_output
        return self.grad_input

    # Get params and their gradients
    def get_params(self):
        """Return the parameters of all child layers merged into one OrderedDict."""
        assert self.initialized
        params = OrderedDict()
        for layer in self.layers:
            for param_name, param_value in layer.get_params().items():
                params[param_name] = param_value
        return params
    def get_grad_params(self):
        """Return the parameter gradients of all child layers merged into one OrderedDict."""
        assert self.initialized
        grad_params = OrderedDict()
        for layer in self.layers:
            for grad_name, grad_value in layer.get_grad_params().items():
                grad_params[grad_name] = grad_value
        return grad_params
    def zero_grad_params(self):
        """Ask every child layer that supports it to zero its parameter gradients."""
        assert self.initialized
        for layer in self.layers:
            # Parameter-free layers may not define the method at all.
            zero = getattr(layer, 'zero_grad_params', None)
            if zero is not None:
                zero()

    # Regularization
    def get_regularization_loss(self):
        loss = 0.0
        for layer in self.layers:
            loss += layer.get_regularization_loss()
        return loss

    def train(self):
        """Sets all layers to training mode"""
        assert self.initialized
        for layer in self.layers:
            layer.train()
    def evaluate(self):
        """Sets all layers to evaluation mode"""
        assert self.initialized
        for layer in self.layers:
            layer.evaluate()

<a id='model'></a>
### 1.1.3 Model [[toc]](#toc)

In [None]:
class Model:
    """Pairs a network with the criterion used to score it (work in progress)."""
    def __init__(self, sequential, criterion):
        # Minimal body so the cell parses: just keep both collaborators.
        self.sequential = sequential
        self.criterion = criterion

<a id='helpers'></a>
## 1.2 Helper objects [[toc]](#toc)

<a id='initializers'></a>
### 1.2.1 Initializers [[toc]](#toc)

In [404]:
# %load ../../ml/neural_network/initializers/initializers.py

import numpy as np


class Initializer:
    """Common base class of the parameter initializers."""
    def __init__(self):
        """Nothing to set up here; subclasses add their own state."""

    
class DeterministicInitializer(Initializer):
    """Initializer that always returns a fixed, user-supplied array."""
    def __init__(self, init_value):
        self.init_value = init_value
    def __call__(self, shape=None, dtype=np.float32):
        # ``shape`` is accepted for interface compatibility but not used.
        value = self.init_value
        return value.astype(dtype)


class RandomInitializer(Initializer):
    """Base for initializers that draw from their own seeded ``RandomState``."""
    def __init__(self, seed=None):
        super().__init__()
        rng = np.random.RandomState(seed)
        self.gen = rng

        
class ZerosInitializer(Initializer):
    """Initializer producing zeros: a scalar 0.0 without a shape, else a zero array."""
    def __init__(self):
        super().__init__()
    def __call__(self, shape=None, dtype=np.float32):
        return 0.0 if shape is None else np.zeros(shape, dtype=dtype)


class UniformInitializer(RandomInitializer):
    """Draws values uniformly from ``[-limit, limit)``.

    ``limit`` is 1/sqrt(shape[0]) for 2-D shapes, 1/sqrt(prod(shape[1:])) for
    4-D shapes and 1 otherwise. ``shape`` must be a sequence.

    This class samples from a uniform distribution, so it carries its own name
    instead of sharing ``NormalInitializer`` with the Gaussian initializer.
    """
    def __init__(self, seed=None):
        super().__init__(seed=seed)
    def __call__(self, shape=None, dtype=np.float32):
        limit = 1.0
        if len(shape) == 2:
            limit = 1. / np.sqrt(shape[0])
        if len(shape) == 4:
            limit = 1.0 / np.sqrt(np.prod(shape[1:]))
        return self.gen.uniform(-limit, limit, size=shape).astype(dtype)


class NormalInitializer(RandomInitializer):
    """Draws values from a zero-mean Gaussian.

    The standard deviation is 1/sqrt(shape[0]) for 2-D shapes,
    1/sqrt(prod(shape[1:])) for 4-D shapes and 1 otherwise.
    """
    def __init__(self, seed=None):
        super().__init__(seed=seed)

    def __call__(self, shape=None, dtype=np.float32):
        rank = len(shape)
        if rank == 4:
            scale = 1.0 / np.sqrt(np.prod(shape[1:]))
        elif rank == 2:
            scale = 1. / np.sqrt(shape[0])
        else:
            scale = 1.0
        sample = self.gen.normal(loc=0, scale=scale, size=shape)
        return sample.astype(dtype)

<a id='regularizers'></a>
### 1.2.2 Regularizers [[toc]](#toc)

In [405]:
# %load ../../ml/neural_network/regularizers/regularizers.py

import numpy as np


class Regularizer:
    """Base class of the weight regularizers; carries no behavior of its own."""


class EmptyRegularizer(Regularizer):
    """"No regularization" placeholder.

    The instance is falsy, so ``if regularizer:`` guards skip it. ``loss`` and
    ``grad`` are provided as well (both zero), which makes the object safe to
    use where a regularizer is called without such a guard.
    """
    def __bool__(self):
        return False
    def loss(self, arr):
        return 0.0
    def grad(self, arr):
        return np.zeros_like(arr)


class L2regularizer(Regularizer):
    """L2 penalty: loss = 0.5 * l2 * sum(arr ** 2), gradient = l2 * arr."""
    def __init__(self, l2=0.0):
        self.l2 = l2
    def __bool__(self):
        # Always truthy, even when l2 == 0.
        return True
    def loss(self, arr):
        squared_norm = np.sum(arr ** 2)
        return 0.5 * self.l2 * squared_norm
    def grad(self, arr):
        return self.l2 * arr

<a id='grad_checker'></a>
### 1.2.3 Gradients Checker [[toc]](#toc)

In [406]:
def eval_numerical_gradient_array(f, x, df, h=1e-5):
    """
    Central-difference estimate of the gradient of ``sum(f(x) * df)`` w.r.t. ``x``.

    ``f`` takes a numpy array and returns a numpy array; ``df`` is the upstream
    gradient with the shape of ``f(x)``. Every entry of ``x`` is nudged by +h
    and -h in place and then restored, so ``x`` is unchanged on return.
    Returns an array shaped like ``x``.
    """
    grad = np.zeros_like(x)
    walker = np.nditer(x, flags=['multi_index'], op_flags=['readwrite'])
    for _ in walker:
        idx = walker.multi_index
        original = x[idx]
        x[idx] = original + h
        f_plus = f(x).copy()
        x[idx] = original - h
        f_minus = f(x).copy()
        x[idx] = original  # put the entry back before moving on
        grad[idx] = np.sum((f_plus - f_minus) * df) / (2 * h)
    return grad


class GradientsChecker:
    """Compares a layer's analytic gradients with central-difference estimates."""
    def __init__(self, step=1e-5):
        self.step = step

    def eval_gradients(self, layer, input, grad_output):
        """Compute numerical and analytic gradients and store them on the checker.

        Numerical estimates come first (for the input, then for each parameter
        array from ``layer.get_params()``); the layer's own ``backward`` runs
        last.
        """
        assert isinstance(input, np.ndarray) # It must be an array
        self.layer = layer
        self.input = input
        step = self.step
        self.num_grad_input = eval_numerical_gradient_array(self.forward, input, grad_output, step)
        numeric = OrderedDict()
        self.num_grad_params = numeric
        for name, value in layer.get_params().items():
            # The parameter array is perturbed in place; forward() re-runs the layer.
            numeric[name] = eval_numerical_gradient_array(self.forward, value, grad_output, step)
        self.grad_input = layer.backward(input, grad_output)
        self.grad_params = layer.get_grad_params()

    def __call__(self, layer, input, grad_output):
        """Run the check and print one relative error per parameter plus one for the input."""
        self.eval_gradients(layer, input, grad_output)
        for param_name in layer.get_params():
            error = rel_error(self.num_grad_params[param_name], self.grad_params[param_name])
            print('grad_{} error: {}'.format(param_name, error))
        input_error = rel_error(self.num_grad_input, self.grad_input)
        print('grad_X error: {}'.format(input_error))

    def forward(self, *args, **kwargs):
        """Forward the stored input through the stored layer; arguments are ignored."""
        return self.layer.forward(self.input)

<a id='layers'></a>
## 1.3 Layers [[toc]](#toc)

<a id='dense'></a>
### 1.3.1 Dense [[toc]](#toc)

In [407]:
# %load ../../ml/neural_network/layers/dense.py

#import numpy as np
from collections import OrderedDict


class Dense(Layer):
    """Fully connected layer: ``output = input . W`` (``+ b`` when ``use_bias``)."""
    def __init__(self, units, use_bias=True,
                 W_initializer=None, b_initializer=None,
                 W_regularizer=None, b_regularizer=None):
        """
        Inputs:
        - units - Integer or Long, dimensionality of the output space.
        - use_bias - whether the bias is added in the forward pass.
        - W_initializer, b_initializer - None (default initializer), a ready-made
          ndarray of the right shape, or a callable ``f(shape=..., dtype=...)``.
        - W_regularizer, b_regularizer - regularizer objects or None.

        The seed for the default weight initializer comes from the params dict
        given to ``initialize``.
        """
        super().__init__()
        self.units = units
        self.use_bias = use_bias
        self.W_initializer = W_initializer
        self.b_initializer = b_initializer
        self.W_regularizer = W_regularizer
        self.b_regularizer = b_regularizer
        self.params = OrderedDict()
        self.grad_params = OrderedDict()
        self.input_size = None     # Known only after initialization
        self.output_size = units

    def __repr__(self):
        return 'Dense({}->{})'.format(self.input_size, self.output_size)

    # Initialization
    def _initialize(self, params):
        # Params check and name initialization
        params = super()._initialize(params)

        # Initializing params and grads
        params = self._initialize_W(params)
        params = self._initialize_b(params)

        # Regularization
        if self.W_regularizer is None: self.W_regularizer = EmptyRegularizer()
        if self.b_regularizer is None: self.b_regularizer = EmptyRegularizer()
        return params

    def _initialize_W(self, params):
        """Create W and grad_W; advance the seed and input shape for the next layer."""
        input_shape = params['input_shape']
        seed = params['seed']
        dtype = params['dtype']
        self.input_size = input_shape[1]
        W_shape = (self.input_size, self.units)
        if self.W_initializer is None:
            self.W_initializer = NormalInitializer(seed=seed)
        elif isinstance(self.W_initializer, np.ndarray):
            assert self.W_initializer.shape == W_shape
            self.W_initializer = DeterministicInitializer(self.W_initializer)
        else:
            # Initializer objects are accepted; this also keeps a second
            # initialize() call working after an ndarray has been wrapped.
            assert callable(self.W_initializer), \
                'W_initializer must be None, an ndarray or a callable initializer'
        self.W = self.W_initializer(shape=W_shape, dtype=dtype)
        self.grad_W = np.zeros_like(self.W, dtype=dtype)
        params['seed'] = seed + 1
        params['input_shape'] = (input_shape[0], self.units) # Input shape for the next layer
        return params

    def _initialize_b(self, params):
        """Create b and grad_b (also when use_bias is False; b is then simply unused)."""
        dtype = params['dtype']
        if self.b_initializer is None:
            self.b_initializer = ZerosInitializer()
        elif isinstance(self.b_initializer, np.ndarray):
            assert self.b_initializer.shape == (self.units,)
            self.b_initializer = DeterministicInitializer(self.b_initializer)
        else:
            assert callable(self.b_initializer), \
                'b_initializer must be None, an ndarray or a callable initializer'
        self.b = self.b_initializer(shape=(self.units,), dtype=dtype)
        self.grad_b = np.zeros_like(self.b, dtype=dtype)
        return params

    def update_output(self, input):
        self._assert_nans(input)
        self.output = np.dot(input, self.W)  # [B x I] x [I x O] = [B x O]
        if self.use_bias:
            self.output += self.b[None, :]
        return self.output

    def update_grad_input(self, input, grad_output):
        self.grad_input = np.dot(grad_output, self.W.T)         # [B x O] x [O x I] = [B x I]
        return self.grad_input

    def update_grad_param(self, input, grad_output):
        self.grad_W = np.dot(input.T, grad_output)               # [I x B] x [B x O] = [I x O]
        if self.W_regularizer:
            self.grad_W += self.W_regularizer.grad(self.W)
        if self.use_bias:
            self.grad_b = np.sum(grad_output, axis=0)
            if self.b_regularizer:
                self.grad_b += self.b_regularizer.grad(self.b)

    def get_regularization_loss(self):
        loss = 0.0
        if self.W_regularizer:
            loss += self.W_regularizer.loss(self.W)
        if self.use_bias:
            if self.b_regularizer:
                loss += self.b_regularizer.loss(self.b)
        return loss

    def get_params(self):
        return OrderedDict([(self.name + ':W', self.W), (self.name + ':b', self.b)])
    def get_grad_params(self):
        return OrderedDict([(self.name + ':W', self.grad_W), (self.name + ':b', self.grad_b)])
    def zero_grad_params(self):
        self.grad_W.fill(0)
        self.grad_b.fill(0)

In [408]:
# Smoke test: initialize a 100-unit Dense layer for a (10, 20) input; the cell
# displays the params dict that initialize() returns.
dense = Dense(units=100)
dense.initialize({'input_shape': (10, 20)})

{'dtype': numpy.float64,
 'input_shape': (10, 100),
 'names': {'Dense': 1},
 'seed': 1}

<a id='dense_forward'></a>
#### 1.3.1.1 Dense: forward [[toc]](#toc)

In [410]:
# Test Dense.forward against a precomputed reference output
batch_size = 2
input_size = 120
output_size = 3

dtype = np.float64
# Deterministic inputs, weights and bias built from evenly spaced values
X = np.linspace(-0.1, 0.5, num=batch_size * input_size).reshape(batch_size, input_size) # [2, 120]
W = np.linspace(-0.2, 0.3, num=input_size * output_size).reshape(input_size, output_size) # 360 values -> [120, 3]
b = np.linspace(-0.3, 0.1, num=output_size)
X = X.astype(dtype)
W = W.astype(dtype)
b = b.astype(dtype)
# Passing ndarrays as initializers makes the layer start from exactly W and b
dense = Dense(output_size, W_initializer=W, b_initializer=b)
dense.initialize({'input_shape': (-1, 120)})

output = dense.forward(X)
correct_output = np.array([[ 1.49834967,  1.70660132,  1.91485297],
                           [ 3.25553199,  3.5141327,   3.77273342]])

# Compare the layer output with the reference. The error should be around 1e-9.
print('Testing affine_forward function:')
print('difference: ', rel_error(output, correct_output))

Testing affine_forward function:
difference:  9.7698488884e-10


<a id='dense_backward'></a>
#### 1.3.1.2 Dense: backward [[toc]](#toc)

In [411]:
# Gradient check for Dense.backward: analytic gradients vs. numerical estimates
np.random.seed(231)
dtype = np.float64
X = np.random.randn(10, 6).astype(dtype)
W = np.random.randn(6, 5).astype(dtype)
b = np.random.randn(5).astype(dtype)
grad_Y = np.random.randn(10, 5).astype(dtype)  # upstream gradient, same shape as the layer output
print('X.shape =', X.shape)
print('W.shape =', W.shape)
print('b.shape =', b.shape)

dense = Dense(5, W_initializer=W, b_initializer=b)
dense.initialize({'input_shape': X.shape})
# Calling the checker prints one relative error per parameter and one for the input
grad_checker = GradientsChecker()
grad_checker(dense, X, grad_Y)

X.shape = (10, 6)
W.shape = (6, 5)
b.shape = (5,)
grad_Dense0:W error: 2.1752635504596857e-10
grad_Dense0:b error: 7.736978834487815e-12
grad_X error: 1.0908199508708189e-10


<a id='softmax'></a>
### 1.3.2 Softmax [[toc]](#toc)

In [412]:
# %load ../../ml/neural_network/layers/softmax.py

#import numpy as np
#from ..sequential import Layer

class SoftMax(Layer):
    """Row-wise softmax over axis 1 of a 2-D input."""
    def __init__(self):
        super().__init__()

    def update_output(self, input):
        # Subtracting the row maximum keeps the exponentials from overflowing.
        shifted = input - input.max(axis=1, keepdims=True)
        np.exp(shifted, out=shifted)
        shifted /= shifted.sum(axis=1, keepdims=True)
        self.output = shifted
        return self.output

    def update_grad_input(self, input, grad_output):
        # Uses the output stored by the latest forward pass.
        weighted = self.output * grad_output
        row_totals = weighted.sum(axis=1, keepdims=True)
        self.grad_input = weighted - self.output * row_totals
        return self.grad_input

In [414]:
# Smoke test: initialize a SoftMax layer and print the params dict it returns.
softmax = SoftMax()
print(softmax.initialize({'input_shape': (10, 25)}))

{'input_shape': (10, 25), 'names': {'SoftMax': 1}, 'dtype': <class 'numpy.float64'>, 'seed': 0}


<a id='dropout'></a>
### 1.3.3 Dropout [[toc]](#toc)

In [415]:
# %load ../../ml/neural_network/layers/dropout.py

#import numpy as np
#from ..sequential import Layer


class Dropout(Layer):
    """Dropout layer; ``p`` is the probability of zeroing an element.

    Training: elements are multiplied by a freshly drawn 0/1 mask.
    Evaluation: the whole input is scaled by ``1 - p`` instead.
    """
    def __init__(self, p=0.5):
        super().__init__()
        self.mask = None
        self.p = p

    # initialization
    def _initialize(self, params):
        # Base class checks the params and names the layer
        params = super()._initialize(params)
        # The layer owns its generator, seeded from the shared seed counter
        self.gen = np.random.RandomState(params['seed'])
        params['seed'] += 1
        return params

    # Setters
    def set_p(self, p):
        self.p = p

    # Forward propagation
    def update_output(self, input):
        if not self.training:
            self.output = (1 - self.p) * input
            return self.output
        self.mask = self.gen.choice([0, 1], p=[self.p, 1 - self.p], size=input.shape)
        self.output = np.multiply(self.mask, input)
        return self.output

    # Backward propagation
    def update_grad_input(self, input, grad_output):
        if not self.training:
            self.grad_input = (1 - self.p) * grad_output
            return self.grad_input
        # Same mask as the latest training-mode forward pass
        self.grad_input = np.multiply(self.mask, grad_output)
        return self.grad_input

In [416]:
# Smoke test: initialize a Dropout layer; the cell displays the returned params
# dict (Dropout's initialization increments 'seed').
dropout = Dropout(0.5)
dropout.initialize({'input_shape': (10, 25)})

{'dtype': numpy.float64,
 'input_shape': (10, 25),
 'names': {'Dropout': 1},
 'seed': 1}

<a id='dropout_forward'></a>
#### 1.3.3.1 Dropout: forward [[toc]](#toc)

In [417]:
# Compare train-time and test-time Dropout outputs for several drop probabilities
np.random.seed(231)
X = np.random.randn(500, 500) + 10
# p is overwritten inside the loop; Dropout's initialization reads only the seed,
# so the declared input_shape need not match X here.
dropout = Dropout(0)
dropout.initialize({'input_shape': (10, 25)})

for p in [0.3, 0.6, 0.75]:
    dropout.p = p
    dropout.train()
    out      = dropout.forward(X)
    dropout.evaluate()
    out_test = dropout.forward(X)

    # Both means should be close to (1 - p) * mean(X); only the train-time
    # output should contain zeros (about a fraction p of them).
    print('Running tests with p = ', p)
    print('Mean of input: ', X.mean())
    print('Mean of train-time output: ', out.mean())
    print('Mean of test-time output: ', out_test.mean())
    print('Fraction of train-time output set to zero: ', (out == 0).mean())
    print('Fraction of test-time output set to zero: ', (out_test == 0).mean())
    print()

Running tests with p =  0.3
Mean of input:  10.0002078785
Mean of train-time output:  7.00311276126
Mean of test-time output:  7.00014551493
Fraction of train-time output set to zero:  0.299804
Fraction of test-time output set to zero:  0.0

Running tests with p =  0.6
Mean of input:  10.0002078785
Mean of train-time output:  3.98521948532
Mean of test-time output:  4.00008315139
Fraction of train-time output set to zero:  0.60156
Fraction of test-time output set to zero:  0.0

Running tests with p =  0.75
Mean of input:  10.0002078785
Mean of train-time output:  2.52026295957
Mean of test-time output:  2.50005196962
Fraction of train-time output set to zero:  0.748108
Fraction of test-time output set to zero:  0.0



<a id='dropout_backward'></a>
#### 1.3.3.2 Dropout: backward [[toc]](#toc)

In [418]:
# Gradient check for Dropout.backward
np.random.seed(231)
X = np.random.randn(5, 5) + 10
grad_Y = np.random.randn(*X.shape)

dropout = Dropout(0.8)
# Re-initializing after every forward pass re-seeds the layer's generator, so
# each forward call made by the numerical gradient check draws the same mask.
forward_callback = lambda: dropout.initialize({'input_shape': (10, 25)})
dropout.set_forward_exit_call(forward_callback)
forward_callback()

grad_checker = GradientsChecker()
grad_checker.eval_gradients(dropout, X, grad_Y)
grad_X     = grad_checker.grad_input
num_grad_X = grad_checker.num_grad_input

print('grad_X relative error: ', rel_error(grad_X, num_grad_X))

grad_X relative error:  1.89289321191e-11


<a id='bn'></a>
### 1.3.4 BatchNormalization [[toc]](#toc)

In [419]:
# %load ../../ml/neural_network/layers/batch_normalization.py
#import numpy as np
#from collections import OrderedDict
#from ..sequential import Layer

class BatchNormalization(Layer):
    """
    Batch normalization over the features of an (N, D) input.

    During training the sample mean and (uncorrected) sample variance are
    computed from minibatch statistics and used to normalize the incoming data.
    During training we also keep an exponentially decaying running mean of the
    mean and variance of each feature, and these averages are used to normalize
    data at test-time.

    At each training step the running averages are updated using an exponential
    decay based on the momentum parameter:

    running_mean = momentum * running_mean + (1 - momentum) * sample_mean
    running_var  = momentum * running_var  + (1 - momentum) * sample_var

    Note that the batch normalization paper suggests a different test-time
    behavior: they compute sample mean and variance for each feature using a
    large number of training images rather than using a running average. For
    this implementation we have chosen to use running averages instead since
    they do not require an additional estimation step.

    Constructor arguments:
    - momentum: Constant for running mean / variance.
    - eps: Constant for numeric stability.

    Learned parameters (both of shape (D,), D taken from input_shape[1]):
    - gamma: Scale parameter, initialized to ones.
    - beta: Shift parameter, initialized to zeros.
    """
    def __init__(self, momentum=0.9, eps=1e-5):
        super().__init__()
        self.momentum = momentum
        self.eps = eps

    def _initialize(self, params):
        # Check params and initialize name
        params = super()._initialize(params)
        input_shape = params['input_shape']
        dtype = params['dtype']
        n_features = input_shape[1]
        self.running_mean = np.zeros(n_features, dtype=dtype)
        self.running_var = np.zeros(n_features, dtype=dtype)
        self.gamma = np.ones(n_features, dtype=dtype)
        self.beta = np.zeros(n_features, dtype=dtype)
        # Gradients exist from the start so get_grad_params() works before backward()
        self.grad_gamma = np.zeros_like(self.gamma)
        self.grad_beta = np.zeros_like(self.beta)
        self.normed_input = None
        return params

    # Forward propagation
    def update_output(self, input):
        if self.training:
            self.sample_mean = np.mean(input, axis=0)
            self.sample_var = np.var(input, axis=0)
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * self.sample_mean
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * self.sample_var
            mean, var = self.sample_mean, self.sample_var
        else:
            mean, var = self.running_mean, self.running_var
        # Kept in both modes: the backward pass needs it for grad_gamma
        self.normed_input = (input - mean[None, :]) / np.sqrt(var + self.eps)[None, :]
        self.output = self.gamma[None, :] * self.normed_input + self.beta[None, :]
        return self.output

    # Backward propagation
    def update_grad_input(self, input, grad_output):
        if self.training:
            # Batch statistics depend on the input, hence the two correction terms
            self.grad_input = self.gamma / np.sqrt(self.sample_var + self.eps)[None, :] *\
                    ((grad_output - np.mean(grad_output, axis=0)[None, :]) -\
                     self.normed_input * np.mean(np.multiply(self.normed_input, grad_output), axis=0)[None, :])
        else:
            # Running statistics are constants here, so the layer is a per-feature affine map
            self.grad_input = (self.gamma / np.sqrt(self.running_var + self.eps))[None, :] * grad_output
        return self.grad_input
    def update_grad_param(self, input, grad_output):
        self.grad_gamma = np.sum(np.multiply(self.normed_input, grad_output), axis=0)
        self.grad_beta = np.sum(grad_output, axis=0)

    # Get params and grad_params
    def get_params(self):
        return OrderedDict([(self.name + '/gamma', self.gamma), (self.name + '/beta', self.beta)])
    def get_grad_params(self):
        return OrderedDict([(self.name + '/gamma', self.grad_gamma), (self.name + '/beta', self.grad_beta)])
    def zero_grad_params(self):
        self.grad_gamma.fill(0)
        self.grad_beta.fill(0)

In [420]:
# Smoke test: initialize a BatchNormalization layer for 25 features; the cell
# displays the params dict that initialize() returns.
bn = BatchNormalization()
bn.initialize({'input_shape': (10, 25)})

{'dtype': numpy.float64,
 'input_shape': (10, 25),
 'names': {'BatchNormalization': 1},
 'seed': 0}

<a id='bn_forward'></a>
#### 1.3.4.1 Batch Normalization: forward [[toc]](#toc)

In [421]:
# Check the training-time forward pass by checking means and variances
# of features both before and after batch normalization

# Simulate the forward pass for a two-layer network (ReLU in between)
np.random.seed(231)
N, D1, D2, D3 = 200, 50, 60, 3
X = np.random.randn(N, D1)
W1 = np.random.randn(D1, D2)
W2 = np.random.randn(D2, D3)
a = np.maximum(0, X.dot(W1)).dot(W2)

bn = BatchNormalization()
print(bn.initialize({'input_shape': a.shape}))

print('Before batch normalization:')
print('  means: ', a.mean(axis=0))
print('  stds: ', a.std(axis=0))

# Means should be close to zero and stds close to one
print('After batch normalization (gamma=1, beta=0)')
a_norm = bn.forward(a)
print('  mean: ', a_norm.mean(axis=0))
print('  std: ', a_norm.std(axis=0))

# Now means should be close to beta and stds close to gamma
# (gamma and beta are overwritten directly on the layer)
gamma = np.asarray([1.0, 2.0, 3.0])
beta = np.asarray([11.0, 12.0, 13.0])
bn.gamma = gamma
bn.beta = beta
a_norm  = bn.forward(a)
print('After batch normalization (nontrivial gamma, beta)')
print('  means: ', a_norm.mean(axis=0))
print('  stds: ', a_norm.std(axis=0))

{'input_shape': (200, 3), 'names': {'BatchNormalization': 1}, 'dtype': <class 'numpy.float64'>, 'seed': 0}
Before batch normalization:
  means:  [ -2.3814598  -13.18038246   1.91780462]
  stds:  [ 27.18502186  34.21455511  37.68611762]
After batch normalization (gamma=1, beta=0)
  mean:  [  1.77635684e-17  -5.32907052e-17   2.13717932e-17]
  std:  [ 0.99999999  1.          1.        ]
After batch normalization (nontrivial gamma, beta)
  means:  [ 11.  12.  13.]
  stds:  [ 0.99999999  1.99999999  2.99999999]


In [422]:
# Check the test-time forward pass by running the training-time
# forward pass many times to warm up the running averages, and then
# checking the means and variances of activations after a test-time
# forward pass.
np.random.seed(231)
N, D1, D2, D3 = 200, 50, 60, 3
W1 = np.random.randn(D1, D2)
W2 = np.random.randn(D2, D3)

bn = BatchNormalization()
# NOTE(review): `a` is not defined in this cell before this line, so it must
# already exist in the kernel (only its shape is used) - confirm the cell order.
bn.initialize({'input_shape': a.shape})
bn.gamma = np.ones(D3)
bn.beta = np.zeros(D3)
bn.train()

# Warm-up: 50 training-mode passes on fresh random batches
for t in range(50):
    X = np.random.randn(N, D1)
    a = np.maximum(0, X.dot(W1)).dot(W2)
    a_norm = bn.forward(a)

# One evaluation-mode pass on a new batch
bn.evaluate()
X = np.random.randn(N, D1)
a = np.maximum(0, X.dot(W1)).dot(W2)
a_norm = bn.forward(a)

# Means should be close to zero and stds close to one, but will be
# noisier than training-time forward passes.
print('After batch normalization (test-time):')
print('  means: ', a_norm.mean(axis=0))
print('  stds: ', a_norm.std(axis=0))

After batch normalization (test-time):
  means:  [-0.03927354 -0.04349152 -0.10452688]
  stds:  [ 1.01531428  1.01238373  0.97819988]


<a id='bn_backward'></a>
#### 1.3.4.2 Batch Normalization: backward [[toc]](#toc)

In [423]:
# Gradient check batchnorm backward pass (training mode)
np.random.seed(231)
N, D = 100, 5
X      = 5 * np.random.randn(N, D) + 12
# gamma and beta are drawn here but never assigned to the layer in this cell,
# so the check runs with the layer's initial gamma/beta.
gamma  = np.random.randn(D)
beta   = np.random.randn(D)
grad_Y = np.random.randn(N, D)
step   = 1e-5

grad_checker = GradientsChecker(step=step)
bn = BatchNormalization()
bn.initialize({'input_shape': X.shape})
bn.train()

# Prints the relative error for each parameter gradient and for the input gradient
grad_checker(bn, X, grad_Y)

grad_BatchNormalization0/gamma error: 3.4897447838672933e-11
grad_BatchNormalization0/beta error: 8.743146873306484e-11
grad_X error: 4.6504237753155334e-07


<a id='criterions'></a>
## 1.4 Criterions [[toc]](#toc)
**TODO:** add other criterions.

In [431]:
# %load ../../ml/neural_network/criterions/criterions.py

import numpy as np
#from ..sequential import Layer

class Criterion(Layer):
    """
    Base class for loss functions.

    `forward(input, target)` delegates to `update_output` and
    `backward(input, target)` delegates to `update_grad_input`; subclasses
    implement those two hooks.

    Attributes:
    - output:     set to None here; subclasses store the last loss value in it.
    - grad_input: set to None here; subclasses store the last gradient in it.
    """
    def __init__(self):
        # NOTE(review): Layer.__init__ is not invoked here; confirm that Layer
        # does not set up state that criterions need.
        self.output = None
        self.grad_input = None

    def forward(self, input, target):
        """Return the loss for `input` given `target` (see `update_output`)."""
        return self.update_output(input, target)

    def backward(self, input, target):
        """Return the gradient of the loss w.r.t. `input` (see `update_grad_input`)."""
        return self.update_grad_input(input, target)

    def update_output(self, input, target):
        """Compute the loss. Must be overridden by subclasses."""
        # A raise (unlike `assert False`) is not stripped when Python runs
        # with -O, so a missing override can never silently return None.
        raise NotImplementedError(
            '{} must implement update_output'.format(type(self).__name__))

    def update_grad_input(self, input, target):
        """Compute the gradient of the loss. Must be overridden by subclasses."""
        raise NotImplementedError(
            '{} must implement update_grad_input'.format(type(self).__name__))

class MulticlassLogLoss(Criterion):
    """
    Multiclass log loss (negative log-likelihood) over predicted probabilities.

    The loss is the SUM over the batch (it is not divided by the batch size).

    Parameters:
    - n_classes: number of classes; class indices must lie in [0, n_classes).
    - eps:       lower bound applied to probabilities before taking the log or
                 dividing by them, to avoid log(0) and division by zero.
    """
    def __init__(self, n_classes, eps=1e-20):
        super().__init__()
        self.eps = eps
        self.n_classes = n_classes

    def update_output(self, input, target):
        """
        Compute the summed negative log-probability of the target classes.

        Inputs:
        - input:  array indexed as input[sample, class] holding probabilities.
        - target: either a 1-D array of class indices (any numeric dtype, e.g.
                  float labels such as 3.0) or a 2-D one-hot / score array,
                  which is reduced with argmax over axis 1.

        Returns the scalar loss, also stored in `self.output`.
        """
        target = np.asarray(target)
        if target.ndim > 1:
            target = np.argmax(target, axis=1)
        else:
            # Labels that arrive as floats cannot be used for fancy indexing;
            # convert them to integer indices first.
            target = target.astype(np.intp, copy=False)
        assert np.max(target) < self.n_classes
        assert np.min(target) >= 0
        input_clamp = np.clip(input, self.eps, 1 - self.eps)  # Using this trick to avoid numerical errors
        self.output = -np.sum(np.log(input_clamp[np.arange(input.shape[0]), target]))
        return self.output

    def update_grad_input(self, input, target):
        """
        Compute d(loss)/d(input) = -target_one_hot / max(input, eps).

        Inputs:
        - input:  same array that was given to `update_output`.
        - target: 1-D class indices (converted to one-hot rows here) or an
                  already 2-D target array, which is used as is.

        Returns the gradient, also stored in `self.grad_input`.
        """
        target = np.asarray(target)
        if target.ndim == 1:
            # Integer cast for the same reason as in update_output.
            target = np.eye(self.n_classes)[target.astype(np.intp, copy=False)]
        # The negation allocates a new array, so the in-place division below
        # never touches the caller's target.
        self.grad_input = -target.astype(np.float64, copy=False)
        self.grad_input /= np.maximum(input, self.eps)  # Using this trick to avoid numerical errors
        # assert_inf / assert_nans are not defined in this class; presumably
        # they are inherited from Layer.
        self.assert_inf(self.grad_input)
        self.assert_nans(self.grad_input)
        return self.grad_input

<a id='optimizers'></a>
## 1.5 Optimizers [[toc]](#toc)

In [432]:
def sgd(w, dw, config=None):
    """
    Vanilla stochastic gradient descent step.

    The update is applied to `w` in place and the same object is returned.

    config format:
    - learning_rate: Scalar learning rate (defaults to 1e-2 when missing).

    Returns the pair (w, config).
    """
    config = {} if config is None else config
    step_size = config.setdefault('learning_rate', 1e-2)
    w -= step_size * dw
    return w, config


def sgd_momentum(w, dw, config=None):
    """
    Stochastic gradient descent step with momentum.

    The velocity array is updated in place and kept in the config; the
    parameters are returned as a new array (`w` itself is not modified).

    config format:
    - learning_rate: Scalar learning rate (default 1e-2).
    - momentum: Scalar between 0 and 1 giving the momentum value (default 0.9).
      Setting momentum = 0 reduces to sgd.
    - velocity: A numpy array of the same shape as w and dw used to store a
      moving average of the gradients (zeros when missing).

    Returns the pair (next_w, config).
    """
    assert isinstance(w, np.ndarray)
    assert isinstance(dw, np.ndarray)
    config = {} if config is None else config
    step_size = config.setdefault('learning_rate', 1e-2)
    mu = config.setdefault('momentum', 0.9)

    # Start from a zero velocity the first time this config is used.
    if 'velocity' in config:
        velocity = config['velocity']
    else:
        velocity = np.zeros_like(w)

    velocity *= mu
    velocity -= step_size * dw
    config['velocity'] = velocity
    return w + velocity, config


def rmsprop(x, dx, config=None):
    """
    RMSProp step: scales the gradient by a running average of its squared
    values, giving each parameter its own effective learning rate.

    The cache array is updated in place and kept in the config; the parameters
    are returned as a new array.

    config format:
    - learning_rate: Scalar learning rate (default 1e-2).
    - decay_rate: Scalar between 0 and 1 giving the decay rate for the squared
      gradient cache (default 0.99).
    - epsilon: Small scalar used for smoothing to avoid dividing by zero
      (default 1e-8).
    - cache: Moving average of second moments of gradients (zeros when missing).

    Returns the pair (next_x, config).
    """
    config = {} if config is None else config
    lr = config.setdefault('learning_rate', 1e-2)
    rho = config.setdefault('decay_rate', 0.99)
    eps = config.setdefault('epsilon', 1e-8)
    if 'cache' not in config:
        config['cache'] = np.zeros_like(x)
    second_moment = config['cache']

    second_moment *= rho
    second_moment += (1 - rho) * (dx ** 2)
    scaled_step = lr * dx / (np.sqrt(second_moment) + eps)

    config['cache'] = second_moment
    return x - scaled_step, config


def adam(x, dx, config=None):
    """
    Adam step: keeps moving averages of the gradient and of its square and
    applies a bias correction to both before the update.

    New arrays are produced for the moments and for the parameters; the inputs
    are not modified in place.

    config format:
    - learning_rate: Scalar learning rate (default 1e-3).
    - beta1: Decay rate for moving average of first moment of gradient (default 0.9).
    - beta2: Decay rate for moving average of second moment of gradient (default 0.999).
    - epsilon: Small scalar used for smoothing to avoid dividing by zero (default 1e-8).
    - m: Moving average of gradient (zeros when missing).
    - v: Moving average of squared gradient (zeros when missing).
    - t: Iteration number (0 when missing; incremented by one per call).

    Returns the pair (next_x, config).
    """
    config = {} if config is None else config
    lr = config.setdefault('learning_rate', 1e-3)
    beta1 = config.setdefault('beta1', 0.9)
    beta2 = config.setdefault('beta2', 0.999)
    eps = config.setdefault('epsilon', 1e-8)
    prev_m = config.setdefault('m', np.zeros_like(x))
    prev_v = config.setdefault('v', np.zeros_like(x))
    step_no = config.setdefault('t', 0) + 1

    # Exponential moving averages of the gradient and of its square.
    first_moment = beta1 * prev_m + (1 - beta1) * dx
    second_moment = beta2 * prev_v + (1 - beta2) * (dx ** 2)

    # Bias-corrected estimates for the current step.
    first_hat = first_moment / (1 - beta1 ** step_no)
    second_hat = second_moment / (1 - beta2 ** step_no)

    config.update(m=first_moment, v=second_moment, t=step_no)
    return x - lr * first_hat / (np.sqrt(second_hat) + eps), config

<a id='solver'></a>
## 1.6 Solver [[toc]](#toc)

In [350]:
# %load ../../ml/neural_network/cs231n/second/solver.py

import numpy as np
from collections import defaultdict, Counter, OrderedDict

class Solver:
    def __init__(self, model, criterion, data, **kwargs):
        """
        Construct a new Solver instance.

        Required arguments:
        - model:     A Sequential instance; it is initialized here with the shape of the training data.
        - criterion: A Criterion instance used to compute the training loss and its gradient.
        - data: A dictionary of training and validation data containing:
          'X_train': Array, shape (N_train, d_1, ..., d_k) of training images
          'X_val':   Array, shape (N_val, d_1, ..., d_k) of validation images
          'y_train': Array, shape (N_train,) of labels for training images
          'y_val':   Array, shape (N_val,) of labels for validation images

        Optional arguments:
        - update_rule:       One of 'sgd', 'sgd_momentum', 'rmsprop', 'adam'. Default is 'sgd'.
        - optim_config:      A dictionary containing hyperparameters that will be passed to the chosen update rule.
            Each update rule requires different hyperparameters but all update rules use a
           'learning_rate' parameter.
           'learning_rate_decay': A scalar for learning rate decay; after each epoch the learning rate is multiplied by this value.
        - batch_size:        Size of minibatches used to compute loss and gradient during training. Default is 100.
        - num_epochs:        The number of epochs to run for during training. Default is 10.
        - verbose:           Boolean; if set to False then no output will be printed during training; default is False.
        - print_every_iter:  Integer; training losses will be printed every print_every_iter iterations; default is 1000000000.
        - print_every_epoch: Integer; training losses will be printed every print_every_epoch epochs; default is 1000000000.
        - seed:              Used to initialize an internal random generator; default is 0.
        - dtype:             Floating point type the inputs are cast to and the model is initialized with; default is np.float64.
        - checkpoint_name:   If not None, then save model checkpoints here every epoch.

        Raises:
        - ValueError: if an unrecognized keyword argument or an unknown update rule is given.
        """

        # Unpack keyword arguments
        self.seed         = kwargs.pop('seed', 0)
        self.dtype        = kwargs.pop('dtype', np.float64)

        self.update_rule  = kwargs.pop('update_rule', 'sgd')
        self.optim_config = kwargs.pop('optim_config', {})
        self.batch_size   = kwargs.pop('batch_size', 100)
        self.num_epochs   = kwargs.pop('num_epochs', 10)
        self.checkpoint_name   = kwargs.pop('checkpoint_name', None)

        self.verbose = kwargs.pop('verbose', False)
        self.print_every_iter  = kwargs.pop('print_every_iter', 1000000000)
        self.print_every_epoch = kwargs.pop('print_every_epoch', 1000000000)

        # Throw an error if there are extra keyword arguments. Done before any
        # data is copied or the model is initialized, so a typo in an option
        # name has no side effects.
        if len(kwargs) > 0:
            extra = ', '.join('"{}"'.format(k) for k in sorted(list(kwargs.keys())))
            raise ValueError('Unrecognized arguments {}'.format(extra))

        # Make sure the update rule exists, then replace the string name with
        # the actual function. A ValueError (unlike `assert False`) is still
        # raised when Python runs with -O.
        update_rules = {
            'sgd':          sgd,
            'sgd_momentum': sgd_momentum,
            'rmsprop':      rmsprop,
            'adam':         adam,
        }
        if self.update_rule not in update_rules:
            raise ValueError('Unknown update rule "{}"'.format(self.update_rule))
        self.update_rule = update_rules[self.update_rule]

        def as_targets(y):
            # 1-D targets are class labels: they are used as array indices by
            # the loss and metric code, so they must be integers, not floats.
            # Targets with more dimensions keep the floating point cast.
            y = np.asarray(y)
            return y.astype(np.int64) if y.ndim == 1 else y.astype(self.dtype)

        # Unpacking data
        self.X_train = data['X_train'].astype(self.dtype)
        self.y_train = as_targets(data['y_train'])
        self.X_val   = data['X_val'].astype(self.dtype)
        self.y_val   = as_targets(data['y_val'])

        # The batch sampler uses `seed`; the model is given `seed + 1`.
        self.gen = np.random.RandomState(self.seed)
        self.seed += 1

        # Compiling model
        assert isinstance(model, Sequential)
        assert isinstance(criterion, Criterion)
        self.model = model; self.criterion = criterion
        # Use the requested dtype for the model too, so that the `dtype`
        # option is honoured consistently (the default is still float64).
        self.model_params = {'input_shape': self.X_train.shape, 'seed': self.seed, 'dtype': self.dtype, 'names': {}}
        self.model_params = self.model.initialize(self.model_params)
        if self.verbose: print(self.model_params)

        self._reset()

    def _reset(self):
        """
        Set up some book-keeping variables for optimization. Don't call this manually.
        """
        # Set up some variables for book-keeping
        self.n_epoch = 0
        self.best_val_acc = 0
        self.best_params = {}
        self.loss_history = []
        self.train_loss_history = []
        self.train_acc_history  = []
        self.val_loss_history = []                  
        self.val_acc_history  = []
        self.history = {'loss_history':       self.loss_history,
                        'train_loss_history': self.train_loss_history,
                        'train_acc_history':  self.train_acc_history,
                        'val_loss_history':   self.val_loss_history,
                        'val_acc_history':    self.val_acc_history}
        
        # Make a deep copy of the optim_config for each parameter
        self.optim_configs = {}
        for param_name in self.model.get_params():
            self.optim_configs[param_name] = {k : v for k, v in self.optim_config.items()}

    def _step(self):
        """
        Make a single gradient update. This is called by train() and should not be called manually.
        """
        # Make a minibatch of training data
        num_train  = self.X_train.shape[0]
        batch_mask = self.gen.choice(num_train, self.batch_size)
        X_batch = self.X_train[batch_mask]
        y_batch = self.y_train[batch_mask]

        # Compute loss and gradient
        output = self.model.forward(X_batch)
        loss, output_grad = self.criterion(output, y_batch)
        self.model.backward(output, output_grad)
        loss += self.model.get_regularization_loss()
        self.loss_history.append(loss)
        
        # Perform a parameter update
        params = self.model.get_params()
        grad_params = self.model.get_grad_params()
        for param_name, param_value in params().items():
            dw = grads[p]
            config = self.optim_configs[p]
            next_w, next_config = self.update_rule(w, dw, config)
            self.model.params[p] = next_w
            self.optim_configs[p] = next_config

    def _save_checkpoint(self):
        if self.checkpoint_name is None:
            return
        checkpoint = {
          'model':              self.model,
          'update_rule':        self.update_rule,
          'optim_config':       self.optim_config,
          'batch_size':         self.batch_size,
          'n_epoch':            self.n_epoch,
          'loss_history':       self.loss_history,
          'train_loss_history': self.train_loss_history,
          'train_acc_history':  self.train_acc_history,
          'val_loss_history':   self.val_loss_history,
          'val_acc_history':    self.val_acc_history,
        }
        filename = '{}_epoch_{}.pkl'.format(self.checkpoint_name, int(self.epoch))
        if self.verbose:
            print('Saving checkpoint to "{}"'.format(filename))
        with open(filename, 'wb') as f:
            pickle.dump(checkpoint, f)

    def eval(self, X, y, batch_size=100, eval_func=None):
        """
        Evaluate the model on the provided data.

        Inputs:
        - X: Array of data, of shape (N, d_1, ..., d_k)
        - y: Array of labels, of shape (N,)
        - num_samples: If not None, subsample the data and only test the model
          on num_samples datapoints.
        - batch_size: Split X and y into batches of this size to avoid using
          too much memory.

        Returns:
        - metric_value: Scalar giving the the value of the required metric.
        """
        # Compute predictions in batches
        N = X.shape[0]
        num_batches = N // batch_size
        if N % batch_size != 0:
            num_batches += 1
        scores = []
        for i in range(num_batches):
            start = i * batch_size
            end   = (i + 1) * batch_size
            batch_scores = self.model.forward(X[start:end])
            scores.append(batch_scores)
        scores = np.vstack(scores)
        assert scores.ndim == 2
        return eval_func(scores, y)

    def _logloss(self, scores, y_true):
        n_samples = scores.shape[0]
        y_pred = scores - np.max(scores, axis=1, keepdims=True)
        y_pred = np.exp(y_pred)
        y_pred /= np.sum(y_pred, axis=1, keepdims=True)
        y_pred = np.clip(y_pred, 1e-18, 1 - 1e-18)
        return -np.mean(np.log(y_pred[np.arange(n_samples), y_true]))
        
    def _accuracy(self, scores, y_true):
        y_pred = np.argmax(scores, axis=1)
        return np.mean(y_pred == y_true)
    
    def _update_history(self):
        train_acc  = self.eval(self.X_train, self.y_train, batch_size=self.batch_size, eval_func=self._accuracy)
        train_loss = self.eval(self.X_train, self.y_train, batch_size=self.batch_size, eval_func=self._logloss)
        val_acc    = self.eval(self.X_val, self.y_val, batch_size=self.batch_size, eval_func=self._accuracy)
        val_loss   = self.eval(self.X_val, self.y_val, batch_size=self.batch_size, eval_func=self._logloss)
        self.train_acc_history.append(train_acc)
        self.val_acc_history.append(val_acc)
        self.train_loss_history.append(train_loss)
        self.val_loss_history.append(val_loss)
    
    def train(self):
        """
        Run optimization to train the model.
        """
        num_train = self.X_train.shape[0]
        num_iter_per_epoch = int(np.ceil(float(num_train) / self.batch_size))
        num_all_iterations = num_iter_per_epoch * self.num_epochs
        if self.verbose:
            print('num of epochs = {}\nnum of iterations = {}\niterations per epoch = {}'.format(
                self.num_epochs, num_all_iterations, num_iter_per_epoch))
        n_all_iter = 0
        
        self._update_history() # Initial model quality
        for n_epoch in range(self.num_epochs):
            self.n_epoch = n_epoch
            for n_iter in range(num_iter_per_epoch):
                self._step()
                # Maybe print training loss
                if self.verbose & ((n_all_iter + 1) % self.print_every_iter == 0):
                    msg = '(Iteration {}/{}) loss: {}'.format(n_all_iter + 1, num_all_iterations, 
                                                              self.loss_history[-1])
                    print(msg)
                n_all_iter += 1
            self._update_history()
            
            # Keep track of the best model
            val_acc = self.val_acc_history[-1]
            if val_acc > self.best_val_acc:
                self.best_val_acc = val_acc
                self.best_params  = OrderedDict()
                for param_name, param_value in self.model.get_params().items():
                    self.best_params[param_name] = param_value.copy()

            # Maybe print training loss
            if self.verbose & ((n_epoch + 1) % self.print_every_epoch == 0):
                msg = '(Epoch {}/{}) train acc: {:.2}; val acc: {:.2}, train loss: {:.4}; val loss: {:.4}'.format(
                    self.n_epoch + 1, self.num_epochs, 
                    self.train_acc_history[-1],  self.val_acc_history[-1],
                    self.train_loss_history[-1], self.val_loss_history[-1])
                print(msg)
            
            # Save the model at the end of every epoch
            self._save_checkpoint()
            
            # At the end of every epoch, increment the epoch counter and decay the learning rate.
            for k in self.optim_configs:
                optim_config = self.optim_configs[k]
                lr_decay = optim_config.get('learning_rate_decay', 1.0)
                optim_config['learning_rate'] *= lr_decay

        # At the end of training swap the best params into the model
        self.model.params = self.best_params
        return self.model