# Backpropagation example

- dataset: MNIST
- layers: fullyconnected
- activation: sigmoid
- loss: categorical cross entropy

In [0]:
import numpy as np
from sklearn import datasets

In [0]:
class FullyConnectedLayer():
  """
    Fully Connected Layer
    
  """
  def __init__(self, n_outputs, seed=None,
               initializer=np.random.standard_normal):
    self.n_inputs = None
    self.n_outputs = n_outputs
    
    # variables
    self.weights = None
    self.bias = None
    
    # initializer
    self._is_initialized = False
    self.seed = seed
    self.initializer = initializer
    
    # keep track of last output
    self.outputs = {
        "z": None
    }
  
  def _initialize(self):
    """
      Weights and bias initialization
    
    """
    np.random.seed(self.seed)
    self.weights = self.initializer([self.n_inputs, self.n_outputs])
    self.bias = self.initializer([1, self.n_outputs])
    
  def forward_pass(self, X):
    """
      Evaluate output of forward pass
      Lazy initialization on weights and bias
      
      Parameters
      ----------
      X: numpy array [batch_size, features]
      
      Output
      ------
      numpy array X * W + b
    
    """
    if self._is_initialized is False:
      self.n_inputs = X.shape[1]
      self._initialize()
    
    z = np.dot(X, self.weights) + self.bias  
    self.outputs['z'] = np.copy(z)
    
    return z
  
  def backward_pass(self):
    pass

In [0]:
class SigmoidLayer():
  
  def __init__(self):
    # keep track of last output
    self.outputs = {
        "z": None
    }
  
  def forward_pass(self, X):
    """
      Apply pointwise sigmoid to inputs
      
      Parameters
      ----------
      X: numpy array [batch_size, ...]
      
      Output
      ------
      numpy array dimension: input dimension 
    
    """
    z = 1.0 / (1.0 + np.exp(-X))
    
    self.outputs['z'] = z
    
    return z
    
  def backward_pass(self):
    pass

## Softmax
Practical implementation, to overcome numerical instabilities. 
>$\mbox{softmax} : \mathcal{R}^{(n, m)}\rightarrow \mathcal{R}^{(n, m)}$

>$[\mbox{softmax}(X)]_{i,j} = \frac{
 e^{X_{i,j}}
 }
 {
 \sum_j e^{X_{i,j}}
 } = 
 \frac{
 C e^{X_{i,j}}
 }
 {
 C \sum_j e^{X_{i,j}}
 }
 =
 \frac{
 e^{X_{i,j} + \log C_i }
 }
 {
 \sum_j e^{X_{i,j}+ \log C_i}
 }
 $
 
with 
>$\log C_i = \max_j X_{i,j}$

In [0]:
def softmax(X):
  """
    Evaluate softmax along axis=1
    
    Parameters
    ----------
    X: numpy array [batch_size, classes]
    
    Output
    ------
    softmax along axis=1
  """
  num = np.exp( logits - np.expand_dims(np.max(logits,axis=1), axis=1) )
  z = num / np.expand_dims(np.sum(num, axis=1), axis=1)
    
  return z

## Cross Entropy
Given $n$ batch size,  $c$ classes:
> $p \in \mathcal{R}^{(n, c)} $: output of NN in terms of probability per class

> $l \in \mathcal{R}^{(n, 1)} $: labels

> $t \in \mathcal{R}^{(n, c)} $: onehot of labels

>$ E( p; l ) = \frac{1}{n} \sum_{i} \sum_j t_{i,j} \log(p_{i,j}) = \frac{1}{n} \sum_i \log(p_{i,l_{i}})$

##Cross Entropy with logits

>$ E( X; l ) = \frac{1}{n} \sum_i \log(p_{i,l_{i}}) = \frac{1}{n} \sum_i \log \frac{
 e^{X_{i,l_i}}
 }
 {
 \sum_j e^{X_{i,j}}
 }  $
 
 >$ = \frac{1}{n} \sum_i \left( \log(e^{X_{i,l_i}}) - \log (\sum_j e^{X_{i,j}}) \right) = $
 
 > $ = \frac{1}{n} \sum_i \left( X_{i,l_i} - \log (\sum_j e^{X_{i,j}}) \right) $


In [0]:
def cross_entropy(p, labels):
  """
    Evaluate cross entropy
    
    Parameters
    ----------
    p: numpy array [batch_size, classes]
    labels: numpy array [batch_size, 1]
    
    Output
    ------
    scalar
  """
  batch_size = p.shape[0]
      
  log_likelihood = -np.log(p[range(batch_size), labels])
  loss = np.mean(log_likelihood)
  
  return loss
  
  
def cross_entropy_with_logits(logits, labels):
  """
    Evaluate cross entropy
    
    Parameters
    ----------
    p: numpy array [batch_size, classes]
    labels: numpy array [batch_size, 1]
    
    Output
    ------
    scalar
  """
  loss = - np.mean(logits[range(batch_size), labels] - np.log(np.sum(np.exp(logits), axis=1)))
  
  return loss

In [7]:
 # Load dataset
from keras.datasets import mnist

(X_train, y_train), (X_test, y_test) = mnist.load_data()

# reshape to have one row per observation
X_train = X_train.reshape(-1, 28*28)
X_test = X_test.reshape(-1, 28*28)

# reshape to have one column labels
y_train = np.expand_dims(y_train, axis=1)
y_test = np.expand_dims(y_test, axis=1)  

Using TensorFlow backend.


In [0]:
def forward_pass(net, batch):
   # forward pass
  output = batch
  for layer in net:
    output = layer.forward_pass(output)
      
  return output

In [0]:
net = []

net.append(FullyConnectedLayer(10, seed=42, initializer=(lambda x: 0.05*np.random.standard_normal(x)) ))
# net.append(SigmoidLayer())

In [21]:
batch_size = 1000
batch_features = X_train[:batch_size]
batch_labels = y_train[:batch_size]


logits = forward_pass(net, batch_features)
p = softmax(logits)
loss = cross_entropy(p, batch_labels)

loss2 = cross_entropy_with_logits(logits, batch_labels)

print(loss - loss2)

-7.284492653525376e-08


In [0]:
def epoch_batcher(features, labels, batch_size=100):
    # Provide chunks one by one
    start = 0
    N = len(features)
    while start < N:
        rows = range(start,start+batch_size)
        start += batch_size
        yield features[rows], labels[rows,:]

In [28]:
n_epochs=2
batch_size = 2000

net = []
net.append(FullyConnectedLayer(10, seed=42, initializer=(lambda x: 0.01*np.random.standard_normal(x)) ))


for epoch in range(n_epochs):
  for idx, (features, labels) in enumerate(epoch_batcher(X_train, y_train, batch_size=batch_size)):
    
    logits = forward_pass(net, features)
    loss = cross_entropy_with_logits(logits, labels)

    print('epoch {: 4d}, batch {: 4d} loss {:6.4e}'.format(epoch,idx,loss))

epoch    0, batch    0 loss 3.3368e+01
epoch    0, batch    1 loss 3.4153e+01
epoch    0, batch    2 loss 3.4138e+01
epoch    0, batch    3 loss 3.3276e+01
epoch    0, batch    4 loss 3.4029e+01
epoch    0, batch    5 loss 3.4354e+01
epoch    0, batch    6 loss 3.4320e+01
epoch    0, batch    7 loss 3.3618e+01
epoch    0, batch    8 loss 3.3855e+01
epoch    0, batch    9 loss 3.4344e+01
epoch    0, batch   10 loss 3.3890e+01
epoch    0, batch   11 loss 3.4459e+01
epoch    0, batch   12 loss 3.4357e+01
epoch    0, batch   13 loss 3.3758e+01
epoch    0, batch   14 loss 3.4335e+01
epoch    0, batch   15 loss 3.3450e+01
epoch    0, batch   16 loss 3.4032e+01
epoch    0, batch   17 loss 3.3693e+01
epoch    0, batch   18 loss 3.4438e+01
epoch    0, batch   19 loss 3.3808e+01
epoch    0, batch   20 loss 3.3207e+01
epoch    0, batch   21 loss 3.3785e+01
epoch    0, batch   22 loss 3.3605e+01
epoch    0, batch   23 loss 3.3775e+01
epoch    0, batch   24 loss 3.4288e+01
epoch    0, batch   25 lo