In [None]:
from keras.layers import Dense, InputLayer
from keras.models import Sequential, Model
from keras.utils import to_categorical

from keras.utils.np_utils import to_categorical
from keras.optimizers import Adam, SGD
import numpy as np
import matplotlib as mpl
from talk_plottingutils import plot_3d
%pylab
%matplotlib inline

# Introduction

A multilayer perceptron is a simple extension of logistic regression:
## logistic regression: 
- $p_i = \sigma(W x_i + b)$
- $y_i \sim Bernoulli(p_i)$

- mapping the input directly to output: 
- **linear** decision boundary
![logreg](images/MLP/logreg.png)

## Multilayer perceptron:
- we add another layer in between input and output
- this **hidden layer** allows us to learn a different **representation of the input data**
- the data might be linearly separable in the hidden layer, even though it wasn't in the input space
![mlp](images/MLP/MLP.png)


# A new dataset
not linearly separable, so logistic regression will fail

In [None]:
from talk_utils import create_nonlin_data

# Toy 2D classification data; presumably the argument is the number of
# samples -- TODO confirm against talk_utils.create_nonlin_data.
X, y = create_nonlin_data(1000)

# Scatter the two input features, colored by label (blue/white/red colormap).
plt.scatter(X[:, 0], X[:, 1], c=y, alpha=0.2, cmap=plt.cm.bwr)
plt.xlabel('x1')
plt.ylabel('x2');

# Keras
Instead of implementing the MLP from scratch (calculating all the gradients etc), let's use [keras](https://keras.io), a high level library for NNs.

In [None]:
def MLP_factory():
    """Build a fresh, uncompiled multilayer perceptron.

    Architecture: 2 input features -> 3 sigmoid hidden units -> 2 softmax
    outputs (one probability per class).

    Returns:
        A keras ``Sequential`` model whose layers are named 'input_layer',
        'hidden_layer' and 'output_layer', so they can be retrieved with
        ``get_layer``. The model still has to be compiled before fitting.
    """
    layers = [
        # input layer: two features per sample
        InputLayer(input_shape=(2,), name='input_layer'),
        # hidden layer with 3 neurons/units
        Dense(units=3, name='hidden_layer', activation='sigmoid'),
        # output layer, outputs two class probabilities
        Dense(units=2, name='output_layer', activation='softmax'),
    ]
    return Sequential(layers)

# Create a new (untrained, uncompiled) model instance ...
MLP = MLP_factory() 
# ... and print its layer-by-layer overview.
MLP.summary()

Before we fit it, we must 'compile' the model and define the type of **optimization algorithm** to minimize the loss.
<img src="images/MLP/gradient_descent.png" alt="Gradient descent" style="width:400px;" title="http://dsdeepdive.blogspot.com/2016/03/optimizations-of-gradient-descent.html"/>


In [None]:
# Attach the loss to minimize, the optimizer that minimizes it, and the
# metrics to report during training.
MLP.compile(
    optimizer='adam',  # or 'sgd' as the simplest case
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train for 50 epochs on one-hot encoded labels; the returned History object
# holds the per-epoch loss/metric values used for plotting.
training_history = MLP.fit(
    x=X,
    y=to_categorical(y),
    epochs=50,
    verbose=1,
)

In [None]:
# Plot loss and accuracy per epoch, side by side.
history = training_history.history

# Keras < 2.3 logs the accuracy metric under 'acc'; newer releases use the
# name passed to compile(), i.e. 'accuracy'. Support both.
accuracy_key = 'acc' if 'acc' in history else 'accuracy'

# Explicit axes instead of the bare `subplot` injected by %pylab.
fig, (ax_loss, ax_acc) = plt.subplots(1, 2, figsize=(15, 5))

ax_loss.plot(training_history.epoch, history['loss'])
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')

ax_acc.plot(training_history.epoch, history[accuracy_key])
ax_acc.set_xlabel('Epoch')
ax_acc.set_ylabel('Accuracy');

**Task 1**: Visualize the decision boundary (Hint: `MLP.predict()`)

In [None]:
# TODO: your code here -- visualize the decision boundary (hint: MLP.predict())
...

**Solution 1**

In [None]:
%load solutions/mlp-01.py

# The latent representation
<img src="images/MLP/MLP.png" alt="MLP" style="width:300px;">

As discussed, the key is the latent representation, i.e. the hidden layer of the MLP.
Let's have a look at it.

In [None]:
# Accessing intermediate activations takes a detour: wrap the trained layers
# in a second model that starts at the input and stops at the hidden layer.
input_tensor = MLP.get_layer('input_layer').input
hidden_tensor = MLP.get_layer('hidden_layer').output
Hmodel = Model(inputs=input_tensor, outputs=hidden_tensor)

# Latent representation of the data (the hidden layer has 3 units).
h = Hmodel.predict(X)
h.shape

here's how it looks:

In [None]:
# Plot the hidden-layer activations `h` together with the labels `y`
# (presumably a 3D scatter colored by class -- see talk_plottingutils.plot_3d).
plot_3d(h, y)

- the MLP **learned a representation** where all red datapoints are pushed into one corner of the cube.
- in this 3D representation, the two classes are **linearly separable**

## The latent representation during training
How does this latent representation change during training?
Here's a slightly hacky way to do step-by-step gradient descent with keras while recording the hidden layer (better: use *callbacks*)

In [None]:
# Start over with an untrained model so the recording begins at the random
# initialization.
MLP = MLP_factory()
MLP.compile(loss='categorical_crossentropy',
            optimizer=Adam(0.005),
            metrics=['accuracy'])

# The probe model shares its layers (and therefore its weights) with MLP, so
# it only needs to be built once: every predict() call sees the current weights.
Hmodel = Model(inputs=MLP.get_layer('input_layer').input,
               outputs=MLP.get_layer('hidden_layer').output)

# One-hot labels do not change between epochs; encode them once.
y_onehot = to_categorical(y)

# h_vector[k] is the latent representation of X *before* training epoch k.
h_vector = []
for epoch in range(40):
    # record the current latent (hidden-layer) representation of the data
    h_vector.append(Hmodel.predict(X))

    # train for one epoch (a full pass over the data in minibatches,
    # not a single gradient step)
    MLP.fit(X, y_onehot, epochs=1)

# alternatively: step by step GD in tensorflow
but using keras to define the layers. In tf, one would have to first implement the layer operations...

In [None]:
import tensorflow as tf
from keras import backend as K
# keras.objectives is only a legacy alias of keras.losses
from keras.losses import categorical_crossentropy

# Make keras build its layers inside our own TensorFlow session.
sess = tf.Session()
K.set_session(sess)

## model definition
# input placeholder: any number of samples with 2 features each
img = tf.placeholder(tf.float32, shape=(None, 2))
H = Dense(3, activation='sigmoid', name='hidden_layer')(img)
# Predicted class probabilities. Softmax (not sigmoid) so the outputs form a
# proper distribution, matching the output layer built by MLP_factory and the
# categorical cross-entropy loss below.
preds = Dense(2, activation='softmax', name='output_layer')(H)
# here we feed in the true (one-hot encoded) labels
labels = tf.placeholder(tf.float32, shape=(None, 2))

# mean cross-entropy between the true labels and the predictions
loss = tf.reduce_mean(categorical_crossentropy(labels, preds))

# define a single step of gradient descent on the loss
train_step = tf.train.AdamOptimizer(0.1).minimize(loss)

In [None]:
# Initialize all variables (the weights of the two Dense layers and the
# optimizer state).
init_op = tf.global_variables_initializer()
sess.run(init_op)

# The training inputs are identical for every step, so encode the labels and
# build the feed dict once instead of on each iteration.
y_onehot = to_categorical(y, 2)
train_feed = {img: X, labels: y_onehot}

# Run training loop: h_vector[k] is the hidden-layer activation of X
# *before* gradient step k.
h_vector = []
for i in range(100):
    h_vector.append(sess.run(H, feed_dict={img: X}))
    # one full-batch gradient step on the loss
    sess.run(train_step, feed_dict=train_feed)

# Animation
using holoviews to display the latent representation over time

In [None]:
import holoviews as hv
hv.extension('matplotlib')


def _class_overlay(latent):
    """Overlay 3D scatters of the first 200 latent points of class 0 and class 1."""
    class0 = hv.Scatter3D(latent[y == 0][:200])
    class1 = hv.Scatter3D(latent[y == 1][:200])
    return class0 * class1


# One overlay per recorded snapshot (at most the first 200), keyed by its index.
scatter_dict = dict(
    (step, _class_overlay(latent)) for step, latent in enumerate(h_vector[:200])
)
hmap = hv.HoloMap(scatter_dict, kdims=['Epoch'])
hmap

# Deep Neural Net

- just add more layers between input and output
- more nonlinearities

![DNN](images/MLP/DNN.png)

## GoogLeNet
![GoogLeNet](images/MLP/googlenet.png)
