# RPROP Demo
This notebook demonstrates the usage of the RPROP optimizer using two examples: an autoencoder and a simple network trained on the Fashion MNIST dataset.
The RPROP optimizer updates the parameters based on the direction (sign) of the gradient, but not its magnitude. In each iteration it takes one step in the direction opposite to the gradient. The step size is small in the beginning and is increased in each iteration until the gradient changes its direction, which means that a minimum was crossed. Then the last step is reverted and the step size is decreased for the next step. After that, the step size is increased again until the gradient changes its sign again.

In [None]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
%matplotlib notebook

from scipy import interpolate
from mpl_toolkits.mplot3d import Axes3D
from sklearn.model_selection import train_test_split

# import the RPROP optimizer
from psipy.nn.optimizers.rprop import RpropPlus, iRpropPlus

# Import the autoencoder.
from psipy.dataroom.internal.autoencoder import FullyConnectedAutoencoder

# Set the random seed.
from numpy.random import seed
seed(10)
from tensorflow import set_random_seed
set_random_seed(10)

## Fashion MNIST Dataset

In [None]:
# Import the Fashion MNIST dataset
fashion_mnist = keras.datasets.fashion_mnist

# load_data() returns the training and the test split, each as an (images, labels) pair.
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()

# Human-readable names of the ten classes.
# NOTE(review): presumably ordered so that the list index matches the integer label - confirm.
class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat',
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

In [None]:
# The data must be preprocessed before training the network. The pixel values fall in the range of 0 to 255.
# Scale the values to a range of 0 to 1 before feeding them to the neural network model by dividing
# them by 255.
# NOTE: both names are rebound to their scaled versions, so re-running this cell without re-running
# the loading cell first divides the data by 255 a second time.

train_images = train_images / 255.0
test_images = test_images / 255.0

In [None]:
# Assemble the classifier layer by layer: flatten each 28x28 image, pass it
# through one hidden ReLU layer and finish with a 10-way softmax output.
model = keras.Sequential()
model.add(keras.layers.Flatten(input_shape=(28, 28)))
model.add(keras.layers.Dense(128, activation=tf.nn.relu))
model.add(keras.layers.Dense(10, activation=tf.nn.softmax))

# RPROP drives the training; RpropPlus() can be swapped for iRpropPlus().
model.compile(
    optimizer=RpropPlus(),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Full-batch training: the batch size equals the number of training images,
# so every epoch performs exactly one RPROP update.
model.fit(
    x=train_images,
    y=train_labels,
    batch_size=len(train_images),
    epochs=10,
)

# Measure loss and accuracy on the held-out test images.
test_loss, test_acc = model.evaluate(x=test_images, y=test_labels, verbose=2)
print('\nTest accuracy:', test_acc)

### Comparison with same model, but Adam optimizer and minibatches

In [None]:
# Build the same network architecture as for the RPROP run.
model2 = keras.Sequential()
model2.add(keras.layers.Flatten(input_shape=(28, 28)))
model2.add(keras.layers.Dense(128, activation=tf.nn.relu))
model2.add(keras.layers.Dense(10, activation=tf.nn.softmax))

# Use Adam as optimizer in the model
model2.compile(
    optimizer='Adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train with mini-batches, NOT with the full batch: 32 is the Keras default batch size and is
# spelled out here to make the contrast to the full-batch RPROP run explicit.
model2.fit(train_images, train_labels, batch_size=32, epochs=10)

# Evaluate on the held-out test images.
test_loss2, test_acc2 = model2.evaluate(test_images, test_labels, verbose=2)
print('\nTest accuracy:', test_acc2)

## Autoencoder

The following code is copied entirely from the Autoencoder Demo notebook. For a detailed explanation, see the comments there.

In [None]:
# Define methods for data generation.
def cubeND(n):
    """Return the waypoints of a closed, non-intersecting path over the corners of an n-D cube.

    The result is a float array of shape ``(2**n + 1, n)``: one row per corner
    plus a final row that repeats the first corner to close the path.
    """
    base = [0, 1, 1, 0]
    corners = 2 ** n

    # Leading axes: the base pattern is stretched by 2**axis and tiled to fill all corners.
    columns = [
        np.tile(np.repeat(base, 2 ** axis), 2 ** (n - axis - 2))
        for axis in range(n - 1)
    ]
    # Last axis: first half of the corners at 0, second half at 1.
    columns.append(np.repeat(base, 2 ** (n - 1))[:corners])

    path = np.zeros((corners + 1, n))
    path[:corners] = np.column_stack(columns)
    # Return to the start corner.
    path[corners] = path[0]
    return path

def dataCubeND(n, step=0.01, std=0.05):
    """Return noisy data points sampled along the path through an n-D cube.

    The waypoints returned by ``cubeND(n)`` are interpolated linearly and
    sampled every ``step`` units of the waypoint index, i.e. an edge between
    two consecutive waypoints has length 1. Gaussian noise is added to every
    coordinate.

    Args:
        n: Dimensionality of the cube.
        step: Sampling distance along the path, in waypoint-index units.
        std: Standard deviation of the added Gaussian noise.

    Returns:
        Array of shape ``(num_samples, n)`` holding the noisy samples.
    """
    c = cubeND(n)
    last = len(c) - 1
    interp = interpolate.interp1d(range(len(c)), c, axis=0)

    t = np.arange(0, last + step, step)
    # With a float step the length of np.arange is not numerically stable: it
    # may contain one surplus element (about last + step) or overshoot the
    # last waypoint by round-off. Both would be outside the interpolation
    # range and make interp1d raise, so drop the surplus element and clamp the
    # remainder. Whenever the unguarded range was valid this changes nothing.
    t = np.minimum(t[t <= last + step / 2], last)

    d = interp(t)
    return d + np.random.standard_normal(d.shape) * std

In [None]:
# Network settings.
# NOTE(review): topology presumably lists the layer sizes from the 3-D input down to the 2-D code - confirm.
topology = [3, 6, 2]
activation = 'sigmoid'

# Sample the noisy path through the 3-D cube and shift it by 0.5 so that the
# cube is centred on the origin.
data = dataCubeND(3, step=0.01, std=0.01) - 0.5

# Hold out part of the samples for validation.
X_train, X_test = train_test_split(data)

In [None]:
# Set up the autoencoder. Hidden layers and code layer use the same settings,
# but each receives its own dictionary.
# RpropPlus() can be swapped for iRpropPlus().
ae = FullyConnectedAutoencoder(
    optimizer=RpropPlus(),
    topology=topology,
    hidden_layer_kwargs={'activation': activation, 'kernel_initializer': 'random_normal'},
    code_layer_kwargs={'activation': activation, 'kernel_initializer': 'random_normal'},
)

In [None]:
# We start by pretraining the network. Instead of training the whole network at once, pretraining
# trains individual layers, adding layers gradually and retraining until all network layers have been added.
# It acts as a 'warm up' for the actual training, as the network already has some sense of 'direction'.
# The training runs for the given number of epochs; in each epoch every training sample in X_train
# is fed to the network.
# In contrast to the standard optimizer used in the Autoencoder Demo notebook, the RPROP optimizer
# needs no learning rate lr.
# The RPROP optimizer works on the full batch, so batch_size is set to len(X_train). If it is not
# specified, the default batch size would be 32.


val_error = ae.pretrain(X_train, validation_data=(X_test, X_test), batch_size=len(X_train), epochs=1000)

In [None]:
# The actual training of the network, again on the full batch as required by RPROP.
# NOTE: this rebinds val_error, replacing the value returned by ae.pretrain.
val_error = ae.fit(X_train, validation_data=(X_test, X_test), batch_size=len(X_train), epochs=15000)

### Evaluation
The following cells evaluate the trained network.

In [None]:
# Transform data to the learned lower dimensional embedding.
# Note that the complete data set (training and test samples) is transformed here.
embedding = ae.transform(data)

# Try to reconstruct the original higher dimensional data from the learned lower dimensional embedding.
rec = ae.inverse_transform(embedding)

In [None]:
# We now plot the original data, the learned lower dimensional embedding and the data
# reconstructed from only the lower dimensional embedding.
# The hope is that the original data and the reconstructed data look very similar.
# Looking at the learned embedding can also give some insight into what the network learns.

# plt.get_cmap is used instead of plt.cm.get_cmap: the latter is deprecated and has been
# removed from newer matplotlib releases, while plt.get_cmap works in old and new ones.
cm = plt.get_cmap('RdYlGn')
fig = plt.figure(figsize=(4.5, 12))

# Every sample is coloured by its index, so the same colour marks the same sample in all three plots.
sample_index = np.arange(len(data))

# Plot the original data.
ax = fig.add_subplot(311, projection='3d')
ax.scatter(data[:, 0], data[:, 1], zs=data[:, 2], c=sample_index, cmap=cm)
ax.set_title('Original')

# Plot the learned lower dimensional embedding.
ax = fig.add_subplot(312)
ax.scatter(embedding[:, 0], embedding[:, 1], c=sample_index, cmap=cm)
ax.set_title('Embedding')

# Plot the data reconstructed from the learned lower dimensional embedding.
ax = fig.add_subplot(313, projection='3d')
ax.scatter(rec[:, 0], rec[:, 1], zs=rec[:, 2], c=sample_index, cmap=cm)
ax.set_title('Reconstruction')

fig.tight_layout()