From cdbe259939fd33561ed4cf728103c415ae65b503 Mon Sep 17 00:00:00 2001 From: Sanuj Date: Wed, 8 Jun 2016 14:10:53 +0530 Subject: [PATCH] add setters/getters in nn and autoencoders --- .../neuralnets/autoencoders.ipynb | 949 +++++++-------- .../neuralnets/neuralnets_digits.ipynb | 1034 +++++++++-------- src/shogun/io/NeuralNetworkFileReader.cpp | 28 +- src/shogun/neuralnets/Autoencoder.h | 63 +- src/shogun/neuralnets/DeepAutoencoder.cpp | 24 +- src/shogun/neuralnets/NeuralNetwork.h | 310 ++++- .../io/NeuralNetworkFileReader_unittest.cc | 24 +- .../unit/neuralnets/NeuralNetwork_unittest.cc | 38 +- 8 files changed, 1381 insertions(+), 1089 deletions(-) diff --git a/doc/ipython-notebooks/neuralnets/autoencoders.ipynb b/doc/ipython-notebooks/neuralnets/autoencoders.ipynb index ed5b1a0c68d..4539efef4bd 100644 --- a/doc/ipython-notebooks/neuralnets/autoencoders.ipynb +++ b/doc/ipython-notebooks/neuralnets/autoencoders.ipynb @@ -1,471 +1,484 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Deep Autoencoders" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### by Khaled Nasr as a part of a GSoC 2014 project mentored by Theofanis Karaletsos and Sergey Lisitsyn " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook illustrates how to train and evaluate a deep autoencoder using Shogun. We'll look at both regular fully-connected autoencoders and convolutional autoencoders." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "A (single layer) [autoencoder](http://deeplearning.net/tutorial/dA.html#autoencoders) is a neural network that has three layers: an input layer, a hidden (encoding) layer, and a decoding layer. 
The network is trained to reconstruct its inputs, which forces the hidden layer to try to learn good representations of the inputs.\n", + "\n", + "In order to encourage the hidden layer to learn good input representations, certain variations on the simple autoencoder exist. Shogun currently supports two of them: Denoising Autoencoders [1] and Contractive Autoencoders [2]. In this notebook we'll focus on denoising autoencoders. \n", + "\n", + "For denoising autoencoders, each time a new training example is introduced to the network, it's randomly corrupted in some manner, and the target is set to the original example. The autoencoder will try to recover the original data from its noisy version, which is why it's called a denoising autoencoder. This process will force the hidden layer to learn a good representation of the input, one which is not affected by the corruption process.\n", + "\n", + "A deep autoencoder is an autoencoder with multiple hidden layers. Training such autoencoders directly is usually difficult, however, they can be pre-trained as a stack of single layer autoencoders. That is, we train the first hidden layer to reconstruct the input data, and then train the second hidden layer to reconstruct the states of the first hidden layer, and so on. After pre-training, we can train the entire deep autoencoder to fine-tune all the parameters together. We can also use the autoencoder to initialize a regular neural network and train it in a supervised manner.\n", + "\n", + "In this notebook we'll apply deep autoencoders to the USPS dataset for handwritten digits. 
We'll start by loading the data and dividing it into a training set and a test set:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%pylab inline\n", + "%matplotlib inline\n", + "from scipy.io import loadmat\n", + "from modshogun import RealFeatures, MulticlassLabels, Math\n", + "\n", + "# load the dataset\n", + "dataset = loadmat('../../../data/multiclass/usps.mat')\n", + "\n", + "Xall = dataset['data']\n", + "# the usps dataset has the digits labeled from 1 to 10 \n", + "# we'll subtract 1 to make them in the 0-9 range instead\n", + "Yall = np.array(dataset['label'].squeeze(), dtype=np.double)-1 \n", + "\n", + "# 4000 examples for training\n", + "Xtrain = RealFeatures(Xall[:,0:4000])\n", + "Ytrain = MulticlassLabels(Yall[0:4000])\n", + "\n", + "# the rest for testing\n", + "Xtest = RealFeatures(Xall[:,4000:-1])\n", + "Ytest = MulticlassLabels(Yall[4000:-1])\n", + "\n", + "# initialize the random number generator with a fixed seed, for repeatability\n", + "Math.init_random(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating the autoencoder" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Similar to regular neural networks in Shogun, we create a [deep autoencoder](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html) using an array of [NeuralLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralLayer.html)-based classes, which can be created using the utility class [NeuralLayers](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralLayers.html). However, for deep autoencoders there's a restriction that the layer sizes in the network have to be symmetric, that is, the first layer has to have the same size as the last layer, the second layer has to have the same size as the second-to-last layer, and so on. 
This restriction is necessary for pre-training to work. More details on that can be found in the following section.\n", + "\n", + "We'll create a 5-layer deep autoencoder with the following layer sizes: 256->512->128->512->256. We'll use [rectified linear neurons](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralRectifiedLinearLayer.html) for the hidden layers and [linear neurons](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralLinearLayer.html) for the output layer." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from modshogun import NeuralLayers, DeepAutoencoder\n", + "\n", + "layers = NeuralLayers()\n", + "layers = layers.input(256).rectified_linear(512).rectified_linear(128).rectified_linear(512).linear(256).done()\n", + "\n", + "ae = DeepAutoencoder(layers)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Pre-training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can pre-train the network. To illustrate exactly what's going to happen, we'll give the layers some labels: L1 for the input layer, L2 for the first hidden layer, and so on up to L5 for the output layer.\n", + "\n", + "In pre-training, an autoencoder will be formed for each encoding layer (layers up to the middle layer in the network). So here we'll have two autoencoders: L1->L2->L5, and L2->L3->L4. The first autoencoder will be trained on the raw data and used to initialize the weights and biases of layers L2 and L5 in the deep autoencoder. After the first autoencoder is trained, we use it to transform the raw data into the states of L2. 
These states will then be used to train the second autoencoder, which will be used to initialize the weights and biases of layers L3 and L4 in the deep autoencoder.\n", + "\n", + "The operations described above are performed by the [pre_train()](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#acf6896cb166afbba063fd1257cb8bc97) function. Pre-training parameters for each autoencoder can be controlled using the [pt_* public attributes](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#a6389a6f19b8854c64e1b6be5aa0c1fc4) of [DeepAutoencoder](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html). Each of those attributes is an [SGVector](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1SGVector.html) whose length is the number of autoencoders in the deep autoencoder (2 in our case). It can be used to set the parameters for each autoencoder individually. [SGVector's set_const()](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1SGVector.html#a8bce01a1fc41a734d9b5cf1533fd7a2a) method can also be used to assign the same parameter value for all autoencoders.\n", + "\n", + "Different noise types can be used to corrupt the inputs in a denoising autoencoder. Shogun currently supports 2 [noise types](http://www.shogun-toolbox.org/doc/en/latest/namespaceshogun.html#af95cf5d3778127a87c8a67516405d863): dropout noise, where a random portion of the inputs is set to zero at each iteration in training, and gaussian noise, where the inputs are corrupted with random gaussian noise. The noise type and strength can be controlled using [pt_noise_type](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#af6e5d2ade5cb270cc50565d590f929ae) and [pt_noise_parameter](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#adbdff6c07fa7dd70aaf547e192365075). Here, we'll use dropout noise." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from modshogun import AENT_DROPOUT, NNOM_GRADIENT_DESCENT\n", + "\n", + "ae.pt_noise_type.set_const(AENT_DROPOUT) # use dropout noise\n", + "ae.pt_noise_parameter.set_const(0.5) # each input has a 50% chance of being set to zero\n", + "\n", + "ae.pt_optimization_method.set_const(NNOM_GRADIENT_DESCENT) # train using gradient descent\n", + "ae.pt_gd_learning_rate.set_const(0.01)\n", + "ae.pt_gd_mini_batch_size.set_const(128)\n", + "\n", + "ae.pt_max_num_epochs.set_const(50)\n", + "ae.pt_epsilon.set_const(0.0) # disable automatic convergence testing\n", + "\n", + "# uncomment this line to allow the training progress to be printed on the console\n", + "#from modshogun import MSG_INFO; ae.io.set_loglevel(MSG_INFO)\n", + "\n", + "# start pre-training. this might take some time\n", + "ae.pre_train(Xtrain)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Fine-tuning" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "After pre-training, we can train the autoencoder as a whole to fine-tune the parameters. Training the whole autoencoder is performed using the [train()](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CAutoencoder.html#ace3eb6cc545affcbfa31d754ffd087dc) function. Training parameters are controlled through setter methods such as set_noise_type() and set_max_num_epochs() (formerly [public attributes](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#pub-attribs)), same as a regular neural network." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "ae.set_noise_type(AENT_DROPOUT) # same noise type we used for pre-training\n", + "ae.set_noise_parameter(0.5)\n", + "\n", + "ae.set_max_num_epochs(50)\n", + "ae.set_optimization_method(NNOM_GRADIENT_DESCENT)\n", + "ae.set_gd_mini_batch_size(128)\n", + "ae.set_gd_learning_rate(0.0001)\n", + "ae.set_epsilon(0.0)\n", + "\n", + "# start fine-tuning. this might take some time\n", + "_ = ae.train(Xtrain)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can evaluate the autoencoder that we trained. We'll start by providing it with corrupted inputs and looking at how it will reconstruct them. The function [reconstruct()](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#ae8c2d565cf2ea809103d0557c57689c7) is used to obtain the reconstructions:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# get a 50-example subset of the test set\n", + "subset = Xtest[:,0:50].copy()\n", + "\n", + "# corrupt the first 25 examples with multiplicative noise\n", + "subset[:,0:25] *= (random.random((256,25))>0.5)\n", + "\n", + "# corrupt the other 25 examples with additive noise \n", + "subset[:,25:50] += random.random((256,25))\n", + "\n", + "# obtain the reconstructions\n", + "reconstructed_subset = ae.reconstruct(RealFeatures(subset))\n", + "\n", + "# plot the corrupted data and the reconstructions\n", + "figure(figsize=(10,10))\n", + "for i in range(50):\n", + " ax1=subplot(10,10,i*2+1)\n", + " ax1.imshow(subset[:,i].reshape((16,16)), interpolation='nearest', cmap = cm.Greys_r)\n", + " ax1.set_xticks([])\n", + " ax1.set_yticks([])\n", + "\n", + " ax2=subplot(10,10,i*2+2)\n", + " 
ax2.imshow(reconstructed_subset[:,i].reshape((16,16)), interpolation='nearest', cmap = cm.Greys_r)\n", + " ax2.set_xticks([])\n", + " ax2.set_yticks([])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The figure shows the corrupted examples and their reconstructions. The top half of the figure shows the ones corrupted with multiplicative noise, the bottom half shows the ones corrupted with additive noise. We can see that the autoencoders can provide decent reconstructions despite the heavy noise.\n", + "\n", + "Next we'll look at the weights that the first hidden layer has learned. To obtain the weights, we can call the [get_layer_parameters()]() function, which will return a vector containing both the weights and the biases of the layer. The biases are stored first in the array followed by the weights matrix in column-major format." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# obtain the weights matrix of the first hidden layer\n", + "# the 512 is the number of biases in the layer (512 neurons)\n", + "# the transpose is because numpy stores matrices in row-major format, and Shogun stores \n", + "# them in column major format\n", + "w1 = ae.get_layer_parameters(1)[512:].reshape(256,512).T\n", + "\n", + "# visualize the weights between the first 100 neurons in the hidden layer \n", + "# and the neurons in the input layer\n", + "figure(figsize=(10,10))\n", + "for i in range(100):\n", + "\tax1=subplot(10,10,i+1)\n", + "\tax1.imshow(w1[i,:].reshape((16,16)), interpolation='nearest', cmap = cm.Greys_r)\n", + "\tax1.set_xticks([])\n", + "\tax1.set_yticks([])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now, we can use the autoencoder to initialize a supervised neural network. The network will have all the layer of the autoencoder up to (and including) the middle layer. We'll also add a softmax output layer. 
So, the network will look like: L1->L2->L3->Softmax. The network is obtained by calling [convert_to_neural_network()](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#a8c179cd9a503b2fa78b9bfe10ae473e5):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from modshogun import NeuralSoftmaxLayer\n", + "\n", + "nn = ae.convert_to_neural_network(NeuralSoftmaxLayer(10))\n", + "\n", + "nn.set_max_num_epochs(50)\n", + "\n", + "nn.set_labels(Ytrain)\n", + "_ = nn.train(Xtrain)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Next, we'll evaluate the accuracy on the test set:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from modshogun import MulticlassAccuracy\n", + "\n", + "predictions = nn.apply_multiclass(Xtest)\n", + "accuracy = MulticlassAccuracy().evaluate(predictions, Ytest) * 100\n", + "\n", + "print \"Classification accuracy on the test set =\", accuracy, \"%\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Convolutional Autoencoders" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Convolutional autoencoders [3] are the adaptation of autoencoders to images (or other spatially-structured data). They are built with convolutional layers where each layer consists of a number of feature maps. Each feature map is produced by convolving a small filter with the layer's inputs, adding a bias, and then applying some non-linear activation function. Additionally, a max-pooling operation can be performed on each feature map by dividing it into small non-overlapping regions and taking the maximum over each region. 
In this section we'll pre-train a [convolutional network](http://deeplearning.net/tutorial/lenet.html) as a stacked autoencoder and use it for classification.\n", + "\n", + "In Shogun, convolutional autoencoders are constructed and trained just like regular autoencoders. Except that we build the autoencoder using [CNeuralConvolutionalLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralConvolutionalLayer.html) objects:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from modshogun import DynamicObjectArray, NeuralInputLayer, NeuralConvolutionalLayer, CMAF_RECTIFIED_LINEAR\n", + "\n", + "conv_layers = DynamicObjectArray()\n", + "# 16x16 single channel images\n", + "conv_layers.append_element(NeuralInputLayer(16,16,1)) \n", + "\n", + "# the first encoding layer: 5 feature maps, filters with radius 2 (5x5 filters)\n", + "# and max-pooling in a 2x2 region: its output will be 10 8x8 feature maps\n", + "conv_layers.append_element(NeuralConvolutionalLayer(CMAF_RECTIFIED_LINEAR, 5, 2, 2, 2, 2)) \n", + "\n", + "# the second encoding layer: 15 feature maps, filters with radius 2 (5x5 filters)\n", + "# and max-pooling in a 2x2 region: its output will be 20 4x4 feature maps\n", + "conv_layers.append_element(NeuralConvolutionalLayer(CMAF_RECTIFIED_LINEAR, 15, 2, 2, 2, 2))\n", + "\n", + "# the first decoding layer: same structure as the first encoding layer\n", + "conv_layers.append_element(NeuralConvolutionalLayer(CMAF_RECTIFIED_LINEAR, 5, 2, 2))\n", + "\n", + "# the second decoding layer: same structure as the input layer\n", + "conv_layers.append_element(NeuralConvolutionalLayer(CMAF_RECTIFIED_LINEAR, 1, 2, 2))\n", + "\n", + "conv_ae = DeepAutoencoder(conv_layers)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we'll pre-train the autoencoder:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + 
"collapsed": false + }, + "outputs": [], + "source": [ + "conv_ae.pt_noise_type.set_const(AENT_DROPOUT) # use dropout noise\n", + "conv_ae.pt_noise_parameter.set_const(0.3) # each input has a 30% chance of being set to zero\n", + "\n", + "conv_ae.pt_optimization_method.set_const(NNOM_GRADIENT_DESCENT) # train using gradient descent\n", + "conv_ae.pt_gd_learning_rate.set_const(0.002)\n", + "conv_ae.pt_gd_mini_batch_size.set_const(100)\n", + "\n", + "conv_ae.pt_max_num_epochs[0] = 30 # max number of epochs for pre-training the first encoding layer\n", + "conv_ae.pt_max_num_epochs[1] = 10 # max number of epochs for pre-training the second encoding layer\n", + "conv_ae.pt_epsilon.set_const(0.0) # disable automatic convergence testing\n", + "\n", + "# start pre-training. this might take some time\n", + "conv_ae.pre_train(Xtrain)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And then convert the autoencoder to a regular neural network for classification:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "conv_nn = ae.convert_to_neural_network(NeuralSoftmaxLayer(10))\n", + "\n", + "# train the network\n", + "conv_nn.set_epsilon(0.0)\n", + "conv_nn.set_max_num_epochs(50)\n", + "conv_nn.set_labels(Ytrain)\n", + "\n", + "# start training. 
this might take some time\n", + "_ = conv_nn.train(Xtrain)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And evaluate it on the test set:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "predictions = conv_nn.apply_multiclass(Xtest)\n", + "accuracy = MulticlassAccuracy().evaluate(predictions, Ytest) * 100\n", + "\n", + "print \"Classification accuracy on the test set =\", accuracy, \"%\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## References" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- [1] [Stacked Denoising Autoencoders: Learning Useful Representations in a Deep Network with a Local Denoising Criterion, Vincent, 2010](http://jmlr.org/papers/volume11/vincent10a/vincent10a.pdf)\n", + "- [2] [Contractive Auto-Encoders: Explicit Invariance During Feature Extraction, Rifai, 2011](http://machinelearning.wustl.edu/mlpapers/paper_files/ICML2011Rifai_455.pdf)\n", + "- [3] [Stacked Convolutional Auto-Encoders for Hierarchical Feature Extraction, J. Masci, 2011](http://www.idsia.ch/~ciresan/data/icann2011.pdf)" + ] + } + ], "metadata": { - "name": "", - "signature": "sha256:726c48e3f4ec34218a2fe322545dc56338feaf4bb7d011c40ca80db3cb18095f" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Deep Autoencoders" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "by Khaled Nasr as a part of a GSoC 2014 project mentored by Theofanis Karaletsos and Sergey Lisitsyn " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook illustrates how to train and evaluate a deep autoencoder using Shogun. We'll look at both regular fully-connected autoencoders and convolutional autoencoders." 
- ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Introduction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "A (single layer) [autoencoder](http://deeplearning.net/tutorial/dA.html#autoencoders) is a neural network that has three layers: an input layer, a hidden (encoding) layer, and a decoding layer. The network is trained to reconstruct its inputs, which forces the hidden layer to try to learn good representations of the inputs.\n", - "\n", - "In order to encourage the hidden layer to learn good input representations, certain variations on the simple autoencoder exist. Shogun currently supports two of them: Denoising Autoencoders [1] and Contractive Autoencoders [2]. In this notebook we'll focus on denoising autoencoders. \n", - "\n", - "For denoising autoencoders, each time a new training example is introduced to the network, it's randomly corrupted in some mannar, and the target is set to the original example. The autoencoder will try to recover the orignal data from it's noisy version, which is why it's called a denoising autoencoder. This process will force the hidden layer to learn a good representation of the input, one which is not affected by the corruption process.\n", - "\n", - "A deep autoencoder is an autoencoder with multiple hidden layers. Training such autoencoders directly is usually difficult, however, they can be pre-trained as a stack of single layer autoencoders. That is, we train the first hidden layer to reconstruct the input data, and then train the second hidden layer to reconstruct the states of the first hidden layer, and so on. After pre-training, we can train the entire deep autoencoder to fine-tune all the parameters together. We can also use the autoencoder to initialize a regular neural network and train it in a supervised manner.\n", - "\n", - "In this notebook we'll apply deep autoencoders to the USPS dataset for handwritten digits. 
We'll start by loading the data and dividing it into a training set and a test set:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pylab inline\n", - "%matplotlib inline\n", - "from scipy.io import loadmat\n", - "from modshogun import RealFeatures, MulticlassLabels, Math\n", - "\n", - "# load the dataset\n", - "dataset = loadmat('../../../data/multiclass/usps.mat')\n", - "\n", - "Xall = dataset['data']\n", - "# the usps dataset has the digits labeled from 1 to 10 \n", - "# we'll subtract 1 to make them in the 0-9 range instead\n", - "Yall = np.array(dataset['label'].squeeze(), dtype=np.double)-1 \n", - "\n", - "# 4000 examples for training\n", - "Xtrain = RealFeatures(Xall[:,0:4000])\n", - "Ytrain = MulticlassLabels(Yall[0:4000])\n", - "\n", - "# the rest for testing\n", - "Xtest = RealFeatures(Xall[:,4000:-1])\n", - "Ytest = MulticlassLabels(Yall[4000:-1])\n", - "\n", - "# initialize the random number generator with a fixed seed, for repeatability\n", - "Math.init_random(10)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Creating the autoencoder" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Similar to regular neural networks in Shogun, we create a [deep autoencoder](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html) using an array of [NeuralLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralLayer.html)-based classes, which can be created using the utility class [NeuralLayers](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralLayers.html). However, for deep autoencoders there's a restriction that the layer sizes in the network have to be symmetric, that is, the first layer has to have the same size as the last layer, the second layer has to have the same size as the second-to-last layer, and so on. 
This restriction is necessary for pre-training to work. More details on that can found in the following section.\n", - "\n", - "We'll create a 5-layer deep autoencoder with following layer sizes: 256->512->128->512->256. We'll use [rectified linear neurons](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralRectifiedLinearLayer.html) for the hidden layers and [linear neurons](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralLinearLayer.html) for the output layer." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from modshogun import NeuralLayers, DeepAutoencoder\n", - "\n", - "layers = NeuralLayers()\n", - "layers = layers.input(256).rectified_linear(512).rectified_linear(128).rectified_linear(512).linear(256).done()\n", - "\n", - "ae = DeepAutoencoder(layers)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Pre-training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can pre-train the network. To illustrate exactly what's going to happen, we'll give the layers some labels: L1 for the input layer, L2 for the first hidden layer, and so on up to L5 for the output layer.\n", - "\n", - "In pre-training, an autoencoder will formed for each encoding layer (layers up to the middle layer in the network). So here we'll have two autoencoders: L1->L2->L5, and L2->L3->L4. The first autoencoder will be trained on the raw data and used to initialize the weights and biases of layers L2 and L5 in the deep autoencoder. After the first autoencoder is trained, we use it to transform the raw data into the states of L2. 
These states will then be used to train the second autoencoder, which will be used to initialize the weights and biases of layers L3 and L4 in the deep autoencoder.\n", - "\n", - "The operations described above are performed by the the [pre_train()](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#acf6896cb166afbba063fd1257cb8bc97) function. Pre-training parameters for each autoencoder can be controlled using the [pt_* public attributes](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#a6389a6f19b8854c64e1b6be5aa0c1fc4) of [DeepAutoencoder](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html). Each of those attributes is an [SGVector](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1SGVector.html) whose length is the number of autoencoders in the deep autoencoder (2 in our case). It can be used to set the parameters for each autoencoder indiviually. [SGVector's set_const()](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1SGVector.html#a8bce01a1fc41a734d9b5cf1533fd7a2a) method can also be used to assign the same parameter value for all autoencoders.\n", - "\n", - "Different noise types can be used to corrupt the inputs in a denoising autoencoder. Shogun currently supports 2 [noise types](http://www.shogun-toolbox.org/doc/en/latest/namespaceshogun.html#af95cf5d3778127a87c8a67516405d863): dropout noise, where a random portion of the inputs is set to zero at each iteration in training, and gaussian noise, where the inputs are corrupted with random gaussian noise. The noise type and strength can be controlled using [pt_noise_type](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#af6e5d2ade5cb270cc50565d590f929ae) and [pt_noise_parameter](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#adbdff6c07fa7dd70aaf547e192365075). Here, we'll use dropout noise." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from modshogun import AENT_DROPOUT, NNOM_GRADIENT_DESCENT\n", - "\n", - "ae.pt_noise_type.set_const(AENT_DROPOUT) # use dropout noise\n", - "ae.pt_noise_parameter.set_const(0.5) # each input has a 50% chance of being set to zero\n", - "\n", - "ae.pt_optimization_method.set_const(NNOM_GRADIENT_DESCENT) # train using gradient descent\n", - "ae.pt_gd_learning_rate.set_const(0.01)\n", - "ae.pt_gd_mini_batch_size.set_const(128)\n", - "\n", - "ae.pt_max_num_epochs.set_const(50)\n", - "ae.pt_epsilon.set_const(0.0) # disable automatic convergence testing\n", - "\n", - "# uncomment this line to allow the training progress to be printed on the console\n", - "#from modshogun import MSG_INFO; ae.io.set_loglevel(MSG_INFO)\n", - "\n", - "# start pre-training. this might take some time\n", - "ae.pre_train(Xtrain)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Fine-tuning" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "After pre-training, we can train the autoencoder as a whole to fine-tune the parameters. Training the whole autoencoder is performed using the [train()](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CAutoencoder.html#ace3eb6cc545affcbfa31d754ffd087dc) function. Training parameters are controlled through the [public attributes](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#pub-attribs), same as a regular neural network." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "ae.noise_type = AENT_DROPOUT # same noise type we used for pre-training\n", - "ae.noise_parameter = 0.5\n", - "\n", - "ae.max_num_epochs = 50\n", - "ae.optimization_method = NNOM_GRADIENT_DESCENT\n", - "ae.gd_mini_batch_size = 128\n", - "ae.gd_learning_rate = 0.0001\n", - "ae.epsilon = 0.0\n", - "\n", - "# start fine-tuning. 
this might take some time\n", - "_ = ae.train(Xtrain)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Evaluation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can evaluate the autoencoder that we trained. We'll start by providing it with corrupted inputs and looking at how it will reconstruct them. The function [reconstruct()](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#ae8c2d565cf2ea809103d0557c57689c7) is used to obtain the reconstructions:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# get a 50-example subset of the test set\n", - "subset = Xtest[:,0:50].copy()\n", - "\n", - "# corrupt the first 25 examples with multiplicative noise\n", - "subset[:,0:25] *= (random.random((256,25))>0.5)\n", - "\n", - "# corrupt the other 25 examples with additive noise \n", - "subset[:,25:50] += random.random((256,25))\n", - "\n", - "# obtain the reconstructions\n", - "reconstructed_subset = ae.reconstruct(RealFeatures(subset))\n", - "\n", - "# plot the corrupted data and the reconstructions\n", - "figure(figsize=(10,10))\n", - "for i in range(50):\n", - " ax1=subplot(10,10,i*2+1)\n", - " ax1.imshow(subset[:,i].reshape((16,16)), interpolation='nearest', cmap = cm.Greys_r)\n", - " ax1.set_xticks([])\n", - " ax1.set_yticks([])\n", - "\n", - " ax2=subplot(10,10,i*2+2)\n", - " ax2.imshow(reconstructed_subset[:,i].reshape((16,16)), interpolation='nearest', cmap = cm.Greys_r)\n", - " ax2.set_xticks([])\n", - " ax2.set_yticks([])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The figure shows the corrupted examples and their reconstructions. The top half of the figure shows the ones corrupted with multiplicative noise, the bottom half shows the ones corrupted with additive noise. 
We can see that the autoencoders can provide decent reconstructions despite the heavy noise.\n", - "\n", - "Next we'll look at the weights that the first hidden layer has learned. To obtain the weights, we can call the [get_layer_parameters()]() function, which will return a vector containing both the weights and the biases of the layer. The biases are stored first in the array followed by the weights matrix in column-major format." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# obtain the weights matrix of the first hidden layer\n", - "# the 512 is the number of biases in the layer (512 neurons)\n", - "# the transpose is because numpy stores matrices in row-major format, and Shogun stores \n", - "# them in column major format\n", - "w1 = ae.get_layer_parameters(1)[512:].reshape(256,512).T\n", - "\n", - "# visualize the weights between the first 100 neurons in the hidden layer \n", - "# and the neurons in the input layer\n", - "figure(figsize=(10,10))\n", - "for i in range(100):\n", - "\tax1=subplot(10,10,i+1)\n", - "\tax1.imshow(w1[i,:].reshape((16,16)), interpolation='nearest', cmap = cm.Greys_r)\n", - "\tax1.set_xticks([])\n", - "\tax1.set_yticks([])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now, we can use the autoencoder to initialize a supervised neural network. The network will have all the layer of the autoencoder up to (and including) the middle layer. We'll also add a softmax output layer. So, the network will look like: L1->L2->L3->Softmax. 
The network is obtained by calling [convert_to_neural_network()](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDeepAutoencoder.html#a8c179cd9a503b2fa78b9bfe10ae473e5):" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from modshogun import NeuralSoftmaxLayer\n", - "\n", - "nn = ae.convert_to_neural_network(NeuralSoftmaxLayer(10))\n", - "\n", - "nn.max_num_epochs = 50\n", - "\n", - "nn.set_labels(Ytrain)\n", - "_ = nn.train(Xtrain)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Next, we'll evaluate the accuracy on the test set:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from modshogun import MulticlassAccuracy\n", - "\n", - "predictions = nn.apply_multiclass(Xtest)\n", - "accuracy = MulticlassAccuracy().evaluate(predictions, Ytest) * 100\n", - "\n", - "print \"Classification accuracy on the test set =\", accuracy, \"%\"" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Convolutional Autoencoders" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Convolutional autoencoders [3] are the adaptation of autoencoders to images (or other spacially-structured data). They are built with convolutional layers where each layer consists of a number of feature maps. Each feature map is produced by convolving a small filter with the layer's inputs, adding a bias, and then applying some non-linear activation function. Additionally, a max-pooling operation can be performed on each feature map by dividing it into small non-overlapping regions and taking the maximum over each region. 
In this section we'll pre-train a [convolutional network](http://deeplearning.net/tutorial/lenet.html) as a stacked autoencoder and use it for classification.\n", - "\n", - "In Shogun, convolutional autoencoders are constructed and trained just like regular autoencoders. Except that we build the autoencoder using [CNeuralConvolutionalLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralConvolutionalLayer.html) objects:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from modshogun import DynamicObjectArray, NeuralInputLayer, NeuralConvolutionalLayer, CMAF_RECTIFIED_LINEAR\n", - "\n", - "conv_layers = DynamicObjectArray()\n", - "# 16x16 single channel images\n", - "conv_layers.append_element(NeuralInputLayer(16,16,1)) \n", - "\n", - "# the first encoding layer: 5 feature maps, filters with radius 2 (5x5 filters)\n", - "# and max-pooling in a 2x2 region: its output will be 10 8x8 feature maps\n", - "conv_layers.append_element(NeuralConvolutionalLayer(CMAF_RECTIFIED_LINEAR, 5, 2, 2, 2, 2)) \n", - "\n", - "# the second encoding layer: 15 feature maps, filters with radius 2 (5x5 filters)\n", - "# and max-pooling in a 2x2 region: its output will be 20 4x4 feature maps\n", - "conv_layers.append_element(NeuralConvolutionalLayer(CMAF_RECTIFIED_LINEAR, 15, 2, 2, 2, 2))\n", - "\n", - "# the first decoding layer: same structure as the first encoding layer\n", - "conv_layers.append_element(NeuralConvolutionalLayer(CMAF_RECTIFIED_LINEAR, 5, 2, 2))\n", - "\n", - "# the second decoding layer: same structure as the input layer\n", - "conv_layers.append_element(NeuralConvolutionalLayer(CMAF_RECTIFIED_LINEAR, 1, 2, 2))\n", - "\n", - "conv_ae = DeepAutoencoder(conv_layers)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we'll pre-train the autoencoder:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - 
"conv_ae.pt_noise_type.set_const(AENT_DROPOUT) # use dropout noise\n", - "conv_ae.pt_noise_parameter.set_const(0.3) # each input has a 30% chance of being set to zero\n", - "\n", - "conv_ae.pt_optimization_method.set_const(NNOM_GRADIENT_DESCENT) # train using gradient descent\n", - "conv_ae.pt_gd_learning_rate.set_const(0.002)\n", - "conv_ae.pt_gd_mini_batch_size.set_const(100)\n", - "\n", - "conv_ae.pt_max_num_epochs[0] = 30 # max number of epochs for pre-training the first encoding layer\n", - "conv_ae.pt_max_num_epochs[1] = 10 # max number of epochs for pre-training the second encoding layer\n", - "conv_ae.pt_epsilon.set_const(0.0) # disable automatic convergence testing\n", - "\n", - "# start pre-training. this might take some time\n", - "conv_ae.pre_train(Xtrain)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And then convert the autoencoder to a regular neural network for classification:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "conv_nn = ae.convert_to_neural_network(NeuralSoftmaxLayer(10))\n", - "\n", - "# train the network\n", - "conv_nn.epsilon = 0.0\n", - "conv_nn.max_num_epochs = 50\n", - "conv_nn.set_labels(Ytrain)\n", - "\n", - "# start training. 
this might take some time\n", - "_ = conv_nn.train(Xtrain)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And evaluate it on the test set:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "predictions = conv_nn.apply_multiclass(Xtest)\n", - "accuracy = MulticlassAccuracy().evaluate(predictions, Ytest) * 100\n", - "\n", - "print \"Classification accuracy on the test set =\", accuracy, \"%\"" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "References" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "- [1] [Stacked Denoising Autoencoders: Learning Useful Representations in a Deep Network with a Local Denoising Criterion, Vincent, 2010](http://jmlr.org/papers/volume11/vincent10a/vincent10a.pdf)\n", - "- [2] [Contractive Auto-Encoders: Explicit Invariance During Feature Extraction, Rifai, 2011](http://machinelearning.wustl.edu/mlpapers/paper_files/ICML2011Rifai_455.pdf)\n", - "- [3] [Stacked Convolutional Auto-Encoders for Hierarchical Feature Extraction, J. 
Masci, 2011](http://www.idsia.ch/~ciresan/data/icann2011.pdf)" - ] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" } - ] -} \ No newline at end of file + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/doc/ipython-notebooks/neuralnets/neuralnets_digits.ipynb b/doc/ipython-notebooks/neuralnets/neuralnets_digits.ipynb index 31f750a13c2..aea3ed864c7 100644 --- a/doc/ipython-notebooks/neuralnets/neuralnets_digits.ipynb +++ b/doc/ipython-notebooks/neuralnets/neuralnets_digits.ipynb @@ -1,513 +1,529 @@ { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Neural Nets for Digit Classification" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### by Khaled Nasr as a part of a GSoC 2014 project mentored by Theofanis Karaletsos and Sergey Lisitsyn " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook illustrates how to use the NeuralNets module to teach a [neural network](http://en.wikipedia.org/wiki/Artificial_neural_network) to recognize digits. It also explores the different optimization and regularization methods supported by the module. [Convolutional neural networks](http://en.wikipedia.org/wiki/Convolutional_neural_network) are also discussed." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Introduction" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "An Artificial Neural Network is a machine learning model that is inspired by the way biological nervous systems, such as the brain, process information. The building block of neural networks is called a neuron. 
All a neuron does is take a weighted sum of its inputs and pass it through some non-linear function (activation function) to produce its output. A (feed-forward) neural network is a bunch of neurons arranged in layers, where each neuron in layer *i* takes its input from all the neurons in layer *i-1*. For more information on how neural networks work, [follow this link](https://www.youtube.com/playlist?list=PL6Xpj9I5qXYEcOhn7TqghAJ6NAPrNmUBH).\n", + "\n", + "In this notebook, we'll look at how a neural network can be used to recognize digits. We'll train the network on the USPS dataset of handwritten digits.\n", + "\n", + "We'll start by loading the data and dividing it into a training set, a validation set, and a test set. The USPS dataset has 9298 examples of handwritten digits. We'll intentionally use just a small portion (1000 examples) of the dataset for training . This is to keep training time small and to illustrate the effects of different regularization methods." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "%pylab inline\n", + "%matplotlib inline\n", + "from scipy.io import loadmat\n", + "from modshogun import RealFeatures, MulticlassLabels, Math\n", + "\n", + "# load the dataset\n", + "dataset = loadmat('../../../data/multiclass/usps.mat')\n", + "\n", + "Xall = dataset['data']\n", + "# the usps dataset has the digits labeled from 1 to 10 \n", + "# we'll subtract 1 to make them in the 0-9 range instead\n", + "Yall = np.array(dataset['label'].squeeze(), dtype=np.double)-1 \n", + "\n", + "# 1000 examples for training\n", + "Xtrain = RealFeatures(Xall[:,0:1000])\n", + "Ytrain = MulticlassLabels(Yall[0:1000])\n", + "\n", + "# 4000 examples for validation\n", + "Xval = RealFeatures(Xall[:,1001:5001])\n", + "Yval = MulticlassLabels(Yall[1001:5001])\n", + "\n", + "# the rest for testing\n", + "Xtest = RealFeatures(Xall[:,5002:-1])\n", + "Ytest = 
MulticlassLabels(Yall[5002:-1])\n", + "\n", + "# initialize the random number generator with a fixed seed, for repeatability\n", + "Math.init_random(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Creating the network" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "To create a neural network in shogun, we'll first create an instance of [NeuralNetwork](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html) and then [initialize](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html#a8ff6d177c3e2d8977e5fc6920d3e1579) it by telling it how many inputs it has and what type of layers it contains. To specify the layers of the network a [DynamicObjectArray](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDynamicObjectArray.html) is used. The array contains instances of [NeuralLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralLayer.html)-based classes that determine the type of neurons each layer consists of. Some of the supported layer types are: [NeuralLinearLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralLinearLayer.html), [NeuralLogisticLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralLogisticLayer.html) and\n", + "[NeuralSoftmaxLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralSoftmaxLayer.html).\n", + "\n", + "We'll create a feed-forward, fully connected (every neuron is connected to all neurons in the layer below) neural network with 2 logistic hidden layers and a softmax output layer. The network will have 256 inputs, one for each pixel (16*16 image). The first hidden layer will have 256 neurons, the second will have 128 neurons, and the output layer will have 10 neurons, one for each digit class. Note that we're using a big network, compared with the size of the training set. This is to emphasize the effects of different regularization methods. 
We'll try training the network with:\n", + "\n", + "* No regularization\n", + "* L2 regularization\n", + "* L1 regularization\n", + "* [Dropout](http://arxiv.org/abs/1207.0580) regularization\n", + "\n", + "Therefore, we'll create 4 versions of the network, train each one of them differently, and then compare the results on the validation set." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from modshogun import NeuralNetwork, NeuralInputLayer, NeuralLogisticLayer, NeuralSoftmaxLayer\n", + "from modshogun import DynamicObjectArray\n", + "\n", + "# setup the layers\n", + "layers = DynamicObjectArray()\n", + "layers.append_element(NeuralInputLayer(256)) # input layer, 256 neurons\n", + "layers.append_element(NeuralLogisticLayer(256)) # first hidden layer, 256 neurons\n", + "layers.append_element(NeuralLogisticLayer(128)) # second hidden layer, 128 neurons\n", + "layers.append_element(NeuralSoftmaxLayer(10)) # output layer, 10 neurons\n", + "\n", + "# create the networks\n", + "net_no_reg = NeuralNetwork(layers)\n", + "net_no_reg.quick_connect()\n", + "net_no_reg.initialize_neural_network()\n", + "\n", + "net_l2 = NeuralNetwork(layers)\n", + "net_l2.quick_connect()\n", + "net_l2.initialize_neural_network()\n", + "\n", + "net_l1 = NeuralNetwork(layers)\n", + "net_l1.quick_connect()\n", + "net_l1.initialize_neural_network()\n", + "\n", + "net_dropout = NeuralNetwork(layers)\n", + "net_dropout.quick_connect()\n", + "net_dropout.initialize_neural_network()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also visualize what the network would look like. To do that we'll draw a smaller network using [networkx](http://networkx.github.io/). The network we'll draw will have 8 inputs (labeled X), 8 neurons in the first hidden layer (labeled H), 4 neurons in the second hidden layer (labeled U), and 6 neurons in the output layer (labeled Y). 
Each neuron will be connected to all neurons in the layer that precedes it." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# import networkx, install if necessary\n", + "try:\n", + " import networkx as nx\n", + "except ImportError:\n", + " import pip\n", + " pip.main(['install', '--user', 'networkx'])\n", + " import networkx as nx\n", + " \n", + "G = nx.DiGraph()\n", + "pos = {}\n", + "\n", + "for i in range(8):\n", + " pos['X'+str(i)] = (i,0) # 8 neurons in the input layer\n", + " pos['H'+str(i)] = (i,1) # 8 neurons in the first hidden layer\n", + " \n", + " for j in range(8): G.add_edge('X'+str(j),'H'+str(i))\n", + " \n", + " if i<4:\n", + " pos['U'+str(i)] = (i+2,2) # 4 neurons in the second hidden layer\n", + " for j in range(8): G.add_edge('H'+str(j),'U'+str(i))\n", + " \n", + " if i<6:\n", + " pos['Y'+str(i)] = (i+1,3) # 6 neurons in the output layer\n", + " for j in range(4): G.add_edge('U'+str(j),'Y'+str(i))\n", + "\n", + "nx.draw(G, pos, node_color='y', node_size=750)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Training" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "[NeuralNetwork](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html) supports two methods for training: LBFGS (default) and mini-batch gradient descent.\n", + "\n", + "[LBFGS](http://en.wikipedia.org/wiki/Limited-memory_BFGS) is a full-batch optimization method: it looks at the entire training set each time before it changes the network's parameters. This makes it slow with large datasets. 
However, it works very well with small/medium size datasets and is very easy to use as it requires no parameter tuning.\n", + "\n", + "[Mini-batch Gradient Descent](http://en.wikipedia.org/wiki/Stochastic_gradient_descent) looks at only a small portion of the training set (a mini-batch) before each step, which makes it suitable for large datasets. However, it's a bit harder to use than LBFGS because it requires some tuning for its parameters (learning rate, learning rate decay, ...)\n", + "\n", + "Training in [NeuralNetwork](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html) stops when:\n", + "\n", + "* Number of epochs (iterations over the entire training set) exceeds [max_num_epochs](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html#a7a2132cd0710750d28eaa4cd51d702af)\n", + "* The (percentage) difference in error between the current and previous iterations is smaller than [epsilon](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html#a0bde8d297e19e73b20b99110ba38f7bd), i.e. the error is no longer being reduced by training\n", + "\n", + "To see all the options supported for training, check the [documentation](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html#pub-attribs)\n", + "\n", + "We'll first write a small function to calculate the classification accuracy on the validation set, so that we can compare different models:\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from modshogun import MulticlassAccuracy\n", + "\n", + "def compute_accuracy(net, X, Y):\n", + " predictions = net.apply_multiclass(X)\n", + "\n", + " evaluator = MulticlassAccuracy()\n", + " accuracy = evaluator.evaluate(predictions, Y)\n", + " return accuracy*100" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Training without regularization**\n", + "\n", + 
"We'll start by training the first network without regularization using LBFGS optimization. Note that LBFGS is suitable because we're using a small dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "net_no_reg.set_epsilon(1e-6)\n", + "net_no_reg.set_max_num_epochs(600)\n", + "\n", + "# uncomment this line to allow the training progress to be printed on the console\n", + "#from modshogun import MSG_INFO; net_no_reg.io.set_loglevel(MSG_INFO)\n", + "\n", + "net_no_reg.set_labels(Ytrain)\n", + "net_no_reg.train(Xtrain) # this might take a while, depending on your machine\n", + "\n", + "# compute accuracy on the validation set\n", + "print \"Without regularization, accuracy on the validation set =\", compute_accuracy(net_no_reg, Xval, Yval), \"%\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Training with L2 regularization**\n", + "\n", + "We'll train another network, but with L2 regularization. This type of regularization attempts to prevent overfitting by penalizing large weights. 
This is done by adding $\\frac{1}{2} \\lambda \\Vert W \\Vert_2^2$ to the optimization objective that the network tries to minimize, where $\\lambda$ is the regularization coefficient.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# turn on L2 regularization\n", + "net_l2.set_l2_coefficient(3e-4)\n", + "\n", + "net_l2.set_epsilon(1e-6)\n", + "net_l2.set_max_num_epochs(600)\n", + "\n", + "net_l2.set_labels(Ytrain)\n", + "net_l2.train(Xtrain) # this might take a while, depending on your machine\n", + "\n", + "# compute accuracy on the validation set\n", + "print \"With L2 regularization, accuracy on the validation set =\", compute_accuracy(net_l2, Xval, Yval), \"%\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Training with L1 regularization**\n", + "\n", + "We'll now try L1 regularization. It works by adding $\\lambda \\Vert W \\Vert_1$ to the optimization objective. This has the effect of penalizing all non-zero weights, therefore pushing all the weights to be close to 0." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# turn on L1 regularization\n", + "net_l1.set_l1_coefficient(3e-5)\n", + "\n", + "net_l1.set_epsilon(1e-6)\n", + "net_l1.set_max_num_epochs(600)\n", + "\n", + "net_l1.set_labels(Ytrain)\n", + "net_l1.train(Xtrain) # this might take a while, depending on your machine\n", + "\n", + "# compute accuracy on the validation set\n", + "print \"With L1 regularization, accuracy on the validation set =\", compute_accuracy(net_l1, Xval, Yval), \"%\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Training with dropout**\n", + "\n", + "The idea behind [dropout](http://arxiv.org/abs/1207.0580) is very simple: randomly ignore some neurons during each training iteration. 
When used on neurons in the hidden layers, it has the effect of forcing each neuron to learn to extract features that are useful in any context, regardless of what the other hidden neurons in its layer decide to do. Dropout can also be used on the inputs to the network by randomly omitting a small fraction of them during each iteration.\n", + "\n", + "When using dropout, it's usually useful to limit the L2 norm of a neuron's incoming weights vector to some constant value.\n", + "\n", + "Due to the stochastic nature of dropout, LBFGS optimization doesn't work well with it, therefore we'll use mini-batch gradient descent instead." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from modshogun import NNOM_GRADIENT_DESCENT\n", + "\n", + "# set the dropout probability for neurons in the hidden layers\n", + "net_dropout.set_dropout_hidden(0.5)\n", + "# set the dropout probability for the inputs\n", + "net_dropout.set_dropout_input(0.2)\n", + "# limit the maximum incoming weights vector length for neurons\n", + "net_dropout.set_max_norm(15)\n", + "\n", + "net_dropout.set_epsilon(1e-6)\n", + "net_dropout.set_max_num_epochs(600)\n", + "\n", + "# use gradient descent for optimization\n", + "net_dropout.set_optimization_method(NNOM_GRADIENT_DESCENT)\n", + "net_dropout.set_gd_learning_rate(0.5)\n", + "net_dropout.set_gd_mini_batch_size(100)\n", + "\n", + "net_dropout.set_labels(Ytrain)\n", + "net_dropout.train(Xtrain) # this might take a while, depending on your machine\n", + "\n", + "# compute accuracy on the validation set\n", + "print \"With dropout, accuracy on the validation set =\", compute_accuracy(net_dropout, Xval, Yval), \"%\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Convolutional Neural Networks" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we'll look at a different type of network, namely [convolutional 
neural networks](http://deeplearning.net/tutorial/lenet.html). A convolutional net operates on two principles:\n", + "\n", + "- **Local connectivity**: Convolutional nets work with inputs that have some sort of spatial structure, where the order of the input features matters, e.g. images. Local connectivity means that each neuron will be connected only to a small neighbourhood of pixels.\n", + "- **Weight sharing**: Different neurons use the same set of weights. This greatly reduces the number of free parameters, and therefore makes the optimization process easier and acts as a good regularizer. \n", + "\n", + "With that in mind, each layer in a convolutional network consists of a number of feature maps. Each feature map is produced by convolving a small filter with the layer's inputs, adding a bias, and then applying some non-linear activation function. The convolution operation satisfies the local connectivity and the weight sharing constraints. Additionally, a max-pooling operation can be performed on each feature map by dividing it into small non-overlapping regions and taking the maximum over each region. This adds some translation invariance and improves the performance.\n", + "\n", + "Convolutional nets in Shogun are handled through the [CNeuralNetwork](http://www.shogun-toolbox.org/doc/en/latest/classes.html) class along with the [CNeuralConvolutionalLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralConvolutionalLayer.html) class. A [CNeuralConvolutionalLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralConvolutionalLayer.html) represents a convolutional layer with multiple feature maps, optional max-pooling, and support for [different types of activation functions](http://www.shogun-toolbox.org/doc/en/latest/namespaceshogun.html#a2b9827281875ee8de764ea86e7735482).\n", + "\n", + "Now we'll create a convolutional neural network with two convolutional layers and a softmax output layer. 
We'll use the [rectified linear](http://en.wikipedia.org/wiki/Rectifier_(neural_networks)) activation function for the convolutional layers:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from modshogun import NeuralConvolutionalLayer, CMAF_RECTIFIED_LINEAR\n", + "\n", + "# prepare the layers\n", + "layers_conv = DynamicObjectArray()\n", + "\n", + "# input layer, a 16x16 single-channel image\n", + "layers_conv.append_element(NeuralInputLayer(16,16,1)) \n", + "\n", + "# the first convolutional layer: 10 feature maps, filters with radius 2 (5x5 filters)\n", + "# and max-pooling in a 2x2 region: its output will be 10 8x8 feature maps\n", + "layers_conv.append_element(NeuralConvolutionalLayer(CMAF_RECTIFIED_LINEAR, 10, 2, 2, 2, 2))\n", + "\n", + "# the second convolutional layer: 15 feature maps, filters with radius 2 (5x5 filters)\n", + "# and max-pooling in a 2x2 region: its output will be 15 4x4 feature maps\n", + "layers_conv.append_element(NeuralConvolutionalLayer(CMAF_RECTIFIED_LINEAR, 15, 2, 2, 2, 2))\n", + "\n", + "# output layer\n", + "layers_conv.append_element(NeuralSoftmaxLayer(10))\n", + "\n", + "# create and initialize the network\n", + "net_conv = NeuralNetwork(layers_conv)\n", + "net_conv.quick_connect()\n", + "net_conv.initialize_neural_network()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we can train the network. 
Like in the previous section, we'll use gradient descent with dropout and max-norm regularization:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "# 50% dropout in the input layer\n", + "net_conv.set_dropout_input(0.5)\n", + "\n", + "# max-norm regularization\n", + "net_conv.set_max_norm(1.0)\n", + "\n", + "# set gradient descent parameters\n", + "net_conv.set_optimization_method(NNOM_GRADIENT_DESCENT)\n", + "net_conv.set_gd_learning_rate(0.01)\n", + "net_conv.set_gd_mini_batch_size(100)\n", + "net_conv.set_epsilon(0.0)\n", + "net_conv.set_max_num_epochs(100)\n", + "\n", + "# start training\n", + "net_conv.set_labels(Ytrain)\n", + "net_conv.train(Xtrain)\n", + "\n", + "# compute accuracy on the validation set\n", + "print \"With a convolutional network, accuracy on the validation set =\", compute_accuracy(net_conv, Xval, Yval), \"%\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Evaluation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "According to the accuracy on the validation set, the convolutional network works best in our case. 
Now we'll measure its performance on the test set:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "print \"Accuracy on the test set using the convolutional network =\", compute_accuracy(net_conv, Xtest, Ytest), \"%\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can also look at some of the images and the network's response to each of them:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "predictions = net_conv.apply_multiclass(Xtest)\n", + "\n", + "_=figure(figsize=(10,12))\n", + "# plot some images, with the predicted label as the title of each image\n", + "# this code is borrowed from the KNN notebook by Chiyuan Zhang and Sören Sonnenburg \n", + "for i in range(100):\n", + " ax=subplot(10,10,i+1)\n", + " title(int(predictions[i]))\n", + " ax.imshow(Xtest[:,i].reshape((16,16)), interpolation='nearest', cmap = cm.Greys_r)\n", + " ax.set_xticks([])\n", + " ax.set_yticks([])" + ] + } + ], "metadata": { - "name": "neuralnets_digits" - }, - "nbformat": 3, - "nbformat_minor": 0, - "worksheets": [ - { - "cells": [ - { - "cell_type": "heading", - "level": 1, - "metadata": {}, - "source": [ - "Neural Nets for Digit Classification" - ] - }, - { - "cell_type": "heading", - "level": 4, - "metadata": {}, - "source": [ - "by Khaled Nasr as a part of a GSoC 2014 project mentored by Theofanis Karaletsos and Sergey Lisitsyn " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook illustrates how to use the NeuralNets module to teach a [neural network](http://en.wikipedia.org/wiki/Artificial_neural_network) to recognize digits. It also explores the different optimization and regularization methods supported by the module. [Convolutional neural networks](http://en.wikipedia.org/wiki/Convolutional_neural_network) are also discussed." 
- ] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Introduction" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "An Artificial Neural Network is a machine learning model that is inspired by the way biological nervous systems, such as the brain, process information. The building block of neural networks is called a neuron. All a neuron does is take a weighted sum of its inputs and pass it through some non-linear function (activation function) to produce its output. A (feed-forward) neural network is a bunch of neurons arranged in layers, where each neuron in layer *i* takes its input from all the neurons in layer *i-1*. For more information on how neural networks work, [follow this link](https://www.youtube.com/playlist?list=PL6Xpj9I5qXYEcOhn7TqghAJ6NAPrNmUBH).\n", - "\n", - "In this notebook, we'll look at how a neural network can be used to recognize digits. We'll train the network on the USPS dataset of handwritten digits.\n", - "\n", - "We'll start by loading the data and dividing it into a training set, a validation set, and a test set. The USPS dataset has 9298 examples of handwritten digits. We'll intentionally use just a small portion (1000 examples) of the dataset for training . This is to keep training time small and to illustrate the effects of different regularization methods." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "%pylab inline\n", - "%matplotlib inline\n", - "from scipy.io import loadmat\n", - "from modshogun import RealFeatures, MulticlassLabels, Math\n", - "\n", - "# load the dataset\n", - "dataset = loadmat('../../../data/multiclass/usps.mat')\n", - "\n", - "Xall = dataset['data']\n", - "# the usps dataset has the digits labeled from 1 to 10 \n", - "# we'll subtract 1 to make them in the 0-9 range instead\n", - "Yall = np.array(dataset['label'].squeeze(), dtype=np.double)-1 \n", - "\n", - "# 1000 examples for training\n", - "Xtrain = RealFeatures(Xall[:,0:1000])\n", - "Ytrain = MulticlassLabels(Yall[0:1000])\n", - "\n", - "# 4000 examples for validation\n", - "Xval = RealFeatures(Xall[:,1001:5001])\n", - "Yval = MulticlassLabels(Yall[1001:5001])\n", - "\n", - "# the rest for testing\n", - "Xtest = RealFeatures(Xall[:,5002:-1])\n", - "Ytest = MulticlassLabels(Yall[5002:-1])\n", - "\n", - "# initialize the random number generator with a fixed seed, for repeatability\n", - "Math.init_random(10)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Creating the network" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "To create a neural network in shogun, we'll first create an instance of [NeuralNetwork](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html) and then [initialize](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html#a8ff6d177c3e2d8977e5fc6920d3e1579) it by telling it how many inputs it has and what type of layers it contains. To specifiy the layers of the network a [DynamicObjectArray](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CDynamicObjectArray.html) is used. 
The array contains instances of [NeuralLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralLayer.html)-based classes that determine the type of neurons each layer consists of. Some of the supported layer types are: [NeuralLinearLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralLinearLayer.html), [NeuralLogisticLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralLogisticLayer.html) and\n", - "[NeuralSoftmaxLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralSoftmaxLayer.html).\n", - "\n", - "We'll create a feed-forward, fully connected (every neuron is connected to all neurons in the layer below) neural network with 2 logistic hidden layers and a softmax output layer. The network will have 256 inputs, one for each pixel (16*16 image). The first hidden layer will have 256 neurons, the second will have 128 neurons, and the output layer will have 10 neurons, one for each digit class. Note that we're using a big network, compared with the size of the training set. This is to emphasize the effects of different regularization methods. We'll try training the network with:\n", - "\n", - "* No regularization\n", - "* L2 regularization\n", - "* L1 regularization\n", - "* [Dropout](http://arxiv.org/abs/1207.0580) regularization\n", - "\n", - "Therefore, we'll create 4 versions of the network, train each one of them differently, and then compare the results on the validation set." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from modshogun import NeuralNetwork, NeuralInputLayer, NeuralLogisticLayer, NeuralSoftmaxLayer\n", - "from modshogun import DynamicObjectArray\n", - "\n", - "# setup the layers\n", - "layers = DynamicObjectArray()\n", - "layers.append_element(NeuralInputLayer(256)) # input layer, 256 neurons\n", - "layers.append_element(NeuralLogisticLayer(256)) # first hidden layer, 256 neurons\n", - "layers.append_element(NeuralLogisticLayer(128)) # second hidden layer, 128 neurons\n", - "layers.append_element(NeuralSoftmaxLayer(10)) # output layer, 10 neurons\n", - "\n", - "# create the networks\n", - "net_no_reg = NeuralNetwork(layers)\n", - "net_no_reg.quick_connect()\n", - "net_no_reg.initialize_neural_network()\n", - "\n", - "net_l2 = NeuralNetwork(layers)\n", - "net_l2.quick_connect()\n", - "net_l2.initialize_neural_network()\n", - "\n", - "net_l1 = NeuralNetwork(layers)\n", - "net_l1.quick_connect()\n", - "net_l1.initialize_neural_network()\n", - "\n", - "net_dropout = NeuralNetwork(layers)\n", - "net_dropout.quick_connect()\n", - "net_dropout.initialize_neural_network()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also visualize what the network would look like. To do that we'll draw a smaller network using [networkx](http://networkx.github.io/). The network we'll draw will have 8 inputs (labeled X), 8 neurons in the first hidden layer (labeled H), 4 neurons in the second hidden layer (labeled U), and 6 neurons in the output layer (labeled Y). Each neuron will be connected to all neurons in the layer that precedes it." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# import networkx, install if necessary\n", - "try:\n", - " import networkx as nx\n", - "except ImportError:\n", - " import pip\n", - " pip.main(['install', '--user', 'networkx'])\n", - " import networkx as nx\n", - " \n", - "G = nx.DiGraph()\n", - "pos = {}\n", - "\n", - "for i in range(8):\n", - " pos['X'+str(i)] = (i,0) # 8 neurons in the input layer\n", - " pos['H'+str(i)] = (i,1) # 8 neurons in the first hidden layer\n", - " \n", - " for j in range(8): G.add_edge('X'+str(j),'H'+str(i))\n", - " \n", - " if i<4:\n", - " pos['U'+str(i)] = (i+2,2) # 4 neurons in the second hidden layer\n", - " for j in range(8): G.add_edge('H'+str(j),'U'+str(i))\n", - " \n", - " if i<6:\n", - " pos['Y'+str(i)] = (i+1,3) # 6 neurons in the output layer\n", - " for j in range(4): G.add_edge('U'+str(j),'Y'+str(i))\n", - "\n", - "nx.draw(G, pos, node_color='y', node_size=750)" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Training" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "[NeuralNetwork](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html) supports two methods for training: LBFGS (default) and mini-batch gradient descent.\n", - "\n", - "[LBFGS](http://en.wikipedia.org/wiki/Limited-memory_BFGS) is a full-batch optimization methods, it looks at the entire training set each time before it changes the network's parameters. This makes it slow with large datasets. However, it works very well with small/medium size datasets and is very easy to use as it requires no parameter tuning.\n", - "\n", - "[Mini-batch Gradient Descent](http://en.wikipedia.org/wiki/Stochastic_gradient_descent) looks at only a small portion of the training set (a mini-batch) before each step, which it makes it suitable for large datasets. 
However, it's a bit harder to use than LBFGS because it requires some tuning for its parameters (learning rate, learning rate decay,..)\n", - "\n", - "Training in [NeuralNetwork](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html) stops when:\n", - "\n", - "* Number of epochs (iterations over the entire training set) exceeds [max_num_epochs](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html#a7a2132cd0710750d28eaa4cd51d702af)\n", - "* The (percentage) difference in error between the current and previous iterations is smaller than [epsilon](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html#a0bde8d297e19e73b20b99110ba38f7bd), i.e the error is not anymore being reduced by training\n", - "\n", - "To see all the options supported for training, check the [documentation](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralNetwork.html#pub-attribs)\n", - "\n", - "We'll first write a small function to calculate the classification accuracy on the validation set, so that we can compare different models:\n" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from modshogun import MulticlassAccuracy\n", - "\n", - "def compute_accuracy(net, X, Y):\n", - " predictions = net.apply_multiclass(X)\n", - "\n", - " evaluator = MulticlassAccuracy()\n", - " accuracy = evaluator.evaluate(predictions, Y)\n", - " return accuracy*100" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Training without regularization**\n", - "\n", - "We'll start by training the first network without regularization using LBFGS optimization. Note that LBFGS is suitable because we're using a small dataset." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "net_no_reg.epsilon = 1e-6\n", - "net_no_reg.max_num_epochs = 600\n", - "\n", - "# uncomment this line to allow the training progress to be printed on the console\n", - "#from modshogun import MSG_INFO; net_no_reg.io.set_loglevel(MSG_INFO)\n", - "\n", - "net_no_reg.set_labels(Ytrain)\n", - "net_no_reg.train(Xtrain) # this might take a while, depending on your machine\n", - "\n", - "# compute accuracy on the validation set\n", - "print \"Without regularization, accuracy on the validation set =\", compute_accuracy(net_no_reg, Xval, Yval), \"%\"" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Training with L2 regularization**\n", - "\n", - "We'll train another network, but with L2 regularization. This type of regularization attempts to prevent overfitting by penalizing large weights. This is done by adding $\\frac{1}{2} \\lambda \\Vert W \\Vert_2$ to the optimization objective that the network tries to minimize, where $\\lambda$ is the regularization coefficient.\n" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# turn on L2 regularization\n", - "net_l2.l2_coefficient = 3e-4\n", - "\n", - "net_l2.epsilon = 1e-6\n", - "net_l2.max_num_epochs = 600\n", - "\n", - "net_l2.set_labels(Ytrain)\n", - "net_l2.train(Xtrain) # this might take a while, depending on your machine\n", - "\n", - "# compute accuracy on the validation set\n", - "print \"With L2 regularization, accuracy on the validation set =\", compute_accuracy(net_l2, Xval, Yval), \"%\"" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Training with L1 regularization**\n", - "\n", - "We'll now try L1 regularization. It works by by adding $\\lambda \\Vert W \\Vert_1$ to the optimzation objective. 
This has the effect of penalizing all non-zero weights, therefore pushing all the weights to be close to 0." - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# turn on L1 regularization\n", - "net_l1.l1_coefficient = 3e-5\n", - "\n", - "net_l1.epsilon = 1e-6\n", - "net_l1.max_num_epochs = 600\n", - "\n", - "net_l1.set_labels(Ytrain)\n", - "net_l1.train(Xtrain) # this might take a while, depending on your machine\n", - "\n", - "# compute accuracy on the validation set\n", - "print \"With L1 regularization, accuracy on the validation set =\", compute_accuracy(net_l1, Xval, Yval), \"%\"" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "**Training with dropout**\n", - "\n", - "The idea behind [dropout](http://arxiv.org/abs/1207.0580) is very simple: randomly ignore some neurons during each training iteration. When used on neurons in the hidden layers, it has the effect of forcing each neuron to learn to extract features that are useful in any context, regardless of what the other hidden neurons in its layer decide to do. Dropout can also be used on the inputs to the network by randomly omitting a small fraction of them during each iteration.\n", - "\n", - "When using dropout, it's usually useful to limit the L2 norm of a neuron's incoming weights vector to some constant value.\n", - "\n", - "Due to the stochastic nature of dropout, LBFGS optimization doesn't work well with it, therefore we'll use mini-batch gradient descent instead." 
- ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from modshogun import NNOM_GRADIENT_DESCENT\n", - "\n", - "# set the dropout probabilty for neurons in the hidden layers\n", - "net_dropout.dropout_hidden = 0.5\n", - "# set the dropout probabilty for the inputs\n", - "net_dropout.dropout_input = 0.2\n", - "# limit the maximum incoming weights vector lengh for neurons\n", - "net_dropout.max_norm = 15\n", - "\n", - "net_dropout.epsilon = 1e-6\n", - "net_dropout.max_num_epochs = 600\n", - "\n", - "# use gradient descent for optimization\n", - "net_dropout.optimization_method = NNOM_GRADIENT_DESCENT\n", - "net_dropout.gd_learning_rate = 0.5\n", - "net_dropout.gd_mini_batch_size = 100\n", - "\n", - "net_dropout.set_labels(Ytrain)\n", - "net_dropout.train(Xtrain) # this might take a while, depending on your machine\n", - "\n", - "# compute accuracy on the validation set\n", - "print \"With dropout, accuracy on the validation set =\", compute_accuracy(net_dropout, Xval, Yval), \"%\"" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Convolutional Neural Networks" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we'll look at a different type of network, namely [convolutional neural networks](http://deeplearning.net/tutorial/lenet.html). A convolutional net operates on two principles:\n", - "\n", - "- **Local connectivity**: Convolutional nets work with inputs that have some sort of spacial structure, where the order of the inputs features matter, i.e images. Local connectivity means that each neuron will be connected only to a small neighbourhood of pixels.\n", - "- **Weight sharing**: Different neurons use the same set of weights. This greatly reduces the number of free parameters, and therefore makes the optimization process easier and acts as a good regularizer. 
\n", - "\n", - "With that in mind, each layer in a convolutional network consists of a number of feature maps. Each feature map is produced by convolving a small filter with the layer's inputs, adding a bias, and then applying some non-linear activation function. The convolution operation satisfies the local connectivity and the weight sharing constraints. Additionally, a max-pooling operation can be performed on each feature map by dividing it into small non-overlapping regions and taking the maximum over each region. This adds some translation invarience and improves the performance.\n", - "\n", - "Convolutional nets in Shogun are handled through the [CNeuralNetwork](http://www.shogun-toolbox.org/doc/en/latest/classes.html) class along with the [CNeuralConvolutionalLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralConvolutionalLayer.html) class. A [CNeuralConvolutionalLayer](http://www.shogun-toolbox.org/doc/en/latest/classshogun_1_1CNeuralConvolutionalLayer.html) represents a convolutional layer with multiple feature maps, optional max-pooling, and support for [different types of activation functions](http://www.shogun-toolbox.org/doc/en/latest/namespaceshogun.html#a2b9827281875ee8de764ea86e7735482)\n", - "\n", - "Now we'll creates a convolutional neural network with two convolutional layers and a softmax output layer. 
We'll use the [rectified linear](http://en.wikipedia.org/wiki/Rectifier_(neural_networks)) activation function for the convolutional layers:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "from modshogun import NeuralConvolutionalLayer, CMAF_RECTIFIED_LINEAR\n", - "\n", - "# prepere the layers\n", - "layers_conv = DynamicObjectArray()\n", - "\n", - "# input layer, a 16x16 image single channel image\n", - "layers_conv.append_element(NeuralInputLayer(16,16,1)) \n", - "\n", - "# the first convolutional layer: 10 feature maps, filters with radius 2 (5x5 filters)\n", - "# and max-pooling in a 2x2 region: its output will be 10 8x8 feature maps\n", - "layers_conv.append_element(NeuralConvolutionalLayer(CMAF_RECTIFIED_LINEAR, 10, 2, 2, 2, 2))\n", - "\n", - "# the first convolutional layer: 15 feature maps, filters with radius 2 (5x5 filters)\n", - "# and max-pooling in a 2x2 region: its output will be 15 4x4 feature maps\n", - "layers_conv.append_element(NeuralConvolutionalLayer(CMAF_RECTIFIED_LINEAR, 15, 2, 2, 2, 2))\n", - "\n", - "# output layer\n", - "layers_conv.append_element(NeuralSoftmaxLayer(10))\n", - "\n", - "# create and initialize the network\n", - "net_conv = NeuralNetwork(layers_conv)\n", - "net_conv.quick_connect()\n", - "net_conv.initialize_neural_network()" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can train the network. 
Like in the previous section, we'll use gradient descent with dropout and max-norm regularization:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "# 50% dropout in the input layer\n", - "net_conv.dropout_input = 0.5\n", - "\n", - "# max-norm regularization\n", - "net_conv.max_norm = 1.0\n", - "\n", - "# set gradient descent parameters\n", - "net_conv.optimization_method = NNOM_GRADIENT_DESCENT\n", - "net_conv.gd_learning_rate = 0.01\n", - "net_conv.gd_mini_batch_size = 100\n", - "net_conv.epsilon = 0.0\n", - "net_conv.max_num_epochs = 100\n", - "\n", - "# start training\n", - "net_conv.set_labels(Ytrain)\n", - "net_conv.train(Xtrain)\n", - "\n", - "# compute accuracy on the validation set\n", - "print \"With a convolutional network, accuracy on the validation set =\", compute_accuracy(net_conv, Xval, Yval), \"%\"" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "heading", - "level": 2, - "metadata": {}, - "source": [ - "Evaluation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "According the accuracy on the validation set, the convolutional network works best in out case. 
Now we'll measure its performance on the test set:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "print \"Accuracy on the test set using the convolutional network =\", compute_accuracy(net_conv, Xtest, Ytest), \"%\"" - ], - "language": "python", - "metadata": {}, - "outputs": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can also look at some of the images and the network's response to each of them:" - ] - }, - { - "cell_type": "code", - "collapsed": false, - "input": [ - "predictions = net_conv.apply_multiclass(Xtest)\n", - "\n", - "_=figure(figsize=(10,12))\n", - "# plot some images, with the predicted label as the title of each image\n", - "# this code is borrowed from the KNN notebook by Chiyuan Zhang and S\u00f6ren Sonnenburg \n", - "for i in range(100):\n", - " ax=subplot(10,10,i+1)\n", - " title(int(predictions[i]))\n", - " ax.imshow(Xtest[:,i].reshape((16,16)), interpolation='nearest', cmap = cm.Greys_r)\n", - " ax.set_xticks([])\n", - " ax.set_yticks([])" - ], - "language": "python", - "metadata": {}, - "outputs": [] - } - ], - "metadata": {} + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" } - ] + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/src/shogun/io/NeuralNetworkFileReader.cpp b/src/shogun/io/NeuralNetworkFileReader.cpp index 23933bc6be3..be90fddf303 100644 --- a/src/shogun/io/NeuralNetworkFileReader.cpp +++ b/src/shogun/io/NeuralNetworkFileReader.cpp @@ -138,36 +138,36 @@ CNeuralNetwork* CNeuralNetworkFileReader::parse_network(json_object* json_networ { const char* method = json_object_get_string(iter.val); if (string_equal(method, "NNOM_LBFGS")) - network->optimization_method = 
NNOM_LBFGS; + network->set_optimization_method(NNOM_LBFGS); else if (string_equal(method, "NNOM_GRADIENT_DESCENT")) - network->optimization_method = NNOM_GRADIENT_DESCENT; + network->set_optimization_method(NNOM_GRADIENT_DESCENT); else SG_ERROR("Invalid optimization method (%s)\n", method); } else if (string_equal(iter.key, "l2_coefficient")) - network->l2_coefficient = json_object_get_double(iter.val); + network->set_l2_coefficient(json_object_get_double(iter.val)); else if (string_equal(iter.key, "l1_coefficient")) - network->l1_coefficient = json_object_get_double(iter.val); + network->set_l1_coefficient(json_object_get_double(iter.val)); else if (string_equal(iter.key, "dropout_hidden")) - network->dropout_hidden = json_object_get_double(iter.val); + network->set_dropout_hidden(json_object_get_double(iter.val)); else if (string_equal(iter.key, "dropout_input")) - network->dropout_input = json_object_get_double(iter.val); + network->set_dropout_input(json_object_get_double(iter.val)); else if (string_equal(iter.key, "max_norm")) - network->max_norm = json_object_get_double(iter.val); + network->set_max_norm(json_object_get_double(iter.val)); else if (string_equal(iter.key, "epsilon")) - network->epsilon = json_object_get_double(iter.val); + network->set_epsilon(json_object_get_double(iter.val)); else if (string_equal(iter.key, "max_num_epochs")) - network->max_num_epochs = json_object_get_int(iter.val); + network->set_max_num_epochs(json_object_get_int(iter.val)); else if (string_equal(iter.key, "gd_mini_batch_size")) - network->gd_mini_batch_size = json_object_get_int(iter.val); + network->set_gd_mini_batch_size(json_object_get_int(iter.val)); else if (string_equal(iter.key, "gd_learning_rate")) - network->gd_learning_rate = json_object_get_double(iter.val); + network->set_gd_learning_rate(json_object_get_double(iter.val)); else if (string_equal(iter.key, "gd_learning_rate_decay")) - network->gd_learning_rate_decay = json_object_get_double(iter.val); + 
network->set_gd_learning_rate_decay(json_object_get_double(iter.val)); else if (string_equal(iter.key, "gd_momentum")) - network->gd_momentum = json_object_get_double(iter.val); + network->set_gd_momentum(json_object_get_double(iter.val)); else if (string_equal(iter.key, "gd_error_damping_coeff")) - network->gd_error_damping_coeff = json_object_get_double(iter.val); + network->set_gd_error_damping_coeff(json_object_get_double(iter.val)); else if (!string_equal(iter.key, "layers")) SG_ERROR("Invalid parameter (%s)\n", iter.key); diff --git a/src/shogun/neuralnets/Autoencoder.h b/src/shogun/neuralnets/Autoencoder.h index 7821b200d9d..90bba3d84c1 100644 --- a/src/shogun/neuralnets/Autoencoder.h +++ b/src/shogun/neuralnets/Autoencoder.h @@ -165,6 +165,44 @@ class CAutoencoder : public CNeuralNetwork virtual const char* get_name() const { return "Autoencoder"; } + /** Sets noise type for denoising autoencoders. + * + * If set to AENT_DROPOUT, inputs are randomly set to zero during each + * iteration of training with probability noise_parameter. + * + * If set to AENT_GAUSSIAN, gaussian noise with zero mean and noise_parameter + * standard deviation is added to the inputs. + * + * Default value is AENT_NONE + * @param _noise_type noise type for denoising autoencoders + */ + void set_noise_type(EAENoiseType _noise_type) + { + this->noise_type = _noise_type; + } + + /** Returns noise type for denoising autoencoders */ + EAENoiseType get_noise_type() + { + return this->noise_type; + } + + /** Sets noise parameter + * Controls the strength of the noise, depending on noise_type + * + * @param _noise_parameter controls the strength of noise + */ + void set_noise_parameter(float64_t _noise_parameter) + { + this->noise_parameter = _noise_parameter; + } + + /** Returns noise parameter */ + float64_t get_noise_parameter() + { + return this->noise_parameter; + } + protected: /** Computes the error between the output layer's activations and the given * target activations. 
@@ -181,7 +219,18 @@ class CAutoencoder : public CNeuralNetwork template SGVector get_section(SGVector v, int32_t i); -public: +protected: + /** For contractive autoencoders [Rifai, 2011], a term: + * \f[ \frac{\lambda}{N} \sum_{k=0}^{N-1} \left \| J(x_k) \right \|^2_F \f] + * is added to the error, where \f$ \left \| J(x_k) \right \|^2_F \f$ is the + * Frobenius norm of the Jacobian of the activations of the hidden layer + * with respect to its inputs, \f$ N \f$ is the batch size, and + * \f$ \lambda \f$ is the contraction coefficient. + * + * Default value is 0.0. + */ + float64_t m_contraction_coefficient; + /** Noise type for denoising autoencoders. * * If set to AENT_DROPOUT, inputs are randomly set to zero during each @@ -196,18 +245,6 @@ class CAutoencoder : public CNeuralNetwork /** Controls the strength of the noise, depending on noise_type */ float64_t noise_parameter; - -protected: - /** For contractive autoencoders [Rifai, 2011], a term: - * \f[ \frac{\lambda}{N} \sum_{k=0}^{N-1} \left \| J(x_k) \right \|^2_F \f] - * is added to the error, where \f$ \left \| J(x_k)) \right \|^2_F \f$ is the - * Frobenius norm of the Jacobian of the activations of the hidden layer - * with respect to its inputs, \f$ N \f$ is the batch size, and - * \f$ \lambda \f$ is the contraction coefficient. - * - * Default value is 0.0. 
- */ - float64_t m_contraction_coefficient; }; } #endif diff --git a/src/shogun/neuralnets/DeepAutoencoder.cpp b/src/shogun/neuralnets/DeepAutoencoder.cpp index 271cf9d944f..096adfc5a04 100644 --- a/src/shogun/neuralnets/DeepAutoencoder.cpp +++ b/src/shogun/neuralnets/DeepAutoencoder.cpp @@ -112,19 +112,19 @@ void CDeepAutoencoder::pre_train(CFeatures* data) SG_UNREF(ae_encoding_layer); SG_UNREF(ae_decoding_layer); - ae->noise_type = EAENoiseType(pt_noise_type[i-1]); - ae->noise_parameter = pt_noise_parameter[i-1]; + ae->set_noise_type(EAENoiseType(pt_noise_type[i-1])); + ae->set_noise_parameter(pt_noise_parameter[i-1]); ae->set_contraction_coefficient(pt_contraction_coefficient[i-1]); - ae->optimization_method = ENNOptimizationMethod(pt_optimization_method[i-1]); - ae->l2_coefficient = pt_l2_coefficient[i-1]; - ae->l1_coefficient = pt_l1_coefficient[i-1]; - ae->epsilon = pt_epsilon[i-1]; - ae->max_num_epochs = pt_max_num_epochs[i-1]; - ae->gd_learning_rate = pt_gd_learning_rate[i-1]; - ae->gd_learning_rate_decay = pt_gd_learning_rate_decay[i-1]; - ae->gd_momentum = pt_gd_momentum[i-1]; - ae->gd_mini_batch_size = pt_gd_mini_batch_size[i-1]; - ae->gd_error_damping_coeff = pt_gd_error_damping_coeff[i-1]; + ae->set_optimization_method(ENNOptimizationMethod(pt_optimization_method[i-1])); + ae->set_l2_coefficient(pt_l2_coefficient[i-1]); + ae->set_l1_coefficient(pt_l1_coefficient[i-1]); + ae->set_epsilon(pt_epsilon[i-1]); + ae->set_max_num_epochs(pt_max_num_epochs[i-1]); + ae->set_gd_learning_rate(pt_gd_learning_rate[i-1]); + ae->set_gd_learning_rate_decay(pt_gd_learning_rate_decay[i-1]); + ae->set_gd_momentum(pt_gd_momentum[i-1]); + ae->set_gd_mini_batch_size(pt_gd_mini_batch_size[i-1]); + ae->set_gd_error_damping_coeff(pt_gd_error_damping_coeff[i-1]); // forward propagate the data to obtain the training data for the // current autoencoder diff --git a/src/shogun/neuralnets/NeuralNetwork.h b/src/shogun/neuralnets/NeuralNetwork.h index 0fea9d2997d..05434b9483c 100644 
--- a/src/shogun/neuralnets/NeuralNetwork.h +++ b/src/shogun/neuralnets/NeuralNetwork.h @@ -231,6 +231,231 @@ friend class CDeepBeliefNetwork; virtual const char* get_name() const { return "NeuralNetwork";} + /** Sets optimization method + * default is NNOM_LBFGS + * @param _optimization_method optimization method + */ + void set_optimization_method(ENNOptimizationMethod _optimization_method) + { + this->optimization_method = _optimization_method; + } + + /** Returns optimization method */ + ENNOptimizationMethod get_optimization_method() + { + return this->optimization_method; + } + /** Sets L2 Regularization coeff + * default value is 0.0 + * @param _l2_coefficient l2_coefficient + */ + void set_l2_coefficient(float64_t _l2_coefficient) + { + this->l2_coefficient = _l2_coefficient; + } + + /** Returns L2 coefficient */ + float64_t get_l2_coefficient() + { + return this->l2_coefficient; + } + /** Sets L1 Regularization coeff + * default value is 0.0 + * @param _l1_coefficient l1_coefficient + */ + void set_l1_coefficient(float64_t _l1_coefficient) + { + this->l1_coefficient = _l1_coefficient; + } + + /** Returns L1 coefficient */ + float64_t get_l1_coefficient() + { + return this->l1_coefficient; + } + + /** Sets the probability that a hidden layer neuron will be dropped out + * When using this, the recommended value is 0.5 + * default value 0.0 (no dropout) + * + * For more details on dropout, see + * [paper](http://arxiv.org/abs/1207.0580) [Hinton, 2012] + * + * @param _dropout_hidden dropout probability + */ + void set_dropout_hidden(float64_t _dropout_hidden) + { + this->dropout_hidden = _dropout_hidden; + } + + /** Returns dropout probability for hidden layers */ + float64_t get_dropout_hidden() + { + return this->dropout_hidden; + } + + /** Sets the probability that an input layer neuron will be dropped out + * When using this, a good value might be 0.2 + * default value 0.0 (no dropout) + * + * For more details on dropout, see this + * 
[paper](http://arxiv.org/abs/1207.0580) [Hinton, 2012] + * + * @param _dropout_input dropout probability + */ + void set_dropout_input(float64_t _dropout_input) + { + this->dropout_input = _dropout_input; + } + + /** Returns dropout probability for input layers */ + float64_t get_dropout_input() + { + return this->dropout_input; + } + + /** Sets maximum allowable L2 norm for a neuron's weights + * When using this, a good value might be 15 + * default value -1 (max-norm regularization disabled) + * @param _max_norm maximum allowable L2 norm + */ + void set_max_norm(float64_t _max_norm) + { + this->max_norm = _max_norm; + } + + /** Returns maximum allowable L2 norm */ + float64_t get_max_norm() + { + return this->max_norm; + } + + /** Sets convergence criteria + * training stops when (E'- E)/E < epsilon + * where E is the error at the current iteration and E' is the error at the + * previous iteration + * default value is 1.0e-5 + * @param _epsilon convergence criteria + */ + void set_epsilon(float64_t _epsilon) + { + this->epsilon = _epsilon; + } + + /** Returns epsilon */ + float64_t get_epsilon() + { + return this->epsilon; + } + + /** Sets maximum number of iterations over the training set. + * If 0, training will continue until convergence. 
+ * default value is 0 + * @param _max_num_epochs maximum number of iterations over the training set + */ + void set_max_num_epochs(int32_t _max_num_epochs) + { + this->max_num_epochs = _max_num_epochs; + } + + /** Returns maximum number of epochs */ + int32_t get_max_num_epochs() + { + return this->max_num_epochs; + } + + /** Sets size of the mini-batch used during gradient descent training, + * if 0 full-batch training is performed + * default value is 0 + * @param _gd_mini_batch_size mini batch size + */ + void set_gd_mini_batch_size(int32_t _gd_mini_batch_size) + { + this->gd_mini_batch_size = _gd_mini_batch_size; + } + + /** Returns mini batch size */ + int32_t get_gd_mini_batch_size() + { + return this->gd_mini_batch_size; + } + + /** Sets gradient descent learning rate + * default value 0.1 + * @param _gd_learning_rate gradient descent learning rate + */ + void set_gd_learning_rate(float64_t _gd_learning_rate) + { + this->gd_learning_rate = _gd_learning_rate; + } + + /** Returns gradient descent learning rate */ + float64_t get_gd_learning_rate() + { + return this->gd_learning_rate; + } + + /** Sets gradient descent learning rate decay + * learning rate is updated at each iteration i according to: + * alpha(i)=decay*alpha(i-1) + * default value is 1.0 (no decay) + * @param _gd_learning_rate_decay gradient descent learning rate decay + */ + void set_gd_learning_rate_decay(float64_t _gd_learning_rate_decay) + { + this->gd_learning_rate_decay = _gd_learning_rate_decay; + } + + /** Returns gradient descent learning rate decay */ + float64_t get_gd_learning_rate_decay() + { + return this->gd_learning_rate_decay; + } + + /** Sets gradient descent momentum multiplier + * + * default value is 0.9 + * + * For more details on momentum, see this + * [paper](http://jmlr.org/proceedings/papers/v28/sutskever13.html) + * [Sutskever, 2013] + * + * @param _gd_momentum gradient descent momentum multiplier + */ + void set_gd_momentum(float64_t _gd_momentum) + { + 
+ this->gd_momentum = _gd_momentum; + } + + /** Returns gradient descent momentum multiplier */ + float64_t get_gd_momentum() + { + return this->gd_momentum; + } + + /** Sets gradient descent error damping coefficient + * Used to damp the error fluctuations when stochastic gradient descent is + * used. Damping is done according to: + * error_damped(i) = c*error(i) + (1-c)*error_damped(i-1) + * where c is the damping coefficient + * + * If -1, the damping coefficient is automatically computed according to: + * c = 0.99*gd_mini_batch_size/training_set_size + 1e-2; + * + * default value is -1 + * + * @param _gd_error_damping_coeff error damping coefficient + */ + void set_gd_error_damping_coeff(float64_t _gd_error_damping_coeff) + { + this->gd_error_damping_coeff = _gd_error_damping_coeff; + } + + float64_t get_gd_error_damping_coeff() + { + return this->gd_error_damping_coeff; + } + protected: /** trains the network */ virtual bool train_machine(CFeatures* data=NULL); @@ -354,7 +579,49 @@ friend class CDeepBeliefNetwork; /** Returns the section of vector v that belongs to layer i */ template SGVector get_section(SGVector v, int32_t i); -public: + +protected: + /** number of neurons in the input layer */ + int32_t m_num_inputs; + + /** number of layers */ + int32_t m_num_layers; + + /** network's layers */ + CDynamicObjectArray* m_layers; + + /** Describes the connections in the network: if there's a connection from + * layer i to layer j then m_adj_matrix(i,j) = 1. + */ + SGMatrix m_adj_matrix; + + /** total number of parameters in the network */ + int32_t m_total_num_parameters; + + /** array where all the parameters of the network are stored */ + SGVector m_params; + + /** Array that specifies which parameters are to be regularized. 
This is + * used to turn off regularization for bias parameters + */ + SGVector m_param_regularizable; + + /** offsets specifying where each layer's parameters and parameter + * gradients are stored, i.e. layer i's parameters are stored at + * m_params + m_index_offsets[i] + */ + SGVector m_index_offsets; + + /** number of train/test cases the network is expected to deal with. + * Default value is 1 + */ + int32_t m_batch_size; + + /** True if the network is currently being trained + * initial value is false + */ + bool m_is_training; + /** Optimization method, default is NNOM_LBFGS */ ENNOptimizationMethod optimization_method; @@ -442,47 +709,6 @@ friend class CDeepBeliefNetwork; * default value is -1 */ float64_t gd_error_damping_coeff; -protected: - /** number of neurons in the input layer */ - int32_t m_num_inputs; - - /** number of layer */ - int32_t m_num_layers; - - /** network's layers */ - CDynamicObjectArray* m_layers; - - /** Describes the connections in the network: if there's a connection from - * layer i to layer j then m_adj_matrix(i,j) = 1. - */ - SGMatrix m_adj_matrix; - - /** total number of parameters in the network */ - int32_t m_total_num_parameters; - - /** array where all the parameters of the network are stored */ - SGVector m_params; - - /** Array that specifies which parameters are to be regularized. This is - * used to turn off regularization for bias parameters - */ - SGVector m_param_regularizable; - - /** offsets specifying where each layer's parameters and parameter - * gradients are stored, i.e layer i's parameters are stored at - * m_params + m_index_offsets[i] - */ - SGVector m_index_offsets; - - /** number of train/test cases the network is expected to deal with. 
- * Default value is 1 - */ - int32_t m_batch_size; - - /** True if the network is currently being trained - * initial value is false - */ - bool m_is_training; private: /** temperary pointers to the training data, used to pass the data to L-BFGS diff --git a/tests/unit/io/NeuralNetworkFileReader_unittest.cc b/tests/unit/io/NeuralNetworkFileReader_unittest.cc index eee677685c1..186c9890dc2 100644 --- a/tests/unit/io/NeuralNetworkFileReader_unittest.cc +++ b/tests/unit/io/NeuralNetworkFileReader_unittest.cc @@ -109,18 +109,18 @@ TEST(NeuralNetworkFileReader, read) CNeuralNetwork* net = reader.read_string(net_string); - EXPECT_EQ(NNOM_GRADIENT_DESCENT, net->optimization_method); - EXPECT_EQ(0.001, net->l2_coefficient); - EXPECT_EQ(0.003, net->l1_coefficient); - EXPECT_EQ(0.5, net->dropout_hidden); - EXPECT_EQ(0.2, net->dropout_input); - EXPECT_EQ(15, net->max_norm); - EXPECT_EQ(1e-8, net->epsilon); - EXPECT_EQ(1000, net->max_num_epochs); - EXPECT_EQ(100, net->gd_mini_batch_size); - EXPECT_EQ(1.0, net->gd_learning_rate); - EXPECT_EQ(0.995, net->gd_learning_rate_decay); - EXPECT_EQ(0.95, net->gd_momentum); - EXPECT_EQ(0.9, net->gd_error_damping_coeff); + EXPECT_EQ(NNOM_GRADIENT_DESCENT, net->get_optimization_method()); + EXPECT_EQ(0.001, net->get_l2_coefficient()); + EXPECT_EQ(0.003, net->get_l1_coefficient()); + EXPECT_EQ(0.5, net->get_dropout_hidden()); + EXPECT_EQ(0.2, net->get_dropout_input()); + EXPECT_EQ(15, net->get_max_norm()); + EXPECT_EQ(1e-8, net->get_epsilon()); + EXPECT_EQ(1000, net->get_max_num_epochs()); + EXPECT_EQ(100, net->get_gd_mini_batch_size()); + EXPECT_EQ(1.0, net->get_gd_learning_rate()); + EXPECT_EQ(0.995, net->get_gd_learning_rate_decay()); + EXPECT_EQ(0.95, net->get_gd_momentum()); + EXPECT_EQ(0.9, net->get_gd_error_damping_coeff()); CDynamicObjectArray* layers = net->get_layers(); diff --git a/tests/unit/neuralnets/NeuralNetwork_unittest.cc b/tests/unit/neuralnets/NeuralNetwork_unittest.cc index 417057fe7e5..b88acdce749 100644 --- a/tests/unit/neuralnets/NeuralNetwork_unittest.cc +++ 
b/tests/unit/neuralnets/NeuralNetwork_unittest.cc @@ -75,8 +75,8 @@ TEST(NeuralNetwork, backpropagation_linear) network->connect(4,5); network->initialize_neural_network(); - network->l2_coefficient = 0.01; - network->l1_coefficient = 0.03; + network->set_l2_coefficient(0.01); + network->set_l1_coefficient(0.03); EXPECT_NEAR(network->check_gradients(), 0.0, tolerance); SG_UNREF(network); @@ -107,8 +107,8 @@ TEST(NeuralNetwork, neural_layers_builder) network->connect(4,5); network->initialize_neural_network(); - network->l2_coefficient = 0.01; - network->l1_coefficient = 0.03; + network->set_l2_coefficient(0.01); + network->set_l1_coefficient(0.03); EXPECT_NEAR(network->check_gradients(), 0.0, tolerance); SG_UNREF(network); @@ -142,8 +142,8 @@ TEST(NeuralNetwork, backpropagation_logistic) network->connect(4,5); network->initialize_neural_network(); - network->l1_coefficient = 0.03; - network->l2_coefficient = 0.01; + network->set_l1_coefficient(0.03); + network->set_l2_coefficient(0.01); EXPECT_NEAR(network->check_gradients(), 0.0, tolerance); SG_UNREF(network); } @@ -174,8 +174,8 @@ TEST(NeuralNetwork, backpropagation_softmax) network->connect(4,5); network->initialize_neural_network(); - network->l1_coefficient = 0.03; - network->l2_coefficient = 0.01; + network->set_l1_coefficient(0.03); + network->set_l2_coefficient(0.01); EXPECT_NEAR(network->check_gradients(), 0.0, tolerance); SG_UNREF(network); } @@ -206,8 +206,8 @@ TEST(NeuralNetwork, backpropagation_rectified_linear) network->connect(4,5); network->initialize_neural_network(); - network->l1_coefficient = 0.03; - network->l2_coefficient = 0.01; + network->set_l1_coefficient(0.03); + network->set_l2_coefficient(0.01); EXPECT_NEAR(network->check_gradients(), 0.0, tolerance); SG_UNREF(network); } @@ -241,8 +241,8 @@ TEST(NeuralNetwork, backpropagation_convolutional) network->connect(4,5); network->initialize_neural_network(); - network->l1_coefficient = 0.03; - network->l2_coefficient = 0.01; + 
network->set_l1_coefficient(0.03); + network->set_l2_coefficient(0.01); EXPECT_NEAR(network->check_gradients(), 0.0, tolerance); SG_UNREF(network); } @@ -284,7 +284,7 @@ TEST(NeuralNetwork, binary_classification) network->quick_connect(); network->initialize_neural_network(0.1); - network->epsilon = 1e-8; + network->set_epsilon(1e-8); network->set_labels(labels); network->train(features); @@ -343,7 +343,7 @@ TEST(NeuralNetwork, multiclass_classification) network->quick_connect(); network->initialize_neural_network(0.1); - network->epsilon = 1e-8; + network->set_epsilon(1e-8); network->set_labels(labels); network->train(features); @@ -394,7 +394,7 @@ TEST(NeuralNetwork, regression) network->quick_connect(); network->initialize_neural_network(1e-6); - network->epsilon = 1e-6; + network->set_epsilon(1e-6); network->set_labels(labels); network->train(features); @@ -448,10 +448,10 @@ TEST(NeuralNetwork, gradient_descent) network->quick_connect(); network->initialize_neural_network(0.1); - network->optimization_method = NNOM_GRADIENT_DESCENT; - network->gd_learning_rate = 10.0; - network->epsilon = 0.0; - network->max_num_epochs = 1000; + network->set_optimization_method(NNOM_GRADIENT_DESCENT); + network->set_gd_learning_rate(10.0); + network->set_epsilon(0.0); + network->set_max_num_epochs(1000); network->set_labels(labels); network->train(features);