# Training a cNN to detect roadsigns
In order to process roadsigns in the autonomous car of the Freie Universität, we want to train a convolutional (deep) neural network.

This network is supposed to distinguish between different classes of signs (stop, attention, train crossing etc) and the final model will then be integrated to the autonomos ROS structure.

This notebook shall download the dataset, read it in and then train the classifier. Afterwards, a validation of the training procedure will be done.

In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt

import urllib2, cStringIO, zipfile
import csv
import os

## Download the dataset

In [2]:
url = 'http://benchmark.ini.rub.de/Dataset/GTSRB_Final_Training_Images.zip'

if not os.path.exists('GTSRB/Final_Training/Images'):
    try:
        remotezip = urllib2.urlopen(url)
        zipinmemory = cStringIO.StringIO(remotezip.read())
        zip = zipfile.ZipFile(zipinmemory)
        zip.extractall('.')
    except urllib2.HTTPError:
        pass

## Read the data in

In [3]:
def readTrafficSigns(rootpath):
    '''Reads traffic sign data for German Traffic Sign Recognition Benchmark.

    Arguments: path to the traffic sign data, for example './GTSRB/Training'
    Returns:   list of images, list of corresponding labels'''
    images = [] # images
    labels = [] # corresponding labels
    # loop over all 42 classes
    for c in range(0,43):
        prefix = rootpath + '/' + format(c, '05d') + '/' # subdirectory for class
        gtFile = open(prefix + 'GT-'+ format(c, '05d') + '.csv') # annotations file
        gtReader = csv.reader(gtFile, delimiter=';') # csv parser for annotations file
        gtReader.next() # skip header
        # loop over all images in current annotations file
        for row in gtReader:
            images.append(plt.imread(prefix + row[0])) # the 1th column is the filename
            labels.append(row[7]) # the 8th column is the label
        gtFile.close()
    return images, labels

In [4]:
trainImg, trainLabels = readTrafficSigns('GTSRB/Final_Training/Images')

In [5]:
print "Number of training images: " + str(len(trainImg))
print "Number of training labels: " + str(len(trainLabels))
maxShape = (0,0)
maxPos = 0
pos = 0
for img in trainImg:
    if np.prod(img.shape) > np.prod(maxShape):
        maxShape = img.shape
        maxPos = pos
    pos += 1
print "Largest Image Dimensions: " + str(maxShape)

Number of training images: 39209
Number of training labels: 39209
Largest Image Dimensions: (225, 243, 3)


In [41]:
print trainLabels[-1]
number_of_classes = 43

42


## Make all images the same size
For a convolutional Neural Network to work, the images all need to have the same size.
This can be done by padding with black pixels. Then, all of the images are of the size of the largest image.

However, this approach may have drawbacks when it comes to the learning.

In [11]:
train_set = []
count = 0
for img in trainImg[:1000]:
    padded = np.zeros(maxShape)
    x1 = int(padded.shape[0] / 2. - img.shape[0] / 2.)
    y1 = int(padded.shape[1] / 2. - img.shape[1] / 2.)
    padded[x1:img.shape[0]+x1, y1:img.shape[1]+y1] = img
    train_set.append(padded)
    if count % 500 == 0:
        print "Done with " + str(count) + " images"
    count += 1
print np.array(train_set).shape

Done with 0 images
Done with 500 images
(1000, 225, 243, 3)


In [7]:
len(train_set)
for img in train_set:
    if not img.shape == maxShape:
        print "ERROR"

## Transform labels to one-hot-vectors

In [45]:
train_labels = []
for label in trainLabels:
    new_label = np.zeros(number_of_classes)
    new_label[int(label)] = 1
    train_labels.append(new_label)
train_labels = np.array(train_labels)
print train_labels[13451]

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.]


## Build the convolutional Neural Network

In [46]:
"""
This class implements a convolutional neural network classifier.
Main usage should consist of two steps, namely train and evaluate. During training, the weights of the network
will be changed in a way to nicely represent the data and classify it in the end.

Parameters:
    img_shape:
        Shape of the images that will be presented to the
        network as (width, height, #channels)
    
    learning_rate:
        The learning rate used for gradient descent
    
    architecture:
        List. It contains for each layer the number
        of neurons. For the convolution, the number
        of neurons corresponds to the number of kernels.
"""
class cNN:
    
    def __init__(self, architecture, img_shape, kernel_shape, learning_rate):
        if len(architecture) > 5:
            print "ERROR. The network is too deep. So far, we can't deal with more than 5 layers!"
        self.learning_rate = learning_rate
        self.img_shape = img_shape #(x, y, channels)
        self.architecture = architecture
        self.kernel_shape = kernel_shape
    
    """
    This function generates lists of weight matrices and bias matrices from
    some given architecture.
    These can then be used to construct the network.
    """
    def generate_weights_and_biases(self):
        weights = []
        biases = []
        for layer in xrange(self.architecture):
            if self.architecture[layer][0] == "conv":
                if layer == 0: # first layer
                    last_output = img_shape[2]
                else:
                    last_output = self.architecture[layer-1][1]
                weights.append(tf.Variable(tf.random_normal([self.kernel_shape[0],
                                                            self.kernel_shape[1],
                                                            img_shape[2],
                                                            self.architecture[layer][1]]
                                                           )))
                biases.append(tf.Variable(tf.random_normal([self.architecture[layer][1]])))

            elif self.architecture[layer][0] == "dense":
                last_output = self.architecture[layer-1][1]
                num_of_conv_layers_so_far = len([i for i in xrange(layer) if self.architecture[i] == "conv"])
                num_of_input_units = (img_shape[0] * img_shape[1]) / num_of_conv_layers_so_far**2
                weights.append(tf.Variable(tf.random_normal([num_of_input_units, self.architecture[layer][1]])))
                biases.append(tf.Variable(tf.random_normal([self.architecture[layer][1]])))
            
            elif self.architecture[layer][0] == "out":
                last_output = self.architecture[layer-1][1]
                weights.append(tf.Variable(tf.random_normal([last_output, self.architecture[layer][1]])))
                biases.append(tf.Variable(tf.random_normal([self.architecture[layer][1]])))
                
    def weight_variable(self, shape):
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def conv2d(self, x, W):
        return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

    def max_pool_2x2(x):
        return tf.nn.max_pool(x, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='SAME')

    """
    This function creates the graph for training and returns a tensorflow function object.
    This object can then be used to train the batches.
    """
    def construct_model(self, batch):
        # create variables for layer one
        W_conv1 = weight_variable([5, 5, 3, 50])
        b_conv1 = bias_variable([50])
        
        # do convolution
        # and max pooling for layer one
        h_conv1 = tf.nn.relu(conv2d(batch, W_conv1) + b_conv1)
        h_pool1 = max_pool_2x2(h_conv1)

        # initialize vars for layer two
        W_conv2 = weight_variable([5, 5, 50, 64])
        b_conv2 = bias_variable([64])

        # convolve and max pool layer 2
        h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
        h_pool2 = max_pool_2x2(h_conv2)
        
        # now, do the dense layer 3
        W_fc1 = weight_variable([7 * 7 * 64, 1024])
        b_fc1 = bias_variable([1024])
        h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
        h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)

        # apply dropout
        keep_prob = tf.placeholder(tf.float32)
        h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
        
        W_fc2 = weight_variable([1024, 42])
        b_fc2 = bias_variable([42])
        y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)

        # add cross entropy as objective function
        cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv), reduction_indices=[1]))
        train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
        correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

        return (train_step, correct_prediction, accuracy)

    def train_model(self, images, labels):
        print images.shape
        x = tf.placeholder(tf.float32, shape=(self.img_shape[0], self.img_shape[1]))
        y = tf.placeholder(tf.float32, shape=(42))
        
        train_op, eval_op, accuracy_op = self.construct_model(x)
        x = tf.placeholder(tf.float32, [None, n_input])
        keep_prob = tf.placeholder(tf.float32) #dropout (keep probability)
        batch_size = 100
        batch_runs = images.shape[0] / batch_size
        print "Batch size: " + str(batch_size)
        print "Number of iterations per epoch: " + str(batch_runs)
        for epoch in range(100):
            for batchIdx in range(batch_runs):
                train_op.run(feed_dict={x: images[batchIdx*batch_size:(batchIdx+1)*batch_size],
                                        y_: labels[batchIdx*batch_size:(batchIdx+1)*batch_size],
                                        keep_prob: 0.5})
            print "Epoch " + str(epoch) + " done!"


In [48]:
architecture = [("conv", 32), ("conv", 64), ("dense", 1024), ("out", 42)]
img_shape = maxShape
kernel_shape = (5, 5)
learning_rate = 0.5
classifier = cNN(architecture, img_shape, kernel_shape, learning_rate)
classifier.train_model(np.array(train_set), train_labels)

(1000, 225, 243, 3)


ValueError: Shape (225, 243) must have rank 4

In [13]:
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

Successfully downloaded train-images-idx3-ubyte.gz 9912422 bytes.
Extracting MNIST_data/train-images-idx3-ubyte.gz
Successfully downloaded train-labels-idx1-ubyte.gz 28881 bytes.
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Successfully downloaded t10k-images-idx3-ubyte.gz 1648877 bytes.
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Successfully downloaded t10k-labels-idx1-ubyte.gz 4542 bytes.
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
