In [1]:
import os
#import keras
import scipy.misc
import pickle
import cv2
import opencl4py as cl
import sklearn.utils
import numpy as np
import random
import tensorflow as tf
import matplotlib.pyplot as plt

%matplotlib inline

def load_data(data_folder='traffic-signs-data'):
    """Load the pickled training and validation sets.

    Args:
        data_folder: Directory containing ``train.p`` and ``valid.p``.
            Defaults to ``'traffic-signs-data'``.

    Returns:
        A ``(train, valid)`` tuple holding the unpickled objects in that
        order. The test set (``test.p``) is intentionally not loaded here.

    Raises:
        OSError: If one of the files cannot be opened.
    """
    # NOTE: pickle.load can execute arbitrary code while deserializing, so
    # only point this function at data files from a trusted source.
    datasets = []
    for file_name in ('train.p', 'valid.p'):
        with open(os.path.join(data_folder, file_name), mode='rb') as f:
            datasets.append(pickle.load(f))

    train, valid = datasets
    return train, valid


def lenet(x, img_channels, n_classes=43):
    """Build the LeNet-5 style classification graph and return its logits.

    The layer sizes below (in particular the flattened size of 400) assume
    that ``x`` holds 32x32 images.

    Args:
        x: Input tensor, expected as (batch, 32, 32, img_channels).
        img_channels: Number of channels of the input images.
        n_classes: Number of output classes, i.e. the width of the final
            layer. Defaults to 43.

    Returns:
        The un-normalized class scores (logits), one row per input image
        and ``n_classes`` columns.
    """
    # Parameters of the truncated normal used to initialize all weights.
    mu = 0
    sigma = 0.1

    # Layer 1: Convolution. Input = 32x32xC (C = img_channels). Output = 28x28x6.
    conv1_W = tf.Variable(tf.truncated_normal(shape=(5, 5, img_channels, 6), mean=mu, stddev=sigma))
    conv1_b = tf.Variable(tf.zeros(6))
    conv1 = tf.nn.conv2d(x, conv1_W, strides=[1, 1, 1, 1], padding='VALID') + conv1_b

    # Activation
    conv1 = tf.nn.relu(conv1)

    # Pooling. Input = 28x28x6. Output = 14x14x6.
    conv1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')

    # Layer 2: Convolution. Input = 14x14x6. Output = 10x10x16.
    conv2_W = tf.Variable(tf.truncated_normal(shape=(5, 5, 6, 16), mean=mu, stddev=sigma))
    conv2_b = tf.Variable(tf.zeros(16))
    conv2 = tf.nn.conv2d(conv1, conv2_W, strides=[1, 1, 1, 1], padding='VALID') + conv2_b

    # Activation
    conv2 = tf.nn.relu(conv2)

    # Pooling. Input = 10x10x16. Output = 5x5x16.
    conv2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding='VALID')

    # Flatten. Input = 5x5x16. Output = 400.
    fc0 = tf.contrib.layers.flatten(conv2)

    # Layer 3: Fully connected. Input = 400. Output = 120.
    fc1_W = tf.Variable(tf.truncated_normal(shape=(400, 120), mean=mu, stddev=sigma))
    fc1_b = tf.Variable(tf.zeros(120))
    fc1 = tf.matmul(fc0, fc1_W) + fc1_b

    # Activation
    fc1 = tf.nn.relu(fc1)

    # Layer 4: Fully connected. Input = 120. Output = 84.
    fc2_W = tf.Variable(tf.truncated_normal(shape=(120, 84), mean=mu, stddev=sigma))
    fc2_b = tf.Variable(tf.zeros(84))
    fc2 = tf.matmul(fc1, fc2_W) + fc2_b

    # Activation
    fc2 = tf.nn.relu(fc2)

    # Layer 5: Fully connected. Input = 84. Output = n_classes.
    fc3_W = tf.Variable(tf.truncated_normal(shape=(84, n_classes), mean=mu, stddev=sigma))
    fc3_b = tf.Variable(tf.zeros(n_classes))
    logits = tf.matmul(fc2, fc3_W) + fc3_b

    return logits


def evaluate(X_data, y_data, batch_size, accuracy_operation, x, y):
    """Compute the accuracy over a data set, evaluated batch by batch.

    Must be called while a TensorFlow session is installed as the default
    session (e.g. inside a ``with tf.Session() as sess:`` block).

    Args:
        X_data: Input examples; sliced along the first axis per batch.
        y_data: Labels matching ``X_data``.
        batch_size: Maximum number of examples fed per session run.
        accuracy_operation: Graph operation yielding the accuracy of a batch.
        x: Placeholder that receives the input batch.
        y: Placeholder that receives the label batch.

    Returns:
        The per-batch accuracies averaged with each batch weighted by its
        size, so a smaller final batch does not skew the result.

    Raises:
        ValueError: If ``X_data`` is empty (the accuracy would be 0 / 0).
        RuntimeError: If there is no default TensorFlow session.
    """
    num_examples = len(X_data)
    if num_examples == 0:
        raise ValueError('Cannot evaluate accuracy on an empty data set.')

    sess = tf.get_default_session()
    if sess is None:
        raise RuntimeError('evaluate() requires an active default tf.Session.')

    total_accuracy = 0
    for offset in range(0, num_examples, batch_size):
        end = offset + batch_size
        batch_x = X_data[offset:end]
        batch_y = y_data[offset:end]
        accuracy = sess.run(accuracy_operation, feed_dict={x: batch_x, y: batch_y})
        # Weight by the real batch length; the last batch may be shorter.
        total_accuracy += accuracy * len(batch_x)

    return total_accuracy / num_examples


# Module-level OpenCL state. It starts out empty and is filled in by
# load_kernels(); normalize_images() reads it afterwards.
cl_context = None  # OpenCL context created for the selected device
cl_queue = None    # command queue belonging to cl_context
cl_kernels = {}    # compiled kernels, keyed by a short name

def load_kernels():
    """Create the OpenCL context and queue and compile the image kernels.

    Fills the module-level ``cl_context``, ``cl_queue`` and ``cl_kernels``.
    Two kernels are registered in ``cl_kernels``:

    * ``'normalize'``: maps every input byte ``v`` to ``(v - 128) / 128``.
    * ``'normalizeGray'``: collapses each RGB byte triple into one weighted
      gray value and applies the same ``(v - 128) / 128`` mapping.

    Raises:
        SystemExit: If no CUDA platform with at least one device is found.
            The message is written to stderr and the exit status is
            non-zero so callers and shells can detect the failure.
    """
    global cl_kernels, cl_queue, cl_context

    # It is hard to determine the device with the most power.
    # As my machines only have nvidia graphics cards, just filter for the nvidia CUDA platform
    cuda_platform = next(
        (p for p in cl.Platforms() if 'cuda' in p.name.lower()), None)

    if cuda_platform is None or not cuda_platform.devices:
        raise SystemExit('No suitable device found. Exiting.')

    device = cuda_platform.devices[0]
    cl_context = cuda_platform.create_context([device])

    cl_queue = cl_context.create_queue(device)
    # Float literals (128.0f) keep the arithmetic in single precision so the
    # kernels do not depend on double precision support of the device.
    program = cl_context.create_program(
        """
        __kernel void normalizeData(__global const uchar* input, __global float* output) {
            size_t idx = get_global_id(0);
            output[idx] = (input[idx] - 128.0f) / 128.0f;
        }

        __kernel void normalizeAndGrayscale(__global const uchar* input, __global float* output) {
            size_t grayIdx = get_global_id(0);
            size_t rgbIdx = 3 * grayIdx;
            float gray = 0.21f * input[rgbIdx] + 0.72f * input[rgbIdx + 1] + 0.07f * input[rgbIdx + 2];
            output[grayIdx] = (gray - 128.0f) / 128.0f;
        }
        """)

    cl_kernels['normalize'] = program.get_kernel('normalizeData')
    cl_kernels['normalizeGray'] = program.get_kernel('normalizeAndGrayscale')


def normalize_images(images, gray_scale):
    """Run the OpenCL preprocessing kernel over a batch of images.

    ``load_kernels()`` must have been called before (except for empty
    input, which is answered without touching the device).

    Args:
        images: ``uint8`` array of images. With ``gray_scale`` the last axis
            must hold the 3 color channels of each pixel.
        gray_scale: If true, use the ``'normalizeGray'`` kernel, which turns
            every RGB triple into a single value; otherwise use the
            ``'normalize'`` kernel, which works element-wise.

    Returns:
        A ``float32`` array. It has the shape of ``images``; with
        ``gray_scale`` the trailing channel axis is dropped.

    Raises:
        TypeError: If ``images`` is not of dtype ``uint8``.
        ValueError: If ``gray_scale`` is set and the last axis is not 3.
        RuntimeError: If the OpenCL kernels have not been loaded.
    """
    images = np.asarray(images)
    # The kernels read the buffer byte by byte (uchar); any other dtype
    # would be silently misinterpreted.
    if images.dtype != np.uint8:
        raise TypeError('images must be of dtype uint8, got {}'.format(images.dtype))

    if gray_scale:
        if images.ndim < 1 or images.shape[-1] != 3:
            raise ValueError('gray scale conversion needs 3 channels on the last axis')
        out_shape = images.shape[:-1]
        kernel_name = 'normalizeGray'
        out_size = images.size // 3
    else:
        out_shape = images.shape
        kernel_name = 'normalize'
        out_size = images.size

    output = np.empty(out_size, dtype=np.float32)
    if out_size == 0:
        # Nothing to compute; avoid creating empty buffers / launches.
        return output.reshape(out_shape)

    if cl_context is None or cl_queue is None:
        raise RuntimeError('OpenCL is not initialized; call load_kernels() first.')
    kernel = cl_kernels[kernel_name]

    # TODO: A lot of memory is needed for this. Use Image3D?
    data = images.ravel()

    input_buffer = cl_context.create_buffer(cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, data)
    output_buffer = cl_context.create_buffer(cl.CL_MEM_WRITE_ONLY | cl.CL_MEM_ALLOC_HOST_PTR, size=output.nbytes)

    kernel.set_arg(0, input_buffer)
    kernel.set_arg(1, output_buffer)
    # One work item per output element.
    cl_queue.execute_kernel(kernel, [output.size], None)
    cl_queue.read_buffer(output_buffer, output)

    return output.reshape(out_shape)


def rotate(img, angle):
    """Rotate a single-channel image around its center.

    Args:
        img: Image array whose first two axes are height and width. It must
            contain one channel only, since the result is reshaped to
            ``(height, width, 1)``.
        angle: Rotation angle, passed to ``cv2.getRotationMatrix2D``.

    Returns:
        The rotated image with the same height and width as ``img`` and an
        explicit trailing channel axis of length 1.
    """
    # Derive size and center from the image instead of assuming 32x32.
    height, width = img.shape[:2]
    M = cv2.getRotationMatrix2D((width / 2, height / 2), angle, 1)
    # cv2 takes the output size as (width, height).
    rotated = cv2.warpAffine(img, M, (width, height)).reshape((height, width, 1))
    return rotated


def _preview_images(images, step=10):
    """Show every ``step``-th image and wait for a key press after each one."""
    for img in images[::step]:
        plt.imshow(img, cmap='Greys')
        # cv2.imshow renders float images on a [0, 1] scale, so stretch the
        # value range for display only; the data itself is left untouched.
        lo, hi = float(img.min()), float(img.max())
        span = (hi - lo) or 1.0
        cv2.imshow('image', (img - lo) / span)
        cv2.waitKey(0)
        cv2.destroyAllWindows()


def main(preview_only=False):
    """Train the LeNet model on the gray scale traffic sign data.

    Loads the OpenCL kernels and the data, converts the images to gray
    scale, trains for a fixed number of epochs with light random rotation
    of the training batches, and prints the validation accuracy after every
    epoch as well as the list of all accuracies at the end.

    Args:
        preview_only: If true, only display every 10th preprocessed training
            image (one key press per image) and return without training.
            Previously this preview ran unconditionally and was followed by
            ``exit(0)``, which made the training code unreachable.
    """
    load_kernels()

    train, valid = load_data()
    X_train, y_train = train['features'], train['labels']
    X_valid, y_valid = valid['features'], valid['labels']

    img_width = 32
    img_height = 32
    img_channels = 1

    # The gray scale conversion returns images without a channel axis, but
    # the input placeholder and rotate() work on (32, 32, 1) images, so add
    # the channel axis back.
    X_train = normalize_images(X_train, True)[..., np.newaxis]
    X_valid = normalize_images(X_valid, True)[..., np.newaxis]

    if preview_only:
        _preview_images(X_train[..., 0])
        return

    n_train = len(X_train)

    # TODO: Load from signnames.csv?
    n_classes = 43

    x = tf.placeholder(tf.float32, (None, img_height, img_width, img_channels))
    y = tf.placeholder(tf.int32, (None,))
    one_hot_y = tf.one_hot(y, n_classes)

    epochs = 50
    rate = 0.001
    batch_size = 128

    logits = lenet(x, img_channels)
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=one_hot_y)
    loss_operation = tf.reduce_mean(cross_entropy)
    optimizer = tf.train.AdamOptimizer(learning_rate=rate)
    training_operation = optimizer.minimize(loss_operation)

    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(one_hot_y, 1))
    accuracy_operation = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        print('Training...')
        print()

        accuracies = []

        for epoch in range(epochs):
            X_train, y_train = sklearn.utils.shuffle(X_train, y_train)
            for offset in range(0, n_train, batch_size):
                end = offset + batch_size
                # Copy the slice: it would otherwise be a view, and the
                # augmentation below would permanently alter the training
                # set, piling up rotations from epoch to epoch.
                batch_x = X_train[offset:end].copy()
                batch_y = y_train[offset:end]

                # Rotate roughly 10% of the images by a small random angle.
                for a, img in enumerate(batch_x):
                    if random.random() < 0.1:
                        angle = random.random() * 2
                        batch_x[a] = rotate(img, angle)

                sess.run(training_operation, feed_dict={x: batch_x, y: batch_y})

            validation_accuracy = evaluate(X_valid, y_valid, batch_size, accuracy_operation, x, y)
            print('EPOCH {0} ...'.format(epoch + 1))
            print('Validation accuracy = {:.3f}'.format(validation_accuracy))
            print()

            accuracies.append('{:.3f}'.format(validation_accuracy))

        print(accuracies)

# Run the pipeline only when this file is executed directly, not when it is
# imported as a module.
if __name__ == '__main__':
    main()