In [1]:
# Necessary imports
import tensorflow as tf
import numpy as np
from imageio import imread
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
import time
from caffe_classes import class_names

%matplotlib inline

### AlexNet implementation

In [2]:
# Retrieve the pretrained AlexNet parameter values.
# The .npy file holds a single pickled object (unwrapped with .item() and
# indexed below as net_data[layer][0] / net_data[layer][1]). NumPy >= 1.16.3
# refuses to unpickle object arrays unless allow_pickle=True is passed.
# NOTE: unpickling can execute arbitrary code -- only load a weights file
# obtained from a trusted source.
net_data = np.load('bvlc-alexnet.npy', encoding='latin1', allow_pickle=True).item()

In [3]:
# Helper functions
def conv2d(input_vol, W, b, stride=1, padding='VALID', group=1):
    """Apply an (optionally grouped) 2-D convolution, then bias and ReLU.

    Args:
        input_vol: Input tensor; its last axis is used as the channel count.
        W: Kernel tensor; its last axis is used as the output-channel count.
        b: Bias added to the convolution result with ``tf.nn.bias_add``.
        stride: Stride used for both middle axes (``[1, stride, stride, 1]``).
        padding: Padding scheme forwarded to ``tf.nn.conv2d``.
        group: Number of groups. When greater than 1, ``input_vol`` and ``W``
            are each split into ``group`` slices along axis 3, convolved
            pairwise, and the results concatenated along axis 3.

    Returns:
        The ReLU-activated, bias-added convolution output.

    Raises:
        AssertionError: If the input or output channel count is not divisible
            by ``group``.
    """
    c_i = input_vol.get_shape().as_list()[-1]
    c_o = W.get_shape().as_list()[-1]

    assert c_i % group == 0, 'input channels must be divisible by group'
    assert c_o % group == 0, 'output channels must be divisible by group'

    def convolve(i, k):
        return tf.nn.conv2d(i, k, strides=[1, stride, stride, 1], padding=padding)

    if group == 1:
        # No splitting needed, regardless of the TensorFlow version.
        conv_layer = convolve(input_vol, W)
    else:
        # Compare the major version numerically; comparing the raw version
        # strings is lexicographic and only works by accident.
        legacy_api = int(tf.__version__.split('.')[0]) < 1

        if legacy_api:
            # Pre-1.0 argument order: split(axis, num, value) / concat(axis, values).
            input_groups = tf.split(3, group, input_vol)
            kernel_groups = tf.split(3, group, W)
        else:
            input_groups = tf.split(input_vol, group, axis=3)
            kernel_groups = tf.split(W, group, axis=3)

        output_groups = [convolve(i, k) for i, k in zip(input_groups, kernel_groups)]

        if legacy_api:
            conv_layer = tf.concat(3, output_groups)
        else:
            conv_layer = tf.concat(output_groups, axis=3)

    conv_layer = tf.nn.bias_add(conv_layer, b)
    return tf.nn.relu(conv_layer)

def maxpool2d(input_vol, k=2, stride=2):
    """Max-pool ``input_vol`` with a ``k`` x ``k`` window and 'VALID' padding.

    Args:
        input_vol: Tensor to pool.
        k: Window size used on both middle axes.
        stride: Stride used on both middle axes.

    Returns:
        The result of ``tf.nn.max_pool``.
    """
    window = [1, k, k, 1]
    step = [1, stride, stride, 1]
    return tf.nn.max_pool(input_vol, ksize=window, strides=step, padding='VALID')

def alexnet(X, feature_extraction=False):
    """Build the AlexNet graph on ``X`` from the values stored in ``net_data``.

    Args:
        X: Input image tensor fed to the first convolution.
        feature_extraction: When true, stop after FC7 and return its
            activations instead of the class probabilities.

    Returns:
        The FC7 activations if ``feature_extraction`` is true, otherwise the
        softmax of the FC8 output.
    """
    def conv_params(layer):
        # Convolution parameters are wrapped in tf.Variable (weights, then biases).
        weights, biases = net_data[layer][0], net_data[layer][1]
        return tf.Variable(weights), tf.Variable(biases)

    def dense_params(layer):
        # Fully connected parameters are used as the raw loaded values.
        return net_data[layer][0], net_data[layer][1]

    def lrn(volume):
        # Local response normalisation with the constants shared by both NORM layers.
        return tf.nn.local_response_normalization(volume,
                                                  depth_radius=2,
                                                  alpha=2e-05,
                                                  beta=0.75,
                                                  bias=1.0)

    # CONV1 (kernel 11x11x96, stride 4, group 1) -> NORM -> POOL (3x3, stride 2)
    kernel, offset = conv_params('conv1')
    net = conv2d(X, kernel, offset, stride=4, padding='SAME', group=1)
    net = maxpool2d(lrn(net), 3)

    # CONV2 (kernel 5x5x256, stride 1, group 2) -> NORM -> POOL (3x3, stride 2)
    kernel, offset = conv_params('conv2')
    net = conv2d(net, kernel, offset, stride=1, padding='SAME', group=2)
    net = maxpool2d(lrn(net), 3)

    # CONV3 (kernel 3x3x384, stride 1, group 1)
    kernel, offset = conv_params('conv3')
    net = conv2d(net, kernel, offset, stride=1, padding='SAME', group=1)

    # CONV4 (kernel 3x3x384, stride 1, group 2)
    kernel, offset = conv_params('conv4')
    net = conv2d(net, kernel, offset, stride=1, padding='SAME', group=2)

    # CONV5 (kernel 3x3x256, stride 1, group 2) -> POOL (3x3, stride 2)
    kernel, offset = conv_params('conv5')
    net = conv2d(net, kernel, offset, stride=1, padding='SAME', group=2)
    net = maxpool2d(net, 3)

    # FC6 (4096): flatten the pooled volume, then affine + ReLU
    weights, biases = dense_params('fc6')
    flat = tf.contrib.layers.flatten(net)
    fc6 = tf.nn.relu(tf.nn.xw_plus_b(flat, weights, biases))

    # FC7 (4096)
    weights, biases = dense_params('fc7')
    fc7 = tf.nn.relu(tf.nn.xw_plus_b(fc6, weights, biases))

    if feature_extraction:
        return fc7

    # FC8 (1000): affine layer followed by softmax
    weights, biases = dense_params('fc8')
    return tf.nn.softmax(tf.nn.xw_plus_b(fc7, weights, biases))

### Inference on ImageNet

In [4]:
tf.reset_default_graph()

# Load the sample images as float32, keeping only the first three channels
img1, img2 = [
    imread(path)[:, :, :3].astype(np.float32)
    for path in ("poodle.png", "weasel.png")
]

# Pre-processing: centre each image by subtracting its own mean pixel value
img1 = img1 - np.mean(img1)
img2 = img2 - np.mean(img2)

# Placeholder for a batch of 227x227 three-channel images
X = tf.placeholder(tf.float32, shape=[None, 227, 227, 3])

# Build the full AlexNet graph (class probabilities)
probs = alexnet(X)

# Run inference; the timer spans session start-up through the final print
start = time.time()
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    output = session.run(probs, feed_dict={X: [img1, img2]})

# Report the five highest-probability classes per image, best first
for img_idx, scores in enumerate(output):
    print('Image {}'.format(img_idx))

    for class_idx in np.argsort(scores)[::-1][:5]:
        print('{}: {:.4f}'.format(class_names[class_idx], scores[class_idx]))
    print()

print('Time taken: {:.3f}'.format(time.time() - start))

Image 0
miniature poodle: 0.3895
toy poodle: 0.2231
Bedlington terrier: 0.1730
standard poodle: 0.1496
komondor: 0.0258

Image 1
weasel: 0.3313
polecat, fitch, foulmart, foumart, Mustela putorius: 0.2803
black-footed ferret, ferret, Mustela nigripes: 0.2105
mink: 0.0814
Arctic fox, white fox, Alopex lagopus: 0.0268

Time taken: 1.838


### Inference on Traffic Sign Dataset

In [5]:
tf.reset_default_graph()

# Load the traffic-sign samples as float32, keeping only the first three channels
sample_paths = ("construction.jpg", "stop.jpg")
img1, img2 = [imread(p)[:, :, :3].astype(np.float32) for p in sample_paths]

# Pre-processing: centre each image by subtracting its own mean pixel value
img1 = img1 - np.mean(img1)
img2 = img2 - np.mean(img2)

# Placeholder takes 32x32 images; they are resized to 227x227 before AlexNet
X = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
resized = tf.image.resize_images(X, [227, 227])

# Build the full AlexNet graph (class probabilities) on the resized input
probs = alexnet(resized)

# Run inference; the timer spans session start-up through the final print
start = time.time()
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    output = session.run(probs, feed_dict={X: [img1, img2]})

# Report the five highest-probability classes per image, best first
for img_idx, scores in enumerate(output):
    ranking = np.argsort(scores)[::-1]
    print('Image {}'.format(img_idx))

    for class_idx in ranking[:5]:
        print('{}: {:.4f}'.format(class_names[class_idx], scores[class_idx]))
    print()

print('Time taken: {:.3f}'.format(time.time() - start))

Image 0
screen, CRT screen: 0.0510
digital clock: 0.0408
laptop, laptop computer: 0.0300
balance beam, beam: 0.0270
parallel bars, bars: 0.0227

Image 1
digital watch: 0.3954
digital clock: 0.2749
bottlecap: 0.1150
stopwatch, stop watch: 0.1036
combination lock: 0.0862

Time taken: 0.729


### Feature extraction using AlexNet

In [6]:
import pandas as pd
sign_names = pd.read_csv('signnames.csv')

tf.reset_default_graph()

# Load the traffic-sign samples as float32, keeping only the first three channels
img1 = (imread("construction.jpg")[:, :, :3]).astype(np.float32)
img2 = (imread("stop.jpg")[:, :, :3]).astype(np.float32)

# Pre-processing: centre each image by subtracting its own mean pixel value
img1 = img1 - np.mean(img1)
img2 = img2 - np.mean(img2)

# Placeholder takes 32x32 images; they are resized to 227x227 before AlexNet
X = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
resized = tf.image.resize_images(X, [227, 227])

# Use AlexNet up to FC7 as a feature extractor
fc7 = alexnet(resized, feature_extraction=True)

# New 43-way output layer on top of the FC7 features. It is randomly
# initialised and not trained in this cell, so the printed predictions are
# arbitrary and vary from run to run.
fc8_input_shape = fc7.get_shape().as_list()[-1]
fc8W = tf.Variable(tf.truncated_normal([fc8_input_shape, 43], mean=0, stddev=0.1))
fc8b = tf.Variable(tf.zeros([43]))
# Feed the raw affine output to softmax: a ReLU here would clamp every
# negative logit to zero and distort the resulting distribution.
logits = tf.nn.xw_plus_b(fc7, fc8W, fc8b)
probabilities = tf.nn.softmax(logits)

# Run inference; the timer spans session start-up through the final print
start = time.time()
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    output = session.run(probabilities, feed_dict={X: [img1, img2]})

# Report the five highest-probability classes per image, best first
for img_idx in range(output.shape[0]):
    top5 = np.argsort(output[img_idx, :])[::-1][:5]
    print('Image {}'.format(img_idx))

    for class_id in top5:
        # Explicit positional lookup (row = class id, column 1). This assumes,
        # as the previous positional `[1]` did, that the second CSV column
        # holds the human-readable sign name.
        print('{}: {:.4f}'.format(sign_names.iloc[class_id, 1], output[img_idx, class_id]))
    print()

print('Time taken: {:.3f}'.format(time.time() - start))

Image 0
Double curve: 0.4445
Go straight or left: 0.3626
Pedestrians: 0.0924
Bicycles crossing: 0.0254
Traffic signals: 0.0207

Image 1
Speed limit (50km/h): 0.5737
Go straight or left: 0.3372
Speed limit (20km/h): 0.0872
Yield: 0.0015
Children crossing: 0.0003

Time taken: 0.707


### Training Feature Extractor for classification of images from the German Traffic Sign Dataset

In [7]:
from tqdm import tqdm_notebook as tqdm
import pickle

# Start from an empty default graph so ops from earlier cells do not linger.
tf.reset_default_graph()

# Get the data set
# NOTE(review): pickle.load can execute arbitrary code -- only use a train.p
# file that comes from a trusted source.
with open('train.p', mode='rb') as f:
    data = pickle.load(f)

# Hold out 33% of the examples for validation; the fixed random_state makes
# the split itself repeatable.
X_train, X_val, y_train, y_val = train_test_split(data['features'], data['labels'], test_size=0.33, random_state=0)

# Data set summarisation
print('# training examples: {}'.format(len(X_train)))
print('# validation examples: {}'.format(len(X_val)))

# Sanity checks: features and labels must stay aligned after the split
assert(len(X_train) == len(y_train))
assert(len(X_val) == len(y_val))

# Define hyperparams
IMAGE_SIZE = 227  # side length the 32x32 inputs are resized to before AlexNet
EPOCHS = 10
BATCH_SIZE = 128
N_LABELS = 43 # Number of labels for the GTS data set

# Define placeholders: a batch of 32x32 three-channel images and their integer labels
X = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
X_resized = tf.image.resize_images(X, [IMAGE_SIZE, IMAGE_SIZE])
y = tf.placeholder(tf.int64, shape=[None])

# Define and implement the architecture of the model:
# AlexNet up to FC7 acts as a fixed feature extractor (stop_gradient blocks
# back-propagation into it), followed by a new N_LABELS-way linear layer.
fc7 = alexnet(X_resized, feature_extraction=True)
fc7 = tf.stop_gradient(fc7)
fc8_input_shape = fc7.get_shape().as_list()[-1]
fc8W = tf.Variable(tf.truncated_normal([fc8_input_shape, N_LABELS], stddev=0.01))
fc8b = tf.Variable(tf.zeros([N_LABELS]))
logits = tf.nn.xw_plus_b(fc7, fc8W, fc8b)

# Training pipeline: mean cross-entropy over the batch; var_list restricts the
# optimiser to the new layer's weights and biases.
loss = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y))
optimiser = tf.train.AdamOptimizer().minimize(loss, var_list=[fc8W, fc8b])

# Evaluation pipeline: fraction of examples whose arg-max logit equals the label
correct_prediction = tf.equal(tf.argmax(logits, 1), y)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

def evaluate(X_data, y_data):
    """Compute the accuracy of the model over a data set, batch by batch.

    Runs the ``accuracy`` op in the current default session on slices of
    ``BATCH_SIZE`` examples and combines the per-batch results weighted by
    batch length, so a shorter final batch is weighted correctly.

    Args:
        X_data: Sequence of input examples fed to the ``X`` placeholder.
        y_data: Sequence of labels fed to the ``y`` placeholder, aligned with
            ``X_data``.

    Returns:
        The overall accuracy as a float, or 0.0 when ``X_data`` is empty.

    Raises:
        RuntimeError: If there is no default TensorFlow session.
    """
    nb_examples = len(X_data)
    if nb_examples == 0:
        # Nothing to evaluate; avoid dividing by zero below.
        return 0.0

    session = tf.get_default_session()
    if session is None:
        raise RuntimeError('evaluate() must be called inside an active tf.Session context')

    total_accuracy = 0.0
    for offset in range(0, nb_examples, BATCH_SIZE):
        X_batch, y_batch = X_data[offset: offset+BATCH_SIZE], y_data[offset: offset+BATCH_SIZE]
        data_accuracy = session.run(accuracy, feed_dict={
            X: X_batch,
            y: y_batch
        })

        # Weight by the real batch length (the last batch may be shorter).
        total_accuracy += (data_accuracy * len(X_batch))
    return total_accuracy/nb_examples

print('Training...')

# Train the new output layer; the session stays open for the per-epoch evaluation
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    nb_examples = len(X_train)

    for epoch_number in range(1, EPOCHS + 1):
        # Re-shuffle features and labels together at the start of every epoch
        X_train, y_train = shuffle(X_train, y_train)

        for offset in tqdm(range(0, nb_examples, BATCH_SIZE)):
            batch = slice(offset, offset + BATCH_SIZE)
            feed = {X: X_train[batch], y: y_train[batch]}
            session.run(optimiser, feed_dict=feed)

        print('Epoch: {}'.format(epoch_number))
        val_accuracy = evaluate(X_val, y_val)
        print('Validation accuracy: {:.4f}'.format(val_accuracy))

# training examples: 26270
# validation examples: 12939
Training...

Epoch: 1
Validation accuracy: 0.8692

Epoch: 2
Validation accuracy: 0.9172

Epoch: 3
Validation accuracy: 0.9352

Epoch: 4
Validation accuracy: 0.9394

Epoch: 5
Validation accuracy: 0.9546

Epoch: 6
Validation accuracy: 0.9578

Epoch: 7
Validation accuracy: 0.9590

Epoch: 8
Validation accuracy: 0.9611

Epoch: 9
Validation accuracy: 0.9646

Epoch: 10
Validation accuracy: 0.9661
