#### This code attempts to reuse the VGG 16 convolutional layers to read the driver images. A small dense layer is then trained to classifies these VGG output features . Note that the dense layer is smaller than what was used by Jeremy, thus the accuracy stays around 74-76%.

In [1]:
from __future__ import print_function, division
import utils; reload(utils)
from utils import *

Using gpu device 0: Tesla K80 (CNMeM is disabled, cuDNN 5103)
Using Theano backend.


In [2]:
%matplotlib inline
path="/home/ubuntu/nbs/data/state/"
#sample_path="/home/ubuntu/nbs/data/state/sample/"

In [3]:
from IPython.display import FileLink

In [4]:
from keras.preprocessing import image

In [5]:
batch_size=64

####  NOTE:  The validation data used here was created using the state farm  script run on a small sample. (state-farm-sample.ipynb)
#### If you are attempting to use this code, please run that file first so that distinct driver images get copied over to the validation folder.

#### get a Directory Iterator  for the images in the training folder

In [6]:
datagen=image.ImageDataGenerator()

# flow from directory, with class names being categorical, image target size being  224 x 224, shuffle off
batches=datagen.flow_from_directory(path+'train',target_size=(224,224),class_mode='categorical',
shuffle=False,batch_size=batch_size)

Found 18009 images belonging to 10 classes.


#### Get a directory iterator for images in the validation folder

In [7]:
#get the validation set
val_batches=datagen.flow_from_directory(path+'valid',target_size=(224,224),class_mode='categorical',
shuffle=False,batch_size=batch_size)

Found 4415 images belonging to 10 classes.


#### get the directory iterator for images in the Test fodler

In [8]:
#get the test data
test_batches=datagen.flow_from_directory(path+'test',target_size=(224,224),class_mode=None,
shuffle=False,batch_size=batch_size)

Found 79726 images belonging to 1 classes.


In [9]:
# train labels
(val_classes, trn_classes, val_labels, trn_labels, 
    val_filenames, filenames, test_filenames) = get_classes(path)

Found 18009 images belonging to 10 classes.
Found 4415 images belonging to 10 classes.
Found 79726 images belonging to 1 classes.


In [10]:
from vgg16 import Vgg16
#get the vgg model
vgg=Vgg16()
model=vgg.model

In [11]:
# find the  conv layers of vgg
last_conv_layer_id=[index for index,layer in enumerate(model.layers) if type(layer) is  Convolution2D][-1]
vgg_conv_layers=model.layers[:last_conv_layer_id+1]

In [12]:
# create a model from the vgg conv layers
conv_model=Sequential(vgg_conv_layers)

In [None]:
conv_model.summary()

In [None]:
# get numpy array of conv feature predictions for training data
conv_vgg_features=conv_model.predict_generator(batches,batches.nb_sample)

In [None]:
# get numpy array of conv feature predictions for val  data  
conv_val_vgg_features=conv_model.predict_generator(val_batches,val_batches.nb_sample)

In [None]:
# get vgg features for test set
conv_test_vgg_features = conv_model.predict_generator(test_batches, test_batches.nb_sample)

In [15]:
conv_test_vgg_features.shape

(79726, 512, 14, 14)

In [None]:
#save to array
save_array(path+'results/conv_val_vgg_features.dat', conv_val_vgg_features)
save_array(path+'results/conv_vgg_feat.dat', conv_vgg_features)

In [None]:
save_array(path+'results/conv_test_vgg_feat.dat', conv_test_vgg_features)

In [13]:
# load array
conv_vgg_features = load_array(path+'results/conv_vgg_feat.dat')
conv_val_vgg_features = load_array(path+'results/conv_val_vgg_features.dat')
conv_val_vgg_features.shape

(4415, 512, 14, 14)

In [14]:
conv_test_vgg_features = load_array(path+'results/conv_test_vgg_feat.dat')

#### define function to return dense layers with batch norm and dropout

In [16]:
def get_bn_layers(p):
    return [
        MaxPooling2D(input_shape=vgg_conv_layers[-1].output_shape[1:]),
        Flatten(),
        Dropout(p/2),
        Dense(128, activation='relu'),
        BatchNormalization(),
        Dropout(p),
        Dense(10, activation='softmax')
        ]

In [17]:
bn_model=Sequential(get_bn_layers(.5))
bn_model.compile(Adam(lr=.0001),loss='categorical_crossentropy',metrics=['accuracy'])
bn_model.fit(conv_vgg_features, trn_labels, batch_size=batch_size, nb_epoch=1, 
             validation_data=(conv_val_vgg_features, val_labels))

Train on 18009 samples, validate on 4415 samples
Epoch 1/1


<keras.callbacks.History at 0x7f944c14ddd0>

In [18]:
bn_model.optimizer.lr=0.001
bn_model.fit(conv_vgg_features, trn_labels, batch_size=batch_size, nb_epoch=4, 
             validation_data=(conv_val_vgg_features, val_labels))

Train on 18009 samples, validate on 4415 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f9452af4850>

In [19]:
bn_model.optimizer.lr=0.00001
bn_model.fit(conv_vgg_features, trn_labels, batch_size=batch_size, nb_epoch=8, 
             validation_data=(conv_val_vgg_features, val_labels))

Train on 18009 samples, validate on 4415 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


<keras.callbacks.History at 0x7f9452af4810>

#### Since we are overfitting the training set rapidly, increasing the dropout

In [22]:
bn_model=Sequential(get_bn_layers(.6))
bn_model.compile(Adam(lr=.00001),loss='categorical_crossentropy',metrics=['accuracy'])
bn_model.fit(conv_vgg_features, trn_labels, batch_size=batch_size, nb_epoch=6, 
             validation_data=(conv_val_vgg_features, val_labels))

Train on 18009 samples, validate on 4415 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f94499d6750>

#### Increasing learning rate to go faster

In [23]:
bn_model.optimizer.lr=0.001
bn_model.fit(conv_vgg_features, trn_labels, batch_size=batch_size, nb_epoch=4, 
             validation_data=(conv_val_vgg_features, val_labels))

Train on 18009 samples, validate on 4415 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x7f94497aa110>

##### A classification accuracy of 75.38% was achieved

In [24]:
# test set predictions 
preds=bn_model.predict(conv_test_vgg_features,batch_size=batch_size*2)
clip_preds= np.clip(preds, .0078, .93)

In [25]:
subm_name = path+'results/subm.gz'

In [29]:
classes = sorted(batches.class_indices, key=batches.class_indices.get)
submission = pd.DataFrame(clip_preds, columns=classes)

In [30]:
# add the image column to be the first one 
submission.insert(0, 'img', [a[12:] for a in test_filenames])
submission.head()

Unnamed: 0,img,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9
0,img_81601.jpg,0.130433,0.022666,0.0078,0.0078,0.013161,0.0078,0.28103,0.089333,0.025723,0.429339
1,img_14887.jpg,0.77222,0.125298,0.026401,0.035665,0.0078,0.013039,0.0078,0.0078,0.0078,0.012862
2,img_62885.jpg,0.15378,0.0078,0.011335,0.052795,0.730884,0.0078,0.011708,0.008977,0.017077,0.009495
3,img_45125.jpg,0.100768,0.038944,0.200454,0.0078,0.036892,0.0078,0.353301,0.057405,0.192221,0.017755
4,img_22633.jpg,0.093843,0.333974,0.030681,0.0078,0.009634,0.023245,0.0078,0.028647,0.223018,0.247936


In [31]:
submission.to_csv(subm_name, index=False, compression='gzip')
FileLink(subm_name)