# Plankton VGG mashup
This notebook contains a mashup of VGG12 and the Kaggle National Data Science Bowl plankton competition. The data can be obtained at Kaggle on https://www.kaggle.com/c/datasciencebowl

## TODO
* Guestimate validation set size in accordance with Kaggle
* Reshape last three dense layers: done
* Data sample and validation : done
* Change means: done
* Add batchnorm: done, verify some
* Add dropout: done, verify some
* Check grayscale imaging: done
* Verify number of parameters: done
* Create submission
* Save model: done

In [1]:
import glob
import random
import math
%matplotlib inline
import utils; reload(utils)
from utils import plots
from __future__ import division,print_function

import os, json
import numpy as np
np.set_printoptions(precision=4, linewidth=100)
from matplotlib import pyplot as plt

data_dir = "/home/sanne/Data/plankton"
train_dir = data_dir + '/train'
sample_dir = data_dir + '/sample'
valid_dir = data_dir + '/valid'

n_train = 30336
n_validation = int(0.2 * n_train)
n_sample = 150


Using gpu device 0: GeForce GTX 970 (CNMeM is enabled with initial size: 25.0% of memory, cuDNN 5105)
Using Theano backend.


In [2]:
"""
from PIL import Image

def get_mean(tf):
    im = Image.open(train_dir+train_files[0])
    im = im.convert("RGB")
    im = im.resize((224,224))
    data = np.array(im)
    #print(data.shape)
    return data.mean()
    
means = []
for train_file in get_files(train_dir):
    means.append(get_mean(train_file))
    
print(np.mean(means))
"""

'\nfrom PIL import Image\n\ndef get_mean(tf):\n    im = Image.open(train_dir+train_files[0])\n    im = im.convert("RGB")\n    im = im.resize((224,224))\n    data = np.array(im)\n    #print(data.shape)\n    return data.mean()\n    \nmeans = []\nfor train_file in get_files(train_dir):\n    means.append(get_mean(train_file))\n    \nprint(np.mean(means))\n'

In [3]:
def get_files(d):
    return [f[len(d):] for f in glob.glob(d + '/*/*')]    

def get_subdirs(d):
    return [f[len(d):] for f in glob.glob(d + '/*')]    
    
def copy_subdirs(frm, to):
    subdirs = [f[len(frm):] for f in glob.glob(frm + '/*')]
    for s in subdirs:
        os.system("mkdir -p %s" % (to + s))
        
def create_sample(n):
    # Destroy and create new samples
    os.system("rm -rf {}".format(sample_dir))
    
    dirs = ['/valid', '/train', '/test']
    
    for d in dirs:
        os.system("mkdir -p %s" % (sample_dir + d))

    # Create all class in valid and train sample
    for d in dirs[:-1]:
        copy_subdirs(train_dir, sample_dir+d)

    os.system("mkdir -p %s" % (sample_dir + '/test/unknown'))

    for sample in random.sample(get_files(train_dir), n):
        target_dir = random.sample(dirs,1)[0]
        if not target_dir == '/test':
            os.system("cp %s %s" % \
                             (train_dir + sample, sample_dir + target_dir + sample))
        else:
            os.system("cp %s %s" % \
                             (train_dir + sample, sample_dir + target_dir + '/unknown/' + sample.split('/')[-1]))

def create_validation(n, clean=False):
    # Place files back
    files = get_files(valid_dir)
    for f in get_files(valid_dir):
        os.system("mv %s %s" % \
                         (valid_dir + f, train_dir + f))
        
    if clean:
        return
    
    # Destroy and create new validation set
    os.system("rm -rf {}".format(valid_dir))
    
    # Create all class in valid and train sample
    copy_subdirs(train_dir, valid_dir)
    
    for sample in random.sample(get_files(train_dir), n):
        os.system("mkdir -p %s" % \
                         (os.path.dirname(valid_dir + sample)))
        os.system("mv %s %s" % \
                         (train_dir + sample, valid_dir + sample))
    

In [4]:
"""
create_sample(n_sample)
create_validation(n_validation)
"""


'\ncreate_sample(n_sample)\ncreate_validation(n_validation)\n'

In [5]:
print ("Number of training examples: {} in {}".format(len(get_files(train_dir)), \
                                                     len(get_subdirs(train_dir))))
print ("Number of validation examples: {} in {}".format(len(get_files(valid_dir)), \
                                                       len(get_subdirs(valid_dir))))

print ("Total examples: {}/{}".format(len(get_files(valid_dir))+len(get_files(train_dir)), 30336))

print ("Number of train samples: {} in {}".format(len(get_files(sample_dir+'/train')), \
                                           len(get_subdirs(sample_dir+'/train'))))
print ("Number of valid samples: {} in {}".format(len(get_files(sample_dir+'/valid')), \
                                           len(get_subdirs(sample_dir+'/valid'))))

Number of training examples: 24269 in 121
Number of validation examples: 6067 in 121
Total examples: 30336/30336
Number of train samples: 46 in 121
Number of valid samples: 54 in 121


## Model setup

In [6]:
from numpy.random import random, permutation
from scipy import misc, ndimage
from scipy.ndimage.interpolation import zoom

import keras
from keras import backend as K
from keras.utils.data_utils import get_file
from keras.models import Sequential, Model
from keras.layers.core import Flatten, Dense, Dropout, Lambda
from keras.layers import Input
from keras.layers.convolutional import Convolution2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD, RMSprop, Adam, Adagrad, RMSprop
from keras.preprocessing import image
from keras.layers.normalization import BatchNormalization

In [7]:
FILES_PATH = 'http://www.platform.ai/models/'; CLASS_FILE='imagenet_class_index.json'
# Keras' get_file() is a handy function that downloads files, and caches them for re-use later
fpath = get_file(CLASS_FILE, FILES_PATH+CLASS_FILE, cache_subdir='models')
with open(fpath) as f: class_dict = json.load(f)
# Convert dictionary with string indexes into an array
classes = [class_dict[str(i)][1] for i in range(len(class_dict))]

In [8]:
batch_size=32

def ConvBlock(layers, model, filters, add=False):
    for i in range(layers): 
        model.add(ZeroPadding2D((1,1)))
        model.add(Convolution2D(filters, 3, 3, activation='relu'))
        if add: model.add(BatchNormalization(axis=1))
        if add: model.add(Dropout(0.5)) # verify
    model.add(MaxPooling2D((2,2), strides=(2,2)))
    if add: model.add(BatchNormalization(axis=1)) # verify
    if add: model.add(Dropout(0.5)) # verify
    
def FCBlock(model, add=False):
    model.add(Dense(4096, activation='relu'))
    # BatchNorm should be applied before Dropout
    #if add: model.add(BatchNormalization())
    if add: model.add(Dropout(0.10))

# Mean of each channel as provided by VGG researchers
vgg_mean = np.array([242.96,242.96,242.96]).reshape((3,1,1))

def vgg_preprocess(x):
    x = x - vgg_mean     # subtract mean
    #return x[:, ::-1]    # reverse axis bgr->rgb
    return x

def VGG_16():
    model = Sequential()
    # samples x channels x width x height
    # model.add(BatchNormalization(axis=1, input_shape=(3,176,176)))
    model.add(Lambda(vgg_preprocess, input_shape=(3,224,224), output_shape=(3,224,224)))

    ConvBlock(2, model, 64)
    ConvBlock(2, model, 128)
    ConvBlock(3, model, 256)
    ConvBlock(3, model, 512)
    ConvBlock(3, model, 512)

    model.add(Flatten())
    FCBlock(model)
    FCBlock(model)
    model.add(Dense(1000, activation='softmax'))
    return model

def get_batches(dirname, gen=image.ImageDataGenerator(), shuffle=True, 
                batch_size=batch_size, class_mode='categorical'):
    return gen.flow_from_directory(path+dirname, target_size=(224,224), 
                class_mode=class_mode, shuffle=shuffle, batch_size=batch_size)

In [9]:
def create_plankton():

    model = Sequential()
    # samples x channels x width x height
    # model.add(BatchNormalization(axis=1, input_shape=(3,176,176)))
    model.add(Lambda(vgg_preprocess, input_shape=(3,224,224), output_shape=(3,224,224)))

    ConvBlock(2, model, 64, add=True)
    ConvBlock(2, model, 128, add=True)
    ConvBlock(3, model, 256, add=True)
    ConvBlock(3, model, 512, add=True)
    ConvBlock(3, model, 512, add=True)

    model.add(Flatten())
    
    #model.load_weights('conv_layers.h5')
    
    FCBlock(model, add=True)
    FCBlock(model, add=True)
    model.add(Dense(121, activation='softmax'))
    return model

def tune(model):   
    for i in range(3): model.pop()
    FCBlock(model)
    FCBlock(model)
    model.add(Dense(121, activation='softmax'))
    return model

In [10]:
def VGG_16():
    model = Sequential()
    model.add(Lambda(vgg_preprocess, input_shape=(3,224,224), output_shape=(3,224,224)))

    ConvBlock(2, model, 64)
    ConvBlock(2, model, 128)
    ConvBlock(3, model, 256)
    ConvBlock(3, model, 512)
    ConvBlock(3, model, 512)

    model.add(Flatten())
    FCBlock(model)
    FCBlock(model)
    model.add(Dense(1000, activation='softmax'))
    return model

fpath = get_file('vgg16.h5', FILES_PATH+'vgg16.h5', cache_subdir='models')
model = VGG_16()
model.load_weights(fpath)
for i in range(3): model.pop()
for layer in model.layers: layer.trainable=False
FCBlock(model, True)
FCBlock(model, True)
model.add(Dense(121, activation='softmax'))

# Train net

In [11]:
# Settings
color_mode="grayscale"
lr = 0.001
nb_epoch = 15
path = data_dir + "/"
#path = data_dir + "/sample/"

In [12]:
# Data augmentation
#gen = image.ImageDataGenerator(rotation_range=360, width_shift_range=0.1, shear_range = 0.1, \
#                               height_shift_range=0.1, zoom_range=0.1, horizontal_flip=True)
gen = image.ImageDataGenerator(rotation_range=45, width_shift_range=0.1, shear_range = 0.1, \
                               height_shift_range=0.1, zoom_range=0.1, horizontal_flip=True, vertical_flip=True)

# Get the batch driver
batches = get_batches('train', batch_size=batch_size)
val_batches = get_batches('valid', shuffle=False, batch_size=batch_size)

# Create, compile and fit
#model = create_plankton()
model.summary()

model.compile(optimizer=Adam(lr=lr),
                loss='categorical_crossentropy', metrics=['accuracy'])

model.fit_generator(batches, samples_per_epoch=batches.nb_sample, nb_epoch=nb_epoch,
                validation_data=val_batches, nb_val_samples=val_batches.nb_sample)
model.save_weights('plankton.h5')


Found 24269 images belonging to 121 classes.
Found 6067 images belonging to 121 classes.
____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
lambda_1 (Lambda)                (None, 3, 224, 224)   0           lambda_input_1[0][0]             
____________________________________________________________________________________________________
zeropadding2d_1 (ZeroPadding2D)  (None, 3, 226, 226)   0           lambda_1[0][0]                   
____________________________________________________________________________________________________
convolution2d_1 (Convolution2D)  (None, 64, 224, 224)  1792        zeropadding2d_1[0][0]            
____________________________________________________________________________________________________
zeropadding2d_2 (ZeroPadding2D)  (None, 64, 226, 226)  0           convolution2d_1[0][0]            
__

## Getting imagenet predictions

In [85]:
def pred_batch(model, imgs):
    preds = model.predict_generator(test_batches, test_batches.nb_sample)
    idxs = np.argmax(preds, axis=1)

    print('Shape: {}'.format(preds.shape))
    print('First 5 classes: {}'.format(classes[:5]))
    print('First 5 probabilities: {}\n'.format(preds[0, :5]))
    print('Predictions prob/class: ')
    
    for i in range(len(idxs)):
        idx = idxs[i]
        print ('  {:.4f}/{}'.format(preds[i, idx], classes[idx]))

In [89]:
test_batches = get_batches('/test', shuffle=False, batch_size=batch_size, class_mode=None)
model = tune(create_vgg())
model.load_weights("plankton.h5")
pred_batch(model, test_batches)

Found 59 images belonging to 1 classes.
Shape: (59, 121)
First 5 classes: [u'tench', u'goldfish', u'great_white_shark', u'tiger_shark', u'hammerhead']
First 5 probabilities: [  1.1670e-12   9.3122e-13   9.4961e-08   1.7425e-09   1.6834e-06]

Predictions prob/class: 
  0.7026/quail
  0.8684/flatworm
  0.9999/flatworm
  0.9999/brambling
  0.8062/axolotl
  1.0000/black_and_gold_garden_spider
  0.9968/spotted_salamander
  0.4372/axolotl
  0.7142/African_chameleon
  0.3819/loggerhead
  0.8120/axolotl
  0.2671/axolotl
  1.0000/brambling
  0.9983/black_and_gold_garden_spider
  0.9632/black_and_gold_garden_spider
  0.9961/goose
  0.7372/African_chameleon
  0.5078/brambling
  1.0000/brambling
  0.5732/brambling
  0.9907/black_and_gold_garden_spider
  0.8119/African_chameleon
  0.6135/spotted_salamander
  1.0000/spotted_salamander
  0.9937/sea_slug
  0.9955/peacock
  0.8988/African_chameleon
  0.9042/African_chameleon
  1.0000/brambling
  0.4219/axolotl
  0.5819/brambling
  1.0000/brambling
  0.

# Show some samples

In [14]:
a = [1, 2]
b = [3, 4]
print(a.extend(b))

None


In [15]:
a

[1, 2, 3, 4]