In [1]:
import os, shutil, random, glob
import bcolz
import keras
import keras
import keras.preprocessing.image
from keras.layers import Input, Flatten, Dense, Dropout, Activation, BatchNormalization
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import scipy

Using TensorFlow backend.


The code below assumes that the train data from the https://www.kaggle.com/c/dogs-vs-cats competition has been downloaded and unzipped into the `train` directory under the root of the repository.

In [2]:
# Sort the flat Kaggle dump (train/cat.0.jpg, train/dog.0.jpg, ...) into the
# per-class sub-directories train/cats and train/dogs, which is the layout
# that flow_from_directory('train', ...) reads.
#
# Only regular files are collected: on a re-run train/cats and train/dogs
# already exist and must not be moved into themselves.
files = [f for f in glob.glob(os.path.join('train', '*')) if os.path.isfile(f)]
# basename is separator-agnostic, unlike splitting on '/'.
fnames = [os.path.basename(f) for f in files]

# exist_ok=True keeps the cell idempotent under Restart & Run All.
os.makedirs(os.path.join('train', 'cats'), exist_ok=True)
os.makedirs(os.path.join('train', 'dogs'), exist_ok=True)

for fname in fnames:
    # The class is encoded in the file name; anything without 'dog' is a cat.
    dogs_or_cats = 'dogs' if 'dog' in fname else 'cats'
    shutil.move(
        os.path.join('train', fname),
        os.path.join('train', dogs_or_cats, fname),
    )

In [3]:
# Image generator that applies the VGG19 input preprocessing to every image.
gen = keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=keras.applications.vgg19.preprocess_input,
)

# One image per batch, unshuffled, so the generator yields images in the same
# order as its `filenames` / `classes` attributes.
train_data = gen.flow_from_directory(
    directory='train',
    target_size=(224, 224),
    batch_size=1,
    shuffle=False,
)

Found 25000 images belonging to 2 classes.


In [4]:
# Persist the file names (in generator order) to the 'train_filenames' bcolz dir.
train_filenames = train_data.filenames
filenames_store = bcolz.carray(train_filenames, rootdir='train_filenames', mode='w')
filenames_store.flush()

# One-hot encode the integer class ids and persist them to 'train_y'.
train_y = keras.utils.to_categorical(train_data.classes)
labels_store = bcolz.carray(train_y, rootdir='train_y', mode='w')
labels_store.flush()

In [5]:
# VGG19 with ImageNet weights and without its fully-connected top, used as a
# fixed feature extractor. No global pooling is applied to its output.
vgg19_options = dict(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
    pooling=None,
)
base_model = keras.applications.vgg19.VGG19(**vgg19_options)

In [6]:
# The generator uses batch_size=1, so one prediction step per image covers
# the whole directory exactly once.
n_steps = train_data.n
train_X = base_model.predict_generator(train_data, steps=n_steps)

# Cache the extracted features on disk in the 'train_X' bcolz directory.
features_store = bcolz.carray(train_X, rootdir='train_X', mode='w')
features_store.flush()

In [7]:
# Split the pre-computed features into a tiny training set (6 images) and a
# validation pool made of every remaining image.
n_samples = len(train_y)

# Sample WITHOUT replacement: randint could return the same index twice,
# which would silently shrink the training set and grow the validation pool.
trn_ids = np.random.choice(n_samples, size=6, replace=False)
val_ids = np.delete(np.arange(n_samples), trn_ids)

trn_X = train_X[trn_ids, ...]
trn_y = train_y[trn_ids]

# Quick-feedback validation subset of (up to) 500 images. The indices are
# drawn from val_ids, not from the raw 0..n range, so a training image can
# never leak into the validation data.
random_subset = np.random.choice(val_ids, size=min(500, len(val_ids)), replace=False)
val_X = train_X[random_subset, ...]
val_y = train_y[random_subset]

In [8]:
# Classification head that consumes (7, 7, 512) feature maps and outputs a
# 2-way softmax.
inputs = Input(shape=(7, 7, 512))

# Layers applied in sequence between the input and the final softmax.
head_layers = [
    keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
    Flatten(),
    Dense(4096),
    BatchNormalization(),
    Activation('relu'),
    Dense(2),
    BatchNormalization(),
]

x = inputs
for layer in head_layers:
    x = layer(x)

predictions = Activation('softmax')(x)

model = keras.models.Model(inputs, predictions)

In [9]:
# Adam with a small learning rate; the labels are one-hot, hence categorical
# cross-entropy.
model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [10]:
# Train on the six selected images (a single full batch per epoch) and
# validate against the current val_X / val_y after every epoch.
model.fit(
    x=trn_X,
    y=trn_y,
    batch_size=6,
    epochs=40,
    validation_data=(val_X, val_y),
    verbose=2,
)

Train on 6 samples, validate on 500 samples
Epoch 1/40
0s - loss: 1.3258 - acc: 0.3333 - val_loss: 4.9667 - val_acc: 0.4740
Epoch 2/40
0s - loss: 0.2921 - acc: 1.0000 - val_loss: 2.8465 - val_acc: 0.5680
Epoch 3/40
0s - loss: 0.2375 - acc: 1.0000 - val_loss: 1.9425 - val_acc: 0.6440
Epoch 4/40
0s - loss: 0.2160 - acc: 1.0000 - val_loss: 1.5847 - val_acc: 0.6720
Epoch 5/40
0s - loss: 0.2037 - acc: 1.0000 - val_loss: 1.4067 - val_acc: 0.6820
Epoch 6/40
0s - loss: 0.1957 - acc: 1.0000 - val_loss: 1.2935 - val_acc: 0.6740
Epoch 7/40
0s - loss: 0.1900 - acc: 1.0000 - val_loss: 1.2052 - val_acc: 0.6780
Epoch 8/40
0s - loss: 0.1858 - acc: 1.0000 - val_loss: 1.1306 - val_acc: 0.6780
Epoch 9/40
0s - loss: 0.1825 - acc: 1.0000 - val_loss: 1.0668 - val_acc: 0.6800
Epoch 10/40
0s - loss: 0.1799 - acc: 1.0000 - val_loss: 1.0111 - val_acc: 0.6840
Epoch 11/40
0s - loss: 0.1777 - acc: 1.0000 - val_loss: 0.9620 - val_acc: 0.6780
Epoch 12/40
0s - loss: 0.1759 - acc: 1.0000 - val_loss: 0.9182 - val_acc: 

<keras.callbacks.History at 0x7f2fd43a2f98>

Let's validate on the rest of the training set, i.e. every image that was not used for training.

In [11]:
# Replace the validation data with every image listed in val_ids.
val_X, val_y = train_X[val_ids, ...], train_y[val_ids]

In [12]:
# One more epoch on the same six images; its purpose is to report
# val_loss / val_acc on the new, larger validation data.
model.fit(
    x=trn_X,
    y=trn_y,
    batch_size=6,
    epochs=1,
    validation_data=(val_X, val_y),
    verbose=2,
)

Train on 6 samples, validate on 24994 samples
Epoch 1/1
13s - loss: 0.1632 - acc: 1.0000 - val_loss: 0.5085 - val_acc: 0.7163


<keras.callbacks.History at 0x7f2fb74bbef0>

In [13]:
# Show which image files ended up in the training set.
[train_filenames[sample_id] for sample_id in trn_ids]

['cats/cat.7981.jpg',
 'cats/cat.6557.jpg',
 'dogs/dog.3869.jpg',
 'cats/cat.9467.jpg',
 'cats/cat.863.jpg',
 'dogs/dog.4733.jpg']