In [182]:
import os, shutil, random, glob
import bcolz
import keras
import keras
import keras.preprocessing.image
from keras.layers import Input, Flatten, Dense, Dropout, Activation, BatchNormalization
%matplotlib inline
from matplotlib import pyplot as plt
import numpy as np
import scipy

The code below assumes that the training data from the https://www.kaggle.com/c/dogs-vs-cats competition has been downloaded and unzipped into the `train` directory under the root of the repository.

In [23]:
# Sort the flat Kaggle dump into one sub-directory per class (train/cats, train/dogs),
# which is the layout flow_from_directory expects.

# exist_ok=True keeps the cell re-runnable: a second run no longer fails because
# the class directories already exist.
for class_dir in ('cats', 'dogs'):
    os.makedirs(os.path.join('train', class_dir), exist_ok=True)

# Only regular files are moved; without this filter a re-run would pick up the
# class directories themselves and try to move e.g. train/cats into train/cats.
files = [f for f in glob.glob('train/*') if os.path.isfile(f)]
# basename() instead of split('/') so this does not depend on the path separator.
fnames = [os.path.basename(f) for f in files]

for fname in fnames:
    # Any file name containing 'dog' goes to dogs/; everything else goes to cats/.
    dogs_or_cats = 'dogs' if 'dog' in fname else 'cats'
    shutil.move(f'train/{fname}', f'train/{dogs_or_cats}/{fname}')

In [3]:
# Image generator that applies the VGG19 preprocessing function to every image.
gen = keras.preprocessing.image.ImageDataGenerator(
    preprocessing_function=keras.applications.vgg19.preprocess_input,
)

# Iterate over train/<class>/ one image at a time, resized to 224x224.
# shuffle=False keeps a fixed order so that train_data.filenames / .classes
# line up with whatever is computed from this iterator.
train_data = gen.flow_from_directory(
    'train',
    target_size=(224, 224),
    batch_size=1,
    shuffle=False,
)

Found 25000 images belonging to 2 classes.


In [92]:
# Cache the file names in generator order on disk (bcolz directory 'train_filenames').
train_filenames = train_data.filenames
_filenames_store = bcolz.carray(train_filenames, rootdir='train_filenames', mode='w')
_filenames_store.flush()

# Convert the integer class ids to categorical (one-hot) labels and cache them
# on disk as well (bcolz directory 'train_y').
train_y = keras.utils.to_categorical(train_data.classes)
_labels_store = bcolz.carray(train_y, rootdir='train_y', mode='w')
_labels_store.flush()

In [230]:
# VGG19 with ImageNet weights and without its fully-connected top, used below
# as a fixed feature extractor for 224x224 RGB inputs.
base_model = keras.applications.vgg19.VGG19(
    weights='imagenet',
    include_top=False,           # drop the classifier layers
    pooling=None,                # no global pooling on the output
    input_shape=(224, 224, 3),
)

In [43]:
# Run every image through the VGG19 base once. The generator was created with
# batch_size=1, so one step per image (steps=train_data.n) covers the whole set.
train_X = base_model.predict_generator(
    train_data,
    steps=train_data.n,
)

# Cache the extracted features on disk (bcolz directory 'train_X').
_features_store = bcolz.carray(train_X, rootdir='train_X', mode='w')
_features_store.flush()

In [229]:
# Split the cached features into a tiny training set (6 samples) and a
# validation pool made of every remaining sample.
n_samples = len(train_y)

# Sample WITHOUT replacement so the 6 training indices are guaranteed distinct
# (randint samples with replacement and may return duplicates).
trn_ids = np.random.choice(n_samples, size=6, replace=False)
# Validation pool = all indices except the training ones. The original cell
# referenced an undefined name (`random_ids`) here, which raises NameError on
# a fresh kernel.
val_ids = np.delete(np.arange(n_samples), trn_ids)

trn_X = train_X[trn_ids, ...]
trn_y = train_y[trn_ids]

# Quick validation subset: 500 distinct sample ids drawn from the validation
# pool itself, so none of the training samples can leak into it.
random_subset = np.random.choice(val_ids, size=500, replace=False)
val_X = train_X[random_subset, ...]
val_y = train_y[random_subset]

In [231]:
# Small classification head that consumes (7, 7, 512) feature maps.
inputs = Input(shape=(7, 7, 512))

# Layers applied in order before the final softmax:
# max-pool -> flatten -> Dense(4096) + BN + ReLU -> Dense(2) + BN.
head_layers = [
    keras.layers.MaxPooling2D(pool_size=(2, 2), strides=(2, 2)),
    Flatten(),
    Dense(4096),
    BatchNormalization(),
    Activation('relu'),
    Dense(2),
    BatchNormalization(),
]

x = inputs
for layer in head_layers:
    x = layer(x)

# Two-way softmax over the two classes.
predictions = Activation('softmax')(x)

model = keras.models.Model(inputs, predictions)

In [232]:
# Adam with learning rate 1e-4, categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [233]:
# Train on the 6 training samples as a single batch for 40 epochs, reporting
# loss/accuracy on the (val_X, val_y) validation arrays after each epoch.
model.fit(
    x=trn_X,
    y=trn_y,
    batch_size=6,
    epochs=40,
    validation_data=(val_X, val_y),
    verbose=2,
)

Train on 6 samples, validate on 500 samples
Epoch 1/40
1s - loss: 0.6546 - acc: 0.5000 - val_loss: 2.9313 - val_acc: 0.6500
Epoch 2/40
0s - loss: 0.1762 - acc: 1.0000 - val_loss: 1.6842 - val_acc: 0.7440
Epoch 3/40
0s - loss: 0.1462 - acc: 1.0000 - val_loss: 1.0972 - val_acc: 0.8020
Epoch 4/40
0s - loss: 0.1407 - acc: 1.0000 - val_loss: 0.7913 - val_acc: 0.8200
Epoch 5/40
0s - loss: 0.1385 - acc: 1.0000 - val_loss: 0.6022 - val_acc: 0.8420
Epoch 6/40
0s - loss: 0.1368 - acc: 1.0000 - val_loss: 0.4833 - val_acc: 0.8480
Epoch 7/40
0s - loss: 0.1353 - acc: 1.0000 - val_loss: 0.4062 - val_acc: 0.8580
Epoch 8/40
0s - loss: 0.1342 - acc: 1.0000 - val_loss: 0.3549 - val_acc: 0.8700
Epoch 9/40
0s - loss: 0.1333 - acc: 1.0000 - val_loss: 0.3201 - val_acc: 0.8780
Epoch 10/40
0s - loss: 0.1326 - acc: 1.0000 - val_loss: 0.2960 - val_acc: 0.8820
Epoch 11/40
0s - loss: 0.1321 - acc: 1.0000 - val_loss: 0.2794 - val_acc: 0.8800
Epoch 12/40
0s - loss: 0.1316 - acc: 1.0000 - val_loss: 0.2680 - val_acc: 

<keras.callbacks.History at 0x7efca0264780>

Let's validate on the rest of the training set, i.e. every image that was not used for training.

In [234]:
# Replace the validation arrays with the features and labels selected by val_ids.
val_X, val_y = train_X[val_ids, ...], train_y[val_ids]

In [235]:
# Run one more epoch on the same 6 training samples; the point of this call is
# the val_loss / val_acc it reports on the larger (val_X, val_y) arrays.
model.fit(
    x=trn_X,
    y=trn_y,
    batch_size=6,
    epochs=1,
    validation_data=(val_X, val_y),
    verbose=2,
)

Train on 6 samples, validate on 24994 samples
Epoch 1/1
14s - loss: 0.1264 - acc: 1.0000 - val_loss: 0.3422 - val_acc: 0.8997


<keras.callbacks.History at 0x7efca097c5c0>

In [242]:
# Show which image files the training indices refer to.
[train_filenames[sample_id] for sample_id in trn_ids]

['dogs/dog.10586.jpg',
 'dogs/dog.3688.jpg',
 'cats/cat.12408.jpg',
 'dogs/dog.7210.jpg',
 'cats/cat.1041.jpg',
 'cats/cat.9606.jpg']