In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test', 'sample_submission.csv', 'train']


In [2]:
from keras.layers import Dense, Flatten, Dropout, Lambda, Input
from keras.models import Model
from keras.applications import *
from keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.callbacks import ModelCheckpoint

Using TensorFlow backend.


In [3]:
# Collect the training image names and derive a label for each one.
# os.listdir returns entries in arbitrary order, so the listing is sorted:
# without a stable order the seeded train/validation split made from
# these lists would differ between runs and filesystems.
filenames = sorted(os.listdir("../input/train/train"))

# The label is the part of the file name before the first '.';
# anything that is not 'cat' is treated as 'dog'.
labels = ['cat' if name.split('.')[0] == 'cat' else 'dog' for name in filenames]

In [4]:
# One row per training image: file name plus its 'cat'/'dog' label.
df = pd.DataFrame({'filename': filenames, 'label': labels})

# Hold out 10% of the rows for validation (fixed seed), then give both
# partitions a fresh 0..n-1 index.
train_df, validation_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.1, random_state=42)
)

In [5]:
# Input image geometry fed to the network and the mini-batch size.
height, width, channels = 224, 224, 3
batch_size = 32

# Row counts of each partition, used later to derive steps per epoch.
train_num, validation_num = len(train_df), len(validation_df)

In [6]:
# Stage 1: train without any data augmentation.
# Both generators read from the same image directory with the same
# settings; only the dataframe they iterate over differs.
flow_kwargs = dict(
    x_col='filename',
    y_col='label',
    target_size=(height, width),
    class_mode='binary',
    batch_size=batch_size,
)

train_datagen = ImageDataGenerator()
train_generator = train_datagen.flow_from_dataframe(
    train_df, '../input/train/train/', **flow_kwargs
)

validation_datagen = ImageDataGenerator()
validation_generator = validation_datagen.flow_from_dataframe(
    validation_df, '../input/train/train/', **flow_kwargs
)

Found 22500 images belonging to 2 classes.
Found 2500 images belonging to 2 classes.


In [7]:
# Define the VGG16-based binary classifier.
# VGG preprocessing is applied inside the graph by a Lambda layer, so the
# data generators can feed un-normalised images.
lambda_fun = vgg16.preprocess_input
inp = Input(shape=(height, width, channels))
base_model = vgg16.VGG16(
    input_tensor=Lambda(lambda_fun)(inp),
    weights='imagenet',
    include_top=False,
    pooling='avg',
)

# Freeze every layer of the base so that only the new head is trained
# in the first stage.
for frozen_layer in base_model.layers:
    frozen_layer.trainable = False

# Classification head: dropout -> 256-unit ReLU -> dropout -> sigmoid unit.
x = base_model.output
x = Dropout(0.5)(x)
x = Dense(256, activation='relu')(x)
x = Dropout(0.5)(x)
x = Dense(1, activation='sigmoid')(x)
vgg_finetune_model = Model(inp, x)

vgg_finetune_model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

vgg_finetune_model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
lambda_1 (Lambda)            (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_po

In [8]:
# Keep only the weights of the epoch with the lowest validation loss.
checkpointer = ModelCheckpoint(filepath='dogcat.weights.best.hdf5', verbose=1,
                               save_best_only=True)

# Stage 1: train the new head on un-augmented images.
# validation_steps uses len(validation_generator) (number of batches,
# including the final partial one) rather than validation_num // batch_size,
# which dropped the last partial batch. Scoring every validation image
# each epoch keeps the val_loss values compared by the checkpoint
# consistent from epoch to epoch.
vgg_finetune_model.fit_generator(
    train_generator,
    epochs = 10,
    steps_per_epoch = train_num // batch_size,
    validation_data = validation_generator,
    validation_steps = len(validation_generator),
    verbose = 0,
    callbacks = [checkpointer]
)

Instructions for updating:
Use tf.cast instead.

Epoch 00001: val_loss improved from inf to 0.05055, saving model to dogcat.weights.best.hdf5

Epoch 00002: val_loss improved from 0.05055 to 0.04885, saving model to dogcat.weights.best.hdf5

Epoch 00003: val_loss did not improve from 0.04885

Epoch 00004: val_loss did not improve from 0.04885

Epoch 00005: val_loss did not improve from 0.04885

Epoch 00006: val_loss improved from 0.04885 to 0.04697, saving model to dogcat.weights.best.hdf5

Epoch 00007: val_loss did not improve from 0.04697

Epoch 00008: val_loss did not improve from 0.04697

Epoch 00009: val_loss improved from 0.04697 to 0.04209, saving model to dogcat.weights.best.hdf5

Epoch 00010: val_loss did not improve from 0.04209


<keras.callbacks.History at 0x7fc0ee189710>

In [9]:
# Reload the weights file written by the ModelCheckpoint callback
# (created with save_best_only=True) before continuing.
vgg_finetune_model.load_weights('dogcat.weights.best.hdf5')

In [10]:
# Stage 2: unfreeze only the last convolutional block (block5) of VGG16.
# Layers are selected by name rather than by position: a positional slice
# such as [16:] only lines up with block5 because the Input and Lambda
# layers come first in base_model.layers, and it would silently select the
# wrong layers if the model's front end changed.
# The change takes effect once the model is compiled again.
for layer in base_model.layers:
    layer.trainable = layer.name.startswith('block5')

In [11]:
from keras.optimizers import SGD

# Compile again after changing the layers' trainable flags, this time with
# a small-learning-rate SGD optimizer with momentum for the fine-tuning stage.
fine_tune_optimizer = SGD(lr=0.0001, momentum=0.9)
vgg_finetune_model.compile(
    loss='binary_crossentropy',
    optimizer=fine_tune_optimizer,
    metrics=['accuracy'],
)

In [12]:
# Stage 2 input pipeline: same training dataframe, now with random
# augmentation (rotation, shear, zoom, shifts and horizontal flips).
augmentation_options = dict(
    rotation_range=15,
    shear_range=0.1,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
)
train_aug_datagen = ImageDataGenerator(**augmentation_options)

train_aug_generator = train_aug_datagen.flow_from_dataframe(
    train_df,
    '../input/train/train/',
    x_col='filename',
    y_col='label',
    target_size=(height, width),
    class_mode='binary',
    batch_size=batch_size,
)

Found 22500 images belonging to 2 classes.


In [13]:
# Stage 2: fine-tune on augmented images, reusing the same checkpoint
# callback so the weights file is only overwritten when val_loss improves
# on the best value that callback has recorded.
# validation_steps uses len(validation_generator) (number of batches,
# including the final partial one) rather than validation_num // batch_size,
# which dropped the last partial batch, so every validation image is
# scored each epoch.
vgg_finetune_model.fit_generator(
    train_aug_generator,
    epochs = 20,
    steps_per_epoch = train_num // batch_size,
    validation_data = validation_generator,
    validation_steps = len(validation_generator),
    verbose = 0,
    callbacks = [checkpointer]
)


Epoch 00001: val_loss improved from 0.04209 to 0.03129, saving model to dogcat.weights.best.hdf5

Epoch 00002: val_loss did not improve from 0.03129

Epoch 00003: val_loss improved from 0.03129 to 0.02896, saving model to dogcat.weights.best.hdf5

Epoch 00004: val_loss improved from 0.02896 to 0.02852, saving model to dogcat.weights.best.hdf5

Epoch 00005: val_loss did not improve from 0.02852

Epoch 00006: val_loss improved from 0.02852 to 0.02667, saving model to dogcat.weights.best.hdf5

Epoch 00007: val_loss did not improve from 0.02667

Epoch 00008: val_loss did not improve from 0.02667

Epoch 00009: val_loss improved from 0.02667 to 0.02571, saving model to dogcat.weights.best.hdf5

Epoch 00010: val_loss improved from 0.02571 to 0.01989, saving model to dogcat.weights.best.hdf5

Epoch 00011: val_loss did not improve from 0.01989

Epoch 00012: val_loss did not improve from 0.01989

Epoch 00013: val_loss did not improve from 0.01989

Epoch 00014: val_loss did not improve from 0.01

<keras.callbacks.History at 0x7fc0ee177c50>

In [14]:
# Load the checkpointed weights file before predicting on the test set,
# instead of using the weights left in memory after the last epoch.
vgg_finetune_model.load_weights('dogcat.weights.best.hdf5')

In [15]:
# Build an unlabeled, unshuffled generator over the test images so that
# predictions come back in the same order as test_generator.filenames.
test_filenames = os.listdir("../input/test/test")
test_df = pd.DataFrame({'filename': test_filenames})
num_test = len(test_df)

test_datagen = ImageDataGenerator()

test_flow_options = dict(
    x_col='filename',
    y_col=None,
    class_mode=None,          # no labels: the generator yields images only
    target_size=(height, width),
    batch_size=batch_size,
    shuffle=False,
)
test_generator = test_datagen.flow_from_dataframe(
    test_df, '../input/test/test/', **test_flow_options
)

Found 12500 images.


In [16]:
# Predict over the whole test set (ceil so the final partial batch is
# included), then clip the probabilities away from 0 and 1.
# NOTE(review): the clipping is presumably there to bound the penalty of a
# log-loss style metric on confident mistakes; confirm against the
# competition's scoring rule.
prediction = np.clip(
    vgg_finetune_model.predict_generator(
        test_generator,
        steps=np.ceil(num_test / batch_size),
    ),
    0.005,
    0.995,
)

In [17]:
# Fill the sample submission with the predicted probabilities and save it.
submission_df = pd.read_csv('../input/sample_submission.csv')

# Each test file is named "<id>.<ext>"; recover the integer id from the
# file name (any directory prefix is stripped first).
test_ids = np.array([
    int(os.path.splitext(os.path.basename(fname))[0])
    for fname in test_generator.filenames
])

# The model ends in a single sigmoid unit, so predictions have shape (N, 1).
# Flatten to (N,) so that scalars, not 1-element arrays, are written
# into the label column.
probabilities = np.asarray(prediction).reshape(-1)
assert len(test_ids) == len(probabilities), "one prediction per test file expected"

# Make sure the label column can hold probabilities even if the sample
# file stores integers there.
submission_df['label'] = submission_df['label'].astype('float64')

# As in the original loop, row (id - 1) of the sample submission is assumed
# to belong to image <id> (TODO confirm against the sample file's id column).
submission_df.loc[test_ids - 1, 'label'] = probabilities
submission_df.to_csv('submission.csv', index=False)