In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os,cv2,re,random
from keras.preprocessing.image import ImageDataGenerator
from keras.preprocessing.image import img_to_array , load_img
from keras import layers,models,optimizers
from keras import backend as K
from sklearn.model_selection import train_test_split
    
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:

# Kaggle input folders for the labelled training images and the unlabelled test images.
train_dir = "../input/train/"
test_dir = "../input/test/"

# Full path of every file in each folder (i.e. the whole dataset, in os.listdir order).
train_img_dogs_cat = [os.path.join(train_dir, file_name) for file_name in os.listdir(train_dir)]
test_img_dogs_cat = [os.path.join(test_dir, file_name) for file_name in os.listdir(test_dir)]


Helper function to sort the image files based on the numeric value in each file name.

In [None]:
def atoi(text):
    """Return ``text`` as an int when it consists only of digits, else return it unchanged."""
    return int(text) if text.isdigit() else text


def natural_keys(text):
    """Build a sort key that orders the numbers embedded in ``text`` by value.

    The string is split into alternating non-digit and digit runs and each
    digit run is converted to an int, so that e.g. "cat.2.jpg" sorts before
    "cat.10.jpg" (plain string comparison would put "cat.10.jpg" first).

    Args:
        text: the string (here, an image file path) to build a key for.

    Returns:
        A list mixing str and int pieces, suitable as ``key=`` for sort().
    """
    # Raw string: '\d' inside a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    return [atoi(chunk) for chunk in re.split(r'(\d+)', text)]

Sort the training set. Use 1300 images each of cats and dogs instead of all 25000 to speed up the learning process.

Sort the test set

In [None]:
# Natural sort: file names are ordered by the numeric value they contain.
train_img_dogs_cat.sort(key = natural_keys)

# Keep the first 1300 cat images and the first 1300 dog images. Selecting by
# file name (instead of fixed positions 0:1300 and 12500:13800) gives the same
# result on the full data set, does not depend on there being exactly 12500
# cats, and is safe to re-run: running this cell twice yields the same list.
train_img_dogs_cat = (
    [path for path in train_img_dogs_cat if 'cat' in os.path.basename(path)][:1300]
    + [path for path in train_img_dogs_cat if 'dog' in os.path.basename(path)][:1300]
)

test_img_dogs_cat.sort(key = natural_keys)


Now the images have to be represented as numbers. To do this, read and resize each image using the OpenCV library.

Generate labels for the supervised learning set.

Below is the helper function to do so.

In [None]:
img_height = 150
img_width = 150


def prepare_data(list_of_images):
    """
    Load, resize and label a list of image files.

    Args:
        list_of_images: iterable of image file paths.

    Returns two lists:
        x is a list of images read with cv2.imread and resized to
          img_width x img_height
        y is a list of labels derived from each file name: 1 when it contains
          'dog', 0 when it contains 'cat'. A file whose name contains neither
          adds no label, so y is empty for an unlabelled test set.

    Raises:
        IOError: if an image file cannot be read.
    """
    x = [] # images as arrays
    y = [] # labels

    for image in list_of_images:
        pixels = cv2.imread(image)
        if pixels is None:
            # cv2.imread signals a missing/corrupt file by returning None;
            # fail here with the offending path instead of inside cv2.resize.
            raise IOError("Could not read image file: {}".format(image))
        x.append(cv2.resize(pixels, (img_width,img_height), interpolation=cv2.INTER_CUBIC))

        # Look only at the file name so that a directory called e.g. "dogs-vs-cats"
        # cannot influence the label.
        file_name = os.path.basename(image)
        if 'dog' in file_name:
            y.append(1)
        elif 'cat' in file_name:
            y.append(0)

    return x, y
    
    

Generate X and Y using the helper function above

Since K.image_data_format() is channels_last, the input_shape given to the first Keras layer has the form (rows, cols, channels). Here that is 150 x 150 x 3, where the '3' is because these are color images.

In [None]:
# Load the selected training images and derive their labels from the file names.
X, Y = prepare_data(list_of_images=train_img_dogs_cat)

# Show which axis holds the color channels for this Keras backend.
print(K.image_data_format())

Split the data set containing 2600 images into two parts: a training set and a validation set. Later, you will see that accuracy and loss on the validation set are also reported while fitting the model on the training set.

In [None]:
# Hold out 20% of the samples for validation; the fixed random_state makes the
# split repeatable between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Mini-batch size, plus the sample counts used to derive the steps per epoch.
batch_size = 16
nb_train_samples, nb_validation_samples = len(X_train), len(X_val)

We will use the Sequential model from Keras to build the neural network. A Sequential model is used to construct simple models as a linear stack of layers.

More info on Sequential model and Keras in general at https://keras.io/getting-started/sequential-model-guide/ and https://github.com/keras-team/keras

In [None]:
# Small convolutional network: three Conv -> ReLU -> MaxPool stages followed by
# a dense head that ends in a single sigmoid unit (label 1 = dog, 0 = cat).
model = models.Sequential()

# With channels_last data an image array is (rows, cols, channels), i.e.
# (height, width, 3). Passing (width, height, 3) only works while both are equal.
model.add(layers.Conv2D(32,(3,3),input_shape=(img_height,img_width,3)))
model.add(layers.Activation('relu'))
model.add(layers.MaxPooling2D(pool_size = (2,2)))

model.add(layers.Conv2D(32,(3,3)))
model.add(layers.Activation('relu'))
model.add(layers.MaxPooling2D(pool_size = (2,2)))

model.add(layers.Conv2D(64,(3,3)))
model.add(layers.Activation('relu'))
model.add(layers.MaxPooling2D(pool_size = (2,2)))

# Classifier head; dropout regularizes the dense layer.
model.add(layers.Flatten())
model.add(layers.Dense(64))
model.add(layers.Activation('relu'))
model.add(layers.Dropout(0.5))
model.add(layers.Dense(1))
model.add(layers.Activation('sigmoid'))

# Binary cross-entropy matches the single sigmoid output and the 0/1 labels.
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

model.summary()

These are the image generator configurations (pixel rescaling and augmentation) we will use for training and validation.



In [None]:
# Training images are rescaled to [0, 1] and randomly sheared, zoomed and
# flipped, so the network sees a slightly different image every epoch.
train_datagen = ImageDataGenerator(
    rescale=1. / 255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True)

# Held-out images are only rescaled: random augmentation here would make the
# reported metrics noisy and not comparable between epochs.
val_datagen = ImageDataGenerator(rescale=1. / 255)

Prepare generators for training and validation sets

In [None]:
# Batches of training images drawn through the training generator configuration.
train_generator = train_datagen.flow(np.array(X_train),y_train,batch_size = batch_size)

# The validation set goes through val_datagen (it was previously fed through
# train_datagen, leaving val_datagen unused for validation).
validation_generator = val_datagen.flow(np.array(X_val),y_val,batch_size = batch_size)

Start training the model!

For better accuracy and lower loss, we train for 30 epochs. The number of epochs can be increased for better results.

In [None]:
# Fit the network for 30 epochs. Integer division means a trailing partial
# batch is not counted in the per-epoch step totals.
hystory = model.fit_generator(
    train_generator,
    epochs=30,
    steps_per_epoch=nb_train_samples // batch_size,
    validation_data=validation_generator,
    validation_steps=nb_validation_samples // batch_size,
)

In [None]:
# Test file names carry no 'cat'/'dog' marker, so Y_test comes back as [].
X_test, Y_test = prepare_data(list_of_images=test_img_dogs_cat)

In [None]:
# Test images are only rescaled to [0, 1]; no random augmentation is configured.
test_datagen = ImageDataGenerator(
    rescale=1.0 / 255,
)

In [None]:
# Number of batches needed to cover every test image exactly once (the last
# batch may be partial). Derived from the test set instead of a hard-coded 782.
steps = (len(X_test) + batch_size - 1) // batch_size

# Use the rescale-only test generator, and turn shuffling off: flow() shuffles
# by default, which would return predictions in a random order that no longer
# matches the order of test_img_dogs_cat.
test_generator = test_datagen.flow(np.array(X_test), batch_size=batch_size, shuffle=False)

prediction_probabilities = model.predict_generator(test_generator, verbose=1,steps=steps)

In [None]:
# Ids are the 1-based positions in the sorted test file list.
counter = range(1, len(test_img_dogs_cat) + 1)

# The model ends in a single output unit, so each prediction is a one-element
# row; flatten to one probability per image instead of round-tripping every
# value through str() and bracket stripping.
solution = pd.DataFrame({
    "id": counter,
    "label": np.asarray(prediction_probabilities, dtype=float).reshape(-1),
})

solution.to_csv("dogsVScats.csv", index = False)