In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import zipfile
import random
import shutil
import tensorflow as tf
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input

from shutil import copyfile
from os import getcwd

In [None]:
# Unzip the Cats-v-Dogs training archive into the current working directory
# (extractall() with no argument extracts relative to the cwd). Later cells
# read the extracted images from /kaggle/working/train.
path_cats_and_dogs = "/kaggle/input/dogs-vs-cats/train.zip"

local_zip = path_cats_and_dogs
# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
# Unzip the test archive into the current working directory; later cells
# read the extracted images from /kaggle/working/test1.
path_cats_and_dogs = "/kaggle/input/dogs-vs-cats/test1.zip"

local_zip = path_cats_and_dogs
# The context manager closes the archive even if extraction raises.
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall()

In [None]:
# Names of all entries in the extracted training folder
filenames = os.listdir("/kaggle/working/train")

In [None]:
# Number of entries found in the extracted training folder
len(filenames)

**Make a proper directory structure for splitting training and validation images. 
This will also help in using Keras Image generators for augmentation and preprocessing tasks**

In [None]:
base_dir = '/kaggle/working/'
source_dir = '/kaggle/working/train/'

# Per-class staging folders that hold every image of one class
cat_dir = os.path.join(base_dir, 'cat')
dog_dir = os.path.join(base_dir, 'dog')

# Roots of the training / validation trees consumed by the Keras generators
train_dir = os.path.join(base_dir, 'train1')
validation_dir = os.path.join(base_dir, 'validation1')

# Directories with our training cat/dog pictures
train_cats_dir = os.path.join(train_dir, 'cats')
train_dogs_dir = os.path.join(train_dir, 'dogs')

# Directories with our validation cat/dog pictures
validation_cats_dir = os.path.join(validation_dir, 'cats')
validation_dogs_dir = os.path.join(validation_dir, 'dogs')

# exist_ok=True keeps this cell idempotent: re-running it no longer raises
# FileExistsError the way a bare os.mkdir() does.
for directory in (
    cat_dir,
    dog_dir,
    train_dir,
    validation_dir,
    train_cats_dir,
    train_dogs_dir,
    validation_cats_dir,
    validation_dogs_dir,
):
    os.makedirs(directory, exist_ok=True)



**Copy all the files from the source directory to their respective training and validation folders**

In [None]:
# Copy every training image into its class folder. The class is the part of
# the file name before the first '.'.
category_dirs = {"cat": cat_dir, "dog": dog_dir}
for c in filenames:
    category = c.split('.')[0]
    dest_dir = category_dirs.get(category)
    if dest_dir is None:
        # Neither a cat nor a dog file: skip it rather than letting it
        # silently end up in the dog folder.
        continue
    # os.path.join avoids the doubled '/' produced by string concatenation
    # with a source_dir that already ends in a slash.
    copyfile(os.path.join(source_dir, c), os.path.join(dest_dir, c))

def split_data(SOURCE, TRAINING, VALIDATION, SPLIT_SIZE):
    """Randomly split the files in SOURCE between TRAINING and VALIDATION.

    The entries of SOURCE are shuffled; the first ``int(SPLIT_SIZE * n)`` are
    copied into TRAINING and the remainder into VALIDATION. SOURCE itself is
    left untouched.

    Args:
        SOURCE: directory whose entries are split.
        TRAINING: existing directory that receives the training share.
        VALIDATION: existing directory that receives the remaining files.
        SPLIT_SIZE: fraction of the files that goes to TRAINING.
    """
    f = os.listdir(SOURCE)
    train_size = int(SPLIT_SIZE * len(f))
    shuffled = random.sample(f, len(f))

    train_files = shuffled[:train_size]
    # Slice from train_size instead of using [-test_size:]: when the
    # validation share is empty, [-0:] would select *every* file.
    validation_files = shuffled[train_size:]

    for destination, names in ((TRAINING, train_files), (VALIDATION, validation_files)):
        for name in names:
            copyfile(os.path.join(SOURCE, name), os.path.join(destination, name))
        

# Seed the shuffle so the train/validation split is the same on every run
random.seed(42)

# Fraction of each class that goes to the training folders
split_size = .9
for class_source, class_train, class_validation in (
    (cat_dir, train_cats_dir, validation_cats_dir),
    (dog_dir, train_dogs_dir, validation_dogs_dir),
):
    split_data(class_source, class_train, class_validation, split_size)


***Check the number of files in each directory***

In [None]:
test_dir = '/kaggle/working/test1'

# Report how many entries each folder contains
directory_labels = [
    ("Total images in cat directory", cat_dir),
    ("Total images in dog directory", dog_dir),
    ("Total images in train/cat directory", train_cats_dir),
    ("Total images in train/dog directory", train_dogs_dir),
    ("Total images in validation/cat directory", validation_cats_dir),
    ("Total images in validation/dog directory", validation_dogs_dir),
    ("Total images in test directory", test_dir),
]
for label, directory in directory_labels:
    print(label, len(os.listdir(directory)))


***Check a sample of the file names in the destination folders***

In [None]:
# Collect the file names of the training cats, training dogs and test images
train_cat_fnames, train_dog_fnames, test_names = (
    os.listdir(directory)
    for directory in (train_cats_dir, train_dogs_dir, test_dir)
)

# Show the first ten names of each training class
for class_fnames in (train_cat_fnames, train_dog_fnames):
    print(class_fnames[:10])

**Setup some code to view the pictures using matplotlib**

In [None]:
%matplotlib inline

import matplotlib.image as mpimg
import matplotlib.pyplot as plt

# Grid layout for the preview figure: images are shown in a 4x4 arrangement
nrows, ncols = 4, 4

# Running index used to step through the image file lists
pic_index = 0

In [None]:
# Take the current figure and size it so the grid has 4 inches per row/column
fig = plt.gcf()
fig.set_size_inches(ncols*4, nrows*4)

# Advance by 8 so each run of this cell shows the next 8 images of each set
pic_index += 8
window = slice(pic_index - 8, pic_index)

next_cat_pix = [os.path.join(train_cats_dir, fname) for fname in train_cat_fnames[window]]
next_test_pix = [os.path.join(test_dir, fname) for fname in test_names[window]]

# Subplot positions are 1-based, hence start=1
for position, img_path in enumerate(next_cat_pix + next_test_pix, start=1):
    sp = plt.subplot(nrows, ncols, position)
    sp.axis('Off')  # hide axes and gridlines
    plt.imshow(mpimg.imread(img_path))

plt.show()

**Modelling Starts**

Simple VGG3 baseline models give about 75-80% accuracy. After some iterations, I settled on a VGG5 model with dropout.

In [None]:
# Convolutional classifier: four Conv -> MaxPool -> BatchNorm stages followed
# by a dense head.
model = tf.keras.models.Sequential()

for stage, n_filters in enumerate((32, 64, 128, 128)):
    # Only the first layer declares the input: 150x150 images with 3 channels
    first_layer_kwargs = {'input_shape': (150, 150, 3)} if stage == 0 else {}
    model.add(tf.keras.layers.Conv2D(n_filters, (3,3), activation='relu', **first_layer_kwargs))
    model.add(tf.keras.layers.MaxPooling2D(2,2))
    model.add(tf.keras.layers.BatchNormalization())

# Flatten the feature maps so they can feed the fully connected layers
model.add(tf.keras.layers.Flatten())
# 512-unit hidden layer, regularised with dropout
model.add(tf.keras.layers.Dense(512, activation='relu'))
model.add(tf.keras.layers.Dropout(0.3))

# Single sigmoid output in the 0-1 range: 0 for one class ('cats'),
# 1 for the other ('dogs')
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

In [None]:
from tensorflow.keras.optimizers import RMSprop

# Binary cross-entropy matches the single sigmoid output of the model
optimizer = RMSprop(learning_rate=0.0001)
model.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# Training images are rescaled to [0, 1] and randomly augmented
augmentation_options = dict(
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation_options)

# Settings shared by both generators: images are resized to 150x150, served
# in batches of 30, with binary labels to go with the binary_crossentropy loss
flow_options = dict(
    target_size=(150, 150),
    batch_size=30,
    class_mode='binary',
)

# train_dir is the source directory for the training images
train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)

# Validation images are only rescaled, never augmented
validation_datagen = ImageDataGenerator(rescale=1/255)

# validation_dir is the source directory for the validation images
validation_generator = validation_datagen.flow_from_directory(validation_dir, **flow_options)

In [None]:
# Model.fit accepts generators directly; fit_generator is deprecated and has
# been removed from recent TensorFlow/Keras releases.
history = model.fit(train_generator,
                    epochs=30,
                    verbose=1,
                    validation_data=validation_generator)


In [None]:
def plot_acc_loss(training_history=None):
    """Plot training vs. validation accuracy and loss per epoch.

    Draws two figures, one for accuracy and one for loss, each with a
    labelled curve for the training and validation series.

    Args:
        training_history: object whose ``.history`` dict holds the
            'accuracy', 'val_accuracy', 'loss' and 'val_loss' series.
            Defaults to the notebook-level ``history`` variable.
    """
    if training_history is None:
        training_history = history
    metrics = training_history.history

    # One x position per recorded epoch
    epochs = range(len(metrics['accuracy']))

    panels = (
        ('accuracy', 'val_accuracy', 'Training and validation accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Training and validation loss', 'Loss'),
    )
    for train_key, val_key, title, ylabel in panels:
        # A separate figure per metric, with a legend so the two curves can
        # be told apart
        fig, ax = plt.subplots()
        ax.plot(epochs, metrics[train_key], label='Training')
        ax.plot(epochs, metrics[val_key], label='Validation')
        ax.set(title=title, xlabel='Epoch', ylabel=ylabel)
        ax.legend()

In [None]:
# Draw the accuracy and loss curves recorded during training
plot_acc_loss()

In [None]:
# One row per entry in the extracted test folder
test_files = os.listdir("/kaggle/working/test1")
test_df = pd.DataFrame(data={'filename': test_files})

# Number of test rows; used later to work out how many prediction batches to run
samples = len(test_df.index)

In [None]:
# Test images are only rescaled. No labels are produced (class_mode=None) and
# shuffling is off so predictions stay aligned with the rows of test_df.
test_data = ImageDataGenerator(rescale=1./255)
test_generator = test_data.flow_from_dataframe(
    dataframe=test_df,
    directory="./test1/",
    x_col='filename',
    y_col=None,
    target_size=[150,150],
    batch_size=30,
    class_mode=None,
    shuffle=False,
)




In [None]:
# Model.predict accepts generators directly; predict_generator is deprecated
# and has been removed from recent TensorFlow/Keras releases.
# steps = number of batches of 30 needed to cover every test sample.
predict = model.predict(test_generator, steps=int(np.ceil(samples / 30)))

In [None]:
# The model ends in a single sigmoid unit, so each prediction is one
# probability. argmax over that single column is always 0 (every image would
# be labelled as a cat); threshold the probability at 0.5 instead.
# Resulting labels: 0 = cat, 1 = dog.
test_df['category'] = (predict.ravel() > 0.5).astype(int)

In [None]:
# Build the submission table: 'id' is the file name up to its first '.',
# 'label' is the predicted category; the helper columns are dropped.
submission_df = (
    test_df
    .assign(
        id=test_df['filename'].str.split('.').str[0],
        label=test_df['category'],
    )
    .drop(['filename', 'category'], axis=1)
)
submission_df.to_csv('submission.csv', index=False)