# This notebook trains an image classifier on the Kaggle Dogs vs. Cats dataset.

## Importing the libraries

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 
import zipfile
import os 
import random
import cv2

## Extracting the files using zipfile module into designated folders

In [None]:
# Unpack the training archive into the local 'train' folder.
with zipfile.ZipFile('/kaggle/input/dogs-vs-cats/train.zip', mode='r') as train_archive:
    train_archive.extractall(path='train')

In [None]:
# Unpack the test archive into the local 'test' folder.
with zipfile.ZipFile('/kaggle/input/dogs-vs-cats/test1.zip', mode='r') as test_archive:
    test_archive.extractall(path='test')

## Building a dataframe with the list of all training images and their labels 

In [None]:
# List every extracted training image and derive its label from the file name:
# any name containing 'dog' is labelled 'dog', everything else 'cat'.
filenames = os.listdir('train/train')
category = ['dog' if 'dog' in name else 'cat' for name in filenames]

# One row per image: file name plus its string label.
df = pd.DataFrame({'filename': filenames, 'category': category})

In [None]:
# Preview the first rows of the filename/category table.
df.head()

## Opening a random image

In [None]:
# Draw one training file name at random and report which one was picked.
any_file = random.choice(df['filename'])
print(any_file)

In [None]:
# Load the chosen image and convert it from BGR to RGB before handing it to matplotlib.
image_path = 'train/train/' + str(any_file)
img = cv2.cvtColor(cv2.imread(image_path), cv2.COLOR_BGR2RGB)
plt.imshow(img)

## In order to build a good model, we first check the sizes of a few images; the images will be resized later

In [None]:
def checking_size(path,n=5):
    """Print the array shape of ``n`` randomly chosen images found in ``path``.

    Files are drawn with replacement, so the same file may be reported more
    than once. A file that OpenCV cannot decode is reported and skipped
    instead of failing on ``None.shape``.

    Parameters
    ----------
    path : str
        Directory whose entries are treated as image files.
    n : int, default 5
        Number of random draws.

    Returns
    -------
    None
        The shapes are printed, one per line.
    """
    filenames=os.listdir(path)
    if not filenames:
        # random.choice raises IndexError on an empty sequence, so bail out early.
        print('No files found in '+str(path))
        return
    for _ in range(n):
        name=random.choice(filenames)
        img=cv2.imread(os.path.join(path,name))
        if img is None:
            # cv2.imread signals an unreadable file by returning None rather than raising.
            print(str(name)+': could not be read as an image')
            continue
        print(img.shape)


In [None]:
# Print the shapes of a random sample (default n=5) of training images.
checking_size('train/train')

In [None]:
# Print the shapes of a random sample (default n=5) of test images.
checking_size('test/test1')

## Since the image sizes vary widely, we will resize them to a standard size of 128x128x3.


In [None]:
# Input dimensions used for the network: height, width and number of colour channels.
imageHeight, imageWidth, channels = 128, 128, 3

## We are going to build a deep learning model using the TensorFlow Keras API

In [None]:
# Creating the model
import tensorflow as tf 
from tensorflow.keras.layers import Input,Conv2D,BatchNormalization,MaxPooling2D,Dense,Dropout,GlobalMaxPooling2D,Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint

In [None]:
def _conv_bn_pair(tensor,filters):
    """Apply two (3x3 same-padded ReLU Conv2D -> BatchNormalization) stages with ``filters`` channels."""
    for _ in range(2):
        tensor=Conv2D(filters,(3,3),activation='relu',padding='same')(tensor)
        tensor=BatchNormalization()(tensor)
    return tensor


# Three convolutional stages (32 -> 64 -> 128 filters); the first two are followed
# by 2x2 max pooling, the last by global max pooling.
i=Input((imageHeight,imageWidth,channels))
x=_conv_bn_pair(i,32)
x=MaxPooling2D((2,2))(x)

x=_conv_bn_pair(x,64)
x=MaxPooling2D((2,2))(x)

x=_conv_bn_pair(x,128)
# GlobalMaxPooling2D already yields a (batch, channels) tensor, so the Flatten
# layer that used to follow it was a no-op and has been dropped.
x=GlobalMaxPooling2D()(x)
x=Dropout(0.2)(x)
# NOTE(review): 2056 units kept as in the original; if 2048 was intended, change it deliberately.
x=Dense(2056,activation='relu')(x)
x=Dropout(0.2)(x)
# Two softmax outputs, one per class, to pair with categorical cross-entropy.
x=Dense(2,activation='softmax')(x)

model=Model(i,x)
model.summary()

## Splitting the training dataset into training and validation set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the labelled images for validation (the size train_test_split
# uses when none is given). A fixed random_state makes the split reproducible
# across kernel restarts, and stratifying on the label keeps the cat/dog ratio
# the same in both parts.
train_df,validation_df=train_test_split(df,test_size=0.25,random_state=42,stratify=df['category'])
print(train_df.shape)
print(validation_df.shape)

## Building the training, validation and test generators

In [None]:
# Training generator with augmentation: pixel values are multiplied by 1/255 and images
# are randomly rotated (up to 20 degrees), shifted (up to 10% each way) and flipped horizontally.
train_data_generator = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1/255,
    rotation_range=20,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
)

# Stream batches of 32 resized images and their labels from the training dataframe.
train_data = train_data_generator.flow_from_dataframe(
    dataframe=train_df,
    directory='train/train',
    x_col='filename',
    y_col='category',
    target_size=(imageHeight, imageWidth),
    batch_size=32,
)

In [None]:
# Validation images are only rescaled (multiplied by 1/255); no augmentation is applied.
validation_data_generator=tf.keras.preprocessing.image.ImageDataGenerator(rescale=1/255)

# Use the shared imageHeight/imageWidth constants instead of a hard-coded (128,128)
# so the validation input size cannot drift from the model's input size.
validation_data=validation_data_generator.flow_from_dataframe(validation_df,'train/train',x_col='filename',y_col='category',
                                                    target_size=(imageHeight,imageWidth),batch_size=32)

In [None]:
# Collect the unlabelled test image names into a single-column dataframe.
test_filenames = os.listdir('test/test1')
test_df = pd.DataFrame({'filename': test_filenames})

In [None]:
# Preview the first rows of the test file-name table.
test_df.head()

In [None]:
# Test images are only rescaled (multiplied by 1/255), like the validation images.
test_data_generator=tf.keras.preprocessing.image.ImageDataGenerator(rescale=1/255)
# shuffle=False is essential here: flow_from_dataframe shuffles by default, which would
# return predictions in a different order from the rows of test_df and attach labels to
# the wrong file names. class_mode=None yields images only (there are no labels).
test_data=test_data_generator.flow_from_dataframe(test_df,'test/test1',x_col='filename',y_col=None,class_mode=None,
                                                    target_size=(imageHeight,imageWidth),batch_size=32,shuffle=False)

## In order to avoid overfitting, we will use the EarlyStopping callback
## We then compile the model and train it for a maximum of 50 epochs

In [None]:
# Stop once val_loss has failed to improve for 3 consecutive epochs and roll the model
# back to the weights of its best epoch (otherwise it keeps the last, worse, weights).
callbacks=[EarlyStopping(monitor='val_loss',patience=3,restore_best_weights=True)]
model.compile(loss='categorical_crossentropy',metrics=['accuracy'],optimizer='adam')
# The generators already produce batches of 32, so batch_size is not passed to fit
# (the Keras docs say not to specify it for generator input). steps_per_epoch is left
# to Keras, which takes it from the generator's length, so the final partial batch is
# no longer dropped by floor division.
r=model.fit(train_data,validation_data=validation_data,callbacks=callbacks,epochs=50)

## Plotting loss and accuracy

In [None]:
# Training vs. validation loss per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
for curve in ('loss', 'val_loss'):
    ax.plot(r.history[curve], label=curve)
ax.legend()
ax.set_ylabel('Loss')
ax.set_xlabel('Epochs')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch, drawn on an explicit Axes.
fig, ax = plt.subplots()
for curve in ('accuracy', 'val_accuracy'):
    ax.plot(r.history[curve], label=curve)
ax.legend()
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epochs')
plt.show()

# Predicting the results

In [None]:
# Run the trained model over the test generator; each row of y_hat holds the two softmax outputs for one image.
# NOTE(review): row order only matches test_df if the test generator does not shuffle — verify shuffle=False there.
y_hat=model.predict(test_data)

In [None]:
# Reduce each prediction row to the index of its highest-scoring class.
y_pred = y_hat.argmax(axis=1)

In [None]:
# Store the predicted class index next to each test file name.
# NOTE(review): values are argmax indices (0/1); presumably they follow train_data.class_indices — confirm the cat/dog mapping.
test_df['label']=y_pred

In [None]:
# Preview the test table again, now including the predicted 'label' column.
test_df.head()

# Visually checking the predicted results

In [None]:
def check_prediction(n=5):
    """Display the first ``n`` test images, each titled with its predicted label.

    Reads file names and labels from the module-level ``test_df`` (columns
    ``filename`` and ``label``) and loads the images from ``test/test1/``.

    Parameters
    ----------
    n : int, default 5
        Number of leading rows of ``test_df`` to show.

    Returns
    -------
    None
        One matplotlib figure is shown per readable image.
    """
    preview=test_df.head(n)
    # Iterate positionally so the function does not depend on test_df having a 0..n-1 index.
    for name,label in zip(preview['filename'],preview['label']):
        img=cv2.imread('test/test1/'+name)
        if img is None:
            # cv2.imread returns None for a missing or undecodable file.
            print(str(name)+': could not be read as an image')
            continue
        # Convert BGR to RGB so matplotlib shows true colours, as the random-image cell earlier does.
        img=cv2.cvtColor(img,cv2.COLOR_BGR2RGB)
        plt.imshow(img)
        plt.title(str(label))
        plt.show()

In [None]:
# Show the first few test images together with their predicted labels.
check_prediction()