<a href="https://www.kaggle.com/sdysch/cats-vs-dogs?scriptVersionId=88749240" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import PIL

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [None]:

import zipfile

# Name of the Kaggle dataset directory mounted under /kaggle/input.
data_set = "dogs-vs-cats"

# Archive file name -> directory it is unpacked into.
# The archives are unpacked under /kaggle/files/images only; the code that
# consumes the images reads them from there, so a second copy in the current
# working directory would just double the extraction time and disk usage.
archives = {
    "train.zip": "/kaggle/files/images/train",
    "test1.zip": "/kaggle/files/images/test",
}

for archive_name, destination in archives.items():
    with zipfile.ZipFile("/kaggle/input/" + data_set + "/" + archive_name, "r") as z:
        # extractall creates `destination` (and parents) if it does not exist yet
        z.extractall(destination)

In [None]:
#print(os.listdir('/kaggle/files/images/test/test1'))
#print(os.listdir('/kaggle/files/images/train/train'))

In [None]:
# create a train/test df, containing the image filepaths and training label
import os
 
def list_full_paths(directory):
    """Return the full path of every entry in *directory*, sorted by name.

    ``os.listdir`` yields entries in arbitrary, filesystem-dependent order.
    Sorting makes the result deterministic, so anything that later shuffles
    or splits the list with a fixed seed gives the same outcome on every run.

    Args:
        directory: Path of the directory to list.

    Returns:
        A list of ``os.path.join(directory, entry)`` strings, one per entry.
    """
    return [os.path.join(directory, name) for name in sorted(os.listdir(directory))]

# Training frame: one row per image file, plus the label encoded in its name.
train = pd.DataFrame({'filepath': list_full_paths('/kaggle/files/images/train/train')})

# Derive the label from the file name alone (e.g. 'dog.1.jpg' / 'cat.1.jpg').
# Matching against the whole path would label every image 'dog' as soon as
# any parent directory happened to contain the substring 'dog'.
is_dog = [os.path.basename(path).startswith('dog') for path in train['filepath']]
train['truth_label'] = np.where(is_dog, 'dog', 'cat')

# Test frame: file paths only, there are no labels for these images.
test = pd.DataFrame({'filepath': list_full_paths('/kaggle/files/images/test/test1')})

In [None]:
# Show the first 20 rows of the training frame (file path and derived label).
train.head(20)

In [None]:
# Show the first rows of the test frame (file paths only, no label column).
test.head()

In [None]:
# examples of image visualisation: two dogs on the top row, two cats below
# Import the submodule explicitly: a bare `import PIL` does not load
# `PIL.Image` by itself, it only works if another library imported it first.
from PIL import Image

basepath = '/kaggle/files/images/train/train/'
preview_files = ['dog.1.jpg', 'dog.2.jpg', 'cat.1.jpg', 'cat.2.jpg']

fig, axs = plt.subplots(2, 2, figsize=(10, 10))
# axs.flat walks the grid row by row, matching the order of preview_files
for ax, filename in zip(axs.flat, preview_files):
    # the context manager closes the image file once it has been drawn
    with Image.open(basepath + filename) as img:
        _ = ax.imshow(img)
    _ = ax.axis('off')

In [None]:
# Hold out 20% of the labelled images for validation.
# The fixed random_state makes the shuffle repeatable for a given row order.
from sklearn.model_selection import train_test_split

train, validation = train_test_split(
    train,
    test_size=0.2,
    random_state=1,
    shuffle=True,
)

In [None]:
# Print the fraction of each class in the training split, then in the
# validation split (cat first, dog second for each).
for subset in (train, validation):
    for label in ('cat', 'dog'):
        print(np.sum(subset['truth_label'] == label) / len(subset))

# roughly even splitting, no need for reweighting/resampling

In [None]:
# data pre-processing
# Spatial size every image is resized to when it is loaded. It is set
# explicitly because flow_from_dataframe defaults to (256, 256), which does
# not match the (255, 255, 3) input shape the model is declared with.
image_size = (255, 255)

# Both generators only rescale: every pixel value is multiplied by 1/255.
train_gen = ImageDataGenerator(rescale=1. / 255)
val_gen = ImageDataGenerator(rescale=1. / 255)

# Options shared by the training and validation streams.
flow_options = dict(
    x_col='filepath',
    y_col='truth_label',
    class_mode='categorical',  # one-hot labels, one column per class
    target_size=image_size,
)

train_generator      = train_gen.flow_from_dataframe(dataframe=train, **flow_options)
validation_generator = val_gen.flow_from_dataframe(dataframe=validation, **flow_options)

# TODO data augmentation
* Random flips/rotations
* Random zooms
* Colour scaling/reversing?

In [None]:
# data augmentation
from tensorflow.keras.layers import RandomFlip, RandomRotation, RandomZoom

# Height, width and number of channels of the images the network takes.
input_shape = (255, 255, 3)

# Random horizontal flip, rotation and zoom, packaged as a sub-model so it can
# be used as the first stage of the classifier. The input shape is declared
# here, on the very first layer, and nowhere else.
data_augmentation = keras.Sequential([
    RandomFlip('horizontal', input_shape=input_shape),
    RandomRotation(0.1),
    RandomZoom(0.1),
])

# model definition
# Three conv + max-pool stages with 32, 64 and 128 filters, then a small dense
# head ending in a 2-way softmax (one output per class).
# The Conv2D layers take no input_shape argument: none of them is the first
# layer of the model, so they get their input shape from the layer before.
model = Sequential([
    data_augmentation,

    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(2, 2),

    Conv2D(filters=64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(2, 2),

    Conv2D(filters=128, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(2, 2),

    Flatten(),

    Dense(10, activation='relu'),
    Dense(2, activation='softmax'),
])

model.summary()

In [None]:
# fit data
# categorical_crossentropy matches the one-hot labels and 2-way softmax output
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# callbacks
# Stop once val_loss has not improved for 15 epochs and roll back to the best weights.
es = EarlyStopping(monitor='val_loss', patience=15, verbose=1, mode='min', restore_best_weights=True)
# Cut the learning rate by 10x after 10 epochs without val_loss improvement,
# i.e. before early stopping would trigger.
lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, mode='min', min_lr=0.00000001)

# Upper bound on the number of epochs; early stopping normally ends training sooner.
epochs = 100

# Train on the first GPU when one is visible, otherwise fall back to the CPU
# instead of requesting a device that does not exist.
device = '/device:GPU:0' if tf.config.list_physical_devices('GPU') else '/device:CPU:0'
with tf.device(device):
    history = model.fit(train_generator, validation_data=validation_generator, epochs=epochs, callbacks=[es, lr])

In [None]:
# Learning curves: loss on the left, accuracy on the right.
fig, axs = plt.subplots(1, 2, figsize = (10, 5))
for ax, metric in zip(axs, ('loss', 'accuracy')):
    # history.history[metric] is measured on the training data, so it is
    # labelled 'Train'; the 'val_' entries come from the validation data.
    _ = ax.plot(history.history[metric], label='Train')
    _ = ax.plot(history.history['val_' + metric], label='Validation')
    ax.set_xlabel('epochs')
    ax.set_ylabel(metric)
    _ = ax.legend()

In [None]:
# pred on validation, submit predictions to kaggle
#y_pred = np.argmax(model.predict(X_test), axis=1)
#from sklearn.metrics import ConfusionMatrixDisplay
#_ = ConfusionMatrixDisplay.from_predictions(test.class_protein_localization.values, y_pred, normalize='true')