In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import cv2
import zipfile
#import visualkeras

from sklearn.model_selection import train_test_split

Using the os library we can list the files contained in a directory; the input directory should contain 3 files, as can be seen below:

In [None]:
print(os.listdir("../input/dogs-vs-cats-redux-kernels-edition"))

test.zip and train.zip contain the images used in this project, so we have to extract them with the zipfile library. The extracted images will appear in Kaggle's output directory '/kaggle/working/', which is the path we will use to explore each image.

In [None]:
# Unpack both competition archives (test first, then train) into the
# current working directory.
for archive_path in ('../input/dogs-vs-cats-redux-kernels-edition/test.zip',
                     '../input/dogs-vs-cats-redux-kernels-edition/train.zip'):
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall('.')

Let's print the list of images contained in the train folder. As we know, it contains 25000 images, which is a huge number, so only 10 of them will be shown just to get an idea of how they are named:

In [None]:
os.listdir('/kaggle/working/train/')[:10]

The full list of images is huge, so it is better to print the length of each list to see how many images each folder contains:

In [None]:
len(os.listdir('/kaggle/working/train/')), len(os.listdir('/kaggle/working/test/'))

In [None]:
print('Sample of file in train directory: ',os.listdir('/kaggle/working/train/')[0])
print('Sample of file in test directory: ',os.listdir('/kaggle/working/test/')[0])

The images in the training dataset are labeled differently from what we are used to seeing: in this project the class of each image is contained in its file name, e.g. cat.11724.jpg means an image of a cat, index 11724, jpg format. For this reason we have to iterate through each image and assign its class according to its name. The code below shows how a file name splits into these parts:

In [None]:
# Take the first file name of the train folder and split it on '.' once;
# the pieces are <label>.<index>.<format>.
train_name_parts = os.listdir('/kaggle/working/train/')[0].split('.')
print('Label of image: ', train_name_parts[0])
print('Index of image: ', train_name_parts[1])
print('Format of image: ', train_name_parts[2])

In the following cell we apply this idea: if the label corresponds to a cat the class will be '0', whereas if it's a dog the class will be '1'. Each image is then read as a 3-channel colour image (OpenCV loads the channels in BGR order) and resized to 120 pixels in width and height (IMG_SIZE), and finally the image and its class are stored together as one instance inside a list.

In [None]:
IMG_SIZE = 120  # side length, in pixels, every image is resized to
Images_train = []  # one [image array, label array] pair per training image
Images_label = []
# os.listdir returns names in arbitrary order; sorting makes the loading
# order (and therefore any seeded shuffle/split that follows) deterministic.
for i in sorted(os.listdir('/kaggle/working/train/')):
    # File names look like "<class>.<index>.jpg"
    label = i.split('.')[0]
    if label == 'cat':
        label = 0
    elif label == 'dog':
        label = 1
    else:
        # Any other prefix would leave a string label mixed in with the 0/1 classes
        print('Skipping file with unknown label: ', i)
        continue
    # cv2.imread returns None (it does not raise) when a file cannot be decoded;
    # the channels of a decoded image are in BGR order.
    img = cv2.imread('/kaggle/working/train/'+i, cv2.IMREAD_COLOR)
    if img is None:
        print('Skipping unreadable image: ', i)
        continue
    img = cv2.resize(img,(IMG_SIZE,IMG_SIZE), interpolation=cv2.INTER_CUBIC)
    # img is already a numpy array, so no extra copy is needed
    Images_train.append([img, np.array(label)])

Let's see the first instance of our preprocessed training dataset. It should contain 2 objects: the first one is the colour image as a numpy array and the second one is the class 'label':

In [None]:
Images_train[0]

In [None]:
Images_train[0][0]

In [None]:
Images_train[0][1]

As the preprocessing was applied in the same order as the images were listed from the training folder, we have to take into account that they may be grouped by class (for example all 12500 cats followed by all 12500 dogs). Such an ordering can make our model perform poorly, so the instances will be shuffled.

In [None]:
import random

# Shuffle with a dedicated, seeded generator so the resulting order is
# repeatable between runs and the global `random` state is left untouched.
random.Random(42).shuffle(Images_train)

Once we have finished the preprocessing step we have to stack our images into a 4-dimensional array with shape (number of images, height, width, color channels), while our labels stay the same:

In [None]:
# Separate the [image, label] pairs into two parallel lists, then turn them
# into arrays: images as (n, IMG_SIZE, IMG_SIZE, 3), labels as a flat array.
image_list = [image for image, _ in Images_train]
label_list = [label for _, label in Images_train]
Images = np.array(image_list).reshape((-1, IMG_SIZE, IMG_SIZE, 3))
Label = np.array(label_list)

In [None]:
Images.shape, Label.shape

In [None]:
pd.DataFrame(Label).value_counts()

Let's select 10 images randomly from our training dataset and show with their corresponding label:

In [None]:
plt.figure(figsize=(15,15))
for k in range(10):
    plt.subplot(2, 5, k+1)
    # randrange excludes the upper bound, so the index is always valid
    # (randint(0, 25000) is inclusive and could pick the non-existent index 25000).
    img = random.randrange(len(Images))
    # The images were loaded with OpenCV (BGR); reverse the channel axis so
    # matplotlib, which expects RGB, shows the true colours.
    plt.imshow(Images[img][..., ::-1])
    plt.title('DOG' if Label[img]==1 else 'CAT')

We can see the images look fine; the next step is splitting them into training and validation sets to be used in the modeling process:

In [None]:
# Hold out 10% for validation. A fixed random_state makes the split repeatable
# and stratify keeps the cat/dog proportion identical in both subsets.
X_train, X_val, Y_train, Y_val = train_test_split(Images, Label, test_size = 0.1,
                                                  random_state = 42, stratify = Label)

In [None]:
X_train.shape, Y_train.shape, X_val.shape, Y_val.shape

In [None]:
#X_train=X_train/255.0
#X_val=X_val/255.0

# Modeling:

The following models will be built and compared using their corresponding error measurements:

- Convolutional Neural Network from scratch.
- Pre-trained ResNet50.
- Pre-trained VGG16.
- Pre-trained EfficientNetB0.

Let's import some libraries that are useful for building the first network from scratch:

In [None]:
import tensorflow as tf
from sklearn.metrics import confusion_matrix
import itertools

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.optimizers import RMSprop,Adam,SGD,Adadelta
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In order to create variations of our images we can use ImageDataGenerator for data augmentation. As these are images of animals, which can appear in many different positions, we can play with several of its arguments below:

In [None]:
# Augmentation settings, gathered in one place so they are easy to tweak.
augmentation_options = dict(
    # normalisation / whitening: all switched off
    featurewise_center=False,
    samplewise_center=False,
    featurewise_std_normalization=False,
    samplewise_std_normalization=False,
    zca_whitening=False,
    # geometric transformations
    rotation_range=10,
    zoom_range=0.1,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
    vertical_flip=False,
)
datagen = ImageDataGenerator(**augmentation_options)

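Let's create some constraints or 'callbacks' which can help us during training: one halves the learning rate when the validation accuracy stops improving (ReduceLROnPlateau), one stops the training when the validation accuracy stops improving (EarlyStopping), and a custom one stops the training once the validation accuracy has exceeded a high threshold (Callback):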

In [None]:
class myCallback(tf.keras.callbacks.Callback):
  """Stop training as soon as the validation accuracy exceeds 98%."""

  def on_epoch_end(self, epoch, logs=None):
    """Check the 'val_accuracy' entry of `logs` at the end of every epoch.

    Args:
      epoch: index of the epoch that just finished (unused).
      logs: metrics dict for the epoch; may be None or lack 'val_accuracy'.
    """
    # Guard against a missing metric: comparing None with a float raises TypeError
    val_accuracy = (logs or {}).get('val_accuracy')
    if val_accuracy is not None and val_accuracy > 0.98:
      print("\nReached 98% accuracy so cancelling training!")
      self.model.stop_training = True

callbacks = myCallback()

# Import the callbacks from tensorflow.keras, the same package the models in
# this notebook are built with, instead of mixing in the standalone keras package.
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Halve the learning rate after 1 epoch without improvement in validation
# accuracy, never going below 1e-6.
lr_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                 patience=1,
                                 verbose=1,
                                 factor=0.5,
                                 min_lr=0.000001)

# Stop training when validation accuracy has not improved by at least 0.005
# for 3 consecutive epochs.
early_stopping = EarlyStopping(monitor='val_accuracy',
                               min_delta=0.005,
                               patience=3,
                               verbose=1,
                               mode='auto')

The following network was evaluated with four optimizers (Adam, SGD, RMSProp and Adadelta) and we got the highest train/validation accuracy using Adam, as can be seen below:

In [None]:
optimizer = Adam(learning_rate=0.001,beta_1=0.9,beta_2=0.999)

In [None]:
# CNN built from scratch: three Conv -> MaxPool -> BatchNorm stages followed by
# a dense classifier ending in a single sigmoid unit.
model = Sequential([
    Conv2D(64, (3,3), strides=1, padding='Same', activation='relu',
           input_shape=(X_train.shape[1], X_train.shape[2], 3)),
    MaxPool2D(2,2),
    BatchNormalization(),
    Conv2D(128, (3,3), strides=1, padding='Same', activation='relu'),
    MaxPool2D(2,2),
    BatchNormalization(),
    Conv2D(128, (3,3), strides=1, padding='Same', activation='relu'),
    MaxPool2D(2,2),
    BatchNormalization(),
    Flatten(),
    Dense(1024, activation="relu"),
    Dropout(0.2),
    Dense(1, activation="sigmoid"),
])

# Alternative optimizers that were tried:
#optimizer = SGD(learning_rate=0.01)
#optimizer = RMSprop(learning_rate=0.001,rho=0.9,momentum=0.0,epsilon=1e-07)
#optimizer = Adadelta(learning_rate=0.001,rho=0.95, epsilon=1e-07)
model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"])

In [None]:
# NOTE(review): per the Keras docs, ImageDataGenerator.fit is only required when
# featurewise_center, featurewise_std_normalization or zca_whitening is enabled;
# all three are False on `datagen`, so this call looks unnecessary — confirm.
datagen.fit(X_train)

In [None]:
# Model.fit accepts generators directly; fit_generator is deprecated and is
# no longer available in recent TensorFlow/Keras releases.
history = model.fit(datagen.flow(X_train, Y_train, batch_size=32),
                    validation_data=(X_val,Y_val), epochs=20, verbose=1,
                    callbacks=[callbacks, lr_reduction])

In [None]:
pd.DataFrame(history.history)

In [None]:
def metrics_plot(history):
  """Plot training vs. validation accuracy and loss, one figure per metric.

  Args:
    history: object with a ``history`` dict holding the per-epoch lists
      'accuracy', 'val_accuracy', 'loss' and 'val_loss' (as returned by
      the training calls in this notebook).
  """
  acc = history.history['accuracy']
  val_acc = history.history['val_accuracy']
  loss = history.history['loss']
  val_loss = history.history['val_loss']

  epochs = range(len(acc))

  # Give each metric its own explicit figure so nothing is drawn onto a
  # figure left open by an earlier cell.
  _, ax_acc = plt.subplots()
  ax_acc.plot(epochs, acc, 'r', label='Training accuracy')
  ax_acc.plot(epochs, val_acc, 'b', label='Validation accuracy')
  ax_acc.set_title('Training and validation accuracy')
  ax_acc.set_xlabel('Epoch')
  ax_acc.set_ylabel('Accuracy')
  ax_acc.legend()

  _, ax_loss = plt.subplots()
  ax_loss.plot(epochs, loss, 'r', label='Training Loss')
  ax_loss.plot(epochs, val_loss, 'b', label='Validation Loss')
  ax_loss.set_title('Training and validation loss')
  ax_loss.set_xlabel('Epoch')
  ax_loss.set_ylabel('Loss')
  ax_loss.legend()

  plt.show()

In [None]:
metrics_plot(history)

In [None]:
# Use the tensorflow.keras loader so it matches the package the models are
# built and saved with (the standalone keras package is a different install).
from tensorflow.keras.models import load_model

# Persist the trained scratch CNN in HDF5 format in the working directory
model.save('CNN_model.h5')

## ResNet50:

Every pre-trained network will make use of the weights belonging to imagenet.

In [None]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input

In [None]:
from tensorflow.keras import Model

In [None]:
model_RN=Sequential()
# ImageNet-pretrained ResNet50 without its classification head; global max
# pooling turns the last feature map into a flat vector.
# The input size comes from IMG_SIZE so it always matches the resized images.
model_RN.add(ResNet50(input_shape=(IMG_SIZE,IMG_SIZE,3),
            include_top=False,
            weights='imagenet',
            pooling='max'))

In [None]:
model_RN.summary()

In [None]:
model_RN.layers[0].trainable=False
model_RN.summary()

In [None]:
# Classification head placed on top of the ResNet50 backbone:
# 512-unit ReLU layer, 20% dropout, single sigmoid output.
for head_layer in (Dense(512, activation='relu'),
                   Dropout(0.2),
                   Dense(1, activation='sigmoid')):
    model_RN.add(head_layer)

In [None]:
model_RN.summary()

In [None]:
model_RN.layers

In [None]:
# Give this model its own Adam instance: re-using the optimizer object that
# already trained another model would carry over its (possibly reduced)
# learning rate and internal state, and newer Keras versions reject it.
model_RN.compile(optimizer=Adam(learning_rate=0.001,beta_1=0.9,beta_2=0.999),
                 loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Model.fit accepts generators directly; fit_generator is deprecated and is
# no longer available in recent TensorFlow/Keras releases.
history2 = model_RN.fit(datagen.flow(X_train, Y_train, batch_size=32),
                        validation_data=(X_val,Y_val), epochs=20, verbose=1,
                        callbacks=[callbacks, lr_reduction, early_stopping])

In [None]:
pd.DataFrame(history2.history)

In [None]:
metrics_plot(history2)

In [None]:
model_RN.save('ResNet_model.h5')

## VGG16:

In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input

In [None]:
model_VGG=Sequential()
# ImageNet-pretrained VGG16 without its classification head; global max
# pooling turns the last feature map into a flat vector.
# The input size comes from IMG_SIZE so it always matches the resized images.
model_VGG.add(VGG16(input_shape=(IMG_SIZE,IMG_SIZE,3),
                    include_top=False,
                    pooling='max',
                    weights='imagenet'))

In [None]:
model_VGG.summary()

In [None]:
model_VGG.layers[0].trainable=False
model_VGG.summary()

In [None]:
# Classification head placed on top of the VGG16 backbone:
# 512-unit ReLU layer, 20% dropout, single sigmoid output.
for head_layer in (Dense(512, activation='relu'),
                   Dropout(0.2),
                   Dense(1, activation='sigmoid')):
    model_VGG.add(head_layer)

In [None]:
model_VGG.summary()

In [None]:
model_VGG.layers

In [None]:
# Give this model its own Adam instance: re-using the optimizer object that
# already trained another model would carry over its (possibly reduced)
# learning rate and internal state, and newer Keras versions reject it.
model_VGG.compile(optimizer=Adam(learning_rate=0.001,beta_1=0.9,beta_2=0.999),
                  loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Model.fit accepts generators directly; fit_generator is deprecated and is
# no longer available in recent TensorFlow/Keras releases.
history3 = model_VGG.fit(datagen.flow(X_train, Y_train, batch_size=32),
                         validation_data=(X_val,Y_val), epochs=20, verbose=1,
                         callbacks=[callbacks, lr_reduction, early_stopping])

In [None]:
pd.DataFrame(history3.history)

In [None]:
metrics_plot(history3)

In [None]:
model_VGG.save('VGG_model.h5')

## EfficientNetB0:

It is important to mention that InceptionV3, Xception, ResNet152V2 and DenseNet201 didn't work well: they reached no more than 0.75 in validation accuracy. I then tried EfficientNetB0 and finally reached a satisfactory performance, as you will see below:

In [None]:
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input

In [None]:
model_EF=Sequential()
# ImageNet-pretrained EfficientNetB0 without its classification head; global
# max pooling turns the last feature map into a flat vector.
# The input size comes from IMG_SIZE so it always matches the resized images.
model_EF.add(EfficientNetB0(input_shape=(IMG_SIZE,IMG_SIZE,3),
                            include_top=False,
                            pooling='max',
                            weights='imagenet'))

In [None]:
model_EF.summary()

In [None]:
model_EF.layers[0].trainable=False
model_EF.summary()

In [None]:
# Classification head placed on top of the EfficientNetB0 backbone:
# 512-unit ReLU layer, 20% dropout, single sigmoid output.
for head_layer in (Dense(512, activation='relu'),
                   Dropout(0.2),
                   Dense(1, activation='sigmoid')):
    model_EF.add(head_layer)

In [None]:
model_EF.summary()

In [None]:
model_EF.layers

In [None]:
# Give this model its own Adam instance: re-using the optimizer object that
# already trained another model would carry over its (possibly reduced)
# learning rate and internal state, and newer Keras versions reject it.
model_EF.compile(optimizer=Adam(learning_rate=0.001,beta_1=0.9,beta_2=0.999),
                 loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Model.fit accepts generators directly; fit_generator is deprecated and is
# no longer available in recent TensorFlow/Keras releases.
history4 = model_EF.fit(datagen.flow(X_train, Y_train, batch_size=32),
                        validation_data=(X_val,Y_val), epochs=10, verbose=1,
                        callbacks=[callbacks, lr_reduction, early_stopping])

In [None]:
pd.DataFrame(history4.history)

In [None]:
metrics_plot(history4)

In [None]:
model_EF.save('EF_model.h5')

Now I'm going to load the 4 models saved and compute their corresponding metrics which should match those from the last epoch for each one.

In [None]:
# Load with tensorflow.keras, the same package that built and saved the models,
# rather than the standalone keras package.
from tensorflow.keras.models import load_model

In [None]:
# Load the four saved networks in the order: scratch CNN, ResNet50, VGG16,
# EfficientNetB0.
model1, model2, model3, model4 = [
    load_model(saved_path)
    for saved_path in ('../input/models-saved/CNN_model.h5',
                       '../input/models-saved/ResNet_model.h5',
                       '../input/models-saved/VGG_model.h5',
                       '../input/models-saved/EF_model.h5')
]

In [None]:
train_loss_cnn, train_acc_cnn = model1.evaluate(X_train,  Y_train, verbose=2)
test_loss_cnn, test_acc_cnn = model1.evaluate(X_val,  Y_val, verbose=2)

In [None]:
train_loss_rn, train_acc_rn = model2.evaluate(X_train,  Y_train, verbose=2)
test_loss_rn, test_acc_rn = model2.evaluate(X_val,  Y_val, verbose=2)

In [None]:
train_loss_vgg, train_acc_vgg = model3.evaluate(X_train,  Y_train, verbose=2)
test_loss_vgg, test_acc_vgg = model3.evaluate(X_val,  Y_val, verbose=2)

In [None]:
train_loss_ef, train_acc_ef = model4.evaluate(X_train,  Y_train, verbose=2)
test_loss_ef, test_acc_ef = model4.evaluate(X_val,  Y_val, verbose=2)

Given the computed metrics for every model loaded I will summarize and show a table which can facilitate comparing them:

In [None]:
# One column per model; rows follow the order of `summary_index`.
summary_index = ['Train accuracy','Train loss','Val accuracy','Val loss']
data = {
    model_name: [train_acc, train_loss, val_acc, val_loss]
    for model_name, train_acc, train_loss, val_acc, val_loss in (
        ('Scratch model', train_acc_cnn, train_loss_cnn, test_acc_cnn, test_loss_cnn),
        ('ResNet50', train_acc_rn, train_loss_rn, test_acc_rn, test_loss_rn),
        ('VGG16', train_acc_vgg, train_loss_vgg, test_acc_vgg, test_loss_vgg),
        ('EfficientNet', train_acc_ef, train_loss_ef, test_acc_ef, test_loss_ef),
    )
}

pd.DataFrame(data, index=summary_index)

In the table above we can see that the model built from scratch is the one with the lowest performance; although it is not too far from the others, the difference is significant. Among the pre-trained models there is again no big difference, but something important to take into account is that the metrics obtained when we trained these models were different from those in the table above. The reason for this change could be the stochastic nature convolutional networks have when evaluated on different batches; note also that the training metrics reported during fitting are computed on augmented batches with dropout active, whereas here the models are evaluated on the original images, and that the models are loaded from previously saved files. Having said that, I will continue with VGG16 because it has the best accuracy on the training and validation sets. Obviously you can choose whichever model you want, but this one performs outstandingly!

Let's see the summary of the model chosen:

In [None]:
model3.summary()

In [None]:
predicted_val_prob = model3.predict(X_val, batch_size=32)

Remember that as this project is binary classification the last layer had sigmoid activation function, therefore our predicted output will have values of probabilities between 0-1, where 1 means 'Dog' and 0 means 'Cat':

In [None]:
predicted_val_prob

Once we have these probabilities we have to convert them to discrete values, either 1 or 0; this can be achieved by rounding them at the 0.5 threshold, for example with the np.round function:

In [None]:
# Threshold at 0.5 with the same ">= 0.5 means DOG" rule used when the
# misclassified images are titled (np.round would send exactly 0.5 to 0), and
# flatten the (n, 1) model output into a 1-D vector of integer class labels.
Y_val_pred = (predicted_val_prob >= 0.5).astype(int).ravel()

In [None]:
Y_val_pred

Let's see the classification report for this model. We can see considerably high metrics; later we will see in more detail why the models didn't achieve a perfect performance.

In [None]:
from sklearn.metrics import classification_report

report = classification_report(Y_val, Y_val_pred)

print(report)

In [None]:
from sklearn.metrics import confusion_matrix

f,ax = plt.subplots(figsize=(15, 15))
confusion_mtx = confusion_matrix(Y_val, Y_val_pred)
sns.set(font_scale=1.4)
# fmt="d" prints the counts as plain integers (the default format switches to
# scientific notation for larger counts); class 0 is cat and class 1 is dog.
sns.heatmap(confusion_mtx, annot=True, fmt="d", linewidths=0.01, cmap="Greens",
            linecolor="gray", xticklabels=['Cat', 'Dog'], yticklabels=['Cat', 'Dog'], ax=ax)
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
ax.set_title("Confusion Matrix Validation set")
plt.show()

The model which had the best performance was VGG16, with 119 misclassifications out of 2500 instances; as I said before, this number can change due to the stochastic nature of these models. From now on I will use VGG16 to predict the classes of the instances stored in the testing folder, but first let's plot a sample of the misclassified images and see if there is a pattern or general reason behind this problem.

In [None]:
# Positions in the validation set where the predicted class differs from the
# true class.
l = [idx for idx in range(len(Y_val_pred)) if Y_val[idx] != Y_val_pred[idx]]

In [None]:
print('Number of misclassifications in validation dataset: ', len(l))

The list 'l' stores the indexes of the instances which were misclassified. Having said that, let's see the first 20 misclassifications and then show these images with their corresponding sigmoid output (probability):

In [None]:
l[:20]

In [None]:
plt.figure(figsize=(35,35))
# Show the first 20 misclassified validation images in a 4x5 grid
for c, i in enumerate(l[:20], start=1):
    plt.subplot(4,5, c)
    # Images were loaded with OpenCV (BGR); reverse the channels for matplotlib (RGB)
    plt.imshow(X_val[i][..., ::-1])
    # The model output for one image is a 1-element array; take the scalar so
    # the title shows "0.83" instead of "[0.83]".
    prob = float(np.ravel(predicted_val_prob[i])[0])
    predicted_name = 'DOG' if prob >= 0.5 else 'CAT'
    plt.title('{}:{:.3f}\nTrue label:{}'.format(predicted_name, prob, Y_val[i]))
    plt.axis('off')

Above we see only 20 misclassifications. What is more worrying is that some of these images were predicted with relatively high confidence (let's say probabilities of ~0.8 or ~0.2), which is bad if we want a high accuracy when predicting the classes of the testing images. One big reason could be that some of them belong to uncommon or less frequent breeds of dogs and cats, so only a few pictures of them are present. This problem could be solved by adding more images in order to balance the breeds in the training set. Another big problem is that some of the images do not contain only one animal or 'object': some contain humans close to the animals, some contain cats and dogs in the same image, some contain multiple images, etc. Without a doubt this sidetracks the prediction of every model, and for such images the method to use should be 'Object detection'. Some of the best-known models are the YOLO family, which for any image highlight each object it contains by drawing a bounding box together with the predicted class and its probability.

As I said, in order to correct such problems YOLO approaches should be used; this is a challenge for a future project, which I will be working on.

As a final step let's compute and show the error metrics (recall, precision, f1-score, accuracy and AUC) for the best model.

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import label_binarize

In [None]:
metrics = []
models = ['VGG16 model']
predictions=[Y_val_pred]
# Sigmoid outputs matching each entry of `predictions`; needed for the AUC
probabilities=[predicted_val_prob]

for lab, pred, prob in zip(models, predictions, probabilities):
    pred = np.ravel(pred)  # accept both (n,) and (n, 1) predictions
    precision, recall, fscore, _ = score(Y_val, pred, average='weighted')
    accuracy = accuracy_score(Y_val, pred)
    # ROC AUC ranks the continuous scores; computing it from the already
    # thresholded 0/1 labels would discard the ranking and understate it.
    auc = roc_auc_score(Y_val, np.ravel(prob))
    metrics.append(pd.Series({'precision':precision, 'recall':recall,
                              'fscore':fscore, 'accuracy':accuracy,
                              'auc':auc}, name=lab))

# One column per model, one row per metric
metrics = pd.concat(metrics, axis=1)

In [None]:
metrics

## Prediction of testing images:

Using the os.listdir function let's print the name of some images contained in the test folder: 

In [None]:
os.listdir('/kaggle/working/test/')[:20] 

As the names of the testing images do not include labels, let's print one sample to see how they are named:

In [None]:
# Take the first file name of the test folder and split it on '.' once;
# the pieces are <index>.<format>.
test_sample_name = os.listdir('/kaggle/working/test/')[0]
test_name_parts = test_sample_name.split('.')
print('Sample of image: ', test_sample_name)
print('Index of image: ', test_name_parts[0])
print('Format of image: ', test_name_parts[1])

The following loop will extract the index of each image and the image as a numpy array, and save both as an instance in the list Images_test:

In [None]:
Images_test = []  # one [image array, id array] pair per test image
# Sort the listing so the loading order is deterministic (os.listdir order is arbitrary)
for j in sorted(os.listdir('/kaggle/working/test/')):
    # Test files are named "<id>.jpg"
    index = j.split('.')[0]
    img = cv2.imread('/kaggle/working/test/'+j, cv2.IMREAD_COLOR)
    if img is None:
        # cv2.imread returns None instead of raising. Every test image needs a
        # prediction, so fail with a clear message rather than skip the file.
        raise ValueError('Could not read test image: ' + j)
    img = cv2.resize(img,(IMG_SIZE,IMG_SIZE), interpolation = cv2.INTER_CUBIC)
    # img is already a numpy array, so no extra copy is needed
    Images_test.append([img, np.array(index)])

The numpy arrays representing the images will be stored in a new variable 'X_test' from which will be predicted the class. 

In [None]:
# Separate the [image, id] pairs into two parallel lists, then turn them into
# arrays: images as (n, IMG_SIZE, IMG_SIZE, 3), ids as a flat array.
test_image_list = [image for image, _ in Images_test]
test_id_list = [image_id for _, image_id in Images_test]
X_test = np.array(test_image_list).reshape((-1, IMG_SIZE, IMG_SIZE, 3))
Index = np.array(test_id_list)

In [None]:
test_prediction = model3.predict(X_test, batch_size = 32)

In [None]:
# Submission table: the image ids form the index (named 'id') and the single
# 'label' column holds the predicted probabilities.
submission_index = pd.Series(Index, name='id')
submission = pd.DataFrame(test_prediction, index=submission_index, columns=['label'])
submission.head()

In [None]:
submission.to_csv('submission.csv')

I would like to know any feedback in order to increase the performance of the models or tell me if you found a different one even better!

If you liked this notebook I would really appreciate your upvote, especially if you want to see more projects/tutorials like this one. I encourage you to see my projects portfolio; I am sure you will love it.

Thank you!