# Introduction
This kernel explores image classification with a convolutional neural network (CNN), using Keras as the programming interface on top of the TensorFlow backend. The Dogs vs. Cats dataset is used in this study; data augmentation is applied to both a custom CNN and VGG16 feature extraction with fine-tuning.

## 1.0 Data exploration
Load and explore data

The dataset for this kernel will be the Dogs vs. Cats dataset.

In [None]:
#load libraries for data manipulation and visualization
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import os
import random
from tensorflow.keras.preprocessing.image import load_img
# warnings
import string
import warnings
warnings.filterwarnings('ignore')

Create data directories

In [None]:
# extract the competition archives quietly (-q); no target directory is given,
# so they unpack into the current working directory (presumably /kaggle/working)
!unzip -q /kaggle/input/dogs-vs-cats/train.zip
!unzip -q /kaggle/input/dogs-vs-cats/test1.zip
# directories the rest of the notebook reads the train / test images from
TRAIN_DIR = "/kaggle/working/train/"
TEST_DIR = "/kaggle/working/test1/"

In [None]:
# gather train data into a dataframe
# the label is the filename prefix before the first '.';
# every file whose prefix is not 'dog' is labelled 'cat'
filenames = os.listdir(TRAIN_DIR)
categories = [
    'dog' if name.split('.')[0] == 'dog' else 'cat'
    for name in filenames
]
all_df = pd.DataFrame({'filename': filenames, 'category': categories})

# gather test data into a dataframe (file names only, no labels)
test_filenames = os.listdir(TEST_DIR)
test_df = pd.DataFrame({'id': test_filenames})

In [None]:
# display train data: five randomly sampled rows of the labelled dataframe
all_df.sample(5)

In [None]:
# show counts for train data: number of files per category ('dog' / 'cat')
all_df['category'].value_counts()

In [None]:
# display test data: five randomly sampled rows (the 'id' column holds file names)
test_df.sample(5)

In [None]:
# show sample size of test data (number of rows in test_df)
test_df.shape[0]

#### Visualizing train data

In [None]:
# display sample train images
# show the first nine training images in a 3x3 grid, captioned with their file names
sample = all_df.head(9)
plt.figure(figsize=(12, 12))
# enumerate supplies the subplot slot, so the grid does not depend on the
# dataframe's index labels being exactly 0..8
for position, filename in enumerate(sample['filename'], start=1):
    img = load_img(os.path.join(TRAIN_DIR, filename), target_size=(96, 96))
    plt.subplot(3, 3, position)
    plt.imshow(img)
    plt.xlabel(filename)
plt.tight_layout()
plt.show()

## 2.0  Data Preparation
Split the labelled dataset into training and validation sets; the test set is provided separately.

In [None]:
from sklearn.model_selection import train_test_split

# split into train/validate: hold out 20% of the labelled images, with a fixed
# seed so the split is reproducible, and give both parts a fresh 0..n-1 index
train_df, validate_df = (
    part.reset_index(drop=True)
    for part in train_test_split(all_df, test_size=0.20, random_state=0)
)

The train set has 20,000 samples while the validate set has 5,000 samples

In [None]:
# show the count by category for train set (checks the class balance after the split)
train_df['category'].value_counts()

In [None]:
# show the count by category for validate set (checks the class balance after the split)
validate_df['category'].value_counts()

## 3.0 Data preprocessing
Data augmentation and transformation of  jpeg image files on disk to floating point tensors

Data augmentation increases the effective sample size by slightly altering the given samples to generate more. For a CNN, the larger the training sample the better: a large sample lets the network learn features directly from the data without the need for manual feature engineering. When the sample size is small, the performance of a CNN can be improved by data augmentation.

The data exists on disk as JPEG image files; the files need to be decoded into RGB grids of pixels and converted to floating point tensors. The pixel values lie in the range [0, 255] and will be rescaled to the [0, 1] interval for faster processing. The Keras class ImageDataGenerator, which can automatically turn image files on disk into preprocessed tensors, will be used for the transformation.

In [None]:
# define train data augmentation configuration
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# pixel rescaling to [0, 1] plus the random perturbations applied to training images
augmentation_options = dict(
    rescale=1./255,
    rotation_range=30,
    width_shift_range=0.15,
    height_shift_range=0.15,
    shear_range=0.15,
    zoom_range=0.15,
    horizontal_flip=True,
    fill_mode='nearest',
)
train_datagen = ImageDataGenerator(**augmentation_options)

Generating sample images to illustrate augmentation

In [None]:
# using ImageDataGenerator to generate sample images
# pick one random training image and wrap it in an augmenting generator
sample_df = train_df.sample(n=1).reset_index(drop=True)
sample_generator = train_datagen.flow_from_dataframe(
    dataframe=sample_df,
    directory=TRAIN_DIR,
    x_col='filename',
    y_col='category',
    class_mode='categorical',
    target_size=(128, 128),
)

In [None]:
# draw four augmented variants of the sampled image in a 2x2 grid
plt.figure(figsize=(8, 8))
for slot in range(1, 5):
    plt.subplot(2, 2, slot)
    # take exactly one batch from the generator and show its first image
    batch_images, _ = next(iter(sample_generator))
    plt.imshow(batch_images[0])
plt.tight_layout()
plt.show()

Reading train, validation and test data from disk and converting to floating point tensors using ImageDataGenerator.

In [None]:
# reading train data
# augmented 128x128 batches of 75 images with binary labels from the 'category' column
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    directory=TRAIN_DIR,
    x_col='filename',
    y_col='category',
    class_mode='binary',
    batch_size=75,
    target_size=(128, 128),
)

In [None]:
# reading validation data
# validation/test images are only rescaled to [0, 1]; no augmentation is applied
test_datagen = ImageDataGenerator(rescale=1.0 / 255)
validation_generator = test_datagen.flow_from_dataframe(
    dataframe=validate_df,
    directory=TRAIN_DIR,
    x_col='filename',
    y_col='category',
    class_mode='binary',
    batch_size=50,
    target_size=(128, 128),
)

In [None]:
# reading test data
# shuffle=False keeps the batches in test_df row order, so the predictions can
# later be written back to test_df positionally (flow_from_dataframe shuffles
# by default, which would pair each prediction with the wrong file)
test_generator = test_datagen.flow_from_dataframe(
        dataframe=test_df,
        directory=TEST_DIR,
        x_col='id',
        y_col=None,
        class_mode=None,
        target_size=(128, 128),
        # 50 images per batch; the previous 12500//50 expression evaluated to
        # 250 images per batch, i.e. a step count mistaken for a batch size
        batch_size=50,
        shuffle=False)

### 4.0 Pretrained CNN
A pretrained CNN can be used to extract features from a small sample; if it was trained on related data, this improves performance. The pretrained CNN for this study is VGG16, trained on the ImageNet dataset, which has many classes including different breeds of cats and dogs. There are 2 steps to using a pretrained CNN: feature extraction and fine tuning.

In [None]:
# using the pretrained convolutional base
from tensorflow.keras.applications import VGG16
conv_base = VGG16(weights='imagenet',
include_top=False,
input_shape=(128, 128, 3))
conv_base.summary()

Feature extraction will be performed by adding dense layers on top of the conv-base and running it end to end on the input data. The feature map from the conv-base that will be passed to the densely connected classifier has a shape of (4, 4, 512).

In [None]:
from tensorflow.keras import layers
from tensorflow.keras import models

# conv base followed by a small dense classifier ending in one sigmoid unit
model = models.Sequential([
    conv_base,
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])
model.summary()

Feature extraction consists of freezing the conv-base before compilation. Freezing prevents the weights of a layer from being updated during training, so only the weights of the classifier are updated. Fine tuning consists of unfreezing a few of the top layers of the conv-base that were frozen during feature extraction and training them jointly with the newly added fully connected layers.


In [None]:
# freezing all layers up to a specific one
# layers before 'block5_conv1' are frozen; that layer and every later one stay trainable
conv_base.trainable = True
set_trainable = False
for layer in conv_base.layers:
    # the flag flips to True at 'block5_conv1' and stays True afterwards
    set_trainable = set_trainable or layer.name == 'block5_conv1'
    layer.trainable = set_trainable


In [None]:
# optimizing model performance
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
callbacks = [
    EarlyStopping(patience=5, verbose=1),
    ReduceLROnPlateau(factor=0.1, patience=2, min_lr=0.000001, verbose=1),
    ModelCheckpoint('model3.h5', verbose=1, save_best_only=True, save_weights_only=True)
]

In [None]:
# configuring the model for training
import  tensorflow.keras.optimizers as optimizers
model.compile(optimizer=optimizers.RMSprop(lr=1e-5),
loss='binary_crossentropy',
metrics=['acc'])


In [None]:
# fitting the model 
history = model.fit_generator(
        train_generator,
        steps_per_epoch=200,
        epochs=30,
        validation_data=validation_generator,
        validation_steps=100,
        callbacks=callbacks)

In [None]:
# plotting the results
import matplotlib.pyplot as plt
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(1, len(acc) + 1)
plt.plot(epochs, acc, 'r', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'r', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()

#### Evaluating Pretrained CNN

In [None]:
# making prediction
# one step per batch visits every test image exactly once; len() of the
# generator is its number of batches, so this stays correct for any batch size.
# Model.predict accepts generators directly (predict_generator is deprecated).
predictions2 = model.predict(test_generator, steps=len(test_generator))

In [None]:
# converting predictions to 1 and 0
predictions2 = [1 if y > 0.5 else 0 for y in predictions2]

test_df['label'] = predictions2

# restore back to class names (dog or cat)
label_map = dict((v,k) for k,v in train_generator.class_indices.items())
test_df['label'] = test_df['label'].replace(label_map)

# encoding according to submission format, dog = 1, cat = 0
test_df['label'] = test_df['label'].replace({ 'dog': 1, 'cat': 0 })

test_df.to_csv('submission2.csv', index=False)

In [None]:
# preview five random rows of the submission data; the dataframe written to
# submission2.csv is test_df (submit_df was never defined)
test_df.sample(5)