<h1> Dogs vs Cats Competition </h1>
Creating a model to distinguish dogs from cats in images using Keras and Tensorflow

<a id="toc"></a>
<h2> Table of Contents </h2>
<div class = "alert alert-block alert-info">
    <ol>
        <li><a href="#import"> Importing libraries </a></li>
        <li><a href="#download"> Downloading data </a></li>        
        <li><a href="#analyse"> Analyse the data </a></li>        
        <li><a href="#model"> Modeling </a></li>
        <li><a href="#reference"> Reference </a></li>        
    </ol>
</div>

<a id="import"></a>
<h2> Importing libraries </h2>

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
%matplotlib inline

import os
import zipfile
from tqdm import tqdm

#from PIL import Image

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

<a href="#toc"> Back to top </a>
<hr>

<a id="download"></a>
<h2> Downloading data </h2>

We have the option of using either `os.walk()` or `os.listdir()` to get all the files or images. In this project, we will use `os.listdir()`.

In [None]:
#help(os.walk)
# help(os.listdir)

In [None]:
# Walk down from the parent folder to the competition folder, printing the
# contents of each level so we can see where the data lives.
for folder in ('../', '../input', '../input/dogs-vs-cats'):
    print(os.listdir(folder))

In [None]:
# Folder that holds the competition input files; list its contents so the
# file names are visible before we build paths from them.
input_path = '/kaggle/input/dogs-vs-cats/'
os.listdir(input_path)

In [None]:
# os.listdir() returns entries in arbitrary order, so picking files by
# position can silently swap the train and test archives. Select each file
# by its name instead.
# NOTE(review): assumes the archives are named 'test*.zip' / 'train*.zip'
# (they extract to 'test1/' and 'train/') and that the sample submission is
# the only .csv file in the folder - confirm against the listing above.
input_files = sorted(os.listdir(input_path))
test_zip = input_path + next(
    f for f in input_files if f.startswith('test') and f.endswith('.zip')
)
train_zip = input_path + next(
    f for f in input_files if f.startswith('train') and f.endswith('.zip')
)
sample_sub_path = input_path + next(
    f for f in input_files if f.endswith('.csv')
)
# sanity check
#print(train_zip)
#print(test_zip)

Taking a look at the format of submission required. 

In [None]:
# Load the sample submission file and show its first rows to see the
# expected column layout.
sample_sub = pd.read_csv(sample_sub_path)
sample_sub.head()

The images are contained in zip files and so we will need to use `zipfile.ZipFile()` to extract the images.

In [None]:
# help(zipfile.ZipFile)

In [None]:
%%time
# Working folder that receives the extracted images.
image_path = '/kaggle/temp/'

# Unpack the test archive first, then the training archive, into the same
# working folder.
for archive in (test_zip, train_zip):
    with zipfile.ZipFile(archive, 'r') as myzip:
        myzip.extractall(image_path)

In [None]:
# Build one row per training file: file name, label and full path.
train_path = image_path + 'train/'
print(train_path)
d = {'image': os.listdir(train_path)}
train_df = pd.DataFrame(d)
# The label is the part of the file name before the first dot.
train_df['label'] = train_df['image'].str.split('.').str[0]
train_df['path'] = train_path + train_df['image']
train_df.head()

In [None]:
# Build one row per test file, indexed by the numeric id in the file name.
test_path = image_path + 'test1/'
print(test_path)
d = {'image': os.listdir(test_path)}
test_df = pd.DataFrame(d)
# Labels are unknown for the test set; the column is filled in after prediction.
test_df['label'] = None
test_df['path'] = test_path + test_df['image']
# The id is the part of the file name before the first dot, as an integer.
test_df['id'] = test_df['image'].str.split('.').str[0].astype('int64')
# Index by id and sort numerically so rows follow id order 1, 2, 3, ...
test_df = test_df.set_index('id').sort_index()
test_df.head()

<a href="#toc"> Back to top </a>

<a id="analyse"></a>
<h2> Analysing the data </h2>

In [None]:
# Report how many rows (images) each dataframe holds.
n_train_images = train_df.shape[0]
n_test_images = test_df.shape[0]
print(f'Number of training images: {n_train_images}')
print(f'Number of test images: {n_test_images}')

In [None]:
# Bar chart of how many training images carry each label.
count_axis = sns.countplot(data=train_df, x='label')
count_axis.set_title('Number of training images of dogs and cats')
plt.show()

# The same counts as numbers.
label_counts = train_df['label'].value_counts()
print(label_counts)

We will print out some images for a quick check.

In [None]:
def print_images(n, df, ncols=5):
    '''
    helper function to print randomly chosen images and their labels
    n: number of images to print (capped at the number of rows in df)
    df: dataframe from which images are chosen; the `image`, `path` and
        `label` columns are read
    ncols: number of columns in the subplot grid
    '''
    # Draw every row in a single call, without replacement, so the same
    # image never appears twice in one grid.
    samples = df.sample(n=min(n, len(df)))
    if samples.empty:
        return
    nrows = int(np.ceil(len(samples) / ncols))

    # Let the figure height grow with the number of rows so that larger
    # grids are not squeezed into a fixed-size canvas.
    plt.figure(figsize=(10, 3 * nrows))
    cells = zip(samples['image'], samples['path'], samples['label'])
    for position, (img_name, img_path, img_label) in enumerate(cells, start=1):
        img = mpimg.imread(img_path)
        ax = plt.subplot(nrows, ncols, position)
        ax.imshow(img)
        ax.set_title(f'{img_name}\n, label: {img_label}')
    plt.tight_layout()
    plt.show()

In [None]:
# Show 10 randomly chosen training images with their labels.
print_images(10, train_df)

In [None]:
# Show 10 randomly chosen test images; their label column was set to None
# when test_df was built, so the titles show no real label yet.
print_images(10, test_df)

The images are of different shapes and sizes. We will do a simple visualisation of the minimum, maximum as well as the distribution of the sizes. 

In [None]:
# Read every training image once to record its array shape.
train_df['shape'] = train_df['path'].apply(lambda x: mpimg.imread(x).shape)
# Size is the total number of array elements. np.prod works for any number
# of dimensions, so a 2-D (grayscale) image no longer raises IndexError.
train_df['size'] = train_df['shape'].apply(lambda x: int(np.prod(x)))
train_df.head()

In [None]:
# Read every test image once to record its array shape.
test_df['shape'] = test_df['path'].apply(lambda x: mpimg.imread(x).shape)
# Size is the total number of array elements. np.prod works for any number
# of dimensions, so a 2-D (grayscale) image no longer raises IndexError.
test_df['size'] = test_df['shape'].apply(lambda x: int(np.prod(x)))
test_df.head()

In [None]:
# Show the smallest and then the largest training image(s), separated by a
# ruler line.
for position, pick_extreme in enumerate((pd.Series.min, pd.Series.max)):
    if position:
        print('-' * 80)
    extreme_size = pick_extreme(train_df['size'])
    temp_img_df = train_df.loc[train_df['size'] == extreme_size]
    print(temp_img_df)
    print_images(1, temp_img_df)

In [None]:
# Show the smallest and then the largest test image(s), separated by a
# ruler line.
for position, pick_extreme in enumerate((pd.Series.min, pd.Series.max)):
    if position:
        print('-' * 20)
    extreme_size = pick_extreme(test_df['size'])
    temp_img_df = test_df.loc[test_df['size'] == extreme_size]
    print(temp_img_df)
    print_images(1, temp_img_df)


In [None]:
# Side-by-side histograms of image size: training set on the left, test set
# on the right.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(8, 4))
for axis, frame in zip(ax, (train_df, test_df)):
    sns.histplot(data=frame, x='size', bins=20, ax=axis)
fig.tight_layout()
plt.show()

<a id='housekeeping'></a>
<h2>Housekeeping</h2>

In order to use keras' data generator, we will need to move the files into respective labeled folders. This is to achieve an end-to-end model. The moving of images can be achieved by different methods, `os.rename()`, `os.replace()` and `shutil.move()`. We will be using `os.rename()`

In [None]:
# Split the training rows into one frame per label.
is_dog = train_df['label'] == 'dog'
is_cat = train_df['label'] == 'cat'
dog_df = train_df.loc[is_dog]
cat_df = train_df.loc[is_cat]

In [None]:
# One sub-folder per label inside the training folder.
dog_path = train_path + 'dog/'
cat_path = train_path + 'cat/'

for label_dir, label_df in ((dog_path, dog_df), (cat_path, cat_df)):
    # exist_ok lets this cell be re-run without raising FileExistsError.
    os.makedirs(label_dir, exist_ok=True)

    for name in label_df.image:
        source = train_path + name
        # Skip files that an earlier run of this cell has already moved.
        if os.path.exists(source):
            os.rename(source, label_dir + name)

    # Keep train_df['path'] pointing at the new location of every image so
    # the dataframe can still be used to load images after the move.
    train_df.loc[label_df.index, 'path'] = label_dir + label_df['image']

<a href='#toc'> Back to Top </a>

<a id='model'></a>
<h2> Modeling with Keras </h2>

In [None]:
image_size = (128, 128)
batch_size = 32
seed = 2

# Options shared by the training and validation subsets. Both calls must use
# the same seed and validation_split so the two subsets do not overlap.
# batch_size now comes from the variable above instead of a repeated literal.
split_options = dict(
    batch_size=batch_size,
    image_size=image_size,
    seed=seed,
    validation_split=.2,
)

train_ds = keras.preprocessing.image_dataset_from_directory(
    train_path,
    subset='training',
    **split_options
)

val_ds = keras.preprocessing.image_dataset_from_directory(
    train_path,
    subset='validation',
    **split_options
)

In [None]:
#help(train_ds.take)

`train_ds.take(count)` creates a `Dataset` with at most `count` elements; here each element is one batch of images. If we run the code below with `range(33)`, it raises an error because a batch holds at most `batch_size` = 32 images.

In [None]:
plt.figure(figsize=(12, 12))

# Take a single batch and draw its 32 images on a 7x5 grid, using the
# integer label as the title of each panel.
for images, labels in train_ds.take(1):
    for position in range(1, 33):
        panel = plt.subplot(7, 5, position)
        picture = images[position - 1].numpy().astype("uint8")
        panel.imshow(picture)
        panel.set_title(int(labels[position - 1]))
        panel.axis("off")
    plt.show()

Creating a data augmentation layer. We will have 2 basic layers in the augmentation layer `RandomFlip()` and `RandomRotation()`.

In [None]:
#help(layers.RandomFlip)
#help(layers.RandomRotation)

In [None]:
# Augmentation pipeline: a random horizontal flip followed by a random
# rotation with factor 0.1.
augmentation_steps = [
    layers.RandomFlip("horizontal"),
    layers.RandomRotation(0.1),
]
data_augmentation = keras.Sequential(augmentation_steps)

In [None]:
plt.figure(figsize=(10, 10))

# Run the augmentation nine times on the same batch and show the first
# image of each result on a 3x3 grid.
for images, _ in train_ds.take(1):
    for position in range(1, 10):
        augmented_images = data_augmentation(images)
        panel = plt.subplot(3, 3, position)
        panel.imshow(augmented_images[0].numpy().astype("uint8"))
        panel.axis("off")

Our images have RGB channel values in the range [0, 255]. We will standardise them to the range [0, 1].

<h2>Building the model</h2>

In [None]:
# Input -> augmentation -> rescaling to [0, 1] -> four Conv/MaxPool stages
# -> dense head ending in a single sigmoid unit.
model = keras.Sequential()
model.add(keras.Input(shape=image_size + (3,)))
model.add(data_augmentation)
model.add(layers.Rescaling(1.0 / 255))

# Convolutional stages: a 3x3 convolution followed by 2x2 max pooling.
for n_filters in (32, 64, 64, 128):
    model.add(layers.Conv2D(n_filters, 3, activation='relu'))
    model.add(layers.MaxPool2D(pool_size=2))

# Classification head.
head_layers = (
    layers.Flatten(),
    layers.Dense(512, activation='relu'),
    layers.Dropout(.5),
    layers.Dense(1, activation='sigmoid'),
)
for head_layer in head_layers:
    model.add(head_layer)

In [None]:
# Display the model's output tensor.
model.output

In [None]:
# Print the model summary.
model.summary()

In [None]:
epochs = 50

# Save the model to a separate file at the end of every epoch.
callbacks = [
    keras.callbacks.ModelCheckpoint("save_at_{epoch}.h5"),
]

model.compile(
    optimizer='adam',
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# The callbacks list has to be handed to fit(); previously it was defined
# but never used, so no checkpoint was ever written.
hist = model.fit(
    train_ds,
    epochs=epochs,
    callbacks=callbacks,
    validation_data=val_ds,
    verbose=1,
)

In [None]:
# One row per epoch, one column per recorded metric, plus a 1-based epoch
# number for plotting.
fit_df = pd.DataFrame(hist.history)
fit_df['epoch'] = np.arange(1, len(fit_df) + 1)
fit_df.head()

In [None]:
# Left panel: training vs validation loss. Right panel: training vs
# validation accuracy.
history_fig, history_axes = plt.subplots(1, 2, figsize=(12, 5))
for panel, metric in zip(history_axes, ('loss', 'accuracy')):
    curves = [metric, f'val_{metric}']
    for curve in curves:
        sns.lineplot(x='epoch', y=curve, data=fit_df, ax=panel)
    panel.legend(curves)
plt.show()

<a href="#toc"> Back to top </a>

<h2>Predictions</h2>

In [None]:
# Display the folder the test images are read from.
test_path

In [None]:
test_ds = keras.preprocessing.image_dataset_from_directory(
    test_path,
    label_mode=None,
    image_size=image_size,
)

In [None]:
predictions = model.predict(test_ds)

In [None]:
pred = np.array([1 if x>=0.5 else 0 for x in predictions])

In [None]:
sns.histplot(predictions)
plt.show()

In [None]:
test_df['label'] = pred
test_df

In [None]:
# Count how many test images received each predicted label.
test_df['label'].value_counts()

In [None]:
# Show 20 randomly chosen test images with their predicted labels.
print_images(20, test_df)

<a href="#toc"> Back to top </a>

<a id="reference"></a>
<h2> Reference </h2>

1. [Understanding os.walk](https://docs.python.org/3/library/os.html)
2. [Understanding zipfile.ZipFile](https://docs.python.org/3/library/zipfile.html)
3. [Introduction to Keras for Engineers](https://keras.io/getting_started/intro_to_keras_for_engineers/)
4. [Image classification from scratch](https://keras.io/examples/vision/image_classification_from_scratch/)
5. [How to use Kaggle TPU](https://www.kaggle.com/docs/tpu)

<a href="#toc"> Back to top </a>