In [None]:
import zipfile


train_path = "../input/dogs-vs-cats-redux-kernels-edition/train.zip"
test_path = "../input/dogs-vs-cats-redux-kernels-edition/test.zip"

# Unpack both archives into the current working directory; later cells list the
# extracted files from the "train" and "test" folders.
for archive_path in (train_path, test_path):
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(".")

In [None]:
import numpy as np
import pandas as pd
from os import listdir


# Index the extracted training images. listdir() returns entries in arbitrary,
# filesystem-dependent order, so the names are sorted to make the row order
# (and therefore any seeded split of this frame) reproducible.
train_df = pd.DataFrame(sorted(listdir("train")), columns=["file_path"])
# The label is the part of the file name before the first dot. Using the .str
# accessor keeps this working even if some name has a different number of dots.
train_df["target"] = train_df["file_path"].str.split(".").str[0]
train_df["file_path"] = "train/" + train_df["file_path"]

# Same index for the unlabelled test images (no target column).
test_df = pd.DataFrame(sorted(listdir("test")), columns=["file_path"])
test_df["file_path"] = "test/" + test_df["file_path"]

In [None]:
# Preview the first 10 rows of the training table (file path and label).
print("TRAIN DATASET")
train_df.head(10)

In [None]:
# Preview the first 10 rows of the test table (file paths only, no labels).
print("TEST DATASET")
test_df.head(10)

In [None]:
print("TRAINING DATASET INFO")
print("---------------------")
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line to the output.
train_df.info()
print()

# Number of training rows per class label.
print("TARGET DISTRIBUTION")
print("-------------------")
print(train_df["target"].value_counts())

In [None]:
# Data input pipeline using ImageDataGenerator
import tensorflow as tf
from sklearn.model_selection import train_test_split


# Hold out 20% of the labelled rows for validation, stratified on the label.
train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df["target"],
)

BATCH_SIZE = 128
IMG_HEIGHT = 224
IMG_WIDTH = 224

datagen = tf.keras.preprocessing.image.ImageDataGenerator()

# Settings shared by the training and validation generators; only the source
# dataframe differs between the two.
flow_kwargs = dict(
    directory=".",
    x_col="file_path",
    y_col="target",
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode="binary",
    batch_size=BATCH_SIZE,
)

train_datagen = datagen.flow_from_dataframe(dataframe=train_data, **flow_kwargs)
val_datagen = datagen.flow_from_dataframe(dataframe=val_data, **flow_kwargs)

In [None]:
from sklearn.preprocessing import LabelEncoder


# Replace the class names in the "target" column with integer codes.
enc = LabelEncoder()
encoded_target = enc.fit_transform(train_df["target"])
train_df["target"] = encoded_target

# Report which integer code each class name was mapped to.
class_names = enc.classes_
labels = zip(class_names, enc.transform(class_names))

for class_name, class_code in labels:
    print(f"Class = {class_name} Label = {class_code}")

In [None]:
train_df.head(10)

In [None]:
# Data input pipeline using tf.data
def read_train_images(X, y):
    """Load one labelled example for the tf.data pipeline.

    Args:
        X: path of a JPEG file to read.
        y: label for that image; returned unchanged.

    Returns:
        A ``(image, y)`` tuple where ``image`` is the file decoded to three
        channels and resized to ``IMG_HEIGHT`` x ``IMG_WIDTH``.
    """
    raw_bytes = tf.io.read_file(X)
    decoded = tf.io.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [IMG_HEIGHT, IMG_WIDTH])

    return (resized, y)


def read_test_images(X):
    """Load one unlabelled image for the tf.data pipeline.

    Args:
        X: path of a JPEG file to read.

    Returns:
        The file decoded to three channels and resized to
        ``IMG_HEIGHT`` x ``IMG_WIDTH``.
    """
    decoded = tf.io.decode_jpeg(tf.io.read_file(X), channels=3)
    return tf.image.resize(decoded, [IMG_HEIGHT, IMG_WIDTH])


def build_data_pipeline(X, y=None, shuffle=None):
    """Build a batched, prefetched ``tf.data`` pipeline of decoded images.

    Args:
        X: image file paths.
        y: optional labels aligned with ``X``. When given, the dataset yields
            ``(image, label)`` pairs; otherwise it yields images only.
        shuffle: whether to shuffle the examples (buffer size 1000). The
            default ``None`` shuffles only when labels are supplied. Unlabelled
            data is kept in input order so that model predictions line up
            row-for-row with ``X``.

    Returns:
        A ``tf.data.Dataset`` batched by ``BATCH_SIZE``.
    """
    if shuffle is None:
        shuffle = y is not None

    if y is None:
        tf_data = tf.data.Dataset.from_tensor_slices(X)
        reader = read_test_images
    else:
        tf_data = tf.data.Dataset.from_tensor_slices((X, y))
        reader = read_train_images

    if shuffle:
        # Shuffle before map() so the buffer holds file paths, not decoded images.
        tf_data = tf_data.shuffle(1000)

    tf_data = tf_data.map(reader, num_parallel_calls=tf.data.AUTOTUNE)
    tf_data = tf_data.batch(BATCH_SIZE)
    tf_data = tf_data.prefetch(tf.data.AUTOTUNE)

    return tf_data

In [None]:
# Redo the 80/20 stratified split on train_df so the tf.data pipelines receive
# the current contents of its "target" column.
train_data, val_data = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
    stratify=train_df["target"],
)

# Labelled pipelines for training and validation, unlabelled one for the test set.
train_tfdata = build_data_pipeline(X=train_data["file_path"], y=train_data["target"])
val_tfdata = build_data_pipeline(X=val_data["file_path"], y=val_data["target"])
test_tfdata = build_data_pipeline(X=test_df["file_path"])

In [None]:
# Printing the first 10 images
import matplotlib.pyplot as plt
%matplotlib inline


# Newer matplotlib releases ship the seaborn style sheet as "seaborn-v0_8" and
# no longer accept the bare "seaborn" name; use whichever one is installed.
for style_name in ("seaborn-v0_8", "seaborn"):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break

fig, ax = plt.subplots(nrows=2, ncols=5, figsize=(20, 10))
ax = np.array(ax).ravel()

# Blank every panel up front so unused panels stay empty if the batch holds
# fewer images than there are panels.
for panel in ax:
    panel.axis("off")
    panel.grid(False)

# Each dataset element is one (images, labels) batch; only the first is drawn.
for images, targets in train_tfdata.take(1):
    n_shown = min(len(ax), int(images.shape[0]))
    for i in range(n_shown):
        # Scale to [0, 1], the range imshow expects for float image data.
        ax[i].imshow(images[i] / 255.0)

        if targets[i] == 0:
            ax[i].set_title("cat")
        else:
            ax[i].set_title("dog")

# plt.show() instead of fig.show(): the latter warns on the non-GUI inline backend.
plt.show()

In [None]:
# Modelling: frozen ImageNet-pretrained EfficientNetB0 as feature extractor,
# followed by batch normalisation and a single sigmoid output unit
def build_model():
    """Create and compile a fresh binary image classifier.

    The backbone is EfficientNetB0 without its classification top, with global
    average pooling, and with its weights frozen. A BatchNormalization layer
    and a one-unit sigmoid Dense layer are stacked on top of it.

    Returns:
        A compiled ``tf.keras.Sequential`` model (Adam optimizer, binary
        cross-entropy loss, accuracy metric).
    """
    backbone = tf.keras.applications.EfficientNetB0(
        include_top=False,
        weights="imagenet",
        pooling="avg",
        input_shape=(IMG_HEIGHT, IMG_WIDTH, 3),
    )
    # Freeze the pretrained weights; only the layers added below are trained.
    backbone.trainable = False

    model = tf.keras.Sequential()
    model.add(backbone)
    model.add(tf.keras.layers.BatchNormalization())
    model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    model.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=["accuracy"],
    )

    return model

In [None]:
# Build a throwaway model instance only to print its layer summary.
build_model().summary()

In [None]:
# Callbacks passed to both training runs; each one monitors the validation loss.
callbacks = [
    # Halve the learning rate after 2 epochs without improvement.
    tf.keras.callbacks.ReduceLROnPlateau(
        monitor="val_loss",
        mode="min",
        factor=0.5,
        patience=2,
        verbose=1,
    ),
    # Stop after 7 epochs without improvement and restore the best weights.
    tf.keras.callbacks.EarlyStopping(
        monitor="val_loss",
        mode="min",
        patience=7,
        restore_best_weights=True,
        verbose=1,
    ),
]

reduce_lr, early_stop = callbacks

In [None]:
# Model training with data generated using tf.data
from time import time


# Build the model before starting the clock so that only fit() is timed. This
# matches how the ImageDataGenerator run is timed and keeps the two comparable.
model_tfdata = build_model()

start = time()
history_tfdata = model_tfdata.fit(
    x=train_tfdata,
    epochs=20,
    callbacks=callbacks,
    validation_data=val_tfdata
)

print(f"\nTime taken for training with tf.data = {time() - start} seconds")

In [None]:
# Model training with data generated using ImageDataGenerator
model_datagen = build_model()

# Time only the fit() call.
start = time()
history_datagen = model_datagen.fit(
    train_datagen,
    validation_data=val_datagen,
    epochs=20,
    callbacks=callbacks,
)
elapsed = time() - start

print(f"\nTime taken for training with ImageDataGenerator = {elapsed} seconds")

In [None]:
def plot_model_history(history):
    """Plot training and validation curves for loss and accuracy side by side.

    Args:
        history: object whose ``history`` attribute is a dict with per-epoch
            values under the keys "loss", "val_loss", "accuracy" and
            "val_accuracy" (the objects returned by ``fit`` in this notebook).
    """
    fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 5))
    ax = np.array(ax).ravel()

    # One panel per metric: (key in history.history, axis label).
    panels = (("loss", "Loss"), ("accuracy", "Accuracy"))

    for axis, (metric, pretty_name) in zip(ax, panels):
        axis.plot(history.history[metric], label=f"train {metric}", color="orange")
        axis.plot(history.history[f"val_{metric}"], label=f"val {metric}", color="green")
        axis.set_xlabel("Epochs")
        axis.set_ylabel(pretty_name)
        axis.set_title(f"{pretty_name} vs Epochs")
        axis.legend()

    # plt.show() instead of fig.show(): the latter warns on non-GUI backends
    # such as the notebook inline backend.
    plt.show()

In [None]:
# Learning curves of the model trained on the tf.data pipeline.
plot_model_history(history_tfdata)

In [None]:
# Learning curves of the model trained on the ImageDataGenerator pipeline.
plot_model_history(history_datagen)

In [None]:
# Predict on the test images in test_df row order. The dataset is built here
# without a shuffle step: the predictions are later stored back into test_df,
# so they must come out in the same order as its rows.
test_tfdata_ordered = (
    tf.data.Dataset.from_tensor_slices(test_df["file_path"])
    .map(read_test_images, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.AUTOTUNE)
)
y_pred_tfdata = model_tfdata.predict(test_tfdata_ordered)

# shuffle=False: flow_from_dataframe shuffles by default, which would return
# the predictions in a random order relative to test_df.
test_datagen = datagen.flow_from_dataframe(
    dataframe=test_df,
    directory=".",
    x_col="file_path",
    target_size=(IMG_HEIGHT, IMG_WIDTH),
    class_mode=None,
    batch_size=BATCH_SIZE,
    shuffle=False
)

y_pred_datagen = model_datagen.predict(test_datagen)

In [None]:
# Average the two models' predictions. Each predict() output has one column
# (the single sigmoid unit), so flatten both to 1-D before storing the result.
blended_pred = (np.ravel(y_pred_tfdata) + np.ravel(y_pred_datagen)) / 2

if len(blended_pred) != len(test_df):
    raise ValueError(
        f"Got {len(blended_pred)} predictions for {len(test_df)} test images"
    )

test_df["target"] = blended_pred
# NOTE(review): the file is written with the columns "file_path" and "target";
# confirm this matches the submission format the competition expects.
test_df.to_csv("submission_blend.csv", index=False)
test_df.head(10)