In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file in it
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#where am I?
!pwd

In [None]:
!apt-get install tree

In [None]:
#list the directory tree of 'kaggle'
!tree '/kaggle'

In [None]:
#Unzip files
import zipfile

# The input folder is read-only, so the archives are unpacked into a separate "data" folder
base_dir = '../input/dogs-vs-cats-redux-kernels-edition'
train_dir = '../data/train'
test_dir = '../data/test'

# Extract the contents of both archives into the "../data" directory
for archive_name in ('train.zip', 'test.zip'):
    archive_path = os.path.join(base_dir, archive_name)
    with zipfile.ZipFile(archive_path) as archive:
        archive.extractall('../data')
    

In [None]:
#list the directory tree but ignore the images (since it will flood the output)
!tree '/kaggle' -I '*.jpg'

In [None]:
# Check Current Directory
os.listdir(base_dir)

In [None]:
#Check the images
os.listdir(train_dir)

In [None]:
#Images in train_dir
len(os.listdir(train_dir))

In [None]:
#Images in test_dir
len(os.listdir(test_dir))

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
from PIL import Image

In [None]:
import glob

In [None]:
# put the FilePath images into a List
train_list = glob.glob(os.path.join(train_dir, '*.jpg'))
test_list = glob.glob(os.path.join(test_dir, '*.jpg'))

In [None]:
# Display a 5 x 4 grid of randomly picked training images
columns = 4
rows = 5
fig = plt.figure(figsize=(14, 10))
for position in range(1, columns * rows + 1):
    random_index = np.random.randint(len(train_list))
    picture = Image.open(train_list[random_index])
    axis = fig.add_subplot(rows, columns, position)
    axis.axis('off')
    axis.imshow(picture)
fig.suptitle("Train Images", fontsize=22)
plt.show()

In [None]:
#Show some images (Test images)
fig=plt.figure(figsize=(14, 10))
columns = 4
rows = 5
for i in range(1, columns*rows +1):
    photo_num = np.random.randint(len(test_list))
    # Index test_list (not train_list) so the figure really shows test images
    img = Image.open(test_list[photo_num])
    fig.add_subplot(rows, columns, i)
    plt.axis('off')
    plt.imshow(img)
plt.suptitle("Test Images", fontsize = 22)
plt.show()

In [None]:
# Get the list of filenames in train_dir
list_of_images_names = os.listdir(train_dir)

In [None]:
def get_labels(list_of_images):
    """Build a label table from image file names.

    A name containing 'dog' is labelled 1; otherwise a name containing 'cat'
    is labelled 0. Names containing neither substring are skipped, so the
    two columns always have the same length.

    Parameters
    ----------
    list_of_images : iterable of str
        Image file names.

    Returns
    -------
    pandas.DataFrame
        Columns 'image_id' (the file name) and 'label' (1 = dog, 0 = cat),
        in the order the names were given.
    """
    names = [] # images names
    labels = [] # labels

    for image_name in list_of_images:
        if 'dog' in image_name:
            label = 1
        elif 'cat' in image_name:
            label = 0
        else:
            # Unlabelled entry: skip it rather than leave names/labels misaligned
            continue
        names.append(image_name)
        labels.append(label)

    df_train_labels = pd.DataFrame({'image_id':names, 'label':labels})

    return df_train_labels

In [None]:
train_labels = get_labels(list_of_images_names)

In [None]:
train_labels

In [None]:
#Check if the data are balanced
# Pass the column by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as the x variable.
sns.countplot(x = "label", data = train_labels)
plt.show()

In [None]:
train_labels.label.value_counts()

In [None]:
import tensorflow as tf

In [None]:
# Main hyperparameters
BATCH_SIZE = 8
# The 0.8 / 0.2 factors mirror the validation_split = 0.2 used by the data generators.
# Step counts must be whole numbers, so the (possibly fractional) number of
# batches is rounded up and stored as an int instead of a float.
STEPS_PER_EPOCH = int(np.ceil(len(train_labels)*0.8 / BATCH_SIZE))
VALIDATION_STEPS = int(np.ceil(len(train_labels)*0.2 / BATCH_SIZE))
EPOCHS = 20
TARGET_SIZE = 224

In [None]:
# Labels are cast to strings before being handed to flow_from_dataframe
train_labels.label = train_labels.label.astype('str')

image_generator_cls = tf.keras.preprocessing.image.ImageDataGenerator

# Arguments shared by the training and the validation flows
shared_flow_kwargs = dict(
    directory = train_dir,
    x_col = "image_id",
    y_col = "label",
    target_size = (TARGET_SIZE, TARGET_SIZE),
    batch_size = BATCH_SIZE,
    class_mode = 'binary',
)

# Training images: augmented, taken from the "training" subset of the split
augmentation_options = dict(
    validation_split = 0.2,
    preprocessing_function = None,
    zoom_range = 0.2,
    cval = 0.2,
    horizontal_flip = True,
    vertical_flip = True,
    fill_mode = 'nearest',
    shear_range = 0.2,
    height_shift_range = 0.2,
    width_shift_range = 0.2,
)
train_datagen = image_generator_cls(**augmentation_options)
train_generator = train_datagen.flow_from_dataframe(
    train_labels,
    subset = "training",
    **shared_flow_kwargs,
)

# Validation images: no augmentation, taken from the "validation" subset of the split
validation_datagen = image_generator_cls(validation_split = 0.2)
validation_generator = validation_datagen.flow_from_dataframe(
    train_labels,
    subset = "validation",
    **shared_flow_kwargs,
)


In [None]:
# Load EfficientNetB0 with ImageNet weights and without its top (classification) layers
backbone_input_shape = (TARGET_SIZE, TARGET_SIZE, 3)
pretrained_model = tf.keras.applications.EfficientNetB0(
    include_top = False,
    weights = 'imagenet',
    input_shape = backbone_input_shape,
)
pretrained_model.summary()

In [None]:
def create_model(pretrained):
    """Stack a small binary-classification head on top of ``pretrained``.

    The returned Sequential model consists of, in order: the given
    pretrained model, global average pooling over the spatial dimensions,
    a 256-unit ReLU dense layer and a single sigmoid output unit.
    """
    stacked_layers = [
        pretrained,
        # Average each feature map over its spatial dimensions
        tf.keras.layers.GlobalAveragePooling2D(),
        # Classifier
        tf.keras.layers.Dense(256, activation = "relu"),
        tf.keras.layers.Dense(1, activation = "sigmoid"),
    ]
    return tf.keras.Sequential(stacked_layers)

model = create_model(pretrained_model)
model.summary()

In [None]:
# Binary classification setup: Adam optimizer, binary cross-entropy loss, accuracy metric.
# `learning_rate` is the supported keyword; the older `lr` alias is deprecated.
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate = 0.001),
                  loss = "binary_crossentropy",
                  metrics = ["accuracy"])

In [None]:
# Keep only the weights of the epoch with the lowest validation loss
model_save = tf.keras.callbacks.ModelCheckpoint('./best_trial0_catdog_model.h5', 
                             save_best_only = True, 
                             save_weights_only = True,
                             monitor = 'val_loss', 
                             mode = 'min', verbose = 1)

# Stop once val_loss has not improved by at least 0.001 for 5 epochs,
# and roll the model back to its best weights
early_stop = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', min_delta = 0.001, 
                           patience = 5, mode = 'min', verbose = 1,
                           restore_best_weights = True)

# Multiply the learning rate by 0.3 after 2 epochs without val_loss improvement
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor = 'val_loss', factor = 0.3, 
                              patience = 2, min_delta = 0.001, 
                              mode = 'min', verbose = 1)

# Model.fit accepts generators directly; fit_generator is deprecated.
# Step counts are coerced to whole numbers in case the constants are floats.
history = model.fit(
    train_generator,
    steps_per_epoch = int(np.ceil(STEPS_PER_EPOCH)),
    epochs = EPOCHS,
    validation_data = validation_generator,
    validation_steps = int(np.ceil(VALIDATION_STEPS)),
    callbacks = [model_save, early_stop, reduce_lr]
)

In [None]:
# Per-epoch metrics recorded during training
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']
loss = history.history['loss']
val_loss = history.history['val_loss']

epochs = range(1, len(acc) + 1)

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
# NOTE(review): the axes above already exist when this style is set, so it
# probably only affects figures created later — confirm the intended placement.
sns.set_style("white")
plt.suptitle('Train history', size = 15)

# Left panel: accuracy (dots = training, line = validation)
ax1.plot(epochs, acc, "bo", label = "Training acc")
ax1.plot(epochs, val_acc, "b", label = "Validation acc")
ax1.set_title("Training and validation acc")
ax1.set_xlabel("Epoch")
ax1.set_ylabel("Accuracy")
ax1.legend()

# Right panel: loss, drawn in red. The colour is given once, in the format
# string, instead of a blue format string contradicted by color='red'.
ax2.plot(epochs, loss, "ro", label = "Training loss")
ax2.plot(epochs, val_loss, "r", label = "Validation loss")
ax2.set_title("Training and validation loss")
ax2.set_xlabel("Epoch")
ax2.set_ylabel("Loss")
ax2.legend()

plt.show()

In [None]:
# Evaluate on the validation generator. Model.evaluate accepts generators
# directly (evaluate_generator is deprecated); the step count is coerced to a whole number.
model.evaluate(validation_generator, steps=int(np.ceil(VALIDATION_STEPS)))

In [None]:
# Save the trained model to an .h5 file in the current directory
model.save('./trial0_catdog_model_overfitted_97.h5')

In [None]:
# Read sample_submission.csv from the input folder and display it
ss = pd.read_csv(os.path.join(base_dir, "sample_submission.csv"))
ss

In [None]:
ss.info()

In [None]:
# Cast the ids to strings so they can be concatenated with '.jpg' to form file names
ss.id = ss.id.astype('str')

In [None]:
os.listdir(test_dir)

For each image in the test set, you should predict a probability that the image is a dog (1 = dog, 0 = cat).

In [None]:
image = Image.open(os.path.join(train_dir,'cat.3660'+'.jpg'))
image

In [None]:
# Resize to the size the model was built for, then add the batch axis.
# The result gets its own name so `image` stays a PIL image and this cell can be re-run.
image_batch = np.expand_dims(image.resize((TARGET_SIZE, TARGET_SIZE)), axis = 0)
model.predict(image_batch)

In [None]:
image = Image.open(os.path.join(test_dir,'1000'+'.jpg'))
image

In [None]:
# Resize to the model's input size and add the batch axis. New names are used
# so `image` stays a PIL image: rebinding it to an array would make a re-run
# call ndarray.resize, which returns None.
resized_image = image.resize((TARGET_SIZE, TARGET_SIZE))
image_batch = np.expand_dims(resized_image, axis = 0)
model.predict(image_batch)

In [None]:
# Predict one probability per test image, in the order of the submission ids
preds = []

for image_id in ss.id:
    # The context manager closes the file once the pixels have been read
    with Image.open(os.path.join(test_dir, image_id+'.jpg')) as test_image:
        # Force 3 channels and the input size the model was built for
        resized_image = test_image.convert('RGB').resize((TARGET_SIZE, TARGET_SIZE))
    image_batch = np.expand_dims(resized_image, axis = 0)
    # predict returns a (1, 1) array for a single image; store only the scalar.
    # verbose = 0 avoids one progress log per image.
    preds.append(float(model.predict(image_batch, verbose = 0)[0, 0]))

In [None]:
# Store the collected predictions in the submission's label column and display the frame
ss['label'] = preds
ss

In [None]:
# Convert the label column to float32 and display the frame
ss.label = ss.label.astype('float32')
ss

In [None]:
ss.describe()

In [None]:
# Write the submission to submission.csv with a header row and without the index column
ss.to_csv('submission.csv',header=True,index = False)

In [None]:
!tree