In [None]:
import numpy as np
import pandas as pd
import os
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import random

# keras
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, Activation, BatchNormalization
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from keras.preprocessing.image import ImageDataGenerator

# sklearn library
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report 

In [None]:
# unzipping data files

! unzip "../input/dogs-vs-cats-redux-kernels-edition/train.zip" -d train
! unzip "../input/dogs-vs-cats-redux-kernels-edition/test.zip" -d test

# Pre-processing Data

In [None]:
# Build the (filename, category) dataframe for the training images.
#
# Both columns are derived from the SAME list of paths. Taking labels from
# os.listdir() and paths from Path.glob() separately relies on two independent
# directory listings having identical order and length, which is not
# guaranteed and would silently attach labels to the wrong images.

# image paths (sorted so the dataframe order, and therefore the seeded split
# further down, is reproducible across runs)
f = Path("./train/train")
File_Path = sorted(f.glob(r"**/*.jpg"))

# labels: the part of the file name before the first '.'
Labels = [path.name.split('.')[0] for path in File_Path]

# dataframe
File_Path = pd.Series(File_Path).astype(str)
Labels = pd.Series(Labels)
df = pd.concat([File_Path, Labels], axis=1)
df.columns = ['filename', 'category']

In [None]:
# Peek at the first five rows of the filename/category table.
df.head(n=5)

In [None]:
# Class balance check: bar chart of how many images each category has.

df['category'].value_counts().plot(kind='bar', color=['green', 'orange']);

In [None]:
# Show the first nine images of the dataframe in a 3x3 grid, titled by label.

fig, axes = plt.subplots(
    nrows=3,
    ncols=3,
    figsize=(7, 7),
    subplot_kw={'xticks': [], 'yticks': []},
)

# walk the grid cell by cell; `name` is the dataframe row shown in that cell
for name, ax in enumerate(axes.flatten()):
    picture = plt.imread(df.filename[name])
    ax.imshow(picture)
    ax.set_title(df.category[name])

plt.tight_layout()
plt.show()

In [None]:
# Two-stage split with a fixed random_state so it is repeatable:
#   1) hold out 20% of df as test_data
#   2) hold out 20% of the remainder as val_data

train_set, test_data = train_test_split(df, test_size=0.2, random_state=42)
train_data, val_data = train_test_split(train_set, test_size=0.2, random_state=42)

# report (rows, columns) of each split: train, test, validation
for split_frame in (train_data, test_data, val_data):
    print(split_frame.shape)

In [None]:
# Give every split a fresh 0..n-1 index, discarding the row labels inherited
# from the original dataframe.

train_data, test_data, val_data = (
    frame.reset_index(drop=True)
    for frame in (train_data, test_data, val_data)
)

# Data Augmentation and Generators

In [None]:
# Target (height, width) every image is resized to by the generators.
img_size = (128, 128)
# Model input shape: the image size plus 3 colour channels.
input_shape = img_size + (3,)

In [None]:
# Image generator: scales pixel values by 1/255 and applies light random
# augmentation (rotation up to 10 degrees, horizontal flips).

augmentation_options = dict(
    rescale=1./255,
    rotation_range=10,
    horizontal_flip=True,
)
img_gen = ImageDataGenerator(**augmentation_options)

In [None]:
# Augmentation preview: draw one random training image and display nine
# generator outputs of it in a 3x3 grid.

example_data = train_data.sample(n=1).reset_index(drop=True)

example_gen = img_gen.flow_from_dataframe(
    dataframe=example_data,
    x_col='filename',   # column holding the image path
    y_col='category',   # column holding the target label
    target_size=img_size,
    class_mode='categorical',
    batch_size=32,
    shuffle=False,      # keep the given order
)

for i in range(1, 10):
    plt.subplot(3, 3, i)
    # pull a single batch from the generator and show its first image
    X_batch, Y_batch = next(example_gen)
    image = X_batch[0]
    plt.imshow(image)
    plt.axis('Off')

plt.tight_layout()  # automatic padding between the panels
plt.show()

In [None]:
# training generator
# shuffle=True so the batch composition changes every epoch; with
# shuffle=False the network sees exactly the same batches in the same order
# each epoch. The seed keeps the shuffling repeatable.
train_gen = img_gen.flow_from_dataframe(
    train_data,
    x_col='filename',
    y_col='category',
    target_size=img_size,
    class_mode='categorical',
    batch_size=32,
    shuffle=True,
    seed=42
)

# validation generator
# Validation images are only rescaled. Passing them through the augmenting
# generator would apply random rotations/flips, making val_loss/val_accuracy
# (which drive the callbacks) noisy and different on every evaluation.
val_img_gen = ImageDataGenerator(rescale=1./255)

validation_gen = val_img_gen.flow_from_dataframe(
    val_data,
    x_col='filename',
    y_col='category',
    target_size=img_size,
    class_mode='categorical',
    batch_size=32,
    shuffle=False   # fixed order: evaluation does not need shuffling
)

# CNN Model

In [None]:
# Baseline CNN: one convolution + pooling stage feeding two dense layers.
# NOTE: the next cell assigns a new Sequential() to `model`, so this network
# is replaced before it is compiled or trained.
model = Sequential([
    # input layer
    Conv2D(32, (3, 3), activation='relu', strides=(1, 1), input_shape=input_shape),
    MaxPooling2D(pool_size=(2, 2)),

    # flatten layer
    Flatten(),

    # dense layers with dropout
    Dense(128, activation='relu'),
    Dropout(rate=0.3),
    Dense(64, activation='relu'),

    # output layer: two-way softmax
    Dense(2, activation='softmax'),
])

In [None]:
# Deeper CNN: three Conv -> BatchNorm -> MaxPool -> Dropout stages
# (32, 64 and 128 filters) followed by a 512-unit dense head.
model = Sequential([
    # stage 1 (input layer)
    Conv2D(32, (3, 3), activation='relu', strides=(1, 1), input_shape=input_shape),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    # stage 2
    Conv2D(64, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    # stage 3
    Conv2D(128, (3, 3), activation='relu'),
    BatchNormalization(),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),

    # dense head
    Flatten(),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),

    # output layer: two-way softmax
    Dense(2, activation='softmax'),
])

In [None]:
# Print the layer-by-layer summary of the model defined above.
model.summary()

In [None]:
# Compile: Adam optimizer, categorical cross-entropy loss (matches the
# generators' class_mode='categorical' and the 2-unit softmax output),
# accuracy reported as the metric.
model.compile(
    loss="categorical_crossentropy",
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Callbacks used to limit overfitting during training.

from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Halve the learning rate when validation accuracy has not improved for
# 2 epochs, but never go below 1e-5.
learning_rate_reduction = ReduceLROnPlateau(monitor='val_accuracy',
                                            patience=2,
                                            verbose=1,
                                            factor=0.5,
                                            min_lr=0.00001)

# Stop when val_loss has not improved for 10 epochs.
# restore_best_weights=True rolls the model back to the best epoch when
# training is stopped early; without it the later evaluation and submission
# cells would use the weights of the last (already degraded) epoch.
earlystop = EarlyStopping(monitor='val_loss',
                          patience=10,
                          restore_best_weights=True)

In [None]:
# Train for up to 20 epochs, validating after each one; the two callbacks
# can lower the learning rate or stop the run early.

training_callbacks = [earlystop, learning_rate_reduction]

history = model.fit(
    train_gen,
    epochs=20,
    validation_data=validation_gen,
    callbacks=training_callbacks,
    verbose=1,  # progress bar for every epoch
)

# Model Evaluation

In [None]:
# Pull the per-epoch curves recorded by model.fit out of the History object.

# accuracy (training, validation)
acc, val_acc = (history.history[key] for key in ('accuracy', 'val_accuracy'))

# loss (training, validation)
loss, val_loss = (history.history[key] for key in ('loss', 'val_loss'))

In [None]:
# Training curves: accuracy on top, loss below, one point per epoch.
# Each panel gets its own legend (previously only the loss panel had one, so
# the two accuracy lines could not be told apart) and the x axis is labelled.
fig, (acc_ax, loss_ax) = plt.subplots(nrows=2, ncols=1, figsize=(10, 5), sharex=True)

# visualising Accuracy
acc_ax.plot(acc, label='Training Accuracy')
acc_ax.plot(val_acc, label='Validation Accuracy')
acc_ax.set_ylabel('Accuracy')
acc_ax.set_title('Training and Validation Accuracy')
acc_ax.legend()

# visualising Loss
loss_ax.plot(loss, label='Training Loss')
loss_ax.plot(val_loss, label='Validation Loss')
loss_ax.set_ylabel('Loss')
loss_ax.set_xlabel('Epoch')
loss_ax.set_title('Training and Validation Loss')
loss_ax.legend()

fig.tight_layout()  # keep the titles from overlapping the neighbouring panel
plt.show()


In [None]:
# Generator over the labelled hold-out split (test_data) used to measure
# accuracy.
# Only rescaling is applied here: the augmenting generator would randomly
# rotate/flip the images, so the reported metrics would change from run to run
# and would not describe the unmodified images.
# shuffle=False keeps predictions aligned with the row order of test_data.

test_gen = ImageDataGenerator(rescale=1./255).flow_from_dataframe(
    test_data,
    x_col='filename',
    y_col='category',
    target_size=img_size,
    class_mode='categorical',
    batch_size=32,
    shuffle=False
)

In [None]:
# Predict on the hold-out generator and keep, per image, the index of the
# class with the highest probability.
pred = model.predict(test_gen).argmax(axis=1)

# Invert the generator's name -> index mapping to translate indices back
# into category names.
labels = {class_id: class_name
          for class_name, class_id in train_gen.class_indices.items()}
pred2 = [labels[class_id] for class_id in pred]

# expected output
y_test = test_data.category

# per-class precision / recall / f1 and overall accuracy
print(classification_report(y_test, pred2))

In [None]:
# Display 15 pictures of the hold-out set with their true and predicted labels --> EXTRAS

fig, axes = plt.subplots(nrows=3, ncols=5, figsize=(15, 10),
                        subplot_kw={'xticks': [], 'yticks': []})

for i, ax in enumerate(axes.flat):
    true_label = test_data.category.iloc[i]
    # The colour has to be decided per image. Computing it once before the
    # loop used whatever value `i` happened to hold from an earlier cell and
    # painted all fifteen titles with the same colour.
    color = "blue" if pred2[i] == true_label else "red"
    ax.imshow(plt.imread(test_data.filename.iloc[i]))
    ax.set_title(f"True: {true_label}\nPredicted: {pred2[i]}", color=color)

plt.subplots_adjust(hspace = 0.3)
plt.suptitle("Model predictions (blue: correct, red: incorrect)",y=0.98)
plt.tight_layout()
plt.show()

# Submission

In [None]:
# File names of the unlabelled competition images.
# os.listdir() returns entries in arbitrary order, and a plain string sort
# would put "10.jpg" before "2.jpg". Sorting by the numeric part of the name
# makes row k hold image id k+1, which is what the sequential ids written to
# the submission further down assume. Entries whose stem is not a number are
# skipped, since they cannot be mapped to an id.
test_filenames = sorted(
    (name for name in os.listdir("./test/test") if Path(name).stem.isdigit()),
    key=lambda name: int(Path(name).stem),
)
test_df = pd.DataFrame({
    'filename': test_filenames
})

In [None]:
# Generator over the unlabelled competition images in ./test/test, used to
# produce the submission (there are no targets here, hence class_mode=None).
# Only rescaling is applied: random rotations/flips at inference time would
# make the predictions differ between runs.

test_gen = ImageDataGenerator(rescale=1./255).flow_from_dataframe(
    test_df,
    directory="./test/test",   # file names in test_df are relative to this folder
    x_col='filename',
    y_col=None,
    target_size=img_size,
    class_mode=None,
    batch_size=32,
    shuffle = False # keep predictions aligned with the rows of test_df
)

In [None]:
# Run the model over the competition images.
class_probabilities = model.predict(test_gen)
# index of the most probable class for every image
pred = np.argmax(class_probabilities, axis=1)

# Map class indices back to category names using the (inverted)
# name -> index mapping of the training generator.
labels = train_gen.class_indices
labels = {index: category for category, index in labels.items()}
pred2 = [labels[index] for index in pred]

In [None]:
# Encode the predicted category names as integers: 'dog' -> 1, anything else
# ('cat') -> 0. This gives the same values as filling a zero array and
# overwriting the 'dog' positions; the stray no-op expression `pred2=='cat'`
# that used to sit in this cell has been dropped.
pred2 = np.array(pred2)

label = np.where(pred2 == 'dog', 1, 0).astype('int')

In [None]:
# Assemble the submission table.
label = pd.Series(label)

# Take each id from the image's own file name ("123.jpg" -> 123) instead of
# numbering the rows 1..n: predictions follow the row order of test_df, and
# that order is only equal to 1..n if the file list happened to be sorted
# numerically.
ind = test_df['filename'].map(lambda fname: int(Path(fname).stem)).to_numpy()

my_submission = (
    pd.DataFrame({'id': ind, 'label': label})
    .sort_values('id')
    .reset_index(drop=True)
)
my_submission

In [None]:
# Write the submission to the working directory, without the dataframe index column.
my_submission.to_csv(path_or_buf='submission.csv', index=False)

**Refresh the contents of the output directory '/kaggle/working' to find the CSV file.**