# 1. Import all the packages that you will need.


In [None]:
import os
import zipfile
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from tensorflow.keras import layers
from tensorflow.keras import models
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# 2. Load your data and explore it. Describe how you will divide data for training, validation, and testing

In [None]:
%%time

for zipf in ['test1', 'train']:
    with zipfile.ZipFile(f"../input/dogs-vs-cats/{zipf}.zip","r") as zf:
        zf.extractall(".")
        print(f"{zipf} extracted")

os.listdir('/kaggle/working/train')[:3]

In [None]:
# List the training directory exactly once so file names and labels are
# guaranteed to line up. os.listdir returns entries in arbitrary order, so the
# listing is sorted to make the row order deterministic.
image_files = sorted(os.listdir('/kaggle/working/train'))

# The class is the part of the file name before the first dot
# (e.g. "cat.580.jpg"). Anything not prefixed "cat" is labelled "dog".
img_lbl = ['cat' if name.split('.')[0] == 'cat' else 'dog' for name in image_files]

df = pd.DataFrame({
    'image': image_files,
    'label': img_lbl
})
display(df.head())
sns.countplot(data=df, x='label');
print('(Rows, columns):', df.shape)

In [None]:
# Show one example of each class side by side.
fig, ax = plt.subplots(1, 2, figsize=(16, 9))
img1 = mpimg.imread('/kaggle/working/train/cat.580.jpg')
img2 = mpimg.imread('/kaggle/working/train/dog.2303.jpg')
for axis, picture in zip(ax, (img1, img2)):
    axis.imshow(picture)
plt.show()

In [None]:
# Fixed seed so the shuffling done by train_test_split is repeatable across runs.
SPLIT_SEED = 42

# 80% of the rows go to training; the remaining 20% is halved into test and
# validation (10% each). Stratifying on the label keeps the cat/dog ratio the
# same in every subset.
train, test_valid = train_test_split(df, test_size=0.2, stratify=df['label'],
                                     random_state=SPLIT_SEED)

test, valid = train_test_split(test_valid, test_size=0.5, stratify=test_valid['label'],
                               random_state=SPLIT_SEED)
print(f'Train size: {train.shape}')
print(f'Test size: {test.shape}')
print(f'Validation size: {valid.shape}')

We have a balanced dataset with a fairly large number of images. As mentioned before, both classes contain an equal number of images.

The data was divided in the proportion (80, 10, 10): the major part was given to the training set, while the test and validation sets received 10 percent each. I used the `stratify` parameter to make sure that the target labels are distributed in the same proportion in every subset.

# 3. Describe your chosen model and its architecture.

According to my research (which included reading several articles on neural networks and deep learning; the list of articles is provided at the end of the notebook), a convolutional neural network (CNN) is the best model for an image classification problem. The main difference between an ordinary neural network and a CNN lies in the name of the latter: a CNN has convolutional layers with filters, whose output is passed to fully connected layers (a traditional NN) that then generate the output. An image is represented as a matrix of pixel values from 0 to 255, depending on their color and brightness. A convolutional layer filters this matrix by multiplying it with another, smaller matrix, which allows it to extract useful features from the data. These features then go to the fully connected layers, which are in fact regular NNs.

In [None]:
# NOTE(review): TensorFlow is already imported by the time this runs, so this
# setting may come too late to silence its native logging. Confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3' 

model = models.Sequential([
    # Five convolution + max-pooling stages with a growing number of filters.
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(224, 224, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(128, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    # Flatten the convolutional feature maps into a vector for the dense head.
    layers.Flatten(),
    # L2-regularised hidden layer followed by dropout.
    layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.001)),
    layers.Dropout(0.2),
    # A single sigmoid unit for the binary output.
    layers.Dense(1, activation='sigmoid'),
])

model.summary()

In [None]:
# `metrics` is passed as a list: a bare string is not a documented form and is
# rejected by newer Keras versions. The 'acc' key is kept because the training
# history and the EarlyStopping monitor refer to 'acc' / 'val_acc'.
model.compile(optimizer=optimizers.Adam(learning_rate=5e-4),
              loss='binary_crossentropy',
              metrics=['acc'])

# 4. Describe your evaluation metrics.

My evaluation metrics are the traditional set of metrics for classification: the confusion matrix, F1 score, precision, and recall. The confusion matrix gives us the counts of true negatives, true positives, false negatives, and false positives. These values are then used to calculate the other metrics mentioned above.

In [None]:
# Random augmentation is applied to the training images only.
aug_gen = ImageDataGenerator(rescale=1./255, shear_range = 0.2, zoom_range = 0.2,
                               rotation_range=40, width_shift_range=0.2,
                               height_shift_range=0.2, horizontal_flip=True, fill_mode='nearest')

# Validation images are only rescaled. Augmenting them would measure val_acc
# (which drives early stopping) on randomly distorted images, not the real data.
valid_gen = ImageDataGenerator(rescale=1./255)

train_data = aug_gen.flow_from_dataframe(train, directory='/kaggle/working/train',
                                           x_col='image', y_col='label', class_mode='binary', target_size=(224,224))

valid_data = valid_gen.flow_from_dataframe(valid, directory='/kaggle/working/train',
                                           x_col='image', y_col='label', class_mode='binary', target_size=(224,224))

# Stop early when validation accuracy has not improved by at least 0.001 for 5 epochs.
history = model.fit(train_data, validation_data = valid_data, epochs=10,
                   callbacks=[EarlyStopping(monitor='val_acc', min_delta=0.001, patience=5, verbose=1)])

In [None]:
acc = history.history['acc']
valid_acc = history.history['val_acc']

plt.figure(figsize = (16, 9))
plt.plot(acc, label = 'Train accuracy')
plt.plot(valid_acc, '--', label = 'Validation accuracy')
plt.legend()
# One tick per epoch actually trained: early stopping can end training before
# the configured number of epochs, so the count is taken from the history.
plt.xticks(range(len(acc)))
plt.yticks(np.arange(0.7, 0.9, 0.025))
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.title('Train and validation accuracy')
plt.show()

In [None]:
# Persist the trained network.
# NOTE(review): the file name says "gods" (probably meant "dogs"). It is left
# unchanged in case something else loads this exact path.
model.save('model_cats_vs_gods.h5')

# Test images are only rescaled, never augmented.
val_gen = ImageDataGenerator(rescale=1./255)

# shuffle=False keeps the prediction order aligned with test_data.classes.
test_data = val_gen.flow_from_dataframe(
    test,
    directory='/kaggle/working/train',
    x_col='image',
    y_col='label',
    class_mode='binary',
    target_size=(224, 224),
    shuffle=False,
)

test_pred = model.predict(test_data)

### Let's evaluate model

In [None]:
def conf_matrix(y_true, y_pred):
    """Return the binary confusion matrix as a labelled DataFrame.

    Rows are the real classes and columns the predicted classes, with the
    positive class first. The four-way unpacking requires a 2x2 matrix, i.e.
    exactly two classes across `y_true` and `y_pred`.
    """
    counts = confusion_matrix(y_true, y_pred)
    tn, fp, fn, tp = counts.ravel()
    return pd.DataFrame(
        data={'Predicted Positive': [tp, fp], 'Predicted Negative': [fn, tn]},
        index=['Real Positive', 'Real Negative'],
    )

# Threshold the sigmoid outputs at 0.5, then flatten to a 1-D array of 0/1
# integers so the predictions have the same shape and dtype as
# test_data.classes. A thresholded (N, 1) boolean column would otherwise rely
# on sklearn's implicit reshaping.
pred_lbl = (test_pred > .5).astype(int).ravel()
true_lbl = test_data.classes
display(conf_matrix(true_lbl, pred_lbl))
print(classification_report(true_lbl, pred_lbl))

In [None]:
# Evaluate the trained model on the test generator, using the loss and the
# 'acc' metric it was compiled with.
model.evaluate(test_data)

#### Overall accuracy is 93%