# Preface to notebook:

1. **In this notebook I use transfer learning with the VGG-16 architecture**

2. I didn't use ***fine tuning*** because it does not greatly increase the accuracy of the model, but increases the training time of the neural network due to retraining of the convolutional part of VGG
3. Also I didn't use ***augmentation***, because the samples have enough instances of each class, moreover, the classes in the training sample are evenly distributed
4. I could have used the ***deep features*** approach, but I will leave that for next time. This notebook focuses on transfer learning

# Imported libraries

In [None]:
import os
import zipfile
import random
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from keras.preprocessing.image import load_img, ImageDataGenerator
from keras.models import Sequential
from keras.layers import Flatten, Dense, Dropout
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.applications.vgg16 import VGG16, preprocess_input


In [None]:
# Number of images per batch; shared by every data generator and by the
# step-count computations for training, validation and prediction.
batch_size = 64

In [None]:
# Seed every random number generator the notebook relies on, all from the
# single `seed` value, so splits, shuffling and weight initialisation repeat.
seed = 666
random.seed(seed)  # previously a separate hard-coded 666
np.random.seed(seed)
tf.random.set_seed(seed)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here is expected to affect only subprocesses - confirm whether hash
# randomisation of the running kernel matters for this notebook.
os.environ["PYTHONHASHSEED"] = str(seed)

In [None]:
# List the competition input directory (the archives extracted below live here).
os.listdir("../input/dogs-vs-cats/")

# Unzipping folders and images

In [None]:
# Archive locations and the directory everything is extracted into.
TRAIN_PATH = "../input/dogs-vs-cats/train.zip"
TEST_PATH = "../input/dogs-vs-cats/test1.zip"

FILES = "/kaggle/files/unzipped/"

# Extract the train archive first, then the test archive, into the same root.
for archive_path in (TRAIN_PATH, TEST_PATH):
    with zipfile.ZipFile(archive_path, mode="r") as archive:
        archive.extractall(path=FILES)

In [None]:
# Show what was extracted; reuse FILES instead of repeating the literal path.
os.listdir(FILES)

# Pack the data into a dataframe and display it on the screen

Column 'label' has 2 categorical classes

train dataset

In [None]:
# One row per training image. The file names are sorted because os.listdir()
# returns entries in arbitrary order, and the seeded splits further down only
# repeat between runs if the row order is stable.
train_set = pd.DataFrame({"image": sorted(os.listdir(FILES + "train"))})
# The label is the part of the file name before the first dot. It is kept as
# a string because the generators are fed string labels via y_col.
train_set["label"] = train_set["image"].str.split(".").str[0]

train_set

test dataset

In [None]:
# One row per unlabeled test image. Sorted so the row order (and therefore the
# order of the predictions written back later) does not depend on the
# arbitrary order returned by os.listdir().
test_set = pd.DataFrame({"image": sorted(os.listdir(FILES + "test1"))})

test_set

Distribution of classes

In [None]:
# Bar chart with the number of training images per class.
background = "#e5e5e5"
fig, ax = plt.subplots(figsize=(6, 6), facecolor=background)
ax.set_facecolor(background)

sns.countplot(data=train_set, x="label", palette="Set1", ax=ax)
ax.set_title("Distribution of Class Labels")

sns.despine()
plt.show()

In [None]:
# Print how many training images carry each label.
for class_label, plural in (("dog", "dogs"), ("cat", "cats")):
    class_count = (train_set["label"] == class_label).sum()
    print(f"Count of images with {plural} in training sample: ", class_count)

# Print some images from train dataset/sample

In [None]:
# Show up to 36 training images in a 6x6 grid. A seeded random sample is used
# instead of the first 36 rows so the preview does not depend on the row order
# of train_set, and min() keeps the cell working on smaller samples.
preview = train_set.sample(n=min(36, len(train_set)), random_state=seed)

fig = plt.figure(1, figsize=(10, 10))
fig.suptitle("Training sample images ")

for position, file_name in enumerate(preview["image"], start=1):
    plt.subplot(6, 6, position)
    plt.imshow(load_img(FILES + "train/" + file_name))
    plt.axis("off")

plt.tight_layout()
plt.show()

# Splitting training dataset on:
* train_data
* validation_data
* test_data

In [None]:
# Hold out 10% of the labeled images as a test split, stratified by label.
train_val_data, test_data = train_test_split(
    train_set,
    test_size=0.1,
    stratify=train_set["label"],
    random_state=666,
)

nb_test_samples = len(test_data)
print(len(train_val_data), nb_test_samples)
print(f"Size of test set is: {nb_test_samples} values")

In [None]:
# Split the remaining data into train (85%) and validation (15%), stratified
# by label.
train_data, val_data = train_test_split(
    train_val_data,
    test_size=0.15,
    stratify=train_val_data["label"],
    random_state=666,
)

nb_train_samples = train_data.shape[0]
nb_validation_samples = val_data.shape[0]
print(nb_train_samples, nb_validation_samples)
# Fixed: the message previously reported nb_test_samples as the train set size.
print('Size of train set is: {} values\nSize of validation set is: {} values'.format(nb_train_samples, nb_validation_samples))

# Creating generators for:
1. train_set
* train_generator
* val_generator
* test_generator
2. test_set
* test_set_generator

The image generator is created from the `ImageDataGenerator` class. For every image it applies VGG16's `preprocess_input` (conversion to BGR format with an offset) and divides the pixel values by 255.

In [None]:
# Single generator configuration shared by all four data flows below.
# NOTE(review): VGG16's preprocess_input is usually applied without an extra
# 1/255 rescale - confirm that combining the two is intended. The single-image
# cell near the end of the notebook applies the same two steps by hand.
datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    rescale=1. / 255,
)

In [None]:
# Batches of (image, label) pairs for training, reshuffled with a fixed seed.
train_generator = datagen.flow_from_dataframe(
    dataframe=train_data,
    directory=FILES + "train/",
    x_col="image",
    y_col="label",
    class_mode="binary",
    target_size=(224, 224),
    batch_size=batch_size,
    shuffle=True,
    seed=666,
)

In [None]:
# Batches of (image, label) pairs for validation during training.
val_generator = datagen.flow_from_dataframe(
    dataframe=val_data,
    directory=FILES + "train/",
    x_col="image",
    y_col="label",
    class_mode="binary",
    target_size=(224, 224),
    batch_size=batch_size,
    shuffle=True,
    seed=666,
)

In [None]:
# Batches for the held-out labeled split. shuffle=False keeps the batches in
# the row order of test_data, which the prediction cell relies on when it
# writes the predictions back into that frame.
test_generator = datagen.flow_from_dataframe(
    dataframe=test_data,
    directory=FILES + "train/",
    x_col="image",
    y_col="label",
    class_mode="binary",
    target_size=(224, 224),
    batch_size=batch_size,
    shuffle=False,
    seed=666,
)

In [None]:
# Image-only batches for the unlabeled test1 images (class_mode=None, no
# y_col). shuffle=False keeps them in the row order of test_set.
test_set_generator = datagen.flow_from_dataframe(
    dataframe=test_set,
    directory=FILES + "test1/",
    x_col="image",
    class_mode=None,
    target_size=(224, 224),
    batch_size=batch_size,
    shuffle=False,
    seed=666,
)

# Loading the pre-trained neural network VGG-16 without classification part

In [None]:
# Convolutional base of VGG16 with ImageNet weights and no classifier head.
vgg16_net = VGG16(
    include_top=False,
    weights="imagenet",
    input_shape=(224, 224, 3),
)

"Freeze" the weights of the pre-trained neural network VGG16

In [None]:
# Freeze the pre-trained base so only the new classifier head is trained.
vgg16_net.trainable = False

Trainable params = 0

In [None]:
# Print the layer table of the frozen base.
vgg16_net.summary()

# Function to create a composite neural network based on VGG16

In [None]:
def pretrained_model():
    """Build the cat/dog classifier on top of the module-level ``vgg16_net``.

    The network is ``vgg16_net`` followed by Flatten, a 256-unit ReLU dense
    layer, 50% dropout and a single sigmoid output unit. The returned model
    is not compiled. ``vgg16_net`` is reused as-is rather than copied, so
    every model built by this function shares the same base object.

    Returns:
        The assembled ``Sequential`` model.
    """
    classifier = Sequential()
    classifier.add(vgg16_net)
    classifier.add(Flatten())
    classifier.add(Dense(256, activation="relu"))
    classifier.add(Dropout(0.5))
    classifier.add(Dense(1, activation="sigmoid"))
    return classifier

In [None]:
# Instantiate the composite network (VGG16 base + new classifier head).
model = pretrained_model()

In [None]:
# Binary cross-entropy matches the single sigmoid output; Adam runs with a
# learning rate of 1e-5.
adam = keras.optimizers.Adam(learning_rate=1e-5)
model.compile(
    optimizer=adam,
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

Trainable params = 6 423 041

In [None]:
# Print the layer table of the composite model.
model.summary()

# Creating callbacks

In [None]:
# All three callbacks watch validation accuracy, which should be maximised.
# mode="max" is now spelled out on each of them (previously only on
# early_stopping) so none of them depends on the direction being inferred
# from the metric name.

# Halve the learning rate after 2 epochs without improvement.
reduce_lr = ReduceLROnPlateau(
    monitor = "val_accuracy",
    mode = "max",
    patience = 2,
    verbose = 1,
    factor = 0.5,
    min_lr = 0.000000001
)

# Stop training after 5 epochs without improvement.
early_stopping = EarlyStopping(
    monitor = "val_accuracy",
    patience = 5,
    verbose = 1,
    mode = "max",
)

# Save the weights (weights only) each time validation accuracy improves; the
# epoch number and the accuracy are embedded in the file name.
checkpoint = ModelCheckpoint(
    monitor = "val_accuracy",
    mode = "max",
    filepath = "catdog_vgg16_.{epoch:02d}-{val_accuracy:.6f}.hdf5",
    verbose = 1,
    save_best_only = True,
    save_weights_only = True
)

Just my exploration of the Kaggle notebook environment

In [None]:
#!ls -alh /tmp 

#!whoami

#!ls /kaggle/files #same
#os.listdir("/kaggle/files") #same

#!pwd

# Train the model using generators

In [None]:
# Whole batches per epoch; integer division leaves out the final partial batch.
train_steps = nb_train_samples // batch_size
val_steps = nb_validation_samples // batch_size

# Train for at most 10 epochs; the callbacks may lower the learning rate,
# save weights, or stop the run early.
history = model.fit(
    train_generator,
    epochs=10,
    steps_per_epoch=train_steps,
    validation_data=val_generator,
    validation_steps=val_steps,
    callbacks=[reduce_lr, early_stopping, checkpoint],
)

In [None]:
# tf.keras.backend.clear_session()

# model = pretrained_model()

# model.load_weights("./catdog_vgg16_.10-0.944111.hdf5")

# Plotting losses and accuracy for validation sample

In [None]:
# Training vs. validation curves: loss on the left panel, accuracy on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

recorded = history.history
panels = zip(axes, ("loss", "accuracy"), ("Loss", "Accuracy"))

for panel, metric, title in panels:
    epoch_axis = range(len(recorded[metric]))
    sns.lineplot(x=epoch_axis, y=recorded[metric], ax=panel, label=f"Training {title}")
    sns.lineplot(x=epoch_axis, y=recorded[f"val_{metric}"], ax=panel, label=f"Validation {title}")
    panel.set_title(title)

sns.despine()
plt.show()

# Evaluating accuracy on the held-out test part of our training sample

In [None]:
# Evaluate on the whole held-out split. Model.evaluate accepts the generator
# directly and runs over all of its batches, so the last partial batch is no
# longer dropped (nb_test_samples // batch_size skipped it), and the
# deprecated evaluate_generator is no longer needed.
scores = model.evaluate(test_generator)
# scores is [loss, accuracy], following the metrics passed to compile().
print("Accuracy on test data: %.2f%%" % (scores[1]*100))

# Predict results for testing part

In [None]:
# Predict the held-out split in generator order (test_generator does not
# shuffle), starting from its first batch.
test_generator.reset()
test_data_pred = model.predict(
    test_generator,
    steps=int(np.ceil(nb_test_samples / batch_size)),  # steps must be an int
)

# Turn sigmoid outputs into class names using the mapping the training
# generator actually used, instead of assuming 0 -> cat and 1 -> dog.
index_to_label = {index: label for label, index in train_generator.class_indices.items()}
predicted_indices = np.around(test_data_pred).astype(int).ravel()  # (N, 1) -> (N,)
assert len(predicted_indices) == len(test_data), "one prediction per test row expected"

# assign() returns a new frame, so no value is written into a slice of
# train_set, and re-running the cell gives the same result.
test_data = test_data.assign(
    test_data_pred=[index_to_label[index] for index in predicted_indices]
)
print(test_data.head(30))

# Plotting confusion matrix of predicted values for testing part of training sample

In [None]:
# Confusion matrix of true vs. predicted labels on the held-out split.
class_names = ["cat", "dog"]
cm = confusion_matrix(test_data["label"], test_data["test_data_pred"])

fig, ax = plt.subplots(figsize=(9, 6))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(ax=ax, cmap="YlOrBr")

ax.set_title("Test data Set")
plt.xlabel("Predicted values")
plt.ylabel("True values")
plt.show()

# Evaluate the quality of training using other metrics

In [None]:
from sklearn.metrics import classification_report

# Per-class precision, recall and F1 on the held-out split.
report_text = classification_report(
    y_true=test_data["label"],
    y_pred=test_data["test_data_pred"],
)
print(report_text)

# Let's evaluate and see the results where our model is wrong

In [None]:
# Rows of the held-out split where the prediction differs from the true label.
is_misclassified = test_data["label"].ne(test_data["test_data_pred"])
test_data_errors = test_data.loc[is_misclassified].reset_index(drop=True)
test_data_errors

In [None]:
# Show up to five misclassified images with their true and predicted labels.
# head(5) keeps the cell working when the model makes fewer than five
# mistakes (range(5) raised a KeyError in that case).
shown_errors = test_data_errors.head(5)

fig = plt.figure(1, figsize = (15, 5))

for position, (_, error_row) in enumerate(shown_errors.iterrows(), start=1):
    plt.subplot(1, 5, position)
    # Build the path from FILES so it follows the extraction directory.
    plt.imshow(load_img(FILES + "train/" + error_row["image"]))
    plt.axis("off")
    plt.title(f"True Value: {error_row['label']} \nPrediction: {error_row['test_data_pred']}")

plt.tight_layout()
plt.show()

# Let's use the trained NN on the test sample

In [None]:
# Predict the unlabeled test1 images in generator order (no shuffling),
# starting from the first batch.
test_set_generator.reset()
test_preds = model.predict(
    test_set_generator,
    steps=int(np.ceil(test_set.shape[0] / batch_size)),  # steps must be an int
)

# Map rounded sigmoid outputs to class names with the mapping used for
# training rather than a hard-coded {1: 'dog', 0: 'cat'}.
index_to_label = {index: label for label, index in train_generator.class_indices.items()}
predicted_indices = np.around(test_preds).astype(int).ravel()  # (N, 1) -> (N,)
assert len(predicted_indices) == len(test_set), "one prediction per test image expected"

test_set["test_set_preds"] = [index_to_label[index] for index in predicted_indices]

# Here we can see how our network performed on test sample

In [None]:
# Draw nine random test images and show each with its predicted class.
sample_test = test_set.sample(9).reset_index(drop=True)

fig = plt.figure(1, figsize=(10, 10))
fig.suptitle("Sample Predictions")

for slot, picked in enumerate(sample_test.itertuples(index=False), start=1):
    plt.subplot(3, 3, slot)
    plt.imshow(load_img("/kaggle/files/unzipped/test1/" + picked.image))
    plt.axis("off")
    plt.title(f"Predicted as {picked.test_set_preds}")

plt.tight_layout()
plt.show()

# Let's check how our network will cope with a random picture from the Internet

I convert the image to the same input format the network was trained on

In [None]:
import io
import PIL.Image  # import the submodule explicitly; a bare `import PIL` does not guarantee PIL.Image is loaded
import requests

# Download the picture. The timeout keeps the cell from hanging on a dead
# host, and raise_for_status() fails loudly on an HTTP error instead of
# handing an error page to the image decoder.
image_response = requests.get('https://r4.mt.ru/r4/photo09F4/20944082743-0/png/bp.webp', timeout=30)
image_response.raise_for_status()

# Force three RGB channels: the network takes (224, 224, 3) input, and a
# downloaded image may carry an alpha channel or a palette.
pil_im = PIL.Image.open(io.BytesIO(image_response.content)).convert("RGB").resize((224,224))
plt.axis("off")
plt.imshow(pil_im)

# Same two steps the data generators are configured with: preprocess_input
# plus a division by 255. Then add the batch dimension.
img = np.array(pil_im)
x = preprocess_input(img)
x = x/255
x = np.expand_dims(x, axis=0)
x.shape

In [None]:
res = model.predict(x)

# res has one row with one sigmoid output; take it as a plain float instead of
# comparing the whole array inside `if`.
dog_probability = float(res[0][0])
if dog_probability >= 0.5:
    res1='dog'
    confidence = dog_probability
else:
    res1='cat'
    # The output is the probability of the class encoded as 1 ('dog' in this
    # notebook's label mapping), so confidence in 'cat' is its complement.
    # Previously a cat prediction was shown with a near-zero "confidence".
    confidence = 1.0 - dog_probability

plt.title(f"Predicted as: {res1}, model confidence: {confidence:.4f}")
plt.axis("off")
plt.imshow(pil_im)

# Resources that helped me understand the basics of convolutional neural networks

eng:
1. https://www.kaggle.com/code/mustafacicek/dogs-cats-vgg16-implementation-transfer-learning#5)-Pre-Trained-VGG-16-Model-&-Transfer-Learning
2. https://keras.io/ (really a lot of pages, so summarized to the main page)
3. https://towardsdatascience.com/illustrated-10-cnn-architectures-95d78ace614d#c5a6
4. https://www.kaggle.com/code/rajmehra03/a-comprehensive-guide-to-transfer-learning/notebook
5. https://www.kaggle.com/code/uysimty/keras-cnn-dog-or-cat-classification/notebook
6. https://www.kaggle.com/code/mustafacicek/mnist-cnn-data-augmentation/notebook
7. https://scikit-learn.org/stable/modules/cross_validation.html#stratification
8. https://www.kaggle.com/code/lbronchal/keras-gpu-cpu-reproducibility-test/notebook#Conclusion (seeds)
9. https://towardsdatascience.com/understanding-8-types-of-cross-validation-80c935a4976d (remembered about cross validation)

rus:
1. https://www.asozykin.ru/courses/nnpython
2. https://proproprogs.ru/data
3. https://ru-keras.com/