# Goal: Compare several pre-trained Keras image analysis models

### Based on data from
https://www.kaggle.com/c/petfinder-pawpularity-score

### Simple model
* conv_base from https://keras.io/api/applications/
* conv_base (not trainable) -> Flatten -> BatchNormalization -> Dense

### Results after 10 epochs (RMSE on a 20% held-out test split, not a Kaggle submission):
* EfficientNetB0 = 21.24
* EfficientNetB1 = 21.43
* EfficientNetB2 = **21.03**
* EfficientNetB3 = 21.05
* EfficientNetB4 = 21.20
* EfficientNetB5 = 21.04
* EfficientNetB6 = 21.10
* EfficientNetB7 = 21.15 

### The following models overfit from epoch 1, even with Dropout
* ResNet50
* VGG16
* InceptionResNetV2
* MobileNet
* DenseNet121

I guess these networks are too big for the amount of data. Most probably, the train generator needs image augmentation to prevent overfitting.



In [None]:
import os
import pandas as pd
import numpy as np
import gc
gc.enable()

import warnings
warnings.filterwarnings("ignore")
import cv2
import PIL
import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import tensorflow as tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, regularizers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Input, Dense, Flatten, Dropout, Activation, BatchNormalization, concatenate
from tensorflow.keras.models import Sequential

# Root folder of the competition data. The trailing slash is required because
# the other cells build file names by plain string concatenation.
path = '/kaggle/input/petfinder-pawpularity-score/'

# Side length, in pixels, of the square images used throughout the notebook.
imgSize = 299

# List the contents of the data folder (shown as the cell output).
os.listdir(path)

# Data

In [None]:
# Read the training table from train.csv and report its dimensions.
train_csv = path + 'train.csv'
train_data = pd.read_csv(train_csv)
print(train_data.shape)

# Peek at one random row (shown as the cell output).
train_data.sample(1)

In [None]:
# Number of entries in the train image folder (shown as the cell output).
train_dir_entries = os.listdir(path + 'train')
len(train_dir_entries)

In [None]:
# Pick one training row and derive the name of its image file.
id_ = train_data.loc[1234, 'Id']
file = id_ + '.jpg'

# Read the picture and shrink it to the square size used by the network.
img = cv2.resize(
    cv2.imread(path + 'train/' + file),
    (imgSize, imgSize),
    interpolation=cv2.INTER_AREA,
)
print('Image shape:', img.shape)

# Display it without tick labels. The array is converted from BGR to RGB
# before being handed to matplotlib.
fig, axs = plt.subplots(1, 1, figsize=(7, 7))
rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
axs.imshow(rgb_img)
axs.set_xticklabels([])
axs.set_yticklabels([])
plt.show()

# Keras image generator

In [None]:
# Keep only the image id and the Pawpularity score; the other columns of
# train.csv are not used by the image generators below.
wanted_columns = ['Id', 'Pawpularity']
train_data = train_data[wanted_columns]

# Peek at one random row (shown as the cell output).
train_data.sample(1)

In [None]:
# Build the full path of every training image: <path>train/<Id>.jpg.
# Iterating over the column values (rather than over range(n) with
# .loc[j, 'Id']) does not depend on the dataframe having a default 0..n-1
# index and avoids a slow label lookup per row.
FileLoc = [path + 'train/' + image_id + '.jpg' for image_id in train_data['Id']]

# Store the paths next to the labels; the Keras generators read this column.
train_data['FileLoc'] = FileLoc

# Peek at one random row (shown as the cell output).
train_data.sample(1)

In [None]:
# Inputs are the image paths, targets are the Pawpularity scores.
data = train_data['FileLoc']
labels = train_data['Pawpularity']

# First hold out 20% of all rows as the TEST subset ...
X_trainV, X_test, y_trainV, y_test = train_test_split(
    data, labels, test_size=0.2, random_state=42
)

# ... then hold out 20% of the remainder as the VALIDATION subset.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_trainV, y_trainV, test_size=0.2, random_state=42
)

# Re-assemble (path, label) dataframes for the Keras generators:
# test, train and valid.
TestDF = pd.concat([X_test, y_test], axis=1)
TrainDF = pd.concat([X_train, y_train], axis=1)
ValidDF = pd.concat([X_valid, y_valid], axis=1)

for split_name, split_frame in (('Test ', TestDF), ('Train ', TrainDF), ('Valid ', ValidDF)):
    print(split_name, split_frame.shape)


In [None]:
%%time

batch_size=8

datagen=ImageDataGenerator(rescale=1./255)

trainGen = datagen.flow_from_dataframe(
    TrainDF,
    x_col="FileLoc",
    y_col='Pawpularity',
    target_size=(imgSize, imgSize),
    color_mode="rgb",
    class_mode="raw",
    batch_size=batch_size,
    shuffle=True,
    seed=42,
    interpolation="nearest",
    #crop_to_aspect_ratio=True,
    validate_filenames=True
)

testGen = datagen.flow_from_dataframe(
    TestDF,
    x_col="FileLoc",
    y_col='Pawpularity',
    target_size=(imgSize, imgSize),
    color_mode="rgb",
    class_mode="raw",
    batch_size=batch_size,
    shuffle=True,
    seed=42,
    interpolation="nearest",
    #crop_to_aspect_ratio=True,
    validate_filenames=True
)

validGen = datagen.flow_from_dataframe(
    ValidDF,
    x_col="FileLoc",
    y_col='Pawpularity',
    target_size=(imgSize, imgSize),
    color_mode="rgb",
    class_mode="raw",
    batch_size=batch_size,
    shuffle=True,
    seed=42,
    interpolation="nearest",
    #crop_to_aspect_ratio=True,
    validate_filenames=True
)


# Model

In [None]:
# Load the pre-trained convolutional base: EfficientNetB2 with ImageNet
# weights and without its classification head (include_top=False).
input_shape = (imgSize, imgSize, 3)

conv_base = tf.keras.applications.EfficientNetB2(
    weights="imagenet",
    include_top=False,
    input_shape=input_shape,
)

In [None]:
# Freeze every layer of the pre-trained base so that only the new head is
# trained.
for base_layer in conv_base.layers:
    base_layer.trainable = False

# Regression head: flatten the base output, batch-normalise it, then go
# through a 4-unit ReLU layer down to a single linear output.
model = Sequential([
    conv_base,
    layers.Flatten(),
    layers.BatchNormalization(),
    Dense(4, activation='relu'),
    Dense(1, activation='linear'),
])

# Optimise mean squared error and report RMSE alongside it.
rmse_metric = tf.keras.metrics.RootMeanSquaredError()
model.compile(optimizer='Adam', loss="mse", metrics=[rmse_metric])
model.summary()

In [None]:
# Fit

# Number of full batches per epoch (a trailing partial batch is not counted).
STEP_SIZE_TRAIN=trainGen.n//trainGen.batch_size
STEP_SIZE_VALID=validGen.n//validGen.batch_size

# `Model.fit` accepts generators directly. `Model.fit_generator` is
# deprecated and no longer available in recent TensorFlow/Keras releases.
history = model.fit(
    trainGen,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=validGen,
    validation_steps=STEP_SIZE_VALID,
    epochs=10,
)

In [None]:
# Learning curves

curves = history.history
epochs = range(1, len(curves['root_mean_squared_error']) + 1)

# First figure: RMSE per epoch, training (dots) against validation (line).
plt.plot(epochs, curves['root_mean_squared_error'], 'bo', label='Training rmse')
plt.plot(epochs, curves['val_root_mean_squared_error'], 'r', label='Validation rmse')
plt.title('Training and validation RMSE')
plt.legend()

# Second figure: the loss per epoch, drawn the same way.
plt.figure()
plt.plot(epochs, curves['loss'], 'bo', label='Training loss')
plt.plot(epochs, curves['val_loss'], 'r', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()

plt.show()

In [None]:
# Predict on test

y_test = np.array(y_test)

# Walk the generator batch by batch and keep each batch's labels together
# with its predictions. This pairs every prediction with the label that was
# actually served for that image, whatever order the generator uses, so the
# RMSE below is not computed against misaligned labels. It also replaces the
# deprecated `Model.predict_generator`.
batch_preds = []
batch_labels = []
for batch_idx in range(len(testGen)):
    batch_images, batch_targets = testGen[batch_idx]
    batch_preds.append(np.asarray(model.predict_on_batch(batch_images)))
    batch_labels.append(np.asarray(batch_targets))

Preds = np.concatenate(batch_preds).flatten()
y_true = np.concatenate(batch_labels).astype(float).flatten()
print(Preds.shape)
print(y_test.shape)

# RMSE on test (shown as the cell output)
np.sqrt(np.mean((Preds - y_true) ** 2))