# EDA

See [HERE](https://www.kaggle.com/samuelcortinhas/pawpularity-eda-rf-model) for my first notebook, where I carried out the EDA and trained a simple random forest (RF) model on the metadata. Here we try to improve on that score with an image-based model that relies on transfer learning.

# Libraries

In [None]:
# Core
import os
import pandas as pd
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')
from pathlib import Path
import time
import math

# Images
import cv2

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Tensorflow
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import callbacks
from keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers.experimental import preprocessing
from tensorflow.keras.regularizers import l2
from tensorflow.python.client import device_lib

# Data

In [None]:
# Root folder of the competition data
path = '../input/petfinder-pawpularity-score/'

# Load the train and test metadata tables as data frames
train_df, test_df = (
    pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv')
)

# Report the dimensions of both tables
print('train_df dimensions: ', train_df.shape)
print('test_df dimensions: ', test_df.shape)

# Preview training metadata
train_df.head()

# Model using images

In [None]:
# Given Id return full image path
def train_id_to_path(x):
    return '../input/petfinder-pawpularity-score/train/' + x + ".jpg"
def test_id_to_path(x):
    return '../input/petfinder-pawpularity-score/test/' + x + ".jpg"

# Metadata columns that the image-only model does not use
metadata_cols = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
                 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']

# Drop metadata.
# errors='ignore' keeps this cell re-runnable: train_df/test_df are overwritten
# here, so a second execution would otherwise raise KeyError because the
# columns are already gone.
train_df = train_df.drop(columns=metadata_cols, errors='ignore')
test_df = test_df.drop(columns=metadata_cols, errors='ignore')

# Build the full .jpg path of every image from its Id
train_df["img_path"] = train_df["Id"].apply(train_id_to_path)
test_df["img_path"] = test_df["Id"].apply(test_id_to_path)

train_df.head()

**Method of binning**

In [None]:
# I haven't got this to work successfully, so the experiment is disabled:
# everything between the triple quotes is a plain string literal and is never
# executed as code.
# NOTE(review): the snippet inside the string is also incomplete - its last
# line is missing the closing "})" - so it cannot simply be un-quoted.
'''
# Binning columns (turn regression problem into classification problem)
train_df['two_bin_pawp'] = pd.qcut(train_df['Pawpularity'], q=2, labels=False)
train_df = train_df.astype({"two_bin_pawp": str})

# qcut is a quantile-based discretization function
train_df['five_bin_pawp'] = pd.qcut(train_df['Pawpularity'], q=5, labels=False)
train_df = train_df.astype({"five_bin_pawp": str})

train_df['ten_bin_pawp'] = pd.qcut(train_df['Pawpularity'], q=10, labels=False)
train_df = train_df.astype({"ten_bin_pawp": str
'''

In [None]:
# Disabled experiment (kept as a string literal, never executed): it would
# replace the target y with a one-hot encoding of the 'ten_bin_pawp' column.
# NOTE(review): 'ten_bin_pawp' is only created inside the disabled binning
# string, and y is first assigned further down in the visible notebook, so
# `del y` would raise NameError on a fresh top-to-bottom run - confirm before
# re-enabling.
'''
# Choose number of bins
num_bins=10

# Delete y
del y

# Make the bins the target
y=train_df['ten_bin_pawp']

# One-hot encoding
y=pd.get_dummies(y)

# Preview target
y.head()
'''

**Use groupby and describe to find group means**

In [None]:
# Disabled example (string literal, never executed): summary statistics per
# bin. It relies on the 'five_bin_pawp' column, which is only created inside
# the disabled binning string.
'''
# For example
train_df.groupby('five_bin_pawp').describe()
'''

**Image pre-processing**

In [None]:
# Target size (in pixels) that every image is resized to
image_height = 224
image_width = 224


def path_to_eagertensor(image_path):
    """Load a JPEG file as a float32 RGB tensor resized to the target size.

    Args:
        image_path: Path of the .jpg file to read.

    Returns:
        A tensor of shape (image_height, image_width, 3): the decoded image
        cast to float32, divided by 255 and then resized.
    """
    # Decode the raw bytes into a 3-channel image
    decoded = tf.image.decode_jpeg(tf.io.read_file(image_path), channels=3)

    # Change type and scale the pixel values to lie in [0,1]
    # NOTE(review): Keras' stock EfficientNet models rescale their inputs
    # internally and expect [0, 255] - confirm what the loaded .h5 expects.
    scaled = tf.cast(decoded, tf.float32) / 255.0

    return tf.image.resize(scaled, (image_height, image_width))

**Visualise image pre-processing**

In [None]:
# Load the first training image at its original size and report its shape
og_example_image = plt.imread(train_df.loc[0, 'img_path'])
print(og_example_image.shape)

# Draw the image on an explicit Axes, hiding the axes and grid
fig, ax = plt.subplots()
ax.imshow(og_example_image)
ax.set_title('First Training Image')
ax.axis('off')
plt.show()

In [None]:
# Run the pre-processing function on the first training image
example_image = path_to_eagertensor(train_df.loc[0, 'img_path'])

In [None]:
# Report the type and shape of the pre-processed image
print('type: ', type(example_image),'\n shape: ',example_image.shape)

# Draw the pre-processed image on an explicit Axes, hiding the axes and grid
fig, ax = plt.subplots()
ax.imshow(example_image)
ax.set_title('First Training Image - with preprocessing')
ax.axis('off')
plt.show()

**Apply pre-processing to training and test sets**

In [None]:
# Keep only the first 30% of the training rows (to save memory)
small_train_df = train_df.head(round(len(train_df) * 0.3))
small_train_df.shape

In [None]:
# Pre-process every image of the reduced training set and stack the tensors
# into a single numpy array
X = np.array([path_to_eagertensor(img_path)
              for img_path in small_train_df['img_path']])

# Print type and shape
print(type(X),X.shape)

In [None]:
# Pre-process every test image and stack the tensors into a single numpy array
X_submission = np.array([path_to_eagertensor(img_path)
                         for img_path in test_df['img_path']])

# Print type and shape
print(type(X_submission),X_submission.shape)

**Train-validation split**

In [None]:
# Labels
y = train_df['Pawpularity']

# Train/validation split (90% / 10%).
# X was built from the leading rows of train_df only, so take exactly the
# first len(X) labels. Tying the slice to X itself (instead of recomputing the
# 30% fraction a second time) keeps images and labels aligned if the subset
# size is ever changed.
X_train, X_valid, y_train, y_valid = train_test_split(X, y.iloc[:len(X)],
                                                      train_size=0.9, test_size=0.1, random_state=0)

**Transfer learning**

In [None]:
# Location of the saved pre-trained EfficientNetB0 model
EfficientNet_path='../input/keras-applications-models/EfficientNetB0.h5'

# Load the saved model (it needs an input shape of (224,224,3)) and freeze it
# so that its weights are not updated during training
efficient_net = keras.models.load_model(EfficientNet_path)
efficient_net.trainable = False

**Data augmentation**

In [None]:
from keras.preprocessing.image import ImageDataGenerator

# Random augmentation, applied to the training images only
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Training generator: augmented and shuffled batches
training_generator = datagen.flow(X_train, y_train,
                                  batch_size=64,seed=0)

# Validation generator: a generator with no transformations, and no shuffling.
# Validation images must not be augmented, otherwise the validation metrics
# are noisy and are measured on a different kind of input than the
# un-augmented images the model later predicts on.
validation_datagen = ImageDataGenerator()
validation_generator = validation_datagen.flow(X_valid, y_valid,
                                               batch_size=64,shuffle=False)

**NN model**

In [None]:
# Define model: frozen pre-trained base followed by a small dense head
model = keras.Sequential()

# Input layer
model.add(layers.Input(shape=(image_height, image_width, 3)))

# Pretrained base
model.add(efficient_net)

# Dense layer
model.add(layers.Flatten())
model.add(layers.Dense(units=256, activation='relu'))
model.add(layers.Dropout(rate=0.4))

# Output layer: a single unit for the regression target (the binned variant
# would use `num_bins` softmax units with categorical cross-entropy instead)
# NOTE(review): a ReLU output unit can get stuck at 0 for a regression target;
# consider a linear output - confirm.
model.add(layers.Dense(units=1, activation='relu'))

# Define optimizer, loss function and the reported metrics
model.compile(
    loss='mse',
    optimizer='adam',
    metrics=[keras.metrics.RootMeanSquaredError(name="rmse"), "mae", "mape"],
)

# Early stopping criteria
early_stopping = callbacks.EarlyStopping(
    restore_best_weights=True,
    min_delta=0.0001,
    patience=10,
)

**Train model**

In [None]:
# Train model
# steps_per_epoch is taken from the generator itself, so it always agrees with
# the batch size given to datagen.flow and includes the final partial batch
# (len(X_train)//64 dropped that batch and would be 0 for fewer than 64
# training samples).
history = model.fit(training_generator,
    validation_data=validation_generator,
    epochs=15,
    steps_per_epoch=len(training_generator),
    #callbacks=[early_stopping],
    verbose=True
)

**Learning curves**

In [None]:
# Collect the per-epoch metrics recorded during training
history_df = pd.DataFrame(history.history)

# Plot training and validation RMSE from row 1 onwards (row 0, the first
# epoch, is left out) and fix the y-range to [20, 23]
history_df.loc[1:, ['rmse', 'val_rmse']].plot(title='RMSE').set_ylim([20,23])

**Model summary**

In [None]:
# Print the Keras summary of the assembled model
model.summary()

# Make predictions

In [None]:
# Predict on the submission data
preds = model.predict(X_submission)

# Put predictions alongside their corresponding Ids.
# The model ends in a single output unit, so preds is a 2-D (n, 1) array;
# flatten it explicitly so the column is built from 1-D data instead of
# relying on pandas accepting a 2-D array for a single column.
sub_df = pd.DataFrame({
    'Id': test_df['Id'],
    'Pawpularity': np.ravel(preds),
})
sub_df.to_csv('submission.csv',index=False)

sub_df.head()