# Goal: Predict Pawpularity based on BOTH image and tabular data

### Results: RMSE on the 20% held-out test split:
* Only images  = 21.2 
* Only tabular = 21.1 
* Images & tabular = 21.05

### Retrain on all data ... predict on Kaggle test ... submit

Many thanks to:

* https://www.kaggle.com/drcapa/petfinder-my-starter
* https://www.pyimagesearch.com/2019/02/04/keras-multiple-inputs-and-mixed-data/
* https://stackoverflow.com/questions/55080465/two-parallel-but-different-datasets-in-keras-as-multiple-inputs

In [None]:
import os
import pandas as pd
import numpy as np
import gc
gc.enable()

import warnings
warnings.filterwarnings("ignore")
import cv2
import PIL
import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import tensorflow as tf
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Input, Dense, Flatten, Dropout, Activation, BatchNormalization, concatenate
from tensorflow.keras.models import Model

# Side length, in pixels, that images are resized to; also used as the CNN input size.
imgSize = 128

# Root folder of the competition data. The trailing slash matters because the
# cells below build file names by plain string concatenation (path+'train.csv').
path = '/kaggle/input/petfinder-pawpularity-score/'
os.listdir(path)

# Data

In [None]:
# Read the training table, report its (rows, columns) and show one random row.
train_data = pd.read_csv(os.path.join(path, 'train.csv'))
print(train_data.shape)
train_data.sample(n=1)

In [None]:
# Number of entries in the training image folder.
len(os.listdir(os.path.join(path, 'train')))

In [None]:
# Pick one training sample and derive the name of its image file.
id_ = train_data.loc[1234, 'Id']
file = id_+'.jpg'

# cv2.imread returns None (it does not raise) when the file is missing or
# unreadable, so fail here with a clear message instead of letting cv2.resize
# crash with an obscure error. This replaces the former bare
# `file in os.listdir(...)` expression, whose result was silently discarded.
img = cv2.imread(path+'train/'+file)
if img is None:
    raise FileNotFoundError(path+'train/'+file)
img = cv2.resize(img, (imgSize,imgSize), interpolation = cv2.INTER_AREA)
print('Image shape:', img.shape)

# Convert from BGR to RGB channel order before handing the image to matplotlib.
fig, axs = plt.subplots(1, 1, figsize=(7, 7))
axs.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
axs.set_xticklabels([])
axs.set_yticklabels([])
plt.show()

In [None]:
# Target is the 'Pawpularity' column; the tabular features are every
# remaining column except the 'Id' identifier.
y = train_data['Pawpularity']
X_num = train_data.drop(columns=['Id', 'Pawpularity'])
print(X_num.shape)
y.shape

In [None]:
# Display the tabular feature values stored at row position 1234.
X_num.iloc[1234]

# Only tabular metadata

In [None]:
# Split into Train & Test

# Hold out 20% of the rows; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_num, y,
    test_size=0.2,
    random_state=42,
)
print(f'X_train:  {X_train.shape}')
print(f'X_test:  {X_test.shape}')
print(f'y_train:  {y_train.shape}')
print(f'y_test:  {y_test.shape}')

In [None]:
# Model

def create_mlp(dim, regress=False):
    """Build the tabular-only regressor: Dense(64, relu) -> Dropout -> Dense(1).

    Args:
        dim: Number of input features.
        regress: Not used by the body; accepted for call compatibility.

    Returns:
        An uncompiled Keras Sequential model with one linear output.
    """
    return keras.Sequential([
        Dense(64, input_dim=dim, activation="relu"),
        Dropout(0.5),
        Dense(1, activation="linear"),
    ])

# One input unit per tabular feature column.
mlp = create_mlp(dim=X_train.shape[1], regress=False)
# Train on mean squared error and report RMSE alongside it.
mlp.compile(
    optimizer='Adam',
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)
mlp.summary()

In [None]:
# Fit

# 20% of the training rows are held back by Keras for validation.
history = mlp.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    batch_size=4,
    epochs=20,
)

In [None]:
# Learning curves

# Per-epoch series recorded by fit(): RMSE metric and loss, for the training
# and validation parts.
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('root_mean_squared_error', 'val_root_mean_squared_error',
                'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

# First figure: RMSE per epoch.
plt.title('Training and validation root_mean_squared_error')
plt.plot(epochs, acc, 'bo', label='Training rmse')
plt.plot(epochs, val_acc, 'r', label='Validation rmse')
plt.legend()

# Second figure: loss per epoch.
plt.figure()
plt.title('Training and validation loss')
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.legend()

plt.show()

In [None]:
# Predict on test

# Plain array so the subtraction below is purely positional.
y_test = np.array(y_test)

# predict() returns one column per output; flatten to a 1-D vector.
Preds = mlp.predict(X_test).flatten()
print(Preds.shape)
print(y_test.shape)


# RMSE on test
np.sqrt(np.mean(np.square(Preds - y_test)))

# Only images

In [None]:
%%time

# Resize & Normalize the images

# Fill one pre-allocated float32 array instead of building a Python list and
# converting it afterwards, which would hold two copies of the image data at
# the peak.
X_img = np.empty((len(train_data), imgSize, imgSize, 3), dtype=np.float32)
for i, img_id in enumerate(train_data['Id']):
    img_path = path+'train/'+img_id+'.jpg'
    rawImg = cv2.imread(img_path)
    # cv2.imread returns None instead of raising on a missing/unreadable file.
    if rawImg is None:
        raise FileNotFoundError(img_path)
    image = cv2.resize(rawImg, (imgSize,imgSize), interpolation = cv2.INTER_AREA)
    # Min-max scale each image individually to the range [0, 1] as float32.
    X_img[i] = cv2.normalize(image, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)
    # Progress indicator.
    if i % 1000 == 0:
        print(i)

X_img.shape

In [None]:
# Image from normalized array

# Take the same sample as before out of the normalized image array.
img = cv2.resize(X_img[1234], (imgSize, imgSize), interpolation=cv2.INTER_AREA)
print('Image shape:', img.shape)

# Convert from BGR to RGB channel order before handing the image to matplotlib.
rgb_view = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
fig, axs = plt.subplots(figsize=(7, 7))
axs.imshow(rgb_view)
axs.set_xticklabels([])
axs.set_yticklabels([])
plt.show()

In [None]:
# Split into Train & Test

# Hold out 20% of the images; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_img, y,
    test_size=0.2,
    random_state=42,
)
print(f'X_train:  {X_train.shape}')
print(f'X_test:  {X_test.shape}')
print(f'y_train:  {y_train.shape}')
print(f'y_test:  {y_test.shape}')

In [None]:
def create_cnn(width, height, depth, filters=(16, 32, 64), regress=False):
    """Build a small CNN that regresses one value from an image.

    Args:
        width: Input image width in pixels.
        height: Input image height in pixels.
        depth: Number of image channels.
        filters: Filter count of each successive
            CONV => RELU => BN => POOL stage.
        regress: Not used by the body; accepted for call compatibility.

    Returns:
        An uncompiled Keras ``Model`` with a single linear output unit.
    """
    # Channels-last ordering. The shape is built from the arguments (it was
    # previously hard-coded from the global imgSize, ignoring them).
    inputShape = (height, width, depth)
    chanDim = -1

    # define the model input
    inputs = Input(shape=inputShape)

    # Start from the input so `x` is defined even when `filters` is empty.
    x = inputs
    for f in filters:
        # CONV => RELU => BN => POOL
        x = Conv2D(f, (3, 3), padding="same")(x)
        x = Activation("relu")(x)
        x = BatchNormalization(axis=chanDim)(x)
        x = MaxPooling2D(pool_size=(2, 2))(x)

    # flatten the volume, then FC => RELU => BN => DROPOUT
    x = Flatten()(x)
    x = Dense(16)(x)
    x = Activation("relu")(x)
    x = BatchNormalization(axis=chanDim)(x)
    x = Dropout(0.5)(x)

    # small FC layer followed by the single-unit linear regression output
    x = Dense(4)(x)
    x = Activation("relu")(x)
    x = Dense(1, activation="linear")(x)

    # construct and return the CNN
    return Model(inputs, x)

In [None]:
# Image-only model: square RGB input of side imgSize.
cnn = create_cnn(imgSize, imgSize, 3, regress=False)
cnn.compile('Adam', 'mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
cnn.summary()

In [None]:
# Fit

# 20% of the training images are held back by Keras for validation.
history = cnn.fit(
    x=X_train,
    y=y_train,
    validation_split=0.2,
    batch_size=4,
    epochs=20,
)

In [None]:
# Learning curves

# Per-epoch series recorded by fit(): RMSE metric and loss, for the training
# and validation parts.
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('root_mean_squared_error', 'val_root_mean_squared_error',
                'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

# First figure: RMSE per epoch.
plt.title('Training and validation root_mean_squared_error')
plt.plot(epochs, acc, 'bo', label='Training rmse')
plt.plot(epochs, val_acc, 'r', label='Validation rmse')
plt.legend()

# Second figure: loss per epoch.
plt.figure()
plt.title('Training and validation loss')
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.legend()

plt.show()

In [None]:
# Predict on test

# Plain array so the subtraction below is purely positional.
y_test = np.array(y_test)

# predict() returns one column per output; flatten to a 1-D vector.
Preds = cnn.predict(X_test).flatten()
print(Preds.shape)
print(y_test.shape)

# RMSE on test
np.sqrt(np.mean(np.square(Preds - y_test)))

In [None]:
# Drop the image split arrays and the image-only model, then run the
# garbage collector.
del X_train, X_test, cnn

gc.collect()

# Images AND tabular metadata: two inputs into Keras

In [None]:
# Split into Train & Test

# Split the tabular features, the images and the target in ONE call so all
# three are partitioned with the same shuffled indices. Two separate calls
# stayed row-aligned only because their random_state values happened to match,
# and they produced throw-away duplicate copies of the target.
(X_trainNum, X_testNum,
 X_trainImg, X_testImg,
 y_train, y_test) = train_test_split(X_num, X_img, y, test_size=0.2, random_state=42)

# Tabular
print ('X_trainNum: ', X_trainNum.shape)
print ('X_testNum: ', X_testNum.shape)
print ('y_train: ', y_train.shape)
print ('y_test: ', y_test.shape)

# Images
print ('X_trainImg: ', X_trainImg.shape)
print ('X_testImg: ', X_testImg.shape)


In [None]:
def create_cnnPart(width, height, depth, filters=(16, 32, 64), regress=False):
    """Build the image branch of the two-input model (no regression head).

    Args:
        width: Input image width in pixels.
        height: Input image height in pixels.
        depth: Number of image channels.
        filters: Filter count of each successive
            CONV => RELU => BN => POOL stage.
        regress: Not used by the body; accepted for call compatibility.

    Returns:
        An uncompiled Keras ``Model`` whose output is a 64-unit ReLU layer.
    """
    # Channels-last ordering. The shape is built from the arguments (it was
    # previously hard-coded from the global imgSize, ignoring them).
    inputShape = (height, width, depth)
    chanDim = -1

    # define the model input
    inputs = Input(shape=inputShape)

    # Start from the input so `x` is defined even when `filters` is empty.
    x = inputs
    for f in filters:
        # CONV => RELU => BN => POOL
        x = Conv2D(f, (3, 3), padding="same")(x)
        x = Activation("relu")(x)
        x = BatchNormalization(axis=chanDim)(x)
        x = MaxPooling2D(pool_size=(2, 2))(x)

    # flatten the volume, then FC => RELU => BN => DROPOUT
    x = Flatten()(x)
    x = Dense(16)(x)
    x = Activation("relu")(x)
    x = BatchNormalization(axis=chanDim)(x)
    x = Dropout(0.5)(x)

    # apply another FC layer, this one to match the number of nodes
    # coming out of the MLP
    x = Dense(64)(x)
    x = Activation("relu")(x)

    # construct and return the CNN branch
    return Model(inputs, x)

def create_mlpPart(dim, regress=False):
    """Build the tabular branch of the two-input model (no regression head).

    Args:
        dim: Number of input features.
        regress: Not used by the body; accepted for call compatibility.

    Returns:
        An uncompiled Keras Sequential model: Dense(64, relu) then Dropout(0.5).
    """
    return keras.Sequential([
        Dense(64, input_dim=dim, activation="relu"),
        Dropout(0.5),
    ])

In [None]:
# Model

cnnBranch = create_cnnPart(imgSize, imgSize, 3, regress=False)
mlpBranch = create_mlpPart(X_trainNum.shape[1], regress=False)

# Join the outputs of the MLP and the CNN branch into one feature vector.
combinedInput = concatenate([mlpBranch.output, cnnBranch.output])
# our final FC layer head will have two dense layers, the final one
# being our regression head
x = Dense(10, activation="relu")(combinedInput)
x = Dropout(0.1)(x)
x = Dense(1, activation="linear")(x)
# our final model will accept the tabular data on the MLP input and the
# images on the CNN input, in that order

model = Model(inputs=[mlpBranch.input, cnnBranch.input], outputs=x)

model.compile('Adam', 'mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line.
model.summary()

In [None]:
# Fit

# Inputs are passed in the order the model declares them: tabular, then images.
history = model.fit(
    x=[X_trainNum, X_trainImg],
    y=y_train,
    validation_split=0.2,
    batch_size=4,
    epochs=20,
)

In [None]:
# Learning curves

# Per-epoch series recorded by fit(): RMSE metric and loss, for the training
# and validation parts.
acc, val_acc, loss, val_loss = (
    history.history[key]
    for key in ('root_mean_squared_error', 'val_root_mean_squared_error',
                'loss', 'val_loss')
)
epochs = range(1, len(acc) + 1)

# First figure: RMSE per epoch.
plt.title('Training and validation root_mean_squared_error')
plt.plot(epochs, acc, 'bo', label='Training rmse')
plt.plot(epochs, val_acc, 'r', label='Validation rmse')
plt.legend()

# Second figure: loss per epoch.
plt.figure()
plt.title('Training and validation loss')
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'r', label='Validation loss')
plt.legend()

plt.show()

In [None]:
# Predict on test

# Plain array so the subtraction below is purely positional.
y_test = np.array(y_test)

# predict() returns one column per output; flatten to a 1-D vector.
Preds = model.predict([X_testNum, X_testImg]).flatten()
print(Preds.shape)
print(y_test.shape)

# RMSE on test
np.sqrt(np.mean(np.square(Preds - y_test)))

# Retrain on ALL data, predict on Kaggle test and submit

In [None]:
# Model

# Fresh branches, so nothing learned on the 80% split carries over.
cnnBranch = create_cnnPart(imgSize, imgSize, 3, regress=False)
mlpBranch = create_mlpPart(X_trainNum.shape[1], regress=False)

# Merge both branch outputs, then a two-layer dense head whose last layer is
# the single-unit regression output.
combinedInput = concatenate([mlpBranch.output, cnnBranch.output])
x = Dense(units=10, activation="relu")(combinedInput)
x = Dropout(rate=0.1)(x)
x = Dense(units=1, activation="linear")(x)

# Tabular data goes in on the MLP input, images on the CNN input.
model = Model(inputs=[mlpBranch.input, cnnBranch.input], outputs=x)

model.compile(
    optimizer='Adam',
    loss='mse',
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

# All labelled rows are used for training; no validation part is held back.
history = model.fit(
    x=[X_num, X_img],
    y=y,
    batch_size=4,
    epochs=20,
)

print('model fit')

In [None]:
# Drop the training inputs now that the final model is fitted, then run the
# garbage collector.
del X_num, X_img, X_trainNum, X_trainImg

gc.collect()

# Prep the Kaggle test

In [None]:
# Read the Kaggle test table, report its (rows, columns) and show one random row.
test_data = pd.read_csv(os.path.join(path, 'test.csv'))
print(test_data.shape)
test_data.sample(n=1)

In [None]:
# Read the sample submission file and show one random row of it.
samp_subm = pd.read_csv(os.path.join(path, 'sample_submission.csv'))
samp_subm.sample(n=1)

In [None]:
# Tabular features of the Kaggle test set: every column except the identifier.
X_numKag = test_data.drop(columns=['Id'])
print(X_numKag.shape)

In [None]:
# Resize & Normalize the images

# Same resize and per-image min-max scaling as applied to the training images.
# The result is written straight into one pre-allocated float32 array rather
# than collected in a list and converted afterwards.
X_imgKag = np.empty((len(test_data), imgSize, imgSize, 3), dtype=np.float32)
for i, img_id in enumerate(test_data['Id']):
    img_path = path+'test/'+img_id+'.jpg'
    rawImg = cv2.imread(img_path)
    # cv2.imread returns None instead of raising on a missing/unreadable file.
    if rawImg is None:
        raise FileNotFoundError(img_path)
    image = cv2.resize(rawImg, (imgSize,imgSize), interpolation = cv2.INTER_AREA)
    X_imgKag[i] = cv2.normalize(image, None, alpha=0, beta=1, norm_type=cv2.NORM_MINMAX, dtype=cv2.CV_32F)

X_imgKag.shape

In [None]:
# Predict on Kaggle test

# One prediction per test row, flattened to a 1-D vector.
Preds = model.predict([X_numKag, X_imgKag]).flatten()

# Submission table: the image Id next to its predicted Pawpularity.
mySubmit = pd.DataFrame({'Id': test_data['Id'], 'Pawpularity': Preds})
mySubmit.head()

In [None]:
# Write the submission file without the DataFrame index column.
mySubmit.to_csv('submission.csv', index=False)