In [None]:
import os
from tqdm import tqdm

import pandas as pd
import numpy as np
import imagehash
from PIL import Image

# import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedShuffleSplit

import tensorflow as tf
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Dense, Flatten, Input, Concatenate, Dropout, Conv2D, MaxPooling2D, GlobalAveragePooling2D
# from tensorflow.keras.applications.resnet50 import ResNet50
# from tensorflow.keras.models import load_model

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up
# front. Iterating over the device list (rather than indexing gpus[0]) keeps
# this cell runnable on CPU-only machines, where the list is empty, and applies
# the same setting to every visible GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

import cv2


def getImagePath(imgId, dataType):
    """Build the path of the JPEG file belonging to ``imgId``.

    Args:
        imgId: Image identifier; ``'.jpg'`` is appended to form the file name.
        dataType: ``'train'`` or ``'test'``, selecting the image directory.

    Returns:
        The joined path, or ``None`` when ``dataType`` is neither
        ``'train'`` nor ``'test'``.
    """
    knownSplits = (
        ('train', "../input/petfinder-pawpularity-score/train"),
        ('test', "../input/petfinder-pawpularity-score/test"),
    )
    for splitName, splitDir in knownSplits:
        if dataType == splitName:
            return os.path.join(splitDir, imgId + '.jpg')
    return None

In [None]:
# from keras_tuner.tuners import RandomSearch
# from keras_tuner.engine.hyperparameters import HyperParameters
# from keras_tuner import Objective
# import time

# LOG_DIR = f"{int(time.time())}"

In [None]:
trainCSV = pd.read_csv("../input/petfinder-pawpularity-score/train.csv")
testCSV = pd.read_csv("../input/petfinder-pawpularity-score/test.csv")

trainImgDir = "../input/petfinder-pawpularity-score/train"

# Group the training images by perceptual hash. The first file seen for a hash
# is kept; every later file with the same hash is recorded as its duplicate.
# The listing is sorted so that "first seen" is the same file on every run
# (os.listdir makes no ordering guarantee).
simImgDict = {}
for imgName in tqdm(sorted(os.listdir(trainImgDir))):
    # The context manager releases the file handle once the hash is computed.
    with Image.open(os.path.join(trainImgDir, imgName)) as img:
        imgHash = imagehash.phash(img)
    if imgHash not in simImgDict:
        simImgDict[imgHash] = [imgName, []]
    else:
        simImgDict[imgHash][1].append(imgName)

# Ids of the duplicate images: the file name with its extension stripped.
rowsToRemove = []
for keptName, duplicateNames in simImgDict.values():
    rowsToRemove.extend(os.path.splitext(dupName)[0] for dupName in duplicateNames)

print("Len of train images before deduplication: {}".format(len(trainCSV)))
# Remove all duplicate rows in a single pass and renumber the index from 0 so
# that positional indices can be used for the split that follows.
trainCSV = trainCSV[~trainCSV["Id"].isin(rowsToRemove)].reset_index(drop=True)

del simImgDict
print("Len of train images after deduplication: {}".format(len(trainCSV)))

# One shuffled hold-out split (25% validation), stratified on the Pawpularity
# column; the fixed random_state makes the partition repeatable.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25, random_state=42)
train_index, val_index = next(split.split(trainCSV, trainCSV['Pawpularity']))
trainDF = trainCSV.loc[train_index]
valDF = trainCSV.loc[val_index]

# The full frame and the splitter are no longer needed once both subsets exist.
del trainCSV, split


In [None]:
IMG_SIZE = 64


def _loadImageArray(imgIds, dataType, imgSize=IMG_SIZE):
    """Read, resize and rescale the images for ``imgIds`` into one array.

    Args:
        imgIds: Iterable of image ids understood by ``getImagePath``.
        dataType: ``'train'`` or ``'test'``; forwarded to ``getImagePath``.
        imgSize: Side length in pixels that every image is resized to.

    Returns:
        A float32 array of shape ``(len(imgIds), imgSize, imgSize, 3)`` in RGB
        channel order, with pixel values divided by 255.

    Raises:
        FileNotFoundError: If an image file cannot be read.
    """
    images = []
    for imgId in tqdm(imgIds):
        imgPath = getImagePath(imgId, dataType)
        bgr = cv2.imread(imgPath)
        if bgr is None:
            # cv2.imread reports an unreadable/missing file by returning None;
            # fail here with the offending path instead of inside cv2.resize.
            raise FileNotFoundError("Could not read image: {}".format(imgPath))
        rgb = cv2.cvtColor(cv2.resize(bgr, (imgSize, imgSize)), cv2.COLOR_BGR2RGB)
        # Stored as float32 rather than the float64 that `/ 255.` produces,
        # which halves the memory held by the image arrays.
        images.append((rgb / 255.).astype(np.float32))
    return np.array(images, dtype=np.float32).reshape(-1, imgSize, imgSize, 3)


# Regression targets.
trainY = trainDF.Pawpularity
valY = valDF.Pawpularity

# Tabular inputs: every column except the image id (and the target, where present).
featuresTrainX = trainDF.drop(["Pawpularity", "Id"], axis=1)
featuresValX = valDF.drop(["Pawpularity", "Id"], axis=1)
featuresTestX = testCSV.drop('Id', axis=1)

# Image inputs. Validation images come from the training directory because the
# validation rows were split off the training CSV.
imgTrainX = _loadImageArray(trainDF.Id, 'train')
imgValX = _loadImageArray(valDF.Id, 'train')
imgTestX = _loadImageArray(testCSV.Id, 'test')

# Everything needed from the split frames has been extracted.
del trainDF, valDF

In [None]:
# def buildModel(hp):
#     img_input = Input(shape=(IMG_SIZE, IMG_SIZE, 3))
#     imgModel = Conv2D(hp.Int("img_input_units", min_value=32, max_value=256, step=32), kernel_size=(3, 3), activation='relu', input_shape=(IMG_SIZE,IMG_SIZE,3))(img_input)
    
#     for i in range(hp.Int("img_n_layers", 1, 4)):
#         imgModel = Conv2D(hp.Int("conv_{}_units".format(i), min_value=32, max_value=256, step=32), (3, 3), activation='relu')(imgModel)
#         imgModel = MaxPooling2D(pool_size=(2, 2))(imgModel)
#         imgModel = Dropout(rate=hp.Float("dropout_{}_rate".format(i), 0, 0.5, step=0.1))(imgModel)

#     imgModel = GlobalAveragePooling2D()(imgModel)
#     imgModel = Dense(128, activation='relu')(imgModel)
#     imgModel = Dropout(0.5)(imgModel)
#     imgModelOutput = Dense(1, activation='linear')(imgModel)

#     tag_input = Input(shape = featuresTrainX.shape[1:])
#     tagModel = Dense(hp.Int("features_input_units", min_value=32, max_value=256, step=32), input_dim=12, activation='relu')(tag_input)
#     for i in range(hp.Int("features_n_layers", 1, 4)):
#         tagModel = Dense(hp.Int("feature_dense_{}_units".format(i), min_value=32, max_value=256, step=32))(tagModel)
    
#     tagModelOutput = Dense(1, activation='linear')(tagModel)

#     concatenated = Concatenate(axis=-1)([imgModelOutput, tagModelOutput])
#     concatModel = Flatten()(concatenated)
#     for i in range(hp.Int("concat_n_layers",1, 4)):
#         concatModel = Dense(hp.Int("concat_dense_{}_units".format(i), min_value=32, max_value=64, step=16))(concatModel)
#     output = Dense(1, activation='linear')(concatModel)

#     model = Model([img_input, tag_input], output)
#     # model.summary()

#     hp_learning_rate = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])
#     model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=hp_learning_rate), loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])
    
#     return model

# tuner = RandomSearch(
#     buildModel,
#     objective = Objective("val_root_mean_squared_error", direction="min"),
#     max_trials = 30,
#     executions_per_trial = 1,
#     directory = LOG_DIR
# )


# earlyStopping = tf.keras.callbacks.EarlyStopping(
#     monitor='val_root_mean_squared_error', min_delta=0, patience=15, verbose=0,
#     mode='auto', baseline=None, restore_best_weights=True
# )


# tuner.search(x=[imgTrainX, featuresTrainX], 
#              y=trainY,
#             epochs = 40,
#             batch_size = 64,
#             validation_data = ((imgValX, featuresValX), valY),
#             callbacks=[earlyStopping],
#             verbose=1
#             )

# tuner.results_summary()

# Results summary
# Results in D:\Kaggle\pawpularity\keras_tuner_log\1642147429\untitled_project
# Showing 10 best trials
# Objective(name='val_root_mean_squared_error', direction='min')

# Trial summary
# Hyperparameters:
# img_input_units: 32
# img_n_layers: 4
# conv_0_units: 96
# dropout_0_rate: 0.5
# features_input_units: 128
# features_n_layers: 2
# feature_dense_0_units: 192
# concat_n_layers: 3
# concat_dense_0_units: 48
# learning_rate: 0.0001
# conv_1_units: 192
# dropout_1_rate: 0.1
# feature_dense_1_units: 32
# feature_dense_2_units: 192
# concat_dense_1_units: 48
# concat_dense_2_units: 48
# conv_2_units: 224
# dropout_2_rate: 0.1
# concat_dense_3_units: 48
# conv_3_units: 32
# dropout_3_rate: 0.0
# Score: 20.45026206970215

In [None]:
# ---- Image branch: CNN that reduces the picture to a single score ----------
img_input = Input(shape=(IMG_SIZE, IMG_SIZE, 3))
# The input shape comes from `img_input`, so no `input_shape=` is passed here.
imgModel = Conv2D(32, kernel_size=(3, 3), activation='relu')(img_input)

# Four conv -> max-pool -> dropout stages, given as (filters, dropout rate).
for convFilters, dropRate in ((96, 0.5), (192, 0.1), (224, 0.1), (32, 0)):
    imgModel = Conv2D(convFilters, (3, 3), activation='relu')(imgModel)
    imgModel = MaxPooling2D(pool_size=(2, 2))(imgModel)
    imgModel = Dropout(dropRate)(imgModel)

imgModel = GlobalAveragePooling2D()(imgModel)
imgModel = Dense(128, activation='relu')(imgModel)
imgModel = Dropout(0.5)(imgModel)
imgModelOutput = Dense(1, activation='linear')(imgModel)

# ---- Tabular branch: MLP over the metadata columns --------------------------
# The feature width is taken from the data instead of a hard-coded input_dim.
tag_input = Input(shape=featuresTrainX.shape[1:])
tagModel = Dense(128, activation='relu')(tag_input)
for denseUnits in (192, 32, 192):
    tagModel = Dense(denseUnits, activation='relu')(tagModel)
tagModelOutput = Dense(1, activation='linear')(tagModel)

# ---- Fusion head: combine the two branch scores into the final prediction ---
concatenated = Concatenate(axis=-1)([imgModelOutput, tagModelOutput])
concatModel = Flatten()(concatenated)
# Each Dense layer must consume the previous layer's output. Feeding every one
# of them `concatenated` left only the last Dense(48) connected to the output.
# NOTE(review): the tuner summary kept in the commented-out cell lists
# concat_n_layers: 3; the four layers written in this cell are retained.
for _headLayer in range(4):
    concatModel = Dense(48, activation='relu')(concatModel)
output = Dense(1, activation='linear')(concatModel)

model = Model([img_input, tag_input], output)
model.summary()

# Mean-squared-error regression with Adam; RMSE is tracked as a metric because
# the early-stopping callback below monitors its validation value.
model.compile(
    loss='mse',
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    metrics=[tf.keras.metrics.RootMeanSquaredError()],
)

# Stop once validation RMSE has gone 15 epochs without improving, and restore
# the weights from the best epoch.
earlyStopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_root_mean_squared_error',
    patience=15,
    min_delta=0,
    mode='auto',
    baseline=None,
    verbose=0,
    restore_best_weights=True,
)

# Train on the (image, tabular) input pair, validating on the held-out split.
model.fit(
    [imgTrainX, featuresTrainX],
    trainY,
    validation_data=((imgValX, featuresValX), valY),
    batch_size=64,
    epochs=40,
    callbacks=[earlyStopping],
    verbose=1,
)

In [None]:
# imgModel.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])
# imgModel.fit(imgTrainX, trainY,
#     validation_data=((imgValX, valY), epochs=30, batch_size=32, callbacks=[earlyStopping])

In [None]:
# model.compile(optimizer='adam', loss='mse', metrics=[tf.keras.metrics.RootMeanSquaredError()])
# model.fit([imgTrainX, featuresTrainX], trainY,
#     validation_data=((imgValX, featuresValX), valY), epochs=30, batch_size=32, callbacks=[earlyStopping])

In [None]:
# Run the trained two-input model on the test images and their tabular
# features; the bare name on the last line makes the notebook display the
# resulting prediction array.
testInputs = [imgTestX, featuresTestX]
testPred = model.predict(testInputs)
testPred

In [None]:
# One submission row per test image: its Id paired with the first (and only)
# value of the corresponding prediction row.
submissionDF = pd.DataFrame(
    [[imgId, pawScore[0]] for imgId, pawScore in zip(testCSV.Id, testPred)],
    columns=['Id', 'Pawpularity'],
)
submissionDF.to_csv('submission.csv', index=False)