In [None]:
import numpy as np
import pandas as pd
import os
import glob

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.keras import backend as K

# Turn on memory growth for every GPU that TensorFlow can see.
# NOTE(review): presumably done so TensorFlow does not reserve all GPU memory up
# front, leaving room for the GPU CatBoost training later on — confirm.
for gpu in tf.config.experimental.list_physical_devices('GPU'):
    tf.config.experimental.set_memory_growth(gpu, True)

from catboost import CatBoostRegressor

import optuna
from optuna.samplers import TPESampler

from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings("ignore")

import gc

In [None]:
# Notebook-wide settings shared by the image generators, the Keras fit and the CV splits.

seed = 2021       # random seed handed to the generators and to KFold
val_split = 0.2   # fraction of the training frame held out for validation
batch = 32        # images per batch

In [None]:
# Callback function to print log messages when the best trial is updated

def logging_callback(study, frozen_trial):
    """Optuna callback that prints a line whenever the study's best value changes.

    The last reported best value is remembered in the study's user attribute
    ``'prev_best'`` so that only improvements are logged.

    Args:
        study: The study being optimised; must expose ``user_attrs``,
            ``best_value`` and ``set_user_attr``.
        frozen_trial: The trial that has just finished; its ``number`` and
            ``value`` are printed.
    """
    try:
        current_best = study.best_value
    except ValueError:
        # No trial has completed yet (e.g. the first ones failed or were
        # pruned), so there is no best value to report.
        return

    previous_best = study.user_attrs.get('prev_best', None)
    if previous_best != current_best:
        study.set_user_attr('prev_best', current_best)
        print(f"Trial {frozen_trial.number} finished with best value {frozen_trial.value}")

In [None]:
# Load the competition tables; train and test are ordered by Id right away.

train_df = (
    pd.read_csv('/kaggle/input/petfinder-pawpularity-score/train.csv')
    .sort_values(by='Id', ascending=True)
)
test_df = (
    pd.read_csv('/kaggle/input/petfinder-pawpularity-score/test.csv')
    .sort_values(by='Id', ascending=True)
)
sub = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/sample_submission.csv')

train_df.head()

In [None]:
#Import List of Images

# glob returns files in arbitrary (filesystem) order. Sort the listings so they
# are deterministic and follow the same ascending order as the Id-sorted frames.
train_images = sorted(glob.glob("/kaggle/input/petfinder-pawpularity-score/train/*.jpg"))
test_images = sorted(glob.glob("/kaggle/input/petfinder-pawpularity-score/test/*.jpg"))

In [None]:
#Concatenate The Image Path

# Build every path from the row's own Id instead of pairing rows with the glob
# listing by position: the listing order is not guaranteed to match the frame
# order, which would silently attach images to the wrong labels.
# Assumes each image is stored as "<Id>.jpg"; the check below fails loudly otherwise.
train_df['Image'] = '/kaggle/input/petfinder-pawpularity-score/train/' + train_df['Id'].astype(str) + '.jpg'
test_df['Image'] = '/kaggle/input/petfinder-pawpularity-score/test/' + test_df['Id'].astype(str) + '.jpg'

# Fail early if a derived path does not correspond to a file found on disk.
for split_name, split_df, listed_files in (('train', train_df, train_images),
                                           ('test', test_df, test_images)):
    missing_files = set(split_df['Image']) - set(listed_files)
    if missing_files:
        raise FileNotFoundError(
            f"{len(missing_files)} {split_name} image(s) referenced by Id were not found, "
            f"e.g. {sorted(missing_files)[0]}")

In [None]:
#Try Removing Outliers

from scipy import stats

# Keep only the rows whose Pawpularity lies within 2 standard deviations of the mean.
pawpularity_z = np.abs(stats.zscore(train_df['Pawpularity']))

# .copy() makes the filtered result an independent frame, so the column added
# and the columns dropped further down do not operate on a slice of the
# unfiltered frame.
train_df = train_df[pawpularity_z < 2].copy()

train_df.hist(column='Pawpularity', bins=10)

In [None]:
# Image generators: an augmenting one (with a validation split) and a plain one.

augmentation_options = dict(
    rescale=1./255,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    validation_split=val_split,
)
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**augmentation_options)

# Only rescales pixel values; used for the un-augmented prediction passes.
test_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)

In [None]:
#Load Train Data

def _image_flow(datagen, dataframe, directory, labelled, subset=None, shuffle=False):
    """Create a 256x256 ``flow_from_dataframe`` iterator with the shared settings.

    Args:
        datagen: ImageDataGenerator that produces the batches.
        dataframe: Frame holding the image paths in its "Image" column (and the
            target in "Pawpularity" when ``labelled`` is True).
        directory: Image directory passed through to ``flow_from_dataframe``.
        labelled: If True, yield (images, Pawpularity) with ``class_mode="raw"``;
            otherwise yield images only.
        subset: "training", "validation" or None (no split).
        shuffle: Whether the iterator shuffles the rows.

    Returns:
        The iterator returned by ``datagen.flow_from_dataframe``.
    """
    return datagen.flow_from_dataframe(
        dataframe=dataframe,
        directory=directory,
        x_col="Image",
        y_col="Pawpularity" if labelled else None,
        subset=subset,
        target_size=(256, 256),
        batch_size=batch,
        seed=seed,
        shuffle=shuffle,
        class_mode="raw" if labelled else None)


# Validation images must not be augmented: random shear/zoom/flip makes the
# monitored validation RMSE noisy. A generator with the same rescale and the same
# validation_split selects the same held-out rows but leaves the images untouched.
val_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
    rescale=1./255,
    validation_split=val_split)

train = _image_flow(train_datagen, train_df,
                    '/kaggle/input/petfinder-pawpularity-score/train',
                    labelled=True, subset="training", shuffle=True)

validation = _image_flow(val_datagen, train_df,
                         '/kaggle/input/petfinder-pawpularity-score/train',
                         labelled=True, subset="validation", shuffle=True)

# Un-shuffled, label-free passes so predictions line up with the frame rows.
test = _image_flow(test_datagen, test_df,
                   '/kaggle/input/petfinder-pawpularity-score/test',
                   labelled=False)

train_imgs = _image_flow(test_datagen, train_df,
                         '/kaggle/input/petfinder-pawpularity-score/train',
                         labelled=False)

In [None]:
#Transfer Learning - EfficientNetB7

# Frozen ImageNet EfficientNetB7 backbone used as a feature extractor, followed
# by global average pooling and a single linear unit that regresses Pawpularity.
base_model = tf.keras.applications.efficientnet.EfficientNetB7(
    weights='../input/tfkerasefficientnetimagenetnotop/efficientnetb7_notop.h5',
    input_shape=(256, 256, 3),
    include_top=False)
base_model.trainable = False

inputs = tf.keras.Input(shape=(256, 256, 3))

# The data generators rescale pixels to [0, 1], but the Keras EfficientNet models
# contain their own preprocessing and expect raw [0, 255] pixel values. Undo the
# generator's rescaling so the pretrained weights see the range they were trained on.
x = tf.keras.layers.Lambda(lambda pixels: pixels * 255.0, name='undo_rescale')(inputs)

# training=False keeps the backbone (incl. batch-norm layers) in inference mode.
x = base_model(x, training=False)
x = tf.keras.layers.GlobalAveragePooling2D()(x)

outputs = tf.keras.layers.Dense(1)(x)

model = tf.keras.Model(inputs, outputs)
model.compile(loss='mean_squared_error',
              optimizer='adam',
              metrics=[tf.keras.metrics.RootMeanSquaredError()])

In [None]:
# Disabled alternative model: the cell below is a bare string literal, so none of
# the code inside it is executed. It holds a small from-scratch CNN (three
# Conv2D/ReLU/MaxPooling stages, then Dense(64) + Dropout and a linear output unit)
# compiled with the same loss, optimizer and metric as the active model.
"""

#Build Model

model = Sequential()
model.add(Conv2D(32, (3, 3), input_shape=(256, 256, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('linear'))

model.compile(loss='mean_squared_error',
              optimizer='adam',
              metrics=tf.keras.metrics.RootMeanSquaredError())

"""

In [None]:
# Stop training once the validation RMSE has not improved for 5 epochs and
# restore the weights of the best epoch.
early_stopping_options = dict(
    monitor="val_root_mean_squared_error",
    patience=5,
    verbose=2,
    restore_best_weights=True,
)
early_stopper = tf.keras.callbacks.EarlyStopping(**early_stopping_options)

In [None]:
# Train the network; step counts are the number of whole batches in each split.

n_train_rows = round(len(train_df) * (1 - val_split))
n_val_rows = round(len(train_df) * (val_split))

model.fit(
    train,
    epochs=25,
    steps_per_epoch=n_train_rows // batch,
    validation_data=validation,
    validation_steps=n_val_rows // batch,
    callbacks=[early_stopper],
)

In [None]:
# Score the un-shuffled train and test image flows with the fitted network;
# the outputs become an extra tabular feature further down.

preds_img_train, preds_img_test = (
    model.predict(image_flow) for image_flow in (train_imgs, test)
)

In [None]:
#Add Image Prediction to DataFrames

# The network has a single output unit, so predict() returns one column per row;
# flatten explicitly rather than relying on pandas to accept a 2-D array as a column.
train_df['img_inf'] = np.ravel(preds_img_train)
test_df['img_inf'] = np.ravel(preds_img_test)

# Path and identifier columns are not features for the tabular model.
# Re-assign instead of dropping in place so no frame is mutated behind a slice.
train_df = train_df.drop(columns=['Image', 'Id'])
test_df = test_df.drop(columns=['Image', 'Id'])

train_df.head()

In [None]:
# Separate the target from the features and convert both feature tables to arrays.

# pop() removes the column from the frame and hands it back in one step.
y = np.array(train_df.pop('Pawpularity'))

# Round every feature to whole numbers, then keep only the raw ndarray
# (from here on train_df / test_df are NumPy arrays, not DataFrames).
train_df = train_df.round(0).values
test_df = test_df.round(0).values

In [None]:
# Drop the references to objects that are no longer needed, then run the
# garbage collector.

del preds_img_train, preds_img_test
del model
del train, validation, test, train_imgs
del train_images, test_images

gc.collect()

In [None]:
# Setup CatBoost hyperparameters

def get_catb_hyperparams(trial):
    """Sample one CatBoost configuration from the Optuna search space.

    Args:
        trial: Optuna trial (or any object exposing ``suggest_int`` and
            ``suggest_float``) used to draw the tunable values.

    Returns:
        dict: Keyword arguments for ``CatBoostRegressor``; the loss function,
        task type and bootstrap type are fixed, the rest are sampled.
    """
    catb_params = {
        'loss_function': 'RMSE',
        'task_type': 'GPU',
        'bootstrap_type': 'Bernoulli',
        'iterations': trial.suggest_int('iterations', 2000, 20000),
        'od_wait': trial.suggest_int('od_wait', 500, 2000),
        # suggest_float replaces the deprecated suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-4, 100),
        # The lower bound is kept strictly positive: a subsample rate of 0
        # is not a valid CatBoost setting.
        'subsample': trial.suggest_float('subsample', 1e-3, 1),
        'random_strength': trial.suggest_float('random_strength', 10, 50),
        'depth': trial.suggest_int('depth', 1, 15),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 30),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 15)
    }
    return catb_params

In [None]:
# Define objective function

def objective_catb(trial, X, Y, n_splits=3):
    """Optuna objective: mean cross-validated RMSE of a CatBoost regressor.

    Args:
        trial: Optuna trial used to sample the CatBoost hyperparameters.
        X: Feature array, indexed by row with integer index arrays.
        Y: Target array aligned with ``X``.
        n_splits: Number of shuffled K-fold splits.

    Returns:
        float: Average RMSE over the validation folds (lower is better).
    """
    catb_params = get_catb_hyperparams(trial)

    kfolds = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    fold_rmse = []

    for train_index, val_index in kfolds.split(X, Y):
        x_train, x_val = X[train_index], X[val_index]
        y_train, y_val = Y[train_index], Y[val_index]

        catb_opt = CatBoostRegressor(**catb_params)

        # verbose=False matches the final training loop and keeps thousands of
        # per-iteration lines out of the output on every trial and fold.
        catb_opt.fit(x_train, y_train,
                     eval_set=[(x_val, y_val)],
                     early_stopping_rounds=200,
                     verbose=False)
        preds = catb_opt.predict(x_val)

        # Take the square root explicitly: the `squared=False` argument of
        # mean_squared_error is deprecated and removed in newer scikit-learn.
        fold_rmse.append(np.sqrt(mean_squared_error(y_val, preds)))

    return float(np.mean(fold_rmse))

In [None]:
# Disabled hyperparameter search: the cell below is a bare string literal, so the
# study is not run. The code inside creates a seeded TPE study that minimises
# objective_catb on (train_df, y) with a 30-minute timeout and prints the best
# value and parameters.
"""

study = optuna.create_study(sampler=TPESampler(seed=seed), 
                            direction='minimize', 
                            study_name='catb_tuning')
objc = lambda trial : objective_catb(trial, train_df, y)

study.optimize(objc, timeout=60*30, callbacks=[logging_callback])

print(f"Best RMSE value: {study.best_value}")
print(f"Best params: ")
for param, value in study.best_params.items():
    print(f"\t{param} : {value}")

"""

In [None]:
# Hard-coded CatBoost settings used for the final models.
# Fixed settings first, then the tuned values.

best_params = dict(
    loss_function='RMSE',
    task_type='GPU',
    bootstrap_type='Bernoulli',
    iterations=12000,
    od_wait=2000,
    learning_rate=0.22110182960210167,
    reg_lambda=51.45097942274997,
    subsample=0.001465836636805773,
    random_strength=10.126462267372595,
    depth=6,
    min_data_in_leaf=26,
    leaf_estimation_iterations=2,
)

In [None]:
# Train one CatBoost model per fold with the chosen parameters and average the
# folds' test-set predictions.
kfolds = KFold(n_splits=3, shuffle=True, random_state=seed)

fold_fit_options = dict(early_stopping_rounds=300, use_best_model=True, verbose=False)

fold_preds = []
for fit_rows, holdout_rows in kfolds.split(train_df, y):
    fold_model = CatBoostRegressor(**best_params)
    # The held-out fold serves as the evaluation set for early stopping.
    fold_model.fit(
        train_df[fit_rows], y[fit_rows],
        eval_set=[(train_df[holdout_rows], y[holdout_rows])],
        **fold_fit_options)
    fold_preds.append(fold_model.predict(test_df))

# Element-wise mean across the folds.
preds = np.mean(fold_preds, axis=0)

In [None]:
#Make Submission

# `preds` follows the Id-sorted order of the test frame, while `sub` keeps the
# order of sample_submission.csv. Assigning positionally could attach scores to
# the wrong Id, so rebuild the sorted Id list and map the predictions by Id.
sorted_test_ids = (
    pd.read_csv('/kaggle/input/petfinder-pawpularity-score/test.csv')['Id']
    .sort_values(ascending=True)
    .to_numpy()
)
pred_by_id = pd.Series(np.ravel(preds), index=sorted_test_ids)

sub['Pawpularity'] = sub['Id'].map(pred_by_id)

# Every submission row must have received a prediction.
if sub['Pawpularity'].isna().any():
    raise ValueError("Some submission Ids have no matching prediction")

sub.to_csv("submission.csv", index = False)