In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import PIL
import PIL.Image
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.applications import EfficientNetB2
from tensorflow.keras.models import Sequential
from sklearn.model_selection import StratifiedKFold, KFold

# Competition data mount point. On Kaggle, attach the
# "petfinder-pawpularity-score" dataset via "Add Data" in the right sidebar first.
data_dir = '/kaggle/input/petfinder-pawpularity-score'

# Read the three competition tables in one pass; the generator keeps the loop
# variable out of the notebook namespace.
train, test, sample_submission = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# load data

In [None]:
# Preview the first rows of the training table loaded from train.csv.
train.head()

In [None]:
# Display the sample submission table loaded from sample_submission.csv.
sample_submission

In [None]:
# Preview the first rows of the test table loaded from test.csv.
test.head()

# train image shape

In [None]:
def convert_id_to_jpg(Id: str) -> str:
    """Return the JPEG file name for an image id by appending ``.jpg``."""
    return '{}.jpg'.format(Id)

In [None]:
# Paths of the images for row label 0 of the train and test tables; they are
# opened further down as visual samples.
sample_train_image_fn, sample_test_image_fn = (
    os.path.join(data_dir, split_dir, convert_id_to_jpg(frame['Id'][0]))
    for split_dir, frame in (('train', train), ('test', test))
)

In [None]:
def _read_image_size(path: str) -> tuple:
    """Return the ``size`` tuple PIL reports for the image at ``path``.

    The file is opened in a ``with`` block so its handle is released as soon
    as the size has been read, instead of staying open until the image object
    happens to be garbage collected.
    """
    with PIL.Image.open(path) as img:
        return img.size


# Size of every training image, in the same order as train['Id'].
image_shape_list = [
    _read_image_size(os.path.join(data_dir, 'train', convert_id_to_jpg(ids)))
    for ids in train['Id']
]

df_image_shape = pd.DataFrame()
df_image_shape['image_shape'] = image_shape_list
df_image_shape

In [None]:
# Share of training images per distinct size, as a percentage of all images.
df_image_shape['image_shape'].value_counts() / df_image_shape.shape[0] * 100

In [None]:
# Mean of the first and of the second component of the size tuples
# (PIL reports size as (width, height)).
print(df_image_shape['image_shape'].apply(lambda x: x[0]).mean())
print(df_image_shape['image_shape'].apply(lambda x: x[1]).mean())

In [None]:
# minimum image size
# NOTE(review): the column holds size tuples, so min() presumably compares them
# lexicographically (first component, then second) rather than by area —
# confirm this is the intended notion of "minimum".
df_image_shape['image_shape'].min()

In [None]:
# maximum image size
# NOTE(review): the column holds size tuples, so max() presumably compares them
# lexicographically (first component, then second) rather than by area —
# confirm this is the intended notion of "maximum".
df_image_shape['image_shape'].max()

In [None]:
# Open the sample training image, print its size, and render it as the cell output.
sample_train_image = PIL.Image.open(sample_train_image_fn)
print(sample_train_image.size)
sample_train_image

In [None]:
# Example Test Data (quoted from the competition description):
# In addition to the training data, we include some randomly generated example test data to help you author submission code.
# When your submitted notebook is scored, this example data will be replaced by the actual test data (including the sample submission).
# The actual test data comprises about 6800 pet photos similar to the training set photos.

# TODO: check whether the actual test images are 128x128 or 405x720.
# Open the sample test image, print its size, and render it as the cell output.
sample_test_image = PIL.Image.open(sample_test_image_fn)
print(sample_test_image.size)
sample_test_image

# metadata information

In [None]:
# Summary statistics of the training table.
train.describe()

In [None]:
# Summary statistics of the test table.
test.describe()

# build tf dataset

In [None]:
# Reference: 
# https://www.kaggle.com/awsaf49/tf-petfinder-image-tpu-train?scriptVersionId=77237363&cellId=39
# https://keras.io/examples/vision/image_classification_efficientnet_fine_tuning/

In [None]:
# Side length (in pixels) that every image is resized/padded to in decode().
IMG_SIZE = 224
target_size = [IMG_SIZE, IMG_SIZE]
# Shape of one model input: the two spatial sides plus 3 colour channels.
input_shape = (*target_size, 3)
# Number of samples per batch in every tf.data pipeline built by build_dataset().
batch_size = 32
# Number of cross-validation folds (one model is trained per fold).
KFOLD = 5
# NOTE(review): TRAIN and IS_NORMALIZE_TARGET are not referenced anywhere else
# in the visible notebook — confirm whether they are still needed.
TRAIN = True
IS_NORMALIZE_TARGET = True

In [None]:
def decode(path):
    """Load the JPEG at ``path`` and return it as a model-ready image tensor.

    The file is decoded to 3 channels, resized with padding to
    ``IMG_SIZE`` x ``IMG_SIZE``, cast to ``int32`` and passed through the
    EfficientNet ``preprocess_input`` function.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    # Forcing a fixed size and an explicit cast is critical: the image keeps
    # its original pixel format and EfficientNet's own preprocessing is applied.
    padded = tf.image.resize_with_pad(decoded, IMG_SIZE, IMG_SIZE)
    as_int = tf.cast(padded, dtype=tf.int32)
    return tf.keras.applications.efficientnet.preprocess_input(as_int)

In [None]:
def decode_with_labels(path, label):
    """Decode the image at ``path`` and pair it with its label.

    Returns ``(image, label)``; the label is cast to ``float32``, or is
    ``None`` when no label was supplied.
    """
    image = decode(path)
    if label is None:
        return image, None
    return image, tf.cast(label, tf.float32)

In [None]:
def build_dataset(df: pd.DataFrame, is_train: bool = True):
    """Build a batched ``tf.data`` pipeline over the images listed in ``df``.

    Args:
        df: table with an ``Id`` column (and ``Pawpularity`` when ``is_train``).
        is_train: when true, images are read from the ``train`` folder, labels
            come from ``Pawpularity`` and the dataset repeats indefinitely;
            otherwise images are read from ``test`` and labels are ``None``.

    Returns:
        ``(dataset, steps_per_epoch)`` where ``steps_per_epoch`` is the number
        of full batches in ``df`` (a trailing partial batch is not counted).
    """
    subdir = 'train' if is_train else 'test'
    img_path_list = [
        os.path.join(data_dir, subdir, convert_id_to_jpg(image_id))
        for image_id in df['Id']
    ]
    target = df['Pawpularity'].tolist() if is_train else None

    ds = tf.data.Dataset.from_tensor_slices((img_path_list, target))
    if is_train:
        # Repeat forever; callers bound each pass with the returned step count.
        ds = ds.repeat()
    ds = ds.map(decode_with_labels, tf.data.AUTOTUNE)
    ds = ds.batch(batch_size)
    ds = ds.prefetch(tf.data.AUTOTUNE)

    return ds, df.shape[0] // batch_size

In [None]:
# Plain (non-stratified) shuffled K-fold splitter, despite the ``skf`` name.
skf = KFold(n_splits=KFOLD, random_state=123, shuffle=True)


# -1 marks rows that have not been assigned to a fold yet.
train['fold'] = -1

# Tag each row with the number of the fold in which it is part of the validation split.
# split() yields positional indices; .loc lines up with them because train was
# read with read_csv without an index column, i.e. it has the default 0..n-1 index.
for nth, (_, valid_index) in enumerate(skf.split(train)):
    train.loc[valid_index, 'fold'] = nth

In [None]:
# skf = StratifiedKFold(n_splits=KFOLD, random_state=123, shuffle=True)

# train['fold'] = -1

# for nth, (_, valid_index) in enumerate(skf.split(train, train['Pawpularity'])):
#     train.loc[valid_index, 'fold'] = nth

# build model

In [None]:
def build_model(input_shape=(*target_size, 3), drop_rate=0.20, use_img_augmentation=False):
    """Build and compile an EfficientNetB2-based regressor with one output unit.

    Args:
        input_shape: shape of a single input image passed to ``layers.Input``.
        drop_rate: dropout rate applied just before the output layer.
        use_img_augmentation: when true, inputs pass through the random
            augmentation stack before reaching the backbone.

    Returns:
        A compiled ``tf.keras.Model`` (Adam with learning rate 2e-5, MSE loss,
        RMSE metric).
    """
    # Random augmentation stack; it is always constructed but only wired into
    # the graph when use_img_augmentation is set.
    preprocessing = layers.experimental.preprocessing
    augmentation_layers = [
        preprocessing.RandomRotation(factor=0.15),
        preprocessing.RandomTranslation(height_factor=0.1, width_factor=0.1),
        preprocessing.RandomFlip(),
        preprocessing.RandomContrast(factor=0.1),
        preprocessing.RandomCrop(IMG_SIZE, IMG_SIZE, seed=123),
    ]
    img_augmentation = Sequential(augmentation_layers, name="img_augmentation")

    inputs = layers.Input(shape=input_shape)
    if use_img_augmentation:
        backbone_input = img_augmentation(inputs)
    else:
        backbone_input = inputs

    # Pre-trained "notop" weights are read from a local file that was downloaded from
    # https://storage.googleapis.com/keras-applications/efficientnetb2_notop.h5
    # (loading noisy-student weights through tf.applications.EfficientNetB2 failed).
    pre_trained_model = '/kaggle/input/efficientnetb2/efficientnetb2_notop.h5'
    backbone = EfficientNetB2(
        include_top=False,
        input_tensor=backbone_input,
        weights=pre_trained_model,
    )
    # The backbone is not frozen, so the whole network is fine-tuned.

    # New regression head on top of the backbone features.
    pooled = layers.GlobalAveragePooling2D(name="avg_pool")(backbone.output)
    hidden = layers.Dense(128, activation='relu')(pooled)
    # Dropout to reduce overfitting.
    regularised = layers.Dropout(drop_rate, name="top_dropout")(hidden)
    # NOTE(review): a ReLU on the single regression output can get stuck at 0
    # if its pre-activation turns negative — confirm this is intended.
    outputs = layers.Dense(1, activation="relu", name="pred")(regularised)

    regressor = tf.keras.Model(inputs, outputs, name="EfficientNet")
    regressor.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
        loss='mse',
        metrics=[tf.keras.metrics.RootMeanSquaredError()],
    )
    return regressor

In [None]:
def train_model(fold_idx: int) -> tf.keras.Model:
    """Train a fresh model with fold ``fold_idx`` held out for validation.

    Rows whose ``fold`` differs from ``fold_idx`` form the training set; rows
    in ``fold_idx`` form the validation set. Training runs for at most 100
    epochs and stops early once ``val_loss`` has not improved for 3 epochs,
    restoring the best weights seen.

    Returns:
        The trained ``tf.keras.Model``.
    """
    # Author's note: without augmentation CV RMSE 17.XX, train RMSE 19.XX.
    held_out = train['fold'] == fold_idx
    train_ds, train_steps = build_dataset(train[~held_out], True)
    # The validation pipeline is also built in "train" mode (labels included,
    # repeating); validation_steps bounds each validation pass.
    valid_ds, valid_steps = build_dataset(train[held_out], True)

    # Drop Keras state left over from the previous fold before building a new model.
    tf.keras.backend.clear_session()

    # No checkpoint is written to disk; the best weights are restored by the callback.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        patience=3,
        monitor='val_loss',
        restore_best_weights=True,
    )

    model = build_model(use_img_augmentation=False)
    model.fit(
        train_ds,
        steps_per_epoch=train_steps,
        validation_data=valid_ds,
        validation_steps=valid_steps,
        epochs=100,
        callbacks=[early_stopping],
    )
    return model

In [None]:
def test_model(model: tf.keras.Model) -> np.ndarray:
    """Run ``model`` over the test table and return all predictions stacked.

    The test pipeline is built without labels and without repetition, so a
    single pass over it covers every test image once.
    """
    test_ds = build_dataset(test, False)[0]
    batch_predictions = [model(inputs).numpy() for inputs, _ in test_ds]
    return np.concatenate(batch_predictions)

In [None]:
def valid_model(model: tf.keras.Model, i: int) -> None:
    """Print the validation RMSE of ``model`` on fold ``i``.

    Args:
        model: model to evaluate; it is called batch by batch on the fold images.
        i: number of the fold (value of ``train['fold']``) used as validation data.

    Only ``valid_steps`` batches are evaluated, i.e. the batch count returned
    by ``build_dataset`` for this fold.
    """
    valid_ds, valid_steps = build_dataset(train[train['fold'] == i], True)

    valid_predicted_list, valid_output_list = [], []

    # The dataset repeats forever, so cap the pass at valid_steps batches.
    for inputs, outputs in valid_ds.take(valid_steps):
        valid_predicted_list.append(model(inputs).numpy())
        valid_output_list.append(outputs.numpy())

    # Flatten both arrays before subtracting: predictions come out of the model
    # as (N, 1) while targets are (N,), and subtracting those shapes directly
    # would broadcast to an (N, N) matrix and average over all pairs instead of
    # over matching samples.
    valid_predicted_arr = np.concatenate(valid_predicted_list).reshape(-1)
    valid_output_arr = np.concatenate(valid_output_list).reshape(-1)

    rmse = np.mean((valid_output_arr - valid_predicted_arr) ** 2) ** 0.5
    print('='*20, f'{i}', '='*20)
    print(f'validation RMSE: {rmse}\n')

In [None]:
# One array of test-set predictions per fold model.
test_pred_list = []

# Train one model per fold and predict the test set with each of them.
for i in range(KFOLD):    
    model = train_model(i)

    predicted = test_model(model)
    test_pred_list.append(predicted)
    
# Average the per-fold predictions element-wise to get the ensemble prediction.
pred_arr = np.mean(test_pred_list, axis=0)

In [None]:
# Write the fold-averaged predictions into the submission table and save it
# as submission.csv without the index column.
sample_submission['Pawpularity'] = pred_arr
sample_submission.to_csv('submission.csv',index=False)
sample_submission