In [None]:
# /kaggle/input/petfinder-pawpularity-score/sample_submission.csv
# /kaggle/input/petfinder-pawpularity-score/train.csv
# /kaggle/input/petfinder-pawpularity-score/test.csv
# /kaggle/input/petfinder-pawpularity-score/test/

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split

import tensorflow as tf

from PIL import Image

import os, shutil

from tqdm import tqdm

In [None]:
class PetsDataProcessor:
    """Load the training table and split it into train / validation parts.

    The ``Id`` column of the CSV is rewritten in place to hold image file
    names (``<id>.jpg``) so it can be joined with an image directory.

    Parameters
    ----------
    dataframe_path : str
        Path of the CSV file; it must contain an ``Id`` column of strings.
    images_train_path : str
        Directory that holds the training images.
    images_valid_path : str
        Directory that receives the validation images in ``split_images``.
    target_col : str
        Name of the target column (stored; the split itself is not stratified).
    shuffle, random_seed, test_size
        Forwarded to ``train_test_split`` as ``shuffle``, ``random_state``
        and ``test_size``.
    """

    def __init__(self, dataframe_path, images_train_path, images_valid_path, target_col, shuffle=True, random_seed=42, test_size=0.15):
        self.df = pd.read_csv(dataframe_path)
        # Turn bare ids into file names so they match the files on disk.
        self.df['Id'] = self.df['Id'].apply(lambda stem: stem + '.jpg')

        self.target_col = target_col

        self.shuffle = shuffle
        self.random_seed = random_seed
        self.test_size = test_size

        self.images_train_path = images_train_path
        self.images_valid_path = images_valid_path

    def split_dataframe(self):
        """Return ``(df_train, df_valid)``, a row-wise split of the loaded table.

        The image files are NOT moved here: ``split_images`` is deliberately
        not called because the Kaggle input directory is read-only. Call
        ``split_images(df_valid)`` yourself when working on a writable copy.
        """
        # Only the frame itself needs splitting; the row selection depends on
        # the number of rows and the random state, not on extra arrays.
        df_train, df_valid = train_test_split(
            self.df,
            shuffle=self.shuffle,
            random_state=self.random_seed,
            test_size=self.test_size,
        )
        return df_train, df_valid

    def split_images(self, df_valid):
        """Move the image files listed in ``df_valid['Id']`` to the validation directory.

        Files already present in the validation directory (from an earlier
        split) are first moved back to the training directory, so the method
        can be re-run with a different split.
        """
        # makedirs also creates missing parent directories and tolerates an
        # already existing target, unlike a bare mkdir.
        os.makedirs(self.images_valid_path, exist_ok=True)

        # Undo any previous split before applying the new one.
        for v_file in os.listdir(self.images_valid_path):
            os.replace(
                os.path.join(self.images_valid_path, v_file),
                os.path.join(self.images_train_path, v_file),
            )

        for v_file in df_valid['Id'].values:
            os.replace(
                os.path.join(self.images_train_path, v_file),
                os.path.join(self.images_valid_path, v_file),
            )

In [None]:
# Root of the competition dataset; the image folders live directly below it.
_COMPETITION_DIR = '/kaggle/input/petfinder-pawpularity-score'

# Size every image is resized to before it is fed to a model.
IMAGE_SIZE = (256, 256)
# Image folders (trailing slash kept so the values can be used as prefixes).
TRAIN_DATA_PATH = f'{_COMPETITION_DIR}/train/'
TEST_DATA_PATH = f'{_COMPETITION_DIR}/test/'

In [None]:
# Load train.csv and carve out the validation rows (default: 15 %, seed 42).
data = PetsDataProcessor(
    dataframe_path='/kaggle/input/petfinder-pawpularity-score/train.csv',
    images_train_path=TRAIN_DATA_PATH,
    images_valid_path='validation/',
    target_col='Pawpularity',
)
df_train, df_valid = data.split_dataframe()

In [None]:
# Every column except the image file name and the regression target;
# one model is trained per column in this list.
target_columns = [
    name
    for name in df_train.columns
    if not (name == 'Id' or name == 'Pawpularity')
]

In [None]:
# For each column keep only the rows where that column equals 1, as a
# (train rows, validation rows) pair.
obj_columns = {
    flag: (
        df_train.loc[df_train[flag] == 1],
        df_valid.loc[df_valid[flag] == 1],
    )
    for flag in target_columns
}

In [None]:
# Options shared by the training and validation iterators: both read files
# from TRAIN_DATA_PATH and yield the raw Pawpularity value as the label.
flow_options = dict(
    directory=TRAIN_DATA_PATH,
    x_col="Id",
    y_col="Pawpularity",
    class_mode="raw",
    target_size=IMAGE_SIZE,
)

# column name -> (training iterator, validation iterator)
cols_generators = {}
for col, (train_rows, valid_rows) in obj_columns.items():
    print(f'-- {col} --')
    # Training images are rescaled to [0, 1] and randomly flipped / rotated / zoomed.
    augmenting_datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        rescale=1./255,
        horizontal_flip=True,
        rotation_range=30,
        zoom_range=0.2,
    )
    train_flow = augmenting_datagen.flow_from_dataframe(dataframe=train_rows, batch_size=12, **flow_options)
    # Validation images are only rescaled.
    plain_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
    valid_flow = plain_datagen.flow_from_dataframe(dataframe=valid_rows, batch_size=4, **flow_options)
    cols_generators[col] = (train_flow, valid_flow)
    print('-- --')

In [None]:
def _build_column_model():
    """Create and compile one small CNN regressor.

    Three Conv2D + MaxPool2D stages feed a 512-unit dense layer with dropout;
    the final sigmoid output is multiplied by 100, so predictions lie in (0, 100).
    """
    layers = tf.keras.layers
    network = tf.keras.models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', kernel_initializer='he_uniform',
                      input_shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3)),
        layers.MaxPool2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu', kernel_initializer='he_uniform'),
        layers.MaxPool2D((2, 2)),
        layers.Conv2D(128, (3, 3), activation='relu', kernel_initializer='he_uniform'),
        layers.MaxPool2D((2, 2)),
        layers.Flatten(),
        layers.Dense(512, activation='relu', kernel_initializer='he_uniform'),
        layers.Dropout(rate=0.25),
        layers.Dense(1, activation='sigmoid', kernel_initializer='glorot_uniform'),
        layers.Lambda(lambda x: x * 100.),
    ])
    network.compile(
        optimizer='adadelta',
        loss=tf.keras.losses.Huber(),
        metrics=[tf.keras.metrics.RootMeanSquaredError(name='rmse')],
    )
    return network


# One independent, freshly initialised model per column.
cols_models = {col: _build_column_model() for col in target_columns}

In [None]:
# Batch sizes the generators were created with; the step counts below must be
# derived from the same values.
TRAIN_BATCH_SIZE = 12
VALID_BATCH_SIZE = 4

for col in target_columns:
    # Stop a run once the monitored metric has not improved for 5 epochs.
    es = tf.keras.callbacks.EarlyStopping(patience=5)
    print(f'--- LEARN {col} --------')
    train_gen, valid_gen = cols_generators[col]
    train_rows, valid_rows = obj_columns[col]
    # Whole batches per epoch; at least one step so a small subset still trains.
    steps_per_epoch = max(1, train_rows.shape[0] // TRAIN_BATCH_SIZE)
    # Fixed: this used the number of COLUMNS of the training frame
    # (obj_columns[col][0].shape[1]); it has to be the number of ROWS of the
    # validation frame.
    validation_steps = max(1, valid_rows.shape[0] // VALID_BATCH_SIZE)
    cols_models[col].fit(
        train_gen,
        epochs=20,
        steps_per_epoch=steps_per_epoch,
        validation_data=valid_gen,
        validation_steps=validation_steps,
        callbacks=[es],
    )

In [None]:
class Predictor:
    """Average the predictions of several per-column models for each image.

    ``models`` maps a column name to an object with a Keras-style
    ``predict(batch, verbose=0)`` method. For every row, the models whose
    column is truthy in that row are evaluated on the row's image and their
    outputs are averaged; if no column is set, all columns' models are used.
    """

    def __init__(self, models):
        self.models = models

    def predict(self, data, image_col, image_path, image_size):
        """Return a float Series of predictions, indexed like ``data``.

        Parameters
        ----------
        data : DataFrame with the image file names in ``image_col`` plus one
            column per model.
        image_col : name of the column holding image file names.
        image_path : directory that contains the image files.
        image_size : ``(height, width)`` the images are resized to; this is
            the order the model input shape is declared in.
        """
        images = data[image_col]
        # Everything that is not a feature column is removed; errors='ignore'
        # because the test table has no 'Pawpularity' column.
        non_feature_cols = list(dict.fromkeys(['Pawpularity', 'Id', image_col]))
        data = data.drop(non_feature_cols, axis=1, errors='ignore')
        # Explicit dtype: an empty Series without one is object-typed (or
        # warns), depending on the pandas version.
        predictions = pd.Series(index=data.index, dtype='float64')
        # PIL expects (width, height), the reverse of the model's input order.
        pil_size = (image_size[1], image_size[0])
        for i in tqdm(data.index):
            target_columns = [col for col in data.columns if data[col].loc[i]]
            # The context manager closes the file; convert('RGB') guarantees
            # three channels for grayscale / palette / RGBA files.
            with Image.open(os.path.join(image_path, images.loc[i])) as raw:
                img = raw.convert('RGB').resize(pil_size)
            img = np.asarray(img, dtype='float32') / 255.
            img = np.expand_dims(img, 0)  # add the batch dimension
            prediction = np.mean(self.generate_predictions(img, target_columns, data.columns))
            predictions.loc[i] = prediction
        return predictions

    def evaluate(self, data, image_col, target_col, image_path, image_size):
        """Predict every row of ``data`` and print the RMSE against ``data[target_col]``."""
        predictions = self.predict(data, image_col, image_path, image_size)
        target = data[target_col]
        metric = tf.keras.metrics.RootMeanSquaredError(name='rmse')
        metric.update_state(y_true=target, y_pred=predictions)
        print(f'rmse: {metric.result().numpy()}')
        print('-------')

    def generate_predictions(self, image, columns, all_columns):
        """Return the raw outputs of the models for ``columns``.

        Falls back to ``all_columns`` when ``columns`` is empty, so an image
        with no flag set is still scored (by every model).
        """
        target_columns = columns if columns else all_columns
        # verbose=0: otherwise every single-image call prints its own progress line.
        return [self.models[col].predict(image, verbose=0) for col in target_columns]

In [None]:
# Wrap the per-column models in the averaging predictor.
predictor = Predictor(models=cols_models)

In [None]:
# Load the test table and turn the ids into image file names (<id>.jpg).
df_test = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/test.csv')
df_test['Id'] = df_test['Id'].apply(lambda stem: stem + '.jpg')

In [None]:
# Score every test image, then write the submission file with the bare ids.
predictions = predictor.predict(df_test, 'Id', TEST_DATA_PATH, IMAGE_SIZE)
df_test['Pawpularity'] = predictions
# Strip the extension again: keep only the part before the first dot.
df_test['Id'] = df_test['Id'].apply(lambda file_name: file_name.partition('.')[0])
submission_columns = ['Id', 'Pawpularity']
df_test[submission_columns].to_csv("submission.csv", index=False)