# Import Useful Packages

In [None]:
# Basic packages
import pandas as pd
import numpy as np
import os
# Analysis visualization packages
import matplotlib.pyplot as plt
import seaborn as sns
# Tensorflow packages for model building
import tensorflow as tf
from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Conv2D, BatchNormalization, MaxPooling2D, Flatten, Concatenate
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, CSVLogger, TensorBoard
from tensorflow.keras import backend as K
from sklearn.model_selection import train_test_split

# Enable GPU Use

In [None]:
# List the GPU devices visible to TensorFlow
physical_device = tf.config.experimental.list_physical_devices('GPU')
print(f'Device found : {physical_device}')
# If there is at least one visible GPU on the host
if len(physical_device) >= 1:
    # Memory growth lets TensorFlow allocate GPU memory on demand instead of
    # reserving it all up front. Only switch it on when it is not enabled yet.
    if not tf.config.experimental.get_memory_growth(physical_device[0]):
        tf.config.experimental.set_memory_growth(physical_device[0], True)

# Import Dataset from CSV files

In [None]:
# Root folder of the competition data; the image folders used by the data
# generators further down ('train/' and 'test/') are resolved relative to it.
dir_csv = '../input/petfinder-pawpularity-score/'

# Tabular metadata for the two splits
train_df = pd.read_csv(f'{dir_csv}train.csv')
test_df = pd.read_csv(f'{dir_csv}test.csv')

# Report whether either table contains at least one missing value
print('Train dataset has NaN values: ', train_df.isna().to_numpy().any())
print('Test dataset has NaN values: ', test_df.isna().to_numpy().any())

# Analyze Train dataset

## Visualize Train dataset

In [None]:
# Preview the first rows of the Train table (last expression => rich display)
train_df.head()

## Visualize Correlation between Train dataset labels

In [None]:
# Pairwise correlation between the numeric columns only. The 'Id' column is an
# image identifier, not a feature, and DataFrame.corr() raises on non-numeric
# columns in pandas >= 2.0, so it is filtered out explicitly.
corr_train_df = train_df.select_dtypes(include='number').corr()
plt.figure(figsize=(14, 8))
sns.set(font_scale=1)
ax = sns.heatmap(corr_train_df,
        vmin=-1, vmax=1, annot=True, linewidths=.5,
        xticklabels=corr_train_df.columns,
        yticklabels=corr_train_df.columns)
# Pin the y-limits to the full number of rows so the first and last heatmap
# rows are drawn at full height
ax.set_ylim(len(corr_train_df.keys()),0)

## Visualize Correlation for Pawpularity 

In [None]:
# Correlation matrix over the numeric columns only (the non-numeric 'Id'
# column would make DataFrame.corr() raise in pandas >= 2.0)
corr_train_df = train_df.select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
sns.set(font_scale=1)
# Keep just the 'Pawpularity' column: correlation of every column with the target
ax = sns.heatmap(corr_train_df[['Pawpularity']],
        vmin=-1, vmax=1, annot=True, linewidths=.5,
        xticklabels=['Pawpularity'],
        yticklabels=corr_train_df.columns
        )
# Pin the y-limits to the full number of rows so the first and last heatmap
# rows are drawn at full height
ax.set_ylim(len(corr_train_df.keys()),0)

### Remarks
Not all features are positively correlated with our output feature (i.e. Pawpularity).

### Solution: Attention layer
We will later add an attention layer to our model so that our model can learn to select interesting features and learn from them.

### Pawpularity Min&Max Values

In [None]:
# Range of the target column in the Train table
print('Min value of pawpularity: ', train_df['Pawpularity'].to_numpy().min())
print('Max value of pawpularity: ', train_df['Pawpularity'].to_numpy().max())

## Visualize Pawpularity(ylabel) distribution

In [None]:
%matplotlib inline
train_df['Pawpularity'].plot(kind="hist", bins=100)

### Remarks
We can see from the distribution that our data is imbalanced.

### Solution: Oversampling
To address this problem of imbalanced data, we can resample the dataset by oversampling the minority classes.
Here we treat each pawpularity score (i.e. an integer ranging from 1 to 100) as an individual class. Thus, we will oversample the minority classes so that every class has the same number of samples.

## Count the number of occurrences of each pawpularity class

In [None]:
# Number of occurrences of each Pawpularity score, ordered by score (ascending index)
train_df['Pawpularity'].value_counts().sort_index()

In [None]:
# Number of occurrences of each Pawpularity score, most frequent score first
# (value_counts() sorts by count in descending order by default)
train_df['Pawpularity'].value_counts()

### Remarks
We can note that the dominating classes are pawpularity=28 and pawpularity=30, each with 318 occurrences.
The remaining classes will be oversampled to match the dominating class (318 occurrences in the full Train dataset; the exact target is recomputed on the Training split below).

## Split Train dataset into Training and Validation datasets
Before oversampling, split the Train dataset into Training and Validation datasets, so that oversampled copies of a training sample never end up in the Validation dataset (for generalization).

Ratio of 0.9 for Train and 0.1 for Validation

In [None]:
# Hold out 10% of the Train table for validation; the fixed random_state makes
# the split identical on every run
tr_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2)
# (rows, columns) of the Training split, then of the Validation split
print(tr_df.shape, val_df.shape, sep='\n')

### Visualize Training dataset after data splitting

In [None]:
# Histogram of the target over the Training split only, 100 bins
tr_df['Pawpularity'].plot.hist(bins=100)

In [None]:
# Empty data frame, with the same columns as tr_df, that will receive the
# oversampled training rows
sampled_tr_df = pd.DataFrame(columns=tr_df.columns)

In [None]:
# Target size for every class: the row count of the most frequent Pawpularity
# score in the Training split
max_occ = tr_df['Pawpularity'].value_counts().max()

# Dedicated seeded generator so the oversampled set is the same on every run
oversample_rng = np.random.default_rng(2)

# One frame per class, concatenated once at the end. Starting from a fresh list
# (rather than appending to an existing frame) keeps this cell idempotent,
# avoids the quadratic cost of concat-in-a-loop and preserves column dtypes.
class_frames = []
for class_i in range(1, 101):
    ids_class_i = tr_df.index[tr_df['Pawpularity'] == class_i].tolist()
    if not ids_class_i:
        # This score does not occur in the Training split: nothing to sample from
        continue
    if len(ids_class_i) < max_occ:
        # Minority class: draw max_occ row labels with replacement
        sampled_ids_class_i = oversample_rng.choice(ids_class_i, max_occ)
        class_frames.append(tr_df.loc[sampled_ids_class_i])
    else:
        # Dominating class (already max_occ rows): keep its rows as they are
        class_frames.append(tr_df.loc[ids_class_i])

# Single concatenation, then a clean 0..n-1 index
sampled_tr_df = pd.concat(class_frames).reset_index(drop=True)

## Visualize the Sampled Pawpularity(ylabel) distribution

In [None]:
sampled_tr_df['Pawpularity'].plot(kind="hist", bins=100)

In [None]:
# Dataframe info.
# DataFrame.info() writes its report to stdout and returns None, so the label
# is printed first and info() is called on its own line (wrapping it in print()
# would emit a stray "None" after the report).
print('tr_df:')
tr_df.info()
print('sampled_tr_df:')
sampled_tr_df.info()

In [None]:
# Change data type (i.e. dtype) of every column except the first one ('Id') to int64
sampled_tr_df = sampled_tr_df.astype({key: 'int64' for key in sampled_tr_df.keys()[1:]})
# info() prints its own report and returns None, so it is not wrapped in print()
print('sampled_tr_df:')
sampled_tr_df.info()

# Data Generation
As the previously loaded datasets (i.e. the train and test datasets from the CSV files) do not contain the image data (only the image ids), a custom data generation class is needed. In this class, the data is split into batches to enable batch training, and the image data for each image id is loaded alongside the other feature data.

## Training Data Generator

In [None]:
class CustomTrainDataGen(Sequence):
    """Keras ``Sequence`` that yields ``(X, y)`` batches for training/validation.

    Each row of ``X`` is the 12 annotation values followed by the flattened
    pixels of the matching image (resized to ``input_size`` and divided by 255).
    ``y`` holds the raw ``Pawpularity`` values of the same rows.

    Args:
        df: Data frame with the same columns as the global ``train_df``
            ('Id', the 12 annotation columns and 'Pawpularity'). It is copied.
        X_col: Mapping of input column names. Stored for reference only; the
            columns actually read are fixed in ``_ANNOTATION_COLS``.
        y_col: Mapping of the target column name. Stored for reference only.
        batch_size: Number of rows per batch.
        input_size: ``(height, width, channels)`` the images are resized to.
        shuffle: If True, rows are shuffled once at construction and again
            after every epoch.

    Relies on the notebook globals ``dir_csv`` (data root) and ``train_df``.
    """

    # Annotation columns, in the order they occupy at the start of every X row
    _ANNOTATION_COLS = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action',
                        'Accessory', 'Group', 'Collage', 'Human', 'Occlusion',
                        'Info', 'Blur']

    def __init__(self, df, X_col, y_col,
                 batch_size,
                 input_size=(250, 250, 3),
                 shuffle=True):
        super().__init__()
        self.df = df.copy()
        self.X_col = X_col
        self.y_col = y_col
        self.batch_size = batch_size
        self.input_size = input_size
        self.list_IDs = np.arange(len(self.df.index))
        self.indexes = np.arange(len(self.df.index))
        self.shuffle = shuffle
        self.n = len(self.df)
        # on_epoch_end() only runs AFTER an epoch, so without this the first
        # epoch would see the rows in their incoming order (the oversampled
        # frame is sorted by class, i.e. single-class batches).
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def on_epoch_end(self):
        """Reshuffle the rows after each epoch when ``shuffle`` is enabled."""
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __get_input(self, path, target_size):
        """Load image ``path`` (an 'Id' value), resize it and scale it by 1/255.

        Raises:
            ValueError: if the wrapped frame does not have as many columns as
                ``train_df`` (i.e. it is not a train-style table).
        """
        # Check if train dataset: train_df
        if len(self.df.keys()) != len(train_df.keys()):
            # Fail loudly: returning a sentinel here would only surface later
            # as an obscure shape error inside the model.
            raise ValueError("Generator Image Data Generation Error")
        img_path = dir_csv+'train/'+str(path)+'.jpg'
        image = tf.keras.preprocessing.image.load_img(img_path)
        image_arr = tf.keras.preprocessing.image.img_to_array(image)
        image_arr = tf.image.resize(image_arr,(target_size[0], target_size[1])).numpy()

        return image_arr/255.

    def __get_data(self, batches):
        """Build ``(X_batch, y_batch)`` for the positional row indices ``batches``."""
        rows = self.df.iloc[batches, :]

        # (rows in batch, 12) block of annotation values, one column per annotation
        annotations_batch = rows[self._ANNOTATION_COLS].values

        id_batch = rows['Id'].values
        image_batch = np.asarray([self.__get_input(img_id, self.input_size)
                                  for img_id in id_batch])
        # Flatten every image to one row. The row count is taken from the batch
        # itself so a batch shorter than batch_size does not break the reshape.
        image_batch = np.reshape(image_batch, (len(id_batch), -1))

        # Target is kept on its original scale (not divided by 100)
        y_batch = rows['Pawpularity'].values

        X_batch = np.concatenate((annotations_batch, image_batch), axis=1)
        return X_batch, y_batch

    def __getitem__(self, index):
        """Return batch number ``index`` as ``(X, y)``."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        X, y = self.__get_data(indexes)
        return X, y

    def __len__(self):
        """Number of full batches per epoch (a trailing partial batch is dropped)."""
        return self.n // self.batch_size

## Testing Data Generator
Likewise, a data generator is needed for testing.

In [None]:
class CustomTestDataGen(Sequence):
    """Keras ``Sequence`` that yields ``X`` batches (no labels) for inference.

    Each row of ``X`` is the 12 annotation values followed by the flattened
    pixels of the matching image (resized to ``input_size`` and divided by 255).

    Args:
        df: Data frame with the same columns as the global ``test_df``
            ('Id' and the 12 annotation columns). It is copied.
        X_col: Mapping of input column names. Stored for reference only; the
            columns actually read are fixed in ``_ANNOTATION_COLS``.
        batch_size: Number of rows per batch.
        input_size: ``(height, width, channels)`` the images are resized to.
        shuffle: If True, rows are reshuffled in ``on_epoch_end``. Pass False
            when predictions must stay aligned with the rows of ``df``.

    Relies on the notebook globals ``dir_csv`` (data root) and ``test_df``.
    """

    # Annotation columns, in the order they occupy at the start of every X row
    _ANNOTATION_COLS = ['Subject Focus', 'Eyes', 'Face', 'Near', 'Action',
                        'Accessory', 'Group', 'Collage', 'Human', 'Occlusion',
                        'Info', 'Blur']

    def __init__(self, df, X_col,
                 batch_size,
                 input_size=(250, 250, 3),
                 shuffle=True):
        super().__init__()
        self.df = df.copy()
        self.X_col = X_col
        self.batch_size = batch_size
        self.input_size = input_size
        self.list_IDs = np.arange(len(self.df.index))
        self.indexes = np.arange(len(self.df.index))
        self.shuffle = shuffle
        self.n = len(self.df)

    def on_epoch_end(self):
        """Reshuffle the rows when ``shuffle`` is enabled."""
        if self.shuffle:
            self.df = self.df.sample(frac=1).reset_index(drop=True)

    def __get_input(self, path, target_size):
        """Load image ``path`` (an 'Id' value), resize it and scale it by 1/255.

        Raises:
            ValueError: if the wrapped frame does not have as many columns as
                ``test_df`` (i.e. it is not a test-style table).
        """
        # Check if test dataset: test_df
        if len(self.df.keys()) != len(test_df.keys()):
            # Fail loudly: returning a sentinel here would only surface later
            # as an obscure shape error inside the model.
            raise ValueError("Generator Image Data Generation Error")
        img_path = dir_csv+'test/'+str(path)+'.jpg'
        image = tf.keras.preprocessing.image.load_img(img_path)
        image_arr = tf.keras.preprocessing.image.img_to_array(image)
        image_arr = tf.image.resize(image_arr,(target_size[0], target_size[1])).numpy()

        return image_arr/255.

    def __get_data(self, batches):
        """Build ``X_batch`` for the positional row indices ``batches``."""
        rows = self.df.iloc[batches, :]

        # (rows in batch, 12) block of annotation values, one column per annotation
        annotations_batch = rows[self._ANNOTATION_COLS].values

        id_batch = rows['Id'].values
        image_batch = np.asarray([self.__get_input(img_id, self.input_size)
                                  for img_id in id_batch])
        # Flatten every image to one row. The row count is taken from the batch
        # itself because the last batch may hold fewer than batch_size rows.
        image_batch = np.reshape(image_batch, (len(id_batch), -1))

        X_batch = np.concatenate((annotations_batch, image_batch), axis=1)
        return X_batch

    def __getitem__(self, index):
        """Return batch number ``index``."""
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]
        X = self.__get_data(indexes)
        return X

    def __len__(self):
        """Number of batches, rounded UP so every row gets a prediction."""
        return (self.n + self.batch_size - 1) // self.batch_size

# Model

## Build model
We will build a very simple model(i.e. CNN + DNN + Attention layer).
The model embeds the image data with:
- 3 convolution layers(i.e. Conv2D => BN => MaxPool)
- 1 dense layer
Then the other feature data is concatenated with the image embedding.
An attention layer is applied and the output is created with a linear activation.

In [None]:
def build_model(nb_annotations, image_shape):
    """Build and compile the CNN + dense + attention regression model.

    The model takes ONE flat input vector per sample: the first
    ``nb_annotations`` values are the annotations, the remaining
    ``image_shape[0]*image_shape[1]*image_shape[2]`` values are the flattened
    image.

    Args:
        nb_annotations: Number of annotation values at the start of each input row.
        image_shape: ``(height, width, channels)`` of the image before flattening.

    Returns:
        A compiled ``Model`` (loss: MSE, optimizer: Adam, metric: RMSE) with a
        single linear output.
    """

    # Our input features: the annotations followed by the flattened image
    flat_image_size = image_shape[0]*image_shape[1]*image_shape[2]
    inputs = Input(shape=(nb_annotations + flat_image_size,))

    annotations_input = inputs[:,:nb_annotations]
    img_input = inputs[:,nb_annotations:]
    # Reshape flattened image to original image_shape. A Reshape layer keeps
    # the batch dimension implicit and works on symbolic Keras tensors.
    img_input = tf.keras.layers.Reshape(tuple(image_shape))(img_input)

    # 3 convolution layers (Conv2D => BN => MaxPool), doubling the filters each time
    x = Conv2D(16, 3, activation='relu')(img_input)
    x = BatchNormalization(axis=-1)(x)
    x = MaxPooling2D(2)(x)

    x = Conv2D(32, 3, activation='relu')(x)
    x = BatchNormalization(axis=-1)(x)
    x = MaxPooling2D(2)(x)

    x = Conv2D(64, 3, activation='relu')(x)
    x = BatchNormalization(axis=-1)(x)
    x = MaxPooling2D(2)(x)

    # Flatten feature map to a 1-dim tensor so we can add fully connected layers
    x = Flatten()(x)

    # Create a fully connected layer with ReLU activation: 16-value image embedding
    x = Dense(16, activation='relu')(x)
    x = BatchNormalization(axis=-1)(x)

    # Concatenate the annotation inputs and the image embedding
    x = Concatenate()([annotations_input, x])

    # Simple attention layer: one softmax weight per concatenated feature.
    # The width is read from x (nb_annotations + 16) instead of being
    # hard-coded, so the element-wise product below is valid for any
    # nb_annotations.
    attention = Dense(32, activation='relu')(x)
    attention = Dense(int(x.shape[-1]), activation='softmax')(attention)
    x = attention*x

    # Create output layer with a single node and linear activation
    output = Dense(1, activation='linear')(x)

    # Create model
    model = Model(inputs=inputs, outputs=output)

    # Compile model
    model.compile(loss='mse', optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError()])

    return model

## Train Model

### Model training parameters

In [None]:
# Training schedule: epoch budget and rows per batch
num_epochs, batch_size = 100, 32
# (height, width, channels) every image is resized to
target_size = (250, 250, 3)
# Annotation columns fed to the model alongside the image
annotations = [
    'Subject Focus',
    'Eyes',
    'Face',
    'Near',
    'Action',
    'Accessory',
    'Group',
    'Collage',
    'Human',
    'Occlusion',
    'Info',
    'Blur',
]

### Build model

In [None]:
# Build and compile the model (the same call is used on CPU and on GPU):
# one input row = len(annotations) annotation values + the flattened target_size image
model = build_model(len(annotations), target_size)

### Visualize model

In [None]:
# Print the layer-by-layer summary of the model
model.summary()

### Generate data for Train and Validation datasets

In [None]:
# Train data: batches drawn from the oversampled Training split
traingen = CustomTrainDataGen(
    sampled_tr_df,
    # Identity mapping column name -> column name for 'Id' and every annotation
    X_col={col: col for col in (
        'Id', 'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
        'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur')},
    y_col={'Pawpularity': 'Pawpularity'},
    batch_size=batch_size,
    input_size=target_size,
)
# Validation data: batches drawn from the untouched Validation split
valgen = CustomTrainDataGen(
    val_df,
    X_col={col: col for col in (
        'Id', 'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
        'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur')},
    y_col={'Pawpularity': 'Pawpularity'},
    batch_size=batch_size,
    input_size=target_size,
)

### Callbacks
Callbacks used for model training:
- ReduceLROnPlateau: automatically reduce the learning rate during the training
- EarlyStopping: automatically stops the training when it doesn't learn anymore
- ModelCheckpoint: saves the weights as the model trains (here we only save the weights of the best model)
- CSVLogger: saves the logs as a CSV file
- Tensorboard: saves the logs for tensorboard visualization

In [None]:
# Multiply the learning rate by 0.2 after 3 epochs without val_loss improvement.
# min_lr is the floor of that schedule and must sit BELOW the starting rate:
# the model is compiled with the default 'adam' optimizer (learning rate 1e-3),
# so a floor of 1e-3 would make this callback a no-op.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-6)

# Stop training after 30 epochs without val_loss improvement
early_stop = EarlyStopping(monitor='val_loss', patience=30)

# Root folder for everything written during training
dir_path_batchtr = './Train_logs'
os.makedirs(dir_path_batchtr, exist_ok=True)

# Checkpoint: keep only the weights of the epoch with the lowest val_loss
dir_weight_path_batchtr = dir_path_batchtr + '/Weights'
os.makedirs(dir_weight_path_batchtr, exist_ok=True)
checkpoint_name = dir_weight_path_batchtr + '/weights_best.hdf5'
checkpoint = ModelCheckpoint(checkpoint_name, monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True, mode='min')

# Logger for history: per-epoch metrics appended to a CSV file
dir_hist_path_batchtr = dir_path_batchtr + '/Histories'
os.makedirs(dir_hist_path_batchtr, exist_ok=True)
logger_name = dir_hist_path_batchtr + '/history_log.csv'
logger = CSVLogger(logger_name, append=True, separator=',')

# Tensorboard log
dir_tensorboard_log = "./tensorboard_logs"
os.makedirs(dir_tensorboard_log, exist_ok=True)
tensorboard_callback = TensorBoard(log_dir=dir_tensorboard_log)

### Training

In [None]:
# num_epochs = 100 - 63

# checkpoint_name = '../input/version2/weights_best.hdf5'
# model.load_weights(checkpoint_name)

In [None]:
# # IF CPU or GPU
# model.fit(traingen,
#           validation_data=valgen,
#           epochs=num_epochs,
#           use_multiprocessing=False,
#           callbacks=[reduce_lr,early_stop,checkpoint,logger,tensorboard_callback],
#           verbose=2)

### Load pretrained weights

In [None]:
# Rebind checkpoint_name to the previously trained weights file and load it
# into the freshly built model (the training cell above is commented out)
checkpoint_name = '../input/version21/weights_best.hdf5'
model.load_weights(checkpoint_name)

### Generate data for Test dataset

In [None]:
# Test data: one row per batch and no shuffling, so the predictions come out
# in the same order as the rows of test_df
testgen = CustomTestDataGen(
    test_df,
    # Identity mapping column name -> column name for 'Id' and every annotation
    X_col={col: col for col in (
        'Id', 'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
        'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur')},
    batch_size=1,
    input_size=target_size,
    shuffle=False,
)

### Predict

In [None]:
# Run the model over every batch produced by the test generator
predictions = model.predict(testgen)

### Export prediction as a CSV file

In [None]:
# New dataframe for predictions with the id from test_df
pred_df = pd.DataFrame({'Id':test_df['Id']})
# model.predict returns one column per output node; flatten it to a 1-D vector
# (pandas raises if its length does not match the number of test rows).
# The output layer is linear, so predictions are unbounded: clip them to the
# valid Pawpularity range of 1 to 100.
pred_df['Pawpularity'] = np.clip(np.asarray(predictions).reshape(-1), 1, 100)

# Save as a CSV file
pred_df.to_csv('./submission.csv', index=False)

### Recheck with validation data

In [None]:
# Validation rows again, this time one row per batch and unshuffled so the
# predictions can be compared row by row with val_df
valgen_test = CustomTrainDataGen(
    val_df,
    # Identity mapping column name -> column name for 'Id' and every annotation
    X_col={col: col for col in (
        'Id', 'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory',
        'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur')},
    y_col={'Pawpularity': 'Pawpularity'},
    batch_size=1,
    input_size=target_size,
    shuffle=False,
)

In [None]:
# Predict on the validation rows. NOTE: this rebinds `predictions`, replacing
# the test-set predictions computed earlier.
predictions = model.predict(valgen_test)
# First 10 predicted scores, then the first 10 true scores; valgen_test was
# built with shuffle=False and batch_size=1 to keep the two in the same row order
print(predictions[:10])
print(val_df.iloc[:10]['Pawpularity'])