# Intro
Welcome to the [PetFinder.my - Pawpularity Contest](https://www.kaggle.com/c/petfinder-pawpularity-score/data) competition.
![](https://storage.googleapis.com/kaggle-competitions/kaggle/25383/logos/header.png)
<span style="color: royalblue;">Please vote the notebook up if it helps you. Feel free to leave a comment above the notebook. Thank you. </span>

# Libraries

In [None]:
import os
import pandas as pd
import numpy as np
import cv2
import matplotlib
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor

import tensorflow_addons as tfa

from tensorflow.keras.utils import Sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPool2D, Activation
from tensorflow.keras.optimizers import RMSprop,Adam
from tensorflow.keras.applications import ResNet50

import warnings
# Suppress all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Set the TF_XLA_FLAGS environment variable.
# NOTE(review): this runs after the TensorFlow imports above - confirm the flag is still picked up.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'

# Path

In [None]:
# Directory holding the competition files; later cells append file and folder names to it.
path = '/kaggle/input/petfinder-pawpularity-score/'
# Show what the directory contains.
os.listdir(path)

# Load Data

In [None]:
# Read the training table, the test table and the sample submission
# from the competition directory.
train_data, test_data, samp_subm = (
    pd.read_csv(path + csv_name)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Overview

In [None]:
# Report the row counts of both tables and the column count of the train table.
n_train_rows, n_train_columns = train_data.shape
print('Number of train samples: ', n_train_rows)
print('Number of test samples: ', test_data.shape[0])
print('Number of features: ', n_train_columns)

In [None]:
# Preview the first rows of the training table.
train_data.head()

# Load Image
We load and plot the first image of the train data set:

In [None]:
# Identifier of the first training sample.
id_ = train_data.at[0, 'Id']
# Image files are named after the identifier.
file = f'{id_}.jpg'
# Confirm that the image is present in the train folder.
train_files = os.listdir(path + 'train')
file in train_files

In [None]:
# Read the image from the train folder and report its array shape.
image_path = path + 'train/' + file
img = cv2.imread(image_path)
print('Image shape:', img.shape)

In [None]:
# Show the image without tick labels; the BGR pixels are converted to RGB
# before they are handed to matplotlib.
fig, axs = plt.subplots(figsize=(7, 7))
rgb_img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
axs.imshow(rgb_img)
axs.set(xticklabels=[], yticklabels=[])
plt.show()

# Plot Examples

In [None]:
# Draw the first 25 training images in a 5 x 5 grid, each titled with its score.
fig, axs = plt.subplots(5, 5, figsize=(25, 25))
fig.subplots_adjust(hspace=.1, wspace=.1)
axs = axs.ravel()
for i, ax in enumerate(axs):
    image_file = train_data.loc[i, 'Id'] + '.jpg'
    score = train_data.loc[i, 'Pawpularity']
    image = cv2.imread(path + 'train/' + image_file)
    # Convert BGR to RGB before plotting.
    ax.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    ax.set_title(f'Pawpularity: {score}')
    ax.set(xticklabels=[], yticklabels=[])
plt.show()

# Split Data

In [None]:
# Hold out 20 % of the rows for validation (fixed seed), then renumber both
# frames from 0 so later cells can address rows by 0..n-1.
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=2021)
train_data = train_data.reset_index(drop=True)
val_data = val_data.reset_index(drop=True)

In [None]:
# Features are every column between the first one and the last one;
# the target is the 'Pawpularity' column.
train_feature_columns = train_data.columns[1:-1]
X_train = train_data.loc[:, train_feature_columns]
y_train = train_data['Pawpularity']

val_feature_columns = val_data.columns[1:-1]
X_val = val_data.loc[:, val_feature_columns]
y_val = val_data['Pawpularity']

# The test features are every column after the first one.
test_feature_columns = test_data.columns[1:]
X_test = test_data.loc[:, test_feature_columns]

# Model Based On Csv

In [None]:
# Gradient-boosted regressor trained on the tabular features only.
xgb_params = {'objective': 'reg:squarederror', 'n_estimators': 800}
model_csv = XGBRegressor(**xgb_params)
model_csv.fit(X_train, y_train)

# Score the model on the held-out validation rows.
y_val_pred = model_csv.predict(X_val)
val_mae = mean_absolute_error(y_val, y_val_pred)
print('MAE:', val_mae)

Predict Test Data

In [None]:
# Predict the test rows with the tabular model ...
y_test = model_csv.predict(X_test)
# ... and keep the result in its own column of the submission frame.
samp_subm['Pawpularity_csv'] = y_test

# Model Based On Images

**Data Generator** <br>
To load the data on demand we define a data generator.

In [None]:
# Number of samples per batch produced by the data generator.
batch_size = 64
# Images are resized to img_size x img_size pixels.
img_size = 128
# Number of colour channels per image.
img_channel = 3

In [None]:
class DataGenerator(Sequence):
    """Keras ``Sequence`` that loads, resizes and scales images batch by batch.

    Parameters
    ----------
    path : str
        Prefix (including the trailing separator) prepended to ``<ID>.jpg``
        to build each image file name.
    list_IDs : sequence of str
        Image identifiers, one per sample.
    labels : sequence of numbers
        Target values, aligned position by position with ``list_IDs``.
    batch_size : int
        Maximum number of samples per batch.
    img_size : int
        Every image is resized to ``img_size x img_size`` pixels.
    img_channel : int
        Number of colour channels of the batch array.

    Raises
    ------
    ValueError
        If ``list_IDs`` and ``labels`` differ in length.
    """

    def __init__(self, path, list_IDs, labels, batch_size, img_size, img_channel):
        super().__init__()
        if len(list_IDs) != len(labels):
            raise ValueError(
                'list_IDs and labels must have the same length, got '
                f'{len(list_IDs)} and {len(labels)}'
            )
        self.path = path
        self.list_IDs = list_IDs
        self.labels = labels
        self.batch_size = batch_size
        self.img_size = img_size
        self.img_channel = img_channel
        self.indexes = np.arange(len(self.list_IDs))
        # Positional copies: batches are sliced by position, so the result
        # does not depend on the index of a pandas Series passed in.
        self._ids = np.asarray(list_IDs)
        self._labels = np.asarray(labels, dtype=np.float32)

    def __len__(self):
        """Return the number of batches; the last one may be smaller."""
        return (len(self.indexes) + self.batch_size - 1) // self.batch_size

    def __getitem__(self, index):
        """Return the ``(X, y)`` pair of batch number ``index``."""
        start = index * self.batch_size
        batch_positions = self.indexes[start:start + self.batch_size]
        # IDs and labels are selected with the SAME positions, so each image
        # is paired with its own target.
        return self.__data_generation(self._ids[batch_positions],
                                      self._labels[batch_positions])

    def __data_generation(self, list_IDs_temp, labels_temp):
        """Load the images of one batch.

        Returns ``X`` with shape ``(n, img_size, img_size, img_channel)`` and
        pixel values divided by 255, and ``y`` with shape ``(n, 1)``, where
        ``n`` is the number of IDs actually in the batch (no zero padding).

        Raises ``FileNotFoundError`` when an image cannot be read.
        """
        n_samples = len(list_IDs_temp)
        X = np.zeros((n_samples, self.img_size, self.img_size, self.img_channel),
                     dtype=np.float32)
        y = np.asarray(labels_temp, dtype=np.float32).reshape(n_samples, 1)
        for i, ID in enumerate(list_IDs_temp):
            file_name = f'{self.path}{ID}.jpg'
            img = cv2.imread(file_name)
            # cv2.imread signals failure with None instead of raising.
            if img is None:
                raise FileNotFoundError(f'Could not read image: {file_name}')
            img = cv2.resize(img, (self.img_size, self.img_size))
            X[i] = img / 255
        return X, y

Use the generator to define train, validation and test data:

In [None]:
# Settings shared by all three generators.
generator_kwargs = dict(batch_size=batch_size, img_size=img_size, img_channel=img_channel)

# Training and validation images both live in the train folder.
train_generator = DataGenerator(path+'train/', train_data['Id'],
                                train_data['Pawpularity'], **generator_kwargs)
val_generator = DataGenerator(path+'train/', val_data['Id'],
                              val_data['Pawpularity'], **generator_kwargs)

# The test generator is given the sample-submission column as its labels.
test_generator = DataGenerator(path+'test/', test_data['Id'],
                               samp_subm['Pawpularity'], **generator_kwargs)

Load pretrained model:

In [None]:
# ResNet50 without its classification head, initialised from a local weight file.
weights = '../input/models/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5'
input_shape = (img_size, img_size, img_channel)
conv_base = ResNet50(include_top=False, weights=weights, input_shape=input_shape)
# Leave the backbone trainable so its weights are updated during training.
conv_base.trainable = True

In [None]:
epochs = 6

# Regression head on top of the ResNet50 feature extractor.
model = Sequential()
model.add(conv_base)
model.add(Flatten())
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))
# NOTE(review): a ReLU on the output clamps predictions at 0 - confirm this is intended.
model.add(Dense(1, activation='relu'))

# `learning_rate` replaces the deprecated `lr` argument.
model.compile(optimizer=RMSprop(learning_rate=1e-3),
              loss='mse',
              metrics=['mae'])

model.summary()

# Validate on the held-out split: the test generator only carries the
# sample-submission column as labels, so metrics computed on it are meaningless.
# `fit` accepts a Sequence directly; `fit_generator` is deprecated.
history = model.fit(train_generator,
                    validation_data=val_generator,
                    epochs=epochs)

In [None]:
# Predict the test images; `predict` accepts a Sequence directly and replaces
# the deprecated `predict_generator`.
predict = model.predict(test_generator, verbose=1)

In [None]:
# Flatten the prediction array to one value per row and keep only as many
# rows as the submission frame has.
flat_predictions = predict.reshape(len(predict))
samp_subm['Pawpularity_jpg'] = flat_predictions[:len(samp_subm)]

# Compare Val Predictions
We compare the predictions based on the csv data and on the image files for the validation data set. For the final prediction we can calculate the mean of both.

In [None]:
# Image-model predictions for the validation rows; `predict` replaces the
# deprecated `predict_generator`.
y_val_pred = model.predict(val_generator, verbose=1)

# One row per validation sample: true score next to both model predictions.
# The image predictions are flattened and cut to the number of validation rows.
df_compare = pd.DataFrame({
    'Pawpularity': y_val,
    'Pawpularity_csv': model_csv.predict(X_val),
    'Pawpularity_jpg': np.reshape(y_val_pred, (len(y_val_pred),))[0:len(y_val)],
})

In [None]:
# Mean absolute error of each model against the true validation scores.
for mae_label, prediction_column in (('MAE_csv:', 'Pawpularity_csv'),
                                     ('MAE_jpg:', 'Pawpularity_jpg')):
    print(mae_label, mean_absolute_error(df_compare['Pawpularity'],
                                         df_compare[prediction_column]))

# Export

In [None]:
# Final score: the average of the tabular and the image prediction.
prediction_sum = samp_subm['Pawpularity_csv'].add(samp_subm['Pawpularity_jpg'])
samp_subm['Pawpularity'] = prediction_sum.mul(0.5)
samp_subm.head()

In [None]:
# Write only the identifier and the blended score, without the row index.
samp_subm.to_csv('submission.csv', columns=['Id', 'Pawpularity'], index=False)