In [None]:


import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import math
import os

The dataset contains image files of pet animals. Features describing each image are provided in an additional CSV file.
Let's look into the train CSV file.

In [None]:
# Load the training metadata CSV into a DataFrame.
train_csv = pd.read_csv(
    "../input/petfinder-pawpularity-score/train.csv"
)

In [None]:
# Preview the first 20 rows of the training metadata.
train_csv.head(20)

In [None]:
# Summary statistics of the training metadata columns.
train_csv.describe()

The train.csv file contains 14 columns.

* id: this column denotes the file name in train image dataset.

The following features have binary values.

* subject focus: denotes if the animal is looking into the camera.
* eyes, face: denote if the eyes and face are visible in the image.
* near: denotes if the animal is near or far in the image.
* accessory: denotes if the animal is wearing something.
* group: denotes if there is a single animal or a group.
* collage: denotes if the image is a collage or not.
* Human: denotes if the image has a human in it or not.
* occlusion: denotes if the animal is blocked by anything in the image.
* info: denotes if the image has any info related to the animal.
* blur: denotes if the animal is blurred or not.

The target variable is:
* pawpularity: an integer value that lies between 1 and 100.

In [None]:
# Let's look into test.csv: load the test metadata into a DataFrame.
test_csv = pd.read_csv(
    "../input/petfinder-pawpularity-score/test.csv"
)

In [None]:
# Display the test metadata DataFrame.
test_csv

test.csv has all the features except the Pawpularity column, which we have to predict.

Let's look into the submission format.

In [None]:
# Load the sample submission file to see the expected output format.
sample_sub = pd.read_csv(
    "../input/petfinder-pawpularity-score/sample_submission.csv"
)

In [None]:
# Display the sample submission DataFrame.
sample_sub

In [None]:
# The target is the Pawpularity column; draw a 100-bin histogram of its values.
# Trailing semicolons suppress the matplotlib object repr in the cell output.
train_csv['Pawpularity'].plot.hist(bins=100, figsize=(15, 6));
plt.title("Target distribution", fontsize=16, weight='bold');

In [None]:
# Summary statistics of the target column.
train_csv['Pawpularity'].describe()

In [None]:
# Correlation heatmap over the numeric columns only.
# The string `Id` column has to be excluded explicitly: pandas >= 2.0 no longer
# drops non-numeric columns silently in DataFrame.corr() and raises instead.
numeric_columns = train_csv.select_dtypes(include='number')

plt.figure(figsize=(16,10))
sns.heatmap(numeric_columns.corr(), annot=True, fmt='.1g', cmap='coolwarm', square=True)
plt.title('Correlation Matrix', fontsize=20, fontweight='bold')
plt.show()

In [None]:
# Correlation of each numeric column with the target.
# Non-numeric columns (e.g. the string `Id`) are filtered out first, because
# pandas >= 2.0 no longer skips them silently in DataFrame.corrwith().
train_csv.select_dtypes(include='number').corrwith(train_csv['Pawpularity'])

From above correlation values we can see that the features that are most correlated with pawpularity column are:

Blur

Group

Accessory

Image with highest pawpularity score and its features

In [None]:
# Take the first row whose score equals the maximum Pawpularity.
is_top_score = train_csv["Pawpularity"] == train_csv["Pawpularity"].max()
most_pawpular = train_csv[is_top_score].iloc[0]

# Build the image path from the row's Id and load the image.
path = "../input/petfinder-pawpularity-score/train/" + most_pawpular['Id'] + ".jpg"
im = plt.imread(path)
file_name = path.split("/")[-1]

# Show the image without axis ticks, titled with its file name.
plt.figure(figsize=(15, 6))
plt.imshow(im)
plt.title(file_name)
plt.xticks([])
plt.yticks([])

print("Accompanying features:")
# The Id is the file name with the extension stripped; display the matching row(s).
train_csv[train_csv['Id'] == file_name.split('.')[0]]

Image with least pawpularity score and its features

In [None]:
# Take the first row whose score equals the minimum Pawpularity.
is_bottom_score = train_csv["Pawpularity"] == train_csv["Pawpularity"].min()
least_pawpular = train_csv[is_bottom_score].iloc[0]

# Build the image path from the row's Id and load the image.
path = "../input/petfinder-pawpularity-score/train/" + least_pawpular['Id'] + ".jpg"
im = plt.imread(path)
file_name = path.split("/")[-1]

# Show the image without axis ticks, titled with its file name.
plt.figure(figsize=(15, 6))
plt.imshow(im)
plt.title(file_name)
plt.xticks([])
plt.yticks([])

print("Accompanying features:")
# The Id is the file name with the extension stripped; display the matching row(s).
train_csv[train_csv['Id'] == file_name.split('.')[0]]

In [None]:

# Add a column holding the full path of each training image, derived from its Id.
train_csv['img_path'] = train_csv['Id'].map(
    '../input/petfinder-pawpularity-score/train/{}.jpg'.format
)


**Evaluation Metric**

Metric for Evaluation
 Root Mean Square Error(RMSE) is used for evaluation of results.
 
RMSE is defined as


$$\sqrt{\frac{1}{n}\sum_{i=1}^{n}{\big(\hat{y}_i - y_i\big)^2}}$$

where $n$ denotes the number of samples, $y_i$ the ground truth value and $\hat{y}_i$ the prediction value.

In [None]:
# Preview the first two rows, now including the added img_path column.
train_csv.head(2)

In [None]:
# Add a running 0..n-1 counter column.
train_csv['ind'] = np.arange(len(train_csv))

In [None]:
# Build the list of [image, tabular features] samples and the parallel target list.
train_data=[]
target=[]

# Extract the tabular feature matrix once. The original re-dropped the columns
# and re-built a boolean mask on every iteration, which made the loop quadratic.
feature_matrix = train_csv.drop(['Id','ind','img_path','Pawpularity'],axis=1).values

for features, score, path in zip(feature_matrix, train_csv['Pawpularity'], train_csv['img_path']):
    img1=cv2.imread(path)
    # cv2.imread signals a missing/unreadable file by returning None rather than
    # raising; fail here with the offending path instead of inside cv2.resize.
    if img1 is None:
        raise FileNotFoundError(f"Could not read image: {path}")
    img1=cv2.resize(img1,(128,128))
    # OpenCV loads images as BGR; convert to RGB channel order.
    img1=cv2.cvtColor(img1, cv2.COLOR_BGR2RGB)

    target.append(score)
    train_data.append([img1,features])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for validation, shuffled with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_data,
    target,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [None]:
# Each training sample is an [image, features] pair; split the pairs into two parallel lists.
train_img_data = [image for image, _ in X_train]
train_feature_data = [features for _, features in X_train]

In [None]:
# Stack the images into one array and divide the pixel values by 255.
# NOTE: the name is reassigned in place, so re-running this cell without first
# rebuilding the list divides by 255 a second time.
train_img_data = np.array(train_img_data) / 255.0
train_img_data.shape

In [None]:
# Stack the per-sample feature rows into a 2-D array and show its shape.
train_feature_data = np.array(train_feature_data)
train_feature_data.shape

In [None]:

# Convert both target lists to NumPy arrays.
y_train, y_valid = np.array(y_train), np.array(y_valid)

In [None]:
# Split the held-out data in half to obtain two separate validation sets.
X_valid1, X_valid2, y_valid1, y_valid2 = train_test_split(
    X_valid,
    y_valid,
    test_size=0.5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Each validation sample is an [image, features] pair; split each set into parallel lists.
validation_img_data2 = [image for image, _ in X_valid2]
validation_feature_data2 = [features for _, features in X_valid2]

validation_img_data1 = [image for image, _ in X_valid1]
validation_feature_data1 = [features for _, features in X_valid1]

In [None]:
# Stack both validation sets into arrays; images are divided by 255 like the training images.
# NOTE: the names are reassigned in place, so re-running this cell without first
# rebuilding the lists divides the images by 255 a second time.
validation_img_data2 = np.array(validation_img_data2) / 255.0
validation_feature_data2 = np.array(validation_feature_data2)

validation_img_data1 = np.array(validation_img_data1) / 255.0
validation_feature_data1 = np.array(validation_feature_data1)

In [None]:
import tensorflow as tf
from tensorflow import keras

In [None]:
from tensorflow.keras.models import Sequential

from tensorflow.keras.layers import Flatten,Dropout,BatchNormalization
from tensorflow.keras.layers import Convolution2D
from tensorflow.keras.layers import MaxPooling2D
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Concatenate
from tensorflow.keras import Input
from tensorflow.keras.layers import concatenate
from tensorflow.keras.callbacks import EarlyStopping,ReduceLROnPlateau

In [None]:
# Import the class directly instead of the `resnet50` module. The original bound the
# model to the same name as the imported module (`resnet50 = resnet50.ResNet50(...)`),
# so `resnet50.ResNet50` stopped resolving once the model had been assigned.
from tensorflow.keras.applications.resnet50 import ResNet50

# Backbone without the classification head, for 128x128 RGB inputs, with weights
# loaded from a local file.
resnet50 = ResNet50(
    weights='../input/resnet50-weights/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5',
    include_top=False,
    input_shape=(128, 128, 3),
)

In [None]:
# Number of layers in the loaded backbone.
len(resnet50.layers)

In [None]:
# `import sklearn` alone does not guarantee that the `sklearn.metrics` submodule is
# loaded; import it explicitly so `sklearn.metrics.mean_squared_error` always resolves.
import sklearn.metrics

# ---- Image branch: ResNet50 backbone with its first 80 layers frozen ----
for layer in resnet50.layers[:80]:
    layer.trainable=False
first= Flatten()(resnet50.output)
first= Dense(4096,activation="relu")(first)
first=BatchNormalization()(first)
first=Dropout(0.5)(first)

# ---- Tabular branch: the 12 metadata features ----
tabular_input = Input(shape=(12,))
second=Dense(4096,activation="relu")(tabular_input)
second=BatchNormalization()(second)
second=Dropout(0.5)(second)

# ---- Head: concatenate both branches and regress a single value ----
combined = Concatenate(axis=1)([first, second])
result=Dense(4096,activation="relu")(combined)
result=BatchNormalization()(result)
result=Dropout(0.5)(result)
# NOTE(review): the regression output uses a ReLU activation; a linear output is the
# more common choice for regression. Left unchanged here, confirm it is intentional.
result=Dense(1,activation="relu")(result)

# The model takes the backbone's own input tensor plus the tabular input.
# (The original also created a separate, never-connected `image_input`; removed.)
model = tf.keras.Model(inputs=[resnet50.input, tabular_input], outputs=[result])

# Restore the weights of the best epoch when stopping early, so the evaluation below
# does not use weights from up to `patience` epochs past the best validation loss.
early_stopping = EarlyStopping(patience = 70, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(factor=0.1, patience=40,min_lr=1e-9)

model.compile(loss=tf.keras.losses.MeanSquaredError(), optimizer='adam', metrics=[tf.keras.metrics.RootMeanSquaredError(name="rmse"), "mae", "mape"])
predictor=model.fit((train_img_data,train_feature_data),y_train,validation_split=0.2, epochs=200,batch_size=32, callbacks=[early_stopping,reduce_lr])

# Predict on both held-out validation sets.
y_pred=model.predict((validation_img_data1,validation_feature_data1))
y_pred2=model.predict((validation_img_data2,validation_feature_data2))

# Reshape the targets to column vectors before scoring against the predictions.
y_valid1=y_valid1.reshape(-1,1)
y_valid2=y_valid2.reshape(-1,1)

# rmse0 belongs to the first validation set, rmse1 to the second.
mse1 = sklearn.metrics.mean_squared_error(y_valid1,y_pred)
mse2 = sklearn.metrics.mean_squared_error(y_valid2,y_pred2)
rmse0 = math.sqrt(mse1)
rmse1 = math.sqrt(mse2)


In [None]:
# Report the RMSE obtained on each of the two validation sets.
for label, score in (
    ("RMSE on predicted data with first validation set= ", rmse0),
    ("RMSE on predicted data with second validation set= ", rmse1),
):
    print(label, score)

In [None]:
# True vs. predicted scores on the first validation set.
# x-axis: score value; y-axis: position of the sample within the set.
c= np.arange(len(y_valid1))
plt.scatter(y_valid1,c,label="True target value")
plt.scatter(y_pred,c,label="Predicted target value")
# Label the figure so it can be read on its own.
plt.xlabel("Pawpularity")
plt.ylabel("Sample index")
plt.title("First validation set: true vs. predicted")
plt.legend();  # semicolon suppresses the Legend repr in the cell output

In [None]:
# True vs. predicted scores on the second validation set.
# x-axis: score value; y-axis: position of the sample within the set.
c= np.arange(len(y_valid2))
plt.scatter(y_valid2,c,label="True target value")
plt.scatter(y_pred2,c,label="Predicted target value")
# Label the figure so it can be read on its own.
plt.xlabel("Pawpularity")
plt.ylabel("Sample index")
plt.title("Second validation set: true vs. predicted")
plt.legend();  # semicolon suppresses the Legend repr in the cell output