# PetFinder.my Pawpularity Contest
## Metadata and image analysis
![cute_kitty](https://www.petfinder.my/images/cuteness_meter.jpg)


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import cv2
import os
import math
import seaborn as sns

# Metadata EDA
Let's do EDA using the metadata.

In [None]:
# Root of the Kaggle input mount and the competition's dataset folder.
input_dir = "/kaggle/input"
competition = "petfinder-pawpularity-score"


def _competition_path(name):
    """Return the path of *name* inside the competition dataset folder."""
    return os.path.join(input_dir, f"{competition}/{name}")


# CSV tables shipped with the competition.
sample_submission = _competition_path("sample_submission.csv")
train_csv = _competition_path("train.csv")
test_csv = _competition_path("test.csv")

# Image folders.
train_dir = _competition_path("train")
test_dir = _competition_path("test")


In [None]:
# Load the training table from train_csv into a DataFrame.
train_df = pd.read_csv(train_csv)

In [None]:
# Preview the first rows of the training table.
train_df.head()

In [None]:
# Summary statistics for the columns of the training table.
train_df.describe()

In [None]:
# List the entries of the training image folder and show how many there are.
train_files = os.listdir(train_dir)
len(train_files)

In [None]:
# Histogram (with KDE) of the Pawpularity score, with its mean marked.
# mean_p stays a notebook-level name because the next cell prints it.
mean_p = train_df['Pawpularity'].mean()

hist_fig, hist_ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    data=train_df,
    x='Pawpularity',
    color='orange',
    kde=True,
    ax=hist_ax,
)
hist_ax.axvline(mean_p, c='purple', ls='-', lw=4, label="Mean")
hist_ax.legend()
plt.show()

In [None]:
# Report the mean Pawpularity computed for the histogram's mean line.
print("Mean Pawpularity:", mean_p)

In [None]:
# Load the test table from test_csv into a DataFrame.
test_data = pd.read_csv(test_csv)

In [None]:
# Number of rows in the test table.
len(test_data)

Let's find the mean and standard deviation of Pawpularity for each value of every predictor.

In [None]:
# For every column between the first and the last one, print the mean and
# (in parentheses) the standard deviation of Pawpularity, first for rows
# where the column equals 1 and then for rows where it equals 0.
scores = train_df['Pawpularity']
for predictor in train_df.columns[1:-1]:
    flag = train_df[predictor]
    with_flag = scores[flag == 1]
    without_flag = scores[flag == 0]
    print(
        f"{predictor}: {with_flag.mean()} ({with_flag.std()}), "
        f"{without_flag.mean()} ({without_flag.std()})"
    )
                          
    

Let's look at the histogram of Pawpularity for each value of every predictor.

In [None]:
# Grid of Pawpularity histograms: one row per predictor, the left panel for
# rows where the predictor equals 1 and the right panel for rows where it
# equals 0. Each panel marks the mean of its own subset with a red line.
# NOTE(review): columns[1:-1] presumably skips Id (first) and Pawpularity
# (last) -- confirm against the CSV layout.
predictors = train_df.columns[1:-1]

fig = plt.figure(figsize=(18,40))
for i, col in enumerate(predictors):
    for j, flag in enumerate((1, 0)):
        # Subplot indices are 1-based: row i holds panels 2i+1 and 2i+2.
        ax = plt.subplot(len(predictors), 2, (i*2)+j+1)
        # Each panel is drawn from its own subset, so the frame passed as
        # `data` always matches the column being plotted.
        subset = train_df[train_df[col]==flag]
        sns.histplot(data=subset, x='Pawpularity', bins=25, fill=True, kde=True, ax=ax)
        ax.set_xlabel(None)
        ax.axvline(subset['Pawpularity'].mean(), c='red', ls='-', lw=3, label="Mean")
        ax.set_title(f"{col} ={flag}, count={subset['Pawpularity'].count()}", fontweight='bold', color="blue")

plt.suptitle("Pawpularity distribution for different predectors ", y=0.9,
           fontsize=20, fontweight='bold')
plt.show()

Let's explore if image resolution has any influence on Pawpularity score.

In [None]:
from PIL import Image

In [None]:
# Record every training image's width, height and pixel count ("resolution")
# as new columns on train_df, in that order.
sizes = []
for image_id in train_df['Id']:
    im_path = os.path.join(train_dir, image_id + ".jpg")
    # Only the dimensions are needed, so the image is opened with PIL rather
    # than decoded with cv2. The context manager closes the file handle after
    # each image instead of leaving one open per row.
    with Image.open(im_path) as im:
        sizes.append(im.size)  # (width, height)

# Build the columns in one step, aligned on train_df's own index, instead of
# writing cell by cell inside the loop.
dims = pd.DataFrame(sizes, columns=['im_width', 'im_height'], index=train_df.index)
train_df['im_width'] = dims['im_width']
train_df['im_height'] = dims['im_height']
train_df['resolution'] = dims['im_width'] * dims['im_height']

In [None]:
# Histogram of image resolution (pixel count) with its mean marked.
plt.figure(figsize=(15,7))
sns.histplot(data=train_df, x='resolution',color='orange')
# Use a dedicated name: reusing `mean_p` here would overwrite the mean
# Pawpularity that the earlier "Mean Pawpularity" print cell reports, so
# re-running that cell afterwards would print the mean resolution instead.
mean_resolution = train_df['resolution'].mean()
plt.axvline(mean_resolution, c='purple', ls='-', lw=4, label="Mean")
plt.legend()
plt.show()

In [None]:
# Rows whose image has fewer than 100,000 pixels, and how many there are.
is_low = train_df['resolution'] < 100_000
low_resolutions = train_df[is_low]
len(low_resolutions)

In [None]:
# Rows whose image has more than 1,500,000 pixels, and how many there are.
is_high = train_df['resolution'] > 1_500_000
high_resolutions = train_df[is_high]
len(high_resolutions)

In [None]:
def _pawpularity_hist(ax, subset, title):
    """Draw the Pawpularity histogram of *subset* on *ax* and return its mean."""
    sns.histplot(data=subset, x=subset['Pawpularity'], bins=25, fill=True, kde=True, ax=ax)
    ax.set_xlabel(None)
    mean_score = subset['Pawpularity'].mean()
    ax.axvline(mean_score, c='red', ls='-', lw=3, label="Mean")
    ax.set_title(title, fontweight='bold', color="#e7273e")
    return mean_score


# Side-by-side comparison: low-resolution images on the left,
# high-resolution images on the right.
fig, (low_ax, high_ax) = plt.subplots(1, 2, figsize=(15,7))
mean_p1 = _pawpularity_hist(
    low_ax, low_resolutions, "Lowest resolution Pawpularity distribution"
)
mean_p2 = _pawpularity_hist(
    high_ax, high_resolutions, "Highest resolution Pawpularity distribution"
)


# Conclusion based on metadata analysis. 
* The Pawpularity distribution is equally influenced by each predictor and there is no evident trend. 
* None of the metadata, including image resolution, is biased toward any trend of Pawpularity, so it's difficult to accurately predict Pawpularity from metadata alone.
* The only hope is to find something in images.

# Visualizing Training images
Let's visualize images randomly. 

In [None]:
def see_animals(df, meta_cols=None):
    """Show up to 18 training images from ``df`` in a 3-column grid.

    Each image is titled with its Pawpularity score followed by the values
    of the metadata columns, three per line.

    Args:
        df: DataFrame with at least the ``Id`` and ``Pawpularity`` columns;
            only its first 18 rows are shown. Images are read from
            ``train_dir`` as ``<Id>.jpg``.
        meta_cols: Columns to list in each title. Defaults to every column
            of ``df`` except ``Id``, ``Pawpularity`` and the image-size
            columns (``im_width``, ``im_height``, ``resolution``).

    Raises:
        FileNotFoundError: If an image cannot be read.
    """
    if meta_cols is None:
        # Take the columns from `df` itself rather than slicing the global
        # train_df by position: once the image-size columns are appended,
        # columns[1:-1] also covers Pawpularity, im_width and im_height.
        skipped = {'Id', 'Pawpularity', 'im_width', 'im_height', 'resolution'}
        meta_cols = [c for c in df.columns if c not in skipped]

    max_imgs = min(18, len(df))
    if max_imgs == 0:
        # Nothing to draw; avoid creating an empty figure.
        return

    cols = 3
    rows = int(math.ceil(max_imgs / cols))

    fig = plt.figure(figsize=(15, 30))
    try:
        for i in range(max_imgs):
            row = df.iloc[i]
            im_path = os.path.join(train_dir, row['Id'] + ".jpg")
            im = cv2.imread(im_path)
            if im is None:
                # cv2.imread signals failure by returning None, not raising.
                raise FileNotFoundError(f"Could not read image: {im_path}")
            # OpenCV loads BGR; matplotlib expects RGB.
            im = cv2.cvtColor(im, cv2.COLOR_BGR2RGB)

            title = f"Pawpularity: {row['Pawpularity']}, "
            # A separate index keeps the subplot counter `i` untouched.
            for k, col in enumerate(meta_cols):
                if k % 3 == 0:
                    title += "\n"
                title += f"{col}: {row[col]}, "

            plt.subplot(rows, cols, i + 1)
            plt.title(title)
            plt.imshow(im)
            plt.axis('off')

        plt.tight_layout()
        plt.show()
    finally:
        # Release the figure even when an image fails to load.
        plt.close(fig)

## Pets with high Pawpularity

In [None]:
# Random sample of pets scoring above 90. Cap the sample size at the number
# of matching rows: DataFrame.sample raises ValueError when asked for more
# rows than exist.
high_pawpularity = train_df[train_df['Pawpularity'] > 90]
see_animals(high_pawpularity.sample(min(18, len(high_pawpularity))))

## Pets with low Pawpularity

In [None]:
# Random sample of pets scoring below 20. Cap the sample size at the number
# of matching rows: DataFrame.sample raises ValueError when asked for more
# rows than exist.
low_pawpularity = train_df[train_df['Pawpularity']<20]
see_animals(low_pawpularity.sample(min(18, len(low_pawpularity))))

# Conclusion based on image observations
## Most pets with high Pawpularity are looking
* Happy
* Healthy
* Funny

## Most pets with low Pawpularity are looking
* Not that happy
* Not that healthy

### So finally it seems that we need a model that can catch pets moods and health from images :)