## Pawpularity EDA

### How Pawpularity Score Is Derived

- The Pawpularity Score is derived from each pet profile's page view statistics at the listing pages, using an algorithm that normalizes the traffic data across different pages, platforms (web & mobile) and various metrics.
- Duplicate clicks, crawler bot accesses and sponsored profiles are excluded from the analysis.

In [None]:
import numpy as np
import pandas as pd 
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

import os
from PIL import Image

In [None]:
# Global plot styling: high-resolution figures without the top/right spines.
mpl.rcParams.update({
    'figure.dpi': 200,
    'axes.spines.top': False,
    'axes.spines.right': False,
})

In [None]:
# Root of the competition data; train.csv and the train/ image folder are read from here.
BASE_PATH = "../input/petfinder-pawpularity-score/"
train_csv_path = os.path.join(BASE_PATH, "train.csv")
train = pd.read_csv(train_csv_path)

In [None]:
# Preview the first five rows of the training table.
train.head(n=5)

## 0. Pawpularity Distribution

In [None]:
# Histogram of the target; the y-axis is capped at 700 counts.
fig, ax = plt.subplots(figsize=(12, 7))
sns.histplot(data=train, x='Pawpularity', ax=ax)
ax.set(ylim=(0, 700))
ax.set_title('Pawpularity Distribution', fontweight='bold', fontsize=20)
plt.show()

**Why does the count suddenly spike at a score of 100?**

Wouldn't it be possible to get better generalization results if we excluded these samples from training?

In [None]:
# Average target value over the training set.
mean_pawpularity = train['Pawpularity'].mean()
print(mean_pawpularity)

Most high-scoring notebooks predict a Pawpularity between 37 and 38 for noisy inputs.

When a model is given noise, its default behaviour seems to be to output something close to the average score.

## 1. Image Width/Height Ratio

I think the size of the image is a pretty important feature. It can hint at what kind of camera was used and whether the image has been preprocessed.

In [None]:
def _read_image_size(image_id):
    """Return the ``(width, height)`` of the training image with this id.

    The image is opened in a ``with`` block so its file handle is closed
    right after the size is read, rather than leaving one open handle per
    row of ``train``.
    """
    image_path = os.path.join(BASE_PATH, 'train', image_id + '.jpg')
    with Image.open(image_path) as img:
        return img.size


train['image_size'] = train['Id'].apply(_read_image_size)

In [None]:
# Split each (width, height) tuple in `image_size` into its own column.
for position, column in enumerate(('width', 'height')):
    train[column] = train['image_size'].map(lambda size, position=position: size[position])

In [None]:
fig = plt.figure(figsize=(14, 12))
fig.suptitle('Image Aspect(Ratio)', fontweight='bold', fontsize=20)
fig.set_facecolor('lightgray')

# Layout: one large scatter on the left (5 of 6 grid columns) and a column of
# five small panels on the right, one per 20-point Pawpularity band.
gs = fig.add_gridspec(5, 6)
ax = fig.add_subplot(gs[:,:-1])
sub_axes = [fig.add_subplot(gs[idx,-1]) for idx in range(5)]

# main view: every image as a (width, height) point coloured by its score
ax.scatter(x=train['width'], y=train['height'], c=train['Pawpularity'],
           vmin=0, vmax=100, s=20)
ax.set_xlim(0, 1300)
ax.set_ylim(0, 1300)
ax.set_yticks(np.arange(0,1301,100))
ax.set_xticks(np.arange(0,1301,100))
ax.set_ylabel('Height', fontsize= 12)
ax.set_xlabel('Width', fontsize= 12)

# Guide lines: fixed height/widths, then diagonals from the origin to the
# (width, height) corner of each aspect ratio of interest.
guide_style = dict(color='black', linewidth=0.4, linestyle='dashdot')
ax.axhline(960, **guide_style)
for fixed_width in (1080, 1280):
    ax.axvline(fixed_width, **guide_style)
for corner_width, corner_height in ((960, 960), (720, 960), (540, 960), (1080, 810)):
    ax.plot([0, corner_width], [0, corner_height], **guide_style)

# subview: all images in faint gray, with one score band highlighted per panel.
# A dedicated loop variable is used so the main `ax` is not overwritten.
scores = train['Pawpularity']
for idx, sub_ax in enumerate(sub_axes,1):
    low, high = (idx-1)*20, idx*20
    sub_ax.scatter(x=train['width'], y=train['height'], color='gray', alpha=0.1, s=5)
    # Bands are half-open [low, high) so a boundary score (20, 40, 60, 80) is
    # drawn in exactly one panel; only the last band also includes its upper
    # edge so that a score of 100 is not dropped.
    below_high = scores <= high if idx == len(sub_axes) else scores < high
    train_sub = train[(low <= scores) & below_high]
    sub_ax.scatter(x=train_sub['width'],
                   y=train_sub['height'],
                   c=train_sub['Pawpularity'],
                   vmin=0, vmax=100,
                   alpha=1, s=5)
    sub_ax.set_ylabel(f'<{idx*20}', fontsize= 12)
    sub_ax.set_xticks([])
    sub_ax.set_yticks([])

fig.tight_layout()
plt.show()

Interesting patterns show up in the image sizes and in how the scores are distributed across them.

- Many high-Pawpularity images have a fixed aspect ratio.
    - Or their height or width is fixed (960/1280).
- The ratios are `1:1`, `3:4`, `9:16`, and `4:3` (i.e. `12:9`). These are the screen aspect ratios that people are generally familiar with.
- However, this feature by itself did not change the performance. Based on this, I think it would be good to consider augmentation.

## 2. Individual Meta Features

Let's look at each distribution and examples for meta features.

In [None]:
def meta_feature_dist(feature):
    """Show how Pawpularity relates to one metadata column.

    Displays a table of the mean Pawpularity for each value of ``feature``,
    then draws a Pawpularity histogram split (via ``hue``) by that column.

    Args:
        feature: name of a column in the global ``train`` frame.
    """
    mean_by_value = train.groupby(feature)['Pawpularity'].mean().to_frame()
    display(mean_by_value)

    fig, ax = plt.subplots(figsize=(12, 5))
    sns.histplot(data=train, x='Pawpularity', hue=feature, ax=ax)
    plt.show()

def _show_band_samples(subfig, feature, value, low, high, include_high, color):
    """Fill one sub-figure with up to three sample images from a score band.

    Selects the rows of the global ``train`` frame where ``feature == value``
    and Pawpularity lies in ``[low, high)`` (or ``[low, high]`` when
    ``include_high`` is true), and draws a reproducible random sample of them.
    """
    n_samples = 3
    axes = subfig.subplots(1, n_samples)
    subfig.set_facecolor(color)

    scores = train['Pawpularity']
    below_high = scores <= high if include_high else scores < high
    candidates = train[(low <= scores) & below_high & (train[feature] == value)]
    # sample() raises ValueError when asked for more rows than exist, so cap
    # the request; a sparse band then simply leaves some panels empty.
    picked = candidates.sample(min(n_samples, len(candidates)), random_state=0)

    for image_id, ax in zip(picked['Id'], axes):
        # Close the file as soon as the pixels have been handed to imshow.
        with Image.open(os.path.join(BASE_PATH, 'train', image_id + '.jpg')) as img:
            ax.imshow(img)
    for ax in axes:
        ax.set_xticks([])
        ax.set_yticks([])


def meta_feature_samples(feature):
    """Show example images for ``feature`` == 0 and == 1 in each score band.

    Builds a 5x2 grid of sub-figures: one row per 20-point Pawpularity band,
    left column for rows where ``feature`` is 0 and right column where it is 1.
    Each cell shows up to three images sampled with a fixed seed.

    Bands are half-open ``[low, high)`` so that boundary scores fall in exactly
    one row; the last band also includes its upper edge (100).

    Args:
        feature: name of a 0/1 metadata column in the global ``train`` frame.
    """
    colors = ["#ED2938", "#B25F4A", "#77945C", "#3BCA6D", "#00FF7F"]
    figs = plt.figure(constrained_layout=True, figsize=(15, 12))
    subfigs = figs.subfigures(5, 2, hspace=0.07)
    for idx, (left, right) in enumerate(subfigs, 1):
        low, high = (idx-1)*20, idx*20
        include_high = idx == len(colors)
        band_color = colors[idx-1]
        # Only the left cell carries the band label for the row.
        left.supylabel(f'<{idx*20}', fontweight='bold')
        for subfig, value in ((left, 0), (right, 1)):
            _show_band_samples(subfig, feature, value, low, high, include_high, band_color)

    figs.suptitle(f'{feature} 0 & 1 samples', fontweight='bold', fontsize=20)
    figs.supylabel('Pawpularity', fontsize=18)

    plt.show()

### 2-1. Subject Focus

In [None]:
# Score distribution, then sample images, for this metadata column.
feature_name = 'Subject Focus'
meta_feature_dist(feature_name)
meta_feature_samples(feature_name)

### 2-2. Eyes

This feature does not appear to reflect the quality of the data.

In [None]:
# Score distribution, then sample images, for this metadata column.
feature_name = 'Eyes'
meta_feature_dist(feature_name)
meta_feature_samples(feature_name)

### 2-3. Face

In [None]:
# Score distribution, then sample images, for this metadata column.
feature_name = 'Face'
meta_feature_dist(feature_name)
meta_feature_samples(feature_name)

### 2-4. Near

In [None]:
# Score distribution, then sample images, for this metadata column.
feature_name = 'Near'
meta_feature_dist(feature_name)
meta_feature_samples(feature_name)

### 2-5. Action

In [None]:
# Score distribution, then sample images, for this metadata column.
feature_name = 'Action'
meta_feature_dist(feature_name)
meta_feature_samples(feature_name)

### 2-6. Accessory

In [None]:
# Score distribution, then sample images, for this metadata column.
feature_name = 'Accessory'
meta_feature_dist(feature_name)
meta_feature_samples(feature_name)

### 2-7. Group

In [None]:
# Score distribution, then sample images, for this metadata column.
feature_name = 'Group'
meta_feature_dist(feature_name)
meta_feature_samples(feature_name)

### 2-8. Collage

In [None]:
# Score distribution, then sample images, for this metadata column.
feature_name = 'Collage'
meta_feature_dist(feature_name)
meta_feature_samples(feature_name)

### 2-9. Human

In [None]:
# Score distribution, then sample images, for this metadata column.
feature_name = 'Human'
meta_feature_dist(feature_name)
meta_feature_samples(feature_name)

### 2-10. Occlusion

In [None]:
# Score distribution, then sample images, for this metadata column.
feature_name = 'Occlusion'
meta_feature_dist(feature_name)
meta_feature_samples(feature_name)

### 2-11. Info

In [None]:
# Score distribution, then sample images, for this metadata column.
feature_name = 'Info'
meta_feature_dist(feature_name)
meta_feature_samples(feature_name)

### 2-12. Blur

This feature does not appear to reflect the quality of the data.

In [None]:
# Score distribution, then sample images, for this metadata column.
feature_name = 'Blur'
meta_feature_dist(feature_name)
meta_feature_samples(feature_name)

## 3. One-Hundred Club

In [None]:
# Keep only the images that received the maximum score and report how many there are.
has_top_score = train['Pawpularity'].eq(100)
one_hundred = train.loc[has_top_score]
print(one_hundred.shape[0])

In [None]:
# First 12x12 page of images scored 100 (rows 0-143 of `one_hundred`).
fig, axes = plt.subplots(12, 12, figsize=(20, 20))
grid = axes.flatten()
# zip() stops at the shorter input, so a table with fewer than 144 rows leaves
# the remaining panels blank instead of raising IndexError.
for image_id, ax in zip(one_hundred['Id'].iloc[:len(grid)], grid):
    # Close each file as soon as its pixels have been handed to imshow.
    with Image.open(os.path.join(BASE_PATH, 'train', image_id + '.jpg')) as img:
        ax.imshow(img)
for ax in grid:
    ax.axis('off')

fig.subplots_adjust(hspace = .05)
plt.show()

In [None]:
# Second 12x12 page of images scored 100 (rows 144-287 of `one_hundred`).
fig, axes = plt.subplots(12, 12, figsize=(20, 20))
grid = axes.flatten()
page_start = 144
# Slicing past the end of the table yields fewer (or zero) ids rather than an
# IndexError; zip() then leaves the unused panels blank.
page_ids = one_hundred['Id'].iloc[page_start:page_start + len(grid)]
for image_id, ax in zip(page_ids, grid):
    # Close each file as soon as its pixels have been handed to imshow.
    with Image.open(os.path.join(BASE_PATH, 'train', image_id + '.jpg')) as img:
        ax.imshow(img)
for ax in grid:
    ax.axis('off')

fig.subplots_adjust(hspace = .05)
plt.show()

## 4. One-Two Club

In [None]:
# Lowest score present in the data, then a 9x9 grid of images scored below 3.
print(train['Pawpularity'].min())
one_two = train[train['Pawpularity'] < 3]
fig, axes = plt.subplots(9, 9, figsize=(15, 15))
grid = axes.flatten()
# zip() stops at the shorter input, so fewer than 81 matching rows leaves the
# remaining panels blank instead of raising IndexError.
for image_id, ax in zip(one_two['Id'].iloc[:len(grid)], grid):
    # Close each file as soon as its pixels have been handed to imshow.
    with Image.open(os.path.join(BASE_PATH, 'train', image_id + '.jpg')) as img:
        ax.imshow(img)
for ax in grid:
    ax.axis('off')

fig.subplots_adjust(hspace = .05)
plt.show()

## This notebook will be updated continuously for the time being. If you are looking forward to the next update, please upvote.