In [None]:
import os
import time
import numpy as np
import pandas as pd
from easydict import EasyDict as edict

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# image
import PIL
from PIL import Image
import torchvision

The goal of this competition is to predict the **Pawpularity Score**, which indicates the popularity of an animal.
Given a **picture** of an animal and the **metadata of the picture**, our model has to predict a Pawpularity score between 0 and 100.

Each image contains one or more animals, and the metadata contains binary features of each image *(such as **human in the picture (1) or not (0)** or **focus is clear (1) or not (0)**)*

In [None]:
# Load the competition training table; the cells below rely on its
# 'Id' and 'Pawpularity' columns plus the metadata columns in between.
train = pd.read_csv(filepath_or_buffer='../input/petfinder-pawpularity-score/train.csv')

# 1. EDA

### target distribution

First, I will check the distribution of the target variable (Pawpularity Score).

In [None]:
# Histogram of the target column, to inspect the shape of its distribution.
sns.histplot(data=train, x='Pawpularity')

As we can see, the distribution is a little right-skewed but close to a normal distribution.

### relation to metadata

Let's check out the relationship between the target and each metadata variable.

In [None]:
# One boxplot of Pawpularity per metadata feature.
# Select the feature columns by name rather than by position: a later cell
# appends a 'score_bin' column to `train`, and a positional slice would then
# treat 'Pawpularity' as a feature and overflow the subplot grid on re-run.
non_feature_cols = ('Id', 'Pawpularity', 'score_bin')
meta_feats = [col for col in train.columns if col not in non_feature_cols]

n_cols = 4
n_rows = -(-len(meta_feats) // n_cols)  # ceiling division: enough rows for every feature

fig = plt.figure(figsize = (15,10))
for i, feat in enumerate(meta_feats):
    ax = fig.add_subplot(n_rows, n_cols, i+1)
    sns.boxplot(data = train, x = feat, y = 'Pawpularity', ax = ax)
    ax.set_title(feat)
    # The title already names the feature; drop the axis labels to reduce clutter.
    ax.set_xlabel('')
    ax.set_ylabel('')

As we can see, no metadata feature has a drastic effect on the Pawpularity score. Using the raw binary metadata as-is seems likely to result in poor performance.

Therefore, I tried to treat the metadata as **text data** and apply the SOTA multimodal encoder **CLIP** to transform the text into a feature vector.

### Pawpularity score binning

Let's visually check a random set of images from each Pawpularity score bin.

In [None]:
# Split the 0-100 score range into nine equal-width bins labelled 1..9.
bins = np.linspace(0,100,10)        # 10 edges -> 9 intervals
labels = list(range(1, len(bins)))  # one label per interval: [1, ..., 9]
# include_lowest=True closes the first interval on the left, so a score equal
# to the lowest edge (0) falls into bin 1 instead of becoming NaN.
train['score_bin'] = pd.cut(train['Pawpularity'], bins, labels = labels, include_lowest = True)

In [None]:
# Show up to `n_per_bin` randomly chosen images for every score bin,
# one grid row per bin, with the bin label on the first image of each row.
n_per_bin = 5
fig = plt.figure(figsize = (20, 30))
for row, label in enumerate(labels):
    bin_rows = train[train.score_bin == label]
    # Sample without replacement so the same picture is never shown twice in a
    # row, and cap the sample size so sparse or empty bins do not raise.
    idx = np.random.choice(bin_rows.index, size = min(n_per_bin, len(bin_rows)), replace = False)
    imgs = train.loc[idx, 'Id']
    for col, img in enumerate(imgs):
        img_path = os.path.join('../input/petfinder-pawpularity-score/train', img)+'.jpg'
        # Position is computed from (row, col) so that a short row cannot shift
        # the images of the following bins.
        ax = fig.add_subplot(len(labels), n_per_bin, row*n_per_bin + col + 1)
        # Context manager releases the file handle once the pixels are drawn.
        with Image.open(img_path) as image:
            ax.imshow(image)
        ax.set_xticks([])
        ax.set_yticks([])
        if col == 0:
            ax.set_ylabel('score group{}'.format(label))

Do you have any insights about the characteristics of pictures that have a high Pawpularity score? Unfortunately, I don't. Therefore, we need a deep-learning-based model that can extract complicated features from the images.

### Resolution of images according to Pawpularity score

I came up with the intuitive idea that "a picture with higher resolution might have a higher Pawpularity score". So I checked this by visualizing the relationship between resolution and the corresponding score bin.

In [None]:
# One empty list per score-bin label (1..9); each will collect pixel counts.
resolution_bin = {bin_label: [] for bin_label in range(1, 10)}
# Image ids as a plain Python list, in the same order as the rows of `train`.
img_files = train['Id'].tolist()

def resolution(img_tensor):
    """Return the product of the sizes of axes 1 and 2 of ``img_tensor``.

    Only ``img_tensor.shape`` is read. For the tensors this notebook builds
    with ``ToTensor`` this is presumably height * width, i.e. the pixel count
    (assumes a channel-first layout -- TODO confirm).
    """
    dims = img_tensor.shape
    return dims[1] * dims[2]

# Collect the pixel count (width * height) of every training image, grouped by
# the image's score bin.
# - `img_files` was built from train['Id'], so zipping it with
#   train['score_bin'] pairs each image with its own bin directly; this avoids
#   a boolean-mask scan of the whole frame for every image.
# - Image.open only parses the header, and `.size` is (width, height), so the
#   pixel count is read without decoding the image into a tensor.
# - The context manager closes each file handle as soon as its size is read.
start = time.time()
for i, (img_file, score_bin) in enumerate(zip(img_files, train['score_bin'])):
    img_path = os.path.join('../input/petfinder-pawpularity-score/train', img_file)+'.jpg'
    with Image.open(img_path) as img:
        width, height = img.size
    resol = width*height
    resolution_bin[score_bin].append(resol)
    if i%1000 == 0:
        print('{} images processed'.format(i))
print('{} minutes elapsed'.format((time.time()-start)/60))

In [None]:
# Flatten {bin label: [pixel counts]} into two parallel lists (one entry per
# image), then build a long-format frame suitable for plotting.
score_bin = [bin_label for bin_label, values in resolution_bin.items() for _ in values]
resols = [value for values in resolution_bin.values() for value in values]
df = pd.DataFrame({'score_bin':score_bin, 'resolution':resols})

In [None]:
# Distribution of per-image pixel counts within each Pawpularity score bin.
fig, ax = plt.subplots(figsize = (15, 10))
sns.boxplot(x = 'score_bin', y = 'resolution', data = df, ax = ax)