In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import cv2

# Reading in the training data

In [None]:
# Load the training metadata table and show its (rows, columns) shape.
train_csv_path = '../input/petfinder-pawpularity-score/train.csv'
train_meta = pd.read_csv(train_csv_path)
train_meta.shape

In [None]:
# Preview the first five rows of the metadata table.
train_meta.head(n=5)

# Pawpularity

In [None]:
# Histogram of the Pawpularity column (20 bins). The explicit Axes handle is
# used so the figure carries its own axis labels and title when skimmed.
sns.set_theme()
fig, ax = plt.subplots(figsize=(10, 8))
train_meta['Pawpularity'].hist(bins=20, ax=ax)
ax.set_xlabel('Pawpularity')
ax.set_ylabel('Count')
ax.set_title('Distribution of Pawpularity');

In [None]:
# Summary statistics of the Pawpularity column.
pawpularity = train_meta['Pawpularity']
pawpularity.describe()

# Meta labels

All of the meta labels are unbalanced.

## Distributions

In [None]:
# Meta labels are every column except the image identifier and the
# Pawpularity score. They are selected by name rather than by position
# (`[1:-1]`) so that Pawpularity is never mistaken for a label if this cell
# is re-run after further columns have been appended to `train_meta`.
meta_labels = [col for col in train_meta.columns if col not in ('Id', 'Pawpularity')]

# Three panels per row; the number of rows follows the number of labels so the
# grid cannot overflow. Each row keeps the original 7.5-inch height
# (4 rows -> a 30x30 figure).
n_cols = 3
n_rows = max(1, -(-len(meta_labels) // n_cols))  # ceiling division
fig, axes = plt.subplots(n_rows, n_cols, figsize=(30, 7.5 * n_rows), squeeze=False)

for ax, lab in zip(axes.flat, meta_labels):
    sns.histplot(data=train_meta, x=lab, stat='proportion', ax=ax)
    ax.set_xlabel(lab, fontsize=20)
    ax.set_ylabel("Proportion", fontsize=20)
    ax.tick_params(labelsize=15)

# Hide any panels left over when the label count is not a multiple of n_cols.
for ax in axes.flat[len(meta_labels):]:
    ax.set_visible(False)

## Pawpularity vs meta labels

* Focus - Pet stands out against uncluttered background, not too close / far.
* Eyes - Both eyes are facing front or near-front, with at least 1 eye / pupil decently clear.
* Face - Decently clear face, facing front or near-front.
* Near - Single pet taking up significant portion of photo (roughly over 50% of photo width or height).
* Action - Pet in the middle of an action (e.g., jumping).
* Accessory - Accompanying physical or digital accessory / prop (i.e. toy, digital sticker), excluding collar and leash.
* Group - More than 1 pet in the photo.
* Collage - Digitally-retouched photo (i.e. with digital photo frame, combination of multiple photos).
* Human - Human in the photo.
* Occlusion - Specific undesirable objects blocking part of the pet (i.e. human, cage or fence). Note that not all blocking objects are considered occlusion.
* Info - Custom-added text or labels (i.e. pet name, description).
* Blur - Noticeably out of focus or noisy, especially for the pet’s eyes and face. For Blur entries, “Eyes” column is always set to 0.

In [None]:
# One box plot of Pawpularity per meta label, laid out on a 4x3 grid
# (row-major: label k goes to row k // 3, column k % 3).
fig, axes = plt.subplots(nrows=4, ncols=3, figsize=(50, 50))
for idx, lab in enumerate(meta_labels):
    r, c = idx // 3, idx % 3
    panel = sns.boxplot(data=train_meta, x=lab, y="Pawpularity", ax=axes[r, c])
    panel.set_xlabel(lab, fontsize=28)
    panel.set_ylabel("Pawpularity", fontsize=28)
    panel.tick_params(labelsize=20)

**Takeaways**:
* the distribution of Pawpularity is very similar in the two classes for the majority of the binary meta labels
* Pawpularity of Blur==1 images seems to be lower than that of Blur==0 images

## Correlation matrix

In [None]:
# Correlation matrix (pandas' default `.corr()` method) of every column
# except the image identifier, drawn as an annotated heatmap.
# `sns.set_theme` is the preferred spelling of the deprecated `sns.set` alias;
# it is called before the Axes is created so the larger font scale applies.
sns.set_theme(font_scale=1.3)
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(train_meta.drop('Id', axis=1).corr(), cmap='viridis', annot=True, ax=ax)
ax.set_title('Correlation matrix');

**Takeaways**:
* all the meta labels have a low correlation to Pawpularity
* there are high correlations in the matrix that make sense
    * `Occlusion` and `Human` have a relatively high correlation, which means that in many of the images, a human is blocking part of the pet
    * `Collage` and `Info` have a relatively high correlation, which means that collage pictures often come with user-added text
    * `Face` and `Eyes`: a decently clear face often comes with decently clear eyes in the picture

## New features

In [None]:
# Engineered combinations of the binary meta labels.
#   * "and" features are products, so they are 1 only when every factor is 1.
#   * "or" features are plain sums, i.e. a count of how many of the terms are
#     set (values can exceed 1) rather than a strict boolean OR.
# Shared sub-expressions are computed once and reused.
collage_and_info = train_meta['Collage'] * train_meta['Info']
not_blur = 1 - train_meta['Blur']
not_blur_or_group_or_accessory = not_blur + train_meta['Group'] + train_meta['Accessory']

train_meta['collage_and_info'] = collage_and_info
train_meta['collage_or_info'] = train_meta['Collage'] + train_meta['Info']
train_meta['occlusion_and_human'] = train_meta['Occlusion'] * train_meta['Human']
train_meta['face_and_eyes'] = train_meta['Face'] * train_meta['Eyes']
train_meta['not_blur_and_eyes'] = not_blur * train_meta['Eyes']
train_meta['not_blur_or_group_or_accessory'] = not_blur_or_group_or_accessory
train_meta['not_collage_and_info_or_not_blur_or_group_or_accessory'] = (
    (1 - collage_and_info) + not_blur_or_group_or_accessory
)
new_feats = ['collage_and_info','collage_or_info','occlusion_and_human','face_and_eyes','not_blur_and_eyes',
             'not_blur_or_group_or_accessory','not_collage_and_info_or_not_blur_or_group_or_accessory']

# Correlation of the engineered features with each other and with Pawpularity.
# `sns.set_theme` is the preferred spelling of the deprecated `sns.set` alias.
sns.set_theme(font_scale=1.3)
fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(train_meta[new_feats+['Pawpularity']].corr(), cmap='viridis', annot=True, ax=ax)
ax.set_title('Correlation matrix');

# Images

In [None]:
def plot_images(df: pd.DataFrame, label: str, max_images: int = 50, n_cols: int = 5,
                img_dir: str = "../input/petfinder-pawpularity-score/train") -> None:
    """Display the first images referenced by ``df`` in a grid.

    Each panel shows ``<img_dir>/<Id>.jpg`` and is titled ``"<label>: <value>"``
    using the row's value in the ``label`` column.

    Parameters
    ----------
    df : pd.DataFrame
        Rows to display; must contain an ``Id`` column and the ``label`` column.
    label : str
        Column whose value is shown in each panel title.
    max_images : int, default 50
        Upper bound on the number of rows drawn (taken from the top of ``df``).
    n_cols : int, default 5
        Number of panels per grid row.
    img_dir : str
        Directory containing the ``<Id>.jpg`` files.

    Raises
    ------
    ValueError
        If ``n_cols`` is smaller than 1.
    KeyError
        If ``df`` lacks the ``Id`` or ``label`` column.
    FileNotFoundError
        If an image file is missing or cannot be decoded.
    """
    if n_cols < 1:
        raise ValueError(f"n_cols must be >= 1, got {n_cols}")

    # Fail fast on a bad column name instead of after the first image is loaded.
    missing = [col for col in ("Id", label) if col not in df.columns]
    if missing:
        raise KeyError(f"plot_images: missing column(s) {missing}")

    n_images = min(max_images, df.shape[0])
    if n_images <= 0:
        # Nothing to draw; avoid opening a large blank figure.
        print("plot_images: no rows to display")
        return

    # Size the grid to the images actually shown (5 inches of height per row,
    # so the default 50 images still produce the original 20x50 figure).
    n_rows = -(-n_images // n_cols)  # ceiling division
    plt.figure(figsize=(20, 5 * n_rows))

    for i in range(n_images):
        row = df.iloc[i]
        img_path = f"{img_dir}/{row['Id']}.jpg"
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread signals failure by returning None rather than raising.
            plt.close()
            raise FileNotFoundError(f"Could not read image (missing or unreadable): {img_path}")
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR; matplotlib expects RGB
        plt.subplot(n_rows, n_cols, i+1)
        plt.title(f"{label}: {row[label]}")
        plt.imshow(img)
        plt.grid(False)
        plt.axis('off')
    plt.tight_layout()
    plt.show()
    plt.close()

## Pawpularity < 5

In [None]:
# Images whose Pawpularity is below 5.
low_score = train_meta['Pawpularity'].lt(5)
df = train_meta.loc[low_score, :]
plot_images(df, "Pawpularity")

## Pawpularity > 95

In [None]:
# Images whose Pawpularity is above 95.
high_score = train_meta['Pawpularity'].gt(95)
df = train_meta.loc[high_score, :]
plot_images(df, "Pawpularity")

## Accessory == 1

In [None]:
# Images flagged Accessory == 1, titled with their Pawpularity.
has_accessory = train_meta['Accessory'].eq(1)
df = train_meta.loc[has_accessory, :]
plot_images(df, "Pawpularity")

## Face == 0

In [None]:
# Images flagged Face == 0, titled with their Pawpularity.
no_face = train_meta['Face'].eq(0)
df = train_meta.loc[no_face, :]
plot_images(df, "Pawpularity")

## Collage == 1

In [None]:
# Images flagged Collage == 1, titled with their Pawpularity.
is_collage = train_meta['Collage'].eq(1)
df = train_meta.loc[is_collage, :]
plot_images(df, "Pawpularity")