In [None]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import cv2
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

In [None]:
# Kaggle input locations: the folder of training images and the metadata CSV.
train_folder = "../input/petfinder-pawpularity-score/train"
train_df = pd.read_csv('../input/petfinder-pawpularity-score/train.csv')

# (rows, columns) of the metadata table.
print(train_df.shape)

In [None]:
def get_pawpularity_bins(p):
    """Return the label of the 20-point bin that contains score ``p``.

    Upper bounds are inclusive: 20 maps to '0-20', 40 to '20-40', and so on.
    Anything that is not ``<=`` 80 falls through to '>80'.
    """
    labelled_upper_bounds = (
        (20, '0-20'),
        (40, '20-40'),
        (60, '40-60'),
        (80, '60-80'),
    )
    # Bounds are checked in ascending order, so the first one that holds
    # identifies the bin.
    for upper_bound, label in labelled_upper_bounds:
        if p <= upper_bound:
            return label
    return '>80'

In [None]:
# Label every row with its Pawpularity bin, then preview the table.
train_df['bin'] = train_df['Pawpularity'].apply(get_pawpularity_bins)
train_df.head()

In [None]:
# Summary statistics of the target column.
train_df['Pawpularity'].describe()

In [None]:
# Three views of the Pawpularity target: sorted scores, histogram and box plot.
fig, ax = plt.subplots(1, 3, figsize=(15, 5))

sns.lineplot(x=range(len(train_df)), y=train_df.Pawpularity.sort_values().values, ax=ax[0])
sns.histplot(train_df.Pawpularity, ax=ax[1])
sns.boxplot(data=train_df, x='Pawpularity', ax=ax[2])

# Title each panel so the figure can be read on its own.
ax[0].set_title("Sorted Pawpularity")
ax[0].set_xlabel("Image rank")
ax[0].set_ylabel("Pawpularity")
ax[1].set_title("Pawpularity histogram")
ax[2].set_title("Pawpularity box plot")

# No plt.legend() here: none of the three plots carries a labelled artist,
# so there is nothing for a legend to show.
fig.tight_layout()
plt.show()

In [None]:
# Shape statistics of the target distribution.
pawpularity = train_df['Pawpularity']
skew = stats.skew(pawpularity)
kurtosis = stats.kurtosis(pawpularity)
avg_pawpularity = pawpularity.mean()

# Print each statistic with its own fixed precision.
for template, value in (
    ("Pawpularity Skewness:{:.5f}", skew),
    ("Pawpularity Kurtosis:{:.5f}", kurtosis),
    ("Mean Pawpularity:{:.4f}", avg_pawpularity),
):
    print(template.format(value))

1. The 1st graph shows roughly exponential growth in Pawpularity over the 20-50 % range.
2. The 2nd graph shows that around 300 images have a Pawpularity of 100%.
3. The Pawpularity distribution looks roughly Gaussian, with some skewness.

In [None]:
# Number of images in each Pawpularity bin.
train_df['bin'].value_counts()

In [None]:
# Number of images per Pawpularity bin.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Bin Count")

sns.countplot(data=train_df, x='bin', order=['0-20', '20-40', '40-60', '60-80', '>80'])
# No legend: without a hue there are no labelled artists to list.
plt.show()

The bin counts are highly skewed to the right.

In [None]:
def read_image(filename):
    """Load a training image by id and return it in RGB channel order.

    Args:
        filename: image id without extension; ".jpg" is appended and the
            file is looked up inside ``train_folder``.

    Returns:
        The array produced by ``cv2.imread`` after BGR -> RGB conversion.

    Raises:
        FileNotFoundError: if OpenCV could not read the file.
    """
    filepath = os.path.join(train_folder, filename + ".jpg")
    img = cv2.imread(filepath)
    # cv2.imread reports failure by returning None instead of raising, so
    # check here and name the offending path rather than letting cvtColor
    # fail with an opaque assertion error.
    if img is None:
        raise FileNotFoundError("Image is missing or unreadable: " + filepath)
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

In [None]:
def get_images(field_name, title1, title2, rand_state=10):
    """Show a 2x5 grid of sample images for a 0/1 metadata column.

    The top row shows images where ``train_df[field_name] == 0`` and the
    bottom row images where it equals 1. Each row is a random sample of up
    to five rows of ``train_df``.

    Args:
        field_name: column of ``train_df`` to split on; it is also printed
            and used as the figure title.
        title1: caption for the top row (column value 0).
        title2: caption for the bottom row (column value 1).
        rand_state: ``random_state`` passed to ``DataFrame.sample``.
    """
    print(field_name)
    n_cols = 5

    # Axes are deliberately not shared: the images differ in size, and
    # shared limits would impose the extent of the last image drawn on
    # every panel.
    fig, ax = plt.subplots(2, n_cols, figsize=(17, 8))

    for row, (flag, row_title) in enumerate(((0, title1), (1, title2))):
        subset = train_df[train_df[field_name] == flag]
        # Sample no more rows than exist, so a rare flag value does not raise.
        ids = subset.sample(n=min(n_cols, len(subset)), random_state=rand_state).Id.values
        for col, imgname in enumerate(ids):
            ax[row, col].imshow(read_image(imgname))
        # Left-align the caption so long text runs across the row instead
        # of being centred over the first image only.
        ax[row, 0].set_title(row_title, loc='left')
        # Pixel ticks carry no information here; this also blanks unused panels.
        for col in range(n_cols):
            ax[row, col].axis('off')

    # Figure-level title; plt.title would only label the last subplot.
    fig.suptitle(field_name)
    plt.show()

# Subject Focus

In [None]:
# Images per Pawpularity bin, split by the 'Subject Focus' flag.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Subject Focus in Popularity Bins")
sns.countplot(data=train_df,
              x='bin',
              hue='Subject Focus',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# Rebuilding the legend drops seaborn's hue title, so set it explicitly.
plt.legend(title='Subject Focus', loc='best')
plt.show()

In [None]:
# Sample images without (top row) and with (bottom row) the Subject Focus flag.
get_images(
    field_name="Subject Focus",
    title1="Cluttered Background or Toofar or Too Close Images",
    title2="focused Images",
    rand_state=3,
)

1. Image (0, 2) is not cluttered, but the subject is a bit far away.

# Eyes

In [None]:
# Images per Pawpularity bin, split by the 'Eyes' flag.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Eyes in Popularity Bins")
sns.countplot(data=train_df,
              x='bin',
              hue='Eyes',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# Rebuilding the legend drops seaborn's hue title, so set it explicitly.
plt.legend(title='Eyes', loc='best')
plt.show()

In [None]:
# Sample images without (top row) and with (bottom row) the Eyes flag.
get_images(
    field_name="Eyes",
    title1="Low Quality of Eyes",
    title2="Both Eyes facing front and atleast one eye is clear",
    rand_state=3,
)

# Clear Face facing front

In [None]:
# Images per Pawpularity bin, split by the 'Face' flag.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Face in Popularity Bins")
sns.countplot(data=train_df,
              x='bin',
              hue='Face',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# Rebuilding the legend drops seaborn's hue title, so set it explicitly.
plt.legend(title='Face', loc='best')
plt.show()

In [None]:
# Sample images without (top row) and with (bottom row) the Face flag.
get_images(
    field_name="Face",
    title1="Face is not clear or near-front",
    title2="face is clear and near front",
    rand_state=3,
)

# Near

In [None]:
# Images per Pawpularity bin, split by the 'Near' flag.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Near in Popularity Bins")
sns.countplot(data=train_df,
              x='bin',
              hue='Near',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# Rebuilding the legend drops seaborn's hue title, so set it explicitly.
plt.legend(title='Near', loc='best')
plt.show()

In [None]:
# Sample images without (top row) and with (bottom row) the Near flag.
get_images(
    field_name="Near",
    title1="Not Near",
    title2="Single Pet taking Significant portion around 50%",
    rand_state=6,
)

# Action

In [None]:
# Images per Pawpularity bin, split by the 'Action' flag.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Action in Popularity Bins")
sns.countplot(data=train_df,
              x='bin',
              hue='Action',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# Rebuilding the legend drops seaborn's hue title, so set it explicitly.
plt.legend(title='Action', loc='best')
plt.show()

In [None]:
# Sample images without (top row) and with (bottom row) the Action flag.
get_images(
    field_name="Action",
    title1="No Action",
    title2="Pet in the middle of an action",
    rand_state=8,
)

# Accessory

In [None]:
# Images per Pawpularity bin, split by the 'Accessory' flag.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Accessory in Popularity Bins")
sns.countplot(data=train_df,
              x='bin',
              hue='Accessory',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# Rebuilding the legend drops seaborn's hue title, so set it explicitly.
plt.legend(title='Accessory', loc='best')
plt.show()

In [None]:
# Sample images without (top row) and with (bottom row) the Accessory flag.
get_images(
    field_name="Accessory",
    title1="No Accessory",
    title2="Accompanies Accessory",
    rand_state=10,
)

# Group

In [None]:
# Images per Pawpularity bin, split by the 'Group' flag.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Group in Popularity Bins")
sns.countplot(data=train_df,
              x='bin',
              hue='Group',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# Rebuilding the legend drops seaborn's hue title, so set it explicitly.
plt.legend(title='Group', loc='best')
plt.show()

In [None]:
# Sample images without (top row) and with (bottom row) the Group flag.
get_images(
    field_name="Group",
    title1="No >1 pet",
    title2=">1 pet- in a group",
    rand_state=12,
)

1. In the 2nd row (>1 pet), a human in the background is also being treated as a pet, so the image is labelled as a group.

# Collage

In [None]:
# Images per Pawpularity bin, split by the 'Collage' flag.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Collage in Popularity Bins")
sns.countplot(data=train_df,
              x='bin',
              hue='Collage',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# Rebuilding the legend drops seaborn's hue title, so set it explicitly.
plt.legend(title='Collage', loc='best')
plt.show()

In [None]:
# Sample images without (top row) and with (bottom row) the Collage flag.
get_images(
    field_name="Collage",
    title1="No Collage",
    title2="Collage - digitally retouched photo",
    rand_state=14,
)

# Human

In [None]:
# Images per Pawpularity bin, split by the 'Human' flag.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Human in Popularity Bins")
sns.countplot(data=train_df,
              x='bin',
              hue='Human',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# Rebuilding the legend drops seaborn's hue title, so set it explicitly.
plt.legend(title='Human', loc='best')
plt.show()

In [None]:
# Sample images without (top row) and with (bottom row) the Human flag.
get_images(
    field_name="Human",
    title1="No Human",
    title2="Human in the photo",
    rand_state=15,
)

# Occlusion

In [None]:
# Images per Pawpularity bin, restricted to rows with Occlusion == 1.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Occlusion =1 Bins")
sns.countplot(data=train_df[train_df.Occlusion == 1].copy(),
              x='bin',
              hue='Occlusion',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# Rebuilding the legend drops seaborn's hue title, so set it explicitly.
plt.legend(title='Occlusion', loc='best')
plt.show()

In [None]:
# Images per Pawpularity bin, split by the 'Occlusion' flag.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Occlusion in Popularity Bins")
sns.countplot(data=train_df,
              x='bin',
              hue='Occlusion',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# Rebuilding the legend drops seaborn's hue title, so set it explicitly.
plt.legend(title='Occlusion', loc='best')
plt.show()

In [None]:
# Sample images without (top row) and with (bottom row) the Occlusion flag.
get_images(
    field_name="Occlusion",
    title1="No Occlusion",
    title2="Occlusion in the photo",
    rand_state=16,
)

# Info

In [None]:
# Images per Pawpularity bin, restricted to rows with Info == 1.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Info Counts per Bins")
sns.countplot(data=train_df[train_df.Info == 1],
              x='bin',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# No legend: without a hue there are no labelled artists to list.
plt.show()

In [None]:
# Images per Pawpularity bin, split by the 'Info' flag.
# The last label must be '>80', which is what get_pawpularity_bins() returns;
# a label that never occurs in the data is drawn as an empty category.
plt.title("Info in Popularity Bins")
sns.countplot(data=train_df,
              x='bin',
              hue='Info',
              order=['0-20', '20-40', '40-60', '60-80', '>80'])
# Rebuilding the legend drops seaborn's hue title, so set it explicitly.
plt.legend(title='Info', loc='best')
plt.show()

In [None]:
# Sample images without (top row) and with (bottom row) the Info flag.
get_images(
    field_name="Info",
    title1="No Info on the pet photo",
    title2="Pet description",
    rand_state=18,
)

In [None]:
# Show the first three images of every Pawpularity bin, one bin per row.
bin_labels = train_df.bin.sort_values().unique()
n_cols = 3

# The row count follows the bins actually present instead of assuming five.
# squeeze=False keeps `ax` two-dimensional even if only one bin exists.
# Axes are not shared: the images differ in size, and shared limits would
# impose the extent of the last image drawn on every panel.
_, ax = plt.subplots(len(bin_labels), n_cols, figsize=(21, 10), squeeze=False)
for i, b in enumerate(bin_labels):
    imgs = train_df[train_df.bin == b].Id.values[:n_cols]
    ax[i, 0].set_title(b, loc='left')
    for j, img_id in enumerate(imgs):
        ax[i, j].imshow(read_image(img_id))
    # Pixel ticks carry no information here; this also blanks unused panels.
    for j in range(n_cols):
        ax[i, j].axis('off')
plt.show()