In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from PIL import Image # Show jpg images
import random #random numbers
import os # directories and files
import matplotlib.pyplot as plt # plot data
import seaborn as sns # plot data

In [None]:
# returns random image
def random_image(p, n, list_p = None):
    """Lazily yield ``n`` randomly chosen images from directory ``p``.

    Parameters
    ----------
    p : str
        Directory that holds the image files.
    n : int
        Number of distinct images to draw (sampling without replacement).
    list_p : sequence of str, optional
        Image ids, i.e. file names without the ``.jpg`` extension, to sample
        from. When omitted, every entry of ``os.listdir(p)`` is a candidate.

    Yields
    ------
    tuple
        ``(image_id, image)`` where ``image`` is the result of ``Image.open``.
        Without ``list_p`` the id is the file name up to its first ``"."``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when ``n`` is larger than the number of
        candidates. Because this is a generator, the error surfaces on the
        first ``next()`` call, not when the generator is created.
    """
    # Identity check: ``== None`` is evaluated elementwise for array-likes
    # (NumPy arrays, pandas Series) and makes the ``if`` itself raise.
    if list_p is None:
        # List the directory once. os.listdir gives no ordering guarantee, so
        # repeated calls could map the same index to different files; sorting
        # also makes the draw reproducible under a fixed random seed.
        file_names = sorted(os.listdir(p))
        for idx in random.sample(range(len(file_names)), n):
            file_name = file_names[idx]
            yield file_name.split(".")[0], Image.open(os.path.join(p, file_name))
    else:
        # Materialise the ids so positional indexing works for any iterable.
        image_ids = list(list_p)
        for idx in random.sample(range(len(image_ids)), n):
            image_id = image_ids[idx]
            yield image_id, Image.open(os.path.join(p, image_id + ".jpg"))
            
# shows image           
def show_sample(s, im):
    """Draw ``im`` on a new 7x7 inch figure with the axes hidden.

    Parameters
    ----------
    s : object
        Accepted alongside the image but not used by the function body.
    im : image
        Image data handed to ``Axes.imshow``.
    """
    figure, axes = plt.subplots(figsize=(7, 7))
    axes.set_axis_off()
    axes.imshow(im)

# Load Data

In [None]:
# The competition files are expected in ../input/petfinder-pawpularity-score,
# resolved relative to the current working directory.
input_root = os.path.join(os.getcwd(), os.pardir, 'input')
data_dir = os.path.join(input_root, 'petfinder-pawpularity-score')

# Tabular training data; it provides at least the Id and Pawpularity columns used below.
train_csv = os.path.join(data_dir, 'train.csv')
meta_data = pd.read_csv(train_csv)

# Explore Pawpularity

In [None]:
# Apply seaborn's default theme, then plot the distribution of the target score.
sns.set_theme()
sns.histplot(meta_data["Pawpularity"]);

### Bottom Scorers

In [None]:
# Bottom scorers: ids of all images with a pawpularity score below 20,
# and a generator that lazily yields 25 random images from that pool.
low_score_mask = meta_data.Pawpularity < 20
bottomlist = list(meta_data.loc[low_score_mask, 'Id'])
train_dir = os.path.join(data_dir, 'train')
mybottomimage = random_image(train_dir, 25, bottomlist)

In [None]:
# Take the next sampled bottom scorer, plot it and display its metadata row.
# Re-run this cell to step through the sample.
s, im = next(mybottomimage)
show_sample(s=s, im=im)
pd.DataFrame(meta_data.loc[meta_data.Id == s])

### Top Scorers

In [None]:
# Top scorers: ids of all images with a pawpularity score of exactly 100,
# and a generator that lazily yields 25 random images from that pool.
perfect_score_mask = meta_data.Pawpularity == 100
toplist = list(meta_data.loc[perfect_score_mask, 'Id'])
train_dir = os.path.join(data_dir, 'train')
mytopimage = random_image(train_dir, 25, toplist)

In [None]:
# Take the next sampled top scorer, plot it and display its metadata row.
# Re-run this cell to step through the sample.
s, im = next(mytopimage)
show_sample(s=s, im=im)
pd.DataFrame(meta_data.loc[meta_data.Id == s])

In [None]:
# High scorers: ids of all images with a pawpularity score strictly between 80 and 100,
# and a generator that lazily yields 25 random images from that pool.
below_100 = meta_data.Pawpularity < 100
above_80 = meta_data.Pawpularity > 80
toplist2 = list(meta_data.loc[below_100 & above_80, 'Id'])
train_dir = os.path.join(data_dir, 'train')
mytopimage2 = random_image(train_dir, 25, toplist2)

In [None]:
# Take the next sampled high scorer (80 < score < 100), plot it and display its metadata row.
# Re-run this cell to step through the sample.
s, im = next(mytopimage2)
show_sample(s=s, im=im)
pd.DataFrame(meta_data.loc[meta_data.Id == s])

### Medium Scorers

In [None]:
# Medium scorers: ids of all images with a pawpularity score strictly between 20 and 40,
# and a generator that lazily yields 25 random images from that pool.
below_40 = meta_data.Pawpularity < 40
above_20 = meta_data.Pawpularity > 20
mediumlist = list(meta_data.loc[below_40 & above_20, 'Id'])
train_dir = os.path.join(data_dir, 'train')
mymediumimage = random_image(train_dir, 25, mediumlist)

In [None]:
# Take the next sampled medium scorer, plot it and display its metadata row.
# Re-run this cell to step through the sample.
s, im = next(mymediumimage)
show_sample(s=s, im=im)
pd.DataFrame(meta_data.loc[meta_data.Id == s])

# Correlation within Meta Data

In [None]:
# Pairwise correlation of the numeric/boolean metadata columns.
# The string "Id" column is excluded explicitly: pandas >= 2.0 no longer drops
# non-numeric columns silently in DataFrame.corr() and raises instead.
corr = meta_data.select_dtypes(include=["number", "bool"]).corr()

In [None]:
sns.set_style("ticks")

# Boolean mask that hides the upper triangle (diagonal included),
# so each pair of columns is drawn only once
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes that will hold the heatmap
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap so that positive and negative correlations get different hues
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Masked heatmap, colours centred on zero, square cells, on the axes created above
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws=dict(shrink=0.5),
    ax=ax,
);

### PCA with 6 components

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Feature matrix for the PCA: every metadata column except the image id and the target score.
feature_frame = meta_data.drop(columns=["Id", "Pawpularity"])
X = feature_frame.to_numpy()

In [None]:
# Fit a 6-component PCA on the feature matrix (fixed random_state) and
# project the same rows onto those components.
pca = PCA(n_components=6, random_state=0)
component_scores = pca.fit(X).transform(X)

# Wrap the scores in a DataFrame; columns get the default integer labels 0..5.
X_transformed = pd.DataFrame(component_scores)

In [None]:
# Attach the principal-component scores to the metadata, aligned on the row index.
# Component columns left over from an earlier run of this cell are dropped first,
# so re-running it does not pile up suffixed duplicates (0_x, 0_y, ...).
meta_data = (
    meta_data
    .drop(columns=list(X_transformed.columns), errors="ignore")
    .merge(X_transformed, left_index=True, right_index=True)
)

# Correlate numeric/boolean columns only: the string "Id" column cannot be
# correlated and makes DataFrame.corr() raise on pandas >= 2.0.
corr = meta_data.select_dtypes(include=["number", "bool"]).corr()

In [None]:
# Boolean mask that hides the upper triangle (diagonal included),
# so each pair of columns is drawn only once
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Figure and axes that will hold the heatmap
f, ax = plt.subplots(figsize=(11, 9))

# Diverging colormap so that positive and negative correlations get different hues
cmap = sns.diverging_palette(230, 20, as_cmap=True)

# Masked heatmap, colours centred on zero, square cells, on the axes created above
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws=dict(shrink=0.5),
    ax=ax,
)