In [None]:
%%capture
# Added this to avoid billion output lines
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os 
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import cv2
import imageio

In [None]:
# Read the competition's train/test metadata tables for exploration.
res_path = "../input/petfinder-pawpularity-score"
train_csv = os.path.join(res_path, 'train.csv')
test_csv = os.path.join(res_path, 'test.csv')
train_df = pd.read_csv(train_csv)
test_df = pd.read_csv(test_csv)

In [None]:
# Compare the dimensions of both frames; some columns may have been omitted from test.
train, test = train_df.shape, test_df.shape
print(train, test)

In [None]:
# Column labels of the training frame
train_df.columns

In [None]:
# Column labels of the test frame, to compare against the training columns
test_df.columns

In [None]:
# Notice that Pawpularity is not in the test data, so this should not really be used to affect the conclusion all that much

In [None]:
# Now let's check the features: preview the first 10 training rows
train_df.head(10)

In [None]:
# Distribution of the Pawpularity scores, with the mean and median marked.
sns.set(rc={'figure.figsize': (15, 5)})
fig = plt.figure()
pawpularity = train_df['Pawpularity']
sns.histplot(data=train_df, x='Pawpularity', bins=100)
for position, colour, caption in (
    (pawpularity.mean(), 'red', 'Mean Pawpularity'),
    (pawpularity.median(), 'blue', 'Median Pawpularity'),
):
    plt.axvline(position, c=colour, ls='-', lw=3, label=caption)
plt.title('Distribution of Pawpularity Scores', fontsize=20, fontweight='bold')
plt.legend()
plt.show()

In [None]:
# Mean Pawpularity over the training set
train_df['Pawpularity'].mean()

In [None]:
# Median Pawpularity over the training set
train_df['Pawpularity'].median()

In [None]:
# So for a classifier, we can suspect that a Pawpularity score above the mean/median should count as "success"
# Most probably the median?

In [None]:
#later addition, check test distribution quickly
# WON`T WORK, NO PAWPULARITY ON TEST



# sns.set(rc={'figure.figsize':(15,5)})
# fig = plt.figure()
# sns.histplot(data=test_df, x='Pawpularity', bins=100)
# plt.axvline(test_df['Pawpularity'].mean(), c='red', ls='-', lw=3, label='Mean Pawpularity')
# plt.axvline(test_df['Pawpularity'].median(),c='blue',ls='-',lw=3, label='Median Pawpularity')
# plt.title('Distribution of Pawpularity Scores', fontsize=20, fontweight='bold')
# plt.legend()
# plt.show()

In [None]:
# First of all, there is a strange distribution from 0 to approx. 30
# Second anomaly - almost 300 perfect scores

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) of the Pawpularity column
train_df[['Pawpularity']].describe()
# average animal is about 40 pawpularity

In [None]:
# Will work very fast image analysis now
# Add file paths to train df, for later use
def get_image_file_path(image_id):
    """Build the path of the training JPEG that belongs to ``image_id``.

    Args:
        image_id: Value inserted as the file stem (formatted with ``str.format``).

    Returns:
        The string ``/kaggle/input/petfinder-pawpularity-score/train/<image_id>.jpg``.
    """
    train_dir = '/kaggle/input/petfinder-pawpularity-score/train'
    return '{}/{}.jpg'.format(train_dir, image_id)

# Derive each row's image path from its 'Id' and store it in a new 'file_path' column.
train_df['file_path'] = train_df['Id'].map(get_image_file_path)

In [None]:
# Preview the frame, now including the 'file_path' column
train_df.head()

In [None]:
def show_batch_df(df, rows=8, cols=4):
    """Show the first ``rows * cols`` images of ``df`` in a grid.

    Each panel is titled with the image's position in ``df`` and its
    'Pawpularity' value. Images are taken in the order the rows appear in
    ``df``, so sort the frame beforehand to control which ones are shown.

    Args:
        df: DataFrame with a 'file_path' column (image location) and a
            'Pawpularity' column (shown in the title).
        rows: Number of grid rows.
        cols: Number of grid columns.
    """
    # Positional 0..n-1 index so grid positions map directly onto .loc labels;
    # reset_index already returns a new frame, so the caller's frame is untouched.
    df = df.reset_index(drop=True)
    # squeeze=False keeps `axes` two-dimensional even for a single row or column.
    fig, axes = plt.subplots(nrows=rows, ncols=cols, figsize=(cols*4, rows*4), squeeze=False)
    n_images = min(len(df), rows * cols)
    # axes.flat walks the grid row by row, i.e. idx == r * cols + c.
    for idx, ax in enumerate(axes.flat):
        if idx >= n_images:
            # Fewer images than panels: blank the leftover panel instead of failing.
            ax.axis('off')
            continue
        img = imageio.imread(df.loc[idx, 'file_path'])
        ax.imshow(img)
        ax.set_title(f'{idx}, label: {df.loc[idx, "Pawpularity"]}')

In [None]:
# Let's visually check the extreme cases, starting with the lowest Pawpularity scores.
lowest_first = train_df.sort_values('Pawpularity')
show_batch_df(lowest_first)

# Maybe there are some discernible patterns?
# TODO: unfinished note ("Also, make sure that ...") - complete or remove

In [None]:
# Same grid, this time for the highest Pawpularity scores.
highest_first = train_df.sort_values('Pawpularity', ascending=False)
show_batch_df(highest_first)

In [None]:
# No conclusion, tbh - low-scored animals are subjectively cute, and image dimensions do not differ all that much either.
# Therefore the correlation is not as simple as "better quality image" or "higher-resolution picture"

In [None]:
# Now let's start from the beginning: re-read train.csv so train_df holds only the raw CSV columns
# (this discards the 'file_path' column that was added for the image preview)
train_df = pd.read_csv(os.path.join(res_path,'train.csv'))


In [None]:
# For convenience: list the columns of the freshly reloaded frame
train_df.columns

In [None]:
#lets try and analyse default features correlation to pawpularity. Then, lets try to combine some and do that again.

In [None]:
# Column labels as an array; displayed as a plain Python list.
feature_variables = train_df.columns.values
feature_variables.tolist()


In [None]:
# Omit 'Id' (irrelevant) and 'Pawpularity' (it is the axis we measure against).
# For every remaining feature: boxplot of Pawpularity per feature value (left)
# and Pawpularity histogram coloured by feature value (right).
for variable in feature_variables[1:-1]:
    fig, ax = plt.subplots(1, 2)
    sns.boxplot(data=train_df, x=variable, y='Pawpularity', ax=ax[0])
    sns.histplot(train_df, x="Pawpularity", hue=variable, kde=True, ax=ax[1])
    fig.suptitle(variable, fontsize=20, fontweight='bold')
    # plt.show() renders under the notebook's inline backend;
    # Figure.show() only warns there because the backend is non-GUI.
    plt.show()

In [None]:
# OK, first results:
# Least influential features - Subject focus, Action, Collage, Info, Blur (?), Accessory
# People seem to care about "cuteness on photo" - Eyes, Face, Near, Human, Obstructed - need to dig deeper into these

In [None]:
# Let's try and get some correlation data.
# Column positions 1..-2 skip the first ('Id') and last ('Pawpularity') columns.
# .copy() makes this an independent frame, so adding the engineered columns to
# it later does not trigger pandas' SettingWithCopyWarning.
corr_train_df = train_df.iloc[:, 1:-1].copy()
corr_train_df.head()

In [None]:
# Annotated heatmap of the pairwise correlations between the feature columns.
feature_corr = corr_train_df.corr()
sns.heatmap(feature_corr, annot=True)

In [None]:
# So there is a strong correlation between eyes and face - let's engineer a feature "Eye Contact" from those two
# Another strong correlation is human & occlusion - let's engineer a feature "Obstructed Subject" from those two
# A less important correlation is between collage and info, but neither seems to affect the result as much

In [None]:
# 'Eye Contact' is 1 where both the 'Eyes' and 'Face' flags are set, otherwise 0.
corr_train_df['Eye Contact'] = np.where(((corr_train_df['Eyes'] == 1) & (corr_train_df['Face'] == 1)), 1, 0)
# Preview last so the notebook actually displays the new column.
corr_train_df.head()

In [None]:
# 'Obstructed Subject' is 1 only where both 'Human' and 'Occlusion' are flagged.
human_and_occluded = corr_train_df['Human'].eq(1) & corr_train_df['Occlusion'].eq(1)
corr_train_df['Obstructed Subject'] = human_and_occluded.astype(int)
corr_train_df.head()


In [None]:
# Let's group collage & info as well, just to be sure: "Informative Image"
# is 1 only where both 'Collage' and 'Info' are flagged.
collage_and_info = corr_train_df['Collage'].eq(1) & corr_train_df['Info'].eq(1)
corr_train_df['Informative Image'] = collage_and_info.astype(int)
corr_train_df.head()

In [None]:
# Recompute the correlations now that the engineered columns are included;
# maybe this reassures some of the assumptions.
engineered_corr = corr_train_df.corr()
sns.heatmap(engineered_corr, annot=True)

In [None]:
# Attach the engineered 'Eye Contact' flag to the training frame and preview it.
# (The former `train_df.drop(...)` call was removed: its result was never
# assigned, so it had no effect on train_df.)
train_df['Eye Contact'] = corr_train_df['Eye Contact']
train_df.head()

In [None]:
# Re-read train.csv to get a clean frame (drops the 'Eye Contact' column attached in the previous cell)
train_df = pd.read_csv(os.path.join(res_path,'train.csv'))
train_df.head()

In [None]:
# Copy the three engineered flags from the correlation frame onto the training frame.
for engineered in ('Eye Contact', 'Obstructed Subject', 'Informative Image'):
    train_df[engineered] = corr_train_df[engineered]

In [None]:
# Preview the training frame with the engineered columns appended
train_df.head(10)

In [None]:
# Refresh the column-label array so it includes the engineered columns.
feature_variables = train_df.columns.values
feature_variables.tolist()

In [None]:
# Once again, but only for the last three columns (the engineered features):
# boxplot of Pawpularity per feature value (left) and Pawpularity histogram
# coloured by feature value (right).
for variable in feature_variables[-3:]:
    fig, ax = plt.subplots(1, 2)
    sns.boxplot(data=train_df, x=variable, y='Pawpularity', ax=ax[0])
    sns.histplot(train_df, x="Pawpularity", hue=variable, kde=True, ax=ax[1])
    fig.suptitle(variable, fontsize=20, fontweight='bold')
    # plt.show() renders under the notebook's inline backend;
    # Figure.show() only warns there because the backend is non-GUI.
    plt.show()

In [None]:
# Conclusions for now

In [None]:
# 1 - The Pawpularity distribution looks quite similar across all features and feature values
# 2 - Photos don't seem to have a discernible pattern; there are subjectively "cute" low-score photos and vice versa
# 3 - Pawpularity might not be based on images alone; maybe there are some accompanying factors behind this score
# 4 - Some feature values are questionable upon further inspection; the data might be noisy
# 5 - Easily discernible patterns do not seem to affect the Pawpularity score all that much; the distribution remains roughly the same

In [None]:
# Addition #1: flag every photo whose score lies strictly above the median Pawpularity.
# The comparison is strict (>) so the flag agrees with the below/above split
# done further down, where a score equal to the median counts as "below".
median_pawpularity = train_df['Pawpularity'].median()
corr_train_df['Above Median'] = np.where(train_df['Pawpularity'] > median_pawpularity, 1, 0)
corr_train_df.head()

In [None]:
# Now let's check the correlations again with the 'Above Median' flag included.
median_flag_corr = corr_train_df.corr()
sns.heatmap(median_flag_corr, annot=True)

In [None]:
# Carry the median flag over to the training frame and refresh the column list.
train_df['Above Median'] = corr_train_df['Above Median']
feature_variables = train_df.columns.values
feature_variables.tolist()

In [None]:
# Sanity check: Pawpularity split by the 'Above Median' flag.
ax = sns.boxplot(data=train_df, x="Above Median", y='Pawpularity')
# plt.show() renders the figure this boxplot was drawn on; the old `fig.show()`
# pointed at a figure left over from an earlier cell.
plt.show()


In [None]:
# Finally, let's divide the dataset at the median Pawpularity (33.0 for this
# data) and check whether either half shows any substantial correlations.
# Features are re-arranged for convenience: 'Pawpularity' becomes the last column.
def _features_with_target_last(frame):
    """Return a copy of ``frame`` without 'Id' / 'Above Median', with 'Pawpularity' moved last."""
    trimmed = frame.drop(columns=['Id', 'Above Median'], errors='ignore')
    ordered = [name for name in trimmed.columns if name != 'Pawpularity'] + ['Pawpularity']
    # Explicit copy so later edits to a half never touch (or warn about) train_df.
    return trimmed[ordered].copy()


median_pawpularity = train_df['Pawpularity'].median()

# Scores equal to the median go into the "below" half.
train_df_below_median = _features_with_target_last(train_df[train_df["Pawpularity"] <= median_pawpularity])
train_df_below_median_pawp = train_df_below_median['Pawpularity']

#-------------------------------------------------------------
train_df_above_median = _features_with_target_last(train_df[train_df["Pawpularity"] > median_pawpularity])
train_df_above_median_pawp = train_df_above_median['Pawpularity']

train_df_above_median.head()

In [None]:
# Check & compare per-feature Pawpularity boxplots:
# below-median half on the left, above-median half on the right.
feature_variables_below = train_df_below_median.columns.values
# The last column is 'Pawpularity' itself, so it is skipped.
for variable in feature_variables_below[:-1]:
    fig, ax = plt.subplots(1, 2)
    sns.boxplot(data=train_df_below_median, x=variable, y='Pawpularity', ax=ax[0])
    sns.boxplot(data=train_df_above_median, x=variable, y='Pawpularity', ax=ax[1])
    ax[0].set_title('Below median')
    ax[1].set_title('Above median')
    fig.suptitle(variable, fontsize=20, fontweight='bold')
    # plt.show() renders under the notebook's inline backend;
    # Figure.show() only warns there because the backend is non-GUI.
    plt.show()

In [None]:
# Check & compare per-feature Pawpularity histograms:
# below-median half on the left, above-median half on the right.
feature_variables_below = train_df_below_median.columns.values
# The last column is 'Pawpularity' itself, so it is skipped.
for variable in feature_variables_below[:-1]:
    fig, ax = plt.subplots(1, 2)
    sns.histplot(train_df_below_median, x="Pawpularity", hue=variable, kde=True, ax=ax[0])
    sns.histplot(train_df_above_median, x="Pawpularity", hue=variable, kde=True, ax=ax[1])
    ax[0].set_title('Below median')
    ax[1].set_title('Above median')
    fig.suptitle(variable, fontsize=20, fontweight='bold')
    # plt.show() renders under the notebook's inline backend;
    # Figure.show() only warns there because the backend is non-GUI.
    plt.show()