In [None]:
import os

import cv2

from tqdm import tqdm

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

## Data at a glance!

In [None]:
# Read the three competition CSVs in a single pass; the generator yields the
# frames in the same order as the names on the left-hand side.
train_df, test_df, sample_sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/petfinder-pawpularity-score/train.csv",
        "../input/petfinder-pawpularity-score/test.csv",
        "../input/petfinder-pawpularity-score/sample_submission.csv",
    )
)

In [None]:
# Report the shape of the training table next to the number of entries found
# in the train/ image directory, so the two can be compared by eye.
print(f"Training data size: {train_df.shape}")
train_dir_entries = os.listdir("../input/petfinder-pawpularity-score/train")
print(f'Number of images in "train/": {len(train_dir_entries)}')

In [None]:
# Preview the first five rows of the training table.
train_df.head(n=5)

In [None]:
# Preview the first five rows of the test table.
test_df.head(n=5)

In [None]:
# Preview the first five rows of the sample submission.
sample_sub_df.head(n=5)

## Target variable analysis

In [None]:
# Histogram of the target column with a kernel-density overlay.
sns.displot(
    data=train_df,
    x="Pawpularity",
    kind="hist",
    kde=True,
    height=8,
)

The pawpularity ranges from 1 to 100.

In [None]:
# Summary statistics of the target, shown as a one-column table.
train_df["Pawpularity"].describe().to_frame()

### Effect of meta variables on pawpularity

In [None]:
# One panel per column between the first and the last column of train_df
# (presumably the metadata flags between the id and the target; verify
# against train.csv). Each panel shows the Pawpularity histogram split by the
# values of that column.
meta_cols = list(train_df.columns)[1:-1]

f, ax = plt.subplots(nrows=2, ncols=6, figsize=(16, 12))
ax = ax.ravel()

for panel_no, meta_col in enumerate(meta_cols):
    sns.histplot(
        data=train_df[[meta_col, "Pawpularity"]],
        x="Pawpularity",
        hue=meta_col,
        kde=True,
        ax=ax[panel_no],
    )

## Some of the most pawpular images

In [None]:
# Show up to 15 training images whose Pawpularity is 100.
# The selection is computed once up front instead of re-filtering the whole
# frame on every loop iteration, and head(15) tolerates fewer than 15 matches.
top_df = train_df[train_df['Pawpularity'] == 100].head(15)

f, ax = plt.subplots(3, 5, figsize=(20, 10))
ax = ax.flatten()

for idx, (img_id, score) in enumerate(zip(top_df['Id'], top_df['Pawpularity'])):
    ax[idx].imshow(mpimg.imread(f"../input/petfinder-pawpularity-score/train/{img_id}.jpg"))
    # The score comes from the same row as the id, so no second lookup is needed.
    ax[idx].title.set_text(f"Pawpularity: {score}")
    # grid(None) *toggles* grid visibility rather than disabling it; pass
    # False so no grid lines are ever drawn over the image.
    ax[idx].grid(False)

# Blank any panels left over when fewer than 15 images match.
for unused_ax in ax[len(top_df):]:
    unused_ax.axis("off")

## Some of the least pawpular images

In [None]:
# Show up to 15 training images whose Pawpularity is 1 or 2, lowest first.
# The filter and sort run once up front instead of on every loop iteration,
# and head(15) tolerates fewer than 15 matching rows.
least_df = (
    train_df[train_df['Pawpularity'].isin([1, 2])]
    .sort_values("Pawpularity")
    .head(15)
)

f, ax = plt.subplots(3, 5, figsize=(20, 10))
ax = ax.flatten()

for idx, (img_id, score) in enumerate(zip(least_df['Id'], least_df['Pawpularity'])):
    ax[idx].imshow(mpimg.imread(f"../input/petfinder-pawpularity-score/train/{img_id}.jpg"))
    # The score comes from the same row as the id, so no second lookup is needed.
    ax[idx].title.set_text(f"Pawpularity: {score}")
    # grid(None) *toggles* grid visibility rather than disabling it; pass
    # False so no grid lines are ever drawn over the image.
    ax[idx].grid(False)

# Blank any panels left over when fewer than 15 images match.
for unused_ax in ax[len(least_df):]:
    unused_ax.axis("off")

## Mean image size in the training set

In [None]:
# Sum the pixel height and width of every training image; the totals are
# turned into means in a later cell.
widths, heights = 0, 0
for img_id in tqdm(train_df["Id"]):
    img_path = f"../input/petfinder-pawpularity-score/train/{img_id}.jpg"
    img = cv2.imread(img_path)
    # cv2.imread reports a missing or undecodable file by returning None
    # instead of raising, so fail here with the offending path rather than
    # with an opaque AttributeError on `img.shape`.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {img_path}")
    # shape is (rows, cols, channels); only the first two are needed.
    h, w = img.shape[:2]
    heights += h
    widths += w

In [None]:
# Convert the accumulated totals into per-image averages.
n_train_rows = len(train_df)
mean_height = heights / n_train_rows
mean_width = widths / n_train_rows

print(f"Mean Height: {mean_height}")
print(f"Mean Width: {mean_width}")

In [None]:
# Baseline submission: every row gets the mean Pawpularity of the training
# set, then the frame is written out without its index.
mean_pawpularity = train_df["Pawpularity"].mean()
sample_sub_df["Pawpularity"] = mean_pawpularity
sample_sub_df.to_csv("submission.csv", index=False)