In [None]:
# libs
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

# note:
- there may be some misclassifications; if you spot any, let me know and I will correct the dataset.
- if you use the classifications, please tag me in your projects (I'm interested to see what you come up with).
- spoiler: there is a difference in pawpularity between cats and dogs.

In [None]:
# data: competition training table plus the separately published cat/dog labels
df_data = pd.read_csv('/kaggle/input/petfinder-pawpularity-score/train.csv')
df_cat_or_dog = pd.read_csv('/kaggle/input/pawpularity-cat-or-dog/cat_class.csv')

# merge cat classifications
# (no key is given, so pandas joins on whichever columns the two frames share)
# and derive a readable 'class' label from the is_cat flag
df_data = (
    df_data
    .merge(df_cat_or_dog)
    .assign(**{'class': lambda merged: np.where(merged['is_cat'] == 1, 'cat', 'dog')})
)
del df_cat_or_dog  # the labels now live in df_data

# peek at the first merged row
df_data.head(1)

## counts.

In [None]:
# plot: number of rows per class, with the share of cats in the title
cat_share_pct = round(df_data['is_cat'].mean() * 100, 1)

fig, ax = plt.subplots(figsize=(12, 4))
sns.countplot(data=df_data, x='class', ax=ax)
ax.set_title(str(cat_share_pct) + '% are cats.')
plt.show()

## pawpularity / distribution.

In [None]:
# plot: left panel = per-class density of the score, right panel = per-class violin
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20,5))
kde_ax, violin_ax = ax

# graph: select each class's scores once and reuse them for the curve and the average
scores_by_class = {}
for pet, colour in (('dog', 'b'), ('cat', 'r')):
    scores_by_class[pet] = df_data.loc[df_data['class'] == pet, 'Pawpularity']
    sns.kdeplot(scores_by_class[pet], color=colour, label=pet, ax=kde_ax)
sns.violinplot(data=df_data, y='Pawpularity', x='class', ax=violin_ax)

# avgs
avg_dog = round(scores_by_class['dog'].mean(), 1)
avg_cat = round(scores_by_class['cat'].mean(), 1)

# annotate
kde_ax.set_title('avg pawpularity for cats: ' + str(avg_cat) + '%, dogs: ' + str(avg_dog) + '%.')
kde_ax.legend(loc="upper left")
plt.show()

you can see straight away that dogs get a more favorable pawpularity than cats!

## prediction.

In [None]:
# vars: target column, the image-metadata features, and the same features plus the cat flag
y_target = 'Pawpularity'
x_cols = [
    'Eyes', 'Face', 'Near', 'Action', 'Accessory', 'Group',
    'Collage', 'Human', 'Occlusion', 'Info', 'Blur',
]
x_cols_class = x_cols + ['is_cat'] # with class

# split data: 70% train / 30% test, fixed seed so the split is repeatable
df_train, df_test = train_test_split(df_data, test_size=0.3, random_state=42)
print('train:', df_train.shape)
print('test:', df_test.shape)

In [None]:
# train without classifications.
model = CatBoostRegressor(iterations=100000, verbose=False, eval_metric='RMSE')
model.fit(df_train[x_cols], df_train[y_target], eval_set=[(df_test[x_cols], df_test[y_target])], early_stopping_rounds=10)
print ('baseline: ', np.sqrt(mean_squared_error(df_test[y_target], model.predict(df_test[x_cols]))))

In [None]:
# train with classifications.
model = CatBoostRegressor(iterations=100000, verbose=False, eval_metric='RMSE')
model.fit(df_train[x_cols_class], df_train[y_target], eval_set=[(df_test[x_cols_class], df_test[y_target])], early_stopping_rounds=10)
print ('with classifications: ', np.sqrt(mean_squared_error(df_test[y_target], model.predict(df_test[x_cols_class]))))

a slight performance improvement when the classifications are included (and on the leaderboard, every small improvement counts).

next steps:
- predict the cat/dog classifications.
- include the predicted classifications in the pawpularity prediction.
- build an image model.