In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Locations of the competition's tabular metadata files.
train_path = '../input/petfinder-pawpularity-score/train.csv'
test_path = '../input/petfinder-pawpularity-score/test.csv'

# Read both tables into DataFrames.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [None]:
# preview the first rows of the train dataset
train.head()

In [None]:
# preview the first rows of the test dataset
test.head()

In [None]:
# checking missing values and types of columns
# (info() prints each column's dtype and non-null count)

train.info()

**No missing values found, all columns except for Id are numerical. Nothing to clean here.**

In [None]:
# let's look at key statistics of the train dataset
# (describe() reports count, mean, std, min, quartiles and max per numeric column)

train.describe()

**All columns except Pawpularity have values of either 0 or 1. Pawpularity has values from 1 to 100. The mean Pawpularity score is around 38 (below 50), which points to a right-skewed distribution.**

In [None]:
# Histogram of the target (Pawpularity) with its mean, median and mode marked.
plt.style.use('ggplot')
plt.rcParams.update({'font.size': 14})

pawpularity = train['Pawpularity']

fig, ax = plt.subplots(figsize=(12, 8))
ax.hist(pawpularity, bins=50)

ax.axvline(pawpularity.mean(), color='green', label='Mean')
ax.axvline(pawpularity.median(), color='blue', label='Median')
# Series.mode() returns all most-frequent values sorted ascending, labelled
# 0..k-1. Position 0 always exists; label 1 only exists when there is a tie
# (and would then be the *second* mode), so take the first entry by position.
ax.axvline(pawpularity.mode().iloc[0], color='yellow', label='Mode')

# Label the figure so it can be read on its own.
ax.set_title('Distribution of Pawpularity')
ax.set_xlabel('Pawpularity')
ax.set_ylabel('Number of entries')

ax.legend()

plt.show()

**The distribution is indeed right-skewed, and there is a spike at a score of 100.**

In [None]:
# let's have a look at all entries that were rated 100

# Filter first and copy afterwards: only the matching rows are duplicated
# (instead of the whole frame), and train_100 is an independent DataFrame.
train_100 = train[train.Pawpularity == 100].copy()
train_100.head()

In [None]:
# ...and all entries that were not rated 100

# Filter first and copy afterwards: only the matching rows are duplicated
# (instead of the whole frame), and train_not_100 is an independent DataFrame.
train_not_100 = train[train.Pawpularity != 100].copy()
train_not_100.head()

In [None]:
# Side-by-side horizontal bars: per-feature mean for entries rated 100
# versus all remaining entries.

# The 'mean' row of describe() holds the column means; the trailing column
# (the target) is sliced off so only the features remain.
features = train.describe().loc['mean'].iloc[:-1].index
values_not_100 = train_not_100.describe().loc['mean'].iloc[:-1].to_numpy()
values_100 = train_100.describe().loc['mean'].iloc[:-1].to_numpy()

fig, ax = plt.subplots(figsize=(12, 8))

# One slot per feature; the two bars of a pair are offset by one bar height.
width = 0.4
ind = np.arange(len(features))

mean_not100 = ax.barh(ind, values_not_100, height=width, label='All scores except 100')
mean_100 = ax.barh(ind + width, values_100, height=width, label='Only 100 scores')

ax.set_xlabel('Mean score', fontsize=14)
ax.set_ylabel('Features', fontsize=14)

# Put each tick between the two bars of its pair.
ax.set_yticks(ind + width/2)
ax.set_yticklabels(features)

ax.legend()

plt.show()

**There seems to be no significant difference in feature means between entries scored 100 and all other entries.**

In [None]:
# let's have a look at correlations
sns.set(rc={'figure.figsize':(12, 8)})

# Drop the non-numeric Id *column* before correlating. The previous
# `iloc[1:]` removed the first *row* instead, which discarded one sample and
# left the string column in the frame (corr() raises on it in pandas >= 2.0).
sns.heatmap(np.abs(train.drop(columns='Id').corr()), cmap='YlOrRd')

In [None]:
# Correlation of every numeric column with the target. The Id column is
# dropped by name; `iloc[1:]` dropped the first row, not the Id column.
train.drop(columns='Id').corr()['Pawpularity']

**Looks like there's almost no correlation between any feature and the score**

In [None]:
# let's check if there is any correlation between the features

# Keep only the feature columns: remove the Id string column and the target
# by name. The previous `iloc[1:, :-1]` dropped the first row rather than Id.
sns.heatmap(np.abs(train.drop(columns=['Id', 'Pawpularity']).corr()), cmap='YlOrRd')

**In general, the correlations between features are also weak. There are moderate correlations between Eyes and Face, Human and Occlusion, Blur and Eyes, Collage and Info, Group and Near.**

In [None]:
# let's look at the column types and non-null counts of the test dataset

test.info()

In [None]:
# plotting the difference in means of all features between test and train

# Pick the feature columns by name once and use the same list for both
# frames and for the tick labels. This keeps the cell self-contained (no
# dependence on a variable from an earlier plotting cell) and guarantees that
# train bars, test bars and labels refer to the same columns in the same order.
feature_names = train.columns.drop(['Id', 'Pawpularity'])
values_train = train[feature_names].mean().values
values_test = test[feature_names].mean().values

fig, ax = plt.subplots(figsize=(12, 8))

# One slot per feature; the two bars of a pair are offset by one bar height.
ind = np.arange(len(feature_names))
width = 0.4

mean_train = ax.barh(ind, values_train, width, label='Train dataset')
mean_test = ax.barh(ind + width, values_test, width, label='Test dataset')

ax.set_ylabel('Features', fontsize=14)
ax.set_xlabel('Mean score', fontsize=14)
# Put each tick between the two bars of its pair.
ax.set(yticks=ind + width/2, yticklabels=feature_names)

ax.legend()

plt.show()

**There are significant differences between the feature means in the train and test datasets. However, since the test dataframe has only 8 entries, the comparison is not meaningful.**