In [None]:
# Necessary libraries to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# plotly
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.figure_factory as ff

## Train and Test csv files

Let's start by examining the tabular data provided as CSV files for the train and test datasets.

> #### 1. Data Loading

In [None]:
# Read the train and test CSV tables from the competition input directory.
train = pd.read_csv('../input/petfinder-pawpularity-score/train.csv')
test = pd.read_csv('../input/petfinder-pawpularity-score/test.csv')

# Split the training table: every column except the last one goes to
# train_x, and the "Pawpularity" column becomes the target train_y.
feature_columns = train.columns[:-1]
train_x = train[feature_columns]
train_y = train["Pawpularity"]

> #### 2.  EDA

In [None]:
# Preview the first rows of the training frame without its last column.
train_x.head()

In [None]:
# Print the column summary (dtypes, non-null counts) for the training features.
train_x.info()

In [None]:
# Transposed summary statistics for train_x (one row per described column),
# styled with a bar on the mean and colour gradients on the std and median.
train_summary = train_x.describe().T
(
    train_summary.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Print the column summary (dtypes, non-null counts) for the test frame.
test.info()

In [None]:
# Transposed summary statistics for test (one row per described column),
# styled with a bar on the mean and colour gradients on the std and median.
test_summary = test.describe().T
(
    test_summary.style
    .bar(subset=['mean'], color='#205ff2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='coolwarm')
)

#### Summary:
* All features are binary variables taking the values 0 and 1.
* The test dataset contains only 8 variables, so a comparison between the train and test data is not very meaningful here.
* There are no missing values for any feature.
* Eyes, Face and Near variables have higher mean values than the others.

> #### 3. Distribution Check

In [None]:
# Draw one small pie chart per column of `train`, skipping the first and
# the last column.
train_binary_features = train.columns[1:-1]
for feature in train_binary_features:
    pie = px.pie(
        train,
        names=feature,
        color_discrete_sequence=px.colors.sequential.RdBu,
    )
    # Fixed 300x300 canvas so the charts stay compact.
    pie.update_layout(autosize=False, width=300, height=300)
    pie.show()

In [None]:
# Distribution plot of the "Pawpularity" column of the training table.
group_labels = ['Pawpularity']
hist_data = [train[label] for label in group_labels]

fig = ff.create_distplot(hist_data=hist_data, group_labels=group_labels)
fig.show()

> #### 4. Correlation check

In [None]:
# Pairwise correlation matrix of the training features; the first column of
# train_x is skipped.
train_corr = train_x[train_x.columns[1:]].corr()

# Boolean mask that hides the upper triangle *and* the diagonal, so each
# pair of columns is drawn only once.
# NOTE: the builtin `bool` is used on purpose -- the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in NumPy 1.24, where it raises
# AttributeError.
mask = np.zeros_like(train_corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True

fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(
    train_corr,
    ax=ax,
    square=True,
    center=0,
    linewidth=.2,
    cmap=sns.diverging_palette(160, 10, as_cmap=True),
    mask=mask,
    annot=True,
    annot_kws={'size': 7},
)
ax.set_title('Correlation plot for all binary variables', fontweight='bold')

# plt.show() renders through the active backend; Figure.show() needs a GUI
# event loop and only emits a warning under non-GUI notebook backends.
plt.show()

references:
* [categorical EDA 1](https://www.kaggle.com/subinium/tps-may-categorical-eda)
* [categorical EDA 2](https://www.kaggle.com/kimalpha/petfinder-simple-t-sne-fireworks)
