# EDA of Body Measurements Data

In [None]:
# PACKAGES

# standard
import numpy as np
import pandas as pd

# plots
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# read the raw measurements CSV and print a column / dtype / non-null overview
df = pd.read_csv(
    filepath_or_buffer='../input/body-measurements-dataset/Body Measurements _ original_CSV.csv'
)
df.info()

# Data Preparation

In [None]:
# One observation has missing gender information.
# Series.isna() is used instead of np.isnan(): np.isnan() raises a TypeError on
# object-dtype columns, so the original lookup failed whenever this cell was
# re-run after Gender had been recoded to string labels.
df[df['Gender'].isna()]

In [None]:
# Treat the missing gender as code 0, then map the numeric codes to readable
# labels (0 -> 'X' for unknown, 1 -> 'M', 2 -> 'F').
df['Gender'] = (
    df['Gender']
    .fillna(value=0)
    .replace({0: 'X', 1: 'M', 2: 'F'})
)

# EDA

In [None]:
# categorical feature columns
features_cat = [
    'Gender',
]

In [None]:
# bar chart of the number of observations per gender label
gender_ax = df['Gender'].value_counts().plot.bar()
gender_ax.set_title('Gender')
gender_ax.grid()
plt.show()

In [None]:
# Numeric feature columns. Several names carry a trailing space; they are
# written here exactly as they are used to index the DataFrame.
features_num = [
    'Age',
    'HeadCircumference',
    'ShoulderWidth',
    'ChestWidth ',
    'Belly ',
    'Waist ',
    'Hips ',
    'ArmLength ',
    'ShoulderToWaist ',
    'WaistToKnee ',
    'LegLength',
    'TotalHeight',
]

In [None]:
# summary statistics of the numeric features, including extra tail percentiles
df.loc[:, features_num].describe(
    percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99],
)

In [None]:
# distribution plots: one histogram and one horizontal boxplot per numeric feature
sns.set_style(style='ticks', rc={'axes.grid': False})
for col in features_num:
    values = df[col]
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 7))
    hist_ax, box_ax = axes

    # upper panel: histogram
    hist_ax.hist(values)
    hist_ax.set_title(f'Histogram of {col}')
    hist_ax.grid()

    # lower panel: horizontal boxplot
    box_ax.boxplot(values, vert=False)
    box_ax.set_title(f'Boxplot of {col}')
    box_ax.grid()

    plt.show()

### Plot Height vs Age by Gender

In [None]:
# joint scatter plot of height against age, coloured by gender
sns.set_style(style='ticks', rc={'axes.grid': True})
sns.jointplot(
    x='Age',
    y='TotalHeight',
    hue='Gender',
    data=df,
    alpha=0.5,
    height=7,
)
plt.show()

# Restrict to adults

In [None]:
# keep only observations aged 18 or older; copy so the subset is independent of df
df_adult = df.loc[df['Age'].ge(18)].copy()

In [None]:
# summary statistics of the numeric features for the adult subset
df_adult.loc[:, features_num].describe(
    percentiles=[0.01, 0.1, 0.25, 0.5, 0.75, 0.9, 0.99],
)

In [None]:
# distribution plots for the adult subset: histogram plus horizontal boxplot per feature
sns.set_style(style='ticks', rc={'axes.grid': False})
for col in features_num:
    values = df_adult[col]
    fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(10, 7))
    hist_ax, box_ax = axes

    # upper panel: histogram
    hist_ax.hist(values)
    hist_ax.set_title(f'Histogram of {col} (adult)')
    hist_ax.grid()

    # lower panel: horizontal boxplot
    box_ax.boxplot(values, vert=False)
    box_ax.set_title(f'Boxplot of {col} (adult)')
    box_ax.grid()

    plt.show()

### Plot Height vs Age by Gender (adults)

In [None]:
# joint scatter plot of height against age for adults, coloured by gender
sns.set_style(style='ticks', rc={'axes.grid': True})
sns.jointplot(
    x='Age',
    y='TotalHeight',
    hue='Gender',
    data=df_adult,
    alpha=0.5,
    height=7,
)
plt.show()

### Pairwise scatter plot (adults)

In [None]:
# Pairwise scatter plot of the numeric features for adults only.
# This section is about adults, so the plot is built from df_adult rather
# than the unfiltered df.
pair_grid = sns.pairplot(df_adult[features_num])
# Enable the grid explicitly on every panel: a bare plt.grid() only acts on
# the current axes and toggles its grid state instead of switching it on.
for ax in pair_grid.axes.flat:
    ax.grid(True)
plt.show()

### Correlations (adults)

In [None]:
# correlation matrices of the numeric features for the adult subset
corr_pearson = df_adult[features_num].corr(method='pearson')
corr_spearman = df_adult[features_num].corr(method='spearman')

# one annotated heatmap per correlation method, on a fixed [-1, +1] colour scale
for corr_matrix, heading in (
    (corr_pearson, 'Pearson Correlation'),
    (corr_spearman, 'Spearman Correlation'),
):
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(corr_matrix, annot=True, cmap='RdYlGn', vmin=-1, vmax=+1, ax=ax)
    ax.set_title(heading)
    plt.show()