# Exercise 1

In [None]:
import pandas as pd, numpy as np

# Load the exercise dataset; cells containing a literal '?' are read as missing.
eda_data = pd.read_csv('../datasets/eda.csv', na_values=['?'])

# Report the table dimensions: rows are records, columns are attributes.
print('{} records'.format(len(eda_data.index)))
print('{} attributes:'.format(len(eda_data.columns)))

# Preview the first rows of the table.
eda_data.head()

In [None]:
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns
# Apply seaborn's default theme; color_codes=True also remaps matplotlib's
# one-letter color codes ('b', 'g', 'r', ...) to the seaborn palette.
sns.set(color_codes=True)

## Missing data: `gender`
Missing data often present challenges that need to be addressed before applying machine learning algorithms: *should the columns with missing values be removed? Or the rows with missing values? Or can the missing values be filled in?*

For instance, in the given dataset, attribute `gender` has a high proportion of missing values (325/1000).

In [None]:
# Bar chart of the recorded `gender` categories. The number of missing
# entries is written into the title.
plt.figure()

attr_data = eda_data['gender']
missing_count = int(attr_data.isnull().sum())

plot = sns.countplot(x='gender', data=eda_data)
plot.set_title('gender\nMissing: {}'.format(missing_count))
plot.set_xlabel(' ')
# Horizontal y-axis label.
plot.set_ylabel('Count', rotation=0)

plt.draw()

Missing data are not displayed on plots. Let's create a new attribute `gender_m`, where a missing value is replaced with a placeholder `MISSING`.

In [None]:
# Build `gender_m`: a copy of `gender` in which every missing value is replaced
# by the placeholder 'MISSING', so that it appears as its own category.
#
# The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on `eda_data.gender_m`. That attribute access can return
# a temporary Series, in which case the in-place fill never reaches the
# DataFrame (pandas warns about this chained pattern, and with copy-on-write
# enabled the frame is left unchanged).
eda_data['gender_m'] = eda_data['gender'].fillna('MISSING')

# Show a few rows where `gender` was missing, next to the placeholder column.
eda_data.loc[eda_data['gender'].isnull(), ['gender', 'gender_m']].head()

In [None]:
# Count plot of `gender_m` on a larger canvas with a temporarily enlarged font.
plt.figure(figsize=(8, 6))
sns.set(font_scale=1.3)

attr_data = eda_data['gender_m']
missing_count = int(attr_data.isnull().sum())

plot = sns.countplot(x='gender_m', data=eda_data)
plot.set_title('gender_m\nMissing: {}'.format(missing_count))
plot.set_xlabel(' ')
# Horizontal y-axis label.
plot.set_ylabel('Count', rotation=0)

plt.draw()
# Back to seaborn's default settings (undoes the font scaling above).
sns.set()

To answer the question "*Are gender data missing at random?*", explore pairwise relations of `gender_m` with other attributes. It turns out that values are only missing at the `prenatal` hospital.

In [None]:
# One count-plot panel per hospital, showing the `gender_m` categories.
# `factorplot` was renamed to `catplot` in seaborn 0.9 and removed from later
# releases, so use `catplot` when it exists and fall back to the old name only
# on installations that predate the rename.
draw_facets = sns.catplot if hasattr(sns, 'catplot') else sns.factorplot
grid = draw_facets(data=eda_data, y='gender_m', col='hospital', kind='count')

# Keep the y-axis label of the first panel horizontal and clear of the ticks.
grid.axes[0, 0].yaxis.label.set_rotation(0)
grid.axes[0, 0].yaxis.labelpad = 25

# Leave room above the panels for the figure-level title.
plt.subplots_adjust(top=0.87)
grid.fig.suptitle('Gender vs. hospital')

This suggests that the missing values can be filled in with the value `f`, since the patients of a prenatal hospital are female. Note that the distribution of `gender` is now skewed, i.e. there are more females in our sample than in the general population, where the split is expected to be close to 50/50. This might be important for further modelling.

In [None]:
# Build `gender_filled`: a copy of `gender` with every missing value set to 'f'.
#
# The filled Series is assigned back to the frame rather than calling
# fillna(inplace=True) on `eda_data.gender_filled`. That attribute access can
# return a temporary Series, in which case the in-place fill never reaches the
# DataFrame (pandas warns about this chained pattern, and with copy-on-write
# enabled the frame is left unchanged).
eda_data['gender_filled'] = eda_data['gender'].fillna('f')

# Compare the original, placeholder and filled columns on rows where `gender`
# was missing.
eda_data.loc[eda_data['gender'].isnull(), ['gender', 'gender_m', 'gender_filled']].head()

In [None]:
# Count plot of `gender_filled`; the title reports how many values are
# still missing.
plt.figure()

attr_data = eda_data['gender_filled']
missing_count = int(attr_data.isnull().sum())

plot = sns.countplot(x='gender_filled', data=eda_data)
plot.set_title('gender_filled\nMissing: {}'.format(missing_count))
plot.set_xlabel(' ')
# Horizontal y-axis label.
plot.set_ylabel('Count', rotation=0)

plt.draw()

## Statistical peculiarities: `height` and `weight`
It is often worth verifying that the statistical properties of the given dataset match your expectations. For example, heights and weights are expected to be approximately normally distributed, which is clearly not the case in the hospital data, where they have strange bimodal distributions with a large number of "outliers".

In [None]:
# For each attribute draw a histogram with a KDE curve (top panel) above a box
# plot (bottom panel) that shares its x-axis. Summary statistics go into the
# title of the top panel.
for attr_name in ['height', 'weight']:
    f, (ax_hist, ax_box) = plt.subplots(
        2, sharex=True, gridspec_kw={"height_ratios": (.9, .1)})

    attr_data = eda_data[attr_name]
    # Named *_value so that the builtins min()/max() are not overwritten for
    # the rest of the notebook session.
    min_value = attr_data.min()
    mean = attr_data.mean()
    median = attr_data.median()
    max_value = attr_data.max()
    std_dev = attr_data.std()
    missing_count = np.count_nonzero(attr_data.isnull().values)

    # The histogram/KDE is fed only the non-missing observations.
    distplot = sns.distplot(attr_data.dropna(), kde=True, rug=False,
                            axlabel=False, ax=ax_hist)
    summary = '   '.join([
        'Min: ' + str(min_value),
        'Avg: ' + str(round(mean, 2)),
        'Std.dev: ' + str(round(std_dev, 2)),
        'Median: ' + str(median),
        'Max: ' + str(max_value),
        'Missing: ' + str(missing_count),
    ])
    distplot.set_title(attr_name + '\n' + summary)

    # Pass the values as `x=` explicitly: the meaning of the first positional
    # argument of boxplot differs between seaborn versions, while the keyword
    # always gives a horizontal box along the shared x-axis.
    boxplot = sns.boxplot(x=attr_data, ax=ax_box)
    boxplot.set_xlabel(' ')

Again, the question to investigate is whether this happens randomly or is correlated with other attributes. In the latter case, there is an opportunity to fix these statistical anomalies.

In the hospital data, the `sports` hospital apparently records heights and weights in imperial units (feet and pounds), which can easily be fixed by converting them to metres and kilograms.

In [None]:
# Jittered strip plot of the raw `height` values, one row per hospital.
plot = sns.stripplot(
    x="height",
    y="hospital",
    data=eda_data,
    jitter=True,
)
plot.set_title('Height vs. hospital')

# Horizontal y-axis label, moved away from the tick labels.
plot.yaxis.labelpad = 25
plot.yaxis.label.set_rotation(0)

In [None]:
# Rows recorded by the `sports` hospital; only these are rescaled.
is_sports = eda_data.hospital == 'sports'

# (source column, converted column, multiplier) - 0.3048 is the feet-to-metres
# factor and 0.45359237 the pounds-to-kilograms factor.
for source_col, target_col, factor in [('height', 'height_m', 0.3048),
                                       ('weight', 'weight_kg', 0.45359237)]:
    # Start from a copy of the raw column, then rescale the sports rows in place.
    eda_data[target_col] = eda_data[source_col]
    eda_data.loc[is_sports, target_col] *= factor

# Show raw and converted values side by side for a few sports rows.
eda_data.loc[is_sports, ['height', 'height_m', 'weight', 'weight_kg']].head()

In [None]:
# Histogram with a KDE curve (top panel) above a box plot (bottom panel) for
# the converted `height_m` column. Summary statistics go into the title.
attr_name = 'height_m'
f, (ax_hist, ax_box) = plt.subplots(2, sharex=True, gridspec_kw={"height_ratios": (.9, .1)})

attr_data = eda_data[attr_name]
# Named *_value so that the builtins min()/max() are not overwritten for the
# rest of the notebook session.
min_value = attr_data.min()
mean = attr_data.mean()
median = attr_data.median()
max_value = attr_data.max()
std_dev = attr_data.std()
missing_count = np.count_nonzero(attr_data.isnull().values)

# The histogram/KDE is fed only the non-missing observations.
distplot = sns.distplot(attr_data.dropna(), kde=True, rug=False, axlabel=False, ax=ax_hist)
summary = '   '.join([
    'Min: ' + str(min_value),
    'Avg: ' + str(round(mean, 2)),
    'Std.dev: ' + str(round(std_dev, 2)),
    'Median: ' + str(median),
    'Max: ' + str(max_value),
    'Missing: ' + str(missing_count),
])
distplot.set_title(attr_name + '\n' + summary)

# Pass the values as `x=` explicitly: the meaning of the first positional
# argument of boxplot differs between seaborn versions, while the keyword
# always gives a horizontal box along the shared x-axis.
boxplot = sns.boxplot(x=attr_data, ax=ax_box)
boxplot.set_xlabel(' ')

The distribution of the *transformed* height is slightly left-skewed compared to the normal distribution. The reason is that patients of the `sports` hospital are taller and heavier than the rest of the patients.

In [None]:
# Jittered strip plot of the converted `height_m` values, one row per hospital.
plot = sns.stripplot(
    x="height_m",
    y="hospital",
    data=eda_data,
    jitter=True,
)
plot.set_title('Height vs. hospital')

# Horizontal y-axis label, moved away from the tick labels.
plot.yaxis.labelpad = 25
plot.yaxis.label.set_rotation(0)

In [None]:
# Overlay the `weight_kg` distribution of the `sports` hospital with that of
# all other rows. Missing weights are dropped before plotting, as in the other
# distplot cells of this notebook, and each curve gets a legend label so the
# two distributions can be told apart.
is_sports = eda_data['hospital'] == 'sports'
sns.distplot(eda_data.loc[is_sports, 'weight_kg'].dropna(), label='sports')
sns.distplot(eda_data.loc[~is_sports, 'weight_kg'].dropna(), label='other hospitals')
plt.legend()

In [None]:
# Scatter plot of converted height against converted weight, coloured by
# hospital, with the legend placed outside the axes.
#
# The figure size is set through matplotlib rather than FacetGrid's `size=`
# keyword, which has been deprecated in favour of `height=` since seaborn 0.9.
# Sizing the figure directly works regardless of which spelling the installed
# seaborn accepts. It is done before map()/add_legend() so that their layout
# adjustments see the final 7x7 inch canvas.
grid = sns.FacetGrid(data=eda_data, hue='hospital', legend_out=True)
grid.fig.set_size_inches(7, 7)
grid.map(plt.scatter, "height_m", "weight_kg")
grid.add_legend()

# Leave room above the axes for the figure-level title.
plt.subplots_adjust(top=0.95)
grid.fig.suptitle('Height in m vs. weight in kg')

# Horizontal y-axis label, moved away from the tick labels.
grid.ax.yaxis.label.set_rotation(0)
grid.ax.yaxis.labelpad = 35

# Set ranges for X and Y axes
grid.ax.set_xlim(1.6, 2.0)
grid.ax.set_ylim(50, 120)