# Table of Contents
* [EDA](#1)
* [Target vs Features](#2)
* [Fit Linear Model](#3)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# statistics tools
import scipy.stats as stats
from sklearn.metrics import mean_absolute_error, mean_squared_error

# machine learning tools
import h2o
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

In [None]:
# Read the body fat CSV from its relative input path and preview the first rows.
csv_path = '../input/body-fat-prediction-dataset/bodyfat.csv'
df = pd.read_csv(csv_path)
df.head()

<a id='1'></a>
# EDA

In [None]:
# data overview: print the frame summary (used below to check for missing values)
df.info()

#### No missing values.

In [None]:
# basic stats: descriptive statistics of the loaded columns
df.describe()

In [None]:
# Histogram of the target column (25 bins), drawn through the returned axes object.
target_ax = df['BodyFat'].plot(kind='hist', bins=25)
target_ax.set_title('Body Fat - Distribution')
target_ax.grid()
plt.show()

In [None]:
# Scatter the target (y axis) against the Density column (x axis).
plt.scatter(x=df['Density'], y=df['BodyFat'])
plt.title('Body Fat vs Density')
plt.grid()
plt.show()

In [None]:
# Pearson correlation between Density and BodyFat; the result of
# stats.pearsonr is printed as returned.
density_fat_corr = stats.pearsonr(df['Density'], df['BodyFat'])
print('Correlation Pearson:', density_fat_corr)

### There is a very strong dependency between Body Fat and Density. We will try to build a model without using the Density feature.

### For a model that also uses Density as a feature, see the following notebook:
#### https://www.kaggle.com/docxian/body-fat-prediction-glm/

In [None]:
# Predictor columns used throughout the notebook.
# Density is deliberately not part of this list.
features = [
    'Age', 'Weight', 'Height',
    'Neck', 'Chest', 'Abdomen', 'Hip',
    'Thigh', 'Knee', 'Ankle',
    'Biceps', 'Forearm', 'Wrist',
]

In [None]:
# One wide, horizontal boxplot per feature, each in its own figure.
for feature_name in features:
    box_fig, box_ax = plt.subplots(figsize=(12,2))
    box_ax.boxplot(df[feature_name], vert=False)
    box_ax.set_title(feature_name)
    box_ax.grid()
    plt.show()

#### The observation with a Height of about 30 inches is a significant outlier, so let's remove this row.

In [None]:
# display the row(s) with Height of at most 30, i.e. the outlier seen in the boxplot
df[df.Height <= 30]

In [None]:
# Drop the Height outlier (keep only rows with Height above 30).
# An explicit copy makes the filtered frame independent of the original, so
# the columns added to `df` later are not written to a selection of another
# frame (the situation pandas reports as SettingWithCopyWarning).
df = df[df.Height > 30].copy()

In [None]:
# Pairwise regression plots of all features: magenta fit line, translucent points.
reg_line_style = {'color':'magenta'}
point_style = {'alpha': 0.25}
sns.pairplot(
    df[features],
    kind='reg',
    plot_kws={'line_kws': reg_line_style, 'scatter_kws': point_style},
)
plt.show()

In [None]:
# Pearson and Spearman correlation matrices of the features,
# shown as two stacked heatmaps on a common colour scale of [-1, +1].
feature_frame = df[features]
corr_pearson = feature_frame.corr(method='pearson')
corr_spearman = feature_frame.corr(method='spearman')

# heatmap options common to both panels
heatmap_kws = dict(annot=True, cmap='RdYlGn', vmin=-1, vmax=+1)

plt.figure(figsize=(12,14))

# upper panel: Pearson
ax1 = plt.subplot(2,1,1)
sns.heatmap(corr_pearson, **heatmap_kws)
plt.title('Pearson Correlation')

# lower panel: Spearman, sharing the x axis with the upper panel
ax2 = plt.subplot(2,1,2, sharex=ax1)
sns.heatmap(corr_spearman, **heatmap_kws)
plt.title('Spearman Correlation')
plt.show()

In [None]:
# add height and weight in metric units
# Height: inches -> metres via 2.54/100.
# Weight: scaled by 0.454 (presumably pounds -> kilograms with a rounded
# factor -- confirm against the dataset description).
df['Height_m'] = df.Height*2.54/100
df['Weight_m'] = df.Weight*0.454

# add BMI as feature: metric weight divided by squared metric height
df['BMI'] = df.Weight_m / (df.Height_m**2)

# Only extend the feature list once, so that re-running this cell does not
# leave a duplicated 'BMI' entry among the model predictors.
if 'BMI' not in features:
    features = features + ['BMI']

In [None]:
# display the data frame including the newly added columns
df

<a id='2'></a>
# Target vs Features

In [None]:
# One scatter plot per feature against the target; the Pearson correlation
# (rounded to 4 decimals) is shown in the title.
for feature_name in features:
    corr_value = np.round(df[feature_name].corr(df.BodyFat, method='pearson'), 4)
    scatter_fig, scatter_ax = plt.subplots(figsize=(5,5))
    scatter_ax.scatter(df[feature_name], df.BodyFat, alpha=0.5)
    scatter_ax.set_title(f'Body Fat vs {feature_name} / corr = {corr_value!s}')
    scatter_ax.set_xlabel(feature_name)
    scatter_ax.set_ylabel('Body Fat')
    scatter_ax.grid()
    plt.show()

<a id='3'></a>
# Fit Linear Model

In [None]:
# used features: print the final predictor list that is passed to the model
print(features)

In [None]:
# Name of the response column that the model is trained to predict.
target = 'BodyFat'

In [None]:
# Start H2O, using a maximum of 12 GB RAM and 4 cores.
h2o.init(
    max_mem_size='12G',
    nthreads=4,
)

In [None]:
# Ratio handed to split_frame for the training part.
train_perc = 0.7

# Move the pandas data frame into the H2O environment.
df_hex = h2o.H2OFrame(df)

# Train / test split with a fixed seed.
train_hex, test_hex = df_hex.split_frame(ratios=[train_perc], seed=999)

In [None]:
# Configure the GLM: the settings are gathered in a dict and then passed to
# the estimator as keyword arguments.
glm_params = {
    'family': 'gaussian',
    'nfolds': 5,
    'alpha': 0.75,  # 0:Ridge (L2), 1:LASSO (L1)
    'lambda_search': True,
    'score_each_iteration': True,
    'seed': 12345,
}
glm_model = H2OGeneralizedLinearEstimator(**glm_params)

In [None]:
# Fit the GLM on the training frame: `features` are the predictors,
# `target` is the response.
glm_model.train(x=features, y=target, training_frame=train_hex)

In [None]:
# show model details: display the fitted estimator as the cell output
glm_model

In [None]:
# variable importance: plot the fitted model's variable importances
glm_model.varimp_plot()

In [None]:
# show coefficients of the fitted GLM
glm_model.coef()

In [None]:
# Score the training frame and pull actuals / predictions out as arrays.
pred_train = glm_model.predict(train_hex)

train_frame_pd = train_hex.as_data_frame()
y_train_act = train_frame_pd[target].values  # actuals

pred_train_pd = pred_train.as_data_frame()
y_train_pred = pred_train_pd['predict'].values  # predictions

In [None]:
# Predicted vs. actual values on the training data; the green diagonal from
# (0, 0) to (50, 50) marks where prediction equals actual.
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(x=y_train_act, y=y_train_pred)
ax.plot([0,50], [0,50], color='green')
ax.set_aspect(1)
ax.grid()
ax.set_title('Prediction vs Actual - Training Data')
ax.set_xlabel('Actual')
ax.set_ylabel('Prediction')
plt.show()

In [None]:
# Pearson and Spearman correlation between actuals and predictions (training data).
print('Correlations - Training Data')
pearson_train = stats.pearsonr(y_train_act, y_train_pred)
print('Correlation Pearson:', pearson_train)
spearman_train = stats.spearmanr(y_train_act, y_train_pred)
print('Correlation Spearman:', spearman_train)

In [None]:
# MAE and RMSE on the training data, each rounded to two decimals.
mae_train = mean_absolute_error(y_train_act, y_train_pred)
print('MAE (train): ', np.round(mae_train, 2))
rmse_train = np.sqrt(mean_squared_error(y_train_act, y_train_pred))
print('RMSE(train): ', np.round(rmse_train, 2))

In [None]:
# Score the test frame and pull actuals / predictions out as arrays.
pred_test = glm_model.predict(test_hex)

test_frame_pd = test_hex.as_data_frame()
y_test_act = test_frame_pd[target].values  # actual values

pred_test_pd = pred_test.as_data_frame()
y_test_pred = pred_test_pd['predict'].values  # predictions

In [None]:
# Predicted vs. actual values on the test data; the green diagonal from
# (0, 0) to (50, 50) marks where prediction equals actual.
fig, ax = plt.subplots(figsize=(5,5))
ax.scatter(x=y_test_act, y=y_test_pred)
ax.plot([0,50], [0,50], color='green')
ax.set_aspect(1)
ax.grid()
ax.set_title('Prediction vs Actual - Test Data')
ax.set_xlabel('Actual')
ax.set_ylabel('Prediction')
plt.show()

In [None]:
# Pearson and Spearman correlation between actuals and predictions (test data).
print('Correlations - Test Set')
pearson_test = stats.pearsonr(y_test_act, y_test_pred)
print('Correlation Pearson:', pearson_test)
spearman_test = stats.spearmanr(y_test_act, y_test_pred)
print('Correlation Spearman:', spearman_test)

In [None]:
# MAE and RMSE on the test data, each rounded to two decimals.
mae_test = mean_absolute_error(y_test_act, y_test_pred)
print('MAE (test): ', np.round(mae_test, 2))
rmse_test = np.sqrt(mean_squared_error(y_test_act, y_test_pred))
print('RMSE(test): ', np.round(rmse_test, 2))