In [None]:
# Data-handling and plotting libraries used throughout the notebook.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): this silences *every* warning, including deprecation notices from
# the plotting/modelling libraries that may signal upcoming API breakage —
# consider narrowing the filter.
warnings.filterwarnings('ignore')
# Use seaborn's "darkgrid" style for the figures that follow.
sns.set_style('darkgrid')

# Visualizing the data

In [None]:
# Read the body-fat CSV into a DataFrame and preview its first rows.
csv_path = "../input/body-fat-prediction-dataset/bodyfat.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()

In [None]:
# Descriptive summary statistics of the columns.
df.describe()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, annot=True, cbar=False, ax=ax)
plt.show()

In [None]:
# One histogram per column to inspect the marginal distributions.
df.hist(figsize=(15,15))
plt.show()

In [None]:
# Per-column box plots with the individual observations overlaid as a swarm.
box_style = dict(color="white", linewidth=3)
swarm_style = dict(s=5, alpha=0.65)

plt.figure(figsize=(15, 15))
sns.boxplot(data=df, **box_style)
sns.swarmplot(data=df, **swarm_style)
plt.show()

Abdomen seems to be a strong predictor.

In [None]:
# Scatter of the BodyFat column against the Abdomen column.
# Axis labels and a title are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.scatter(df.BodyFat, df.Abdomen)
ax.set_xlabel("BodyFat")
ax.set_ylabel("Abdomen")
ax.set_title("BodyFat vs. Abdomen")
plt.show()

In [None]:
# Rank the predictors by the absolute value of their correlation with BodyFat.
# After sorting, the first entry is BodyFat itself (|r| = 1), hence `.iloc[1:]`.
lin_imp = abs(df.corr().BodyFat).sort_values(ascending=False).iloc[1:]

plt.figure(figsize=(10,7))
# x and y are passed by keyword: seaborn 0.11 deprecated positional data
# arguments and seaborn 0.12+ raises a TypeError for them.
sns.barplot(x=lin_imp.index, y=lin_imp.values, palette="Blues_r")
plt.title("Feature importances",fontsize=15)
plt.show()

# Preprocessing

In [None]:
# Adding BMI as a variable: BMI = weight [kg] / (height [m])^2

LB_PER_KG = 2.205    # pounds per kilogram
INCH_PER_M = 39.37   # inches per metre

# The divisors convert pounds -> kilograms and inches -> metres.
# NOTE(review): this presumes Weight is recorded in pounds and Height in inches —
# confirm against the dataset description.
weight_kg = df.Weight / LB_PER_KG
height_m = df.Height / INCH_PER_M
df["BMI"] = weight_kg / (height_m ** 2)
df.BMI.describe()

In [None]:
# Box plot of BMI to expose extreme values.
# The series is passed as `x=` explicitly: given positionally it is treated as
# `x` by seaborn 0.11 (with a deprecation warning) but as `data` by 0.12+, so
# the keyword keeps the plot's orientation the same across versions.
sns.boxplot(x=df.BMI)
plt.title("Outlier?")
plt.show()

In [None]:
# Keep only rows whose BMI is below 100; print the shape before and after.
print(df.shape)
plausible_bmi = df["BMI"] < 100
df = df[plausible_bmi]
print(df.shape)

There seems to be at least one outlier, with a BMI above 160.

# Regression
## OLS

In [None]:
from sklearn.model_selection import train_test_split

# Target: BodyFat. Features: every other column except Density, which is an
# almost perfect predictor of the target.
# The two columns are dropped by name rather than by position, so a reordered
# frame raises a KeyError instead of silently leaking the target into `x`.
y = df.BodyFat
x = df.drop(columns=["Density", "BodyFat"])

# 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(x,y,test_size=0.2,random_state=42)
print("Train set", X_train.shape)
print("Test set", X_test.shape)

In [None]:
# Linear Regression

from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (`fit` returns the estimator).
lin_reg = LinearRegression().fit(X_train, y_train)

# Spot check: pick five training rows with a fixed seed and predict them.
sample = X_train.sample(5, random_state=20).index
some_data, some_dep = X_train.loc[sample], y_train.loc[sample]

pred = lin_reg.predict(some_data)

In [None]:
# Actual BodyFat values of the sampled rows, followed by the OLS predictions for them.
print(some_dep.to_list(),"\n",pred)

In [None]:
# Report the fitted OLS intercept and one coefficient per feature column.
ols_intercept = lin_reg.intercept_
print("Intercept:", ols_intercept)
print("\nCoefficients:\n--------------")
pd.Series(data=lin_reg.coef_, index=X_train.columns)

As long as we rule out endogeneity and accept the standard OLS assumptions, the coefficients are BLUE (best linear unbiased estimator). However, high levels of multicollinearity change the magnitude of the coefficients, making them hard to interpret. The fact that some measures have a negative sign is counter-intuitive; in fact, it is likely that some coefficients rebalance the strong positive effect of Abdomen.

In [None]:
# Check VIF
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Add a constant column to the training features, then compute one variance
# inflation factor per column of the resulting matrix.
vif_df = add_constant(X_train)
design = vif_df.values
vif = pd.Series(
    [variance_inflation_factor(design, col_idx) for col_idx in range(design.shape[1])],
    index=vif_df.columns,
)
vif

As suspected, there are high levels of multicollinearity, especially for Weight, Height, and BMI. Some variables should perhaps be dropped, for example by constraining the model with an L1 norm, i.e. using a Lasso regression.

In [None]:
from sklearn.metrics import mean_squared_error

# In-sample (training) error of the OLS fit.
bodyfat_predictions = lin_reg.predict(X_train)
lin_rmse = np.sqrt(mean_squared_error(y_train,bodyfat_predictions))
print("Linear Root Mean Squared Error:", lin_rmse)

# Out-of-sample error on the held-out split, which is otherwise never used.
# Training error alone cannot show how well the model generalises.
lin_rmse_test = np.sqrt(mean_squared_error(y_test, lin_reg.predict(X_test)))
print("Linear Root Mean Squared Error (test set):", lin_rmse_test)

## Lasso

In [None]:
from sklearn.linear_model import Lasso

# L1-regularised linear regression, constructed with its default settings.
lasso_reg = Lasso()
lasso_reg.fit(X_train,y_train)

# In-sample (training) error; the label previously read "Lasse".
bodyfat_predictions_l = lasso_reg.predict(X_train)
lasso_rmse = np.sqrt(mean_squared_error(y_train,bodyfat_predictions_l))
print("Lasso Root Mean Squared Error:", lasso_rmse)

# Out-of-sample error on the held-out split.
lasso_rmse_test = np.sqrt(mean_squared_error(y_test, lasso_reg.predict(X_test)))
print("Lasso Root Mean Squared Error (test set):", lasso_rmse_test)

Lasso actually performs slightly worse than OLS. However, it greatly reduces the number of predictors.

In [None]:
# Show the Lasso intercept and only the coefficients that are not exactly zero.
print("Intercept:", lasso_reg.intercept_)
print("\nCoefficients:\n--------------")
c = pd.Series(data=lasso_reg.coef_, index=X_train.columns)
c[c != 0]

## Ridge

In [None]:
from sklearn.linear_model import Ridge

# L2-regularised linear regression with penalty strength alpha=1.
ridge_reg = Ridge(alpha=1)
ridge_reg.fit(X_train,y_train)

# In-sample (training) error.
bodyfat_predictions_r = ridge_reg.predict(X_train)
ridge_rmse = np.sqrt(mean_squared_error(y_train,bodyfat_predictions_r))
print("Ridge Root Mean Squared Error:", ridge_rmse)

# Out-of-sample error on the held-out split.
ridge_rmse_test = np.sqrt(mean_squared_error(y_test, ridge_reg.predict(X_test)))
print("Ridge Root Mean Squared Error (test set):", ridge_rmse_test)

In [None]:
# Show the Ridge intercept and every coefficient that is not exactly zero.
print("Intercept:", ridge_reg.intercept_)
print("\nCoefficients:\n--------------")
c = pd.Series(data=ridge_reg.coef_, index=X_train.columns)
c[c != 0]

Ridge is basically identical to the OLS regression.

## Introducing Interaction Terms of degree 2

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features with all degree-2 terms, without a bias column.
poly_features = PolynomialFeatures(degree=2,include_bias=False)
X_train_poly = poly_features.fit_transform(X_train)

# Lasso (default settings) on the expanded feature matrix.
lasso_reg_p = Lasso()
lasso_reg_p = lasso_reg_p.fit(X_train_poly,y_train)

# The input width is read from X_train itself: `n_input_features_` was
# deprecated in scikit-learn 1.0 and removed in 1.2, where accessing it raises
# an AttributeError.
print(f"Input features: {X_train.shape[1]},\nOutput features (after fitting polynomial terms): {poly_features.n_output_features_}")

In [None]:
# Show the intercept of the polynomial-feature Lasso and its non-zero
# coefficients (indexed by column position in the expanded matrix).
print("Intercept:", lasso_reg_p.intercept_)
print("\nCoefficients:\n--------------")
c = pd.Series(data=lasso_reg_p.coef_)
c[c != 0]

The Lasso estimator reduced the number of (non-zero) predictors from 119 to 66, eliminating 53.

In [None]:
# In-sample (training) error of the Lasso fitted on the degree-2 features.
bodyfat_predictions_l_poly = lasso_reg_p.predict(X_train_poly)
lasso_p_rmse = np.sqrt(mean_squared_error(y_train,bodyfat_predictions_l_poly))
print(f"Lasso Root Mean Squared Error with degree=2 Polynomial features: {lasso_p_rmse}")

# Out-of-sample error: the test features go through the already-fitted
# expansion (`transform`, not `fit_transform`) before prediction.
X_test_poly = poly_features.transform(X_test)
lasso_p_rmse_test = np.sqrt(mean_squared_error(y_test, lasso_reg_p.predict(X_test_poly)))
print(f"Lasso Root Mean Squared Error with degree=2 Polynomial features (test set): {lasso_p_rmse_test}")

We can see that the training RMSE slightly improved. Adding new features might improve the score even further.

## Plotting Predictions

In [None]:
# Plotting Predictions
# Training-set predictions of each model (x axis) against the actual values
# (y axis); the red line marks where prediction and actual value coincide.
from sklearn.metrics import r2_score
plt.figure(figsize=(15,15))
plt.plot(y_train,y_train,label="True",c="r")
plt.scatter(bodyfat_predictions,y_train,label="OLS",c="g",marker="x")
plt.scatter(bodyfat_predictions_l,y_train,label="Lasso",c="y",marker="x")
plt.scatter(bodyfat_predictions_l_poly,y_train,label="Lasso Polynomial Features",c="b",marker="x")
plt.xlabel("Predicted BodyFat")
plt.ylabel("Actual BodyFat")
plt.legend()
plt.title(f"Overall Fits: \nOLS - {r2_score(y_train,bodyfat_predictions)}, \nLasso - {r2_score(y_train,bodyfat_predictions_l)},\nLasso with Pol. Features {r2_score(y_train,bodyfat_predictions_l_poly)}",fontsize=15)
# `plt.show()` renders the figure; the previous bare `plt.plot()` drew nothing
# and only echoed an empty list as the cell output.
plt.show()

# N.B. Decision Tree Regressor

In [None]:
# DecisionTreeRegressor, or an example of a bad choice of model

from sklearn.tree import DecisionTreeRegressor

# Seeded so the fitted tree (and any later cross-validation of it) is reproducible.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train,y_train)

# Stored under its own name: reusing `bodyfat_predictions` would overwrite the
# OLS predictions, so re-running the "Plotting Predictions" cell afterwards
# would silently plot the tree's output under the "OLS" label.
bodyfat_predictions_tree = tree_reg.predict(X_train)
tree_rmse = np.sqrt(mean_squared_error(y_train,bodyfat_predictions_tree))
print("Tree Regressor Root Mean Squared Error:", tree_rmse)

No, the model is not perfect. The Tree Regressor is badly overfitting the data we gave it, and it will not score well on new data. The best way to see this is to use cross-validation:

In [None]:
from sklearn.model_selection import cross_val_score

# 4-fold cross-validation of the tree on the training split. The scorer
# returns negated MSE, so the sign is flipped before taking the square root.
neg_mse_per_fold = cross_val_score(
    tree_reg,
    X_train,
    y_train,
    scoring="neg_mean_squared_error",
    cv=4,
)
tree_rmse_scores = np.sqrt(-neg_mse_per_fold)
print(f"Mean: {tree_rmse_scores.mean()}")
print(f"Standard Dev: {tree_rmse_scores.std()}")
print(tree_rmse_scores)

The fact that the cross-validated scores are much worse than the OLS and Lasso scores confirms that this model is badly overfitting; given the relatively small number of observations, it is not a good choice of model.