In [63]:
import pandas as pd
import plotly.express as px
import numpy as np
import cut_the_tails as ct
from sklearn.model_selection import LeaveOneOut,KFold,train_test_split
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [64]:
df = pd.read_csv('data_sets\\bodyfat.csv')
target = 'BodyFat'

In [65]:
features = list(df.columns) 
features = features[2:]
features

['Age',
 'Weight',
 'Height',
 'Neck',
 'Chest',
 'Abdomen',
 'Hip',
 'Thigh',
 'Knee',
 'Ankle',
 'Biceps',
 'Forearm',
 'Wrist']

In [66]:
fig = px.histogram(df, x='BodyFat', nbins=40)
fig.show()

## ML pipeline

### Data preparation

In [67]:
X = df[features].to_numpy()
y = df[target].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Model selection

In [68]:
normal_model = RandomForestRegressor(max_depth=4)
lower_tail_model = RandomForestRegressor(max_depth=4)
upper_tail_model = RandomForestRegressor(max_depth=4)

tail_classifier = RandomForestClassifier()

### Cut the tail models

In [69]:
ct.fit_cut_the_tail(X_train,y_train,[0.05,0.95],tail_classifier,lower_tail_model,
    normal_model,upper_tail_model)

In [70]:
y_pred = ct.predict_cut_the_tails(X_test,tail_classifier,lower_tail_model,normal_model,upper_tail_model)

## Baseline

In [71]:
model = RandomForestRegressor(max_depth=4)
model.fit(X_train,y_train)
y_pred_plain = model.predict(X_test)

## Results

In [72]:
y_pred-y_pred_plain

array([ 1.12444948,  0.29830884, -0.34413851,  3.53982992,  2.10795632,
        4.31595997,  0.42665975, -0.84571911, -0.4523822 ,  0.95559482,
        0.89336738,  1.22584322, -2.01861971,  0.29647022, -0.01707523,
       -0.0709681 , -1.52594923,  0.19551308, -0.13739938,  1.71734506,
       -0.02166884, -2.40060817, -1.72720188, -0.30867142,  0.18568351,
       -1.57588956,  0.61241749,  2.21996324,  1.05711062,  0.11604188,
        3.67511851,  0.0406553 , -0.2751242 ,  0.10143432,  0.51440528,
        3.16223632,  0.35563621,  1.72976471,  2.34830011, -0.05234756,
       -1.56911971, -1.63918817,  0.34523719,  2.37606072, -0.63986385,
        0.86188291, -0.9923916 , -4.02298849,  0.47669866, -0.47433852,
       -0.55267128])

In [73]:
print(f'{mean_absolute_error(y_pred,y_pred_plain)}')
print(f'{mean_absolute_percentage_error(y_pred,y_pred_plain)}')

1.1556915637999357
0.07521608810086058


In [74]:
print(f'Cut: {mean_absolute_error(y_pred,y_test)}')
print(f'Plain: {mean_absolute_error(y_pred_plain,y_test)}')
print(f'Cut: {mean_absolute_percentage_error(y_pred,y_test)}')
print(f'Plain: {mean_absolute_percentage_error(y_pred_plain,y_test)}')

Cut: 3.886284278645255
Plain: 3.88410589813369
Cut: 0.22121892429766274
Plain: 0.24289137977701986
