In [39]:
import pandas as pd
import plotly.express as px
import numpy as np
import cut_the_tails as ct
from sklearn.model_selection import LeaveOneOut,KFold,train_test_split
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [40]:
df = pd.read_csv('data_sets\\bodyfat.csv')
target = 'BodyFat'

In [41]:
features = list(df.columns) 
features = features[2:]
features

['Age',
 'Weight',
 'Height',
 'Neck',
 'Chest',
 'Abdomen',
 'Hip',
 'Thigh',
 'Knee',
 'Ankle',
 'Biceps',
 'Forearm',
 'Wrist']

In [42]:
fig = px.histogram(df, x='BodyFat', nbins=40)
fig.show()

## ML pipeline

### Data preparation

In [43]:
X = df[features].to_numpy()
y = df[target].to_numpy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

### Model selection

In [44]:
normal_model = RandomForestRegressor()
lower_tail_model = RandomForestRegressor()
upper_tail_model = RandomForestRegressor()

tail_classifier = RandomForestClassifier()

### Cut the tail models

In [57]:
ct.fit_cut_the_tail(X_train,y_train,[0.00,1],tail_classifier,lower_tail_model,
    normal_model,upper_tail_model)

In [58]:
y_pred = ct.predict_cut_the_tails(X_test,tail_classifier,lower_tail_model,normal_model,upper_tail_model)

## Baseline

In [59]:
model = RandomForestRegressor()
model.fit(X_train,y_train)
y_pred_plain = model.predict(X_test)

## Results

In [60]:
y_pred-y_pred_plain

array([-0.25 , -0.986, -2.219,  0.594, -0.846,  1.098, -0.764,  0.018,
       -0.444,  0.696,  1.457, -1.248, -0.588, -1.623, -0.85 , -0.078,
        0.537,  0.116, -1.759, -0.562, -0.705,  0.189,  0.597,  0.351,
        0.132,  1.139,  0.462,  0.045, -0.255, -0.207,  0.826, -0.208,
       -0.014,  2.103, -0.004, -0.403, -0.45 , -1.011, -0.015, -0.664,
        0.515, -0.256, -0.4  ,  0.776,  0.439,  0.443, -0.507,  0.481,
        0.635, -1.896,  0.007])

In [61]:
print(f'{mean_absolute_error(y_pred,y_pred_plain)}')
print(f'{mean_absolute_percentage_error(y_pred,y_pred_plain)}')

0.6444705882352936
0.037708824795681634


In [62]:
print(f'Cut: {mean_absolute_error(y_pred,y_test)}')
print(f'Plain: {mean_absolute_error(y_pred_plain,y_test)}')
print(f'Cut: {mean_absolute_percentage_error(y_pred,y_test)}')
print(f'Plain: {mean_absolute_percentage_error(y_pred_plain,y_test)}')

Cut: 3.7229019607843132
Plain: 3.985137254901961
Cut: 0.22915511630866037
Plain: 0.25055208536815615
