# Train the regression model for body fat

In [1]:
	
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_selection import mutual_info_regression

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import klib

from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor



import joblib

In [2]:
df = pd.read_csv(r"/Users/user/CV/BodyVisionAI/backend/notebook/data/bodyfat.csv")
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252 entries, 0 to 251
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Density  252 non-null    float64
 1   BodyFat  252 non-null    float64
 2   Age      252 non-null    int64  
 3   Weight   252 non-null    float64
 4   Height   252 non-null    float64
 5   Neck     252 non-null    float64
 6   Chest    252 non-null    float64
 7   Abdomen  252 non-null    float64
 8   Hip      252 non-null    float64
 9   Thigh    252 non-null    float64
 10  Knee     252 non-null    float64
 11  Ankle    252 non-null    float64
 12  Biceps   252 non-null    float64
 13  Forearm  252 non-null    float64
 14  Wrist    252 non-null    float64
dtypes: float64(14), int64(1)
memory usage: 29.7 KB


Unnamed: 0,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,Biceps,Forearm,Wrist
0,1.0708,12.3,23,154.25,67.75,36.2,93.1,85.2,94.5,59.0,37.3,21.9,32.0,27.4,17.1
1,1.0853,6.1,22,173.25,72.25,38.5,93.6,83.0,98.7,58.7,37.3,23.4,30.5,28.9,18.2
2,1.0414,25.3,22,154.0,66.25,34.0,95.8,87.9,99.2,59.6,38.9,24.0,28.8,25.2,16.6
3,1.0751,10.4,26,184.75,72.25,37.4,101.8,86.4,101.2,60.1,37.3,22.8,32.4,29.4,18.2
4,1.034,28.7,24,184.25,71.25,34.4,97.3,100.0,101.9,63.2,42.2,24.0,32.2,27.7,17.7


    Weight to kg; Height to m

In [3]:
# change the weight column to kg get the last two decimal places

df['Weight'] = df['Weight'].apply(lambda x: round(x * 0.453592, 2))

# change the height column to m get the last two decimal places

df['Height'] = df['Height'].apply(lambda x: round(x * 0.0254, 2))


In [4]:
df["BMI"] = df["Weight"] / (df["Height"] ** 2)
df['Abdomen_to_Hip'] = df['Abdomen'] / df['Hip']
df['Chest_to_Abdomen'] = df['Chest'] / df['Abdomen']
df['Abdomen_to_Height'] = df['Abdomen'] / df['Height']


In [5]:
df = df.drop(columns=['Biceps','Wrist','Forearm'], axis=1)

In [6]:
df.describe()

Unnamed: 0,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,BMI,Abdomen_to_Hip,Chest_to_Abdomen,Abdomen_to_Height
count,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0,252.0
mean,1.055574,19.150794,44.884921,81.158452,1.782302,37.992063,100.824206,92.555952,99.904762,59.405952,38.590476,23.102381,25.9262,0.924538,1.094165,52.159735
std,0.019031,8.36874,12.60204,13.330783,0.09329,2.430913,8.430476,10.783077,7.164058,5.249952,2.411805,1.694893,9.538235,0.059047,0.056191,8.134091
min,0.995,0.0,22.0,53.75,0.75,31.1,79.3,69.4,85.0,47.2,33.0,19.1,17.95917,0.787879,0.919649,39.565217
25%,1.0414,12.475,35.75,72.1175,1.73,36.4,94.35,84.575,95.5,56.0,36.975,22.0,23.051661,0.886043,1.058232,47.52405
50%,1.0549,19.2,43.0,80.06,1.78,38.0,99.65,90.95,99.3,59.0,38.5,22.8,25.079836,0.920218,1.090443,51.333948
75%,1.0704,25.3,54.0,89.36,1.84,39.425,105.375,99.325,103.525,62.35,39.925,24.0,27.297665,0.96315,1.128917,55.240726
max,1.1089,47.5,81.0,164.72,1.97,51.2,136.2,148.1,147.7,87.3,49.1,33.9,165.315556,1.096362,1.265223,139.066667


In [7]:
df.head()

Unnamed: 0,Density,BodyFat,Age,Weight,Height,Neck,Chest,Abdomen,Hip,Thigh,Knee,Ankle,BMI,Abdomen_to_Hip,Chest_to_Abdomen,Abdomen_to_Height
0,1.0708,12.3,23,69.97,1.72,36.2,93.1,85.2,94.5,59.0,37.3,21.9,23.651298,0.901587,1.092723,49.534884
1,1.0853,6.1,22,78.58,1.84,38.5,93.6,83.0,98.7,58.7,37.3,23.4,23.210066,0.840932,1.127711,45.108696
2,1.0414,25.3,22,69.85,1.68,34.0,95.8,87.9,99.2,59.6,38.9,24.0,24.748441,0.886089,1.089875,52.321429
3,1.0751,10.4,26,83.8,1.84,37.4,101.8,86.4,101.2,60.1,37.3,22.8,24.75189,0.853755,1.178241,46.956522
4,1.034,28.7,24,83.57,1.81,34.4,97.3,100.0,101.9,63.2,42.2,24.0,25.508989,0.981354,0.973,55.248619


In [8]:
from sklearn.feature_selection import mutual_info_regression
mi_scores = mutual_info_regression(df.drop('BodyFat', axis=1), df['BodyFat'])

a = df.drop('BodyFat', axis=1)
mi_scores = pd.DataFrame(mi_scores, index=a.columns, columns=['Mutual Information'])
mi_scores = mi_scores.sort_values(by='Mutual Information', ascending=False)
mi_scores

Unnamed: 0,Mutual Information
Density,3.934954
Abdomen,0.58785
Abdomen_to_Height,0.553996
Abdomen_to_Hip,0.48362
BMI,0.405211
Chest,0.349698
Chest_to_Abdomen,0.330274
Hip,0.316563
Weight,0.285096
Thigh,0.282179


In [9]:
x = df.drop(columns=['BodyFat'], axis=1)
y = df['BodyFat']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [18]:
df.columns


Index(['Density', 'BodyFat', 'Age', 'Weight', 'Height', 'Neck', 'Chest',
       'Abdomen', 'Hip', 'Thigh', 'Knee', 'Ankle', 'BMI', 'Abdomen_to_Hip',
       'Chest_to_Abdomen', 'Abdomen_to_Height'],
      dtype='object')

# Models

In [10]:
# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest Regressor - MSE: {mse_rf}, R2: {r2_rf}")

Random Forest Regressor - MSE: 0.07311978431372662, R2: 0.9984281423464897


In [11]:
# Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(x_train, y_train)
y_pred_ridge = ridge.predict(x_test)
mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print(f"Ridge Regression - MSE: {mse_ridge}, R2:    {r2_ridge}")

Ridge Regression - MSE: 18.31599569060667, R2:    0.6062606272959461


In [12]:
# lgbm Regressor
lgbm = LGBMRegressor(n_estimators=100, random_state=42)
lgbm.fit(x_train, y_train)
y_pred_lgbm = lgbm.predict(x_test)
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
r2_lgbm = r2_score(y_test, y_pred_lgbm)
print(f"LGBM Regressor - MSE: {mse_lgbm}, R2: {r2_lgbm}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 847
[LightGBM] [Info] Number of data points in the train set: 201, number of used features: 15
[LightGBM] [Info] Start training from score 19.435821
LGBM Regressor - MSE: 0.6840249928975496, R2: 0.985295499291065


In [13]:
# xgb Regressor
xgb = XGBRegressor(n_estimators=100, random_state=42)
xgb.fit(x_train, y_train)
y_pred_xgb = xgb.predict(x_test)
mse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)
print(f"XGB Regressor - MSE: {mse_xgb}, R2: {r2_xgb}")

XGB Regressor - MSE: 0.2433749178151132, R2: 0.9947681638994069


In [14]:
ln = LinearRegression()
ln.fit(x_train, y_train)
y_pred_ln = ln.predict(x_test)
mse_ln = mean_squared_error(y_test, y_pred_ln)
r2_ln = r2_score(y_test, y_pred_ln)
print(f"Linear Regression - MSE: {mse_ln}, R2: {r2_ln}")

Linear Regression - MSE: 0.34877209851668745, R2: 0.9925024383273324


In [15]:
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [3, 4, 5],
#     'learning_rate': [0.01, 0.05, 0.1],
#     'subsample': [0.7, 0.8, 1.0],
#     'colsample_bytree': [0.7, 0.8, 1.0],
#     'reg_alpha': [0, 0.1, 0.5],
#     'reg_lambda': [1, 1.5, 2]
# }

# grid_search = GridSearchCV(
#     estimator=xgb,
#     param_grid=param_grid,
#     cv=5,
#     scoring='r2',
#     verbose=1,
#     n_jobs=-1
# )

# grid_search.fit(x_train, y_train)

# print("Best parameters:", grid_search.best_params_)
# print("Best CV R²:", grid_search.best_score_)

In [16]:
# best_xgb = grid_search.best_estimator_
# y_pred = best_xgb.predict(x_test)

# mse = mean_squared_error(y_test, y_pred)
# r2 = r2_score(y_test, y_pred)

# print(f"Tuned XGBoost - MSE: {mse}, R²: {r2}")


In [17]:

joblib.dump(lgbm, r'/Users/user/CV/BodyVisionAI/backend/models/prediction_model.pkl')
print("Model saved successfully!")

Model saved successfully!
