In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file found under the Kaggle input
# directory, then print one path per line (same order as os.walk yields them).
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Import libraries

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
import xgboost as XGB

## Read the data

In [None]:
# Load the body-fat CSV from the Kaggle input directory into a DataFrame.
csv_path = '../input/body-fat-prediction-dataset/bodyfat.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

## Check for null values

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Work on a copy so that `df` keeps every original column; the target
# column `BodyFat` is read back from `df` later on.
data = df.copy()

# EDA

Drop the columns as specified in the question

In [None]:
# Build the feature frame from the untouched `df` instead of from `data`
# itself: the cell is then idempotent, whereas `data = data.drop(...)` raises
# KeyError on a second run because the columns are already gone.
# `columns=` alone is sufficient; combining it with `axis=1` was redundant.
data = df.drop(columns=['BodyFat', 'Density'])

# Names of the remaining columns, used as the model features.
features = list(data.columns)

In [None]:
# Preview the feature frame after dropping `BodyFat` and `Density`.
data.head()

In [None]:
# Pairwise scatter plots of all feature columns.
sns.pairplot(data)

In [None]:
# Spearman rank-correlation matrix restricted to the feature columns.
feature_frame = df.loc[:, features]
correlation = feature_frame.corr(method='spearman')

In [None]:
# Annotated heatmap of the correlation matrix, colour scale fixed to [-1, 1].
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(correlation, annot=True, vmin=-1, vmax=1, ax=ax)
plt.show()

No feature is correlated with another feature; hence, all the features can be used in our analysis.

In [None]:
# Feature matrix (all remaining columns) and regression target (body-fat column).
X, y = data, df['BodyFat']

# Split data

In [None]:

# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Linear Regression

In [None]:
# Accumulates one dict per model ({"name", "score", "rmse"}) for the final comparison.
# Each model cell appends to this list, so re-run this cell before re-running
# the model cells to avoid duplicate entries.
logs = []

In [None]:
# Baseline model: ordinary least-squares regression on all features.
regr = LinearRegression()
regr.fit(X_train, y_train)

# 10-fold cross-validated score on the training split. It is computed here so
# the value logged for this model is the same metric the Lasso, Ridge and
# XGBoost cells log (mean CV score) rather than the hold-out score.
cv_score = cross_val_score(regr, X_train, y_train, cv=10)
print(f"CV Score --> {np.mean(cv_score)}")

# Hold-out evaluation on the test split.
y_pred = regr.predict(X_test)
score = regr.score(X_test, y_test)
print(f"Score --> {score}")

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE --> {rmse}")

log = {"name": "linear_regression", "score": np.mean(cv_score), "rmse": rmse}
logs.append(log)

# Lasso

In [None]:
# L1-regularised linear regression.
lasso_regr = Lasso(alpha=0.5)
# Fit on the full training split; cross_val_score below works on clones, so
# this fit is what the later predict/score calls rely on.
lasso_regr.fit(X_train, y_train)

# 10-fold cross-validated score on the training split.
cv_score = cross_val_score(lasso_regr, X_train, y_train, cv=10)
print(f"CV Score --> {np.mean(cv_score)}")

# Hold-out evaluation on the test split.
y_pred = lasso_regr.predict(X_test)
print(f"Score --> {lasso_regr.score(X_test, y_test)}")

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE --> {rmse}")

log = {"name": "lasso", "score": np.mean(cv_score), "rmse": rmse}
logs.append(log)

# Ridge

In [None]:
# L2-regularised linear regression.
ridge_regr = Ridge(alpha=0.5)
# Fit on the full training split; cross_val_score below works on clones, so
# this fit is what the later predict/score calls rely on.
ridge_regr.fit(X_train, y_train)

# 10-fold cross-validated score on the training split.
cv_score = cross_val_score(ridge_regr, X_train, y_train, cv=10)
print(f"CV Score --> {np.mean(cv_score)}")

# Hold-out evaluation on the test split.
y_pred = ridge_regr.predict(X_test)
print(f"Score --> {ridge_regr.score(X_test, y_test)}")

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE --> {rmse}")

log = {"name": "ridge", "score": np.mean(cv_score), "rmse": rmse}
logs.append(log)

# XGBoost

In [None]:
# Gradient-boosted trees: many estimators with a small learning rate.
xgb_regr = XGB.XGBRegressor(learning_rate=0.01, n_estimators=1000)
# Fit on the full training split; cross_val_score below works on clones, so
# this fit is what the later predict/score calls rely on.
xgb_regr.fit(X_train, y_train)

# 10-fold cross-validated score on the training split.
cv_score = cross_val_score(xgb_regr, X_train, y_train, cv=10)
print(f"CV Score --> {np.mean(cv_score)}")

# Predict with the XGBoost model itself. The previous version called
# `regr.predict`, i.e. the linear-regression model, so the RMSE reported and
# logged for XGBoost was really the linear model's RMSE.
y_pred = xgb_regr.predict(X_test)
print(f"Score --> {xgb_regr.score(X_test, y_test)}")

# RMSE as sqrt(MSE): the `squared=False` keyword of mean_squared_error was
# deprecated in scikit-learn 1.4 and removed in 1.6, so it is avoided here.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"RMSE --> {rmse}")

log = {"name": "XGBoost", "score": np.mean(cv_score), "rmse": rmse}
logs.append(log)

In [None]:
# Display the collected per-model results (name, score, rmse).
logs

# Models comparison

In [None]:
# Pull the plotted series out of the per-model result dicts.
# Dedicated names are used instead of `x`/`y`/`z` so the regression target `y`
# defined earlier is not overwritten by a plain list.
model_names = [entry["name"] for entry in logs]
scores = [entry["score"] for entry in logs]
rmses = [entry["rmse"] for entry in logs]

# Two side-by-side bar charts: logged score and RMSE per model.
fig, (ax_score, ax_rmse) = plt.subplots(1, 2, figsize=(15, 5))

# seaborn >= 0.12 only accepts `x` and `y` as keyword arguments; the previous
# positional form `sns.barplot(x, y)` raises TypeError there.
sns.barplot(x=model_names, y=scores, ax=ax_score)
ax_score.set_title("Models and their accuracy")
ax_score.set_ylabel("score")

sns.barplot(x=model_names, y=rmses, ax=ax_rmse)
ax_rmse.set_title("Models and their rmse")
ax_rmse.set_ylabel("rmse")

plt.show()

**Lasso regression gives the highest score, with an R² of about 0.695 (69.5%).**

**This is a work in progress. In the subsequent versions, I will try to increase the accuracy.**

### If you found this notebook useful, consider upvoting. Thank you