In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [None]:
# Load the raw gym-members tracking CSV and take a first look at it
data_raw = pd.read_csv(filepath_or_buffer="gym_members_exercise_tracking.csv")
data_raw.info()  # per-column dtype and non-null counts
data_raw.head()  # last expression: rendered as the cell output

In [None]:
### Preprocess Data
# Water intake above this many liters is flagged as 1 in `water_intake_binary`.
WATER_INTAKE_THRESHOLD_LTS = 2

# Lower-case, unit-suffixed names for the RAW columns, in the raw file's order.
# NOTE(review): this is a positional rename, so it assumes the CSV keeps this
# exact 15-column layout — confirm against data_raw.info() above.
RAW_COLUMN_NAMES = [
    'age', 'gender', 'weight_kg', 'height_m',
    'max_bpm', 'avg_bpm', 'resting_bpm',
    'session_duration_hrs', 'calories_burned', 'workout_type',
    'fat_percentage', 'water_intake_lts',
    'workout_frequency_days_per_week', 'experience_level', 'bmi',
]

# dtypes: the two ordinal codes are treated as categorical labels, not numbers.
# astype returns a new frame, so data_raw is left untouched.
data = data_raw.astype({
    'Experience_Level': str,
    'Workout_Frequency (days/week)': str,
})

# Make water intake binary (computed before renaming, from the raw column)
water_intake_binary = (data['Water_Intake (liters)'] > WATER_INTAKE_THRESHOLD_LTS).astype(int)

# Rename while the frame still has the raw layout. Doing this after adding the
# derived column and dropping the liters column shifts every name that follows
# 'fat_percentage' onto the wrong column.
if len(data.columns) != len(RAW_COLUMN_NAMES):
    raise ValueError(
        f"Expected {len(RAW_COLUMN_NAMES)} raw columns, found {len(data.columns)}"
    )
data.columns = RAW_COLUMN_NAMES

# Swap the continuous liters column for its binary flag
data = (
    data
    .drop(columns=['water_intake_lts'])
    .assign(water_intake_binary=water_intake_binary)
)

# print
data.info()

In [None]:
# Preview the first five rows of the preprocessed frame
data.head(n=5)

In [None]:
# Continuous columns used by the plots and the skewness check below
numeric_variables = [
    'age',
    'weight_kg',
    'height_m',
    'max_bpm',
    'avg_bpm',
    'resting_bpm',
    'session_duration_hrs',
    'calories_burned',
    'fat_percentage',
]

In [None]:
# Scatter matrix of the numeric columns: KDE on the diagonal, lower triangle only
numeric_frame = data[numeric_variables]
sns.pairplot(numeric_frame, corner=True, diag_kind='kde')
plt.show()

In [None]:
# Pairwise correlations between the numeric columns (DataFrame.corr defaults)
corr_mat = data[numeric_variables].corr()

# Colour scale pinned to [-1, 1] and centred on 0 so colours are comparable
heatmap_style = dict(
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
)

plt.figure(figsize=(8, 6))  # Adjust the figure size as needed
sns.heatmap(corr_mat, **heatmap_style)
plt.title('Correlation Plot')
plt.show()

In [None]:
# (column, panel title) for each histogram, listed in row-major panel order
hist_panels = [
    ('age', "Hist of Age"),
    ('weight_kg', "Hist of Weight (kg)"),
    ('height_m', "Hist of Height (m)"),
    ('fat_percentage', "Hist Fat Percentage"),
    ('session_duration_hrs', "Hist of Session duration hrs"),
    ('max_bpm', "Hist of Max BPM"),
    ('avg_bpm', "Hist of Avg. BPM"),
    ('resting_bpm', "Hist of Resting BPM"),
    ('calories_burned', "Hist of Calories Burned"),
]

fig, axs = plt.subplots(5, 2, figsize=(12, 14))
panel_axes = axs.ravel()  # flatten the 5x2 grid so it can be zipped with the panel list

# One density-normalised histogram per variable. bins=10 matches matplotlib's
# default bin count, so every panel uses the same binning.
for ax, (column, title) in zip(panel_axes, hist_panels):
    ax.hist(data[column], bins=10, density=True, alpha=0.6, color='b', edgecolor="black")
    ax.set_title(title)

# Hide grid cells that did not receive a plot (9 variables on a 10-cell grid)
for ax in panel_axes[len(hist_panels):]:
    ax.axis("off")

plt.tight_layout(pad=2.0)


In [None]:
# (column, panel title) for each boxplot, listed in row-major panel order
box_panels = [
    ('age', "Boxplot of Age"),
    ('weight_kg', "Boxplot of Weight (kg)"),
    ('height_m', "Boxplot of Height (m)"),
    ('fat_percentage', "Boxplot Fat Percentage"),
    ('session_duration_hrs', "Boxplot of Session duration hrs"),
    ('max_bpm', "Boxplot of Max BPM"),
    ('avg_bpm', "Boxplot of Avg. BPM"),
    ('resting_bpm', "Boxplot of Resting BPM"),
    ('calories_burned', "Boxplot of Calories Burned"),
]

fig, axs = plt.subplots(5, 2, figsize=(12, 14))
panel_axes = axs.ravel()  # flatten the 5x2 grid so it can be zipped with the panel list

# One filled box per variable with a black median line. Boxes are vertical by
# default, so no orientation argument is passed.
for ax, (column, title) in zip(panel_axes, box_panels):
    ax.boxplot(data[column], patch_artist=True, medianprops=dict(color="black"))
    ax.set_title(title)

# Hide grid cells that did not receive a plot (9 variables on a 10-cell grid)
for ax in panel_axes[len(box_panels):]:
    ax.axis("off")

plt.tight_layout(pad=2.0)

Judging by eye alone, the boxplots of Age and Session duration look very symmetric.
Height and Fat % are both asymmetric.
The three BPM variables look somewhat asymmetric.
The boxplots of Weight and Calories Burned both show outliers.

In [None]:
# Report the sample skewness (scipy.stats.skew defaults) of every numeric column
for column in numeric_variables:
    print(f"Skewness of {column} : {stats.skew(data[column])}")

Combining the boxplots with the skewness values tells us the following about our variables:
Age, Max BPM, Avg. BPM, Resting BPM and Session duration are all symmetric in the data.
Weight, Height and Calories Burned are moderately skewed. All three have a long tail on the right side (the outliers that we detected in the boxplots).