In [109]:
import numpy as np
import pandas as pd
import warnings
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
import os

# Install a blanket "ignore" filter: every Python warning raised after this
# point is suppressed (presumably to keep cell outputs free of library noise).
# NOTE(review): this also hides warnings that may be useful — confirm it is intended.
warnings.filterwarnings("ignore")


### Reading the dataset

In [None]:
# Folder that receives every exported figure image.
IMAGE_OUTPUT_DIR = "../visuals/"
# CSV file holding the raw fitness dataset.
DATA_PATH = "../data/raw/healthifime_fitness_data.csv"

In [3]:
def save_plotly_fig(fig, filename):
    """Saves a Plotly figure to a static image file.

    Args:
        fig: Object exposing ``write_image(path)`` (e.g. a Plotly figure).
        filename: Target file name, joined onto ``IMAGE_OUTPUT_DIR``. May
            contain sub-directories; they are created on demand.

    Returns:
        None. Exporting is best-effort: a failure inside ``write_image`` is
        reported on stdout instead of being raised.
    """
    filepath = os.path.join(IMAGE_OUTPUT_DIR, filename)

    # Create the directory of the *final* path (not just IMAGE_OUTPUT_DIR) so
    # file names with sub-folders work; exist_ok avoids the check-then-create
    # race. `or '.'` covers a path with no directory component.
    os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

    try:
        fig.write_image(filepath)
    except Exception as e:
        # Static export depends on an optional engine; keep the notebook running.
        print("Could not save figure. Do you have 'kaleido' installed? (pip install kaleido)")
        print(f"Error: {e}")
    else:
        print(f"Saved figure to {filepath}")

In [4]:
# Load the raw CSV referenced by DATA_PATH into the working DataFrame.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

In [5]:
# Structural overview of the loaded frame: size, dtypes, null counts, preview.
separator = '-' * 50

print(f'Rows {df.shape[0]}  \nColumns {df.shape[1]}')
print(separator)
# df.info() writes its report straight to stdout and returns None, so it must
# be called on its own; interpolating it into an f-string prints the report
# first and then a stray "None" under the header.
print('Dataset Information')
df.info()
print(separator)
print(f'Null Values in the Dataset\n{df.isnull().sum()}')
print(separator)
print('Dataset Preview:\n')
df.head(5)

Rows 13393  
Columns 12
--------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13393 entries, 0 to 13392
Data columns (total 12 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   age                      13393 non-null  float64
 1   gender                   13393 non-null  object 
 2   height_cm                13393 non-null  float64
 3   weight_kg                13393 non-null  float64
 4   body fat_%               13393 non-null  float64
 5   diastolic                13393 non-null  float64
 6   systolic                 13393 non-null  float64
 7   gripForce                13393 non-null  float64
 8   sit and bend forward_cm  13393 non-null  float64
 9   sit-ups counts           13393 non-null  float64
 10  broad jump_cm            13393 non-null  float64
 11  class                    13393 non-null  object 
dtypes: float64(10), object(2)
memory usage: 1.2+ MB
Dataset

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B


In [6]:
# Summary statistics for every column, numeric and non-numeric alike.
df.describe(include="all")

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class
count,13393.0,13393,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393.0,13393
unique,,2,,,,,,,,,,4
top,,M,,,,,,,,,,C
freq,,8467,,,,,,,,,,3349
mean,36.775106,,168.559807,67.447316,23.240165,78.796842,130.234817,36.963877,15.209268,39.771224,190.129627,
std,13.625639,,8.426583,11.949666,7.256844,10.742033,14.713954,10.624864,8.456677,14.276698,39.868,
min,21.0,,125.0,26.3,3.0,0.0,0.0,0.0,-25.0,0.0,0.0,
25%,25.0,,162.4,58.2,18.0,71.0,120.0,27.5,10.9,30.0,162.0,
50%,32.0,,169.2,67.4,22.8,79.0,130.0,37.9,16.2,41.0,193.0,
75%,48.0,,174.8,75.3,28.0,86.0,141.0,45.2,20.7,50.0,221.0,


## Initial thoughts:
- diastolic and systolic values cannot be 0.
- gripForce, sit-ups counts and broad jump_cm can be 0, but only under very extreme conditions or when the data was not captured.
- sit and bend forward_cm has a very high max value. It could be an outlier and needs to be checked.

## Class variable analysis

In [42]:
# Number of rows per fitness class (value_counts orders by frequency).
class_dict = df['class'].value_counts().to_dict()
class_labels = list(class_dict.keys())

# Raw hex codes passed through `color=` are treated by Plotly Express as
# category *labels* and the bars get the default palette. Colour by the class
# label instead and hand the palette to `color_discrete_sequence`, so the
# intended colours are applied and any number of classes is supported.
fig = px.bar(x=class_labels, y=list(class_dict.values()),
             color=class_labels,
             color_discrete_sequence=['#B6E880', '#FF97FF', '#FECB52', '#636EFA'],
             labels={'x': 'Fitness Level Class', 'y': 'Count',
                     'color': 'Fitness Level Class'},
             title='Distribution of Fitness Level Classes',
             width=500,
             opacity=0.7
             )
fig.update_layout(template='plotly_dark', showlegend=False)
fig.show()
save_plotly_fig(fig, '01_class_distribution.png')

Saved figure to ../visuals/01_class_distribution.png


All classes are equally distributed

In [19]:
# Names of all float64/int64 columns; the trailing expression shows how many there are.
numerical_features = list(df.select_dtypes(include=['float64', 'int64']).columns)
len(numerical_features)

10

In [33]:
# One histogram + KDE panel per numerical feature, laid out on a two-column grid.
# The row count and figure height are derived from the number of features
# rather than hard-coded, so the cell keeps working if the feature list changes.
n_cols = 2
n_rows = max(1, -(-len(numerical_features) // n_cols))  # ceiling division

fig = make_subplots(rows=n_rows, cols=n_cols, subplot_titles=numerical_features)
colors = px.colors.qualitative.Plotly
for i, col in enumerate(numerical_features):
    color = colors[i % len(colors)]  # cycle through the palette
    grid_row, grid_col = divmod(i, n_cols)

    # create_distplot builds a standalone figure; only its traces are reused.
    hist_data = [df[col].dropna()]
    fig_dist = ff.create_distplot(
            hist_data,
            group_labels=[col],
            colors=[color],
            show_hist=True,
            show_curve=True,  # This adds the KDE
            show_rug=False
        )

    # With the rug disabled, trace 0 is the histogram and trace 1 the KDE curve.
    hist_trace = fig_dist['data'][0]
    kde_trace = fig_dist['data'][1]

    hist_trace.update(opacity=0.7, name=f'{col} Hist')
    kde_trace.update(name=f'{col} KDE')
    for trace in (hist_trace, kde_trace):
        fig.add_trace(trace, row=grid_row + 1, col=grid_col + 1)


# 300 px per row reproduces the original 1500 px height for five rows.
fig.update_layout(title_text="Univariate Analysis of Numerical Features",
                  height=300 * n_rows, width=1000, template='plotly_dark')
fig.show()
save_plotly_fig(fig, '02_all_numerica_distributions.png')

Saved figure to ../visuals/02_all_numerica_distributions.png


In [54]:
# Pie chart of the share of rows per gender value, drawn with the reversed Tealgrn palette.
fig_gender = px.pie(
    df,
    names='gender',
    title='Gender Distribution',
    color_discrete_sequence=px.colors.sequential.Tealgrn_r,
)
fig_gender.show()
save_plotly_fig(fig_gender, '03_gender_distribution.png')

Saved figure to ../visuals/03_gender_distribution.png


In [80]:
# Derived features: height converted from centimetres to metres, then
# BMI = weight_kg / height_m^2.
df['height_m'] = df['height_cm'].div(100)
df['BMI'] = df['weight_kg'].div(df['height_m'].pow(2))

In [81]:
# Feature correlation analysis
# Gender is dropped and the class label is replaced by its integer category
# code so that it can take part in the correlation.
corr_df = df.drop(columns=['gender']).assign(
    **{'class': lambda frame: frame['class'].astype('category').cat.codes}
)
# Coefficients are rounded to one decimal to keep the cell annotations readable.
corr_matrix = corr_df.corr().round(1)

fig_corr = px.imshow(
    corr_matrix,
    text_auto=True,
    aspect="auto",
    title="Feature Correlation Heatmap",
    color_continuous_scale='blues',
)
fig_corr.show()
save_plotly_fig(fig_corr, '04_correlation_heatmap.png')

Saved figure to ../visuals/04_correlation_heatmap.png


### Correlation results:
- height is positively correlated with weight, gripForce, broad jump_cm and sit-ups counts (i.e. strength and weight both increase with height)
- weight has less impact on sit-ups counts and broad jump_cm, though this relationship is also influenced by the increase in height
- BMI gives a clearer picture in this regard: it shows how an increase in weight influences BMI more than anything else
- body fat_% has a high negative correlation with fitness class, sit-ups counts, broad jump_cm and grip strength


In [None]:
# Scatter plot for every ordered pair of distinct numerical features,
# coloured by fitness class; each figure is shown and exported.
for i, col1 in enumerate(numerical_features):
    for j, col2 in enumerate(numerical_features):
        if i == j:
            continue  # a feature plotted against itself carries no information
        # The x-axis must use the outer-loop feature `col1`. The previous
        # `x=col` picked up a stale variable leaked from an earlier loop, so
        # every plot shared one x column that did not match its title/filename.
        fig = px.scatter(df, x=col1, y=col2, color='class',
                         title=f'Scatter plot of {col1} vs {col2} by Fitness Class')
        fig.show()
        save_plotly_fig(fig, f'scatter_{col1}_vs_{col2}.png')

In [82]:
# Bucket age into labelled bands; pd.cut intervals are right-inclusive by
# default, so the edges 17/25/35/... give (17, 25], (25, 35], and so on.
df['age_group'] = pd.cut(
    df['age'],
    bins=[17, 25, 35, 45, 55, 65],
    labels=['18-25', '26-35', '36-45', '46-55', '56-65'],
)

In [83]:
# Preview the first five rows, now including the derived columns.
df.head(n=5)

Unnamed: 0,age,gender,height_cm,weight_kg,body fat_%,diastolic,systolic,gripForce,sit and bend forward_cm,sit-ups counts,broad jump_cm,class,height_m,BMI,age_group
0,27.0,M,172.3,75.24,21.3,80.0,130.0,54.9,18.4,60.0,217.0,C,1.723,25.344179,26-35
1,25.0,M,165.0,55.8,15.7,77.0,126.0,36.4,16.3,53.0,229.0,A,1.65,20.495868,18-25
2,31.0,M,179.6,78.0,20.1,92.0,152.0,44.8,12.0,49.0,181.0,C,1.796,24.181428,26-35
3,32.0,M,174.5,71.1,18.4,76.0,147.0,41.4,15.2,53.0,219.0,B,1.745,23.349562,26-35
4,28.0,M,173.8,67.7,17.1,70.0,127.0,43.5,27.1,45.0,217.0,B,1.738,22.412439,26-35


In [100]:
# Box plots of selected features split by fitness class, on a 3x2 grid.
# NOTE(review): 'age_group' is categorical, so its panel is a box plot over
# category values — confirm this is the intended visualisation.
columns_to_plot = ['body fat_%', 'BMI', 'gripForce', 'sit-ups counts', 'broad jump_cm', 'age_group']

fig = make_subplots(rows=3, cols=2, subplot_titles=columns_to_plot)
colors = px.colors.qualitative.Plotly
for i, col in enumerate(columns_to_plot):
    color = colors[i % len(colors)]  # cycle through the palette
    grid_row, grid_col = divmod(i, 2)

    box_trace = go.Box(y=df[col], x=df['class'], name=col, marker_color=color)
    fig.add_trace(box_trace, row=grid_row + 1, col=grid_col + 1)


fig.update_layout(
    title_text="Distribution Analysis",
    height=1500,
    width=1000,
    template='plotly_white',
)
fig.show()
save_plotly_fig(fig, '05_distibutions.png')

Saved figure to ../visuals/05_distibutions.png


In [102]:
from sklearn.decomposition import PCA

# Encode gender as integer category codes so it can be used alongside the
# numerical features.
df['gender_map'] = df['gender'].astype('category').cat.codes

# PCA centres the data but does not rescale it, so on raw values the
# components are dominated by whichever columns have the largest variance.
# Z-score every column first so each feature contributes on a comparable scale.
pca_features = df[numerical_features + ['gender_map']].astype('float64')
feature_std = pca_features.std(ddof=0).replace(0, 1)  # constant column: avoid 0/0
pca_input = (pca_features - pca_features.mean()) / feature_std

pca = PCA(n_components=2)
pca_result = pca.fit_transform(pca_input)

In [105]:
# Plot the two PCA components against each other, coloured by fitness class.
fig_pca = px.scatter(
    x=pca_result[:, 0],
    y=pca_result[:, 1],
    color=df['class'],
    title='PCA of Fitness Data Colored by Fitness Class',
    labels={'x': 'PCA Component 1', 'y': 'PCA Component 2'},
)
fig_pca.show()

Clear clusters are not visible