In [None]:
### import libraries

# format
import pandas as pd

# plot
import matplotlib.pyplot as plt
import seaborn as sns

# math
import numpy as np
import math
from scipy.linalg import inv, det
from numpy.linalg import eig

from scipy import stats

# Introduction
The following project documentation was written as a work assignment for the module "Multivariate Analysis" of the Master in Statistics for Data Science at the Universidad Carlos III de Madrid. It contains the multivariate analysis of a Kaggle dataset on physical (gym) exercise data (https://www.kaggle.com/datasets/valakhorasani/gym-members-exercise-dataset). The work is split into two parts. In the first part, an exploratory data analysis is carried out, data preprocessing steps are applied where required, and a Principal Component Analysis (PCA) is performed. In the second part, based on the learnings of part one, a (XXXXXXXXXXX) is performed to (XXXXXXXXX). 

# Part 1 - Exploratory Analysis, Preprocessing and PCA
As discussed in the introduction, the first part contains the initial exploratory analysis of the dataset as well as a PCA of the dataset. The dataset at hand is composed of the following columns: 

| Attribute                      | Description                                                       |
|--------------------------------|-------------------------------------------------------------------|
| Age                            | Age of the gym member.                                            |
| Gender                         | Gender of the gym member (Male or Female).                        |
| Weight (kg)                    | Member’s weight in kilograms.                                     |
| Height (m)                     | Member’s height in meters.                                        |
| Max_BPM                        | Maximum heart rate (beats per minute) during workout sessions.    |
| Avg_BPM                        | Average heart rate during workout sessions.                       |
| Resting_BPM                    | Heart rate at rest before workout.                                |
| Session_Duration (hours)       | Duration of each workout session in hours.                        |
| Calories_Burned                | Total calories burned during each session.                        |
| Workout_Type                   | Type of workout performed (e.g., Cardio, Strength, Yoga, HIIT).   |
| Fat_Percentage                 | Body fat percentage of the member.                                |
| Water_Intake (liters)          | Daily water intake during workouts.                               |
| Workout_Frequency (days/week)  | Number of workout sessions per week.                              |
| Experience_Level               | Level of experience, from beginner (1) to expert (3).             |
| BMI                            | Body Mass Index, calculated from height and weight.               |

Before starting with the exploratory data analysis, a few basic data formatting steps are performed, such as modifying data types, renaming features and turning the water intake feature from a continuous variable into a binary variable (< 2 liters: 0, >= 2 liters: 1) to comply with the assignment's requirements.

In [None]:
# Import Data
# Read the raw dataset from "cities.csv" (path relative to the notebook's working directory).
# NOTE(review): the markdown introduction describes the Kaggle gym members dataset, while this
# cell loads "cities.csv" — confirm which dataset the notebook is meant to analyse.
data_raw = pd.read_csv("cities.csv")
# Print the column/dtype summary, then show the first rows as the cell output.
data_raw.info()
data_raw.head()

In [None]:
# ### Preprocess Data
# data = data_raw.copy()

# # dtypes
# data['Experience_Level'] = data['Experience_Level'].astype(str)
# data['Workout_Frequency (days/week)'] = data['Workout_Frequency (days/week)'].astype(str)

# # Make water intake binary
# data['Water_intake_binary'] = (data['Water_Intake (liters)'] > 2).astype(int)
# data.drop(columns=["Water_Intake (liters)"],inplace=True)

# # change column names and make column names lower case
# rename_dict = {
#     'Age': 'Age',
#     'Gender': 'Gender',
#     'Weight (kg)': 'Weight_kg',
#     'Height (m)': 'Height_m',
#     'Max_BPM': 'Max_BPM',
#     'Avg_BPM': 'Avg_BPM',
#     'Resting_BPM': 'Resting_BPM',
#     'Session_Duration (hours)': 'Session_Duration_hrs',
#     'Calories_Burned': 'Calories_Burned',
#     'Workout_Type': 'Workout_Type',
#     'Fat_Percentage': 'Fat_Percentage',
#     'Water_Intake (liters)': 'Water_Intake_lts',
#     'Workout_Frequency (days/week)': 'Workout_Frequency_days_per_week',
#     'Experience_Level': 'Experience_Level',
#     'BMI': 'BMI'
# }

# data.rename(columns=rename_dict, inplace=True)


# data.columns = map(str.lower, data.columns)

# # print
# data.info()

In [None]:
# Show the column labels of the raw frame as the cell output.
data_raw.columns

In [None]:
# Working frame: the raw data without the "Unnamed: 0" column.
# `drop` returns a new DataFrame, so `data_raw` itself is left untouched.
data = data_raw.drop(columns=["Unnamed: 0"])

In [None]:
# Display the working DataFrame as the cell output.
data

In [None]:
# Column groups used by the plots and statistics below:
# numeric_variables     -> columns treated as numeric
# categorical_variables -> columns treated as categorical (used as grouping / hue)
numeric_variables = [
    'Housing',
    'Cost of Living',
    'Startups',
    'Venture Capital',
    'Travel Connectivity',
    'Commute',
    'Business Freedom',
    'Safety',
    'Healthcare',
    'Education',
    'Environmental Quality',
    'Economy',
    'Taxation',
    'Internet Access',
    'Leisure & Culture',
    'Tolerance',
    'Outdoors',
]
categorical_variables = [
    'UA_Name',
    'UA_Country',
    'UA_Continent',
]

TODO: Pairplot interpretation

In [None]:
# Pairwise scatter plots of the numeric variables (lower triangle only),
# with a kernel density estimate on the diagonal.
sns.pairplot(
    data=data[numeric_variables],
    corner=True,
    diag_kind='kde',
)
plt.show()

TODO: Correlation Plot interpretation

In [None]:
# Correlation matrix of the numeric variables, drawn as an annotated heatmap.
corr_mat = data[numeric_variables].corr()

fig, ax = plt.subplots(figsize=(15, 12))  # Adjust the figure size as needed
sns.heatmap(
    corr_mat,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Plot')
plt.show()

In [None]:
# # Histograms
# fig, axs = plt.subplots(5,2, figsize=(12, 14))
# axs[0,0].hist(data['age'],density=True,alpha=0.6,color='b',edgecolor="black")
# axs[0,0].set_title("Hist of Age")
# axs[0,1].hist(data['weight_kg'],density=True,alpha=0.6,color='b',edgecolor="black")
# axs[0,1].set_title("Hist of Weight (kg)")
# axs[1,0].hist(data['height_m'],density=True,alpha=0.6,color='b',edgecolor="black")
# axs[1,0].set_title("Hist of Height (m)")
# axs[1,1].hist(data['fat_percentage'],bins=10,density=True,alpha=0.6,color='b',edgecolor="black")
# axs[1,1].set_title("Hist Fat Percentage")
# axs[2,0].hist(data['session_duration_hrs'],density=True,alpha=0.6,color='b',edgecolor="black")
# axs[2,0].set_title("Hist of Session duration hrs")
# axs[2,1].hist(data['max_bpm'],density=True,alpha=0.6,color='b',edgecolor="black")
# axs[2,1].set_title("Hist of Max BPM")
# axs[3,0].hist(data['avg_bpm'],density=True,alpha=0.6,color='b',edgecolor="black")
# axs[3,0].set_title("Hist of Avg. BPM")
# axs[3,1].hist(data['resting_bpm'],density=True,alpha=0.6,color='b',edgecolor="black")
# axs[3,1].set_title("Hist of Resting BPM")
# axs[4,0].hist(data['calories_burned'],density=True,alpha=0.6,color='b',edgecolor="black")
# axs[4,0].set_title("Hist of Calories Burned")
# axs[4,1].axis("off")
# plt.tight_layout(pad=2.0)


In [None]:
# fig, axs = plt.subplots(5,2, figsize=(12, 14))
# axs[0,0].boxplot(data['age'],vert=True, patch_artist=True,medianprops=dict(color="black"))
# axs[0,0].set_title("Boxplot of Age")
# axs[0,1].boxplot(data['weight_kg'],vert=True, patch_artist=True,medianprops=dict(color="black"))
# axs[0,1].set_title("Boxplot of Weight (kg)")
# axs[1,0].boxplot(data['height_m'],vert=True, patch_artist=True,medianprops=dict(color="black"))
# axs[1,0].set_title("Boxplot of Height (m)")
# axs[1,1].boxplot(data['fat_percentage'],vert=True, patch_artist=True,medianprops=dict(color="black"))
# axs[1,1].set_title("Boxplot Fat Percentage")
# axs[2,0].boxplot(data['session_duration_hrs'],vert=True, patch_artist=True,medianprops=dict(color="black"))
# axs[2,0].set_title("Boxplot of Session duration hrs")
# axs[2,1].boxplot(data['max_bpm'],vert=True, patch_artist=True,medianprops=dict(color="black"))
# axs[2,1].set_title("Boxplot of Max BPM")
# axs[3,0].boxplot(data['avg_bpm'],vert=True, patch_artist=True,medianprops=dict(color="black"))
# axs[3,0].set_title("Boxplot of Avg. BPM")
# axs[3,1].boxplot(data['resting_bpm'],vert=True, patch_artist=True,medianprops=dict(color="black"))
# axs[3,1].set_title("Boxplot of Resting BPM")
# axs[4,0].boxplot(data['calories_burned'],vert=True, patch_artist=True,medianprops=dict(color="black"))
# axs[4,0].set_title("Boxplot of Calories Burned")
# axs[4,1].axis("off")
# plt.tight_layout(pad=2.0)

Judging by eye, the boxplots of Age and Session duration look very symmetric. Height and Fat % are both asymmetric. The three BPM variables look somewhat asymmetric. The boxplots of Weight and Calories burned both contain outliers.

In [None]:
# Report the sample skewness of every numeric variable.
for column in numeric_variables:
    print(f"Skewness of {column} : {stats.skew(data[column])}")

Combining the boxplots with the skewness values tells us the following about our variables: Age, Max BPM, Avg. BPM, Resting BPM and Session duration are all approximately symmetric. Weight, Height and Calories burned are moderately skewed: all three have a long tail on the right side (these are the outliers we detected in the boxplots).

In [None]:
# Function for plotting of conditional histograms 
def plot_categorical_hist(ncols,
                          numeric_variables,
                          categorical_variable,
                          host_stat='count',
                          figsize=(12, 10)
                          ):
    """Plot one histogram per numeric variable, split by a categorical variable.

    The panels are filled row by row on a grid with ``ncols`` columns. The
    values are read from the notebook-level DataFrame ``data``.

    Parameters
    ----------
    ncols : int
        Number of grid columns (must be >= 1).
    numeric_variables : list of str
        Columns of ``data`` to draw, one panel each.
    categorical_variable : str
        Column of ``data`` passed to ``sns.histplot`` as ``hue``.
    host_stat : str, default 'count'
        Forwarded to ``sns.histplot`` as ``stat``.
    figsize : tuple, default (12, 10)
        Size of the whole figure, forwarded to ``plt.subplots``.

    Raises
    ------
    ValueError
        If ``ncols`` is smaller than 1.
    """
    if ncols < 1:
        raise ValueError("ncols must be at least 1")

    nrows = math.ceil(len(numeric_variables) / ncols)
    # squeeze=False keeps `axes` two-dimensional even for a single row or a
    # single column, so axes[row, col_pos] is always valid.
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)
    for i, col in enumerate(numeric_variables):
        # Grid position follows the requested number of columns
        # (previously hard-coded to a two-column layout).
        row, col_pos = divmod(i, ncols)
        ax = axes[row, col_pos]
        sns.histplot(data=data, x=col, bins=10, hue=categorical_variable, kde=True, ax=ax, stat=host_stat)
        ax.set_title(f'Distribution of {col}')

    # Hide the grid cells left over when the variables do not fill the last row.
    for unused_ax in axes.ravel()[len(numeric_variables):]:
        unused_ax.axis("off")

    # Adjust layout for better spacing
    plt.tight_layout()
    plt.show()

In [None]:
# TODO: Select relevant ones
# One figure per categorical variable: histograms of every numeric variable,
# coloured by that categorical variable and normalised to probabilities.
for hue_column in categorical_variables:
    print(hue_column)
    plot_categorical_hist(
        ncols=2,
        numeric_variables=numeric_variables,  # ['calories_burned','session_duration_hrs','fat_percentage','avg_bpm'],
        categorical_variable=hue_column,
        host_stat='probability',
        figsize=(12, 20),
    )

In [None]:
### Relevant ones

# gender -> height and weight show differences but not relevant for analysis
# NOTE(review): the columns requested here ('calories_burned', 'session_duration_hrs',
# 'fat_percentage', 'avg_bpm', 'gender') do not appear in the numeric_variables /
# categorical_variables lists defined earlier in this notebook — confirm that `data`
# actually contains them, otherwise this call will fail on a fresh run.
plot_categorical_hist(ncols=2,
                      numeric_variables=['calories_burned','session_duration_hrs','fat_percentage','avg_bpm'],
                      categorical_variable='gender',
                      figsize=(12, 10))

In [None]:
### TODO: Pointplots

In [None]:
### Statistical Analysis & intercorrelation

# Descriptive statistics of the numeric variables: per-column mean and variance,
# plus the covariance and correlation matrices.
numeric_frame = data[numeric_variables]
means = numeric_frame.mean()
variances = numeric_frame.var()
df_covariance_matrix = numeric_frame.cov()
df_correlation_matrix = numeric_frame.corr()

def intercorrelations(X):
    """Print and plot six scalar intercorrelation measures of the columns of ``X``.

    Every measure is derived from the correlation matrix ``R`` of ``X``
    (variables in columns): its eigenvalues, its determinant and the diagonal
    of its inverse.

    Parameters
    ----------
    X : array-like of shape (n, p)
        Observations in rows, variables in columns. Must expose ``.shape``.

    Returns
    -------
    None
        The six scores are printed and drawn on the current matplotlib axes.
    """
    _, p = X.shape
    R = np.corrcoef(X, rowvar=False)
    # R is symmetric, so use the symmetric eigen-solver: it always yields real
    # eigenvalues in ascending order, whereas the general solver may return
    # unsorted (or, through round-off, complex) values.
    lambda_vals = np.linalg.eigvalsh(R)
    lambda_min, lambda_max = lambda_vals[0], lambda_vals[-1]
    rjj = np.diag(inv(R))
    q = np.zeros(6)
    q[0] = (1 - lambda_min / lambda_max) ** (p + 2)
    q[1] = 1 - p / np.sum(1. / lambda_vals)
    # Round-off can push det(R) marginally below zero for nearly collinear
    # variables; clamp so the square root stays defined.
    q[2] = 1 - np.sqrt(max(det(R), 0.0))
    q[3] = (lambda_max / p) ** (3 / 2)
    q[4] = (1 - lambda_min / p) ** 5
    q[5] = np.sum((1 - 1. / rjj) / p)

    # print
    print(q)

    # plot
    plt.plot(range(1, 7), q, marker='o', linestyle='-', color='b', label='Intercorrelations')
    plt.xlabel("intercorrelation metric")
    plt.ylabel("intercorrelation score")
    return None


# Print and plot the intercorrelation scores of the numeric variables.
intercorrelations(data[numeric_variables])