In [None]:
### import libraries

# format
import pandas as pd

# plot
import matplotlib.pyplot as plt
import seaborn as sns

# math
import numpy as np
import math
from scipy.linalg import inv, det
from numpy.linalg import eig

from scipy import stats


In [None]:

# Read the raw survey export; the CSV is expected next to the notebook.
data_raw = pd.read_csv(filepath_or_buffer="Sleep_health_and_lifestyle_dataset.csv")
# Preview the first rows as the cell output.
data_raw.head()

In [None]:
### Preprocess Data

# Mapping from the CSV's human-readable headers to snake_case column names.
rename_dict = {
    'Person ID': 'person_id',
    'Gender': 'gender',
    'Age': 'age',
    'Occupation': 'occupation',
    'Sleep Duration': 'sleep_duration',
    'Quality of Sleep': 'quality_of_sleep',
    'Physical Activity Level': 'physical_activity_level',
    'Stress Level': 'stress_level',
    'BMI Category': 'bmi_category',
    'Blood Pressure': 'blood_pressure',
    'Heart Rate': 'heart_rate',
    'Daily Steps': 'daily_steps',
    'Sleep Disorder': 'sleep_disorder',
}

# `rename` returns a new frame, so `data_raw` itself is never modified.
data = data_raw.rename(columns=rename_dict)

# Sleep quality and stress level are handled as categorical labels,
# so they are stored as strings rather than numbers.
data = data.astype({'quality_of_sleep': str, 'stress_level': str})

# Binary flag: '1' when a sleep disorder (Insomnia or Sleep Apnea) is
# reported, '0' for every other value (including missing ones).
has_disorder = data['sleep_disorder'].isin(['Insomnia', 'Sleep Apnea'])
data['sleep_issue'] = has_disorder.map({True: '1', False: '0'}).astype(str)

# Blood pressure arrives as one "systolic/diastolic" string; split it into
# two columns and convert each of them to a numeric dtype.
bp_columns = ["blood_pressure_systolic", "blood_pressure_diastolic"]
data[bp_columns] = data["blood_pressure"].str.split('/', expand=True)
for bp_column in bp_columns:
    data[bp_column] = pd.to_numeric(data[bp_column])


data.info()

In [None]:
# Display the column index of `data` to check the available column names.
data.columns

In [None]:
# Columns analysed as continuous measurements.
numeric_variables = [
    'age',
    'sleep_duration',
    'physical_activity_level',
    'heart_rate',
    'daily_steps',
    'blood_pressure_systolic',
    'blood_pressure_diastolic',
]

# Columns used as grouping factors. Despite the singular name, this is a
# list holding several column names.
categorical_variable = [
    'gender',
    'occupation',
    'quality_of_sleep',
    'stress_level',
    'bmi_category',
    'sleep_issue',
]

In [None]:
# Pairwise scatter plots of the numeric columns (lower triangle only),
# with a kernel density estimate of each variable on the diagonal.
sns.pairplot(data=data[numeric_variables], corner=True, diag_kind='kde')
plt.show()

In [None]:
# Correlation matrix of the numeric columns, drawn as an annotated heatmap.
corr_mat = data[numeric_variables].corr()

plt.figure(figsize=(8, 6))  # Adjust the figure size as needed
heatmap_ax = sns.heatmap(
    corr_mat,
    annot=True,
    cmap='coolwarm',
    # Fix the colour scale to the full correlation range, centred on zero.
    vmin=-1,
    vmax=1,
    center=0,
    square=True,
    linewidths=0.5,
)
heatmap_ax.set_title('Correlation Plot')
plt.show()

In [None]:

# Density histograms of every numeric column on a 4x2 grid.
# Each entry is (column in `data`, panel title). Driving the panels from one
# table keeps column and title in sync; the copy-pasted version plotted
# `heart_rate` under the "Daily Steps" title.
hist_panels = [
    ('age', "Hist of Age"),
    ('sleep_duration', "Hist of Sleep Duration"),
    ('physical_activity_level', "Hist of Physical Activity Level"),
    ('heart_rate', "Hist of Heart Rate"),
    ('daily_steps', "Hist of Daily Steps"),
    ('blood_pressure_systolic', "Hist of Blood pressure systolic"),
    ('blood_pressure_diastolic', "Hist of Blood pressure diastolic"),
]

fig, axs = plt.subplots(4, 2, figsize=(12, 14))
hist_axes = axs.ravel()  # row-major order: [0,0], [0,1], [1,0], ...

for ax, (column, title) in zip(hist_axes, hist_panels):
    ax.hist(data[column], bins=10, density=True, alpha=0.6, color='b', edgecolor="black")
    ax.set_title(title)

# Hide the grid cells that have no histogram (only the last one here).
for ax in hist_axes[len(hist_panels):]:
    ax.axis("off")

plt.tight_layout(pad=2.0)

In [None]:

# Boxplots of every numeric column on a 4x2 grid.
# Each entry is (column in `data`, panel title). The copy-pasted version
# plotted `heart_rate` under the "Daily Steps" title and labelled every
# boxplot as "Hist of ..."; both are corrected here.
box_panels = [
    ('age', "Boxplot of Age"),
    ('sleep_duration', "Boxplot of Sleep Duration"),
    ('physical_activity_level', "Boxplot of Physical Activity Level"),
    ('heart_rate', "Boxplot of Heart Rate"),
    ('daily_steps', "Boxplot of Daily Steps"),
    ('blood_pressure_systolic', "Boxplot of Blood pressure systolic"),
    ('blood_pressure_diastolic', "Boxplot of Blood pressure diastolic"),
]

fig, axs = plt.subplots(4, 2, figsize=(12, 14))
box_axes = axs.ravel()  # row-major order: [0,0], [0,1], [1,0], ...

for ax, (column, title) in zip(box_axes, box_panels):
    ax.boxplot(data[column], vert=True, patch_artist=True, medianprops=dict(color="black"))
    ax.set_title(title)

# Hide the grid cells that have no boxplot (only the last one here).
for ax in box_axes[len(box_panels):]:
    ax.axis("off")

plt.tight_layout(pad=2.0)

In [None]:
# Print the skewness (scipy.stats.skew) of each numeric column.
for column in numeric_variables:
    print(f"Skewness of {column} : {stats.skew(data[column])}")

In [None]:
### Statistical Analysis & intercorrelation

# Per-column mean and variance plus the covariance and correlation matrices,
# all computed on the numeric columns only.
numeric_data = data[numeric_variables]

means = numeric_data.mean()
variances = numeric_data.var()
df_covariance_matrix = numeric_data.cov()
df_correlation_matrix = numeric_data.corr()

def intercorrelations(X):
    """Print and plot six scalar intercorrelation measures for the columns of X.

    With R the correlation matrix of the columns of X, p the number of
    columns, lambda the eigenvalues of R and r^jj the j-th diagonal entry of
    the inverse of R, the measures are:

        q1 = (1 - lambda_min / lambda_max) ** (p + 2)
        q2 = 1 - p / sum(1 / lambda_j)
        q3 = 1 - sqrt(det(R))
        q4 = (lambda_max / p) ** (3 / 2)
        q5 = (1 - lambda_min / p) ** 5
        q6 = sum((1 - 1 / r^jj) / p)

    Parameters
    ----------
    X : 2-D array-like (e.g. a DataFrame) with one variable per column.

    Returns
    -------
    None. The six values are printed and drawn on the current matplotlib
    axes as a line plot against the metric index 1..6.
    """
    p = X.shape[1]
    R = np.corrcoef(X, rowvar=False)

    # R is symmetric, so use the symmetric eigenvalue solver: it always
    # returns real eigenvalues, whereas the general `eig` routine may hand
    # back a complex dtype that breaks the min/max comparisons below.
    lambda_vals = np.linalg.eigvalsh(R)
    lambda_min = lambda_vals.min()
    lambda_max = lambda_vals.max()

    # Diagonal of the inverse correlation matrix (the r^jj terms of q6).
    rjj = np.diag(inv(R))

    q = np.array([
        (1 - lambda_min / lambda_max) ** (p + 2),
        1 - p / np.sum(1. / lambda_vals),
        1 - np.sqrt(det(R)),
        (lambda_max / p) ** (3 / 2),
        (1 - lambda_min / p) ** 5,
        np.sum((1 - 1. / rjj) / p),
    ])

    # print
    print(q)

    # plot
    plt.plot(range(1, len(q) + 1), q, marker='o', linestyle='-', color='b', label='Intercorrelations')
    plt.xlabel("intercorrelation metric")
    plt.ylabel("intercorrelation score")
    return None


# Intercorrelation measures of the numeric columns (printed and plotted).
intercorrelations(X=data[numeric_variables])

In [None]:
# Function for plotting of conditional histograms
def plot_categorical_hist(ncols,
                          numeric_variables,
                          categorical_variable,
                          host_stat='count',
                          figsize=(12, 10)
                          ):
    """Plot one histogram per numeric column, split by a categorical column.

    The panels are laid out row by row on a grid with `ncols` columns. The
    rows come from the notebook-level DataFrame `data`.

    Parameters
    ----------
    ncols : int
        Number of panels per row.
    numeric_variables : list of str
        Columns of `data` to draw, one panel each.
    categorical_variable : str
        Column of `data` passed to seaborn as `hue`.
    host_stat : str, default 'count'
        Forwarded to `sns.histplot` as `stat`.
    figsize : tuple, default (12, 10)
        Size of the whole figure, forwarded to `plt.subplots`.

    Returns
    -------
    None. The figure is shown with `plt.show()`.
    """
    nrows = math.ceil(len(numeric_variables) / ncols)
    # squeeze=False keeps `axes` two-dimensional even when the grid has a
    # single row or column, so `axes[row, col_pos]` is always valid.
    fig, axes = plt.subplots(nrows, ncols, figsize=figsize, squeeze=False)

    for i, col in enumerate(numeric_variables):
        # Grid position follows the requested number of columns (this used
        # to be hard-coded to 2, which broke every other `ncols`).
        row, col_pos = divmod(i, ncols)
        ax = axes[row, col_pos]
        sns.histplot(data=data, x=col, bins=10, hue=categorical_variable, kde=True, ax=ax, stat=host_stat)
        ax.set_title(f'Distribution of {col}')

    # Hide grid cells left without a plot when the panel count does not
    # fill the last row.
    for unused_ax in axes.ravel()[len(numeric_variables):]:
        unused_ax.axis("off")

    # Adjust layout for better spacing
    plt.tight_layout()
    plt.show()

In [None]:
# TODO: Select relevant ones
# Draw the conditional histograms once per grouping column. The list of
# grouping columns is the notebook variable `categorical_variable`; the name
# `categorical_variables` used here before was never defined (NameError).
# A separate loop variable keeps that list intact, so the cell can be re-run.
for hue_column in categorical_variable:
    print(hue_column)
    plot_categorical_hist(ncols=2,
                          numeric_variables=numeric_variables,
                          categorical_variable=hue_column,
                          host_stat='probability',
                          figsize=(12, 20))