# Health data analysis

Oura ring data for the last 3 years.
Captures HRV, sleep, exercise, etc.

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
import pytz

from datetime import timedelta
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mutual_info_score
import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler





In [None]:
# Read the exported CSV into a DataFrame (the path is relative to the
# working directory the notebook runs in).
df = pd.read_csv(filepath_or_buffer='data/data.csv')


In [None]:
# Preview the first five rows of the raw data
df.head(n=5)

In [None]:
# Fraction of missing values in each column (mean of the boolean NA mask)
df.isna().mean()

In [None]:
# Discard every row that contains at least one missing value
df = df.dropna(axis=0, how='any')

In [None]:
# Inspect the dtype of each column in df
df.dtypes

In [None]:
# Parse 'date' as timezone-aware UTC timestamps, then make it the index
df = df.assign(date=pd.to_datetime(df['date'], utc=True)).set_index('date')



In [None]:
# Build a working copy of df without any column whose name contains
# 'score' (case-insensitive), and report what was kept and what was removed.
is_score_col = df.columns.str.contains('score', case=False)

df_copy = df.copy().loc[:, ~is_score_col]
print("Current Columns: ", df_copy.columns)
print("removed columns: ", df.columns[is_score_col])

In [None]:
# Standardize every numeric column of df_copy with scikit-learn's
# StandardScaler (imported at the top of the notebook).
# NOTE(review): the scaler is fit on all rows, i.e. before the train/test
# split performed further down.
scaler = StandardScaler()
numeric_cols = list(df_copy.select_dtypes(include=[np.number]).columns)
df_copy[numeric_cols] = scaler.fit_transform(df_copy[numeric_cols])

In [None]:
# Move the 'date' index back into a regular column; the plot below reads it
# as df_copy['date'].
df_copy = df_copy.reset_index(drop=False)

In [None]:
# Plot Average HRV and Lowest Resting Heart Rate over time.
# Both series are read from df_copy, whose numeric columns were standardized
# in an earlier cell, so the y-axis shows scaled values rather than raw units.
plt.figure(figsize=(12, 6))
plt.plot(df_copy['date'], df_copy['Average HRV'], label='Average HRV', color='blue')
plt.plot(df_copy['date'], df_copy['Lowest Resting Heart Rate'], label='Resting Heart Rate', color='orange')

plt.xlabel('Date')
plt.ylabel('Values')
plt.title('Average HRV and Resting Heart Rate Over Time')
plt.legend()
plt.xticks(rotation=45)  # slanted date labels to avoid overlap
plt.tight_layout()
# Render explicitly, as the feature-importance plot cell does, so the figure
# also appears when this code runs outside an inline notebook backend.
plt.show()

# What impacts my sleep score the most?

In [None]:
# Features: the standardized, score-free frame df_copy.
# Target: the raw 'Sleep Score' column of df.
# NOTE(review): df is indexed by date while df_copy carries a fresh integer
# index after reset_index(); the two line up row-by-row by position only.
X, y = df_copy, df['Sleep Score']



In [None]:
# Exclude the date and bedtime start/end columns from the feature matrix
X = X.drop(['date', 'Bedtime Start', 'Bedtime End'], axis=1)

# Fill any remaining gaps with the per-column mean (features) and the overall
# mean (target); RF imputation or other methods could be tried as well.
X = X.fillna(value=X.mean())
y = y.fillna(value=y.mean())

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest (default hyper-parameters, fixed seed) on the training
# split. fit() returns the estimator itself, so the assignment is chained.
model = RandomForestRegressor(random_state=42).fit(X_train, y_train)
model  # display the fitted estimator as the cell output

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score

# Score the fitted model on the held-out rows
y_pred = model.predict(X_test)
for label, metric in (("R² Score:", r2_score), ("MAE:", mean_absolute_error)):
    print(label, metric(y_test, y_pred))

In [None]:
# Pair each feature name with the forest's importance value and order them
# from most to least important before plotting.
importance = model.feature_importances_
features = pd.Series(data=importance, index=X.columns)
features = features.sort_values(ascending=False)

features.plot.bar(title='Feature Importance for Sleep Score')
plt.tight_layout()
plt.show()

In [None]:
# Sleep-related columns used for the linear model, with the target last.
# NOTE(review): 'Total Bedtime ' carries a trailing space — confirm it
# matches the column header in the CSV exactly.
sleep_cols = [
    'Deep Sleep Duration',
    'Total Sleep Duration',
    'Sleep Latency',
    'Light Sleep Duration',
    'REM Sleep Duration',
    'Total Bedtime ',
    'Restless Sleep',
    'Sleep Score',  # include target
]

# Keep only those columns, drop incomplete rows, and keep the first
# occurrence of any duplicated column name.
df_sleep = df[sleep_cols].dropna()
df_sleep = df_sleep.loc[:, ~df_sleep.columns.duplicated()]  # remove duplicate columns

In [None]:
# sleep_cols

In [None]:
# Split df_sleep into the target column and the remaining feature columns
y = df_sleep['Sleep Score']
X = df_sleep.drop('Sleep Score', axis=1)

In [None]:
# Same 80/20 split settings as before, applied to the sleep-only features
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training split. fit() returns the estimator
# itself, so the assignment is chained.
model = LinearRegression().fit(X_train, y_train)
model  # display the fitted estimator as the cell output

In [None]:
from sklearn.metrics import r2_score

# R² of the linear model on the held-out rows
y_pred = model.predict(X_test)
linear_r2 = r2_score(y_test, y_pred)
print("R² Score:", linear_r2)

In [None]:
# Pair each feature with its fitted coefficient and list them from the
# largest to the smallest absolute value.
# NOTE(review): X comes from df_sleep, which is built from the unscaled df,
# so each coefficient is expressed per unit of its own feature — confirm the
# magnitudes are comparable before reading this as an importance ranking.
coef_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values(by='Coefficient', key=abs, ascending=False)

print(coef_df)