# Health data analysis

Oura ring data covering the last three years.
It captures HRV, sleep, exercise, and related metrics.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from scipy.stats import ttest_ind
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

## Load Data

In [None]:
# Read the main data export. If this file is ever unavailable, a backup copy
# is kept at 'backup_data/data.csv'.
df = pd.read_csv(filepath_or_buffer='data/data.csv')

## EDA

In [None]:
# Preview the first five rows of the raw frame.
df.head(n=5)

In [None]:
# Fraction of missing values per column (mean of the boolean NA mask).
df.isna().mean()

In [None]:
# Keep only rows that have a value in every column.
df = df.dropna(axis='index', how='any')

In [None]:
# # check type of data
# df.dtypes

In [None]:
# Parse 'date' as a UTC-aware timestamp and use it as the row index.
df = (
    df
    .assign(date=pd.to_datetime(df['date'], utc=True))
    .set_index('date')
)

## Can we identify when I fell sick just by looking at the data?

In [None]:
# Work on a copy that excludes every column whose name contains "score"
# (case-insensitive); the original frame `df` is left untouched.
is_score_col = df.columns.str.contains('score', case=False)
df_copy = df.copy().loc[:, ~is_score_col]

print("Current Columns: ",df_copy.columns)
print("removed columns: ",df.columns[is_score_col])

In [None]:
# z-score every numeric column (zero mean, unit variance) so that metrics on
# different scales can be compared. StandardScaler is imported from
# scikit-learn in the imports cell.
numeric_cols = list(df_copy.select_dtypes(include='number').columns)
scaler = StandardScaler()
scaler.fit(df_copy[numeric_cols])
df_copy[numeric_cols] = scaler.transform(df_copy[numeric_cols])

In [None]:
# Move the 'date' index back into a regular column (it is kept, not dropped)
# so it can be plotted and merged on.
df_copy = df_copy.reset_index(drop=False)

In [None]:
# Overlay two physiological signals over time. Numeric columns of df_copy were
# standardized in an earlier cell, so the lines share one y-axis.
# 'Average HRV' can be added to the list below to plot it as well.
plt.figure(figsize=(12, 6))

series_to_plot = [
    # (column in df_copy, legend label, line color)
    ('Lowest Resting Heart Rate', 'Resting Heart Rate', 'blue'),
    ('Temperature Trend Deviation', 'Temperature Trend Deviation', 'red'),
]
for column, legend_label, line_color in series_to_plot:
    plt.plot(df_copy['date'], df_copy[column], label=legend_label, color=line_color)

plt.xlabel('Date')
plt.ylabel('Values')
plt.title('Temperature Trend Deviation and Resting Heart Rate Over Time')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.gca().xaxis.set_major_locator(mdates.AutoDateLocator())

# But can we predict the likelihood of falling sick?

### Or can we find another excuse to not go to the gym today?

In [None]:
# Known sick dates (calendar days, timezone-naive).
sick_dates = pd.to_datetime([
    '2021-01-10', '2021-01-11', '2021-01-12', '2021-01-13', '2021-01-15', '2021-01-16', '2021-01-17',
    '2021-04-03', '2021-04-10', '2022-07-10', '2022-07-16', '2022-07-17', '2022-07-18', '2022-07-19',
    '2022-11-02', '2022-11-05', '2024-01-12',
])

# 'date' was parsed with utc=True, so it is tz-aware, while sick_dates is
# tz-naive. isin() across that mismatch matches nothing, which would leave the
# flag at 0 for every row. Strip the timezone BEFORE building the flag.
df_copy['date'] = pd.to_datetime(df_copy['date']).dt.tz_localize(None)

# Compare on the calendar day (normalize() zeroes any time-of-day component)
# so a timestamp such as 2021-01-10 04:00 still counts as that sick day.
df_copy['sick'] = df_copy['date'].dt.normalize().isin(sick_dates).astype(int)


In [None]:
# Weather observations (from the NOAA website). The 'date' column is parsed
# and made timezone-naive so it can be joined against df_copy['date'].
df_weather = pd.read_csv('data/us_weather_data.csv')
weather_dates = pd.to_datetime(df_weather['date'])
df_weather['date'] = weather_dates.dt.tz_localize(None)


In [None]:
# Left join: keep every health row and attach weather columns where the
# 'date' values match.
df2 = pd.merge(left=df_copy, right=df_weather, how='left', on='date')

In [None]:
# Sanity check: list the columns present after the merge.
merged_columns = df2.columns
print("Columns after merge: ", merged_columns)

### And research suggests that swinging temperatures place a strain on our thermoregulatory and cardiovascular systems

In [None]:
# Temperature swing = TMAX - TMIN (a plain difference, not an interaction term).
# NOTE(review): TMAX/TMIN presumably come from the weather data — confirm.
df2['temp_swing'] = df2['TMAX'].sub(df2['TMIN'])

# Mean of each metric for sick (1) vs. healthy (0) rows.
compare_cols = [
    'temp_swing', 'PRCP', 'TAVG', 'TMAX', 'TMIN',
    'Average HRV', 'Temperature Deviation (°C)',
]
df2.groupby('sick')[compare_cols].mean()

### But is it significant?


In [None]:
# Split rows by the sick flag and compare temp_swing between the two groups
# with an independent two-sample t-test; NaN values are ignored.
is_sick = df2['sick'] == 1
is_healthy = df2['sick'] == 0
sick_group = df2[is_sick]
healthy_group = df2[is_healthy]

ttest_ind(
    a=sick_group['temp_swing'],
    b=healthy_group['temp_swing'],
    nan_policy='omit',
)

# What impacts my 'Sleep Score' the most?

In [None]:
# Sleep features used as predictors, with the target ('Sleep Score') last.
# NOTE(review): 'Total Bedtime ' has a trailing space — presumably it matches
# the CSV header exactly; confirm before "fixing" it.
sleep_cols = [
    'Deep Sleep Duration',
    'Total Sleep Duration',
    'Sleep Latency',
    'Light Sleep Duration',
    'REM Sleep Duration',
    'Total Bedtime ',
    'Restless Sleep',
    'Awake Time',
    'Sleep Score',
]

# Select from the unscaled frame, drop incomplete rows, then keep only the
# first occurrence of any duplicated column name.
df_sleep = df[sleep_cols].dropna()
df_sleep = df_sleep.loc[:, ~df_sleep.columns.duplicated()]

In [None]:
# Target is 'Sleep Score'; every other column of df_sleep is a predictor.
y = df_sleep['Sleep Score']
X = df_sleep.drop(columns=['Sleep Score'])

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Baseline: ordinary least squares on the raw sleep features.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [None]:
# Score the baseline model on the held-out rows.
y_pred = model.predict(X_test)
baseline_r2 = r2_score(y_test, y_pred)
print("R² Score:", baseline_r2)

In [None]:
# Pair each feature with its fitted coefficient, largest magnitude first.
coef_df = pd.DataFrame({'Feature': X.columns, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values(
    by='Coefficient',
    key=lambda col: col.abs(),
    ascending=False,
)

print(coef_df)

# But we all know that it's the ratios that are important!

So let's try some ratio features and interaction terms.

In [None]:
# Fresh, independent copies of predictors and target for the ratio model, so
# adding columns below does not touch df_sleep.
y = df_sleep['Sleep Score'].copy()
X = df_sleep.drop(columns=['Sleep Score']).copy()


In [None]:
# Ratio features: new column name -> (numerator column, denominator column).
# A zero denominator yields inf/NaN in the new column.
ratio_specs = {
    'REM_to_DEEP_ratio': ('REM Sleep Duration', 'Deep Sleep Duration'),
    'DEEP_to_TOTAL_ratio': ('Deep Sleep Duration', 'Total Sleep Duration'),
    'REM_to_TOTAL_ratio': ('REM Sleep Duration', 'Total Sleep Duration'),
    'Light_to_Total_ratio': ('Light Sleep Duration', 'Total Sleep Duration'),
}
for ratio_name, (numerator, denominator) in ratio_specs.items():
    X[ratio_name] = X[numerator] / X[denominator]

In [None]:
# Pairwise interaction terms (products of two distinct features, no squares,
# no bias column) built on top of the current predictors.
#
# X and y are deliberately NOT rebuilt from df_sleep here: doing so would
# discard the ratio columns added to X, and the later "with ratios" model
# would then be fitted without any ratio feature.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)

# Ratio columns can hold inf/NaN (zero denominator) and PolynomialFeatures
# rejects non-finite input, so expand only the fully finite rows.
X_finite = X.replace([np.inf, -np.inf], np.nan).dropna()
X_interactions = poly.fit_transform(X_finite)


In [None]:
# Drop rows with inf/NaN (e.g. from division by zero or missing values).
# Reassigning instead of mutating in place keeps the cell safe to re-run.
X = X.replace([np.inf, -np.inf], np.nan).dropna()
y = y.loc[X.index]  # keep the target aligned with the surviving rows

# Scale all features to zero mean / unit variance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Carry X's index over so the scaled frame stays label-aligned with y;
# without it the frame gets a fresh 0..n-1 index that no longer matches y.
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

In [None]:
# Fit on all rows of the scaled features; with standardized inputs the
# coefficient magnitudes are comparable across features.
model = LinearRegression()
model.fit(X=X_scaled, y=y)

# Rank features by absolute coefficient.
coef_df = pd.DataFrame({'Feature': X_scaled_df.columns, 'Coefficient': model.coef_})
coef_df = coef_df.sort_values(
    by='Coefficient',
    key=lambda col: col.abs(),
    ascending=False,
)

print(coef_df.head(10))  # ten largest coefficients by magnitude

In [None]:
# Does this feature set generalize better? Same 80/20 split and seed as the
# baseline, scored with R² on the held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled_df,
    y,
    test_size=0.2,
    random_state=42,
)
lr_ratio = LinearRegression()
lr_ratio.fit(X_train, y_train)
ratio_score = r2_score(y_test, lr_ratio.predict(X_test))
print("R² Score with Ratios:", ratio_score)