In [None]:
from oura_analysis.loader import OuraDataNumeric
import plotly.express as px
import pandas as pd
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from matplotlib import pyplot as plt
from scipy.cluster.hierarchy import dendrogram

# Load the Oura export used by every later cell.
# (The file name suggests a trends export spanning 2019-06-01 to 2024-01-01 - not verified here.)
trends_csv_path = "../data/oura_2019-06-01_2024-01-01_trends.csv"
oura_data = OuraDataNumeric.from_path(trends_csv_path)

In [None]:
def plot_dendrogram(model, **kwargs):
    """Build a linkage matrix from a fitted hierarchical model and draw its dendrogram.

    Args:
        model: Fitted estimator exposing ``children_`` (one row of two node ids
            per merge), ``distances_`` (one value per merge) and ``labels_``
            (one entry per original sample).
        **kwargs: Forwarded unchanged to ``scipy.cluster.hierarchy.dendrogram``.

    Returns:
        None. The dendrogram call is made purely for its plotting side effect.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # Number of original samples sitting underneath each merge node.
    # Node ids below ``leaf_total`` are single samples; larger ids refer back to
    # the merge created at step ``id - leaf_total``, whose size is already known.
    subtree_sizes = np.zeros(merges.shape[0])
    for step, pair in enumerate(merges):
        subtree_sizes[step] = sum(
            1 if node < leaf_total else subtree_sizes[node - leaf_total]
            for node in pair
        )

    # Columns: child a, child b, merge distance, sample count.
    linkage = np.column_stack([merges, model.distances_, subtree_sizes]).astype(float)

    dendrogram(linkage, **kwargs)

In [None]:
# Inspect which columns the loaded Oura table exposes, and how many there are.
all_table_data = oura_data.data_table
print(all_table_data.columns)
print("Number of columns", all_table_data.shape[1])

In [None]:
def normalise_column_values(df: pd.DataFrame) -> pd.DataFrame:
    """Min-max scale each column of ``df`` independently onto the range [-1, 1].

    Per column, the minimum maps to -1 and the maximum to +1. A column whose
    values are all identical has no spread to scale by; instead of dividing by
    zero (which would fill the column with NaN), such a column is mapped to
    -1 throughout.

    Args:
        df: Numeric data frame. It is not modified.

    Returns:
        A new data frame with the same index and columns as ``df``.
    """
    shifted = df - df.min(axis=0)
    col_range = shifted.max(axis=0)
    # Guard against 0/0 for constant columns: use a divisor of 1 where the range is 0.
    safe_range = col_range.where(col_range != 0, 1)

    unit_scaled = shifted / safe_range  # each column now lies in [0, 1]
    return (unit_scaled - 0.5) * 2  # stretch to [-1, 1]


# Prepare the Oura table for clustering: drop the columns that are not used as features,
# discard incomplete rows, then scale every remaining column to [-1, 1].
# Note: the "date" column is dropped rather than converted to a number.
date_index = all_table_data["date"]
# Keep each row's original index label so clustered rows can be traced back to the source table.
original_index_number = date_index.index.values
columns_to_drop = ["date", "Bedtime Start", "Bedtime End", "HRV Balance Score"]
without_some_cols = all_table_data.drop(axis=1, labels=columns_to_drop)

# Drop any rows that have NaNs, since the clustering does not accept those.
has_nan = without_some_cols.isna()
num_col_nan_sums = has_nan.sum(axis=0)  # NaN count per column (kept for inspection; unused below)
num_rows_nan_sums = has_nan.sum(axis=1)  # NaN count per row

rows_that_have_nan_values = num_rows_nan_sums > 0
num_days_with_nan_values = rows_that_have_nan_values.sum()
print("num rows discarded: ", num_days_with_nan_values)

# Build the keep-mask once and reuse it for both the features and the index labels,
# so the two selections cannot fall out of step.
rows_to_keep = ~rows_that_have_nan_values
feature_prep = without_some_cols[rows_to_keep]
print("num data rows to use: ", len(feature_prep))
normalised_features = normalise_column_values(feature_prep)
X = normalised_features

# Separate copy for bookkeeping columns, so that X itself stays features-only.
features_used_with_date = X.copy()
features_used_with_date["original_index_number"] = original_index_number[rows_to_keep]

In [None]:
# Agglomerative clustering of the normalised features into a fixed number of groups.
n_clusters = 7
distance_threshold = None  # unused while a fixed cluster count is requested

clustering = AgglomerativeClustering(
    n_clusters=n_clusters,
    distance_threshold=distance_threshold,
    compute_distances=True,  # plot_dendrogram reads the fitted model's `distances_`
)
clustering.fit(X)

print("num unique labels: ", len(set(clustering.labels_)))
print()

# Record each row's cluster id next to its features for the later cells.
features_used_with_date["category_from_clustering"] = clustering.labels_

In [None]:
# Draw the merge tree of the fitted clustering. The truncate_mode/p options are
# passed through to scipy's dendrogram to limit how deep the drawn tree goes.
plot_dendrogram(
    clustering,
    truncate_mode="level",
    p=4,
    labels=clustering.labels_,
)

In [None]:
import plotly.graph_objects as go


# Heatmap of every clustered row (y axis) against every column (x axis).
#
# `normalise_column_values` is deliberately NOT redefined in this cell: the definition from
# the feature-preparation cell above is reused, so both call sites share one implementation.

# Re-scale so that the bookkeeping columns added after clustering ("original_index_number",
# "category_from_clustering") share the same [-1, 1] colour range as the feature columns.
normalised_data = normalise_column_values(features_used_with_date)

# Group rows of the same cluster into contiguous bands.
# Sort by "original_index_number" instead to view the rows in their original order.
normalised_data = normalised_data.sort_values("category_from_clustering")

fig = go.Figure(
    [
        go.Heatmap(
            z=normalised_data.values,
            y=np.arange(len(normalised_data)),
            x=normalised_data.columns,
            colorscale="RdBu",
        )
    ]
)
fig.show()

In [None]:
# Which scores influence which group? A random forest trained to predict the cluster label
# from the features exposes a built-in feature-importance metric.

from sklearn.ensemble import RandomForestClassifier as RClf

# Fixed seed: forest training is randomised, so without it the ranking below
# would change from one run of the notebook to the next.
model = RClf(n_estimators=100, random_state=0)
model.fit(X, clustering.labels_)
importances = model.feature_importances_
# Spread of each feature's importance across the individual trees
# (computed for reference; not included in the printout below).
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)

# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

print("Feature Ranking:")

# Take the names from the frame the forest was fitted on, so they line up with
# `importances` by position regardless of how the clustering model was fitted.
feature_names = X.columns
for rank, feature_id in enumerate(indices):
    print(f"{rank} - {feature_names[feature_id]} ({importances[feature_id]})")