# Fitness Data Clustering Dashboard

## 0. Importing of Libraries needed for this project

In [15]:
#disable some annoying warnings
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

#plots the figures in place instead of a new window
%matplotlib inline
import pandas as pd
import numpy as np
import altair as alt
import ipywidgets as widgets
import seaborn as sns
from sklearn.impute import SimpleImputer
from sklearn import manifold
from sklearn import decomposition
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.cluster import KMeans, AffinityPropagation, MeanShift, SpectralClustering, AgglomerativeClustering, DBSCAN, OPTICS, Birch
from IPython.display import display, Image, Markdown
from scipy import stats
from datetime import datetime as dt
import numpy as np

print(alt.__version__)

import umap

4.1.0


In [16]:
# Helper functions


# Plotting:

def specify_palette_plt(categories):
    '''
    Returns a seaborn colour palette holding one colour per entry of `categories`.
    '''
    return sns.color_palette(n_colors=len(categories))

def specify_palette_altair(categories):
    '''
    Returns an Altair colour scale whose domain is `categories` and whose
    range is the matching seaborn palette expressed as hex strings.
    '''
    hex_colors = sns.color_palette(n_colors=len(categories)).as_hex()
    return alt.Scale(domain=categories, range=hex_colors)

def create_legend(axis, palette, labels=None):
    '''
    Adds a legend made of round colour markers to a matplotlib axis.

    axis:    object exposing a `legend(handles, labels)` method
    palette: indexable sequence of colours, one per label
    labels:  legend entries; when omitted the module-level `activities`
             is used, which keeps existing two-argument calls working
    '''
    if labels is None:
        labels = activities
    # One marker-only handle (no connecting line) per label.
    handles = [
        Line2D([0], [0], color=palette[i], marker='o', markersize=10, ls='')
        for i, _ in enumerate(labels)
    ]
    axis.legend(handles, labels)
    

# Printing markdown:

def printmd(string):
    '''
    Renders `string` as Markdown in the notebook output.
    '''
    rendered = Markdown(string)
    display(rendered)

## 1. Data Loading and Preprocessing

In [17]:
class Dataset():
    '''
    Loads an activity CSV export and derives two views of it.

    raw_df:            the file exactly as read by pandas
    cleaned_df:        unnecessary columns dropped, time strings converted to
                       floats, '--' replaced by NaN, activity names shortened,
                       plus an integer 'Target' column
    target:            list with the integer class index of every row
    activities:        unique activity names; position == class index
    downprojection_df: numeric array (no label/date columns), missing values
                       imputed and every column standardised
    '''

    def __init__(self, path, unnecessary_columns):
        '''
        path:                location of the CSV file
        unnecessary_columns: column names removed before any cleaning
        '''
        self.raw_df = pd.read_csv(path)
        self.cleaned_df = self._get_cleaned_df(unnecessary_columns) # this df can be used for task 2 & 3
        self.target, self.activities = self._encode_target()
        self.downprojection_df = self._get_downprojection_df() # this df is for task 4

    def _encode_target(self):
        '''
        Maps every activity name to an integer index, stores the indices in
        cleaned_df['Target'] and returns (indices, unique activity names).
        '''
        activities = self.cleaned_df['Activity Type'].unique()
        str2idx = {label: index for index, label in enumerate(activities)}
        target = [str2idx[label] for label in self.cleaned_df['Activity Type']]
        self.cleaned_df['Target'] = target
        return target, activities

    def decoder(self, encoded):
        '''
        Translates integer class indices back into activity names.
        '''
        idx2str = dict(enumerate(self.activities))
        return [idx2str[index] for index in encoded]

    def _get_cleaned_df(self, unnecessary_columns):
        '''
        Builds the cleaned dataframe from raw_df (raw_df itself is not modified,
        all work happens on the copy returned by drop).
        '''
        data = self.raw_df.drop(unnecessary_columns, axis=1)
        time_columns = ['Time', 'Avg Pace', 'Best Pace', 'Best Lap Time', 'Climb Time']
        data = self.__convert_time_columns(data, time_columns)
        data = data.replace('--', np.nan)
        data = self.__str2float(data)

        # Shorten strings in 'Activity Type'
        original_strings = ['Strength Training', 'Open Water Swimming', 'Pool Swimming', 'Gym & Fitness Equipment']
        replacement_strings = ['Strength', 'Swimming', 'Swimming', 'Gym']
        data = self.__shorten_target_strings(data, original_strings, replacement_strings)
        return data

    def __str2float(self, data):
        '''
        Converts strings to float

        Thousands separators are stripped from 'Calories' and 'Distance', then
        every column except 'Activity Type' and 'Date' is cast to float.
        '''
        for column in ('Calories', 'Distance'):
            # A column that pandas already parsed as numeric has no .str accessor
            # and nothing to strip.
            if not pd.api.types.is_numeric_dtype(data[column]):
                # regex=False: the comma is a literal, independent of the pandas default.
                data[column] = data[column].str.replace(",", "", regex=False)
        float_columns = data.columns.drop(['Activity Type', 'Date'])
        data[float_columns] = data[float_columns].astype(float)
        return data

    def __convert_time_columns(self, data, time_columns):
        '''
        Replaces every listed column by its converted (numeric) entries.
        '''
        for column in time_columns:
            data[column] = self.__get_converted_entries(data[column])
        return data

    def __get_converted_entries(self, entries):
        '''
        The time columns either consist of '--' values, time strings in different
        formats or plain decimal strings.
        This function converts all time strings into minutes (float).

        '--' and missing (NaN) entries are passed through unchanged so that the
        caller can turn them into NaN later. Fractions of a second are ignored.
        '''
        def to_minutes(text, time_format):
            # Parse once; hour is 0 for the formats that have no %H field.
            parsed = dt.strptime(text, time_format)
            return parsed.hour*60 + parsed.minute + parsed.second/60

        converted_entries = []

        for entry in entries:
            if entry == '--' or pd.isna(entry): # nan values
                converted_entries.append(entry)
            elif len(entry) > 9 and '.' in entry: # H:M:S.f
                converted_entries.append(to_minutes(entry, '%H:%M:%S.%f'))
            elif len(entry) > 5 and '.' in entry: # M:S.f
                converted_entries.append(to_minutes(entry, '%M:%S.%f'))
            elif len(entry) > 6: # H:M:S
                converted_entries.append(to_minutes(entry, '%H:%M:%S'))
            elif '.' not in entry: # M:S
                converted_entries.append(to_minutes(entry, '%M:%S'))
            else:
                # Short decimal string. NOTE(review): the original comment called
                # this a km/h -> min/km conversion, which would be 60/x rather
                # than x/60; the arithmetic is kept as it was - confirm intent.
                converted_entries.append(float(entry)/60)

        return converted_entries

    def __shorten_target_strings(self, data, original_strings, replacement_strings):
        '''
        Replaces each original activity name by its (shorter) replacement.
        '''
        for original, replacement in zip(original_strings, replacement_strings):
            # Literal replacement; names such as 'Gym & Fitness Equipment' are not patterns.
            data['Activity Type'] = data['Activity Type'].str.replace(original, replacement, regex=False)
        return data

    def _get_downprojection_df(self):
        '''
        Returns the feature matrix for downprojection: label, date and target
        columns removed, missing values imputed, columns standardised.
        '''
        downprojection_data = self.cleaned_df.drop(['Activity Type', 'Date', 'Target'], axis=1)
        downprojection_data = downprojection_data.astype(float)

        downprojection_data = self.__fillna(downprojection_data)
        downprojection_data = self.__normalize(downprojection_data)

        return downprojection_data

    def __fillna(self, data, strategy='constant'):
        '''
        Fills missing values

        Uses SimpleImputer with the given strategy and returns its
        fit_transform result.
        '''
        imputer = SimpleImputer(missing_values=np.nan, strategy=strategy)
        data = imputer.fit_transform(data)
        return data

    def __normalize(self, data):
        '''
        Standardises every column of a 2-D array in place (zero mean, unit
        standard deviation) and returns the array.

        A constant column has zero standard deviation; it is set to 0 instead
        of dividing 0 by 0, which would fill the column with NaN.
        '''
        for column in range(data.shape[1]):
            values = data[:, column]
            centered = values - values.mean()
            spread = values.std()
            if spread == 0:
                data[:, column] = 0.0
            else:
                data[:, column] = centered/spread
        return data

The dataset is initially cleaned through the removal of unnecessary columns containing nan, zero, or string entries. Missing values, represented as '--' strings, are substituted with np.nan. The time column strings are converted to float values using get_converted_entries() due to differing time formats. The Calories and Distance string columns are transformed into floats. Activity Type names are shortened and summarized, such as 'Open Water Swimming' and 'Pool Swimming' becoming 'Swimming'. The resultant "cleaned dataset" serves for statistics and correlation graphs.
Further preparation is required for downprojection and clustering, undertaken in the get_downprojection_df() method. Target columns and the date column are eliminated prior to applying machine learning.

As downprojection algorithms can't process np.nan, they are replaced with 0.
Dropping them would have been no option because whole activity type classes would be neglected, and replacing them by the mean or median would skew the dataset because it does not make sense to assign a distance to a yoga activity for instance.
Ultimately, the downprojection data is normalized through standard scaling.

In [18]:
# Location of the CSV file handed to Dataset (read there with pd.read_csv).
path = 'data/Garmin_Activities_all.csv'

# Columns that Dataset drops from the raw dataframe before any other cleaning step.
unnecessary_columns = ['Favorite', 'Title', 'Avg Vertical Ratio', 'Avg Vertical Oscillation', 'Avg Run Cadence.1', 
                       'Max Run Cadence.1', 'Training Stress Score®', 'Grit', 'Flow', 'Total Strokes', 'Avg. Swolf',
                       'Avg Stroke Rate', 'Total Reps', 'Total Sets', 'Bottom Time', 'Min Temp', 'Surface Interval',
                       'Decompression', 'Max Temp', 'Avg Resp', 'Min Resp', 'Max Resp', 'Stress Change', 'Stress Start', 
                       'Stress End', 'Avg Stress']

# Build the dataset once and expose the pieces the following cells use as globals:
# the cleaned dataframe, the integer targets and the activity names.
data = Dataset(path, unnecessary_columns)
cleaned_data = data.cleaned_df
target, activities = data.target, data.activities
# Show the first rows of the cleaned dataframe as the cell output.
cleaned_data.head()

Unnamed: 0,Activity Type,Date,Distance,Calories,Time,Avg HR,Max HR,Avg Run Cadence,Max Run Cadence,Avg Pace,Best Pace,Elev Gain,Elev Loss,Avg Stride Length,Climb Time,Best Lap Time,Number of Laps,Target
0,Walking,2020-12-04 10:31:47,3.7,276.0,55.7,132.0,177.0,100.0,158.0,15.066667,3.066667,83.0,104.0,0.67,55.7,10.533333,4.0,0
1,Breathwork,2020-12-01 11:10:04,0.0,,9.366667,76.0,92.0,,,,,,,0.0,9.366667,0.0,160.0,1
2,Breathwork,2020-11-30 21:02:16,0.0,,5.483333,98.0,106.0,,,,,,,0.0,5.483333,0.033333,101.0,1
3,Cardio,2020-11-30 20:09:43,0.0,300.0,51.716667,136.0,173.0,,,,,,,0.0,51.716667,51.7,1.0,2
4,Walking,2020-11-30 15:37:49,3.21,158.0,48.516667,96.0,113.0,103.0,158.0,15.133333,7.833333,93.0,62.0,0.64,48.516667,3.133333,4.0,0


## 2. Descriptive Statistics


### First we take a look at the structure of the cleaned dataset.
Aim:
* To give the user an overview about class distribution by using a bar chart.
* Knowing the size of a class should help the user judge the reliability of statistics which will be presented later in this section. (The mean of a small class may not be as representative as the mean of a class with
 hundreds of samples).
* Using a similar color scheme as in the Correlation task should allow a better overview.

In [19]:
# Markdown summary of the cleaned dataset: row count, feature count
# (column count minus the 'Activity Type' and 'Target' columns) and the set of class names.
printmd(f'This is a training dataset. The cleaned dataset contains: \
\n * {cleaned_data.shape[0]} datapoints \
\n   * each datapoint has a label and {cleaned_data.shape[1]-2} features (excluding: Activity Type, Target) \
\n   * each datapoint can belong to one of the following classes:  \
\n {set(cleaned_data["Activity Type"])}')

This is a training dataset. The cleaned dataset contains: 
 * 421 datapoints 
   * each datapoint has a label and 16 features (excluding: Activity Type, Target) 
   * each datapoint can belong to one of the following classes:  
 {'Yoga', 'Strength', 'Walking', 'Cardio', 'Breathwork', 'Gym', 'Swimming', 'Running', 'Cycling'}

In [20]:
# Specifying a color scheme:
# (module-level `palette` is also read by a later plotting cell)
palette = specify_palette_altair(activities)

# Plotting class distribution:
# horizontal bar chart, one bar per activity type, bar length = number of rows
class_distribution=alt.Chart(cleaned_data).mark_bar(size=20).encode(
            y = alt.Y('Activity Type', title='classes'),
            x = alt.X('count()'),
            color=alt.Color('Activity Type:N', 
                            scale=palette, 
                            legend = alt.Legend(symbolSize=400, orient="top")),
            tooltip='count()'
        ).properties(
            width=380,
            height=300,
            title="class distribution"
        )

# Last expression of the cell: displays the chart.
class_distribution

#### Interpretation:
The frequency of individual activities in this dataset varies quite a bit. Some activities were carried out only a couple of times (like "Gym", 2 times), others were carried out a lot (like "Walking", 159 times). Statistics will be far more representative for activities which were carried out a lot. E.g.: the two datapoints of the activity "Gym" will not tell us much about where the true mean of, e.g., "Calories" lies. Furthermore, the issue of missing values is bigger for activities which were carried out less often.

### When was this Dataset created?
Aim:
* Give overview of the "Date" attribute of each datapoint.
* The user should be able to see if the "Date" attribute is realistic or if it contains obvious errors.

In [7]:
def draw_time_plot(df):
    '''
    Draws one translucent red tick per row of `df` along a time axis
    (labelled month + year); the returned chart can be zoomed and panned.
    '''
    tick_style = dict(
        opacity=0.2,
        color='red',
        thickness=2,
        size=40 * 0.9,  # controls width of tick.
    )
    date_axis = alt.X('Date:T', axis=alt.Axis(format='%B %Y'))
    ticks = alt.Chart(df).mark_tick(**tick_style).encode(
        x=date_axis,
        tooltip=['Activity Type', 'Date:T'],
    )
    return ticks.interactive()

printmd("This graph can be zoomed!")


@widgets.interact(selected=['All Data', 'Walking', 'Cardio', 'Breathwork', 'Cycling', 'Yoga', 'Swimming', 'Gym', 'Strength', 'Running'])
def plotter(selected):
    '''
    Time plot for the chosen dropdown entry: the whole dataset for
    'All Data', otherwise only the rows of the selected activity.
    '''
    if selected == 'All Data':
        return draw_time_plot(cleaned_data)
    # Keep the rows whose activity name contains the selected string.
    row_mask = [selected in name for name in cleaned_data["Activity Type"]]
    return draw_time_plot(cleaned_data.iloc[row_mask])

This graph can be zoomed!

interactive(children=(Dropdown(description='selected', options=('All Data', 'Walking', 'Cardio', 'Breathwork',…

#### Interpretation:
The dataset spans from April 5th, 2018 to December 4th, 2020. However, the graph indicates a concentration of entries between February and December 2020. Analyzing each activity and its corresponding dates offers valuable insights into dataset creation. Focusing solely on 2020, the predominant year of data collection, reveals patterns: "Cycling" primarily occurred during summer, while "Walking" persisted throughout the year. This examination serves as a "sanity check", demonstrating consistent and distributed engagement in frequent activities like "Running", "Walking", "Breathwork", "Cycling," and "Yoga" over an extended period, rather than clustered occurrences.

### Now let's get an overview of the distribution of the other attributes!
Aim:
* Give an overview of the distribution of all the attributes for each class, using boxplots.
* Since some attributes contain very large numbers and others contain very small numbers, the user can (optionally) get the attributes plotted at log2 scale.

In [8]:
def draw_stat_boxplot(df, att, plot_title):
    '''
    Draws one horizontal boxplot per column listed in `att`.

    The columns are folded into key/value pairs so that every attribute
    gets its own row on the y axis; `plot_title` becomes the chart title.
    '''
    folded = alt.Chart(df).transform_fold(att, as_=['key', 'value'])
    boxes = folded.mark_boxplot().encode(x='value:Q', y='key:N')
    return boxes.properties(title=plot_title)

In [9]:
@widgets.interact(
    activity=['All Activities', 'Cardio', 'Breathwork', 'Walking', 'Cycling', 'Yoga', 'Swimming', 'Gym', 'Strength', 'Running'],
    scale=["linear", "log2"],
)
def plotter(activity, scale):
    '''
    Boxplots of the attribute distributions, either for all activities or
    for a single one, on a linear or log2 scale.

    For 'All Activities' on the linear scale the attributes with larger
    values (Calories, Distance, Elev Gain, Elev Loss) get their own chart
    placed next to the remaining attributes.
    '''
    if activity == 'All Activities':
        overall_title = f'{scale} scale Distribution of attibutes within in All Activities'
        if scale != "linear":
            logged = np.log2(cleaned_data.drop(['Activity Type', 'Date', 'Target'], axis=1))
            return draw_stat_boxplot(logged, list(logged.columns), overall_title)

        small_valued = cleaned_data.drop(
            ['Activity Type', 'Calories', 'Elev Gain', 'Elev Loss', 'Date', 'Target', 'Distance'],
            axis=1,
        )
        large_valued = cleaned_data[['Calories', 'Distance', 'Elev Gain', 'Elev Loss']]
        left = draw_stat_boxplot(small_valued, list(small_valued.columns), overall_title)
        right = draw_stat_boxplot(
            large_valued,
            list(large_valued.columns),
            f'{scale} scale distribution for attributes with larger values',
        )
        return left | right

    # Single activity: keep rows whose activity name contains the selection.
    row_mask = [activity in name for name in cleaned_data["Activity Type"]]
    subset = cleaned_data.iloc[row_mask].drop(['Activity Type', 'Target', 'Date'], axis=1)
    # Only plot attributes that have at least one value for this activity.
    populated = [
        column for column in subset.columns
        if subset[column].isnull().sum() != len(subset)
    ]
    if scale != "linear":
        subset = np.log2(subset)
    return draw_stat_boxplot(subset, populated, f'{scale} scale Distribution of attibutes within in {activity}')


interactive(children=(Dropdown(description='activity', options=('All Activities', 'Cardio', 'Breathwork', 'Wal…

#### Interpretation:

The graphs reveal that certain classes lack entries for specific activities, indicating that some attributes have been "dropped." For instance, "Swimming" and "Yoga" lack "Elevation gain" data. In many cases, attributes that are irrelevant to a given activity were excluded as expected. However, there are instances where irrelevant attributes were instead set to zero. For example, in the "Gym" activity, all values, except for "Calories," were zeroed.
Examining the distribution of non-empty attribute values gives significant insights. Most notably, it highlights a big variation in attribute value distribution across distinct classes. For instance, within the "Breathwork" class, all "Avg Heartrate" values fall below the median for the same attribute across the entire dataset.

### Since the distribution of attribute values differs a lot between classes, let's plot an individual attribute and label the class of each datapoint:
Aim:
* We have seen that the distribution of individual attributes varies a lot between classes, but the graphs before made it really hard to judge which attribute value ranges correspond to which class. The following
graph should show exactly that.

In [10]:
@widgets.interact(selected=['Max HR', 'Distance', 'Calories', 'Time', 'Avg HR', 'Avg Run Cadence', 'Max Run Cadence', 'Avg Pace', 'Best Pace', 'Elev Gain', 'Elev Loss', 'Avg Stride Length', 'Climb Time', 'Best Lap Time', 'Number of Laps'])
def plotter(selected):
    '''
    Strip plot of one attribute: x is the attribute value, y is random
    jitter (so overlapping points spread out), colour is the activity type.
    '''
    x_channel = alt.X(
        selected + ':Q',
        axis=alt.Axis(labelFontSize=8, tickCount=4),
    )
    y_channel = alt.Y(
        'jitter:Q',
        axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
        scale=alt.Scale(),
    )
    color_channel = alt.Color(
        'Activity Type:N',
        scale=palette,
        legend=alt.Legend(symbolSize=400, orient="top"),
    )

    points = alt.Chart(cleaned_data, width=50).mark_circle(opacity=0.7).encode(
        x=x_channel,
        color=color_channel,
        tooltip=['Activity Type', selected],
        y=y_channel,
    )
    # The jitter field is computed inside the chart by this expression.
    jittered = points.transform_calculate(
        jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
    )
    return jittered.interactive().properties(width=800, height=80)

interactive(children=(Dropdown(description='selected', options=('Max HR', 'Distance', 'Calories', 'Time', 'Avg…

### Interpretation
The graph illustrates that specific attributes such as "Max Heartrate" allow for a clean separation of activities (e.g., "Running" vs. "Breathwork").
In other attributes, a clean separation is not straightforward.
The attribute “Time” for example makes it impossible to separate classes based on their value range. However, the attribute “Time” reveals another interesting insight, that was not obvious from the boxplots in the graph above: Nearly all instances of Breathwork exercises lasted approximately 6 or 11 minutes, with minimal entries falling between these time frames.


### Comparing two attributes of two Classes/Activity Types:

Aim:
* The last graph showed that in some cases two classes can be distinguished by only looking at value ranges of one attribute, if those ranges are disjoint. But if there is some overlap, it is hard for the user to
judge how large this overlap is. By comparing two boxplots of the attribute values of two classes, the user can understand this better.
* Give the user a "higher level" way of comparing two classes when looking at a specific attribute.

In [11]:
def stat_boxplot(df, sel, activity):
    '''
    Draws a single horizontal boxplot of column `sel` of `df`,
    titled with the activity name.
    '''
    folded = alt.Chart(df).transform_fold([sel], as_=['key', 'value'])
    box = folded.mark_boxplot().encode(x='value:Q', y='key:N')
    return box.properties(title=activity)
def double_boxplot(df_1, df_2, sel, activity1, activity2):
    '''
    Stacks the boxplots of attribute `sel` for two activities vertically,
    sharing the x scale so both boxes can be compared directly.
    '''
    upper = stat_boxplot(df_1, sel, activity1)
    lower = stat_boxplot(df_2, sel, activity2)
    stacked = alt.vconcat(upper, lower)
    return stacked.resolve_scale(x='shared')
printmd("No Boxplot = this activity has no entries of the attribute!")


@widgets.interact(
    attribute=['Max HR', 'Distance', 'Calories', 'Time', 'Avg HR', 'Avg Run Cadence', 'Max Run Cadence', 'Avg Pace', 'Best Pace', 'Elev Gain', 'Elev Loss', 'Avg Stride Length', 'Climb Time', 'Best Lap Time', 'Number of Laps'],
    activity1=['Walking', 'Cardio', 'Breathwork', 'Cycling', 'Yoga', 'Swimming', 'Gym', 'Strength', 'Running'],
    activity2=['Running', 'Cardio', 'Breathwork', 'Walking', 'Cycling', 'Yoga', 'Swimming', 'Gym', 'Strength'],
)
def plotter(attribute, activity1, activity2):
    '''
    Compares one attribute between two activities with stacked boxplots.
    '''
    def rows_of(selected):
        # Rows whose activity name contains the selected string.
        row_mask = [selected in name for name in cleaned_data["Activity Type"]]
        return cleaned_data.iloc[row_mask]

    first_rows = rows_of(activity1)
    second_rows = rows_of(activity2)
    return double_boxplot(first_rows, second_rows, attribute, activity1, activity2)

No Boxplot = this activity has no entries of the attribute!

interactive(children=(Dropdown(description='attribute', options=('Max HR', 'Distance', 'Calories', 'Time', 'Av…

### Interpretation

These graphs allow to better assess the degree of overlap between two classes with respect to a specific attribute.
For instance, previously identified attributes like "Max Heartrate" and "Avg Heartrate" exhibit notably separate value ranges for certain activities: "Avg Heartrate," reveals distinct, non-overlapping classes when comparing "Running vs. Breathwork" or "Running vs. Yoga”.

These boxplots also facilitate an understanding of the extent to which overlapping classes diverge. Consider "Avg Stride length" for "Walking" and "Running." Despite slight separation in medians (0.81 vs. 0.86), they show similar interquartile ranges. However, comparing "Calories" between these classes shows more divergence. The medians are significantly apart (71 vs. 331), and their interquartile ranges also differ significantly (92.5 vs. 292.5). Moreover, 75% of "Walking" datapoints fall within a range occupied by only 25% of "Running" datapoints, indicating a considerable degree of separation below 165 "Calories".

Furthermore, these boxplots highlight attribute value variances between specific activities. For example, they show that "Cycling" has notably lower variance in "Distance" than "Running", raising the question whether a similar variance pattern exists for "Time". Indeed, this correlation is true, indicating a connection between attribute values. This relationship will be further explored in chapter "3. Correlations".

### Now it would be interesting to compare the Standard Deviation of Attributes between Classes/Activities

Aim:
* To allow the user to compare multiple Standard deviations at once between two classes.

In [12]:
def plot_std(df,attribute):
    '''
    Bar chart of standard deviations.

    df:        dataframe with the columns 'Attributes' (bar labels) and
               'std' (bar heights)
    attribute: text appended to the chart title 'Standard deviation of ...'
    '''
    std_distibution=alt.Chart(df).mark_bar(size=20).encode(
            # The bars are attributes, so the axis is titled accordingly
            # (it used to read 'classes').
            x = alt.X('Attributes', title='attributes'),
            y = alt.Y('std'),
            tooltip='std'
        ).properties(
            width=380,
            height=300,
            title=f'Standard deviation of {attribute}'
        ).resolve_scale(
        y='shared'
        )

    return std_distibution

In [13]:
def plot_std_of_activity(Activity):
    '''
    Plots the standard deviation of every numeric attribute.

    Activity: activity name whose rows are used, or 'All Activities'
              to use the whole cleaned dataset
    Returns the bar chart produced by plot_std.
    '''
    float_data = cleaned_data.drop(['Activity Type', 'Date', 'Target'], axis=1)
    if Activity != 'All Activities':
        # Keep rows whose activity name contains the selected string.
        row_mask = [Activity in name for name in cleaned_data["Activity Type"]]
        float_data = float_data.iloc[row_mask]

    # One vectorised call instead of growing a dataframe row by row.
    deviations = float_data.std()
    std_df = pd.DataFrame({
        'Attributes': list(deviations.index),
        'std': deviations.to_numpy(),
    })
    return plot_std(std_df,Activity)

In [14]:
@widgets.interact(
    Activity_1=['All Activities', 'Walking', 'Cardio', 'Breathwork', 'Cycling', 'Yoga', 'Swimming', 'Gym', 'Strength', 'Running'],
    Activity_2=['Walking', 'All Activities', 'Cardio', 'Breathwork', 'Cycling', 'Yoga', 'Swimming', 'Gym', 'Strength', 'Running'],
)
def plotter(Activity_1, Activity_2):
    '''
    Places the standard-deviation charts of two selections side by side
    on a shared y scale.
    '''
    left_chart = plot_std_of_activity(Activity_1)
    right_chart = plot_std_of_activity(Activity_2)
    side_by_side = alt.hconcat(left_chart, right_chart)
    return side_by_side.resolve_scale(y='shared')

interactive(children=(Dropdown(description='Activity_1', options=('All Activities', 'Walking', 'Cardio', 'Brea…

### Interpretation

This graph allows to compare standard deviations between Activities/Classes in the fitness dataset, providing valuable insights into training patterns. It shows the “regularity”, i.e., the similarity between training sessions within a class – low standard deviations imply high session regularity. By comparing "regularity" across classes, one can delve into training consistency.

For example, comparing "Breathwork" and "Walking" illustrates extreme contrast. "Breathwork" shows low standard deviations, indicating highly similar sessions. In contrast, "Walking" exhibits high standard deviations, implying session dissimilarity. "Breathwork" thus possesses higher "regularity". A deeper look reveals that "Walking" sessions vary significantly in attributes like "Calories", "Climb Time", "Elevation Gain", "Elevation Loss", "Max Run Cadence", and "Time." In contrast, "Breathwork" mainly differs in "Number of Laps".

A less extreme example is comparing "Walking" and "Cycling", where "Cycling" maintains higher "regularity" than "Walking."

Examining the std-plot for "All Activities" uncovers interesting insights. Individual training sessions differ significantly, not surprising given varied activities. Particularly, "Calories" entries are widely spread, displaying the most dissimilarity across sessions. This attribute stands out as the least similar among training sessions.

## 3. Correlations


After some descriptive statistics and exploratory analysis, the correlation between the attributes is analyzed.

In the next part, the user has the opportunity to choose attributes and compare them in a scatterplot
matrix. Moreover, the additional selection box allows filtering by Activity Type, in case someone is only interested in specific activities. Besides the scatterplot matrix, a colored heatmap, which shows Pearson's correlation coefficient, is also provided. The heatmap is placed directly under the scatterplot matrix and displays the preselected attributes and the corresponding correlation coefficient. To make it easier for the user to interpret the numbers, the heatmap is color encoded. (Blue indicates positive correlation between attributes and red indicates negative correlation, with scaled hue depending on the strength of the correlation.)

In [12]:
# Scatterplot matrix

printmd("<br>**Choose attributes and activity type.**<br>Multiple options can be selected with shift and/or ctrl (or command) pressed and mouse clicks.")

@widgets.interact(attribute_list=widgets.SelectMultiple(
    options=cleaned_data.columns.drop(['Activity Type', 'Date', 'Target', 'Avg Stride Length', 'Best Lap Time', 'Number of Laps']),
    value=['Distance', 'Calories', 'Avg HR', 'Time'],
    description='Attributes',
    disabled=False
), activity_list=widgets.SelectMultiple(
    options=activities,
    value=list(activities),
    description='Activity Type',
    disabled=False
))
def plott(attribute_list,activity_list):
    '''
    Scatterplot matrix of the selected attributes with a Pearson correlation
    heatmap underneath, both restricted to the selected activity types.

    attribute_list: names of the attribute columns to compare
    activity_list:  activity names; a row is kept when its activity name
                    contains any of them
    '''
    category_mask = [any(string in activity for string in activity_list)
                     for activity in cleaned_data["Activity Type"]]
    # Filter once; both charts work on this copy and never touch cleaned_data.
    selected_rows = cleaned_data[category_mask]

    def correlation_plot(attribute_list):
        # Scatterplot matrix: every selected attribute against every other one.
        palette = specify_palette_altair(activities)

        chart = alt.Chart(selected_rows).mark_circle(opacity=alt.Value(0.7)).encode(
            alt.X(alt.repeat("column"), type='quantitative'),
            alt.Y(alt.repeat("row"), type='quantitative'),
            color=alt.Color('Activity Type:N', 
                            scale=palette, 
                            legend = alt.Legend(symbolSize=400, orient="top")),
            tooltip=['Activity Type','Date','Distance', 'Avg HR']
        ).properties(
            width=150,
            height=150
        ).repeat(
            row=attribute_list,
            column=attribute_list
        )

        return chart

    def correlation_heatmap(attribute_list):
        # Correlation Heatmap
        # Coerce the selected columns to numbers on the local copy only; the
        # shared cleaned_data dataframe is no longer rewritten on every redraw.
        df_cor = selected_rows[list(attribute_list)].apply(pd.to_numeric, errors='coerce')
        # Long format (var1, var2, correlation); undefined correlations become 0.
        corrMatrix = df_cor.corr().reset_index().melt('index').fillna(0)
        corrMatrix.columns = ['var1', 'var2', 'correlation']

        chart = alt.Chart(corrMatrix,title="Correlation Heatmap").encode(
            x=alt.X('var1', title=None, sort=None, axis=alt.Axis(labelAngle=0,
                                                                 labelFontStyle="bold",
                                                                 labelPadding=8,                                                                 
                                                                 labelFontSize=11)),
            y=alt.Y('var2', title=None, sort=None, axis=alt.Axis(labelAngle=-90,
                                                                 labelFontStyle="bold",
                                                                 labelPadding=15,
                                                                 labelFontSize=11)),
            color=alt.Color('correlation')
        ).properties(
            width=alt.Step(220),
            height=alt.Step(80)
        )

        rects = chart.mark_rect().encode(
            color=alt.Color('correlation', scale=alt.Scale(scheme="redblue", domain=[-1,1]))
        )

        text = chart.mark_text(
            size=24
        ).encode(
            text=alt.Text('correlation', format=".2f"),
            # White text on every strongly coloured cell: dark blue (strongly
            # positive) and dark red (strongly negative) alike.
            color=alt.condition(
                "abs(datum.correlation) > 0.5",
                alt.value('white'),
                alt.value('black')
            )
        )
        return (rects + text)

    scatter = correlation_plot(attribute_list)
    heatmap = correlation_heatmap(attribute_list)

    return alt.vconcat(scatter, heatmap, spacing=60).configure_scale(bandPaddingInner=0.28, 
                                                                     bandPaddingOuter=0)

<br>**Choose attributes and activity type.**<br>Multiple options can be selected with shift and/or ctrl (or command) pressed and mouse clicks.

interactive(children=(SelectMultiple(description='Attributes', index=(0, 1, 3, 2), options=('Distance', 'Calor…

#### Interpretation:
The scatterplot matrix, along with the heatmap, offers intriguing insights into the data and the interrelationship among attributes. When examining 'Time' and 'Calories', it can be concluded that the longer an activity lasts, the more calories are burned.

The heatmap emphasizes this statement with a correlation coefficient of 0.87. There is also a quite strong positive correlation between 'Calories' and 'Avg HR', which indicates that the more intense an activity is, the more calories are burned. The Pearson's correlation coefficient between 'Avg Pace' and 'Avg Run Cadence' is -0.77. The 'Avg Run Cadence' is only tracked for Running or Walking and measures the steps per minute. We can therefore conclude that the higher the 'Avg Pace', the fewer steps per minute are made.

The almost perfect correlation between 'Elev Gain' and 'Elev Loss' can be explained with the preferred routes of the athlete. It seems to be that most of the time the startpoint of an activity is equal to the end point. (e.g. the athlete starts cycling from home and finishes the tour by arriving at home again.) Therefore, the gained elevation has to be equal to the lost elevation. Also 'Time' correlates strongly positive with 'Elev Gain' and 'Elev Loss' (0.70, 0.71), which leads to the assumption that the longer an activity lasts, the more vertical meters are made. All in all this kind of visualization is very handy to get an overview over the data and it allows to draw some assumptions about it.

### 4. Dimension Reduction Techniques



Choose one of the following techniques in the dropdown menu and see which is the best for your data.
* PCA: Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space. 
* MDS: Means of visualizing the level of similarity of individual cases of a dataset. 
* ICA: A computational method for separating a multivariate signal into additive subcomponents.
* t-SNE: A dimensionality reduction technique that is based on Stochastic Neighbor Embedding.
* UMAP: A nonlinear dimensionality reduction method.


In [13]:
downprojection_data = data.downprojection_df

@widgets.interact(components=[3,4,5])
def downprojection(components):
    """Interactively project ``downprojection_data`` and plot the result.

    A nested dropdown selects the projection method. Pressing its manual
    "Run" button fits the method and draws X1 against every remaining
    component, one subplot per component, coloured by activity.

    Args:
        components: Number of output dimensions (3, 4 or 5 from the widget).

    Side effects:
        Rebinds the module-level ``labels`` (axis names ``X1``..``Xn``) and,
        once a projection has been run, the module-level ``downprojection``
        (the embedded data). The clustering cell reads both. Rebinding
        ``downprojection`` replaces this function's module-level name; the
        already registered widget is expected to keep working because it
        holds its own reference to the function.
    """
    global labels
    labels = ['X1', 'X2', 'X3', 'X4', 'X5'][0:components]

    # Estimators are created on demand, so every run starts from a fresh,
    # unfitted object and unused methods cost nothing.
    reducers = {
        'PCA': lambda: decomposition.PCA(n_components=components),
        'MDS': lambda: manifold.MDS(n_components=components, random_state=42),
        'ICA': lambda: decomposition.FastICA(n_components=components),
        't-SNE': lambda: manifold.TSNE(n_components=components),
        'UMAP': lambda: umap.UMAP(n_components=components),
    }

    # scikit-learn's default (Barnes-Hut) t-SNE only supports fewer than four
    # output dimensions, and it reports that at fit time rather than in the
    # constructor. Hence the explicit check instead of a try/except.
    if components > 3:
        printmd('t-SNE only works with three components.')
        projections = ['PCA','MDS','ICA', 'UMAP']
    else:
        projections = ['PCA','MDS','ICA','t-SNE', 'UMAP']

    @widgets.interact_manual(projection=projections)
    def plot_downprojection(projection):
        """Fit the chosen projection and scatter X1 against X2..Xn."""
        global downprojection
        downprojection = reducers[projection]().fit_transform(downprojection_data)

        palette = specify_palette_plt(activities)
        fig, ax = plt.subplots(1,components-1, figsize=[20,7])
        create_legend(ax[0], palette)

        # One colour per sample, looked up from its encoded activity.
        point_colors = [palette[t] for t in target]
        for i, axis in enumerate(ax):
            axis.scatter(downprojection[:, i+1], downprojection[:, 0], s=30, c=point_colors, alpha=0.7)
            axis.set(xlabel=labels[i+1], ylabel="X1")


interactive(children=(Dropdown(description='components', options=(3, 4, 5), value=3), Output()), _dom_classes=…

**Interpretation:**


* PCA
  * Forms some visible clusters (e.g. Running, Walking and Cycling, Yoga and Breathwork), however the datapoints are very close to each other and the clusters are overlapping.

* MDS
  * Compared to PCA the samples in MDS are clustered better, but still the clusters are not easily separable.
  
* ICA
  * The datapoints have a little more space between each other compared to MDS.
  * Similar activities such as Walking and Running are clustered together.

* UMAP
  * UMAP separates clusters really well from each other compared to other downprojections.
  * Some datapoints lie within clusters of different activities, for example some Running points in Walking, some Strength points in Cycling and some Yoga points in Breathwork.
  * Also, Breathwork points form 3 different main clusters.

* t-SNE
  * Only works with a number of components = 3.
  * Similar to MDS, but datapoints are more separated.
  
It can be concluded that Walking and Running are the most obvious clusters in all algorithms; however, this may be due to the fact that there is an imbalance in the distribution of activities and these activities make up a big part of the dataset. Overall, the UMAP method gives the most clearly separable clusters, although even there some outliers can be observed.

## 5. Clustering Algorithms
The following clustering algorithms are available to choose from:

* Spectral clustering: performs a low-dimension embedding of the affinity matrix between samples, followed by clustering, e.g., by KMeans, of the components of the eigenvectors in the low dimensional space.

* Hierarchical clustering: a general family of clustering algorithms that build nested clusters by merging or splitting them successively. 
    * Ward: minimizes the sum of squared differences within all clusters.
    * Complete linkage: minimizes the maximum distance between observations of pairs of clusters.
    * Average linkage: minimizes the average of the distances between all observations of pairs of clusters.
    * Single linkage: minimizes the distance between the closest observations of pairs of clusters.


* DBSCAN: views clusters as areas of high density separated by areas of low density. Clusters can be of different shapes.


* Birch: It is a memory-efficient, online-learning algorithm provided as an alternative to MiniBatchKMeans. It constructs a tree data structure with the cluster centroids being read off the leaf. These can be either the final cluster centroids or can be provided as input to another clustering algorithm such as AgglomerativeClustering.


In [14]:
# Heading shown above the clustering controls.
printmd('<br>**Choose clustering algorithm and parameters.**')
# `downprojection` and `labels` are module-level globals written by the
# dimension-reduction cell, so a projection must have been run there first.
df = pd.DataFrame(downprojection, columns=labels)
df[['activity_type', 'Date']] = cleaned_data[['Activity Type', 'Date']]
# All remaining columns are copied over; each one later gets its own jitter plot.
attributes = cleaned_data.columns.drop(['Activity Type', 'Date', 'Target', 'Avg Stride Length', 'Best Lap Time', 'Number of Laps'])
df[attributes] = cleaned_data[attributes]
# Shorten attribute names (e.g. 'Avg Run Cadence' -> 'Avg Cad'). The same rule
# is applied to `attributes` so the names keep matching df's columns.
df.columns = [i.replace('Run ', '').replace('ence', '') for i in df.columns]
attributes = [i.replace('Run ', '').replace('ence', '') for i in attributes]
instance = 'Place holder' # place holder for the clustering instance

def drawplot(m,**kwargs):
    """Fit the selected clustering algorithm and build the linked dashboard.

    Args:
        m: Display name of the clustering algorithm; must be one of the keys
            of ``estimators`` below.
        **kwargs: Forwarded unchanged to the estimator's constructor.

    Returns:
        An Altair chart: the cluster scatter plot next to the target bar
        chart, with one jitter plot per entry of ``attributes`` underneath.

    Raises:
        ValueError: If ``m`` is not a known algorithm name.

    Side effects:
        Rebinds the module-level ``instance`` and writes the ``prediction``
        and ``target`` columns of the module-level ``df``.
    """
    global instance
    estimators = {
        'K-means': KMeans,
        'Affinity propagation': AffinityPropagation,
        'Spectral Clustering': SpectralClustering,
        'Hierarchical Clustering': AgglomerativeClustering,
        'DBSCAN': DBSCAN,
        'OPTICS': OPTICS,
        'Birch': Birch,
    }
    if m not in estimators:
        # Fail loudly instead of silently reusing a previous estimator.
        raise ValueError(f"Unknown clustering algorithm: {m!r}")
    instance = estimators[m](**kwargs)

    printmd("<br>**Filter by prediction label:**<br>Click on the legend elements (multiple elements can be selected with shift pressed and mouse clicks).")
    printmd("**Filter by target (activity type):**<br>Click on the respective bars in the bar chart.")
    printmd("Select **specific points** by brushing in cluster chart.")

    clusters_assignment = instance.fit_predict(downprojection_data)
    # Labels are shifted by one for display.
    df['prediction'] = clusters_assignment+1
    df['target'] = target
    cluster_ids = np.unique(clusters_assignment+1)
    palette_prediction = specify_palette_altair(cluster_ids)

    # Selections shared by all charts; this is what links them together.
    brush = alt.selection_interval()
    prediction_selection = alt.selection_multi(fields=['prediction'], bind='legend')
    target_selection = alt.selection_multi(fields=['activity_type'])

    def axis_scale(column):
        """Scale spanning exactly the observed range of ``column``."""
        return alt.Scale(domain=[df[column].min(), df[column].max()])

    def scatter_plot():
        """Embedding scatter (X2 vs. X1), coloured by predicted cluster."""
        return alt.Chart(df).mark_circle().encode(
            x = alt.X('X2', scale=axis_scale('X2')),
            y = alt.Y('X1', scale=axis_scale('X1')),
            color = alt.condition(brush, 'prediction:N',
                                  alt.value('lightgray'),
                                  scale=palette_prediction,
                                  legend = alt.Legend(symbolSize=400, orient="top", title="prediction", values=cluster_ids)),
            tooltip=['Date']
        ).properties(
            width=380,
            height=300,
            title="Cluster Chart"
        ).add_selection(brush, prediction_selection, target_selection
        ).transform_filter(prediction_selection
        ).transform_filter(target_selection
        )

    def bar_chart():
        """Activity counts, with each bar split by predicted cluster."""
        return alt.Chart(df).mark_bar(size=20).encode(
            y = alt.Y('activity_type:N', title='target'),
            x = alt.X('count()'),
            color = alt.condition(target_selection, 'prediction:N',
                                  alt.value('lightgray'),
                                  legend=None)
        ).properties(
            width=380,
            height=300,
            title="Bar Chart: Target vs. Prediction"
        ).transform_filter(brush
        ).transform_filter(prediction_selection
        ).add_selection(target_selection)

    def get_heatmap_data():
        """Per-target share of each predicted cluster.

        Not used by the dashboard below; kept for a target/prediction heatmap.
        """
        df_hm = df.groupby(['target','prediction']).size().reset_index().rename(columns={0:'count'})
        target_counts = df.groupby('target').count()['prediction']
        # Positional lookup: relies on the target codes being 0..k-1.
        df_hm['target_counts'] = np.take(np.array(list((target_counts))), list(df_hm['target']))
        df_hm['percentage'] = df_hm['count'] / df_hm['target_counts'] * 100
        df_hm['target'] = data.decoder(df_hm['target'])
        return df_hm

    def jitterplot(attribute):
        """Jittered strip plot of one attribute, coloured by predicted cluster."""
        return alt.Chart(df, width=50).mark_circle(opacity=0.7).encode(
            x=alt.X(
                'jitter:Q',
                title=attribute,
                axis=alt.Axis(values=[0], ticks=True, grid=False, labels=False),
                scale=alt.Scale()
            ),
            y=alt.Y(attribute + ':Q',
                    axis= alt.Axis(title="", labelFontSize=8, tickCount=4)),
            # The encoded field is the prediction, so it needs the prediction
            # palette; the activity palette's domain does not contain the
            # cluster ids and conflicts with the scatter plot's colour scale.
            color = alt.condition(brush, 'prediction:N',
                                  alt.value('lightgray'),
                                  scale=palette_prediction,
                                  legend=None),
            tooltip = 'Date'
        ).transform_calculate(
            jitter='sqrt(-2*log(random()))*cos(2*PI*random())'
        ).transform_filter(prediction_selection
        ).transform_filter(target_selection
        ).interactive()

    top_row = scatter_plot() | bar_chart()
    jitter = [jitterplot(attribute) for attribute in attributes]
    if not jitter:
        return top_row
    # Unpack the list so the layout follows however many attributes exist.
    return top_row & alt.hconcat(*jitter, spacing=2)

@widgets.interact(model = ['Spectral Clustering','Hierarchical Clustering','DBSCAN','Birch'])
def plotter(model):
    """Show the parameter controls for the chosen clustering algorithm.

    Each entry of ``controls`` maps an algorithm name to the widget
    abbreviations of the constructor parameters exposed to the user. Pressing
    the manual "Run" button calls ``drawplot`` with the selected values.

    Args:
        model: Algorithm name selected in the dropdown.
    """
    controls = {
        'Spectral Clustering': dict(n_clusters=(1,10), n_components = (0,10),random_state=[123,24,42],n_init=(0,100),gamma=(0.,1.),affinity='nearest_neighbors',n_neighbors=(27,100),assign_labels=['kmeans', 'discretize']),
        'Hierarchical Clustering': dict(n_clusters=(1,10), affinity=['euclidean','l1','l2','manhattan','cosine'],linkage=['complete', 'average', 'single']),
        'DBSCAN': dict(eps=(0.0,1.0), min_samples=(2,10), algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],leaf_size = (0,60),p=[None,0.0,0.2,0.4,0.6,0.8,1.0]),
        'Birch': dict(threshold=(0.,1.), branching_factor= (1,100), n_clusters=(1,10)),
    }
    if model not in controls:
        print('Parameter selection not valid')
        return
    try:
        # `fixed` passes the algorithm name straight through. A bare string
        # would be rendered as an editable text box that could get out of
        # sync with the parameter controls shown next to it.
        widgets.interact_manual(drawplot, m=widgets.fixed(model), **controls[model])
    except Exception:
        # Notebook UI boundary: report the problem instead of a traceback,
        # while still letting KeyboardInterrupt/SystemExit propagate.
        print('Parameter selection not valid')

<br>**Choose clustering algorithm and parameters.**

interactive(children=(Dropdown(description='model', options=('Spectral Clustering', 'Hierarchical Clustering',…

#### Aim:

With the help of this dashboard the user can explore the best clustering algorithm and parameter configuration.
All graphs in the dashboard are color-coded according to the result of the chosen clustering algorithm.

In the **cluster chart**, the user can see the two main axes of the respective downprojection that he chose above in the dimensionality reduction settings. He can visually compare observable clusters with the clustering results of the algorithm.

The **bar chart** corresponds to the bar chart that has already been presented in the statistics section above, showing the frequency of the respective target activities. The difference to the bar chart above is that the colors match the cluster predictions here, which gives the user an indication how "purely" the targets have been identified. A homogenously colored bar indicates that the respective target has been mainly put into one cluster.

Finally, the **jitter graphs** below let the user explore the attributes again to find out potential reasons why certain data points have been put into certain clusters, with the help of the interactive connection between all charts in the dashboard.
Again, these jitter plots correspond to the jitter plot in the statistics section, which makes it easier for the user because he does not have to adapt to a new chart style.


#### Interpretation:
Taking the user's perspective, this exploration aims to identify the optimal clustering algorithm.  Different downprojection methods (as mentioned earlier) and various clustering algorithms and settings are being tested to achieve uniform bars.
The spectral clustering algorithm with n_clusters = 5 and n_neighbors = 27 emerges as the most promising. Therefore, the focus of the analysis lies on this clustering parameter configuration. The results differ slightly, depending on the chosen downprojection parameters, but overall they can be described as follows:

* All **Breathwork** activities have been put into the same (**red**) cluster, meaning that the algorithm could recognize very well that they belong together. Gym activities have also been put into the Breathwork cluster, as well as a few Running, Yoga and Strength activities.
* **Cardio, Cycling and Strength** have mostly all been put into the same (**green**) cluster, meaning that the algorithm couldn't distinguish these activities well from each other. However, since all three bars are fairly homogenously green, at least they have been recognized to belong together.
* The **Walking** bar is perfectly homogenously **blue**. Some running activities have also been put into the blue walking cluster.
* Most of the **Yoga** activities have been put into an own **orange** cluster, some are in the red Breathwork cluster.
* **Swimming** (which has only a few data points) has been split: half of it has been put into the blue Walking cluster, the other half into the green Cardio/Cycling/Strength cluster.
Since Swimming has so few datapoints and has been clustered so ambiguously, we are not going to take it into consideration for our analysis.
* Finally, the most interesting bar: About 70% of the **Running** activities have gotten their own **purple** cluster, but the rest has been classified into the Walking (blue) and interestingly into the Breathwork (red) cluster.

Summary of the clustering results:
* **red: Breathwork**
* **green: Cardio, Cycling or Strength**
* **blue: Walking**
* **orange: Yoga**
* **purple: Running**

Having observed the clustering outcomes, attention shifts to analyzing the reasons for the algorithm's classification inaccuracies. This involves utilizing attribute jitter plots to address two specific classification issues:

**1. Different activities that have been classified into one.**

**Cardio, Cycling and Strength** have all been classified within the green main cluster.
To investigate potential reasons, attention is narrowed down to this cluster. Selecting it in the cluster chart (effective with UMAP downprojection) facilitates this. Now the similarities between Cardio and
Strength are explored by selecting them in the bar chart.
It is observable that the attributes for these two targets are very similar to each other. Also it can be seen that these activities don't have values for the attributes "Cadence", "Pace" and "Elevation Gain / Loss".
This adds an even stronger similarity because the missing values are set to zeros in the downprojection data preprocessing.


**2. Activities where the major part of data points have been put into one cluster, but some have been put into different ones.**

There are three targets that have not been homogenously identified into clusters:

* **Yoga** <br>
Yoga has been identified mostly as orange but also partly as red, which is the main cluster of Breathwork. This makes sense, since in the cluster chart as well as in the attribute plots the data points of yoga and breathwork are at very similar positions and flowingly merge into each other. They are hard to separate.


* **Strength** <br>
Strength has also partly been recognized as Breathwork. To observe the reasons for that, the dashboard is filtered for target activity = "Strength" by clicking on the "Strength" bar in the bar chart.
Now mainly green and some red (=Breathwork) points can be observed in the attribute chart. It is striking that the red, misclassified points have three things in common: As opposed to the green points, the red points
all have a considerably lower Max and Avg HR, and also the time is at almost zero.
This suggests that these activities have been started and stopped right afterwards, maybe by mistakenly pushing a button on the watch or false automatic activity recognition of the watch.


* **Running** <br>
Running has been classified as partly Walking, and partly Breathwork as well.
By filtering by the target "Running", it can be seen that the blue (as "Walking" classified) points vary significantly from the purple (main Running class) points for most of the attributes ("Distance", "Calories",
"Time",
"HR", "Cadence" and "Pace").
By adding an additional target filter for "Walking", it can be seen that the misclassified blue running points all fall into the range where the target Walking data points are situated.
It is also noticeable again that the time is very low, as well as the distance. This raises the theory that the watch has accidentally identified short walks as runs. Similarly as in "Strength", it could have started an activity recording but stopped right afterwards when the misclassification was recognized.
<br>
When inspecting the other misclassified points, the red ones, it is remarkable that as opposed to the correctly classified datapoints, they don't have any data entries for the attributes "Calories", "HR" and "Cadence". This seems rather obscure, since these are key metrics for Running. Observing the respective dates of the red datapoints (still filtered by target Running) by tooltip, it can be discovered that all these datapoints have been generated between November 2019 and February 2020. This raises the suspicion that there was maybe a bug in the running app of the watch during this time period, or maybe something in the settings, that prevented these metrics from being recorded. Since missing data has been set to zero for the downprojection, the clustering algorithm classifies these datapoints as Breathwork, where the "Calories" and "Cadence" attributes are also either nan or 0.


## 6. Conclusion

The created dashboard proves valuable for observing varied clustering algorithm results. Users can adjust parameters to identify optimal clustering configurations. Additionally, the dashboard aids in detecting data
anomalies and potential misclassification causes. Notably, in certain cases like "Strength" and "Running", the chosen clustering configuration might outperform the inherent clustering of the fitness watch.