In [3]:
import math
import pandas as pd
import numpy as np

In [33]:
def load_dataset(path):
    df = pd.read_csv(path, delimiter=',' , engine='python')
    return df

In [34]:
path="..\DatasetExos.csv"
df = load_dataset(path)

In [35]:
df.loc[df["Category"] == "heav", "Category"] = "heavy"
df = df.drop(columns=['ep (ms)', 'Exercise', 'ID', 'Category'])
df= df.dropna()

In [36]:
numerical_cols = df.select_dtypes(include=[np.number]).columns
non_numerical_cols = df.select_dtypes(exclude=[np.number]).columns
non_numerical_cols = non_numerical_cols.difference(['Exercise'])
print(numerical_cols)
print(non_numerical_cols)

Index(['Acc_x', 'Acc_y', 'Acc_z', 'Gyro_x', 'Gyro_y', 'Gyro_z', 'Set'], dtype='object')
Index([], dtype='object')


In [8]:
def manhattan_distance(instance1,instance2,columns):
    d = 0

    for column in columns:
        d +=  abs(instance1[column] - instance2[column])
    
    return d

In [37]:
instance1 = df.iloc[0]  # Get the first row
instance2 = df.iloc[1]
d = manhattan_distance(instance1, instance2, numerical_cols)
d

4.7994

In [39]:
def centroid(df_cluster,numerical_columns):
    
    #initialization
    c = pd.Series(index=df_cluster.columns, dtype=object) 

    for column in numerical_columns:
        c[column] = df_cluster[column].mean()

    return c

In [40]:
def closest_cluster(clusters_centroids, instance, numerical_columns):
    distances = {}

    for idx, centroid in clusters_centroids.iterrows():
        d = manhattan_distance(centroid, instance, numerical_columns)
        distances[idx] = d  
    
   
    closest_centroid = min(distances, key=distances.get)

    return closest_centroid


In [41]:
def kmeans(df, numerical_columns, k):

    centroids = df.sample(n=k).reset_index(drop=True)  
    clusters = {}
    d_prev = None

    while True:
        
        clusters = {i: [] for i in range(k)}
        
        # Calculating distances and  Assigning each instance to the closest cluster
        for _, row in df.iterrows():
            c = closest_cluster(centroids, row, numerical_columns)
            clusters[c].append(row)
        
        # Labeling the clusters
        for cluster_id, cluster_instances in clusters.items():
            for instance in cluster_instances:
                df.loc[df.index == instance.name, 'cluster'] = cluster_id  
        
        
        new_centroids = []
        for cluster_instances in clusters.values():
            cluster_df = pd.DataFrame(cluster_instances)
            new_centroid = centroid(cluster_df, numerical_columns)
            new_centroids.append(new_centroid)

        # if no change in the centroids end of the algorithm
        if d_prev is not None and all(d_prev[i].equals(new_centroids[i]) for i in range(k)):
            break

        d_prev = new_centroids
        centroids = pd.DataFrame(new_centroids)  

    return df


In [17]:
def view_clusters(df, k):
    for i in range(k):
        cluster_rows = df[df['cluster'] == i]
        cluster_size = len(cluster_rows)
        
        print(f'Cluster {i} (Number of instances: {cluster_size})')
        
        for _, row in cluster_rows.iterrows():
            print(row.values.tolist())  
        print('\n')  

In [21]:
def view_clusters2(df, k):
    for i in range(k):
        cluster_rows = df[df['cluster'] == i]
        cluster_size = len(cluster_rows)
        
        print(f'Cluster {i} (Number of instances: {cluster_size})') 

In [42]:
df_labeled = kmeans(df, numerical_cols, k=2)

In [43]:
view_clusters(df_labeled,k=2)

Cluster 0 (Number of instances: 1)
[100000.07, -0.883, 0.110333333333333, -30.829, 15.317, -13.634, 49.0, 0.0]


Cluster 1 (Number of instances: 8989)
[0.0135, 0.977, -0.071, -1.8904, 2.4392, 0.9388, 30.0, 1.0]
[-0.0015, 0.9705, -0.0795, -1.6826, -0.8904, 2.1708, 30.0, 1.0]
[0.001333333333333, 0.971666666666667, -0.064333333333333, 2.5608, -0.256, -1.4146, 30.0, 1.0]
[-0.024, 0.957, -0.0735, 8.061, -4.5244, -2.073, 30.0, 1.0]
[-0.028, 0.957666666666666, -0.115, 2.439, -1.5486, -3.6098, 30.0, 1.0]
[-0.026, 0.965, -0.118, 0.4634, 5.2194, -6.4636, 30.0, 1.0]
[-0.048666666666667, 0.79, -0.145333333333333, 21.695, 8.1708, -28.2196, 30.0, 1.0]
[-0.17, 0.8995, -0.25, 17.5246, 1.5976, -17.5854, 30.0, 1.0]
[-0.222666666666667, 0.907, -0.204333333333333, -7.2318, -1.3536, -0.4026, 30.0, 1.0]
[-0.2045, 0.93, -0.149, -28.683, -10.2076, 20.5732, 30.0, 1.0]
[-0.205, 1.40466666666667, -0.095, -4.1098, -9.3172, -3.3412, 30.0, 1.0]
[-0.1635, 0.996, -0.113, 35.5488, 11.5732, -17.2074, 30.0, 1.0]
[-0.220

In [44]:
df_labeled = kmeans(df, numerical_cols, k=5)

In [45]:
view_clusters2(df_labeled,k=5)

Cluster 0 (Number of instances: 3)
Cluster 1 (Number of instances: 1)
Cluster 2 (Number of instances: 3886)
Cluster 3 (Number of instances: 934)
Cluster 4 (Number of instances: 4166)


In [46]:
view_clusters(df_labeled,k=5)

Cluster 0 (Number of instances: 3)
[0.051, -0.86, 0.1385, -38.3538, -3.0854, -4474.195, 55.0, 0.0]
[-0.0485, -1.2755, -0.0875, -2.0244, -4.0, -9743.927, 6.0, 0.0]
[-0.005, -0.856666666666667, 0.021666666666667, -26.1582, -22.3902, -7998.5974, 13.0, 0.0]


Cluster 1 (Number of instances: 1)
[100000.07, -0.883, 0.110333333333333, -30.829, 15.317, -13.634, 49.0, 1.0]


Cluster 2 (Number of instances: 3886)
[0.003666666666667, 0.966333333333333, -0.081, 1.8412, -4.7806, -2.5608, 86.0, 2.0]
[-0.0125, 0.9625, -0.089, 2.195, -2.1096, -2.8538, 86.0, 2.0]
[-0.028, 0.867, -0.125, 9.5246, -2.829, -11.1828, 86.0, 2.0]
[-0.062, 0.873, -0.155, 16.5608, -4.4268, -13.0368, 86.0, 2.0]
[-0.096666666666667, 0.904333333333333, -0.169, 7.6952, -11.8538, -3.0364, 86.0, 2.0]
[-0.1155, 0.963, -0.1765, -1.5486, -13.9268, 7.9756, 86.0, 2.0]
[-0.104333333333333, 1.11, -0.140333333333333, -10.3658, -12.9512, 12.634, 86.0, 2.0]
[-0.1565, 1.323, -0.139, -4.4878, 8.3172, -20.4146, 86.0, 2.0]
[-0.152, 0.9333333333333

In [47]:
df_labeled = kmeans(df, numerical_cols, k=6)

In [48]:
view_clusters2(df_labeled,k=6)

Cluster 0 (Number of instances: 3417)
Cluster 1 (Number of instances: 3)
Cluster 2 (Number of instances: 1)
Cluster 3 (Number of instances: 1065)
Cluster 4 (Number of instances: 3146)
Cluster 5 (Number of instances: 1358)


In [49]:
view_clusters(df_labeled,k=6)

Cluster 0 (Number of instances: 3417)
[0.0135, 0.977, -0.071, -1.8904, 2.4392, 0.9388, 30.0, 0.0]
[-0.0015, 0.9705, -0.0795, -1.6826, -0.8904, 2.1708, 30.0, 0.0]
[0.001333333333333, 0.971666666666667, -0.064333333333333, 2.5608, -0.256, -1.4146, 30.0, 0.0]
[-0.024, 0.957, -0.0735, 8.061, -4.5244, -2.073, 30.0, 0.0]
[-0.028, 0.957666666666666, -0.115, 2.439, -1.5486, -3.6098, 30.0, 0.0]
[-0.026, 0.965, -0.118, 0.4634, 5.2194, -6.4636, 30.0, 0.0]
[-0.222666666666667, 0.907, -0.204333333333333, -7.2318, -1.3536, -0.4026, 30.0, 0.0]
[-0.205, 1.40466666666667, -0.095, -4.1098, -9.3172, -3.3412, 30.0, 0.0]
[-0.220666666666667, 0.904, -0.208333333333333, 4.8902, -0.2804, -0.134, 30.0, 0.0]
[-0.222, 0.9465, -0.221, -1.061, -2.183, 11.3536, 30.0, 0.0]
[-0.011, 0.850666666666667, -0.094333333333333, 12.0852, -0.3536, -5.4024, 30.0, 0.0]
[-0.0525, 1.025, -0.0375, -6.5366, -0.7194, 0.3414, 30.0, 0.0]
[-0.065, 0.955333333333333, -0.025333333333333, 1.2684, -3.0246, -1.2318, 30.0, 0.0]
[-0.0535, 0.9