In [68]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN
from umap import UMAP


In [2]:
data0 = pd.read_csv("../DSI_Module-26_Capstone/data/0.csv", header=None)
data1 = pd.read_csv("../DSI_Module-26_Capstone/data/1.csv", header=None)
data2 = pd.read_csv("../DSI_Module-26_Capstone/data/2.csv", header=None)
data3 = pd.read_csv("../DSI_Module-26_Capstone/data/3.csv", header=None)


In [44]:
def stack_data(df, n_sens=8, n_read=8):
    df_split = []
    df_sens = []
    df_out = pd.DataFrame()
    for i in range(n_read):
        df_split.append(df.iloc[:,(n_read)*i:(n_read)*(i+1)])
    for i in range(n_sens):
        new_split = []
        for j in range(n_read):
            new_split.append(df_split[j].iloc[:,i])
        df_sens.append(pd.concat(new_split, keys=[a for a in range(1,n_read+1)]))
    df_out = pd.concat(df_sens, axis=1, keys=[a for a in range(1,n_sens+1)])
    df_out = df_out.reset_index()
    df_out = df_out.rename(columns={'level_0': 'Reading', 'level_1': 'obsID'})
    df_out['target'] = df.iloc[0,64]
    df_out = df_out.drop(columns=['obsID'])
    return df_out

In [53]:
data0_tall = stack_data(data0)
data1_tall = stack_data(data1)
data2_tall = stack_data(data2)
data3_tall = stack_data(data3)
newdf = pd.concat([data0_tall, data1_tall, data2_tall, data3_tall]).reset_index(drop=True)
newdf.head(1)

Unnamed: 0,Reading,1,2,3,4,5,6,7,8,target
0,1,26.0,4.0,5.0,8.0,-1.0,-13.0,-109.0,-66.0,0


In [58]:
X = newdf.drop(columns=['target'])
y = newdf['target']

scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

Unnamed: 0,Reading,1,2,3,4,5,6,7,8
0,1,26.0,4.0,5.0,8.0,-1.0,-13.0,-109.0,-66.0
1,1,-47.0,-6.0,-5.0,-7.0,13.0,-1.0,35.0,-10.0
2,1,-19.0,-8.0,-8.0,-8.0,-21.0,-6.0,-79.0,12.0
3,1,2.0,3.0,0.0,2.0,0.0,22.0,106.0,-14.0
4,1,6.0,0.0,0.0,-2.0,-14.0,10.0,-51.0,5.0
...,...,...,...,...,...,...,...,...,...
93419,8,1.0,4.0,3.0,4.0,-51.0,-49.0,5.0,-9.0
93420,8,-3.0,-3.0,-3.0,-5.0,-4.0,-45.0,-12.0,-15.0
93421,8,-8.0,-4.0,-4.0,-4.0,-21.0,-29.0,-5.0,0.0
93422,8,-3.0,0.0,-3.0,-5.0,-36.0,-90.0,3.0,5.0


0        0
1        0
2        0
3        0
4        0
        ..
93419    3
93420    3
93421    3
93422    3
93423    3
Name: target, Length: 93424, dtype: int64

### PCA - DBSCAN Model

choosing n_components=5 since there are five fingers on a hand to make a gesture, hopefully we can see discrete clusters with components describing hand/finger postition in this number of components

In [64]:
pca = PCA(n_components=5)
pca_data = pca.fit_transform(scaled_X)
pca_df = pd.DataFrame(
    pca_data,
    index=X.index
                     )


used a base range of min_samples=[1:10] and epsilon=[0.1:1.0] and after a few iteration and narrowing down the hyperparameters (based on the number of clusters produced by the model.  We're looking for something in the area of 5 clusters if it maps finger postion or 4 clusters if it maps the actual hand gesture) based on the desired clustering we reached the following conclusion:



In [73]:
model_df = pd.DataFrame(columns=['Epsilon', 'Min_Samples', 'Num_Clusters'])
i=0
for epsilon in np.arange(0.6, 1.1, 0.05):
    for minsamp in np.arange(7, 15):
        model = DBSCAN(eps=epsilon, min_samples=minsamp)
        model.fit(pca_data)
        model_df.loc[i, 'Epsilon'] = epsilon
        model_df.loc[i, 'Min_Samples'] = minsamp
        model_df.loc[i, 'Num_Clusters'] = pd.Series(model.labels_).nunique()
        i += 1
        

display(
    model_df,
    model_df['Num_Clusters'].value_counts()
)

Unnamed: 0,Epsilon,Min_Samples,Num_Clusters
0,0.6,7,38
1,0.6,8,26
2,0.6,9,25
3,0.6,10,25
4,0.6,11,13
...,...,...,...
83,1.1,10,2
84,1.1,11,2
85,1.1,12,2
86,1.1,13,2


2     17
3     16
4     10
6      9
5      7
8      6
7      3
12     2
25     2
17     2
14     2
13     2
10     2
38     1
27     1
9      1
26     1
16     1
19     1
24     1
32     1
Name: Num_Clusters, dtype: int64

In [82]:
model_df[model_df['Num_Clusters'] == 5]

Unnamed: 0,Epsilon,Min_Samples,Num_Clusters
7,0.6,14,5
29,0.75,12,5
37,0.8,12,5
56,0.95,7,5
64,1.0,7,5
72,1.05,7,5
80,1.1,7,5


for a 4-cluster model, let's use eps=1 and ms=9
- This results in a model where the majority of the data is either in cluster zero or unclassified
let's try another subset of eps=0.65 and ms=13
- Same results
for a 5-cluster model, let's use eps=1 and ms=7
- Same results

so this doesn't seem to work well, let's try a different dim reduction method

In [83]:
model_4dbscanpca = DBSCAN(eps=1, min_samples=7)
model_4dbscanpca.fit(pca_data)
model_4dbscanpca_data = X.copy()
model_4dbscanpca_data['label'] = model_4dbscanpca.labels_
model_4dbscanpca_data['target'] = y
pd.crosstab(model_4dbscanpca_data['label'], model_4dbscanpca_data['target'])

target,0,1,2,3
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,1991,119,649,291
0,21279,23105,22889,23085
1,5,0,0,0
2,5,0,0,0
3,0,0,6,0
