## 1.0 Import Packages

In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from sklearn.cluster import MeanShift # MeanShift Clustering
from sklearn.cluster import DBSCAN # Hierarachical Agglomerative Clustering
from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import MinMaxScaler # use standardizer instead
from scipy.optimize import linear_sum_assignment

from sklearn.metrics import silhouette_samples, silhouette_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import adjusted_rand_score
from sklearn.metrics import normalized_mutual_info_score
from sklearn.metrics import confusion_matrix

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

## 2.0 Import Prepared Dataset

In [3]:
file_path = "training_data_v2.xlsx"
df = pd.read_excel(file_path)
print(df.head())
ground_truth = df["Category"].values

   Index  Category     dim_1     dim_2     dim_3     dim_4     dim_5  \
0      0         0 -0.882278 -0.647234  0.050173 -0.448188 -0.175582   
1      1         0 -0.702665 -0.462591  0.162085 -0.029182 -0.280842   
2      2         0 -0.327257 -0.397209  0.035037 -0.064671 -0.435734   
3      3         0 -0.793734 -0.470964 -0.278644 -0.292047 -0.565868   
4      4         0 -0.272840 -0.556684  0.001737 -0.229906 -0.495732   

      dim_6     dim_7     dim_8  ...   dim_759   dim_760   dim_761   dim_762  \
0  0.125284 -0.335781 -0.396106  ...  0.568470 -0.326577  0.026089 -0.407658   
1  0.047459  0.109864 -0.540210  ...  0.249440 -0.422917  0.033820 -0.226271   
2  0.535525  0.134867 -0.213102  ...  0.710835 -0.191510 -0.068998 -0.262279   
3  0.546791  0.604674 -0.069190  ...  0.387789 -0.596448 -0.291108 -0.320205   
4  0.176596 -0.141926 -0.352247  ...  0.660122 -0.534026  0.347033 -0.279629   

    dim_763   dim_764   dim_765   dim_766   dim_767   dim_768  
0 -0.162295 -0.121949 

# Part 1: Mean Shift Clustering

## 3.0 Clustering

In [5]:
# standardize the data
X = df.drop(columns=['Index', 'Category'])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [6]:
mean_shift = MeanShift()
mean_shift.fit(X_scaled) # perform clustering

## 4.0 Result Evaluation

### 4.1 Evaluation by sight

In [7]:
labels = mean_shift.labels_
cluster_centers = mean_shift.cluster_centers_

In [9]:
labels

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0], dtype=int64)

In [10]:
ground_truth

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3], dtype=int64)

### 5.0 Silhouette Plot

### 6.0 Principal Component Analysis (PCA)

In [40]:
X = df.drop(columns=['Index', 'Category'])
pca = PCA(n_components=10)  # Reduce to 2 principal components
data_pca = pca.fit_transform(X) # PCA

mean_shift = MeanShift()
mean_shift.fit(data_pca) # perform clustering

cluster_labels = mean_shift.labels_

'''
contingency_matrix = pd.crosstab(df['Category'], cluster_labels)
row_idx, col_idx = linear_sum_assignment(-contingency_matrix.values)

mapped_labels = np.zeros_like(cluster_labels)
for i, col in enumerate(col_idx):
    mapped_labels[cluster_labels == col] = row_idx[i]
'''  
'''
accuracy = accuracy_score(df['Category'], mapped_labels)
print("Accuracy:", accuracy) # final accuracy equivalaent to "category_modification_by_sight"

print("Inertia:", kmeans.inertia_) # inertia is too high, indicates not well-defined clusters

ground_truth_2d = ground_truth.reshape(-1, 1)  # Reshape ground_truth to have one feature per sample
silhouette_score_value = silhouette_score(ground_truth_2d, cluster_labels)
print("Silhouette Score:", silhouette_score_value)

davies_bouldin_score_value = davies_bouldin_score(X, cluster_labels)
print("Davies-Bouldin Index:", davies_bouldin_score_value)

ari = adjusted_rand_score(ground_truth, cluster_labels)
print("ARI:", ari)

nmi = normalized_mutual_info_score(ground_truth, cluster_labels)
print(f"Normalized Mutual Information (NMI): {nmi}")

purity = cluster_purity(ground_truth, cluster_labels)
print(f"Cluster Purity: {purity:.2f}")
'''

'\naccuracy = accuracy_score(df[\'Category\'], mapped_labels)\nprint("Accuracy:", accuracy) # final accuracy equivalaent to "category_modification_by_sight"\n\nprint("Inertia:", kmeans.inertia_) # inertia is too high, indicates not well-defined clusters\n\nground_truth_2d = ground_truth.reshape(-1, 1)  # Reshape ground_truth to have one feature per sample\nsilhouette_score_value = silhouette_score(ground_truth_2d, cluster_labels)\nprint("Silhouette Score:", silhouette_score_value)\n\ndavies_bouldin_score_value = davies_bouldin_score(X, cluster_labels)\nprint("Davies-Bouldin Index:", davies_bouldin_score_value)\n\nari = adjusted_rand_score(ground_truth, cluster_labels)\nprint("ARI:", ari)\n\nnmi = normalized_mutual_info_score(ground_truth, cluster_labels)\nprint(f"Normalized Mutual Information (NMI): {nmi}")\n\npurity = cluster_purity(ground_truth, cluster_labels)\nprint(f"Cluster Purity: {purity:.2f}")\n'

cluster_labels

In [42]:
ground_truth

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3], dtype=int64)

## Part 2: DBSCAN

## 3.0 Clustering

In [47]:
# standardize the data
X = df.drop(columns=['Index', 'Category'])
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [48]:
dbscan_blobs = DBSCAN(eps=0.5, min_samples=5)
dbscan_blobs.fit(X_scaled)
labels_blobs = dbscan_blobs.labels_

In [49]:
cluster_labels = dbscan_blobs.labels_

In [50]:
cluster_labels

array([-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1], dtype=int64)

In [51]:
ground_truth

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3], dtype=int64)