In [139]:
import pandas as pd
import numpy as np

# Disable warnings from being printed
from warnings import filterwarnings
filterwarnings('ignore')

from scipy.linalg import eigh, eig
from sklearn.cluster import KMeans

In [140]:
# For seeds dataset
# Read the data
data = pd.read_csv("seeds_dataset.txt", sep=r"\s*", header=None)

# Given labels (natural clusters in data)
cluster_numbers = data.loc[:, 7].copy()
cluster_numbers_predicted = data.loc[:, 7].copy()
natural_clusters = cluster_numbers.unique().shape[0]

# Get attributes
data = data.loc[:, 0:6]

# Total number of points
n = data.shape[0]

In [141]:
# For Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN dataset
# Read the data
data = pd.read_csv("Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.csv", sep=r"\s*", header=None)

# Given labels (natural clusters in data)
cluster_numbers = data.loc[:, 5].copy()
cluster_numbers[cluster_numbers == "very_low"] = 1
cluster_numbers[cluster_numbers == "High"] = 2
cluster_numbers[cluster_numbers == "Low"] = 3
cluster_numbers[cluster_numbers == "Middle"] = 4
cluster_numbers_predicted = cluster_numbers.copy()
natural_clusters = cluster_numbers.unique().shape[0]

# Get attributes
data = data.loc[:, 0:4]

# Total number of points
n = data.shape[0]

In [142]:
# Construct Affinity matrix

# For Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN dataset
# sigma_sq = 1

# For seeds dataset
sigma_sq = 1e5

affinity_matrix = squareform(pdist(data, 'sqeuclidean'))
for i in range(n):
    for j in range(n):
        if i == j:
            continue
        affinity_matrix[i][j] = np.exp(-affinity_matrix[i][j] / (2 * sigma_sq))

In [143]:
# Construct diagonal matrix
diagonal_matrix = np.zeros((n, n))
for i in range(n):
    diagonal_matrix[i][i] = np.sum(affinity_matrix[i])   

In [144]:
# Construct Laplacian matrix
L = diagonal_matrix - affinity_matrix

In [145]:
# Find eigenvectors corresponding to k smallest eigenvalues,
# of L and stack them columnwise
eigvals, eigvecs = np.linalg.eigh(L)
X = np.column_stack((eigvecs[i] for i in range(natural_clusters)))

In [146]:
clusters = KMeans(n_clusters=natural_clusters).fit(X)
cluster_numbers_predicted = clusters.labels_                                                

In [147]:
# Map the original cluster labels to new cluster labels
mappings = {}
mappings_unavailable = []
for i in range(1, natural_clusters + 1):
    maxcnt = -1
    maxj = 0
    for j in range(0, natural_clusters):
        if j in mappings_unavailable:
            continue
        # Count the number of points matching if i maps to j
        cnt = 0
        for k in range(n):
            if cluster_numbers[k] == i and cluster_numbers_predicted[k] == j:
                cnt = cnt + 1
        if maxcnt < cnt:
            maxcnt = cnt
            maxj = j
    mappings[i] = maxj
    mappings_unavailable.append(maxj)

for mapping in mappings.keys():
    cluster_numbers[cluster_numbers == mapping] = mappings[mapping]

In [148]:
# Finally compute accuracy
cnt = 0.0
for i in range(n):
    if cluster_numbers[i] == cluster_numbers_predicted[i]:
        cnt = cnt + 1.0
print("Accuracy: ", cnt/n)

Accuracy:  0.10077519379844961
