In [29]:
import pandas as pd
import numpy as np

# Disable warnings from being printed
from warnings import filterwarnings
filterwarnings('ignore')

from scipy.spatial.distance import pdist, squareform
from sys import maxsize

In [25]:
# For seeds dataset
# Read the data
data = pd.read_csv("seeds_dataset.txt", sep=r"\s*", header=None)

# Given labels (natural clusters in data)
cluster_numbers = data.loc[:, 7].copy()
cluster_numbers_predicted = data.loc[:, 7].copy()
natural_clusters = cluster_numbers.unique().shape[0]

# Get attributes
data = data.loc[:, 0:6]

# Total number of points
n = data.shape[0]

In [30]:
# For Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN dataset
# Read the data
data = pd.read_csv("Data_User_Modeling_Dataset_Hamdi Tolga KAHRAMAN.csv", sep=r"\s*", header=None)

# Given labels (natural clusters in data)
cluster_numbers = data.loc[:, 5].copy()
cluster_numbers[cluster_numbers == "very_low"] = 1
cluster_numbers[cluster_numbers == "High"] = 2
cluster_numbers[cluster_numbers == "Low"] = 3
cluster_numbers[cluster_numbers == "Middle"] = 4
cluster_numbers_predicted = cluster_numbers.copy()
natural_clusters = cluster_numbers.unique().shape[0]

# Get attributes
data = data.loc[:, 0:4]

# Total number of points
n = data.shape[0]

In [31]:
# Compute Kernel Matrix
gamma = 1e-3

sq_dists = pdist(data, 'sqeuclidean')

# Converting the pairwise distances into a symmetric NxN matrix.
mat_sq_dists = squareform(sq_dists)

# Computing the NxN RBF kernel matrix.
K = np.exp(-gamma * mat_sq_dists)

In [32]:
# Randomly initialise k centroids as first k data points
centroid_matrix = data.iloc[0:natural_clusters]
cluster_points = []
for i in range(natural_clusters):
    cluster_points.append((i, ))

In [33]:
# While data points are re-assigned clusters, loop
reassigned_flag = True
iter = 0
while reassigned_flag:
    iter = iter + 1
    if iter == 100:
        break        
    cluster_points_new = [()]*natural_clusters
    # For each data point compute nearest cluster
    for i in range(n):
        minj = 0
        mindist = maxsize
        for j in range(natural_clusters):
            cluster_cardinality = len(cluster_points[j])
            dist = K[i][i]
            sum = 0
            for point in cluster_points[j]:
                sum = sum + K[i][point]
            dist = dist - (2*sum) / cluster_cardinality
            sum = 0
            for p1 in cluster_points[j]:
                for p2 in cluster_points[j]:
                    sum = sum + K[p1][p2]
            dist = dist - (sum / (cluster_cardinality**2))
            
            if mindist > dist:
                mindist = dist
                minj = j
                
        if cluster_numbers_predicted[i] != minj:
            reassigned_flag = True
            cluster_numbers_predicted[i] = minj
            
        cluster_points_new[minj] = cluster_points_new[minj] + (i, )
        
    cluster_points = cluster_points_new.copy()

In [34]:
# Map the original cluster labels to new cluster labels
mappings = {}
mappings_unavailable = []
for i in range(1, natural_clusters + 1):
    maxcnt = -1
    maxj = 0
    for j in range(0, natural_clusters):
        if j in mappings_unavailable:
            continue
        # Count the number of points matching if i maps to j
        cnt = 0
        for k in range(n):
            if cluster_numbers[k] == i and cluster_numbers_predicted[k] == j:
                cnt = cnt + 1
        if maxcnt < cnt:
            maxcnt = cnt
            maxj = j
    mappings[i] = maxj
    mappings_unavailable.append(maxj)

for mapping in mappings.keys():
    cluster_numbers[cluster_numbers == mapping] = mappings[mapping]

In [35]:
# Finally compute accuracy
cnt = 0.0
for i in range(n):
    if cluster_numbers[i] == cluster_numbers_predicted[i]:
        cnt = cnt + 1.0
print("Accuracy: ", cnt/n)

Accuracy:  0.4418604651162791
