In [17]:
import random
import numpy as np
from sklearn.datasets import load_iris
from sklearn import metrics

In [18]:
def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 

In [28]:
## KMedoids
class KMedoids:
    def __init__(self, n_cluster=2, random_state=None):
        self.n_cluster = n_cluster
        self.list_cluster = []
        self.random_state = int(random_state)
    def make_list_cluster(self, X):
        list_cluster = []
        used_index = []
        cluster = 0
        random.seed(self.random_state)
        while cluster < (self.n_cluster):
            random_index = random.randint(0, len(X)-1)
            if not random_index in used_index:
                list_cluster.append(X[random_index])
                used_index.append(random_index)
                cluster += 1
            self.list_cluster = list_cluster
        return list_cluster
    def count_distance(self, X, cluster):
        distance = 0
        for i in range(len(X)):
            distance = distance + abs(X[i] - cluster[i])
        return distance
    def assign_objects_cluster(self, X):
        objects_cluster = []
        list_cluster = self.make_list_cluster(X)
        for data in X:
            distance = []
            for cluster in list_cluster:
                distance.append(self.count_distance(data, cluster))
            index_min_distance = np.argmin(distance)
            objects_cluster.append(list_cluster[index_min_distance])
        return objects_cluster
    def count_cost(self, X, cluster):
        return self.count_distance(X, cluster)
    def randomize_cluster(self, X, cluster, changed_index):
        i = 0
        changed = False
        while(changed == False):
            if(X[i] not in cluster and i < len(X)):
                cluster[changed_index] = X[i]
                changed = True
            i += 1
        self.list_cluster = cluster
        return cluster
    def fit(self, X):
        list_cluster = self.make_list_cluster(X)
        objects_cluster = self.assign_objects_cluster(X)
        before_cost = 999999999
        final_cost = 99999999
        i = 1
        while(before_cost >= final_cost and i<=2):
            before_cost = final_cost
            cost = 0
            for i in range(len(list_cluster)):
                cost += self.count_cost(list_cluster[i], objects_cluster[i])
            final_cost = cost
            random.seed(self.random_state)
            random_index = random.randint(0, len(list_cluster)-1)
            list_cluster_before = list_cluster.copy()
            list_cluster = self.randomize_cluster(X, list_cluster, random_index)
            i += 1
        self.list_cluster = list_cluster_before
        return list_cluster_before
    def predict(self, X):
        result = []
        for data in X:
            cost = []
            for cluster in self.list_cluster:
                cost.append(self.count_distance(data, cluster))
            index_min_cost = np.argmin(cost)
            result.append(index_min_cost)
        return result

In [31]:
k_medoids = KMedoids(n_cluster=2, random_state=14)
X = [[1, 2, 3, 4], [5, 6, 7, 8], [201, 203, 205, 207], [200, 202, 204, 206]]
cluster = k_medoids.make_list_cluster(X)

print('cluster hasil fit : ', k_medoids.fit(X))
print('predict cluster : ', k_medoids.predict(X))

cluster hasil fit :  [[201, 203, 205, 207], [5, 6, 7, 8]]
predict cluster :  [1, 1, 0, 0]


In [8]:
np.array(X)

array([[  1,   2,   3,   4],
       [  5,   6,   7,   8],
       [201, 203, 205, 207],
       [200, 202, 204, 206]])

In [24]:
iris = load_iris()
X_iris = iris.data.tolist()
y_iris = iris.target
k_medoids = KMedoids(n_cluster=3, random_state=14)
k_medoids.fit(X_iris)
predict_iris = np.array(k_medoids.predict(X_iris))
print('purity : ', purity_score(y_iris, predict_iris))

purity :  0.9


In [27]:
y_iris

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

In [26]:
predict_iris

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1,
       2, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2], dtype=int64)