In [46]:
from scipy.io import arff
import pandas as pd
import numpy as np
from sklearn import preprocessing
import math
from copy import deepcopy

In [2]:
# Read the KC1 ARFF file; loadarff returns a (records, metadata) pair.
data = arff.loadarff('./Data/kc1.arff.txt')
df = pd.DataFrame(data[0])
# The label column arrives as bytes (b'true' / b'false'): decode it to text and
# map the two class names onto real booleans in one step.
df['defects'] = df['defects'].str.decode('utf-8').map({'true': True, 'false': False})
df.defects.value_counts()

False    1783
True      326
Name: defects, dtype: int64

In [169]:
from sklearn.model_selection import StratifiedKFold
# Keep only the rows labelled non-defective (defects == False).
data_set = df[df.defects == False]
# The last column is used as the stratification target — presumably the `defects`
# label, which would make Y constant here; confirm against the ARFF column order.
Y = data_set.iloc[:, -1]
# Every other column is a feature; convert them to a plain NumPy array.
data_set = data_set.iloc[:, :-1]
data_set = np.array(data_set.values)
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=1)
# NOTE: split() yields one (train_indices, test_indices) pair per fold. With
# n_splits=2 this unpacking binds `train_index` to fold 0's whole pair and
# `test_index` to fold 1's whole pair — not to a train array and a test array.
# Any other n_splits value makes this line raise on unpacking.
train_index, test_index = skf.split(data_set, Y)

In [170]:
# `train_index` and `test_index` each hold one fold's (train, test) index pair, so
# [0] selects that fold's *training* indices. With exactly two folds, fold 1's
# training rows are fold 0's held-out rows, so the two sets below do not overlap.
train_set = data_set[train_index[0]]
test_set = data_set[test_index[0]]

In [171]:
def get_init_centers(k, n_instances):
    """Draw ``k`` distinct row indices to serve as the initial medoids.

    Indices are sampled with NumPy's global random state, so results are
    reproducible only if the caller seeds it beforehand.

    Parameters
    ----------
    k : int
        Number of distinct indices to draw.
    n_instances : int
        Size of the index range; indices come from ``[0, n_instances)``.

    Returns
    -------
    list of int
        ``k`` distinct indices, in the order they were drawn. Empty when
        ``k <= 0``.

    Raises
    ------
    ValueError
        If ``k`` exceeds ``n_instances``: that many distinct indices do not
        exist, and the sampling loop below would never terminate.
    """
    if k > n_instances:
        raise ValueError(
            "cannot draw %d distinct centers from %d instances" % (k, n_instances)
        )
    init_ids = []
    # Rejection sampling: redraw whenever an index was already picked.
    while len(init_ids) < k:
        index = np.random.randint(0, n_instances)
        if index not in init_ids:
            init_ids.append(index)
    return init_ids

def calc_distance(x, y):
    """Return the Euclidean (L2) distance between ``x`` and ``y``.

    Both arguments must support element-wise subtraction with each other
    (NumPy arrays of the same shape, or plain numbers).
    """
    delta = x - y
    squared_total = np.sum(np.square(delta))
    return np.sqrt(squared_total)

def get_cost(X, centers_id):
    """Assign every instance to its nearest medoid and total the distances.

    Parameters
    ----------
    X : array-like, shape (n_instances, n_features)
        Instance matrix. Nested lists are accepted and converted to a float
        array; a 1-D input is treated as one feature per instance.
    centers_id : sequence of int
        Row indices of ``X`` that act as the medoids.

    Returns
    -------
    members : ndarray of float, shape (n_instances,)
        For each instance, the position in ``centers_id`` of its nearest
        medoid (ties go to the lowest position).
    costs : ndarray of float, shape (n_centers,)
        Per-medoid sum of Euclidean distances to the instances assigned to it.
    total_cost : float
        Sum of ``costs``.
    dists : ndarray of float, shape (n_instances, n_centers)
        Euclidean distance from every instance to every medoid.

    Raises
    ------
    ValueError
        If ``centers_id`` is empty.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim == 1:
        X = X[:, np.newaxis]
    n_centers = len(centers_id)
    if n_centers == 0:
        raise ValueError("centers_id must contain at least one medoid index")

    dists = np.zeros((len(X), n_centers))
    for j, center_id in enumerate(centers_id):
        # One vectorised pass over all rows per medoid, instead of one Python
        # call per (instance, medoid) pair.
        dists[:, j] = np.sqrt(np.sum((X - X[center_id]) ** 2, axis=1))
        # A medoid is at distance zero from itself by definition, even when
        # its row holds non-finite values.
        dists[center_id, j] = 0.0

    # Nearest medoid per instance; argmin keeps the first column on ties.
    mask = np.argmin(dists, axis=1)
    # Cluster labels stay a float array, as callers have always received.
    members = mask.astype(float)
    costs = np.zeros(n_centers)
    for j in range(n_centers):
        costs[j] = np.sum(dists[mask == j, j])
    return members, costs, np.sum(costs), dists

In [191]:
def kmedoids(X, n_clusters, max_iter=1000):
    """Cluster the rows of ``X`` around ``n_clusters`` medoids by greedy swapping.

    Starting from randomly chosen medoids, every non-medoid row is tried in
    place of every current medoid; a swap is kept as soon as it strictly
    lowers the total cost reported by ``get_cost``. Full passes over the data
    repeat until one pass makes no swap or ``max_iter`` passes have run.

    Parameters
    ----------
    X : ndarray, shape (n_instances, n_features)
        Instance matrix.
    n_clusters : int
        Number of medoids; must satisfy ``1 <= n_clusters <= n_instances``.
    max_iter : int, optional
        Upper bound on the number of full passes (default 1000).

    Returns
    -------
    centers : list of int
        Row indices of the final medoids.
    members, costs, total_cost, dists
        The values returned by ``get_cost`` for the final medoids.

    Raises
    ------
    ValueError
        If ``n_clusters`` is outside ``[1, n_instances]``. Without this check
        an oversized request would loop forever while drawing initial medoids.
    """
    n_instances = X.shape[0]
    if not 1 <= n_clusters <= n_instances:
        raise ValueError(
            "n_clusters must be between 1 and %d, got %r" % (n_instances, n_clusters)
        )

    centers = get_init_centers(n_clusters, n_instances)
    members, costs, total_cost, dists = get_cost(X, centers)
    count, swapped = 0, True
    while count < max_iter and swapped:
        swapped = False
        for i in range(n_instances):
            if i in centers:
                continue
            for j in range(len(centers)):
                # A flat list of ints needs only a shallow copy.
                candidate = list(centers)
                candidate[j] = i
                members_, costs_, total_cost_, dists_ = get_cost(X, candidate)
                if total_cost_ < total_cost:
                    members, costs, total_cost, dists = members_, costs_, total_cost_, dists_
                    centers = candidate
                    swapped = True
                    # Row i is now a medoid; trying it in the remaining slots
                    # would only build medoid lists that contain i twice.
                    break
        count += 1
    return centers, members, costs, total_cost, dists

In [None]:
# Run the swap-based k-medoids search on the training rows with a single cluster.
centers, members, costs, total_cost, dists = kmedoids(train_set, 1)