<h1> Import </h1>

In [None]:
from pandas import read_csv
from pandas import DataFrame
from math import pow,sqrt
from sklearn import metrics
import numpy as np
import datetime
import random

<p> <b> Train Data </b> is used to search for the best parameters / configuration. </p>
<p> <b> Test Data </b> is used for the actual testing. </p>

<h1> Train Data - Load </h1>

In [None]:
# Load the raw table; the file has no header row, so columns are numbered.
trainData = read_csv("CencusIncome.csv", header=None)
# Column 6 is used as the label: factorize it into integer codes plus the
# array of distinct label values.
trainLabel, trainLabelLevel = trainData[6].factorize()
# Keep only the feature columns.
trainData = trainData.drop(columns=6)

<h1> Train Data - Preprocessing </h1>

In [None]:
def euclidean_distance(instance1,instance2):
    """Return the Euclidean distance between two numeric sequences.

    Both arguments are converted to NumPy float arrays and compared by
    position, so lists, arrays and pandas Series all work, including Series
    whose labels are not ``0..n-1``.

    Only the first ``len(instance1)`` components of ``instance2`` take part
    in the computation.

    Args:
        instance1: one-dimensional sequence of numbers.
        instance2: one-dimensional sequence of numbers, at least as long as
            ``instance1``.

    Returns:
        The distance as a Python float (``0.0`` for empty input).

    Raises:
        IndexError: if ``instance2`` has fewer components than ``instance1``.
    """
    first = np.asarray(instance1, dtype=float)
    second = np.asarray(instance2, dtype=float)[:first.shape[0]]
    if second.shape[0] != first.shape[0]:
        raise IndexError("instance2 has fewer components than instance1")
    offsets = first - second
    # Dot product of the offset vector with itself is the sum of squares.
    return sqrt(float(np.dot(offsets, offsets)))

def normalize(v):
    """Scale ``v`` by the reciprocal of its overall norm.

    The norm is ``np.linalg.norm(v)`` with default arguments, i.e. a single
    scalar for the whole input (per the NumPy documentation: the 2-norm for
    1-D input and the Frobenius norm for 2-D input), so every element is
    divided by the same value.

    Returns:
        ``v`` itself, unchanged, when the norm is zero; otherwise ``v / norm``.
    """
    magnitude = np.linalg.norm(v)
    # A zero norm would cause a division by zero, so pass the input through.
    return v if magnitude == 0 else v / magnitude

In [None]:
# Scale the training features with normalize() and print the result for a
# quick visual check.
normalizedTrain = normalize(trainData)
print(normalizedTrain)

In [None]:
# Number of rows (instances) in the normalised training table.
normalizedTrain.shape[0]

<h1> Clustering </h1>

In [None]:
def classify_k_medoids(instance, medoids):
    """Find the medoid closest to ``instance``.

    Distances are Euclidean and computed by position: the i-th component of
    ``instance`` is compared with the i-th column of ``medoids``. The medoid
    rows are also addressed by position, so the DataFrame's index labels do
    not need to be ``0..k-1``.

    Args:
        instance: one-dimensional numeric sequence (list, array or Series)
            with one value per medoid column.
        medoids: DataFrame with one medoid per row.

    Returns:
        A two-element list ``[position, distance]``: the row position of the
        nearest medoid and the distance to it. Ties go to the first medoid.

    Raises:
        ValueError: if ``medoids`` has no rows, or the number of components
            does not match the number of medoid columns.
    """
    target = np.asarray(instance, dtype=float)
    # Broadcasting subtracts the instance from every medoid row at once.
    offsets = medoids.to_numpy(dtype=float) - target
    distances = np.sqrt((offsets * offsets).sum(axis=1))
    nearest = np.argmin(distances)
    return [nearest, distances[nearest]]

def _assign_to_medoids(points, medoid_points):
    """Assign every row of ``points`` to its nearest medoid.

    Args:
        points: 2-D float array, one row per instance.
        medoid_points: 2-D float array, one row per medoid.

    Returns:
        ``(labels, total_error)`` where ``labels`` is an int array holding the
        row position of the nearest medoid for each instance (ties go to the
        first medoid) and ``total_error`` is the sum of the Euclidean
        distances from each instance to its nearest medoid.
    """
    distances = np.empty((points.shape[0], medoid_points.shape[0]))
    # One pass per medoid keeps the temporary arrays at the size of ``points``.
    for position, medoid in enumerate(medoid_points):
        offsets = points - medoid
        distances[:, position] = np.sqrt((offsets * offsets).sum(axis=1))
    labels = distances.argmin(axis=1).astype(int)
    return labels, float(distances.min(axis=1).sum())


def k_medoids(df, k, max_iterations, verbose=True):
    """Cluster the rows of ``df`` around ``k`` medoids by randomised swaps.

    The search starts from ``k`` rows sampled from ``df``. Each step picks a
    random cluster of the best configuration found so far, replaces its
    medoid with a random member of that cluster, and re-assigns every row to
    its nearest medoid. A swap is kept only when it lowers the total error
    (sum of Euclidean distances to the nearest medoid); otherwise the best
    configuration is restored. The search stops after ``max_iterations``
    consecutive swaps without improvement.

    Rows are addressed by position, so ``df`` may carry any index.

    Args:
        df: DataFrame of numeric features, one instance per row.
        k: number of clusters; must be at least 1 and at most ``len(df)``.
        max_iterations: number of consecutive non-improving swaps tolerated
            before stopping. Zero or a negative value returns the initial
            configuration without attempting any swap.
        verbose: when true (the default), print progress after every step
            and a summary at the end.

    Returns:
        A dict with ``"prediction"`` (int array with the cluster id of each
        row, in row order) and ``"medoids"`` (DataFrame with one medoid per
        row, indexed ``0..k-1``).

    Raises:
        ValueError: if ``k`` is smaller than 1, or if ``df.sample`` cannot
            draw ``k`` rows.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    def log(*values):
        if verbose:
            print(*values)

    row_count = len(df.index)
    col_count = len(df.columns)
    points = df.to_numpy(dtype=float)

    # Initial configuration: k sampled rows, re-indexed as cluster ids 0..k-1.
    best_medoids = df.sample(k).reset_index(drop=True)
    log(0)
    log(datetime.datetime.now())
    best_pred, best_error = _assign_to_medoids(
        points, best_medoids.to_numpy(dtype=float)
    )
    log(best_error)
    log(best_medoids)

    step = 1
    stale_swaps = 0
    while stale_swaps < max_iterations:
        # Pick the replacement medoid among the members of a random cluster.
        random_class = random.randint(0, k - 1)
        members = np.flatnonzero(best_pred == random_class)
        if members.size:
            candidate_row = int(members[random.randint(0, members.size - 1)])
        else:
            # An empty cluster (e.g. duplicated medoids) has no member to
            # promote, so re-seed it with any row of the data set.
            candidate_row = random.randint(0, row_count - 1)
        log("sum member of class" + str(members.size))
        log("random medoid" + str(candidate_row))

        # Work on a copy so that a rejected swap leaves the best medoids intact.
        medoids = best_medoids.copy()
        for column in range(col_count):
            medoids.iat[random_class, column] = df.iat[candidate_row, column]
        log(best_medoids)
        log(medoids)

        log(step)
        log(datetime.datetime.now())
        pred, error = _assign_to_medoids(points, medoids.to_numpy(dtype=float))
        log(error)

        # Stop condition: count consecutive swaps that fail to improve.
        if error >= best_error:
            stale_swaps += 1
        else:
            stale_swaps = 0
            best_error, best_pred, best_medoids = error, pred, medoids
        step += 1

    log("best error = " + str(best_error))
    log("best pred = " + str(best_pred))
    log("best medoids = " + str(best_medoids))
    return {"prediction": best_pred, "medoids": best_medoids}

In [None]:
# Cluster the normalised training rows into two groups; the search stops after
# a single swap that fails to improve the error.
trainResult = k_medoids(normalizedTrain, k=2, max_iterations=1)

<h1> Train Data - Prepare Prediction Result </h1>

In [None]:
# Put the predicted cluster ids into a single DataFrame column and factorize
# it, giving integer codes plus the distinct cluster ids. With pandas' default
# settings the codes follow order of first appearance, the same encoding that
# was applied to the true labels when the data was loaded.
predictedTrainLabel, predictedTrainLabelLevel = DataFrame(trainResult["prediction"])[0].factorize()

<h1> Train Data - Print Prediction Result </h1>

In [None]:
# Compare the factorized cluster ids with the factorized true labels.
print(metrics.confusion_matrix(y_true=trainLabel, y_pred=predictedTrainLabel))
print(metrics.accuracy_score(y_true=trainLabel, y_pred=predictedTrainLabel))

<h1> Test Data - Load </h1>

In [None]:
# NOTE(review): this reads the same "CencusIncome.csv" file as the training
# cell, so the "test" rows are the training rows -- confirm whether a separate
# test file was intended.
testData = read_csv("CencusIncome.csv", header=None)
# Column 6 is used as the label: factorize it into integer codes plus the
# array of distinct label values.
testLabel, testLabelLevel = testData[6].factorize()
# Keep only the feature columns.
testData = testData.drop(columns=6)

<h1> Test Data - Preprocess </h1>

In [None]:
# Scale the test features with the same normalize() helper used for training.
normalizedTest = normalize(testData)

<h1> Test Data - Running </h1>

In [None]:
# Cluster the normalised test rows with the same settings as the training run
# (two clusters, stop after one non-improving swap). The clustering is run
# again from scratch; the medoids found on the training data are not reused.
testResult = k_medoids(normalizedTest, k=2, max_iterations=1)

<h1> Test Data - Prepare Prediction Result </h1>

In [None]:
# Put the predicted cluster ids into a single DataFrame column and factorize
# it, giving integer codes plus the distinct cluster ids. With pandas' default
# settings the codes follow order of first appearance, the same encoding that
# was applied to the true labels when the data was loaded.
predictionTestLabel, predictionTestLabelLevel = DataFrame(testResult["prediction"])[0].factorize()

<h1> Test Data - Print Prediction Result </h1>

In [None]:
# Compare the factorized cluster ids with the factorized true labels.
print(metrics.confusion_matrix(y_true=testLabel, y_pred=predictionTestLabel))
print(metrics.accuracy_score(y_true=testLabel, y_pred=predictionTestLabel))