<h1> Import </h1>

In [None]:
from pandas import read_csv
from pandas import DataFrame
from math import pow,sqrt
from sklearn import metrics
import numpy as np
import datetime

<p> <b> Train Data </b> is used to search for the best parameters / configuration. </p>
<p> <b> Test Data </b> is used for the actual testing. </p>

<h1> Train Data - Load </h1>

In [None]:
# Read the CSV without treating any row as a header, so columns are numbered.
trainData = read_csv("CencusIncome.csv", header=None)
# Column 6 is used as the class label: factorize() turns it into integer
# codes and also returns the distinct original values.
trainLabel, trainLabelLevel = trainData[6].factorize()
# Everything except the label column is kept as the feature table.
trainData = trainData.drop(columns=[6])

<h1> Train Data - Preprocessing </h1>

In [None]:
def euclidean_distance(instance1,instance2):
    """Return the Euclidean (L2) distance between two numeric sequences.

    Coordinates are paired by *position*, so the function works for lists,
    tuples, NumPy arrays and pandas Series regardless of how a Series is
    indexed (integer subscripting of a Series is a label lookup and breaks
    as soon as the labels are not exactly 0..n-1).

    Args:
        instance1: Iterable of numbers; its length defines the dimension.
        instance2: Iterable of numbers with at least as many entries as
            ``instance1``; any extra trailing entries are ignored.

    Returns:
        The distance as a Python ``float``.

    Raises:
        IndexError: If ``instance2`` has fewer coordinates than ``instance1``.
    """
    coords1 = list(instance1)
    coords2 = list(instance2)
    if len(coords2) < len(coords1):
        raise IndexError("instance2 has fewer coordinates than instance1")
    # zip() stops at the shorter sequence, i.e. at len(coords1).
    squared_total = sum((a - b) ** 2 for a, b in zip(coords1, coords2))
    return sqrt(squared_total)

def normalize(v):
    """Divide ``v`` by its norm so the result has unit norm.

    The norm is ``np.linalg.norm`` with its default ``ord``: the 2-norm for
    1-D input and the Frobenius norm for 2-D input, so a whole table is
    scaled by one single scalar rather than column by column.

    Args:
        v: Array-like of numbers (e.g. a NumPy array or a DataFrame).

    Returns:
        ``v`` itself, untouched, when its norm is zero (avoids dividing by
        zero); otherwise ``v / norm``.
    """
    magnitude = np.linalg.norm(v)
    return v if magnitude == 0 else v / magnitude

In [None]:
# Scale the training features by the norm of the whole table, then show them.
normalizedTrain = normalize(v=trainData)
print(normalizedTrain)

In [None]:
# Number of rows in the normalized training table.
normalizedTrain.shape[0]

<h1> Training - Running </h1>

In [None]:
def classify_k_means(instance, centroids):
    """Return the position of the centroid closest to ``instance``.

    Rows of ``centroids`` are addressed by position, not by index label, so
    the result is always in ``range(len(centroids))`` even when the frame
    does not carry a default 0..k-1 index.

    Args:
        instance: One data point as a sequence of numbers (list, array or
            Series) with one value per centroid column.
        centroids: DataFrame with one centroid per row.

    Returns:
        The integer position of the nearest centroid, as returned by
        ``np.argmin`` (the first one wins on ties).

    Raises:
        ValueError: If ``centroids`` has no rows, or if the shapes of
            ``instance`` and ``centroids`` cannot be broadcast together.
    """
    point = np.asarray(instance, dtype=float)
    centers = centroids.to_numpy(dtype=float)
    # Squared distances are enough: the square root is monotonic, so it does
    # not change which centroid is the nearest.
    squared_distances = ((centers - point) ** 2).sum(axis=1)
    return np.argmin(squared_distances)

def k_means(df, k, max_iter=None, verbose=True):
    """Cluster the rows of ``df`` into ``k`` groups with k-means.

    Starting from ``k`` randomly sampled rows as centroids, the function
    alternates between assigning every row to its nearest centroid and
    moving each centroid to the mean of its assigned rows, until the
    centroids stop changing (``np.allclose``) or ``max_iter`` is reached.

    Rows and columns are handled by position, so ``df`` may carry any index
    or column labels. A cluster that receives no rows keeps its previous
    centroid instead of being divided by zero, which would otherwise yield
    NaN centroids that can never satisfy the convergence test.

    Args:
        df: DataFrame of numeric features, one sample per row.
        k: Number of clusters; must be at least 1 and at most ``len(df)``.
        max_iter: Optional upper bound on the number of iterations.
            ``None`` (the default) iterates until convergence.
        verbose: When true (the default), print the iteration number, a
            timestamp, the previous and new centroids and the cluster sizes
            on every iteration.

    Returns:
        A dict with two entries: ``"prediction"``, an integer array holding
        the cluster position assigned to each row, and ``"centroids"``, a
        DataFrame with ``k`` rows (index 0..k-1) and the columns of ``df``.

    Raises:
        ValueError: If ``k`` is smaller than 1, or (from ``DataFrame.sample``)
            larger than the number of rows.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Plain float arrays make every access positional and let NumPy do the
    # per-row work instead of Python-level loops over cells.
    points = df.to_numpy(dtype=float)
    centers = df.sample(k).to_numpy(dtype=float)
    pred = np.zeros(len(points), dtype=int)
    iteration = 0

    while max_iter is None or iteration < max_iter:
        if verbose:
            print()
            print(iteration)
            print(datetime.datetime.now())

        previous = centers

        # Assignment step: squared distance of every row to every centroid,
        # shape (n_rows, k); the square root would not change the argmin.
        squared_distances = np.stack(
            [((points - center) ** 2).sum(axis=1) for center in previous],
            axis=1,
        )
        pred = np.argmin(squared_distances, axis=1).astype(int)
        countLabel = np.bincount(pred, minlength=k).astype(int)

        # Update step: mean of the members; empty clusters stay where they are.
        centers = previous.copy()
        for label in range(k):
            if countLabel[label] > 0:
                centers[label] = points[pred == label].mean(axis=0)

        if verbose:
            print(DataFrame(previous, columns=df.columns))
            print(DataFrame(centers, columns=df.columns))
            print(countLabel)

        iteration += 1
        if np.allclose(previous, centers):
            break

    return {
        "prediction": pred,
        "centroids": DataFrame(centers, columns=df.columns),
    }

In [None]:
# Cluster the normalized training rows into two groups.
trainResult = k_means(df=normalizedTrain, k=2)

<h1> Train Data - Prepare Prediction Result </h1>

In [None]:
# Put the cluster ids in a one-column frame and factorize that column, so the
# ids are renumbered by order of first appearance, the same encoding that
# was applied to the true labels.
predictedTrainLabel, predictedTrainLabelLevel = (
    DataFrame(trainResult["prediction"])[0].factorize()
)

<h1> Train Data - Print Prediction Result </h1>

In [None]:
# Compare the encoded true labels with the encoded cluster assignments.
print(metrics.confusion_matrix(y_true=trainLabel, y_pred=predictedTrainLabel))
print(metrics.accuracy_score(y_true=trainLabel, y_pred=predictedTrainLabel))

<h1> Test Data - Load </h1>

In [None]:
# NOTE(review): this reads the same file as the train cell, so "test" and
# "train" are currently identical data. Confirm whether a separate
# test file was intended.
testData = read_csv("CencusIncome.csv", header=None)
# Column 6 is used as the class label, encoded as integer codes plus the
# list of distinct original values.
testLabel, testLabelLevel = testData[6].factorize()
# Keep every remaining column as features.
testData = testData.drop(columns=[6])

<h1> Test Data - Preprocessing </h1>

In [None]:
# Scale the test features by the norm of the whole test table.
normalizedTest = normalize(v=testData)

<h1> Test Data - Running </h1>

In [None]:
# Run a fresh two-cluster k-means on the test rows; the centroids found on
# the training data are not reused here.
testResult = k_means(df=normalizedTest, k=2)

<h1> Test Data - Prepare Prediction Result </h1>

In [None]:
# Renumber the cluster ids by order of first appearance (via factorize on a
# one-column frame) so they use the same encoding as the true labels.
predictionTestLabel, predictionTestLabelLevel = (
    DataFrame(testResult["prediction"])[0].factorize()
)

<h1> Test Data - Print Prediction Result </h1>

In [None]:
# Compare the encoded true test labels with the encoded cluster assignments.
print(metrics.confusion_matrix(y_true=testLabel, y_pred=predictionTestLabel))
print(metrics.accuracy_score(y_true=testLabel, y_pred=predictionTestLabel))

<h1> Benchmarking - Using SKLearn </h1>

In [None]:
from sklearn import cluster
# Reference clustering: fit scikit-learn's KMeans with two clusters on each
# of the normalized tables, independently of one another.
benchmarkTrainResult = cluster.KMeans(n_clusters=2).fit(X=normalizedTrain)
benchmarkTestResult = cluster.KMeans(n_clusters=2).fit(X=normalizedTest)

In [None]:
# Score the benchmark model fitted on the training table against the
# training labels (it was previously compared with the test labels, which
# only matched because both cells load the same file).
# NOTE(review): labels_ are raw cluster ids and, unlike the hand-written
# pipeline, are not renumbered with factorize(), so the two clusters may be
# swapped relative to the true label codes. Confirm before comparing
# accuracies.
print(metrics.confusion_matrix(trainLabel, benchmarkTrainResult.labels_))
print(metrics.accuracy_score(trainLabel, benchmarkTrainResult.labels_))

In [None]:
# Score the benchmark model fitted on the test table against the test labels.
print(metrics.confusion_matrix(y_true=testLabel, y_pred=benchmarkTestResult.labels_))
print(metrics.accuracy_score(y_true=testLabel, y_pred=benchmarkTestResult.labels_))