<h1> Import </h1>

In [1]:
from pandas import read_csv
from pandas import DataFrame
from math import pow,sqrt
from sklearn import metrics
import numpy as np
import datetime

<h1> Train Data </h1>

In [2]:
cencus = read_csv("CencusIncome.csv", header = None)
cencusData = cencus.drop(6, axis = 1)
cencusData

Unnamed: 0,0,1,2,3,4,5
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40
5,37,284582,14,0,0,40
6,49,160187,5,0,0,16
7,52,209642,9,0,0,45
8,31,45781,14,14084,0,50
9,42,159449,13,5178,0,40


<h1> Preprocessing </h1>

In [45]:
def euclidean_distance(instance1,instance2):
#     asumsi jumlah tabel sama
    columnCount = len(instance1)
    sum = 0
    for i in range(columnCount):
        sum += pow(instance1[i]-instance2[i],2)
    return sqrt(sum)

def normalize(v):
    norm = np.linalg.norm(v,ord=1)
    if norm == 0: 
       return v
    return v / norm

In [46]:
# print(euclidean_distance(cencusData.loc[1],cencusData.loc[2]))
normalizedData = normalize(cencusData) 
print(normalizedData)

                  0         1             2             3             4  \
0      6.311320e-09  0.000013  2.103773e-09  3.518156e-07  0.000000e+00   
1      8.091435e-09  0.000013  2.103773e-09  0.000000e+00  0.000000e+00   
2      6.149491e-09  0.000035  1.456458e-09  0.000000e+00  0.000000e+00   
3      8.576921e-09  0.000038  1.132801e-09  0.000000e+00  0.000000e+00   
4      4.531204e-09  0.000055  2.103773e-09  0.000000e+00  0.000000e+00   
5      5.987662e-09  0.000046  2.265602e-09  0.000000e+00  0.000000e+00   
6      7.929607e-09  0.000026  8.091435e-10  0.000000e+00  0.000000e+00   
7      8.415093e-09  0.000034  1.456458e-09  0.000000e+00  0.000000e+00   
8      5.016690e-09  0.000007  2.265602e-09  2.279195e-06  0.000000e+00   
9      6.796806e-09  0.000026  2.103773e-09  8.379490e-07  0.000000e+00   
10     5.987662e-09  0.000045  1.618287e-09  0.000000e+00  0.000000e+00   
11     4.854861e-09  0.000023  2.103773e-09  0.000000e+00  0.000000e+00   
12     3.722060e-09  0.00

In [47]:
print(euclidean_distance(normalizedData.loc[1],normalizedData.loc[2]))

2.141560235382839e-05


<h1> Training - Finding Best Parameter </h1>

In [48]:
len(normalizedData.index)

32561

In [49]:
def classify_k_means(instance, centroids):
    rowCount = len(centroids.index)
    distances = np.zeros(rowCount)
    for index, row in centroids.iterrows():
        distances[index] = euclidean_distance(instance,row)
    return np.argmin(distances)

def k_means(df, k):
    rowCount = len(df.index)
    colCount = len(df.columns)
    centroids = df.sample(k)
    centroids = centroids.reset_index(drop=True)
    isConvergence = False;
    i = 0
    
    while (not isConvergence):
        print()
        print(i)
        print(datetime.datetime.now())
        prev_centroids = centroids.copy()
        centroids[:] = 0.0 #Set all to zero
        pred = np.zeros(rowCount).astype(int)
        countLabel = np.zeros(k).astype(int)
        #Classify Each Row and Sum it
        for index, row in df.iterrows():
            pred[index] = classify_k_means(row, prev_centroids)
            countLabel[pred[index]] += 1
            for colIndex in range(0,colCount):
                centroids.iat[pred[index], colIndex] += row[colIndex]
        #Get new centroids (means)
        for labelIndex in range(0,k):
            for colIndex in range(0,colCount):
                centroids.iat[labelIndex, colIndex] /= countLabel[labelIndex]
        #Check convergence by comparing centroids
        convergeCheck = np.isclose(prev_centroids, centroids)
        print(prev_centroids)
        print(centroids)
        print(countLabel)
        if (convergeCheck.all()):
            isConvergence = True
        i+=1
    return {0: pred,1: centroids}

In [50]:
tmp = k_means(normalizedData, 2)


0
2017-12-02 17:51:56.748454
              0         1             2    3    4             5
0  4.369375e-09  0.000049  1.456458e-09  0.0  0.0  1.132801e-08
1  5.502176e-09  0.000099  1.456458e-09  0.0  0.0  4.854861e-09
              0         1             2             3             4  \
0  6.253661e-09  0.000030  1.632665e-09  1.741062e-07  1.422814e-08   
1  5.714926e-09  0.000091  1.561767e-09  1.895737e-07  8.870719e-09   

              5  
0  6.545858e-09  
1  6.443022e-09  
[31954   607]

1
2017-12-02 17:52:27.673672
              0         1             2             3             4  \
0  6.253661e-09  0.000030  1.632665e-09  1.741062e-07  1.422814e-08   
1  5.714926e-09  0.000091  1.561767e-09  1.895737e-07  8.870719e-09   

              5  
0  6.545858e-09  
1  6.443022e-09  
              0         1             2             3             4  \
0  6.275476e-09  0.000028  1.635314e-09  1.745912e-07  1.430584e-08   
1  5.703949e-09  0.000074  1.564077e-09  1.710620e-07  1

1  6.514359e-09  
[24244  8317]

15
2017-12-02 17:59:21.046560
              0         1             2             3             4  \
0  6.333981e-09  0.000023  1.641463e-09  1.760342e-07  1.448784e-08   
1  5.980210e-09  0.000053  1.601845e-09  1.696147e-07  1.308011e-08   

              5  
0  6.554089e-09  
1  6.514359e-09  
              0         1             2             3             4  \
0  6.334076e-09  0.000023  1.641404e-09  1.760996e-07  1.449322e-08   
1  5.980315e-09  0.000053  1.602058e-09  1.694314e-07  1.306597e-08   

              5  
0  6.553899e-09  
1  6.514956e-09  
[24235  8326]

16
2017-12-02 17:59:48.566130
              0         1             2             3             4  \
0  6.334076e-09  0.000023  1.641404e-09  1.760996e-07  1.449322e-08   
1  5.980315e-09  0.000053  1.602058e-09  1.694314e-07  1.306597e-08   

              5  
0  6.553899e-09  
1  6.514956e-09  
              0         1             2             3             4  \
0  6.334261e-09  

<h1> Read Test Data </h1>

In [9]:
test_data = read_csv("CencusIncome.csv", header=None)
test_label = test_data.loc[:,6]
test_label,test_label_level = test_label.factorize()

<h1> Testing - Using Known Parameter </h1>

In [10]:
#Test in test

<h1> Prepare Prediction Result </h1>

In [11]:
pred_label,pred_label_level = DataFrame(tmp[0])[0].factorize()

<h1> Print Prediction Result </h1>

In [12]:
print(metrics.confusion_matrix(test_label,pred_label))
print(metrics.accuracy_score(test_label, pred_label))

[[18275  6445]
 [ 5947  1894]]
0.619422007924
