In [3]:
import time
import random
import numpy as np
import pandas as pd
from sklearn import cluster, datasets
from sklearn.metrics import accuracy_score
from sklearn import preprocessing

from pyspark.sql import SparkSession
from pyspark.sql import Row

# Data Preprocessing

## Abalone dataset

In [4]:
#data_x : features (Dataframe)
#data_y : labels   (np.array)
'''
column_names = ["sex", "length", "diameter", "height", "whole weight", 
                "shucked weight", "viscera weight", "shell weight", "rings"]
abalone = pd.read_csv("./dataset/abalone.data",names=column_names)

#0,1,2 labeling sex
data_x = abalone.drop(columns=["rings"])
data_x = data_x.replace(['M','F','I'],[0,1,2])

#one-hot encoding
dfDummies = pd.get_dummies(data_x['sex'], prefix = 'sex')
data_x = pd.concat([data_x.drop(columns=['sex']), dfDummies], axis=1)

data_y = np.array(abalone["rings"])
data_y[data_y<11]    = 0
data_y[data_y>=11]   = 1
Simulate_time = 10
data_x
'''

'\ncolumn_names = ["sex", "length", "diameter", "height", "whole weight", \n                "shucked weight", "viscera weight", "shell weight", "rings"]\nabalone = pd.read_csv("./dataset/abalone.data",names=column_names)\n\n#0,1,2 labeling sex\ndata_x = abalone.drop(columns=["rings"])\ndata_x = data_x.replace([\'M\',\'F\',\'I\'],[0,1,2])\n\n#one-hot encoding\ndfDummies = pd.get_dummies(data_x[\'sex\'], prefix = \'sex\')\ndata_x = pd.concat([data_x.drop(columns=[\'sex\']), dfDummies], axis=1)\n\ndata_y = np.array(abalone["rings"])\ndata_y[data_y<11]    = 0\ndata_y[data_y>=11]   = 1\nSimulate_time = 10\ndata_x\n'

## Iris Dataset

In [5]:
#data_x : features (np.array)
#data_y : labels   (np.array)
spark = SparkSession \
                .builder \
                .appName("K-Means:origin") \
                .config("spark.some.config.option", "some-value") \
                .getOrCreate()
sc = spark.sparkContext
line = sc.textFile("./dataset/c20d6n200000.txt")
data_x = spark.createDataFrame(line.map(lambda r : r.split(",")).collect()).toPandas()

Simulate_time = 1
K = 3

converge_dist = 0.05

In [8]:
class our_k_means:
    def __init__(self,data_x, K):
        self.data_x = pd.DataFrame(preprocessing.scale(data_x))
        
        #資料特性
        self.DCNT = len(self.data_x)               #資料個數
        self.DIM  = len(self.data_x.columns)       #資料維度
        self.K    = K                              #叢聚個數
        #self.K    = np.amax(self.data_y)+1        #叢聚個數
        self.MAX_ITER = 30                        #最大迭代 
        self.MIN_PT = 0                            #最小變動點
        
        #k-means過程的參數
        self.data =[]                       #data[DCNT][DIM]:資料
        self.cent =[]                       #cent[K][DIM]   :各centroid的座標
        self.table=[]                       #table[DCNT]    :各資料的所屬cluster
        self.dis_k=[]                       #dis_k[K][DIM]  :各cluster的座標和
        self.cent_c=[]                      #cent_c[K]      :各cluster的擁有資料數和
        self.ch_pt = 0                      #ch_pt          :紀錄變動點個數
        self.iterl = 0 
        self.sse2 = 0
        
        #計算acc時的參數
        self.origin_mass = []
        self.cent_name = []
        
    
    #run k-means    
    def run(self):
        
        #initialize tables
        self.kmeans_init()   #初始化centroid
                
        #first iteration 
        self.ch_pt = 0          
        self.iterl = 0
        self.sse2 = self.update_table()
        sse1 = self.sse2-1
        
        
        #update centroid & data clustering
        while self.iterl<self.MAX_ITER and abs(sse1-self.sse2)>0.05 :
            sse1 = self.sse2
            self.iterl+=1
            self.update_cent()
            self.sse2 = self.update_table()
            print(abs(self.sse2-sse1))
        
        self.table = self.table.astype(int)
        
        

#---------------------------------------------------------------------------------
#------------------------------Subfunctions of run()------------------------------
#---------------------------------------------------------------------------------    
    def kmeans_init(self):
        
        self.data = self.data_x.values
        self.cent = np.zeros((self.K,self.DIM))
        self.table= np.zeros(self.DCNT)
        self.dis_k= np.zeros((self.K,self.DIM))
        self.cent_c=np.zeros(self.K)
                
        pick = []
        counter = 0
        while(counter<self.K):
            rnd = random.randint(0,self.DCNT-1)
            if(rnd not in pick):
                pick.append(rnd)
                counter=counter+1
                
        for i in range(self.K):
            for j in range(self.DIM):
                self.cent[i][j] = self.data[pick[i]][j] 
        
    
    def cal_distance(self,x,y):
        sum = 0
        for i in range(self.DIM):
            sum = sum + (self.data[x][i]-self.cent[y][i])*( self.data[x][i]-self.cent[y][i])
        return sum

            
    def update_table(self):
        t_sse = 0
        self.ch_pt = 0 
        
        for i in range(self.DCNT):
            min_dis = self.cal_distance(i,0)
            min_k=0
            for j in range(1,self.K):
                dis = self.cal_distance(i,j)
                if(dis<min_dis):
                    min_dis = dis
                    min_k = j
            self.ch_pt+=(self.table[i]!=min_k)
            self.table[i] = min_k
            self.cent_c[min_k] +=1
            t_sse+=min_dis
            for j in range(self.DIM):
                self.dis_k[min_k][j]+=self.data[i][j]
                
        return t_sse

    def update_cent(self):
        for i in range(self.K):
            for j in range(self.DIM):
                if self.cent_c[i] != 0:
                    self.cent[i][j] = self.dis_k[i][j]/self.cent_c[i]
                else:
                    self.cent[i][j] = self.dis_k[i][j]

    def print_cent(self):
        print("Centroids:")
        print(self.cent)

    def print_result(self):
        print("K means:")
        print(self.table)
        print("sse = ",end='')
        print(self.sse2)
        print("ch_pt = ",end='')
        print(self.ch_pt)
        print("iter = ",end='')
        print(self.iterl)     
    

## Here comes our k-means
### Let's run for 1 time and check the performance

In [9]:
result = our_k_means(data_x,3)
start = time.time()
result.run()
result.print_cent()
end = time.time()
print("Total time : ",end="")
print(end-start)

  This is separate from the ipykernel package so we can avoid doing imports until


383244.9965154341
8884.10250949033
3168.718579864828
1440.9397754393285
807.18328226218
529.7460536226863
395.289251187467
315.9153376229806
271.8688437063247
242.66101575340144
218.29773011803627
201.47593350196257
189.9994695878122
177.26978130161297
165.58151152392384
155.76283880299889
144.56890569336247
135.2255418151617
126.54755119769834
118.77321510843467
110.41746157337911
102.5093450259883
95.4896971808048
87.95853491686285
81.57216850738041
76.09386691986583
70.61524614854716
65.45803576835897
60.69230003154371
56.19925281370524
Centroids:
[[-0.01343986  0.58024062  0.31009633 -0.90865086  0.48468029  0.49605311]
 [-0.00438008 -0.17928653 -0.30765258  0.49559251 -0.98736629 -0.11493114]
 [ 0.02564292 -0.49445572  0.08803391  0.41704802  0.9799146  -0.48647843]]
Total time : 241.83338809013367


In [10]:
result.print_cent()

Centroids:
[[-0.01343986  0.58024062  0.31009633 -0.90865086  0.48468029  0.49605311]
 [-0.00438008 -0.17928653 -0.30765258  0.49559251 -0.98736629 -0.11493114]
 [ 0.02564292 -0.49445572  0.08803391  0.41704802  0.9799146  -0.48647843]]


## Then , we run it for (Simulate_time) times

In [12]:
# Calculate
result.calculate_acc(Simulate_time)
result.acc

Average accuracy for  10  times :  0.6681829063921474


0.6681829063921474

# Let's run the k-means provides by sklearn
### -Then we can estimate how good we've done

In [13]:
DIM  = len(data_x.columns)       #資料維度
K    = len(np.unique(data_y))    #叢聚個數
label= pd.DataFrame(data_y)


def k_means_sklearn(x):
    
    # KMeans 演算法
    kmeans_fit = cluster.KMeans(n_clusters = K).fit(x)

    # 測試分群結果
    cluster_labels = kmeans_fit.predict(x)
    
    return cluster_labels

In [14]:
#all the clusters should be 1-D DataFrame which contains the same labels
def find_mass(k,dim,table,data):
    mass = np.zeros((k,dim))
    num = np.zeros(k)
    row_count = 0

    for i in table.values:
        for j in range(dim):
            mass[i][j] += data.iloc[row_count][j]
        row_count += 1
        num[i] += 1
        
    for i in range(k):
        for j in range(dim):
            mass[i][j] /= num[i]
    
    return mass

In [15]:
def calculate_closest(k,origin,after_clustering):
    closest = np.zeros(k)
    for i in range(k):
        min_dist=float("inf")
        for j in range(k):
            dist = np.linalg.norm(after_clustering[i]-origin[j])
            if dist < min_dist:
                min_dist = dist
                closest[i]=j
    return closest 

In [16]:
def relabel(origin_table,rename_table,target):
    target = target.replace(origin_table,rename_table)
    return target

In [17]:

cluster_labels = k_means_sklearn(data_x) # sklearn.cluster.k_means_sklearn

In [18]:
#A list that can be used to compared the order of label
temp = np.arange(K)
#Turn the labels trained by skilearn into DataFrame format
cluster_labels = pd.DataFrame(cluster_labels)

#Find the mass of data with trained labels
mass_sklean_kmeans = find_mass(K,DIM,cluster_labels.iloc[0:,0],data_x)
#Find tha mass of data with original labels
mass_origin        = find_mass(K,DIM,label.iloc[0:,0],data_x)
#Fine the correct cluster names & Relabel
closest_sklearn    = calculate_closest(K,mass_origin,mass_sklean_kmeans).astype(int)
cluster_labels     = relabel(temp,closest_sklearn ,cluster_labels)

In [19]:
#Valid accuracy
sklearn_acc = accuracy_score(data_y,cluster_labels)
sklearn_acc

0.5937275556619583