In [14]:

import random
import math
import numpy as np
import pandas as pd
from sklearn import cluster, datasets
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from pyspark.sql import SparkSession
from pyspark.sql import Row


# Data Preprocessing

## Abalone dataset

In [5]:
#data_x : features (Dataframe)
#data_y : labels   (np.array)

column_names = ["sex", "length", "diameter", "height", "whole weight", 
                "shucked weight", "viscera weight", "shell weight", "rings"]
abalone = pd.read_csv("./dataset/abalone.data",names=column_names)

data_x = abalone.drop(columns=["rings"])
dfDummies = pd.get_dummies(data_x['sex'], prefix = 'sex')
data_x = pd.concat([data_x.drop(columns=['sex']), dfDummies], axis=1)

data_y = np.array(abalone["rings"])
data_y[data_y<11]    = 0
data_y[data_y>=11]   = 1
Simulate_time = 5


## Iris Dataset

In [6]:
#data_x : features (Dataframe)
#data_y : labels   (np.array)
'''
iris = datasets.load_iris()
data_x = pd.DataFrame(iris.data)

data_y = iris.target

Simulate_time = 10
'''

'\niris = datasets.load_iris()\ndata_x = pd.DataFrame(iris.data)\n\ndata_y = iris.target\n\nSimulate_time = 10\n'

In [15]:
class our_k_means:
    def __init__(self,data_x,data_y,m = 2,epsilon = 1.0e-6):
        self.data_x = pd.DataFrame(preprocessing.scale(data_x))
        self.data_y = data_y
        self.label  = pd.DataFrame(data_y)
        self.acc    = 0
        
        #資料特性
        self.DCNT = len(self.data_x)               #資料個數
        self.DIM  = len(self.data_x.columns)       #資料維度
        self.K    = len(np.unique(data_y))         #叢聚個數
        #self.K    = np.amax(self.data_y)+1        #叢聚個數
        self.MAX_ITER = 100                        #最大迭代 
        
        #k-means過程的參數
        self.m    = m                       #m              :hyper parameter,控制fuzzy程度的變數,通常為2
        self.epsilon = epsilon              #epsilon        :收斂的閾值
        self.data =[]                       #data[DCNT][DIM]:資料
        self.cent =[]                       #cent[K][DIM]   :各centroid的座標
        self.table=[]                       #table[DCNT][K] :各資料對各cluster的membership values matrix
        self.dis_k=[]                       #dis_k[K][DIM]  :各cluster的座標和
        self.cent_c=[]                      #cent_c[K]      :各cluster的擁有資料數和
        self.nearest=[]                     #nearest[DNST]  :各資料最可能屬於的cluster
        self.iterl = 0 
        self.obj_value = 0
        self.prev_obj_value = 0
        
        #計算acc時的參數
        self.origin_mass = []
        self.cent_name = []

        
    #run k-means    
    def run(self):
        
        #initialize tables
        self.kmeans_init()   #初始化centroid
                
        #first iteration 
        self.iterl = 0
        self.update_table()
        self.obj_value = self.cal_obj_func()
        self.prev_obj_value = self.obj_value*2
        
        #update centroid & data clustering
        while self.iterl<self.MAX_ITER and abs(self.prev_obj_value-self.obj_value)>=self.epsilon :
            self.prev_obj_value = self.obj_value
            self.iterl+=1
            self.update_cent()
            self.update_table()
            self.obj_value = self.cal_obj_func()
        
        #self.print_result()    
        
    #Calculate average accuracy    
    def calculate_acc(self,iterate_times):
        self.acc = 0
        i = 0
        while( i < iterate_times):
            self.run()
            self.calculate_origin_mass()
            self.cent_name = self.centroid_names()
            # Avoid the rare situations that some cluster are gone
            #if len(np.unique(self.cent_name)) != self.K:
            #    continue
                
            self.nearest_cluster()
            i += 1
            self.acc += accuracy_score(self.data_y,self.nearest)
            
            #self.print_result()
        
        if iterate_times is not 0:
            self.acc /= iterate_times
        
        print("Average accuracy for ",iterate_times," times : ",self.acc)
        return self.acc
#---------------------------------------------------------------------------------
#----------------Subfunctions of calculate_acc(iterate_times)---------------------
#---------------------------------------------------------------------------------
    def centroid_names(self):
        cent_name = np.zeros(self.K)
        
        for i in range(self.K):
            min_dist=float("inf")
            name = 0
            for j in range(self.K):
                dist = np.linalg.norm(self.cent[i] - self.origin_mass[j])
                if dist < min_dist:
                    min_dist = dist
                    name = j
            cent_name[i] = name
            
        return cent_name
    
    def calculate_origin_mass(self):
        self.origin_mass = np.zeros((self.K,self.DIM))
        
        counter = np.zeros(self.K)
        for i in range(self.K):
            counter[i] = len(self.data_y[self.data_y==i])
            
        
        for j in range(self.DIM):
            for i in range(self.DCNT):
                a = self.data_y[i]
                self.origin_mass[a][j] += self.data_x.iloc[i,j]
            for i in range(self.K):  
                if counter[i] is not 0:
                    self.origin_mass[i][j] /= counter[i]
    
    def nearest_cluster(self):
        self.nearest = np.zeros(self.DCNT)
        
        for i in range(len(self.data_x.index)):
            self.nearest[i] = self.cent_name[np.argmax(self.table[i])]
        
#---------------------------------------------------------------------------------
#------------------------------Subfunctions of run()------------------------------
#---------------------------------------------------------------------------------    
    def kmeans_init(self):
        
        self.data = self.data_x.values
        self.cent = np.zeros((self.K,self.DIM))
        self.table= np.zeros((self.DCNT,self.K))
        self.dis_k= np.zeros((self.K,self.DIM))
        self.cent_c=np.zeros(self.K)
        self.U    = np.zeros((self.DCNT,self.K))
                
        pick = []
        counter = 0
        while(counter<self.K):
            rnd = random.randint(0,self.DCNT-1)
            if(rnd not in pick):
                pick.append(rnd)
                counter=counter+1
                
        for i in range(self.K):
            for j in range(self.DIM):
                self.cent[i][j] = self.data[pick[i]][j] 
      

    def update_cent(self):
        
        for k in range(self.K):
            down = 0
            for i in range(self.DCNT):
                down += self.table[i][k]
                
            for i in range(self.DCNT):    
                for j in range(self.DIM):
                    self.cent[k][j] += self.data_x.iloc[i,j]*self.table[i][k]
                    
            for j in range(self.DIM):
                self.cent[k][j] /= down
                    
    def cal_w(self,i,j):
        w = 0
        dis = np.linalg.norm(self.data_x.iloc[i].values-self.cent[j])
        for c in range(self.K):
            dis_c = np.linalg.norm(self.data_x.iloc[i].values-self.cent[c])
            if dis_c != 0:
                w += math.pow((dis/dis_c),2/(self.m-1))
        
        if(w != 0):
            w = 1/w
            
        return w
            
    def update_table(self):
        for i in range(self.DCNT):
            for j in range(self.K):
                self.table[i][j] = self.cal_w(i,j)
                
    def cal_obj_func(self):
        obj_value = 0
        for i in range(self.DCNT):
            for j in range(self.K):
                obj_value += self.table[i][j]*math.pow(np.linalg.norm(self.data_x.iloc[i].values-self.cent[j]),2)
        return obj_value

    def print_cent(self):
        print("Centroids:")
        print(self.cent)

    def print_result(self):
        print("K means:")
        print(self.table)
        print("Object function value = ",end='')
        print(self.obj_value)
        print("Previous Object function value = ",end='')
        print(self.prev_obj_value)
        print("iter = ",end='')
        print(self.iterl)     
    

## Here comes our FCM
### Let's run it!

In [16]:
result = our_k_means(data_x,data_y)
result.calculate_acc(Simulate_time)

  This is separate from the ipykernel package so we can avoid doing imports until


AttributeError: 'DataFrame' object has no attribute 'rdd'

In [5]:
result.print_cent()

Centroids:
[[-0.25444975 -0.25728413 -0.22810925 -0.26392622 -0.25291458 -0.26060043
  -0.2562821  -0.12244822  0.25400296 -0.12838033]
 [ 0.24841005  0.25117716  0.22269478  0.25766158  0.24691132  0.25441474
   0.25019891  0.11954175 -0.24797387  0.12533305]]


In [6]:
result.print_result()

K means:
[[0.63726119 0.36273881]
 [0.63513894 0.36486106]
 [0.51388198 0.48611802]
 ...
 [0.30726706 0.69273294]
 [0.30026951 0.69973049]
 [0.3763871  0.6236129 ]]
Object function value = 39544.823604668076
Previous Object function value = 39544.882397948495
iter = 100
