In [1]:
from __future__ import print_function
from numpy import *
import pandas as pd
import sklearn.datasets as datasets
%matplotlib inline

In [2]:
def loadDataSet(fileName):  
    dataSet = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = map(float, curLine) 
        dataSet.append(fltLine)
    return dataSet

In [3]:
def distEclud(vecA, vecB):
    return sqrt(sum(power(vecA - vecB, 2))) 

In [4]:
def randCent(dataMat, k):
    n = shape(dataMat)[1]
    centroids = mat(zeros((k, n))) 
    for j in range(n):
        minJ = min(dataMat[:, j]) 
        rangeJ = float(max(dataMat[:, j]) - minJ)  
        centroids[:, j] = mat(minJ + rangeJ * random.rand(k, 1))  
    return centroids

In [5]:
def kMeans(dataMat, k, distMeas=distEclud, createCent=randCent):
    m = shape(dataMat)[0] 
    clusterAssment = mat(zeros(
        (m, 2))) 
    centroids = createCent(dataMat, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        for i in range(m):  
            minDist = inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :],
                                  dataMat[i, :]) 
                if distJI < minDist:  
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:  
                clusterChanged = True 
                clusterAssment[
                    i, :] = minIndex, minDist**2  
        print(centroids)
        for cent in range(k):  
            ptsInClust = dataMat[nonzero(
                clusterAssment[:, 0].A == cent)[0]]  
            centroids[cent, :] = mean(
                ptsInClust, axis=0)  
    return centroids, clusterAssment


In [6]:
def biKMeans(dataMat, k, distMeas=distEclud):
    m = shape(dataMat)[0]
    clusterAssment = mat(zeros((m, 2)))  
    centroid0 = mean(dataMat, axis=0).tolist()[0]  
    centList = [centroid0]  
    for j in range(m):  
        clusterAssment[j, 1] = distMeas(mat(centroid0), dataMat[j, :])**2
    while (len(centList) < k):  
        lowestSSE = inf
        for i in range(len(centList)):  
            ptsInCurrCluster = dataMat[nonzero(
                clusterAssment[:, 0].A == i)[0], :] 
            centroidMat, splitClustAss = kMeans(
                ptsInCurrCluster, 2, distMeas)  
            sseSplit = sum(splitClustAss[:, 1])  
            sseNotSplit = sum(
                clusterAssment[nonzero(clusterAssment[:, 0].A != i)[0],
                               1]) 
            print("sseSplit, and notSplit: ", sseSplit, sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
       
        bestClustAss[nonzero(bestClustAss[:, 0].A == 1)[0], 0] = len(
            centList)  
        bestClustAss[nonzero(bestClustAss[:, 0].A == 0)[0],
                     0] = bestCentToSplit 
        print('the bestCentToSplit is: ', bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[
            0]  
        centList.append(
            bestNewCents[1, :].tolist()[0])  
        clusterAssment[nonzero(clusterAssment[:, 0].A == bestCentToSplit)[
            0], :] = bestClustAss  
    return mat(centList), clusterAssment

In [7]:
iris_set = datasets.load_iris()
iris_data = pd.DataFrame(iris_set.data, columns=iris_set.feature_names)
iris_data["species"] = iris_set.target
iris_data['species'] = iris_data['species'].replace(to_replace= [0, 1, 2], value = ['setosa', 'versicolor', 'virginica'])
iris_data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
# spliting the dataset in independent and dependent variables
x=iris_data.iloc[:,:4].values
y=iris_data['species'].values

In [9]:
#scaling
from sklearn.preprocessing import StandardScaler
standarScale=StandardScaler()
x_train=standarScale.fit_transform(x)


In [10]:
def testBiKMeans():
    centList, myNewAssments = biKMeans(x_train, 3)
    print('centList=', centList)

In [11]:
if __name__ == "__main__":
    testBiKMeans()

[[ 0.15198373  0.73116417 -0.60422679  0.75461092]
 [ 0.62177714  3.03417099 -1.2740155  -1.32730042]]
[[ 0.08688253 -0.22717363  0.16660631  0.15824234]
 [-0.67972799  1.77729958 -1.30344937 -1.2380136 ]]
[[ 0.48615109 -0.43966823  0.63220101  0.60951349]
 [-1.00206653  0.90625492 -1.30310821 -1.25634413]]
[[ 0.50728948 -0.42663134  0.65249366  0.62744675]
 [-1.01457897  0.85326268 -1.30498732 -1.25489349]]
sseSplit, and notSplit:  145.9039828254881 0.0
the bestCentToSplit is:  0
the len of bestClustAss is:  150
[[ 0.32429989 -2.30216078 -0.10011421 -0.08985795]
 [-0.99236353  0.61765983  1.20078413  0.13623694]]
[[ 0.2534428  -1.11031574  0.47146646  0.29704996]
 [ 0.67652061  0.02915826  0.77317846  0.84771127]]
[[-0.07570844 -0.97929949  0.3334543   0.23333438]
 [ 1.02428764  0.06347061  0.93541536  0.97694261]]
[[-0.07723421 -0.93062132  0.32313817  0.23727821]
 [ 1.06889068  0.05759433  0.96893325  1.00231456]]
sseSplit, and notSplit:  287.6532863186939 145.9039828254881
[[-1.268