In [1]:
import os, sys

import pandas as pd

from os.path import join

from library import SBKMeans, ErrorChecker
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.cluster import KMeans

In [2]:
pwd = os.getcwd()
data_dir = join(pwd, 'data/')
need_normalization = True
need_standardization = False
need_pca = True
filename = 'iris.csv'
datapath = join(data_dir, filename)
num_comps = 2
num_clusters = 5
num_iterations = 300

In [3]:
df = pd.read_csv(datapath, low_memory=False)

In [4]:
print(len(df))

150


In [5]:
print(df.head())

   sepal.length  sepal.width  petal.length  petal.width variety
0           5.1          3.5           1.4          0.2  Setosa
1           4.9          3.0           1.4          0.2  Setosa
2           4.7          3.2           1.3          0.2  Setosa
3           4.6          3.1           1.5          0.2  Setosa
4           5.0          3.6           1.4          0.2  Setosa


In [6]:
print(df.dtypes)

sepal.length    float64
sepal.width     float64
petal.length    float64
petal.width     float64
variety          object
dtype: object


In [7]:
df = df[df.columns.tolist()[:-1]]
for col in df.columns.tolist():
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.fillna(0)

In [8]:
df = df.sample(
    frac=1.0,
    random_state=1,
)
df = df.reset_index(drop=True)

In [9]:
if need_normalization is True:
    normalizer = StandardScaler()
    tmp = normalizer.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)
elif need_standardization is True:
    scaler = MinMaxScaler()
    tmp = scaler.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)

In [10]:
X = df
if need_pca is True:
    pca = PCA(
        n_components=num_comps,
        svd_solver='auto',
    )
    X = pca.fit_transform(X)
else:
    X = X.to_numpy()

In [11]:
data = X
print(data[:5])
print(data.shape)

[[-2.1987406   1.86005711]
 [-0.44766702 -1.54379203]
 [ 0.87427365  0.25079339]
 [-2.2075877   1.48360936]
 [ 2.30492772  2.62632347]]
(150, 2)


In [12]:
sbkmeans = SBKMeans(
    n_clusters=num_clusters,
    n_iters=num_iterations,
)

In [13]:
centers = sbkmeans.fit(data)

Initial centers:
[[ 1.86270322 -0.17854949]
 [-2.28647514  0.44171539]
 [ 0.02345269 -1.57247559]
 [ 2.01481043  0.61388564]
 [-2.07563095  1.48917752]]


In [14]:
print(sbkmeans.centers)

[[ 0.75799185 -0.71974359]
 [-2.22766094  0.21927648]
 [-0.31734311 -1.60734344]
 [ 1.66245297  0.5048287 ]
 [-0.14288671  1.92837939]]


In [15]:
labels = sbkmeans.predict(data)

In [16]:
print(labels[:5])

[1, 2, 3, 1, 3]


In [17]:
error_checker = ErrorChecker(
    X=data,
    centers=centers,
    labels=labels,
)

In [18]:
dist_total = error_checker.potential_function()

In [19]:
print(dist_total)

5206.155676875963


In [20]:
kmeans = KMeans(
    n_clusters=num_clusters,
    max_iter=num_iterations,
    init='k-means++',
    algorithm='auto',
#     n_init=50,
)

In [21]:
labels = kmeans.fit_predict(data)

In [22]:
error_checker = ErrorChecker(
    data,
    kmeans.cluster_centers_,
    kmeans.labels_,
)

In [23]:
dist_total = error_checker.potential_function()

In [24]:
print(dist_total)

5705.3160780921835
