In [1]:
import os, sys

import numpy as np
import pandas as pd

from os.path import join

from kmeans_clustering import KMeansClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.mixture import GaussianMixture

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
pwd = os.getcwd()
data_dir = join(pwd, 'data/')
need_normalization = False
need_standardization = True
need_pca = True
filename = 'iris.csv'
datapath = join(data_dir, filename)
num_comps = 2
num_clusters = 5
num_iterations = 500
num_iter_exp = 100

In [3]:
# f = open('data/data.csv')
# lines = f.read()
# print(type(lines))
# new_lines = ''
# for line in lines:
#     new_lines += str(line.encode('utf-8').strip())
# print(len(new_lines))

In [4]:
df = pd.read_csv(
    datapath, 
    low_memory=False,
    encoding='utf-8',
#     sep=',',
    header=None,
)

In [5]:
print(len(df))

151


In [6]:
print(df.head())

              0            1             2            3        4
0  sepal.length  sepal.width  petal.length  petal.width  variety
1           5.1          3.5           1.4           .2   Setosa
2           4.9            3           1.4           .2   Setosa
3           4.7          3.2           1.3           .2   Setosa
4           4.6          3.1           1.5           .2   Setosa


In [7]:
print(df.dtypes)

0    object
1    object
2    object
3    object
4    object
dtype: object


In [8]:
columns = df.columns.tolist()
cols = columns
# cols = columns[1:]
# cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Fare']
# cols = columns[:-1]
# cols = ['Passenger Count', 'Adjusted Passenger Count']
# cols = columns[1:]
cols = columns[:]
df = df[cols]
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.fillna(0)
df = df.replace([-np.inf, np.inf], np.nan)
df = df.dropna()
df = df.loc[:, (df != 0).any(axis=0)]
print(len(df))
print(df.dtypes)

151
0    float64
1    float64
2    float64
3    float64
dtype: object


In [9]:
df = df.sample(
    frac=1.0,
    random_state=1,
)
df = df.reset_index(drop=True)

In [10]:
if need_normalization is True:
    normalizer = StandardScaler()
    tmp = normalizer.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)
elif need_standardization is True:
    scaler = MinMaxScaler()
    tmp = scaler.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)

In [11]:
X = df
if need_pca is True:
    pca = PCA(
        n_components=num_comps,
        svd_solver='auto',
    )
    X = pca.fit_transform(X)
else:
    X = X.to_numpy()

In [12]:
for x in X:
    for a in x:
        if np.isinf(a):
            print(x)

In [13]:
data = X
print(data[:5])
print(data.shape)

[[ 0.60638672  0.06705915]
 [-0.09769428  0.00837857]
 [-0.10377722 -0.00327927]
 [ 0.46163789 -0.28988696]
 [-0.47625085 -0.02479771]]
(151, 2)


In [14]:
models = ['ostrovsky', 'kmeans', 'kmeans++', 'variance']

In [15]:
def plot_scatter(X, labels):
    fig, ax = plt.subplots()
    scatter = plt.scatter(
        X[:, 0],
        X[:, 1],
        c=labels,
    )
    handles, labels = scatter.legend_elements()
    legend = ax.legend(
        handles,
        labels,
        loc='upper right',
        title='Label',
    )
    ax.add_artist(legend)
    plt.title(model+' '+str(num_clusters)+ ' clusters')
    plt.xlabel('Component A')
    plt.ylabel('Component B')
    plt.show()

In [16]:
algorithms = {}
iterations = {}
inertias = {}
for model in models:
    iterations[model] = 0
    inertias[model] = 0
for i in range(num_iter_exp):
    for model in models:
        algorithms[model] = KMeansClustering(
            n_clusters=num_clusters,
            max_iter=num_iterations,
            init=model,
        #     algorithm='auto',
        #     n_init=50,
            verbose=1,
        )
        algorithm = algorithms[model].fit(data)
        centers = algorithm.cluster_centers_
        labels = algorithms[model].predict(data)
    #     plot_scatter(X, labels)
        iterations[model] += algorithm.iter_convergence_
        inertias[model] += algorithm.sse_

for model in models:
    print(model+', '+str(num_clusters))
    print('Inertia -', inertias[model]/num_iter_exp)
    print('Iterations -', iterations[model]/num_iter_exp)
    print('')


Initializtion ostrovsky
[[ 0.5640323   0.03891228]
 [-0.07365382  0.08330417]
 [-0.63468613 -0.16363639]
 [ 0.81743963  0.91654053]
 [ 0.5384113  -0.21969165]]
Initializtion kmeans
[[-0.1550974   0.02742227]
 [-0.58816878 -0.07539509]
 [ 0.51854402 -0.13495128]
 [-0.61215633 -0.23639126]
 [ 0.4142817  -0.030909  ]]
Initializtion kmeans++
[[ 0.53324414 -0.03139886]
 [-0.30728985  0.017252  ]
 [ 0.81743963  0.91654053]
 [ 0.5556559   0.06366906]
 [-0.18637581  0.15358739]]
Initializtion variance
[[-0.48500305 -0.07093792]
 [-0.22920392 -0.06716571]
 [ 0.58649221 -0.06984046]
 [ 0.52278235 -0.05703754]
 [-0.63468613 -0.16363639]]
Initializtion ostrovsky
[[ 0.56073306 -0.20563644]
 [-0.36632955  0.01574152]
 [-0.03367611  0.08934983]
 [ 0.81743963  0.91654053]
 [ 0.5239792   0.01504729]]
Initializtion kmeans
[[-0.06074779  0.10816687]
 [-0.11955696  0.12373619]
 [ 0.81743963  0.91654053]
 [ 0.60638672  0.06705915]
 [-0.45977362  0.03537599]]
Initializtion kmeans++
[[-0.13707474  0.0089378 

Initializtion ostrovsky
[[ 0.53310361  0.03253404]
 [-0.53669782 -0.07045798]
 [ 0.01988025  0.14468241]
 [-0.11955696  0.12373619]
 [-0.61215633 -0.23639126]]
Initializtion kmeans
[[-0.61215633 -0.23639126]
 [-0.09769428  0.00837857]
 [-0.29252459  0.03905219]
 [-0.18637581  0.15358739]
 [-0.46980357 -0.08248984]]
Initializtion kmeans++
[[-0.10221366  0.05833063]
 [-0.46980357 -0.08248984]
 [-0.58350411 -0.07152997]
 [ 0.48071686 -0.17235371]
 [ 0.5384113  -0.21969165]]
Initializtion variance
[[ 0.44452077 -0.17102342]
 [-0.19033979 -0.07468903]
 [-0.24676749  0.0807316 ]
 [ 0.52024812 -0.13727428]
 [ 0.54378133  0.01205914]]
Initializtion ostrovsky
[[ 0.53813024 -0.09182585]
 [-0.39377031 -0.01384156]
 [-0.14901445  0.0390801 ]
 [ 0.53269506 -0.24182441]
 [ 0.81743963  0.91654053]]
Initializtion kmeans
[[ 0.58649221 -0.06984046]
 [ 0.45447533 -0.07097692]
 [-0.06393217  0.10739581]
 [ 0.5303432  -0.10116068]
 [ 0.10606702  0.14519518]]
Initializtion kmeans++
[[-0.53669782 -0.07045798

Initializtion variance
[[-0.58350411 -0.07152997]
 [-0.05902909  0.16903725]
 [ 0.52285774 -0.03921821]
 [ 0.58649221 -0.06984046]
 [-0.53573387 -0.07432801]]
Initializtion ostrovsky
[[ 0.53269506 -0.24182441]
 [-0.38146389 -0.05445879]
 [-0.14828456 -0.05894442]
 [ 0.5640323   0.03891228]
 [-0.0557717   0.1287526 ]]
Initializtion kmeans
[[-0.20113318  0.17248371]
 [ 0.08531119  0.25162491]
 [ 0.53813024 -0.09182585]
 [-0.5319096  -0.25259419]
 [-0.39377031 -0.01384156]]
Initializtion kmeans++
[[-0.33872577  0.08528159]
 [ 0.52582382 -0.05120862]
 [ 0.00941846  0.11904373]
 [ 0.81743963  0.91654053]
 [-0.58816878 -0.07539509]]
Initializtion variance
[[ 0.01808616  0.0659927 ]
 [-0.60048308 -0.07547438]
 [ 0.53324414 -0.03139886]
 [ 0.54978888  0.00589765]
 [ 0.54497818  0.08414396]]
Initializtion ostrovsky
[[-0.16889198  0.02424892]
 [ 0.51854402 -0.13495128]
 [-0.60314996 -0.0510817 ]
 [ 0.05852058  0.14257615]
 [ 0.81743963  0.91654053]]
Initializtion kmeans
[[-0.48403238 -0.05231108

Initializtion ostrovsky
[[ 0.5303432  -0.10116068]
 [-0.40310827  0.0736603 ]
 [ 0.81743963  0.91654053]
 [ 0.10606702  0.14519518]
 [-0.01610229  0.04102408]]
Initializtion kmeans
[[-0.48500305 -0.07093792]
 [-0.45977362  0.03537599]
 [ 0.07379307  0.08996851]
 [ 0.4142817  -0.030909  ]
 [-0.48529437 -0.04264367]]
Initializtion kmeans++
[[-0.0597187   0.02254461]
 [ 0.54971349 -0.01192168]
 [-0.63468613 -0.16363639]
 [-0.35098954 -0.05974331]
 [ 0.05852058  0.14257615]]
Initializtion variance
[[-0.06913444  0.0333521 ]
 [-0.60048308 -0.07547438]
 [ 0.43391845 -0.13272919]
 [-0.36173477 -0.01639122]
 [-0.34250739  0.07790564]]
Initializtion ostrovsky
[[-0.48936492  0.03714967]
 [ 0.5303432  -0.10116068]
 [-0.02299839  0.06887493]
 [-0.38146389 -0.05445879]
 [-0.58350411 -0.07152997]]
Initializtion kmeans
[[-0.07365382  0.08330417]
 [ 0.01290875  0.15471393]
 [-0.03367611  0.08934983]
 [ 0.51744743 -0.02645183]
 [-0.27688539 -0.02403037]]
Initializtion kmeans++
[[-0.20502293 -0.05617302

Initializtion ostrovsky
[[-0.46008742 -0.04021966]
 [ 0.46163789 -0.28988696]
 [ 0.49970584 -0.13583317]
 [ 0.07379307  0.08996851]
 [-0.28318425  0.01042537]]
Initializtion kmeans
[[-0.48505358  0.0740077 ]
 [-0.27688539 -0.02403037]
 [-0.10377722 -0.00327927]
 [-0.11955696  0.12373619]
 [ 0.05852058  0.14257615]]
Initializtion kmeans++
[[ 0.52278235 -0.05703754]
 [-0.29252459  0.03905219]
 [-0.02299839  0.06887493]
 [-0.21919095 -0.07136832]
 [-0.46980357 -0.08248984]]
Initializtion variance
[[ 0.52024812 -0.13727428]
 [-0.38146389 -0.05445879]
 [-0.58816878 -0.07539509]
 [ 0.51744743 -0.02645183]
 [-0.2929813  -0.0314856 ]]
Initializtion ostrovsky
[[ 0.54971349 -0.01192168]
 [-0.48862716 -0.02017834]
 [-0.10377722 -0.00327927]
 [ 0.81743963  0.91654053]
 [-0.33872577  0.08528159]]
Initializtion kmeans
[[ 0.10829533  0.19053278]
 [ 0.60638672  0.06705915]
 [-0.33872577  0.08528159]
 [-0.05902909  0.16903725]
 [ 0.51595926 -0.0702424 ]]
Initializtion kmeans++
[[-0.3570646  -0.03070463

  probs = V/sum(V)


Initializtion variance
[[-0.39377031 -0.01384156]
 [-0.06393217  0.10739581]
 [ 0.5640323   0.03891228]
 [-0.48500305 -0.07093792]
 [-0.29252459  0.03905219]]
Initializtion ostrovsky
[[-0.40310827  0.0736603 ]
 [ 0.49970584 -0.13583317]
 [-0.55278585 -0.0372167 ]
 [ 0.01988025  0.14468241]
 [-0.21919095 -0.07136832]]
Initializtion kmeans
[[-0.48505358  0.0740077 ]
 [-0.46980357 -0.08248984]
 [ 0.60638672  0.06705915]
 [ 0.53324414 -0.03139886]
 [ 0.54378133  0.01205914]]
Initializtion kmeans++
[[-0.11955696  0.12373619]
 [-0.67939005  0.00775551]
 [ 0.60638672  0.06705915]
 [-0.38146389 -0.05445879]
 [ 0.46163789 -0.28988696]]
Initializtion variance
[[-0.18292735 -0.09557579]
 [ 0.51162225  0.20061013]
 [-0.63468613 -0.16363639]
 [ 0.52582382 -0.05120862]
 [-0.5319096  -0.25259419]]
Initializtion ostrovsky
[[-0.37507265  0.05097323]
 [ 0.54971349 -0.01192168]
 [ 0.05852058  0.14257615]
 [ 0.46163789 -0.28988696]
 [-0.30728985  0.017252  ]]
Initializtion kmeans
[[-0.46008742 -0.04021966

Initializtion kmeans
[[ 0.53310361  0.03253404]
 [ 0.5556559   0.06366906]
 [-0.67939005  0.00775551]
 [-0.1604402   0.01731146]
 [-0.67939005  0.00775551]]
Initializtion kmeans++
[[-0.48862716 -0.02017834]
 [ 0.81743963  0.91654053]
 [ 0.1129655   0.17621937]
 [-0.14687179 -0.03297318]
 [ 0.46848584 -0.11391715]]
Initializtion variance
[[-0.10288629  0.03390642]
 [ 0.53310361  0.03253404]
 [-0.55175439 -0.06396391]
 [ 0.52024812 -0.13727428]
 [ 0.52285774 -0.03921821]]
Initializtion ostrovsky
[[-0.06393217  0.10739581]
 [ 0.52285774 -0.03921821]
 [-0.38146389 -0.05445879]
 [-0.0422179   0.01527457]
 [ 0.53269506 -0.24182441]]
Initializtion kmeans
[[-0.13707474  0.0089378 ]
 [-0.21919095 -0.07136832]
 [-0.60314996 -0.0510817 ]
 [-0.11955696  0.12373619]
 [-0.22553798  0.16690808]]
Initializtion kmeans++
[[ 0.01808616  0.0659927 ]
 [-0.50724704 -0.00829876]
 [-0.67939005  0.00775551]
 [ 0.52285774 -0.03921821]
 [ 0.81743963  0.91654053]]
Initializtion variance
[[-0.19240822 -0.00301607]

Initializtion variance
[[-0.5049536  -0.04471338]
 [-0.10288629  0.03390642]
 [ 0.51744743 -0.02645183]
 [ 0.53269506 -0.24182441]
 [-0.5319096  -0.25259419]]
Initializtion ostrovsky
[[ 0.01290875  0.15471393]
 [-0.5049536  -0.04471338]
 [-0.20113318  0.17248371]
 [ 0.5215855  -0.12912236]
 [ 0.52278235 -0.05703754]]
Initializtion kmeans
[[-0.1604402   0.01731146]
 [-0.36173477 -0.01639122]
 [-0.48936492  0.03714967]
 [-0.40310827  0.0736603 ]
 [ 0.5589133   0.02338441]]
Initializtion kmeans++
[[-0.58816878 -0.07539509]
 [ 0.60638672  0.06705915]
 [-0.05902909  0.16903725]
 [-0.27688539 -0.02403037]
 [ 0.00941846  0.11904373]]
Initializtion variance
[[-0.46008742 -0.04021966]
 [-0.22920392 -0.06716571]
 [ 0.53139952  0.03485704]
 [-0.61215633 -0.23639126]
 [-0.5049536  -0.04471338]]
Initializtion ostrovsky
[[ 0.5535705   0.0132736 ]
 [-0.06393217  0.10739581]
 [-0.48862716 -0.02017834]
 [ 0.81743963  0.91654053]
 [-0.36173477 -0.01639122]]
Initializtion kmeans
[[ 0.5303432  -0.10116068