In [1]:
import os, sys, timeit

import numpy as np
import pandas as pd

from os.path import join

from kmeans_clustering import KMeansClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.mixture import GaussianMixture

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
pwd = os.getcwd()
data_dir = join(pwd, 'data/')
need_normalization = False
need_standardization = True
need_pca = True
filename = 'cloud.csv'
datapath = join(data_dir, filename)
num_comps = 2
num_clusters = 5
num_iterations = 300
num_iter_exp = 10
verbose = False

In [3]:
# f = open('data/data.csv')
# lines = f.read()
# print(type(lines))
# new_lines = ''
# for line in lines:
#     new_lines += str(line.encode('utf-8').strip())
# print(len(new_lines))

In [4]:
df = pd.read_csv(
    datapath, 
    low_memory=False,
    encoding='utf-8',
#     sep=',',
    header=None,
)

In [5]:
print(len(df))

1024


In [6]:
print(df.head())

     0      1        2       3         4       5       6      7      8  \
0  3.0  140.0  43.5000  0.0833  862.8417  0.0254  3.8890  163.0  240.0   
1  3.0  135.0  41.9063  0.0790  690.3291  0.0259  3.8340  167.0  239.0   
2  2.0  126.0  21.0586  0.0406  308.3583  0.0684  3.1702  174.0  240.0   
3  4.0  197.0  77.4805  0.0890  874.4709  0.0243  3.9442  155.0  239.0   
4  7.0  193.0  88.8398  0.0884  810.1126  0.0223  3.9318  150.0  236.0   

          9  
0  213.3555  
1  213.7188  
2  227.5859  
3  197.2773  
4  186.0195  


In [7]:
print(df.dtypes)

0    float64
1    float64
2    float64
3    float64
4    float64
5    float64
6    float64
7    float64
8    float64
9    float64
dtype: object


In [8]:
columns = df.columns.tolist()
cols = columns
# cols = columns[1:]
# cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Fare']
# cols = columns[:-1]
# cols = ['Passenger Count', 'Adjusted Passenger Count']
# cols = columns[1:]
# cols = columns[2:]
df = df[cols]
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.fillna(0)
df = df.replace([-np.inf, np.inf], np.nan)
df = df.dropna()
df = df.loc[:, (df != 0).any(axis=0)]
print(len(df))
print(df.dtypes)

1024
0    float64
1    float64
2    float64
3    float64
4    float64
5    float64
6    float64
7    float64
8    float64
9    float64
dtype: object


In [9]:
df = df.sample(
    frac=1.0,
    random_state=1,
)
df = df.reset_index(drop=True)

In [10]:
if need_normalization is True:
    normalizer = StandardScaler()
    tmp = normalizer.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)
elif need_standardization is True:
    scaler = MinMaxScaler()
    tmp = scaler.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)

In [11]:
X = df
if need_pca is True:
    pca = PCA(
        n_components=num_comps,
        svd_solver='auto',
    )
    X = pca.fit_transform(X)
else:
    X = X.to_numpy()

In [12]:
for x in X:
    for a in x:
        if np.isinf(a):
            print(x)

In [13]:
data = X
print(data[:5])
print(data.shape)

[[ 0.91125296  0.19689659]
 [ 0.05031556 -0.13197927]
 [-0.09550929 -0.10844878]
 [ 0.63624413 -0.10053126]
 [ 0.27982568 -0.14858531]]
(1024, 2)


In [14]:
models = ['ostrovsky', 'kmeans', 'kmeans++', 'variance']

In [15]:
def plot_scatter(X, labels):
    fig, ax = plt.subplots()
    scatter = plt.scatter(
        X[:, 0],
        X[:, 1],
        c=labels,
    )
    handles, labels = scatter.legend_elements()
    legend = ax.legend(
        handles,
        labels,
        loc='upper right',
        title='Label',
    )
    ax.add_artist(legend)
    plt.title(model+' '+str(num_clusters)+ ' clusters')
    plt.xlabel('Component A')
    plt.ylabel('Component B')
    plt.show()

In [16]:
algorithms = {}
iterations = {}
inertias = {}
times = {}
inertias_min = {}
for model in models:
    iterations[model] = 0.0
    inertias[model] = 0.0
    times[model] = 0.0
    inertias_min[model] = 1e20
for i in range(num_iter_exp):
    for model in models:
        algorithms[model] = KMeansClustering(
            n_clusters=num_clusters,
            max_iter=num_iterations,
            init=model,
            verbose=verbose,
        )
        start = timeit.default_timer()
        algorithm = algorithms[model].fit(data)
        centers = algorithm.cluster_centers_
        labels = algorithms[model].predict(data)
#         plot_scatter(X, labels)
        inertias[model] += algorithm.sse_
        inertias_min[model] = min(inertias_min[model], algorithm.sse_)
        end = timeit.default_timer()
        times[model] += (end-start)

KeyboardInterrupt: 

In [None]:
for model in models:
    print(model+', '+str(num_clusters))
    print('Average Inertia -', inertias[model]/num_iter_exp)
    print('Minimum Inertia - ', inertias_min[model])
    print('Time -', times[model]/num_iter_exp)
    print('')