In [1]:
import os, sys, timeit

import numpy as np
import pandas as pd


from metrics import process

from os.path import join
from kmeans_clustering import KMeansClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.mixture import GaussianMixture

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
pwd = os.getcwd()
data_dir = join(pwd, 'data/')
need_normalization = True
need_standardization = False
need_pca = True
filename = 'wine.csv'
datapath = join(data_dir, filename)
num_clusters = 5
num_iterations = 500
num_iter_exp = 20
verbose = False

In [3]:
# f = open('data/data.csv')
# lines = f.read()
# print(type(lines))
# new_lines = ''
# for line in lines:
#     new_lines += str(line.encode('utf-8').strip())
# print(len(new_lines))

In [4]:
df = pd.read_csv(
    datapath, 
    low_memory=False,
    encoding='utf-8',
#     sep=',',
    header=None,
)

In [5]:
print(len(df))

179


In [6]:
print(df.head())

        0           1     2             3          4              5   \
0  Alcohol  Malic_Acid   Ash  Ash_Alcanity  Magnesium  Total_Phenols   
1    14.23        1.71  2.43          15.6        127            2.8   
2     13.2        1.78  2.14          11.2        100           2.65   
3    13.16        2.36  2.67          18.6        101            2.8   
4    14.37        1.95   2.5          16.8        113           3.85   

           6                     7                8                9     10  \
0  Flavanoids  Nonflavanoid_Phenols  Proanthocyanins  Color_Intensity   Hue   
1        3.06                  0.28             2.29             5.64  1.04   
2        2.76                  0.26             1.28             4.38  1.05   
3        3.24                   0.3             2.81             5.68  1.03   
4        3.49                  0.24             2.18              7.8  0.86   

      11       12                13  
0  OD280  Proline  Customer_Segment  
1   3.92     106

In [7]:
print(df.dtypes)

0     object
1     object
2     object
3     object
4     object
5     object
6     object
7     object
8     object
9     object
10    object
11    object
12    object
13    object
dtype: object


In [8]:
columns = df.columns.tolist()
cols = columns
# cols = columns[1:]
# cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Fare']
cols = columns[:-1]
# cols = ['Passenger Count', 'Adjusted Passenger Count']
# cols = columns[1:]
# cols = columns[2:]
df = df[cols]
for col in cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')
df = df.fillna(0)
df = df.replace([-np.inf, np.inf], np.nan)
df = df.dropna()
df = df.loc[:, (df != 0).any(axis=0)]
print(len(df))
print(df.dtypes)

179
0     float64
1     float64
2     float64
3     float64
4     float64
5     float64
6     float64
7     float64
8     float64
9     float64
10    float64
11    float64
12    float64
dtype: object


In [9]:
df = df.sample(
    frac=1.0,
    random_state=1,
)
df = df.reset_index(drop=True)

In [10]:
if need_normalization is True:
    normalizer = StandardScaler()
    tmp = normalizer.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)
elif need_standardization is True:
    scaler = MinMaxScaler()
    tmp = scaler.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)

In [11]:
X = df
if need_pca is True:
    pca = PCA(
        n_components=len(cols),
        svd_solver='auto',
    )
    X = pca.fit_transform(X)
else:
    X = X.to_numpy()

In [12]:
for x in X:
    for a in x:
        if np.isinf(a):
            print(x)

In [13]:
data = X
print(data[:5])
print(data.shape)

[[ 0.40671762  1.28211576 -1.51782211 -0.67432984 -0.17004657  1.20947388
  -0.41236638  0.41553725  0.2502837   0.4384852  -0.18823535  0.3433128
  -0.25896505]
 [-0.72759544  1.37610527 -0.37730173  0.87500206  0.71965933  1.15579811
  -0.32877247  0.28964952 -0.22064164 -0.90683662 -1.27940828  0.43765637
  -0.39186619]
 [ 0.3645804  -0.15030117 -0.658086   -0.2156418   3.42989239 -0.06329432
  -0.09513995  0.63338098 -0.6111177  -0.07512562 -0.67457472  0.26752932
  -0.52252397]
 [ 1.18255668 -2.3430737   0.89951508  0.62576514 -1.2709059   2.84281533
   0.01578601 -0.54248965  0.2550968  -0.1050272  -0.25485171  0.02416374
  -0.79322458]
 [-1.48936128 -0.44033211 -0.0737009  -0.80675951  0.52333018 -0.34563438
  -0.48448707  0.39526327 -0.46510774 -0.15383133 -0.20493521  0.20260866
   0.10843757]]
(179, 13)


In [14]:
models = ['kmeans', 'kmeans++', 'kmeans++_improved', 'ostrovsky', 'variance']

In [15]:
def plot_scatter(X, labels):
    fig, ax = plt.subplots()
    scatter = plt.scatter(
        X[:, 0],
        X[:, 1],
        c=labels,
    )
    handles, labels = scatter.legend_elements()
    legend = ax.legend(
        handles,
        labels,
        loc='upper right',
        title='Label',
    )
    ax.add_artist(legend)
    plt.title(model+' '+str(num_clusters)+ ' clusters')
    plt.xlabel('Component A')
    plt.ylabel('Component B')
    plt.show()

In [16]:
algorithms = {}
iterations = {}
inertias = {}
times = {}
inertias_min = {}
iters_min = {}
for model in models:
    iterations[model] = 0.0
    inertias[model] = 0.0
    times[model] = 0.0
    iters_min[model] = 0
    inertias_min[model] = 1e20
for i in range(num_iter_exp):
    for model in models:
        algorithms[model] = KMeansClustering(
            n_clusters=num_clusters,
            max_iter=num_iterations,
            init=model,
            verbose=verbose,
        )
        start = timeit.default_timer()
        algorithm = algorithms[model].fit(data)
        centers = algorithm.cluster_centers_
        labels = algorithms[model].predict(data)
#         plot_scatter(X, labels)
        inertias[model] += algorithm.best_inertia_
        inertias_min[model] = min(inertias_min[model], algorithm.best_inertia_)
        iters_min[model] += algorithm.n_iters_
        end = timeit.default_timer()
        times[model] += (end-start)

In [17]:
for model in models:
    print(model+', '+str(num_clusters))
    print('Average Inertia -', round(inertias[model]/num_iter_exp, 2))
    print('Minimum Inertia - ', round(inertias_min[model], 2))
    print('Time -', round(times[model]/num_iter_exp, 2))
#     print('Average number of iterations', round(iters_min[model]/num_iter_exp,2))
    print('')

kmeans, 5
Average Inertia - 1181.19
Minimum Inertia -  1012.28
Time - 1.49

kmeans++, 5
Average Inertia - 1120.07
Minimum Inertia -  1015.19
Time - 1.49

kmeans++_improved, 5
Average Inertia - 1103.17
Minimum Inertia -  1014.54
Time - 1.49

ostrovsky, 5
Average Inertia - 1103.8
Minimum Inertia -  1012.93
Time - 1.49

variance, 5
Average Inertia - 1164.23
Minimum Inertia -  1019.49
Time - 1.5

