In [1]:
import os, sys, timeit

import numpy as np
import pandas as pd

from metrics import process

from tqdm import tqdm
from os.path import join
from kmeans_clustering import KMeansClustering
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.datasets import load_boston

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Notebook configuration: input location, preprocessing switches and
# experiment sizes. Everything a reader might want to tune lives in this cell.
pwd = os.getcwd()
data_dir = join(pwd, 'data/')
filename = 'wine.csv'
datapath = join(data_dir, filename)

# Preprocessing switches, read by the scaling and PCA cells further down.
need_normalization = False
need_standardization = False
need_pca = True

# Clustering / experiment parameters.
num_clusters = 5        # passed to KMeansClustering as n_clusters
num_iterations = 300    # passed to KMeansClustering as max_iter
num_iter_exp = 20       # number of repeated runs per initialisation strategy
verbose = False

# Optional alternative dataset. It is only loaded when explicitly requested so
# that the default (CSV) path does not pay for, or fail on, a dataset it never
# uses. NOTE: `load_boston` was removed in scikit-learn 1.2, so enabling this
# flag requires an older scikit-learn.
do_boston = False
data_boston = load_boston() if do_boston else None

In [3]:
# f = open('data/data.csv')
# lines = f.read()
# print(type(lines))
# new_lines = ''
# for line in lines:
#     new_lines += str(line.encode('utf-8').strip())
# print(len(new_lines))

In [4]:
# Read the CSV at `datapath` with pandas' default separator and header
# handling (pass `sep=','` / `header=None` here if the file needs them).
df = pd.read_csv(datapath, low_memory=False, encoding='utf-8')

# When the Boston switch is on, the CSV frame is replaced by the
# scikit-learn dataset's feature matrix.
if do_boston is True:
    boston_features = data_boston.data
    boston_columns = data_boston.feature_names
    df = pd.DataFrame(boston_features, columns=boston_columns)

In [5]:
# Number of rows loaded.
print(df.shape[0])

178


In [6]:
# Preview the first five rows as plain text.
print(df.head(n=5))

   Alcohol  Malic_Acid   Ash  Ash_Alcanity  Magnesium  Total_Phenols  \
0    14.23        1.71  2.43          15.6        127           2.80   
1    13.20        1.78  2.14          11.2        100           2.65   
2    13.16        2.36  2.67          18.6        101           2.80   
3    14.37        1.95  2.50          16.8        113           3.85   
4    13.24        2.59  2.87          21.0        118           2.80   

   Flavanoids  Nonflavanoid_Phenols  Proanthocyanins  Color_Intensity   Hue  \
0        3.06                  0.28             2.29             5.64  1.04   
1        2.76                  0.26             1.28             4.38  1.05   
2        3.24                  0.30             2.81             5.68  1.03   
3        3.49                  0.24             2.18             7.80  0.86   
4        2.69                  0.39             1.82             4.32  1.04   

   OD280  Proline  Customer_Segment  
0   3.92     1065                 1  
1   3.40     105

In [7]:
# Show the dtype pandas inferred for each column.
print(df.dtypes)

Alcohol                 float64
Malic_Acid              float64
Ash                     float64
Ash_Alcanity            float64
Magnesium                 int64
Total_Phenols           float64
Flavanoids              float64
Nonflavanoid_Phenols    float64
Proanthocyanins         float64
Color_Intensity         float64
Hue                     float64
OD280                   float64
Proline                   int64
Customer_Segment          int64
dtype: object


In [8]:
# Select the feature columns to cluster on. All columns are used by default;
# assign a subset instead (e.g. `cols = columns[:-1]`) to leave some out.
# NOTE(review): the last column (`Customer_Segment`) looks like a class label
# rather than a feature -- confirm whether it should be part of the input.
columns = df.columns.tolist()
cols = columns

# Coerce every selected column to numeric in a single pass. Building a new
# frame (rather than assigning column-by-column into the `df[cols]` slice)
# avoids pandas' SettingWithCopyWarning when `cols` is a strict subset.
# Values that cannot be parsed become NaN.
df = df[cols].apply(pd.to_numeric, errors='coerce')

# Missing / unparseable values are replaced by 0; rows that contain +/-inf
# are dropped (inf -> NaN -> dropna).
df = df.fillna(0)
df = df.replace([-np.inf, np.inf], np.nan)
df = df.dropna()

# Keep only the columns that have at least one non-zero value.
df = df.loc[:, (df != 0).any(axis=0)]
print(len(df))
print(df.dtypes)

178
Alcohol                 float64
Malic_Acid              float64
Ash                     float64
Ash_Alcanity            float64
Magnesium                 int64
Total_Phenols           float64
Flavanoids              float64
Nonflavanoid_Phenols    float64
Proanthocyanins         float64
Color_Intensity         float64
Hue                     float64
OD280                   float64
Proline                   int64
Customer_Segment          int64
dtype: object


In [9]:
# Shuffle all rows with a fixed seed, then renumber the index from 0.
shuffled = df.sample(frac=1.0, random_state=1)
df = shuffled.reset_index(drop=True)

In [10]:
# Optional feature scaling. At most one branch runs; normalization takes
# precedence when both flags are set.
#
# The scalers now match the flag names: "normalization" is min-max rescaling
# and "standardization" is zero-mean / unit-variance scaling. Previously the
# two scalers were swapped relative to the flags.
if need_normalization is True:
    normalizer = MinMaxScaler()
    tmp = normalizer.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)
elif need_standardization is True:
    scaler = StandardScaler()
    tmp = scaler.fit_transform(df)
    df = pd.DataFrame(tmp, columns=df.columns)

In [11]:
# Build the feature matrix `X`: either the PCA projection of `df` or the raw
# values as a NumPy array.
X = df
if need_pca is True:
    # PCA accepts at most min(n_samples, n_features) components. The cleaning
    # cell may have dropped all-zero columns, so `len(cols)` can be larger
    # than the number of columns actually left in `df`; clamp it to the real
    # shape so the fit does not raise.
    pca = PCA(
        n_components=min(len(cols), *X.shape),
        svd_solver='auto',
    )
    X = pca.fit_transform(X)
else:
    X = X.to_numpy()

In [12]:
# Sanity check: print a row once for every infinite entry it contains.
for row in X:
    inf_hits = int(np.count_nonzero(np.isinf(row)))
    for _ in range(inf_hits):
        print(row)

In [13]:
# `data` is the matrix handed to the clustering runs; show its first five
# rows followed by its shape.
data = X
print(data[:5], data.shape, sep='\n')

[[-6.67570058e+01  8.47295011e+00 -6.68512351e-01 -1.55245403e+00
  -1.56517056e+00 -2.22732154e-01 -6.94384135e-01 -4.77392946e-02
   1.65228101e-01  4.49783893e-01  1.10115844e-02  2.40583214e-01
   1.95650398e-01  1.78690767e-02]
 [-4.01695582e+02  1.54242204e+01  3.53298790e-01  2.73362103e+00
   5.24855219e-01 -2.56372224e-01 -4.86966649e-01 -1.49052165e-01
  -1.35033033e-01 -1.39428892e-01  1.72072436e-01 -1.42904420e-01
  -1.28119440e-02  4.56855966e-02]
 [ 9.84045208e+01  1.43903679e+01  4.15278052e+00 -9.26199325e-01
   7.38298876e-02  1.52446086e+00 -3.23705937e-01  4.63461608e-01
  -1.68789003e-01 -2.05510177e-01 -5.18627291e-02  1.45711197e-01
  -5.98928362e-02 -1.04915243e-01]
 [-2.79661834e+01  5.16298857e+01  5.00697726e+00  1.34615337e+00
  -8.81184250e-01 -1.15726937e+00  5.13229368e-01 -8.18324550e-01
  -8.86929544e-01  1.95508614e-01  1.21817334e-01 -2.85186169e-01
   2.20809718e-01 -6.66033612e-02]
 [ 6.28286770e+02  4.04125269e+00 -3.54295775e-01  3.68620385e-01
  

In [14]:
# Initialisation strategies to compare; each string is passed as `init=` to
# KMeansClustering in the experiment loop.
models = ['coc', 'kmeans++', 'ostrovsky', 'kmeans']
# Alternative comparison set:
# models = ['kmeans++', 'kmeans++_improved']

In [15]:
def plot_scatter(X, labels, title=None):
    """Scatter-plot the first two columns of `X`, coloured by cluster label.

    Parameters
    ----------
    X : 2-D array
        Points to plot; column 0 is drawn on the x-axis and column 1 on the
        y-axis.
    labels : sequence
        One label per row of `X`, used as the colour value of each point.
    title : str, optional
        Figure title. When omitted, the title is built from the notebook
        globals `model` (if defined) and `num_clusters`, which reproduces the
        previous behaviour inside the experiment loop without raising a
        NameError when the function is called outside of it.
    """
    if title is None:
        run_name = globals().get('model', '')
        title = (str(run_name) + ' ' + str(num_clusters) + ' clusters').strip()

    fig, ax = plt.subplots()
    # Draw on the axes created above instead of the implicit pyplot state.
    scatter = ax.scatter(
        X[:, 0],
        X[:, 1],
        c=labels,
    )
    # Separate names so the `labels` argument is not shadowed by legend text.
    legend_handles, legend_labels = scatter.legend_elements()
    legend = ax.legend(
        legend_handles,
        legend_labels,
        loc='upper right',
        title='Label',
    )
    ax.add_artist(legend)
    ax.set_title(title)
    ax.set_xlabel('Component A')
    ax.set_ylabel('Component B')
    plt.show()

In [16]:
# Repeated-run comparison of the initialisation strategies in `models`.
# Every strategy is fitted `num_iter_exp` times on `data`; the accumulators
# below are keyed by strategy name.
algorithms = {}      # most recent estimator per strategy
iterations = {}      # running sum of iter_convergence_
inertias = {}        # running sum of sse_
times = {}           # running sum of fit wall-clock seconds
inertias_min = {}    # smallest sse_ seen so far
iters_min = {}       # running sum of iter_convergence_ (name kept for
                     # compatibility with cells that already read it)
for model in models:
    iterations[model] = 0.0
    inertias[model] = 0.0
    times[model] = 0.0
    iters_min[model] = 0
    # A true +inf sentinel, so any finite SSE replaces it.
    inertias_min[model] = float('inf')

for i in tqdm(range(num_iter_exp)):
    for model in models:
        algorithms[model] = KMeansClustering(
            n_clusters=num_clusters,
            max_iter=num_iterations,
            init=model,
            verbose=verbose,
        )
        # Time the fit only: the timer is stopped before predict() and the
        # bookkeeping below so they do not inflate the reported time.
        start = timeit.default_timer()
        algorithm = algorithms[model].fit(data)
        end = timeit.default_timer()
        times[model] += (end-start)

        centers = algorithm.cluster_centers_
        labels = algorithms[model].predict(data)
#         plot_scatter(X, labels)
        inertias[model] += algorithm.sse_
        inertias_min[model] = min(inertias_min[model], algorithm.sse_)
        # `iterations` was initialised but never updated; accumulate it
        # alongside `iters_min`.
        iterations[model] += algorithm.iter_convergence_
        iters_min[model] += algorithm.iter_convergence_

100%|██████████████████████████████████████████████████████████████████████████████████| 20/20 [01:15<00:00,  3.75s/it]


In [17]:
# Per-strategy summary: mean inertia, best inertia and mean time over the
# `num_iter_exp` runs, followed by a blank separator line.
for model in models:
    mean_inertia = round(inertias[model]/num_iter_exp, 2)
    best_inertia = round(inertias_min[model], 2)
    mean_time = round(times[model]/num_iter_exp, 2)
    report_lines = [
        model+', '+str(num_clusters),
        'Average Inertia - ' + str(mean_inertia),
        'Minimum Inertia -  ' + str(best_inertia),
        'Time - ' + str(mean_time),
        # 'Average number of iterations ' + str(round(iters_min[model]/num_iter_exp,2)),
        '',
    ]
    print('\n'.join(report_lines))

coc, 5
Average Inertia - 994075.55
Minimum Inertia -  916424.19
Time - 0.94

kmeans++, 5
Average Inertia - 1011171.27
Minimum Inertia -  916424.19
Time - 0.94

ostrovsky, 5
Average Inertia - 1020273.2
Minimum Inertia -  916424.19
Time - 0.94

kmeans, 5
Average Inertia - 1017557.27
Minimum Inertia -  916424.19
Time - 0.93

