In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [4]:
data = pd.read_csv('seed.txt', sep='\t', header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,15.26,14.84,0.871,5.763,3.312,2.221,5.22,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.905,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1


In [8]:
from pso import ParticleSwarmOptimizedClustering
from particle import quantization_error, calc_sse
from utils import normalize
from kmeans import KMeans
from sklearn.metrics import silhouette_score

In [9]:
x = data.drop([7], axis=1)
x = x.values
x = normalize(x)

# K-Means

In [10]:
kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)
kmeans.fit(x)

In [11]:
predicted_kmeans = kmeans.predict(x)
print('Silhouette:', silhouette_score(x, predicted_kmeans))
print('SSE:', kmeans.SSE)
print('Quantization:', quantization_error(centroids=kmeans.centroid, data=x, labels=predicted_kmeans))

Silhouette: 0.4221267624201065
SSE: 22.026451226606625
Quantization: 0.30155146657476695


In [12]:
kmeans2 = KMeans(n_cluster=3, init_pp=True, seed=2018)
kmeans2.fit(x)
predicted_kmeans2 = kmeans2.predict(x)
print('Silhouette:', silhouette_score(x, predicted_kmeans))
print('SSE:', kmeans2.SSE)
print('Quantization:', quantization_error(centroids=kmeans2.centroid, data=x, labels=predicted_kmeans2))

Silhouette: 0.4221267624201065
SSE: 22.024363075666038
Quantization: 0.30166461874754386


# PSO

In [13]:
pso = ParticleSwarmOptimizedClustering(
        n_cluster=3, n_particles=10, data=x, hybrid=True, max_iter=2000, print_debug=50)

  dist /= len(idx)


In [14]:
hist = pso.run()

Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856
Iteration 0051/2000 current gbest score 0.301485499042801475
Iteration 0101/2000 current gbest score 0.301369509078166564
Iteration 0151/2000 current gbest score 0.301219075592380203
Iteration 0201/2000 current gbest score 0.301190438204482991
Iteration 0251/2000 current gbest score 0.301181630195897831
Iteration 0301/2000 current gbest score 0.301179044034220078
Iteration 0351/2000 current gbest score 0.301178481146949339
Iteration 0401/2000 current gbest score 0.301178402549978352
Iteration 0451/2000 current gbest score 0.301178376334236997
Iteration 0501/2000 current gbest score 0.301178348964677989
Iteration 0551/2000 current gbest score 0.301178341584707743
Iteration 0601/2000 current gbest score 0.301178339069568979
Iteration 0651/2000 current gbest score 0.301178337911871818
Iteration 0701/2000 current gbest score 0.301178337409410135
Iteration 0751/2000 current gbest score

In [15]:
pso_kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)

In [16]:
pso_kmeans.centroid = pso.gbest_centroids.copy()
pso_kmeans.centroid

array([[0.12330603, 0.17630162, 0.37177673, 0.18537936, 0.15987898,
        0.50104333, 0.2792433 ],
       [0.76501534, 0.79977017, 0.71280354, 0.73348157, 0.78058066,
        0.35171889, 0.75293528],
       [0.37796153, 0.41431241, 0.66858342, 0.36558468, 0.46162616,
        0.26123977, 0.31312769]])

In [17]:
predicted_pso = pso_kmeans.predict(x)
print('Silhouette:', silhouette_score(x, predicted_pso))
print('SSE:', calc_sse(centroids=pso.gbest_centroids, data=x, labels=predicted_pso))
print('Quantization:', pso.gbest_score)

Silhouette: 0.4221267624201065
SSE: 22.08922698059202
Quantization: 0.3011783370281868


# Repeated Test

### K-Means++

In [18]:
kmeanspp = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    kmean_rep = KMeans(n_cluster=3, init_pp=True)
    kmean_rep.fit(x)
    predicted_kmean_rep = kmean_rep.predict(x)
    silhouette = silhouette_score(x, predicted_kmean_rep)
    sse = kmean_rep.SSE
    quantization = quantization_error(centroids=kmean_rep.centroid, data=x, labels=predicted_kmean_rep)
    kmeanspp['silhouette'].append(silhouette)
    kmeanspp['sse'].append(sse)
    kmeanspp['quantization'].append(quantization)

### PSO 

In [19]:
%%time
pso_plain = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    pso_rep = ParticleSwarmOptimizedClustering(
        n_cluster=3, n_particles=10, data=x, hybrid=False, max_iter=2000, print_debug=2000)
    pso_rep.run()
    pso_kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)
    pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
    predicted_pso_rep = pso_kmeans.predict(x)
    
    silhouette = silhouette_score(x, predicted_pso_rep)
    sse = calc_sse(centroids=pso_rep.gbest_centroids, data=x, labels=predicted_pso_rep)
    quantization = pso_rep.gbest_score
    pso_plain['silhouette'].append(silhouette)
    pso_plain['sse'].append(sse)
    pso_plain['quantization'].append(quantization)

Initial global best score 0.3650153381853994
Iteration 0001/2000 current gbest score 0.362608003982757554
Finish with gbest score 0.326137523748685598
Initial global best score 0.38464925715480097
Iteration 0001/2000 current gbest score 0.362749587497033066
Finish with gbest score 0.337683297694484297
Initial global best score 0.36481568471325065
Iteration 0001/2000 current gbest score 0.360884665099836688


  dist /= len(idx)


Finish with gbest score 0.321298541949087191
Initial global best score 0.35949230833181134
Iteration 0001/2000 current gbest score 0.359492308331811339
Finish with gbest score 0.332265752015113269
Initial global best score 0.33884575008148027
Iteration 0001/2000 current gbest score 0.338845750081480268
Finish with gbest score 0.320122718474826040
Initial global best score 0.3590859732178408
Iteration 0001/2000 current gbest score 0.349914890743986129


  dist /= len(idx)


Finish with gbest score 0.320113156507219110
Initial global best score 0.34848435998713656
Iteration 0001/2000 current gbest score 0.342403380020504911
Finish with gbest score 0.310158524491501386
Initial global best score 0.360532291537304
Iteration 0001/2000 current gbest score 0.356038959978094649


  dist /= len(idx)


Finish with gbest score 0.311932850797551009
Initial global best score 0.36537241526792114
Iteration 0001/2000 current gbest score 0.365372415267921136


  dist /= len(idx)


Finish with gbest score 0.308955161992045368
Initial global best score 0.350315908225822
Iteration 0001/2000 current gbest score 0.350315908225821981


  dist /= len(idx)


Finish with gbest score 0.312440037743332955
Initial global best score 0.33730252270071254
Iteration 0001/2000 current gbest score 0.337302522700712537
Finish with gbest score 0.318618850346610472
Initial global best score 0.37508365354687606
Iteration 0001/2000 current gbest score 0.365776626442698705
Finish with gbest score 0.334651269124097872
Initial global best score 0.34981696209647817
Iteration 0001/2000 current gbest score 0.349816962096478168
Finish with gbest score 0.314057344476855793
Initial global best score 0.38439747963543125
Iteration 0001/2000 current gbest score 0.361812705879800245
Finish with gbest score 0.317780346184011464
Initial global best score 0.32261556138344893
Iteration 0001/2000 current gbest score 0.322615561383448934
Finish with gbest score 0.310997414183862730
Initial global best score 0.3892315000983044
Iteration 0001/2000 current gbest score 0.389006805299056346
Finish with gbest score 0.329532918297960276
Initial global best score 0.3430575835821275

  dist /= len(idx)


Finish with gbest score 0.324624866286624625
Initial global best score 0.3676757094945322
Iteration 0001/2000 current gbest score 0.367675709494532210
Finish with gbest score 0.320892940550437489
CPU times: user 1min 36s, sys: 9.02 s, total: 1min 45s
Wall time: 1min 34s


### PSO Hybrid

In [20]:
%%time
pso_hybrid = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    pso_rep = ParticleSwarmOptimizedClustering(
        n_cluster=3, n_particles=10, data=x, hybrid=True, max_iter=2000, print_debug=2000)
    pso_rep.run()
    pso_kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)
    pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
    predicted_pso_rep = pso_kmeans.predict(x)
    
    silhouette = silhouette_score(x, predicted_pso_rep)
    sse = calc_sse(centroids=pso_rep.gbest_centroids, data=x, labels=predicted_pso_rep)
    quantization = pso_rep.gbest_score
    pso_hybrid['silhouette'].append(silhouette)
    pso_hybrid['sse'].append(sse)
    pso_hybrid['quantization'].append(quantization)

Initial global best score 0.30155146657476695
Iteration 0001/2000 current gbest score 0.301551466574766946
Finish with gbest score 0.301154836231899647
Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856


  dist /= len(idx)


Finish with gbest score 0.301260423879870176
Initial global best score 0.30155146657476695
Iteration 0001/2000 current gbest score 0.301551466574766946


  dist /= len(idx)


Finish with gbest score 0.301135092719994280
Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856


  dist /= len(idx)


Finish with gbest score 0.301156368062323765
Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856
Finish with gbest score 0.301131829724883116
Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856
Finish with gbest score 0.301362189588276319
Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856
Finish with gbest score 0.301102299028291609
Initial global best score 0.30155146657476695
Iteration 0001/2000 current gbest score 0.301551466574766946
Finish with gbest score 0.301140541809194251
Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856
Finish with gbest score 0.301247825039121675
Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856
Finish with gbest score 0.301380967784858511
Initial global best score 0.301664618747543

  dist /= len(idx)


Finish with gbest score 0.301285195216301760
Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856
Finish with gbest score 0.301248069942525731
Initial global best score 0.3438501164401444
Iteration 0001/2000 current gbest score 0.343850116440144404


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(
  dist /= len(idx)


Finish with gbest score 0.313518573012841584
Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856
Finish with gbest score 0.301151644699240661
Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856
Finish with gbest score 0.301361238233388617
Initial global best score 0.301551466574767
Iteration 0001/2000 current gbest score 0.301551466574767002
Finish with gbest score 0.301106338003337193
Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856
Finish with gbest score 0.301243525930651235
Initial global best score 0.30155146657476695
Iteration 0001/2000 current gbest score 0.301551466574766946
Finish with gbest score 0.301197879533225410
Initial global best score 0.30166461874754386
Iteration 0001/2000 current gbest score 0.301664618747543856
Finish with gbest score 0.301322454062579181
CPU times: user 1min 35s, sys: 9.15 s, total:

# Comparison

In [21]:
benchmark = {
    'method' : ['K-Means++', 'PSO', 'PSO Hybrid'],
    'sse_mean' : [
        np.around(np.mean(kmeanspp['sse']), decimals=10),
        np.around(np.mean(pso_plain['sse']), decimals=10),
        np.around(np.mean(pso_hybrid['sse']), decimals=10),
    ],
    'sse_stdev' : [
        np.around(np.std(kmeanspp['sse']), decimals=10),
        np.around(np.std(pso_plain['sse']), decimals=10),
        np.around(np.std(pso_hybrid['sse']), decimals=10),
    ],
    'silhouette_mean' : [
        np.around(np.mean(kmeanspp['silhouette']), decimals=10),
        np.around(np.mean(pso_plain['silhouette']), decimals=10),
        np.around(np.mean(pso_hybrid['silhouette']), decimals=10),
    ],
    'silhouette_stdev' : [
        np.around(np.std(kmeanspp['silhouette']), decimals=10),
        np.around(np.std(pso_plain['silhouette']), decimals=10),
        np.around(np.std(pso_hybrid['silhouette']), decimals=10),
    ],
    'quantization_mean' : [
        np.around(np.mean(kmeanspp['quantization']), decimals=10),
        np.around(np.mean(pso_plain['quantization']), decimals=10),
        np.around(np.mean(pso_hybrid['quantization']), decimals=10),
    ],
    'quantization_stdev' : [
        np.around(np.std(kmeanspp['quantization']), decimals=10),
        np.around(np.std(pso_plain['quantization']), decimals=10),
        np.around(np.std(pso_hybrid['quantization']), decimals=10),
    ],
}

In [22]:
benchmark

{'method': ['K-Means++', 'PSO', 'PSO Hybrid'],
 'sse_mean': [22.0245718908, 26.8100429723, 22.1722081422],
 'sse_stdev': [0.0006264453, 4.1065503521, 0.4407198128],
 'silhouette_mean': [0.4221074074, 0.3872134749, 0.4216918578],
 'silhouette_stdev': [6.4517e-06, 0.0390602608, 0.0018957055],
 'quantization_mean': [0.3016533035, 0.3198187361, 0.3018347054],
 'quantization_stdev': [3.39457e-05, 0.0084367183, 0.0026818356]}

In [23]:
benchmark_df = pd.DataFrame.from_dict(benchmark)
benchmark_df

Unnamed: 0,method,sse_mean,sse_stdev,silhouette_mean,silhouette_stdev,quantization_mean,quantization_stdev
0,K-Means++,22.024572,0.000626,0.422107,6e-06,0.301653,3.4e-05
1,PSO,26.810043,4.10655,0.387213,0.03906,0.319819,0.008437
2,PSO Hybrid,22.172208,0.44072,0.421692,0.001896,0.301835,0.002682


In [24]:
benchmark_df.to_excel('benchmark_res.xlsx', index=False)

ModuleNotFoundError: No module named 'openpyxl'

In [None]:
benchmark_df.to_csv('benchmark_res.csv', index=False)