In [1]:
import numpy as np
import pandas as pd

from sklearn.cluster import MeanShift, estimate_bandwidth

In [2]:
PATH = './Cluster/'

PATH_TO_DATA_S = PATH + 's3.txt'
PATH_TO_RESULT_S = PATH + 's3-cb.txt'

In [3]:
# read input data set
data = np.array(pd.read_csv(PATH_TO_DATA_S, header=None, delim_whitespace=True))
# read ground truth clusters
labels = np.sort(np.array(pd.read_csv(PATH_TO_RESULT_S, header=None, delim_whitespace=True)), axis=0)

In [4]:
quantile=0.045      # parameter to estimate bandwidth
                    # it was matched manually to have 15 clusters

bw = estimate_bandwidth(data, quantile=quantile, n_samples=data.shape[0], random_state=100)
clustering = MeanShift(bandwidth=bw).fit(data)

In [5]:
clusters = np.sort(clustering.cluster_centers_.astype(int), axis=0)

In [6]:
distances = np.sqrt((labels[:,0] - clusters[:,0])**2 + (labels[:,1] - clusters[:,1])**2)
# mean distance between 
np.mean(distances)

12945.605752856196

Depite of the mean distance value, the result is pretty good, as point coordinate values are very large. Let's just compare ceters visually.

In [7]:
print(clusters)
print(labels)

[[199455 225999]
 [262865 238715]
 [302915 259124]
 [343063 326903]
 [357180 399919]
 [369050 403517]
 [448144 441735]
 [519488 459637]
 [526191 464570]
 [564225 592086]
 [606784 602502]
 [666711 653624]
 [767523 763767]
 [774316 769922]
 [775033 787490]]
[[192400 213438]
 [228603 234627]
 [302308 256784]
 [339104 295369]
 [357923 346765]
 [365597 406680]
 [440961 445394]
 [517585 458056]
 [532869 467162]
 [571726 591348]
 [604661 602360]
 [674703 644492]
 [763783 768063]
 [771756 769944]
 [773235 792663]]


In [8]:
PATH_TO_DATA_G2 = PATH + 'g2-2-100.txt'
PATH_TO_RESULT_G2 = PATH + 'g2-2-100-gt.txt'

In [9]:
# read input data set
data_g2 = np.array(pd.read_csv(PATH_TO_DATA_G2, header=None, delim_whitespace=True))
# read ground truth clusters
labels_g2 = np.sort(np.array(pd.read_csv(PATH_TO_RESULT_G2, header=None, delim_whitespace=True)), axis=0)

In [10]:
quantile_g2=0.1       # parameter to estimate bandwidth
                      # it was matched manually to have 2 clusters

bw_g2 = estimate_bandwidth(data_g2, quantile=quantile_g2, n_samples=data_g2.shape[0], random_state=100)
clustering_g2 = MeanShift(bandwidth=bw_g2).fit(data_g2)

clusters_g2 = np.sort(clustering_g2.cluster_centers_.astype(int), axis=0)

In [11]:
distances_g2 = np.sqrt((labels_g2[:,0] - clusters_g2[:,0])**2 + (labels_g2[:,1] - clusters_g2[:,1])**2)
# mean distance between predicted and 
np.mean(distances_g2)

218.89642843508443

In [12]:
print(clusters_g2)
print(labels_g2)

[[445 555]
 [562 958]]
[[500 500]
 [600 600]]


On the G2 dataset result does not seem very good. The first cluster center is predicted pretty well but the second has a large difference of the second value. It looks like it happend due to cluster overlap.