In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from joblib import Parallel, delayed
from sklearn.neighbors import NearestNeighbors
from scipy.spatial import distance
from sklearn.metrics.cluster import adjusted_rand_score, rand_score
from sklearn.decomposition import PCA
from time import time
# import warnings filter
from warnings import simplefilter
# (FutureWarnings are silenced in a later cell via simplefilter)

In [2]:
def get_data():
    """Load ``./semeion.data`` and split each row into features and targets.

    The file is parsed as ``int8``. The first 256 columns of every row are
    returned as the feature matrix and all remaining columns as the target
    matrix (the targets are treated as one-hot vectors elsewhere in this file).

    Returns
    -------
    tuple
        ``(features, targets)`` — two column slices of the loaded array.
    """
    raw = np.loadtxt('./semeion.data', dtype=np.int8)
    features = raw[:, :256]
    targets = raw[:, 256:]
    return features, targets


def one_hot_decode(y: np.ndarray) -> np.ndarray:
    """Turn row-wise one-hot vectors into integer class indices.

    Parameters
    ----------
    y : np.ndarray
        2-D array with one row per sample; the class is the position of the
        largest entry in each row.

    Returns
    -------
    np.ndarray
        1-D array holding, for every row, the column index of its maximum
        (the first such index when there are ties).
    """
    # NOTE: the annotation is np.ndarray (the type); np.array is only the
    # factory function and is not meaningful as a type hint.
    return np.argmax(y, axis=1)


def get_data_transformed():
    """Load ``./semeion.data`` and return features with integer class labels.

    The file is parsed as ``int8``; the first 256 columns form the feature
    matrix and the remaining columns are passed through ``one_hot_decode``
    to obtain one integer label per row.

    Returns
    -------
    tuple
        ``(features, labels)``.
    """
    raw = np.loadtxt('./semeion.data', dtype=np.int8)
    features = raw[:, :256]
    encoded_targets = raw[:, 256:]
    return features, one_hot_decode(encoded_targets)


In [3]:
# Placeholder "reducer" for the case where no dimensionality reduction is applied.
class NoDR:
    """Identity stand-in offering the constructor / ``fit_transform`` shape used for PCA."""

    def __init__(self, n_components):
        """Accept ``n_components`` for call compatibility; the value is ignored."""

    def fit_transform(self, X):
        """Return a copy of ``X`` so the caller's data is never aliased."""
        duplicate = X.copy()
        return duplicate


def plotimg(row: np.ndarray, shape=(16, 16)):
    """Display a flattened image with matplotlib.

    Parameters
    ----------
    row : np.ndarray
        Flat pixel vector; its size must equal the product of ``shape``.
    shape : tuple of int, default=(16, 16)
        Target (rows, columns) layout. The default keeps the original
        16x16 behaviour.
    """
    img = np.asarray(row).reshape(shape)
    plt.imshow(img)
    plt.show()


In [4]:
# Suppress FutureWarning messages from here on.
simplefilter('ignore', FutureWarning)

# NOTE(review): ks and ds are not referenced by any cell visible here;
# ds lists the same dimensions that MS() iterates over — confirm ks is still needed.
ks = range(5, 16)
ds = (2, 64, 128, 256)

In [5]:
class Mean_Shift:
    """Flat-kernel mean-shift clustering that uses every sample as a seed.

    NOTE(review): the implementation appears to be adapted from
    scikit-learn's ``MeanShift``; unlike that estimator there is no binned
    seeding and every sample is always assigned to its nearest centre.

    Parameters
    ----------
    bandwidth : float
        Radius of the neighbourhood used both while shifting seeds and when
        merging near-duplicate centres.
    max_iter : int, default=200
        Iteration cap passed to ``_mean_shift_single_seed`` for each seed.
    n_jobs : int, default=4
        Worker count for the per-seed shifts and for the neighbour searches
        done after the shifts.

    Attributes set by ``fit``
    -------------------------
    cluster_centers_ : np.ndarray
        The retained centres, ordered by decreasing neighbourhood size.
    labels_ : np.ndarray
        Index of the nearest centre for every sample.
    nlabels_ : int
        Number of distinct values in ``labels_``.
    n_iter_ : int
        Largest iteration count reported by any seed.
    """

    def __init__(self, bandwidth, max_iter=200, n_jobs=4):
        self.bandwidth = bandwidth
        self.max_iter = max_iter
        self.n_jobs = n_jobs

    def __str__(self):
        # Short tag used in printed titles and in the results table.
        return 'MS'

    def fit(self, X):
        """Cluster ``X`` and store centres and labels on the instance.

        Parameters
        ----------
        X : np.ndarray
            Sample matrix, one row per sample. Every row is used as a seed.

        Returns
        -------
        Mean_Shift
            ``self``.

        Raises
        ------
        ValueError
            If no seed ended with at least one sample inside its bandwidth.
        """
        seeds = X

        # Radius index over the samples. n_jobs=1 here, presumably because the
        # per-seed queries are already run in parallel below.
        nbrs = NearestNeighbors(radius=self.bandwidth, n_jobs=1).fit(X)

        # Each result is (center tuple, points within bandwidth, iterations).
        all_res = Parallel(n_jobs=self.n_jobs)(
            delayed(_mean_shift_single_seed)
            (seed, X, nbrs, self.max_iter) for seed in seeds)

        # Keep only centres that have neighbours; seeds that converge to the
        # same centre collapse onto a single dictionary key.
        center_intensity_dict = {
            center: n_within for center, n_within, _ in all_res if n_within}

        self.n_iter_ = max(res[2] for res in all_res)

        if not center_intensity_dict:
            # Adjacent string literals (not a backslash continuation inside the
            # literal) so no indentation whitespace leaks into the message.
            raise ValueError("No point was within bandwidth=%f of any seed."
                             " Try a different seeding strategy"
                             " or increase the bandwidth."
                             % self.bandwidth)

        # Strongest centres first; ties are broken by the centre coordinates.
        sorted_by_intensity = sorted(center_intensity_dict.items(),
                                     key=lambda tup: (tup[1], tup[0]),
                                     reverse=True)
        sorted_centers = np.array([tup[0] for tup in sorted_by_intensity])

        # Walk the centres from strongest to weakest and drop every weaker
        # centre lying within one bandwidth of a centre that is kept.
        unique = np.ones(len(sorted_centers), dtype=bool)
        nbrs = NearestNeighbors(radius=self.bandwidth,
                                n_jobs=self.n_jobs).fit(sorted_centers)
        for i, center in enumerate(sorted_centers):
            if unique[i]:
                neighbor_idxs = nbrs.radius_neighbors([center],
                                                      return_distance=False)[0]
                unique[neighbor_idxs] = False
                unique[i] = True  # the centre itself is in its own neighbourhood
        cluster_centers = sorted_centers[unique]

        # Label every sample with the index of its nearest surviving centre.
        nbrs = NearestNeighbors(n_neighbors=1,
                                n_jobs=self.n_jobs).fit(cluster_centers)
        labels = nbrs.kneighbors(X, return_distance=False).flatten()

        self.cluster_centers_ = cluster_centers
        self.labels_ = labels
        self.nlabels_ = len(np.unique(labels))
        return self

In [6]:

def _mean_shift_single_seed(my_mean, X, nbrs, max_iter):

    bandwidth = nbrs.get_params()['radius']
    stop_thresh = 1e-3 * bandwidth  
    completed_iterations = 0
    while True:
        i_nbrs = nbrs.radius_neighbors([my_mean], bandwidth,
                                       return_distance=False)[0]
        points_within = X[i_nbrs]
        if len(points_within) == 0:
            break 
        my_old_mean = my_mean  
        my_mean = np.mean(points_within, axis=0)
        if (np.linalg.norm(my_mean - my_old_mean) < stop_thresh or
                completed_iterations == max_iter):
            break
        completed_iterations += 1
    return tuple(my_mean), len(points_within), completed_iterations




In [7]:
def MS(p=False, params=None):
    """Run mean-shift on several embeddings of the data and save the scores.

    For every ``(window, d)`` pair the data is embedded into ``d`` dimensions
    (PCA, or ``NoDR`` when ``d`` equals the original feature count), clustered
    with ``Mean_Shift(window)`` and scored against the true labels. One row per
    pair is printed and written to ``MS.csv``.

    Parameters
    ----------
    p : bool, default=False
        When true, ``plot2D`` is called for every run.
        NOTE(review): ``plot2D`` is not defined in the cells visible here —
        confirm it exists before calling with ``p=True``.
    params : iterable of (window, d) pairs, optional
        Bandwidth / target-dimension combinations to evaluate. Defaults to
        ``((1, 2), (6.3, 64), (7.5, 128), (7, 256))``.

    Notes
    -----
    The ``k`` column of the output holds the window size (bandwidth), not a
    cluster count; the column name is kept because ``results_analysis`` reads it.
    The recorded time covers both the embedding and the clustering.
    """
    if params is None:
        params = ((1, 2), (6.3, 64), (7.5, 128), (7, 256))

    columns = ('clustering_model', 'dim_red_model', 'd', 'k', 'time', 'ARI', 'RI')
    rows = []
    X, y = get_data_transformed()
    n_features = X.shape[1]
    for w, d in params:
        model = Mean_Shift(w)
        start_time = time()
        # No reduction is needed when the target dimension is the full width.
        embedding_model = PCA if d != n_features else NoDR
        X_transformed = embedding_model(n_components=d).fit_transform(X)
        model.fit(X_transformed)
        elapsed = time() - start_time
        title = (f'Results for {model} on dim={d} ({elapsed:.02f}s)\n'
                 f'number of centers: {model.nlabels_} - window size: {w}')
        print(title)
        ari, ri = print_rand(y, model.labels_)
        if p:
            plot2D(X_transformed, model.labels_, title)
        rows.append((str(model), embedding_model.__name__, d, w, elapsed, ari, ri))
    df = pd.DataFrame(data=rows, columns=columns)
    print(df)
    df.to_csv('MS.csv', index=False)

In [8]:
def print_rand(y_true, y_pred):
    """Print the adjusted and plain Rand indices as percentages and return them.

    Returns
    -------
    tuple
        ``(adjusted_rand_score, rand_score)`` as returned by the metric
        functions (not scaled to percent).
    """
    adjusted = adjusted_rand_score(y_true, y_pred)
    plain = rand_score(y_true, y_pred)

    def as_percent(value):
        return round(value * 100, 2)

    print('adjusted_rand_score', as_percent(adjusted), '%')
    # The extra newline leaves a blank line after each report.
    print('rand_score', as_percent(plain), '%', end='\n\n')
    return adjusted, plain

In [9]:
def results_analysis():
    """Print summary tables for the results stored in ``MS.csv``.

    Three reports are printed: mean time/ARI/RI grouped by the ``k`` column,
    the same means grouped by ``dim_red_model``, and the three rows with the
    highest ARI.

    NOTE(review): for the rows written by ``MS`` the ``k`` column holds the
    window size, so the "number of cluster" heading may be misleading — confirm.
    """
    df = pd.read_csv('MS.csv')
    metrics = ['time', 'ARI', 'RI']

    # Pick the numeric metric columns *before* aggregating. Averaging the whole
    # frame also tries to average the string columns, which raises TypeError on
    # pandas >= 2.0 (and only warned on earlier versions).
    dfg_k = df.groupby('k')[metrics].mean()
    print('averages by number of cluster:')
    print(dfg_k.sort_values('ARI', ascending=False))
    print()

    dfg_dr = df.groupby('dim_red_model')[metrics].mean()
    print('averages by DR model:')
    print(dfg_dr.sort_values('ARI', ascending=False))
    print()

    print('best ARI achieved by:')
    print(df.sort_values('ARI', ascending=False).head(3))


In [10]:
def main():
    """Run the mean-shift experiment, then print the analysis of its CSV output."""
    MS()
    results_analysis()

In [11]:
# Entry point: runs the full experiment when this cell is executed
# (or when the code is run as a script).
if __name__ == '__main__':
    main()

Results for MS on dim=2 (6.91s)
number of centers: 8 - window size: 1
adjusted_rand_score 20.68 %
rand_score 81.86 %

Results for MS on dim=64 (6.35s)
number of centers: 40 - window size: 6.3
adjusted_rand_score 3.64 %
rand_score 51.54 %

Results for MS on dim=128 (16.81s)
number of centers: 13 - window size: 7.5
adjusted_rand_score 0.01 %
rand_score 11.85 %

Results for MS on dim=256 (31.51s)
number of centers: 250 - window size: 7
adjusted_rand_score 6.42 %
rand_score 69.51 %

  clustering_model dim_red_model    d    k       time       ARI        RI
0               MS           PCA    2  1.0   6.913914  0.206825  0.818561
1               MS           PCA   64  6.3   6.353009  0.036450  0.515360
2               MS           PCA  128  7.5  16.814545  0.000115  0.118547
3               MS          NoDR  256  7.0  31.511524  0.064224  0.695134
averages by number of cluster:
          time       ARI        RI
k                                 
1.0   6.913914  0.206825  0.818561
7.0  31.51