In [11]:
import pandas as pd
from sklearn.cluster import KMeans, SpectralClustering
import matplotlib.pyplot as plt
from matplotlib.image import imread
import pandas as pd
import seaborn as sns
from sklearn.metrics import silhouette_samples, silhouette_score

In [150]:
import numpy

def quantization_score(centroids: np.ndarray, labels: np.ndarray, data: np.ndarray) -> float:
    error = 0.0
    for i, c in enumerate(centroids):
        idx = np.where(labels == i)
        dist = np.linalg.norm(data[idx] - c)
        dist /= len(idx)
        error += dist
    error /= len(centroids)
    return error

def calc_mse(centroids: numpy.ndarray, labels: np.ndarray, data: np.ndarray):
    distances = []
    for i, c in enumerate(centroids):
        idx = numpy.where(labels == i)
        dist = numpy.mean((data[idx] - c)**2)
        distances.append(dist)
    return np.mean(distances)


class Particle:
    
    def __init__(self,
                 n_cluster: int,
                 data: np.ndarray,
                 w: float = 0.9,
                 c1: float = 0.5,
                 c2: float = 0.3):
        index = np.random.choice(list(range(len(data))), n_cluster)
        self.centroids = data[index].copy()
        self.best_position = self.centroids.copy()
        self.best_score = quantization_score(self.centroids, self._predict(data), data)
        self.best_mse = calc_mse(self.centroids, self._predict(data), data)
        self.velocity = np.zeros_like(self.centroids)
        self._w = w
        self._c1 = c1
        self._c2 = c2

    def update(self, gbest_position: np.ndarray, data: np.ndarray):
        self._update_velocity(gbest_position)
        self._update_centroids(data)

    def _update_velocity(self, gbest_position: np.ndarray):

        v_old = self._w * self.velocity
        cognitive_component = self._c1 * np.random.random() * (self.best_position - self.centroids)
        social_component = self._c2 * np.random.random() * (gbest_position - self.centroids)
        self.velocity = v_old + cognitive_component + social_component

    def _update_centroids(self, data: np.ndarray):
        self.centroids = self.centroids + self.velocity
        new_score = quantization_score(self.centroids, self._predict(data), data)
        mse = calc_mse(self.centroids, self._predict(data), data)
        self.best_mse = min(mse, self.best_mse)
        if new_score < self.best_score:
            self.best_score = new_score
            self.best_position = self.centroids.copy()

    def _predict(self, data: np.ndarray) -> np.ndarray:
        
        distance = self._calc_distance(data)
        cluster = self._assign_cluster(distance)
        return cluster

    def _calc_distance(self, data: np.ndarray) -> np.ndarray:
        
        distances = []
        for c in self.centroids:
            distance = np.sum((data - c) * (data - c), axis=1)
            distances.append(distance)

        distances = np.array(distances)
        distances = np.transpose(distances)
        return distances

    def _assign_cluster(self, distance: np.ndarray) -> np.ndarray:
        
        cluster = np.argmin(distance, axis=1)
        return cluster


In [151]:
import numpy as np

class ParticleSwarmOptimizedClustering:
    def __init__(self,
                 n_cluster: int,
                 n_particles: int,
                 data: np.ndarray,
                 max_iter: int = 100,
                 print_debug: int = 10):
        self.n_cluster = n_cluster
        self.n_particles = n_particles
        self.data = data
        self.max_iter = max_iter
        self.particles = []

        self.print_debug = print_debug
        self.gbest_score = np.inf
        self.gbest_centroids = None
        self.gbest_mse = np.inf
        self._init_particles()

    def _init_particles(self):
        for i in range(self.n_particles):
            particle = None
            particle = Particle(self.n_cluster, self.data)
            if particle.best_score < self.gbest_score:
                self.gbest_centroids = particle.centroids.copy()
                self.gbest_score = particle.best_score
            self.particles.append(particle)
            self.gbest_mse = min(particle.best_mse, self.gbest_mse)

    def run(self):
        print('Initial global best score', self.gbest_score)
        history = []
        for i in range(self.max_iter):
            for particle in self.particles:
                particle.update(self.gbest_centroids, self.data)
                
            for particle in self.particles:
                if particle.best_score <= self.gbest_score:
                    self.gbest_centroids = particle.centroids.copy()
                    self.gbest_score = particle.best_score
            history.append(self.gbest_score)
            if i % self.print_debug == 0:
                print('Iteration {:04d}/{:04d} current gbest score {:.18f}'.format(
                    i + 1, self.max_iter, self.gbest_score))
        print('Finish with gbest score {:.18f}'.format(self.gbest_score))
        return history
    
    def _calc_distance(self, data: numpy.ndarray):
        
        distances = []
        for c in self.centroid:
            distance = numpy.sum((data - c) * (data - c), axis=1)
            distances.append(distance)

        distances = numpy.array(distances)
        distances = distances.T
        return distances

    def predict(self, distance: numpy.ndarray):
        
        cluster = numpy.argmin(distance, axis=1)
        return cluster

In [152]:
from sklearn.metrics.pairwise import euclidean_distances

def delta(ck, cl):
    values = np.ones([len(ck), len(cl)])*10000
    
    for i in range(0, len(ck)):
        for j in range(0, len(cl)):
            values[i, j] = np.linalg.norm(ck[i]-cl[j])
            
    return np.min(values)
    
def big_delta(ci):
    values = np.zeros([len(ci), len(ci)])
    
    for i in range(0, len(ci)):
        for j in range(0, len(ci)):
            values[i, j] = np.linalg.norm(ci[i]-ci[j])
            
    return np.max(values)
    
def dunn(k_list):
    
    deltas = np.ones([len(k_list), len(k_list)])*1000000
    big_deltas = np.zeros([len(k_list), 1])
    l_range = list(range(0, len(k_list)))
    
    for k in l_range:
        for l in (l_range[0:k]+l_range[k+1:]):
            deltas[k, l] = delta(k_list[k], k_list[l])
        
        big_deltas[k] = big_delta(k_list[k])

    di = np.min(deltas)/np.max(big_deltas)
    return di

In [153]:
df=pd.read_csv('norm.csv')

In [154]:
df_req = df[['Stride Length (m)', 'Cadence(steps/min)', 'Leg Length (m)',
       'Age(years)','classs']]
X  = df_req[['Stride Length (m)', 'Cadence(steps/min)', 'Leg Length (m)',
       'Age(years)']]
labels = df.classs.values

In [155]:
from sklearn.model_selection import train_test_split
x_tr,x_tt,y_tr,y_tt = train_test_split(X,labels,test_size=.30)

In [156]:
pso = ParticleSwarmOptimizedClustering(
        n_cluster=2, n_particles=50, data=X.values, max_iter=8000, print_debug=50)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [157]:
hist = pso.run()

Initial global best score 163.62907733577805
Iteration 0001/8000 current gbest score 163.629077335778049473
Iteration 0051/8000 current gbest score 156.293832302095268005
Iteration 0101/8000 current gbest score 156.217018573168644480
Iteration 0151/8000 current gbest score 156.206373821478962327
Iteration 0201/8000 current gbest score 156.206203938285170807
Iteration 0251/8000 current gbest score 156.206195726828752868
Iteration 0301/8000 current gbest score 156.206141801718302986
Iteration 0351/8000 current gbest score 156.205713943997579918
Iteration 0401/8000 current gbest score 156.205318210909439358
Iteration 0451/8000 current gbest score 156.204938361334995989
Iteration 0501/8000 current gbest score 156.204851402045392206
Iteration 0551/8000 current gbest score 156.204826786287242157
Iteration 0601/8000 current gbest score 156.204775340341541323
Iteration 0651/8000 current gbest score 156.204669259214568910
Iteration 0701/8000 current gbest score 156.204586246115411541
Iteration 

Iteration 6501/8000 current gbest score 156.204512111948673692
Iteration 6551/8000 current gbest score 156.204512111948673692
Iteration 6601/8000 current gbest score 156.204512111948673692
Iteration 6651/8000 current gbest score 156.204512111948673692
Iteration 6701/8000 current gbest score 156.204512111948673692
Iteration 6751/8000 current gbest score 156.204512111948673692
Iteration 6801/8000 current gbest score 156.204512111948673692
Iteration 6851/8000 current gbest score 156.204512111948673692
Iteration 6901/8000 current gbest score 156.204512111948673692
Iteration 6951/8000 current gbest score 156.204512111948673692
Iteration 7001/8000 current gbest score 156.204512111948673692
Iteration 7051/8000 current gbest score 156.204512111948673692
Iteration 7101/8000 current gbest score 156.204512111948673692
Iteration 7151/8000 current gbest score 156.204512111948673692
Iteration 7201/8000 current gbest score 156.204512111948673692
Iteration 7251/8000 current gbest score 156.20451211194

In [158]:
pso.gbest_centroids

array([[  0.99058651, 139.7506603 ,   0.54507355,   7.57521861],
       [  0.64637618,  88.66132506,   0.59825411,  11.87725654]])

In [159]:
dunn(pso.gbest_centroids)

0.0003820288781148214

In [160]:
pso.gbest_mse

96.33664813186815

In [161]:
d = pso.predict(X.values)

In [162]:
print("Silhouette Coefficient: %0.3f"
      % silhouette_score(d.reshape(-1,1),labels, metric='euclidean'))

Silhouette Coefficient: 0.107


In [163]:
import numpy as np
from sklearn import metrics

def purity_score(y_true, y_pred):
    # compute contingency matrix (also called confusion matrix)
    contingency_matrix = metrics.cluster.contingency_matrix(y_true, y_pred)
    # return purity
    return np.sum(np.amax(contingency_matrix, axis=0)) / np.sum(contingency_matrix) 

In [164]:
purity_score(labels,d)

0.6153846153846154