# Introduction

# Setup

## Imports

In [556]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MeanShift
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris, load_breast_cancer, load_digits

## Dataset

In [559]:
# Load scikit-learn's bundled iris, breast-cancer and digits datasets
# (evaluated left to right, one loader call per name).
iris, breast, digits = load_iris(), load_breast_cancer(), load_digits()

# MeanShift

In [548]:
class CustomMeanShift(BaseEstimator, ClusterMixin):
    """Mean-shift clustering with a flat (uniform) kernel.

    Every distinct sample starts as its own centroid.  On each iteration a
    centroid moves to the mean of all training samples that lie within
    ``radius`` of it, and centroids that land on exactly the same
    coordinates are collapsed into one.  Iteration stops once no centroid
    moves by more than the tolerance, or after ``max_iter`` iterations.

    Centroids are only merged when they are *exactly* equal, so several
    nearly identical centers may be returned for one dense region.

    Parameters
    ----------
    radius : float, default=5
        Radius of the flat kernel; must be non-negative.
    max_iter : int, default=300
        Upper bound on the number of shift iterations; must be at least 1.
    tol : float, default=0.0001
        Absolute tolerance passed to ``np.allclose`` when deciding whether
        the centroids have stopped moving; must be non-negative.

    Attributes
    ----------
    cluster_centers_ : ndarray of shape (n_clusters, n_features)
        Centroids found by ``fit``.
    labels_ : ndarray of shape (n_samples,)
        Index of the nearest centroid for each training sample.
    n_iter_ : int
        Number of shift iterations actually performed.
    fitted_ : bool
        Set to ``True`` once ``fit`` has completed.
    X : ndarray
        The validated training data (kept for backward compatibility).
    """

    def __init__(self, radius=5, max_iter=300, tol=0.0001):
        self.radius = radius
        self.max_iter = max_iter
        self.tol = tol

    def fit(self, X, y=None):
        """Find cluster centers for ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples.  A 1-D input is treated as ``n_samples``
            observations of a single feature.
        y : ignored
            Accepted only so the estimator fits the usual ``fit(X, y)``
            calling convention.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If a hyper-parameter is out of range or ``X`` is empty, not
            1-D/2-D, or contains NaN/infinite values.
        """
        X = self._check_params(X)
        self.X = X

        self._meanshift(X)
        # ClusterMixin.fit_predict reads labels_, so it has to exist after fit.
        self.labels_ = self._nearest_center(X)
        self.fitted_ = True

        return self

    def _meanshift(self, X):
        """Run the shift iterations and store the resulting centroids."""
        # Initially every distinct sample is its own centroid.
        centroids = pd.DataFrame(X).drop_duplicates().values

        n_iter = 0
        for n_iter in range(1, self.max_iter + 1):
            # Move each centroid to the mean of the samples inside its radius.
            # The distance test is vectorised over all samples at once.
            shifted = [
                X[np.linalg.norm(X - centroid, axis=1) <= self.radius].mean(axis=0)
                for centroid in centroids
            ]

            previous = centroids
            # Collapse centroids that landed on exactly the same coordinates;
            # drop_duplicates keeps first occurrences, so order is preserved.
            centroids = pd.DataFrame(shifted).drop_duplicates().values

            # Some centroids merged: rows no longer line up with the previous
            # iteration, so convergence cannot be judged yet.
            if len(centroids) < len(previous):
                continue

            # Same count means same row order, so a row-wise comparison is valid.
            if np.allclose(centroids, previous, atol=self.tol):
                break
        else:
            import warnings

            warnings.warn(
                "CustomMeanShift did not converge within max_iter=%d iterations"
                % self.max_iter
            )

        self.n_iter_ = n_iter
        self.cluster_centers_ = centroids

    def predict(self, X):
        """Assign each sample in ``X`` to its nearest cluster center.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
            Row index into ``cluster_centers_`` for every sample.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.  This type derives from both
            ``ValueError`` and ``AttributeError``.
        ValueError
            If the number of features differs from the training data.
        """
        # getattr avoids the bare AttributeError an unfitted instance would
        # otherwise raise when the attribute does not exist yet.
        if not getattr(self, "fitted_", False):
            from sklearn.exceptions import NotFittedError

            raise NotFittedError('"predict()" called before fit()')

        X = self._as_samples(X)
        n_features = self.cluster_centers_.shape[1]
        if X.shape[1] != n_features:
            raise ValueError(
                "X has %d features, but CustomMeanShift was fitted with %d"
                % (X.shape[1], n_features)
            )
        return self._nearest_center(X)

    def _nearest_center(self, X):
        """Return, for each row of ``X``, the index of the closest centroid."""
        # One distance column per centroid keeps memory at n_samples x n_clusters.
        distances = np.column_stack(
            [np.linalg.norm(X - center, axis=1) for center in self.cluster_centers_]
        )
        return distances.argmin(axis=1)

    @staticmethod
    def _as_samples(X):
        """Convert ``X`` to a 2-D float array, treating 1-D input as one feature."""
        X = np.asarray(X, dtype=float)
        if X.ndim == 1:
            X = X.reshape(-1, 1)
        if X.ndim != 2:
            raise ValueError("X must be 1-D or 2-D, got %d dimensions" % X.ndim)
        return X

    def _check_params(self, X):
        """Validate the hyper-parameters and ``X``; return ``X`` as a 2-D float array.

        Raises
        ------
        ValueError
            On a negative/NaN ``radius`` or ``tol``, ``max_iter < 1``, or an
            empty or non-finite ``X``.
        """
        # "not (v >= 0)" also rejects NaN, which "v < 0" would let through.
        if not self.radius >= 0:
            raise ValueError("radius must be non-negative, got %r" % (self.radius,))
        if not self.max_iter >= 1:
            raise ValueError("max_iter must be at least 1, got %r" % (self.max_iter,))
        if not self.tol >= 0:
            raise ValueError("tol must be non-negative, got %r" % (self.tol,))

        X = self._as_samples(X)
        if X.shape[0] == 0:
            raise ValueError("X must contain at least one sample")
        # A NaN/inf sample would produce an empty neighbourhood and NaN centers.
        if not np.isfinite(X).all():
            raise ValueError("X must not contain NaN or infinite values")
        return X

# CustomMeanShift vs MeanShift

### Breast Cancer Dataset

In [580]:
# Use the breast-cancer data: X is the feature matrix, y the target vector.
dataset = breast
X, y = dataset.data, dataset.target

In [582]:
# Fit scikit-learn's MeanShift and the custom implementation on the same data
# with the same kernel size (bandwidth == radius == 800).
skModel = MeanShift(bandwidth=800)
skModel.fit(X)
custModel = CustomMeanShift(radius=800)
custModel.fit(X)

In [583]:
# (n_clusters, n_features) reported by each implementation.
print(np.shape(skModel.cluster_centers_))
print(np.shape(custModel.cluster_centers_))
# Shape after stacking both sets of centers and dropping exactly equal rows.
print(pd.DataFrame(np.concatenate([skModel.cluster_centers_, custModel.cluster_centers_], axis=0)).drop_duplicates().shape)

(2, 30)
(2, 30)
(3, 30)


In [585]:
# Print the centers found by each implementation, each followed by a blank line.
print(skModel.cluster_centers_, end="\n\n")
print(custModel.cluster_centers_, end="\n\n")

# Element-wise difference between the first center of each model.
# Per the author's observation, these two centers describe the same cluster,
# but tiny floating-point differences make exact de-duplication treat them
# as distinct rows.
print(np.subtract(skModel.cluster_centers_[0], custModel.cluster_centers_[0]))

[[  1.27937909e+01   1.86596336e+01   8.27568534e+01   5.17550431e+02
    9.50542888e-02   9.31873276e-02   6.62066825e-02   3.57974224e-02
    1.78625431e-01   6.32597198e-02   3.18474569e-01   1.21181659e+00
    2.25011142e+00   2.58279052e+01   7.13251724e-03   2.37828233e-02
    2.91449237e-02   1.08076034e-02   2.05462457e-02   3.73069677e-03
    1.43887263e+01   2.48701724e+01   9.42817457e+01   6.55478017e+02
    1.30479159e-01   2.29325668e-01   2.28455274e-01   9.54293987e-02
    2.85591379e-01   8.33652802e-02]
 [  2.74200000e+01   2.62700000e+01   1.86900000e+02   2.50100000e+03
    1.08400000e-01   1.98800000e-01   3.63500000e-01   1.68900000e-01
    2.06100000e-01   5.62300000e-02   2.54700000e+00   1.30600000e+00
    1.86500000e+01   5.42200000e+02   7.65000000e-03   5.37400000e-02
    8.05500000e-02   2.59800000e-02   1.69700000e-02   4.55800000e-03
    3.60400000e+01   3.13700000e+01   2.51200000e+02   4.25400000e+03
    1.35700000e-01   4.25600000e-01   6.83300000e-01 

### Iris Dataset

In [586]:
# Use the iris data: X is the feature matrix, y the target vector.
dataset = iris
X, y = dataset.data, dataset.target

In [587]:
# Fit scikit-learn's MeanShift and the custom implementation on the same data
# with the same kernel size (bandwidth == radius == 1.2).
skModel = MeanShift(bandwidth=1.2)
skModel.fit(X)
custModel = CustomMeanShift(radius=1.2)
custModel.fit(X)

In [588]:
# (n_clusters, n_features) reported by each implementation.
print(np.shape(skModel.cluster_centers_))
print(np.shape(custModel.cluster_centers_))

# Shape after stacking both sets of centers and dropping exactly equal rows.
print(pd.DataFrame(np.concatenate([skModel.cluster_centers_, custModel.cluster_centers_], axis=0)).drop_duplicates().shape)

(2, 4)
(7, 4)
(9, 4)


In [589]:
# Print scikit-learn's centers, a blank line, then the custom model's centers.
print(skModel.cluster_centers_, end="\n\n")
print(custModel.cluster_centers_)

[[ 6.21142857  2.89285714  4.85285714  1.67285714]
 [ 5.01632653  3.44081633  1.46734694  0.24285714]]

[[ 5.01632653  3.44081633  1.46734694  0.24285714]
 [ 5.00208333  3.42083333  1.46666667  0.23958333]
 [ 6.21142857  2.89285714  4.85285714  1.67285714]
 [ 6.15217391  2.85072464  4.74057971  1.60724638]
 [ 6.18529412  2.87058824  4.80588235  1.63970588]
 [ 6.17272727  2.87424242  4.77575758  1.62878788]
 [ 6.19275362  2.87681159  4.81884058  1.64637681]]
