# Introduction

# Setup

# Imports

In [32]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cross_validation import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

# Dataset

In [52]:
# Load the iris bundle once, then pull out the feature matrix and the targets.
dataset = load_iris()
X, y = dataset.data, dataset.target

# Train/test split using train_test_split's default settings; no random_state
# is passed here, so the partition is presumably different on every run.
xTrain, xTest, yTrain, yTest = train_test_split(X, y)

# KMeans

In [49]:
class CustomKMeans(BaseEstimator, ClusterMixin):
    """Minimal k-means clusterer with a scikit-learn style interface.

    Each round assigns every sample to its nearest centroid (Euclidean
    distance) and then moves every centroid to the mean of its members.

    Parameters
    ----------
    k : int, default=10
        Number of clusters to form.
    max_iter : int, default=300
        Upper bound on the number of assign/update rounds.
    tol : float, default=0.0001
        Fitting stops as soon as no centroid moved farther than ``tol``
        (Euclidean distance) during a round. ``tol=0`` stops only when the
        centroids did not move at all.

    Attributes set by ``fit``
    -------------------------
    centroids : dict
        Cluster index -> centroid (1-D array).
    clusters : dict
        Cluster index -> list of the training points assigned to it during
        the last assignment step.
    labels_ : ndarray of int
        Cluster index of every training sample, from the last assignment step.
    """

    def __init__(self, k=10, max_iter=300, tol=0.0001):
        self.k = k
        self.max_iter = max_iter
        self.tol = tol

    @property
    def cluster_centers_(self):
        """Return the centroids as one array, ordered by cluster index."""
        return np.array([self.centroids[i] for i in sorted(self.centroids)])

    @property
    def clusters_(self):
        """Return the training points grouped by cluster as ``{index: 2-D array}``.

        A fresh dict is built on every access, so ``self.clusters`` keeps its
        lists and reading this property has no side effect. An empty cluster
        is returned as an array with zero rows.
        """
        n_features = self.cluster_centers_.shape[1]
        return {
            i: np.asarray(members, dtype=float).reshape(-1, n_features)
            for i, members in self.clusters.items()
        }

    def fit(self, X, y=None):
        """Compute the centroids for ``X``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training samples; at least ``k`` of them are required.
        y : ignored
            Accepted only so the estimator can be called like other
            scikit-learn estimators.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer, ``X`` is not 2-D, or ``X``
            has fewer than ``k`` samples.
        """
        X = np.asarray(X, dtype=float)
        self._check_params(X, y)

        # The first k samples seed the centroids, so fitting is deterministic.
        self.centroids = {i: X[i].copy() for i in range(self.k)}
        self.clusters = {}
        labels = np.zeros(len(X), dtype=int)

        for _ in range(self.max_iter):
            # Assignment step: every sample joins its nearest centroid.
            self.clusters = {i: [] for i in range(self.k)}
            for row, point in enumerate(X):
                index = self._closest_centroid(point)
                labels[row] = index
                self.clusters[index].append(point)

            # Update step: every centroid moves to the mean of its members.
            max_shift = 0.0
            for i, members in self.clusters.items():
                if not members:
                    # The mean of an empty cluster is NaN, which would poison
                    # every later distance; keep the previous centroid instead.
                    continue
                centroid = np.mean(members, axis=0)
                shift = np.linalg.norm(centroid - self.centroids[i])
                max_shift = max(max_shift, shift)
                self.centroids[i] = centroid

            if max_shift <= self.tol:
                break

        self.labels_ = labels
        return self

    def predict(self, point):
        """Return the index of the centroid closest to ``point``.

        Parameters
        ----------
        point : array-like
            Either one sample (1-D) or several samples (2-D, one per row).

        Returns
        -------
        int or ndarray of int
            A single cluster index for a 1-D input, otherwise one index per
            row.
        """
        point = np.asarray(point, dtype=float)
        if point.ndim == 1:
            return self._closest_centroid(point)
        return np.array([self._closest_centroid(row) for row in point])

    def plot(self):
        """Scatter-plot the first two features of every cluster and centroid."""
        colors = ['g', 'r', 'b', 'y', 'm']

        # Plot the clusters; colors repeat when there are more clusters
        # than palette entries.
        for i, members in self.clusters_.items():
            plt.scatter(members[:, 0], members[:, 1], s=25,
                        color=colors[i % len(colors)])

        # Plot the centroids on top of the clusters.
        centers = self.cluster_centers_
        plt.scatter(centers[:, 0], centers[:, 1], marker='x', color='pink')
        plt.show()

    def _closest_centroid(self, point):
        """Return the index of the centroid nearest to one sample."""
        # Cluster indices are inserted as 0..k-1, so the position of the
        # smallest distance in this list equals the cluster index.
        distances = [np.linalg.norm(self.centroids[i] - point)
                     for i in self.centroids]
        return int(np.argmin(distances))

    def _check_params(self, X, y=None):
        """Validate ``k`` against the training data before fitting.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer, ``X`` is not 2-D, or ``X``
            holds fewer than ``k`` samples.
        """
        if not isinstance(self.k, (int, np.integer)) or self.k < 1:
            raise ValueError(
                "k must be a positive integer, got {!r}".format(self.k))

        X = np.asarray(X)
        if X.ndim != 2:
            raise ValueError(
                "X must be 2-D (n_samples, n_features), got {} "
                "dimension(s)".format(X.ndim))

        # The first k samples are used as initial centroids, so they must exist.
        if X.shape[0] < self.k:
            raise ValueError(
                "n_samples={} must be >= k={}".format(X.shape[0], self.k))

# CustomKMeans vs KMeans

In [53]:
# Fit scikit-learn's KMeans and the custom implementation on the same training
# split, each one with its own default settings.
skModel = KMeans()
skModel.fit(xTrain)
custModel = CustomKMeans()
custModel.fit(xTrain)

# Print both centroid matrices, separated by a single blank line.
print(skModel.cluster_centers_, custModel.cluster_centers_, sep="\n\n")

[[ 5.6         2.74285714  4.12857143  1.31428571]
 [ 5.22631579  3.66842105  1.50526316  0.27894737]
 [ 6.56470588  3.06470588  5.5         2.15882353]
 [ 7.46        3.21        6.26        2.04      ]
 [ 6.          2.70714286  4.92142857  1.8       ]
 [ 4.75555556  3.17777778  1.43333333  0.18333333]
 [ 5.          2.3         3.275       1.025     ]
 [ 6.475       3.025       4.625       1.45      ]]

[[ 5.72222222  2.75555556  4.          1.25555556]
 [ 5.          2.3         3.275       1.025     ]
 [ 6.72222222  3.          4.67777778  1.45555556]
 [ 6.11        3.06        4.64        1.55      ]
 [ 4.74117647  3.16470588  1.42941176  0.18235294]
 [ 7.46        3.21        6.26        2.04      ]
 [ 6.          2.60909091  4.94545455  1.8       ]
 [ 5.38        2.72        4.36        1.42      ]
 [ 6.56470588  3.06470588  5.5         2.15882353]
 [ 5.215       3.655       1.505       0.275     ]]
