# Introduction

# Setup

## Imports

In [6]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.base import BaseEstimator, ClusterMixin
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_iris

## Dataset

In [3]:
# Load the iris data set and pull out the feature matrix and the class labels.
dataset = load_iris()
X, y = dataset.data, dataset.target

# Split into train and test parts. No random_state is passed, so the split is
# presumably different on every run -- TODO confirm if reproducibility matters.
xTrain, xTest, yTrain, yTest = train_test_split(X, y)

# KMeans

In [4]:
class CustomKMeans(BaseEstimator, ClusterMixin):
    """Minimal K-Means clusterer with a scikit-learn style interface.

    The first ``k`` samples seed the centroids; the model then alternates
    between assigning every sample to its nearest centroid (Euclidean
    distance) and moving each centroid to the mean of its assigned samples.

    Parameters
    ----------
    k : int, default=10
        Number of clusters. Must be between 1 and the number of samples.
    max_iter : int, default=300
        Maximum number of assign/update iterations.
    tol : float, default=0.0001
        Convergence threshold. Iteration stops as soon as no centroid moved
        by more than ``tol`` (Euclidean distance) during an update. Use ``0``
        to require exactly stationary centroids.

    Attributes
    ----------
    centroids : dict
        Maps cluster index to its centroid vector.
    clusters : dict
        Maps cluster index to the list of samples assigned to it during the
        last assignment step.
    labels_ : ndarray
        Cluster index of every training sample from the last assignment step.
    fitted_ : bool
        Set to ``True`` once ``fit`` has completed.
    """

    def __init__(self, k=10, max_iter=300, tol=0.0001):
        self.k = k
        self.max_iter = max_iter
        self.tol = tol

    @property
    def cluster_centers_(self):
        """Centroids stacked into one array, ordered by cluster index."""
        return np.array(list(self.centroids.values()))

    @property
    def clusters_(self):
        """Cluster members as a dict of arrays, keyed by cluster index.

        A new dict is returned on each access; ``self.clusters`` itself is
        left untouched.
        """
        return {i: np.array(members) for i, members in self.clusters.items()}

    def fit(self, X, y=None):
        """Cluster ``X`` and return ``self``.

        Parameters
        ----------
        X : array-like
            Training samples, one sample per row.
        y : ignored
            Present only for compatibility with the scikit-learn ``fit``
            signature.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer or exceeds the sample count.
        """
        # np.asarray keeps ndarray input as-is and makes list / DataFrame
        # input row-indexable, which the seeding step relies on.
        self.X = np.asarray(X)
        self._check_params(self.X)
        self._kmeans(self.X)
        self.fitted_ = True

        return self

    def _kmeans(self, X):
        """Run the assign/update loop until convergence or ``max_iter``."""
        # Seed the centroids with the first k samples.
        self.centroids = {i: X[i] for i in range(self.k)}

        self.clusters = {}
        labels = np.zeros(len(X), dtype=int)
        for _ in range(self.max_iter):
            self.clusters = {i: [] for i in range(self.k)}

            # Assignment step: attach every sample to its closest centroid.
            for row, point in enumerate(X):
                distances = [
                    np.linalg.norm(self.centroids[i] - point)
                    for i in range(self.k)
                ]
                index = int(np.argmin(distances))
                self.clusters[index].append(point)
                labels[row] = index

            # Update step: move each centroid to the mean of its members and
            # remember the largest movement for the convergence test.
            max_shift = 0.0
            for i, members in self.clusters.items():
                if not members:
                    # An empty cluster has no mean; keep the previous centroid
                    # instead of replacing it with NaN.
                    continue
                new_centroid = np.average(members, axis=0)
                shift = float(np.linalg.norm(new_centroid - self.centroids[i]))
                max_shift = max(max_shift, shift)
                self.centroids[i] = new_centroid

            if max_shift <= self.tol:
                break

        self.labels_ = labels

    def _find_optimum_k(self):
        """Placeholder for automatic selection of ``k``; not implemented."""

    def predict(self, point):
        """Return the index of the centroid closest to ``point``.

        Parameters
        ----------
        point : array-like
            A single sample (scalar or 1-D) or a 2-D array with one sample
            per row.

        Returns
        -------
        int or ndarray
            One cluster index for a single sample, or an array with one
            index per row for 2-D input.
        """
        centers = self.cluster_centers_
        # View the centroids as (k, n_features), also for scalar samples.
        centers = centers.reshape(len(centers), -1)
        points = np.atleast_1d(np.asarray(point))

        if points.ndim == 1:
            # Euclidean distance from the single sample to every centroid.
            return int(np.argmin(np.linalg.norm(centers - points, axis=1)))

        # Pairwise distances, shape (n_points, k).
        deltas = points[:, np.newaxis, :] - centers[np.newaxis, :, :]
        return np.argmin(np.linalg.norm(deltas, axis=2), axis=1)

    def plot(self):
        """Scatter-plot the clusters and centroids using the first two features."""
        colors = ['g', 'r', 'b', 'y', 'm']

        # Plot the members of each cluster, cycling through the colour list.
        for i, members in self.clusters_.items():
            if len(members) == 0:
                continue  # nothing to draw for an empty cluster
            plt.scatter(members[:, 0], members[:, 1], s=25,
                        color=colors[i % len(colors)])

        # Plot the centroids.
        centers = self.cluster_centers_
        plt.scatter(centers[:, 0], centers[:, 1], marker='x', color='pink')
        plt.show()

    def _check_params(self, X):
        """Validate ``k`` against the data before clustering starts.

        Raises
        ------
        ValueError
            If ``k`` is not a positive integer or is larger than the number
            of samples in ``X``.
        """
        is_integer = (isinstance(self.k, (int, np.integer))
                      and not isinstance(self.k, bool))
        if not is_integer or self.k < 1:
            raise ValueError(f"k must be a positive integer, got {self.k!r}")

        n_samples = np.shape(X)[0] if np.ndim(X) else 0
        if self.k > n_samples:
            raise ValueError(
                f"k={self.k} exceeds the number of samples ({n_samples})"
            )

# CustomKMeans vs KMeans

In [5]:
# Fit scikit-learn's KMeans and the custom implementation on the same training
# split, each with its default settings.
# NOTE(review): the defaults differ -- CustomKMeans uses k=10 while the recorded
# KMeans output below has 8 rows -- so the two sets of centers are not directly
# comparable.
skModel = KMeans().fit(xTrain)
custModel = CustomKMeans().fit(xTrain)

# Print the scikit-learn centers, a blank line, then the custom centers.
print(skModel.cluster_centers_, custModel.cluster_centers_, sep="\n\n")

[[ 5.          2.3         3.275       1.025     ]
 [ 6.58823529  3.11176471  5.48823529  2.15294118]
 [ 5.26363636  3.65909091  1.50454545  0.29545455]
 [ 5.66315789  2.71578947  4.07368421  1.25263158]
 [ 6.4         2.98125     4.5875      1.425     ]
 [ 5.95555556  2.72222222  4.98888889  1.77777778]
 [ 7.58333333  3.2         6.45        2.05      ]
 [ 4.74736842  3.12105263  1.43157895  0.19473684]]

[[ 5.95555556  2.72222222  4.98888889  1.77777778]
 [ 6.48        3.07        5.35        2.07      ]
 [ 6.74285714  3.17142857  5.68571429  2.27142857]
 [ 7.58333333  3.2         6.45        2.05      ]
 [ 5.08        3.48        1.54666667  0.3       ]
 [ 5.48888889  3.88888889  1.46666667  0.26666667]
 [ 5.37272727  2.44545455  3.63636364  1.13636364]
 [ 5.85789474  2.84736842  4.32631579  1.32105263]
 [ 4.72941176  3.09411765  1.40588235  0.19411765]
 [ 6.62222222  3.05555556  4.63333333  1.45555556]]
