## K Means algorithm implementation from scratch using numpy

### Imports

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

### Reading Dataset

In [None]:
# Load the mall-customer CSV and use the CustomerID column as the row index.
df = pd.read_csv(
    "/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv"
).set_index("CustomerID")

display(df)

### Data Cleaning / Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the Gender column with the integer codes produced by LabelEncoder
# so that every column of df is numeric.
gender_encoder = LabelEncoder()
df["Gender"] = gender_encoder.fit_transform(df["Gender"])
display(df)

## K Means Implementation

#### Cluster Assignment

In [None]:
def cluster_assignment(X, mu):
    """Assign every point in ``X`` to its nearest centroid.

    Parameters
    ----------
    X : sequence of points (for example the rows of a 2-D array).
    mu : sequence of centroids, each broadcastable against a point.

    Returns
    -------
    list
        One entry per point, in order: the index into ``mu`` of the centroid
        with the smallest Euclidean distance. ``np.argmin`` picks the first
        minimum, so ties go to the lowest centroid index.
    """
    C = []
    for point in X:
        # Euclidean distance from this point to each centroid, in centroid order.
        distances = [np.sqrt(np.sum((point - center) ** 2)) for center in mu]
        C.append(np.argmin(distances))
    return C
    

#### Move Centroids

In [None]:
def centroid_recalculation(X,C,NUMBER_OF_CENTROID):
    """Recompute each centroid as the mean of the points assigned to it.

    Parameters
    ----------
    X : 2-D array-like of points, one row per point.
    C : sequence of cluster indices, one per row of ``X``.
    NUMBER_OF_CENTROID : int, number of clusters to produce centroids for.

    Returns
    -------
    list of numpy.ndarray
        ``NUMBER_OF_CENTROID`` float arrays, each of shape
        ``(1, n_features)``.

    Notes
    -----
    A cluster with no assigned points has no mean: dividing its zero sum by
    a zero count would yield NaN coordinates. Such a cluster is instead
    placed at the mean of all points in ``X`` so every returned centroid is
    finite.
    """
    points = np.asarray(X)
    labels = np.asarray(C)
    n_features = len(X[0])

    mu_new = []
    for k in range(NUMBER_OF_CENTROID):
        members = points[labels == k]
        if len(members) == 0:
            # Empty cluster: fall back to the mean of the whole data set
            # rather than dividing by zero.
            members = points
        centroid = members.sum(axis=0) / len(members)
        # Keep the (1, n_features) row shape for every centroid.
        mu_new.append(centroid.reshape(1, n_features))
    return mu_new


#### Cost Calculation

In [None]:
def calculate_cost(X, C, mu):
    """Return the clustering cost for the assignment ``C``.

    The squared differences between every point and its assigned centroid
    are accumulated per feature, summed, square-rooted, and finally scaled
    by ``1 / len(X)``.

    Parameters
    ----------
    X : sequence of points.
    C : sequence of cluster indices, one per point.
    mu : sequence of centroids, indexable by the values in ``C``.
    """
    squared_error = 0
    for idx, point in enumerate(X):
        residual = point - mu[C[idx]]
        squared_error += residual ** 2
    scale = 1 / len(X)
    return scale * np.sqrt(np.sum(squared_error))

#### Driver Function

In [None]:
def main(NUMBER_OF_CENTROID):
    """Run k-means on the global ``df`` and return the last recorded cost.

    The initial centroids are ``NUMBER_OF_CENTROID`` rows sampled from
    ``df``. Each iteration assigns points to centroids, recomputes the
    centroids, and records the cost of the new centroids. The loop ends
    after ``MAX_NUMBER_OF_ITERATIONS`` passes, or earlier as soon as the
    cost is exactly equal to the cost of the previous iteration.
    """
    X = df.values
    centroids = df.sample(n=NUMBER_OF_CENTROID).values  # initial centroids
    cost_history = []
    for iteration in range(MAX_NUMBER_OF_ITERATIONS):
        labels = cluster_assignment(X, centroids)
        updated_centroids = centroid_recalculation(X, labels, NUMBER_OF_CENTROID)
        cost_history.append(calculate_cost(X, labels, updated_centroids))

        # An unchanged cost is the stopping signal; the centroids are not
        # advanced on that final pass.
        if iteration > 0 and cost_history[-1] == cost_history[-2]:
            break
        centroids = updated_centroids
    return cost_history[-1]

### Define global parameters

In [None]:
# Dataset dimensions (rows, columns) and the iteration cap used by the
# k-means loops.
NUMBER_OF_EXAMPLES, NUMBER_OF_ATTRIBUTES = df.shape
MAX_NUMBER_OF_ITERATIONS = 20

# Run k-means once for each K from 1 to 9 and record (K, final cost) pairs.
final_cost = []
for NUMBER_OF_CENTROID in range(1, 10):
    k = NUMBER_OF_CENTROID
    cost = main(k)
    final_cost.append((k, cost))

## Elbow Method

In [None]:
# Final cost (y axis) against the number of clusters K (x axis).
plt.plot([k for k, _ in final_cost], [cost for _, cost in final_cost])

The optimum value of K is 7.

### Comparing Results with Sklearn

In [None]:
from sklearn.cluster import KMeans

# Seed scikit-learn's KMeans with 7 rows sampled from the data, the same way
# the from-scratch implementation picks its starting centroids.
# n_init=1 is stated explicitly: with a fixed array of initial centres only a
# single initialisation is meaningful, and leaving n_init at its default can
# trigger a warning in scikit-learn releases whose default is 10.
clf = KMeans(n_clusters=7, init=df.sample(n=7).values, n_init=1)
clf.fit(df)
clf.cluster_centers_

### Data Viz
#### Looking at the effect of every attribute 

In [None]:
 X = df.values
NUMBER_OF_CENTROID = 7
INITIAL_CENTEROIDS = df.sample(n=NUMBER_OF_CENTROID).values # initialize centroids
mu = INITIAL_CENTEROIDS
J = []
for i in range(MAX_NUMBER_OF_ITERATIONS):
    C = cluster_assignment(X,mu)
    mu_new = centroid_recalculation(X,C,NUMBER_OF_CENTROID)
    J.append(calculate_cost(X,C,mu_new))
    
    if not i == 0 :  
        if J[i] == J[i-1] :
            break
    mu = mu_new

In [None]:
# Draw one scatter plot for every unordered pair of attributes: the data
# points coloured by their cluster index, with the centroids overlaid in red.
for x in range(NUMBER_OF_ATTRIBUTES - 1):
    PARAM_X = x
    for y in range(x + 1, NUMBER_OF_ATTRIBUTES):
        PARAM_Y = y

        point_xs = [row[PARAM_X] for row in X]
        point_ys = [row[PARAM_Y] for row in X]
        plt.scatter(point_xs, point_ys, c=C)

        # Each centroid is indexed as centroid[0][column], i.e. it is
        # expected to be a single-row 2-D array.
        centroid_xs = [centroid[0][PARAM_X] for centroid in mu]
        centroid_ys = [centroid[0][PARAM_Y] for centroid in mu]
        plt.scatter(centroid_xs, centroid_ys, linewidth=3, color='red', marker='o')

        plt.xlabel(df.columns[x])
        plt.ylabel(df.columns[y])

        plt.show()