In [None]:
import numpy as np
import pandas as pd
from pandas import plotting
import matplotlib.pyplot as plt
import seaborn as sns

First, we will load our dataset.

In [None]:
import os

# Walk the Kaggle input directory tree and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Read the mall customers CSV into a DataFrame and preview its first rows.
csv_path = '/kaggle/input/mall-customers/Mall_Customers.csv'
data = pd.read_csv(csv_path)
data.head()

Now we will look for a good pair of parameters for clustering

In [None]:
# sns.pairplot draws a grid of subplots, so plt.title would only label the
# currently active subplot. suptitle puts the heading on the whole figure.
sns.pairplot(data)
# y slightly above 1 keeps the heading clear of the top row of subplots.
plt.suptitle('Pairplot for the Data', fontsize=20, y=1.02)
plt.show()

Annual Income versus Spending Score looks like a good pair, but the column names are awkward to type repeatedly, so I will rename them.

In [None]:
# Give the two clustering features short names. Reassigning (instead of
# inplace=True) keeps the cell idempotent and avoids mutating a frame that an
# earlier cell has already displayed.
data = data.rename(columns={'Annual Income (k$)': 'Income', 'Spending Score (1-100)': 'Spending'})

I will look for the optimal number of clusters using the Elbow Method.

In [None]:
import sklearn.cluster as cluster

# Fit K-Means for every candidate cluster count and record the inertia
# (sum of squared distances of samples to their closest cluster centre).
cluster_range = range(1, 11)
features = data[['Spending', 'Income']]

sse = []  # Sum of Squared Errors, one entry per candidate cluster count
for n_clusters in cluster_range:
    km = cluster.KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        max_iter=500,
        n_init=10,
        random_state=0,
    ).fit(features)
    sse.append(km.inertia_)

# Elbow curve: SSE against the number of clusters.
plt.plot(cluster_range, sse)
plt.xlabel('Number of Clusters')
plt.ylabel('Sum of Squared Errors')
plt.show()

It looks like the "elbow" candidates are 3 clusters and 5 clusters; 5 has the lower SSE, so we will use 5.

In [None]:
# Fit the final 5-cluster model on the two selected features and store each
# customer's cluster label next to the original data.
km = cluster.KMeans(
    n_clusters=5,
    init='k-means++',
    max_iter=500,
    n_init=10,
    random_state=0,
).fit(data[['Spending', 'Income']])
data['Clusters'] = km.labels_
data.head()

Now I will plot Spending Score against Annual Income, with the points colored by cluster and the centroids marked.

In [None]:
# Customers coloured by their K-Means cluster label, with the fitted cluster
# centres drawn on top in magenta (column 0 = Spending, column 1 = Income).
sns.scatterplot(data=data, x="Spending", y="Income", hue='Clusters')
centers = km.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], s=30, c='magenta')

Now I will try to do the same with my own implementation of the algorithm.

In [None]:
# Separate working frame holding only the two clustering features, so the
# columns added later do not end up on `data`.
dt = data[['Spending', 'Income']].copy()

We know that the best number of clusters for this dataset is 5, so k = 5, and I will randomly select 5 initial centroids.

In [None]:
k = 5
np.random.seed(300)  # fixed seed so the starting centroids are reproducible

# Draw one random [spending, income] starting position per cluster, keyed 1..k.
# The spending coordinate is drawn before the income coordinate for each key.
centroids = {}
for label in range(1, k + 1):
    start_spending = np.random.randint(0, 100)
    start_income = np.random.randint(0, 140)
    centroids[label] = [start_spending, start_income]
centroids

Visualizing our centroids and coloring them in different colors

In [None]:
# One fixed colour per centroid key.
colors = {1: 'green', 2: 'blue', 3: 'red', 4: 'magenta', 5: 'yellow'}

# Raw data points first, then each starting centroid in its own colour.
sns.scatterplot(data=dt, x="Spending", y="Income")
for label, position in centroids.items():
    plt.scatter(*position, color=colors[label])

A function that assigns each point to its closest centroid, using the Euclidean distance $\sqrt{(x_1 - x_2)^2 + (y_1 - y_2)^2}$, and records the color of that centroid.

In [None]:
def assign(dt, centroids, palette=None):
    """Assign every point in ``dt`` to its nearest centroid.

    The following columns are added to (or overwritten on) ``dt`` in place:

    * ``distance_from_<key>`` - Euclidean distance to each centroid,
    * ``closest`` - key of the nearest centroid,
    * ``color`` - colour associated with that nearest centroid.

    Parameters
    ----------
    dt : pandas.DataFrame
        Frame with numeric ``'Spending'`` and ``'Income'`` columns.
    centroids : dict
        Maps a centroid key to its ``[spending, income]`` position.
    palette : dict, optional
        Maps a centroid key to a colour. When omitted, the notebook-level
        ``colors`` dict is used.

    Returns
    -------
    pandas.DataFrame
        The same ``dt`` object, with the columns described above filled in.

    Raises
    ------
    KeyError
        If the palette has no colour for a centroid key that was selected.
    """
    if palette is None:
        palette = colors

    # Remember which distance column belongs to which centroid key so the key
    # can be recovered by lookup instead of by parsing the column name.
    # (str.lstrip removes a *set of characters*, not a prefix, so it is not a
    # reliable way to cut 'distance_from_' off the front of a string.)
    column_to_key = {}
    for key, position in centroids.items():
        column = 'distance_from_{}'.format(key)
        dt[column] = np.sqrt(
            (dt['Spending'] - position[0]) ** 2
            + (dt['Income'] - position[1]) ** 2
        )
        column_to_key[column] = key

    nearest_column = dt.loc[:, list(column_to_key)].idxmin(axis=1)
    dt['closest'] = nearest_column.map(column_to_key)
    dt['color'] = dt['closest'].map(lambda key: palette[key])
    return dt

Applying the function and drawing the points in a lighter (semi-transparent) shade of their centroid's color, so the centroids themselves remain visible.

In [None]:
# Label every point with its nearest starting centroid.
dt = assign(dt, centroids)

# Semi-transparent points in their centroid's colour, opaque centroids on top.
plt.scatter(dt['Spending'], dt['Income'], color=dt['color'], alpha=0.3)
for label, position in centroids.items():
    plt.scatter(*position, color=colors[label])
plt.show()

A function for updating the centroid positions.

In [None]:
def update(k, points=None):
    """Move each centroid to the mean of the points currently assigned to it.

    Parameters
    ----------
    k : dict
        Maps a centroid key to its mutable ``[spending, income]`` position.
        The positions are updated in place.
    points : pandas.DataFrame, optional
        Frame with ``'Spending'``, ``'Income'`` and ``'closest'`` columns.
        When omitted, the notebook-level ``dt`` frame is used.

    Returns
    -------
    dict
        The same ``k`` object, holding the updated positions.
    """
    if points is None:
        points = dt

    # Iterate over the dict that was passed in, not over a global.
    for key in k:
        members = points.loc[points['closest'] == key, ['Spending', 'Income']]
        if members.empty:
            # A centroid with no assigned points would get a NaN mean and could
            # never attract points again, so leave it where it is.
            continue
        k[key][0] = float(members['Spending'].mean())
        k[key][1] = float(members['Income'].mean())
    return k

Updating and assigning again

In [None]:
# One K-Means step: move the centroids, then re-label every point.
centroids = update(centroids)
dt = assign(dt, centroids)

# Semi-transparent points in their centroid's colour, opaque centroids on top.
plt.scatter(dt['Spending'], dt['Income'], color=dt['color'], alpha=0.3)
for label, position in centroids.items():
    plt.scatter(*position, color=colors[label])
plt.show()

I will repeat this 50 more times.

In [None]:
# Run 50 more update/assign rounds.
for _ in range(50):
    centroids = update(centroids)
    dt = assign(dt, centroids)

# Final state: semi-transparent points in their centroid's colour, opaque centroids on top.
plt.scatter(dt['Spending'], dt['Income'], color=dt['color'], alpha=0.3)
for label, position in centroids.items():
    plt.scatter(*position, color=colors[label])
plt.show()

The final result is pretty similar to the one from sklearn, so I consider it a success.