In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import missingno as msno
sns.set()

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

In [None]:
# Load the mall-customers CSV into a DataFrame.
data = pd.read_csv('../input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')
# Preview the first rows.
data.head()
# Spending score -> score the company assigns to each customer, based on money spent and behavior

# EDA

In [None]:
# Column dtypes and non-null counts: a quick check for missing values.
data.info()

In [None]:
# Summary statistics restricted to the int64 and object columns.
data.describe(include=['int64','object'])

In [None]:
# Number of customers per gender.
sns.countplot(data=data, x='Gender');

The dataset contains more females than males.

In [None]:
# Numeric encoding of gender: 1 for 'Female', 0 otherwise.
data['Gender_int'] = data['Gender'].eq('Female').astype('int')

In [None]:
# Print the mean annual income, then show the income histogram.
mean_income = data['Annual Income (k$)'].mean()
print(mean_income)
sns.displot(data=data, x="Annual Income (k$)")

Most customers fall in the lower half of the income range.

In [None]:
# Distribution of customer ages.
sns.displot(data=data, x="Age")

The dataset has more customers aged 40 or younger than customers older than 40.

In [None]:
# Distribution of the spending score, with a kernel density estimate overlaid.
sns.displot(data=data, x="Spending Score (1-100)", kde=True)

The spending-score distribution is close to a normal distribution.

In [None]:
# Correlate only the numeric features. CustomerID is just an identifier, and
# Gender is a string column (its 0/1 encoding, Gender_int, is kept): recent
# pandas versions raise in DataFrame.corr() when a non-numeric column is present
# instead of silently skipping it.
numeric_features = data.drop(columns=['CustomerID', 'Gender'])
sns.heatmap(numeric_features.corr(), annot=True, cmap='coolwarm')

Moderate negative correlation between age and spending score  
=> Lower age, higher spending score

In [None]:
# Flag customers whose age is at or below the mean age (1 = young, 0 = not).
data['Young'] = data['Age'].le(data['Age'].mean()).astype('int')

In [None]:
# Spending-score summary statistics for the two age groups.
by_age_group = data.groupby(['Young'])
by_age_group[['Spending Score (1-100)']].describe()

In [None]:
# Binary flags: 1 when the value is at or above the column mean, 0 otherwise.
spending_score = data['Spending Score (1-100)']
annual_income = data['Annual Income (k$)']
data['High Spending Score'] = spending_score.ge(spending_score.mean()).astype('int')
data['Rich'] = annual_income.ge(annual_income.mean()).astype('int')

In [None]:
# Low/high spending-score counts, split by age group.
sns.countplot(data=data, x='High Spending Score', hue='Young');

The plot further supports the observation that younger customers tend to have higher spending scores.

In [None]:
# Low/high spending-score counts, split by income group.
sns.countplot(data=data, x='High Spending Score', hue='Rich');

No significant correlation between income and spending score

In [None]:
# Low/high spending-score counts, split by gender (Gender_int: 1 = Female, 0 = otherwise).
sns.countplot(data=data, x='High Spending Score', hue='Gender_int');

No correlation between gender and spending score

In [None]:
# Joint distribution of spending score (x) and age (y).
sns.displot(data=data, x='Spending Score (1-100)', y='Age')

In [None]:
import matplotlib.pyplot as plt

# Age against spending score; the axes are labelled so the figure can be read
# on its own, like the other scatter plots in this notebook.
plt.scatter(data['Age'], data['Spending Score (1-100)'])
plt.xlabel('Age')
plt.ylabel('Spending Score')
plt.show()

# Clustering

In [None]:
# Build the clustering input from three features, then standardize each column
# (subtract the column mean, divide by the column standard deviation).
feature_columns = ['Annual Income (k$)', 'Spending Score (1-100)', 'Age']
xdata = data[feature_columns].to_numpy()
column_means = xdata.mean(axis=0)
column_stds = xdata.std(axis=0)
xdata = (xdata - column_means) / column_stds

In [None]:
def distance(x, cluster):
    """Euclidean distance from a single point to every row of ``cluster``.

    ``x`` is subtracted from ``cluster`` (broadcasting over rows), the
    differences are squared and summed along axis 1, and the square root of
    each row sum is returned, giving one distance per row of ``cluster``.
    """
    offsets = cluster - x
    squared_norms = np.sum(offsets ** 2, axis=1)
    return np.sqrt(squared_norms)

In [None]:
def find_cluster(x, cluster):
    """Assign every row of ``x`` to its nearest centroid.

    For each row, the distances to all centroids are computed with
    ``distance`` and the index of the smallest one is kept (``np.argmin``
    picks the first index on ties).

    Returns a float array with one centroid index per row of ``x``.
    """
    nearest = [np.argmin(distance(x[row], cluster)) for row in range(x.shape[0])]
    # Labels are stored as floats, exactly as the callers in this notebook expect.
    return np.array(nearest, dtype=float)

In [None]:
def error(x, c, cluster):
    """Total clustering error for a given assignment.

    The error is the sum, over all rows of ``x``, of the Euclidean distance
    between the row and the centroid it is assigned to.

    Parameters
    ----------
    x : array of samples, one per row.
    c : array with one centroid index per row of ``x`` (float values are
        truncated to int, as ``int(c[i])`` did before).
    cluster : array of centroids, one per row.

    Returns
    -------
    float
        The summed distance. Always a plain scalar (0.0 for empty input);
        the previous loop accumulated a 1-element array because each
        centroid was selected with a slice.

    Raises
    ------
    IndexError
        If ``c`` has fewer entries than ``x`` has rows, or a label is out of
        range for ``cluster``.
    """
    samples = np.asarray(x, dtype=float)
    n_samples = samples.shape[0]
    labels = np.asarray(c).astype(int)
    if labels.shape[0] < n_samples:
        raise IndexError("c must provide one cluster index per row of x")
    # Pick each sample's centroid in one fancy-indexing step instead of a Python loop.
    assigned = np.asarray(cluster, dtype=float)[labels[:n_samples]]
    per_sample = np.sqrt(np.sum((assigned - samples) ** 2, axis=1))
    return float(per_sample.sum())

In [None]:
def redefine_clusters(x, c, k):
    """Recompute the ``k`` centroids as the mean of their assigned samples.

    Parameters
    ----------
    x : 2-D array of samples, one per row.
    c : sequence with one cluster index per row of ``x``.
    k : number of clusters.

    Returns
    -------
    Array of shape ``(k, x.shape[1])``. A cluster with no assigned samples
    keeps an all-zero centroid.
    """
    n_samples, n_features = x.shape
    centroids = np.zeros(shape=(k, n_features))

    for label in range(k):
        members = [x[row] for row in range(n_samples) if c[row] == label]
        if not members:
            # Empty cluster: leave the zero centroid in place.
            continue
        total = np.zeros(n_features)
        for sample in members:
            total = total + sample
        centroids[label] = total / len(members)
    return centroids

In [None]:
def clustering(xdata, k=2, epochs=50, det=False, seed=None):
    """Cluster ``xdata`` into ``k`` groups with a basic k-means loop.

    Centroids start as uniform random points in [-1, 1) per coordinate. Each
    epoch assigns every sample to its nearest centroid (``find_cluster``),
    measures the total error (``error``) and moves each centroid to the mean
    of its samples (``redefine_clusters``).

    Parameters
    ----------
    xdata : 2-D array of samples, one per row. The fixed initialization range
        suits centred/scaled data (the notebook standardizes its features
        before calling this).
    k : number of clusters.
    epochs : maximum number of assignment/update rounds.
    det : when True, print the initial centroids, the error and assignments
        every 10th epoch, and the final centroids.
    seed : optional seed for the centroid initialization. ``None`` (default)
        draws from NumPy's global random state, as before; an integer makes
        the run reproducible.

    Returns
    -------
    (c, e) : the final assignment (one cluster index per sample) and the
        error of that assignment.
    """
    n, m = xdata.shape
    # Use the global stream unless a seed is requested, so existing callers are unaffected.
    rng = np.random if seed is None else np.random.RandomState(seed)
    clusters = rng.rand(k, m) * 2 - 1  # k centroids with m coordinates
    e = 0
    c = np.zeros(n)
    previous = None
    if det:
        print(clusters)
    for epoch in range(epochs):
        c = find_cluster(xdata, clusters)
        e = error(xdata, c, clusters)
        if det and epoch % 10 == 0:
            print("Epoch {}/{} ====> Error: {}".format(epoch, epochs, e))
            print(c)
        # Unchanged assignments reproduce the same centroids, so every later
        # epoch would return the same (c, e): stop early.
        if previous is not None and np.array_equal(c, previous):
            break
        previous = c
        clusters = redefine_clusters(xdata, c, k)
    if det:
        print(clusters)
    return c, e

In [None]:
# Elbow analysis: for k = 1..19, average the final error of 5 independent
# 15-epoch clustering runs.
kval = list(range(1, 20))
errors = []
for k in kval:
    run_errors = [clustering(xdata, k, 15)[1] for _ in range(5)]
    errors.append(sum(run_errors) / len(run_errors))

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Elbow curve: averaged clustering error for each tested number of clusters.
plt.plot(kval, errors)
plt.xlabel('Number of clusters (k)')
plt.ylabel('Average clustering error')
plt.show()

In [None]:
# Final model: 5 clusters, 15 epochs. `cl` holds one cluster index per customer.
cl, e = clustering(xdata, k=5, epochs=15)
len(cl)

In [None]:
# Error of the 5-cluster run, as returned by clustering().
print(e)

In [None]:
# Spending score against annual income, coloured by cluster assignment.
fig, ax = plt.subplots()
ax.scatter(data['Spending Score (1-100)'], data['Annual Income (k$)'], c=cl)
ax.set_xlabel('Spending Score')
ax.set_ylabel('Annual Income')
plt.show()

In [None]:
# Spending score against age, coloured by cluster assignment.
fig, ax = plt.subplots()
ax.scatter(data['Spending Score (1-100)'], data['Age'], c=cl)
ax.set_xlabel('Spending Score')
ax.set_ylabel('Age')
plt.show()

In [None]:
# New frame with the integer cluster label appended; `data` itself is left untouched.
clustered = data.assign(Cluster=cl.astype('int'))

In [None]:
# Per-cluster summary statistics. Gender is summarized through its 0/1 encoding
# (Gender_int, whose mean is the share of female customers): the raw string
# column cannot be averaged and makes recent pandas versions raise. The
# aggregations are named by string alias because passing NumPy callables to
# agg() is deprecated.
summary_columns = ['Spending Score (1-100)', 'Gender_int', 'Annual Income (k$)', 'Age']
clustered.groupby('Cluster')[summary_columns].agg(['mean', 'std', 'min', 'max'])

In [None]:
from mpl_toolkits import mplot3d
import matplotlib.pyplot as plt
plt.ion()
%matplotlib widget

fig = plt.figure()
ax = plt.axes(projection='3d')

ax = plt.axes(projection='3d')

zdata = data['Spending Score (1-100)']
xdata = data['Age']
ydata = data['Annual Income (k$)']
ax.scatter3D(xdata, ydata, zdata, c=clustered['Cluster'],cmap='Set2')
plt.show()

## Result/Observations
* The customers can be divided into 5 clusters
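* In the run analysed here, customers from clusters 0 and 4 have a high average spending score (cluster numbers depend on the random initialization, so they can differ between runs)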
    * Better targets for conversations
    * These are high-income customers of average age and young customers in the low-income group