# Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import matplotlib.patches as mpatches
from sklearn.preprocessing import OneHotEncoder

# Import Dataset

In [None]:
# Read the supermarket customer-membership CSV into a DataFrame and display it.
csv_path = '../input/marketing-data-for-a-supermarket-in-united-states/supermarket_marketing/Supermarket_CustomerMembers.csv'
dataset = pd.read_csv(csv_path)
dataset

In [None]:
# Print a summary of the columns (names, non-null counts, dtypes).
dataset.info()

CustomerID is irrelevant, so we'll exclude the column.

# Encoding Gender

In [None]:
# Build a dense one-hot encoder for the 'Genre' column.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')

# The encoder needs 2-D input, hence the reshape into a single column.
encoder.fit(dataset['Genre'].values.reshape((-1,1)))

In [None]:
# Names of the one-hot columns produced by the fitted encoder.
# `get_feature_names()` was removed in scikit-learn 1.2; prefer its
# replacement and only fall back on releases that predate it.
if hasattr(encoder, 'get_feature_names_out'):
    encoded_cols = list(encoder.get_feature_names_out())
else:
    encoded_cols = list(encoder.get_feature_names())
encoded_cols

In [None]:
# Columns fed to the clustering: three numeric features plus the
# categorical 'Genre' column (one-hot encoded separately).
numerical_cols = [
    'Age',
    'Annual Income (k$)',
    'Spending Score (1-100)',
]
categorical_col = ['Genre']

In [None]:
# Preview only the columns that will be used (numeric first, then 'Genre').
preview_cols = numerical_cols + categorical_col
dataset[preview_cols]

In [None]:
# Work on an independent copy so adding columns does not touch `dataset`.
selected_cols = categorical_col + numerical_cols
X = dataset[selected_cols].copy()

In [None]:
# One-hot encode 'Genre' and store the result under the encoder's column names.
genre_values = X['Genre'].values.reshape((-1, 1))
X[encoded_cols] = encoder.transform(genre_values)

In [None]:
# Keep the numeric and one-hot columns only; the raw 'Genre' column is dropped.
feature_cols = numerical_cols + encoded_cols
X = X[feature_cols]
X

# Find Optimal Number of Clusters (k) using Elbow Method

In [None]:
# Fit K-Means for k = 1..10 and record each fit's inertia (within-cluster
# sum of squares) for the elbow plot.
# n_init is set explicitly to 10 (the long-standing default): scikit-learn 1.4
# changed the default to 'auto', which would alter the recorded values and
# emits a FutureWarning on 1.2/1.3 when left unspecified.
wcss = []
for i in range(1, 11):
    kmeans = KMeans(n_clusters=i, init='k-means++', n_init=10, random_state=42)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

In [None]:
# Inertia values collected by the elbow loop, one per k.
wcss

In [None]:
# Plot WCSS against the number of clusters (k = 1..10) for the elbow method.
k_values = range(1, 11)
plt.figure(figsize=(20, 10))
plt.plot(k_values, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.show()

From the graph, it can be seen that k=5 is optimal.<br>
Therefore, there are 5 clusters, referred to below as Clusters 1, 2, 3, 4, 5 (they correspond to labels 0 to 4 in the code).<br>
That is, there are 5 groups of customers in the mall.<br>

# Training K-Means

In [None]:
# Fit the final model with k=5 and get the cluster index (0..4) of every row.
# n_init is pinned to 10 (the historic default) so the clustering does not
# change with scikit-learn >= 1.4, where the default became 'auto'.
kmeans = KMeans(n_clusters=5, random_state=42, init='k-means++', n_init=10, verbose=1)
y = kmeans.fit_predict(X)

In [None]:
# Cluster index returned by fit_predict for each row of X.
y

In [None]:
# Attach each customer's cluster index to the original dataset.
# Assign the label array directly: it is matched to rows by position, whereas
# wrapping it in a DataFrame would align on the index and could yield NaN
# if `dataset` did not have a default 0..n-1 index.
dataset['Categories'] = y

In [None]:
# Show the dataset, including the 'Categories' cluster column.
dataset

# Visualizing the Clusters (Using Only Annual Income & Spending Score Features)

In [None]:
# Spending scores of the customers assigned to cluster 0.
in_cluster_0 = dataset['Categories'] == 0
dataset.loc[in_cluster_0, 'Spending Score (1-100)']

In [None]:
# Feature matrix that was passed to K-Means.
X

In [None]:
# Cluster labels produced by kmeans.fit_predict(X).
y

In [None]:
# Same selection as above, taken from X with a boolean mask built from y.
cluster_0_mask = (y == 0)
X.loc[cluster_0_mask, 'Spending Score (1-100)']

In [None]:
# Coordinates of the fitted cluster centres (one row per cluster).
kmeans.cluster_centers_

In [None]:
# Scatter plot of the five clusters on annual income vs. spending score,
# with the fitted centroids overlaid.
income_col = 'Annual Income (k$)'
score_col = 'Spending Score (1-100)'
# Colour for cluster index 0..4; the legend shows them as "Cluster 1".."Cluster 5".
cluster_colors = ['red', 'blue', 'green', 'cyan', 'magenta']

plt.figure(figsize=(15, 10))
for cluster_id, color in enumerate(cluster_colors):
    members = X[y == cluster_id]
    plt.scatter(members[income_col], members[score_col], c=color,
                label='Cluster {}'.format(cluster_id + 1))

# Centroid coordinates follow X's column order, so look the two plotted
# features up by name instead of hard-coding their positions.
income_idx = X.columns.get_loc(income_col)
score_idx = X.columns.get_loc(score_col)
plt.scatter(kmeans.cluster_centers_[:, income_idx], kmeans.cluster_centers_[:, score_idx],
            c='black', marker='v', s=100, label='Centroids')

plt.title('Clusters')
plt.xlabel(income_col)
plt.ylabel(score_col)
# Use the labels set on each scatter call; an explicit five-name list would
# drop the 'Centroids' entry from the legend.
plt.legend()
plt.show()

# Analysis
- Cluster 3 (Green) : People with low annual income who also spend little in the mall. These customers are a lower priority for targeting.
<br><br>
- Cluster 2 (Blue) : People with high annual income who still don't spend much money in the mall. These customers can be targeted so that they are tempted to spend more in the mall. These are the customers with 3rd priority to be targeted (1 - Cluster 5, 2 - Cluster 2).
<br><br>
- Cluster 4 (Cyan) : People with low annual income who spend a lot in the mall. These customers may be targeted less by the mall (if the mall wishes, for ethical reasons, to avoid encouraging people with low income to spend too much).
<br><br>
- Cluster 5 (Magenta) : People with higher annual income who spend more in the mall. These customers are likely to buy more in the mall, so they are the priority targets for the mall.
<br><br>
- Cluster 1 (Red) : People with average annual income who spend an average amount in the mall.