# Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from scipy.cluster import hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

# Import Dataset

In [None]:
# Read the supermarket customer-members CSV into a DataFrame.
csv_path = '../input/marketing-data-for-a-supermarket-in-united-states/supermarket_marketing/Supermarket_CustomerMembers.csv'
dataset = pd.read_csv(csv_path)
# Bare expression: the notebook renders the loaded frame.
dataset

In [None]:
# Print a summary of the frame: column names, non-null counts and dtypes.
dataset.info()

CustomerID is irrelevant, so we'll exclude the column.

# Encoding Gender

In [None]:
# One-hot encoder for the 'Genre' column. A dense (non-sparse) result is
# requested so the encoded values can be assigned directly to DataFrame columns.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:  # scikit-learn < 1.2 does not accept `sparse_output`
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
# The encoder expects 2-D input, hence the reshape into a single column.
encoder.fit(dataset['Genre'].values.reshape((-1, 1)))

In [None]:
# Names of the generated one-hot columns, one per category seen during fit.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement (requires scikit-learn >= 1.0)
# and produces the same `x0_<category>` names for an encoder fitted on a
# plain array. `.tolist()` turns the returned NumPy array into Python strings.
encoded_cols = encoder.get_feature_names_out().tolist()
encoded_cols

In [None]:
# Column groups used to assemble the feature matrix.
# Numeric columns are used as-is.
numerical_cols = [
    'Age',
    'Annual Income (k$)',
    'Spending Score (1-100)',
]
# The single categorical column, which is one-hot encoded.
categorical_col = ['Genre']

In [None]:
# Preview the selected columns: numeric features followed by 'Genre'.
dataset[numerical_cols + categorical_col]

In [None]:
# Work on a copy so the original dataset is not modified.
X = dataset[categorical_col + numerical_cols].copy()
# Encode 'Genre' (reshaped into one 2-D column) and attach the one-hot columns.
genre_values = X['Genre'].values.reshape((-1, 1))
X[encoded_cols] = encoder.transform(genre_values)
# Keep the numeric features plus the encoded columns; the raw 'Genre' text
# column is dropped by this selection.
X = X[numerical_cols + encoded_cols]
X

# Find Optimal K(Number of Clusters) Using Dendrogram

In [None]:
# Plot the dendrogram of a Ward-linkage hierarchical clustering of X.
plt.figure(figsize=(25, 15))
# Compute the linkage matrix first, then hand it to the plotting routine.
ward_linkage = sch.linkage(X, method='ward')
dendrogram = sch.dendrogram(ward_linkage)
plt.xlabel('Customers')
plt.ylabel('Euclidean Distances')
plt.title('Dendrogram')
plt.show()

# Training Hierarchical Clustering

In [None]:
# Fit agglomerative (bottom-up hierarchical) clustering with 5 clusters and
# Ward linkage; `fit_predict` returns one cluster label per row of X.
# The `affinity` argument was deprecated in scikit-learn 1.2 and removed in
# 1.4. Euclidean distance is the default (and the only metric Ward linkage
# accepts), so omitting the argument gives the same result on every version.
hc = AgglomerativeClustering(n_clusters=5, linkage='ward')
y = hc.fit_predict(X)
y

In [None]:
# Store each row's cluster label in a new 'Clusters' column of the dataset.
dataset['Clusters'] =y
dataset

# Visualize Clusters

In [None]:
# Display the feature matrix that was passed to the clustering step.
X

In [None]:
# Scatter plot of annual income vs. spending score, one colour per cluster.
# The position in this list is the cluster label produced by the clustering
# step (0-4); each entry is the (colour, legend label) used for that cluster.
cluster_styles = [
    ('red', 'Cluster 1'),
    ('blue', 'Cluster 2'),
    ('green', 'Cluster 3'),
    ('cyan', 'Cluster 4'),
    ('magenta', 'Cluster 5'),
]

plt.figure(figsize=(15, 10))
for cluster_id, (colour, legend_label) in enumerate(cluster_styles):
    # Rows of X whose predicted label equals this cluster id.
    members = X[y == cluster_id]
    plt.scatter(
        members['Annual Income (k$)'],
        members['Spending Score (1-100)'],
        c=colour,
        label=legend_label,
    )

plt.title('Clusters')
plt.xlabel('Annual Income (k$)')
plt.ylabel('Spending Score (1-100)')
plt.legend([legend_label for _, legend_label in cluster_styles])
plt.show()

# Analysis
- Cluster 5 (Magenta) : People with low annual income who also spend little in the mall. These customers are still targeted less.
<br><br>
- Cluster 2 (Blue) : People with high annual income who still don't spend much money in the mall. These customers can be targeted so that they are tempted to spend more in the mall. They are the customers with the 3rd priority to be targeted (1 - Cluster 5, 2 - Cluster 2).
<br><br>
- Cluster 4 (Cyan) : People with low annual income who spend a lot in the mall. These customers are targeted less by the mall (if the mall so wishes, so that people with low income don't need to spend too much in the mall - ethics).
<br><br>
- Cluster 3 (Green) : People with higher annual income who spend more in the mall. These customers are likely to buy more in the mall, so they are the priority targets for the mall.
<br><br>
- Cluster 1 (Red) : People with average annual income who spend an average amount in the mall.