In [None]:
import numpy as np 
import pandas as pd 
from matplotlib import pyplot as plt
import seaborn as sns

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)
from sklearn.cluster import KMeans, MeanShift,estimate_bandwidth, AgglomerativeClustering, AffinityPropagation
from sklearn.preprocessing import LabelEncoder
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score
import scipy.cluster.hierarchy as sch

**Read the data**

In [None]:
# Load the mall-customers CSV from the Kaggle input directory.
file_path = '/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Frame summary first, then the (rows, columns) tuple.
data.info()
print('\nData shape: ', data.shape)

In [None]:
# Summary statistics (pandas `describe`) for `data`.
data.describe()

In [None]:
# Per-column count of missing values.
data.isna().sum()

There are no missing values, and all columns except 'Gender' are numerical. I'll use LabelEncoder to convert the categorical 'Gender' values into numeric codes.

In [None]:
# Preview the first rows of `data`.
data.head()

In [None]:
# Encode the categorical 'Gender' column as integer codes in a new column.
label_encoder = LabelEncoder()
label_encoder.fit(data['Gender'])
data['GenderColumn'] = label_encoder.transform(data['Gender'])

# Distinct values on both sides of the encoding.
genders_before_encoding = data['Gender'].unique()
genders_after_encoding = data['GenderColumn'].unique()

print('\n--- Genders before label encoding: \n', genders_before_encoding)
print('\n--- Genders after label encoding: \n', genders_after_encoding)



In [None]:
# Re-inspect the frame's columns and dtypes.
data.info()

Delete the Gender column

In [None]:
# Drop the original string-valued 'Gender' column.
# errors='ignore' makes this cell safe to re-run: without it, a second
# execution raises KeyError because 'Gender' has already been removed.
data = data.drop(columns='Gender', errors='ignore')
data.head()

Plot the data

In [None]:
# Age against the encoded gender value, drawn as unconnected markers.
fig, ax = plt.subplots()
ax.plot(data['Age'], data['GenderColumn'], 'o')
ax.set_title('Age vs. Gender')
ax.set_xlabel('Age')
ax.set_ylabel('Gender')
plt.show()

In [None]:
# Annual income against the encoded gender value, drawn as unconnected markers.
fig, ax = plt.subplots()
ax.plot(data['Annual Income (k$)'], data['GenderColumn'], 'o')
ax.set(title='Annual Income vs. Gender', xlabel='Annual Income', ylabel='Gender')
plt.show()

In [None]:
# Annual income against spending score: the two features clustered below.
fig, ax = plt.subplots()
ax.plot(data['Annual Income (k$)'], data['Spending Score (1-100)'], 'o')
ax.set(title='Annual Income vs. Spending Score', xlabel='Annual Income', ylabel='Spending Score')
plt.show()

# Income and spending score

In [None]:
# Select the two clustering features by column name instead of by position.
# Positional selection (iloc[:, [2, 3]]) only hits these columns when 'Gender'
# has already been dropped; by name the result is the same regardless of
# which earlier cells have run.
income_and_spending_data = data[['Annual Income (k$)', 'Spending Score (1-100)']]
income_and_spending_data.head()

In [None]:
# Overlay annual income and spending score against age on a single axes.
fig, ax = plt.subplots()
for column in ('Annual Income (k$)', 'Spending Score (1-100)'):
    ax.plot(data['Age'], data[column], 'o')
ax.legend(['Age vs. Annual Income', 'Age vs. Spending Score'])
ax.set_title('Age vs. Annual Income\n Age vs. Spending Score')
ax.set_xlabel('Age')
plt.show()


# 1. K-Means algorithm

I'll determine the optimal number of clusters. To do this I'll use the Elbow method, testing numbers of clusters between 1 and 10, and choose the number of clusters at the elbow of the resulting curve. The curve plots the inertia, which is the sum of squared distances of all the points within a cluster from the centroid of that cluster, added up over all clusters.
The lower the inertia, the more compact the clusters are. Inertia always decreases as the number of clusters grows, which is why we look for the elbow rather than the minimum.

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record the inertia of each fit.
inertia_values = []
# range() excludes its end point, so 11 is required to include k = 10.
k_values = list(range(1, 11))

for value in k_values:
    # Fixed seed so the curve is identical on every run.
    kmeans = KMeans(n_clusters=value, random_state=42)
    kmeans.fit(income_and_spending_data)
    inertia_values.append(kmeans.inertia_)

plt.figure(figsize=(6, 6))
plt.plot(k_values, inertia_values, '-o')
plt.title('Elbow method')
plt.xlabel(r'Number of clusters')
plt.ylabel('Sum of squared distance')
plt.show()

In [None]:
number_of_clusters = 5

# Fit the 5-cluster model once and take the labels from that same model.
# Previously the labels came from `kmeans`, the last model left over from the
# elbow loop (fitted with the largest k tried), so the plot and any score
# computed from `y_k_means` did not describe the 5-cluster solution.
# A fixed seed keeps the label numbering stable between runs.
kmeans_model = KMeans(n_clusters=number_of_clusters, random_state=42)
y_k_means = kmeans_model.fit_predict(income_and_spending_data)

# The boolean-mask / column indexing below needs a plain ndarray.
income_and_spending_data = np.array(income_and_spending_data)

plt.title("Clusters", fontsize=20)
plt.xlabel("Annual Income")
plt.ylabel("Spending Score")

# One colour per cluster label; the same marker size is used for every cluster.
# NOTE: which label (and so which colour) a segment receives is arbitrary.
cluster_colors = ['blue', 'black', 'green', 'gray', 'red']
for cluster_index, cluster_color in enumerate(cluster_colors):
    in_cluster = y_k_means == cluster_index
    plt.scatter(
        income_and_spending_data[in_cluster, 0],
        income_and_spending_data[in_cluster, 1],
        s=100,
        c=cluster_color,
    )
plt.show()

Based on the previous plot, the clients can be classified as follows:

* clients that don't earn too much:
      they don't spend too much(the black cluster)
      they spend a lot(the red cluster)
      they have medium spending levels(the green cluster)
* clients with high incomes and low spending levels(the gray cluster)
* clients with medium incomes but high spendings(the blue cluster)

In [None]:
# Attach the labels of the fitted `kmeans_model` to the frame as 'cluster_id'.
data = data.assign(cluster_id=kmeans_model.labels_)
data.head()

# Clusters analysis

In [None]:
# Age of each customer, grouped by K-Means cluster id.
ax = sns.stripplot(data=data, x='cluster_id', y='Age')
ax.set_title('Age variation across clusters')
plt.show()

In [None]:
# Annual income of each customer, grouped by K-Means cluster id.
ax = sns.stripplot(data=data, x='cluster_id', y='Annual Income (k$)')
ax.set_title('Annual Income across clusters')
plt.show()

In [None]:
# Spending score of each customer, grouped by K-Means cluster id.
ax = sns.stripplot(data=data, x='cluster_id', y='Spending Score (1-100)')
ax.set_title('Spending Score across clusters')
plt.show()

# 2. Mean Shift Algorithm

As opposed to K-Means, when using Mean Shift, I don’t need to know the number of clusters beforehand. 
Although the Mean Shift is computationally expensive(O(n²)), I'll use it because the dataset is not that big.


Steps:
1. Define a window (bandwidth of the kernel) and place the window on a data point.
2. Calculate the mean for all the points in the window.
3. Move the center of the window to the location of the mean.
4. Repeat steps 2 and 3 until there is convergence.

In [None]:
# Estimate the kernel bandwidth from the data itself (quantile=0.2).
bandwidth = estimate_bandwidth(income_and_spending_data, quantile=0.2)

# Fit Mean Shift, then assign every sample to a cluster.
mean_shift = MeanShift(bandwidth=bandwidth, bin_seeding=True).fit(income_and_spending_data)
y_mean_shift = mean_shift.predict(income_and_spending_data)
mean_shift_labels = mean_shift.labels_

# The number of clusters is the number of distinct labels.
labels_unique = np.unique(mean_shift_labels)
mean_shift_estimated_clusters = labels_unique.size

print("number of estimated clusters : %d" % mean_shift_estimated_clusters)

# Colour every customer by its Mean Shift label.
fig, ax = plt.subplots()
ax.scatter(data['Annual Income (k$)'], data['Spending Score (1-100)'], c=mean_shift_labels)
ax.set_xlabel('Annual income')
ax.set_ylabel('Spending score')
plt.show()

# 3. Agglomerative Clustering

The dendrogram will help me figure out the optimal number of clusters. 

In [None]:
# Build the Ward-linkage hierarchy over the income/spending features, then draw it.
ward_linkage = sch.linkage(income_and_spending_data, method='ward')
dendrogram = sch.dendrogram(ward_linkage)

From the dendrogram we can see that the optimal number of clusters is also 5.

* The linkage criterion determines how the distance between clusters is calculated. I used Ward linkage, which at each step merges the two clusters whose union gives the smallest increase in the total within-cluster sum of squared differences.
* The affinity (named `metric` in recent scikit-learn releases) is the method used to calculate the distance between data points. I used the Euclidean distance, which is the only one Ward linkage supports.

In [None]:
# Agglomerative clustering with 5 clusters and Ward linkage.
# Ward linkage only supports Euclidean distances, so the distance argument is
# left at its default. The `affinity=` keyword was deprecated in
# scikit-learn 1.2 and removed in 1.4 (renamed `metric`); passing it there
# raises TypeError, whereas omitting it works on old and new versions alike.
agglomerative_clustering_model = AgglomerativeClustering(n_clusters=5, linkage='ward')
y_agglomerative_clustering = agglomerative_clustering_model.fit_predict(income_and_spending_data)
agglomerative_labels = agglomerative_clustering_model.labels_
agglomerative_labels

In [None]:
# Draw each agglomerative cluster (labels 0..4) in its own colour.
agglomerative_colors = ('blue', 'red', 'green', 'purple', 'yellow')
for label_value, label_color in enumerate(agglomerative_colors):
    members = agglomerative_labels == label_value
    plt.scatter(
        income_and_spending_data[members, 0],
        income_and_spending_data[members, 1],
        s=50,
        marker='o',
        color=label_color,
    )
plt.show()

In [None]:
# Attach the agglomerative model's labels to the frame as 'aggl_labels'.
data = data.assign(aggl_labels=agglomerative_clustering_model.labels_)
data.head()

# Clusters analysis

In [None]:
# Age of each customer, grouped by agglomerative cluster label.
# A title is set so the figure can be read on its own.
sns.stripplot(x='aggl_labels', y='Age', data=data)
plt.title('Age variation across agglomerative clusters')
plt.show()

In [None]:
# Annual income of each customer, grouped by agglomerative cluster label.
# A title is set so the figure can be read on its own.
sns.stripplot(x='aggl_labels', y='Annual Income (k$)', data=data)
plt.title('Annual Income across agglomerative clusters')
plt.show()

In [None]:
# Spending score of each customer, grouped by agglomerative cluster label.
# A title is set so the figure can be read on its own.
sns.stripplot(x='aggl_labels', y='Spending Score (1-100)', data=data)
plt.title('Spending Score across agglomerative clusters')
plt.show()

# 4. Affinity Propagation

This algorithm doesn't require a preset number of clusters. It takes as input measures of similarity between pairs of data points; points that are similar to one another end up in the same cluster.

In [None]:
# Fit Affinity Propagation on the income/spending features.
# random_state is fixed so the result is reproducible between runs.
affinity_propagation = AffinityPropagation(max_iter=150, random_state=5)
affinity_propagation.fit(income_and_spending_data)

# Each index points at the sample chosen as the centre of one cluster.
cluster_centers_indices = affinity_propagation.cluster_centers_indices_
affinity_estimated_clusters = len(cluster_centers_indices)

# Predict the cluster for all the samples
y_affinity_propagation = affinity_propagation.predict(income_and_spending_data)

plt.scatter(data['Annual Income (k$)'], data['Spending Score (1-100)'], c=affinity_propagation.labels_.astype(float), marker="o", picker=True)
plt.title(f'Estimated number of clusters = {affinity_estimated_clusters}')
plt.xlabel('Annual income')
plt.ylabel('Spending score')
plt.show()

In [None]:
# Second Affinity Propagation fit, seeded; the cell displays its labels.
affinity_prop = AffinityPropagation(random_state=5)
affinity_prop.fit(np.array(income_and_spending_data))
affinity_prop.labels_

# Comparing the results
#  Silhouette Score    

The Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and the mean nearest-cluster distance (b) for each sample. 
The Silhouette Coefficient for a sample is (b - a) / max(a, b), where b is the distance between a sample and the nearest cluster that 
the sample is not a part of. The Silhouette Coefficient is only defined if the number of labels is 2 <= n_labels <= n_samples - 1.
The best value is 1 and the worst value is -1. Values near 0 indicate overlapping clusters. Negative values generally indicate that a sample has been 
assigned to the wrong cluster, as a different cluster is more similar.


The silhouette_score gives the average value for all the samples.
This gives a perspective into the density and separation of the formed clusters

**For K-Means**

In [None]:
# Mean silhouette coefficient of the K-Means labelling.
silhouette_avg_k_means = silhouette_score(income_and_spending_data, y_k_means, metric='euclidean')
# Report the cluster count found in the labels being scored rather than a
# hard-coded 5, so the message cannot disagree with the score.
k_means_cluster_count = len(np.unique(y_k_means))
print(f'For {k_means_cluster_count} clusters, the average silhouette_score is: {silhouette_avg_k_means}')

**For Mean Shift**

In [None]:
# Mean silhouette coefficient of the Mean Shift labelling.
silhouette_avg_mean_shift = silhouette_score(
    X=income_and_spending_data,
    labels=y_mean_shift,
    metric='euclidean',
)
print(f'For {mean_shift_estimated_clusters} clusters, the average silhouette_score is: {silhouette_avg_mean_shift}')

**For Agglomerative Clustering**

In [None]:
# Mean silhouette coefficient of the agglomerative labelling.
silhouette_avg_agglomerative = silhouette_score(income_and_spending_data, y_agglomerative_clustering, metric='euclidean')
# Take the cluster count from the labels being scored instead of hard-coding
# it, as the Mean Shift and Affinity Propagation cells do.
agglomerative_cluster_count = len(np.unique(y_agglomerative_clustering))
print(f'For {agglomerative_cluster_count} clusters, the average silhouette_score is: {silhouette_avg_agglomerative}')

**For Affinity Propagation**

In [None]:
# Mean silhouette coefficient of the Affinity Propagation labelling.
silhouette_avg_affinity = silhouette_score(
    X=income_and_spending_data,
    labels=y_affinity_propagation,
    metric='euclidean',
)
print(f'For {affinity_estimated_clusters} clusters, the average silhouette_score is: {silhouette_avg_affinity}')