# <Center>*Clustering Project*</Center>

In [None]:
# Importing data handle libraries
import numpy as np
import pandas as pd

# Importing visualisation libraries
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from yellowbrick.cluster import silhouette_visualizer

# Importing modeling libraries
from sklearn.cluster import KMeans

# Evaluation
from sklearn.metrics import silhouette_score

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the Mall Customers CSV from the Kaggle input directory into a DataFrame
df = pd.read_csv('/kaggle/input/customer-segmentation-tutorial-in-python/Mall_Customers.csv')

# Show the first five rows (the .head() default) as the cell output
df.head()

In [None]:
# Checking the dataset structure with .info(): column names, dtypes and non-null counts
df.info()

The output of `.info()` shows that there are no missing values, so no further treatment is needed.

In [None]:
# Checking summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

In [None]:
# Checking skewness of the numeric columns only.
# numeric_only=True skips the text column 'Gender'; since pandas 2.0 the default
# is numeric_only=False, which raises a TypeError on non-numeric columns instead
# of silently dropping them.
df.skew(numeric_only=True)

The skewness of the columns lies between -0.5 and 0.5, a range usually regarded as approximately symmetric, so skewness is not a relevant concern for this analysis.

## Visualisation

Before modeling, I plotted the data to understand it better.

In [None]:
# Reusable helper: one call draws one fully formatted histogram,
# so any number of columns can be plotted without repeating the styling
def hist(j, col, ax):
    """Draw a histogram with a KDE curve for one column of the global ``df``.

    Parameters
    ----------
    j : int
        Position of the target axes inside ``ax``.
    col : str
        Name of the ``df`` column to plot; also used as the subplot title.
    ax : indexable collection of matplotlib Axes
        Axes as returned by ``plt.subplots``; ``ax[j]`` receives the plot.
    """
    target = ax[j]
    sns.histplot(data=df, x=col, bins=20, kde=True, ax=target)
    # The column name is shown as the title, so both axis labels are blanked
    target.set_title(col, fontweight='bold')
    target.set_xlabel('')
    target.set_ylabel('')

In [None]:
# Create a single row of three axes and give the whole figure a title
fig, ax = plt.subplots(1, 3, figsize=(16, 12))
plt.suptitle('Histograms \n', fontsize=28, fontweight='bold')

# Call hist once per numeric column; the list position is the index of the axes to draw on
for position, column in enumerate(['Age', 'Annual Income (k$)', 'Spending Score (1-100)']):
    hist(position, column, ax)

# Tidy up the spacing between the subplots
plt.tight_layout();

Looking at the histograms, the numerical columns are slightly skewed, which agrees with the skewness calculation above.

In [None]:
# Same idea as the histogram helper: one call draws one fully formatted box plot
def box_plot(j, col, ax):
    """Draw a vertical box plot for one column of the global ``df``.

    Parameters
    ----------
    j : int
        Position of the target axes inside ``ax``.
    col : str
        Name of the ``df`` column to plot; also used as the subplot title.
    ax : indexable collection of matplotlib Axes
        Axes as returned by ``plt.subplots``; ``ax[j]`` receives the plot.
    """
    target = ax[j]
    sns.boxplot(data=df, y=col, ax=target)
    # The column name is shown as the title, so both axis labels are blanked
    target.set_title(col, fontweight='bold')
    target.set_xlabel('')
    target.set_ylabel('')

In [None]:
# Create a single row of three axes and give the whole figure a title
fig, ax = plt.subplots(1, 3, figsize=(16, 12))
plt.suptitle('Box Plots \n', fontsize=28, fontweight='bold')

# Call box_plot once per numeric column; the list position is the index of the axes to draw on
for position, column in enumerate(['Age', 'Annual Income (k$)', 'Spending Score (1-100)']):
    box_plot(position, column, ax)

# Tidy up the spacing between the subplots
plt.tight_layout();

I identified the presence of outliers in the column Annual Income. Those will be treated shortly.

In [None]:
# I used a pairplot to check the scatter distribution of every pair of numeric columns at once
# (the trailing ';' hides the returned object's repr in the cell output)
sns.pairplot(df);

Based on the pairplot, the Annual Income and Spending Score columns form clusters.

In [None]:
# Scatter plot of Spending Score (x) against Annual Income (y) to look for visible clusters
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=df, x='Spending Score (1-100)', y='Annual Income (k$)', ax=ax)

# Title and axis labels
ax.set_title('Spending Score X Annual Income \n', fontsize=22, fontweight='bold')
ax.set_xlabel('Spending Score', fontsize=14, fontweight='bold')
ax.set_ylabel('Annual Income', fontsize=14, fontweight='bold');

## Data cleaning

As detected before, there are outliers in the Annual Income column. As the dataset is quite small, I replaced the outliers with the mean of the column.

In [None]:
# First step of the outlier treatment: quartiles of Annual Income and the
# Interquartile Range (IQR) between them
quartiles = df['Annual Income (k$)'].quantile([0.25, 0.75])
Q1 = quartiles.loc[0.25]
Q3 = quartiles.loc[0.75]
IQR = Q3 - Q1

In [None]:
# Lower and upper limits: values further than 1.5 * IQR from the quartiles count as outliers
lower, upper = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
for label, limit in (('Lower limit is', lower), ('Upper limit is', upper)):
    print(label, limit)

In [None]:
# Checking outliers and replacing them with the column mean.
# Values strictly outside [lower, upper] are outliers; NaN compares False and is left alone.
income = df['Annual Income (k$)']
is_outlier = (income > upper) | (income < lower)
outlier = income[is_outlier].tolist()

# The mean is computed once, on the column as it is before any replacement, so
# every outlier receives the same value (the previous loop recomputed the mean
# after each replacement).
# Assigning the new Series back to df replaces the chained
# `df[col].replace(..., inplace=True)` call, which acts on a temporary object and
# leaves df unchanged when pandas copy-on-write is active.
# The column is cast to float first because the mean is generally fractional.
df['Annual Income (k$)'] = income.astype(float).mask(is_outlier, income.mean())
print('Outliers on the dataset are: ', outlier)

In [None]:
# Plotting the Annual Income box plot again to check that the outliers were treated
# (the trailing ';' hides the returned Axes repr in the cell output)
sns.boxplot(y='Annual Income (k$)', data=df);

In [None]:
# Encoding categorical variable Gender as a dummy column
# (drop_first=True keeps a single indicator column for the two-level case).
# The guard keeps the cell re-runnable: once encoded, the 'Gender' column no
# longer exists and a second get_dummies call would raise a KeyError.
if 'Gender' in df.columns:
    df = pd.get_dummies(df, columns=['Gender'], drop_first=True)

In [None]:
# Checking the first rows again to confirm the Gender column was replaced by its dummy column
df.head()

## Modeling using K-Means

In [None]:
# Defining X as the two columns to be clustered.
# Column order matters: column 0 is Spending Score and column 1 is Annual Income,
# which is how the centroid coordinates are read when the clusters are plotted.
X = df[['Spending Score (1-100)', 'Annual Income (k$)']]

In [None]:
# I used the elbow method and the silhouette score to find the optimal number of clusters:
# K-Means is fitted once per candidate k and both metrics are recorded.

WCSS = []                               # Within Cluster Sum of Squares (KMeans.inertia_), one value per k
Silhouette_scores = []                  # Mean silhouette coefficient, one value per k
cluster = range(2,11)                   # Candidate k values 2..10 (silhouette needs at least 2 clusters)
for k in cluster:
    # n_init is pinned explicitly: scikit-learn changed its default from 10 to
    # 'auto' in version 1.4, so leaving it implicit makes both curves depend on
    # the installed version even with a fixed random_state.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=123)
    # fit() returns the estimator itself, so the fitted attributes are read from
    # `kmeans` directly rather than through a second, DataFrame-like name.
    kmeans.fit(X)
    score = silhouette_score(X, kmeans.labels_)
    WCSS.append(kmeans.inertia_)
    Silhouette_scores.append(score)

In [None]:
# Elbow curve: within-cluster sum of squares for each candidate k
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(cluster, WCSS)
ax.set_title('Optimal Number k of clusters \n', fontsize=22, fontweight='bold')
ax.set_xlabel('K', fontsize=14, fontweight='bold')
ax.set_ylabel('Sum of Squared Distance', fontsize=14, fontweight='bold');

In [None]:
# Silhouette curve: silhouette score for each candidate k
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(cluster, Silhouette_scores)
ax.set_title('Optimal Number k of clusters \n', fontsize=22, fontweight='bold')
ax.set_xlabel('K', fontsize=14, fontweight='bold')
ax.set_ylabel('Silhouette', fontsize=14, fontweight='bold');

According to both graphs, the optimal number of clusters is k=5.

In [None]:
# Applying KMeans with n_clusters=5.
# n_init is pinned explicitly: scikit-learn changed its default from 10 to 'auto'
# in version 1.4, so leaving it implicit makes the resulting clusters depend on
# the installed version even with a fixed random_state.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=123)
# fit_predict fits the model on X and returns the cluster label of every row
pred = kmeans.fit_predict(X)

In [None]:
# Creating a 'Cluster' column with the predicted cluster label of each row
df['Cluster'] = pred
# Showing the first rows to confirm the new column is present
df.head()

In [None]:
# Position of the centroids: one row per cluster, columns in the same order as X
# (Spending Score first, Annual Income second)
kmeans.cluster_centers_

In [None]:
# Plotting the clusters with their respective centroids
plt.figure(figsize=(12,6))

# One colour per cluster label (0-4). A single loop replaces five copy-pasted
# scatter calls, so the styling lives in one place.
cluster_colors = ['orange', 'blue', 'green', 'magenta', 'red']
for label, color in enumerate(cluster_colors):
    members = df[df['Cluster'] == label]
    plt.scatter(members['Spending Score (1-100)'], members['Annual Income (k$)'],
                s=60, c=color, label='Cluster %d' % label)

# Plotting centroids and enumerating them: a white disc first, then the cluster
# number drawn on top of it. Column 0 of cluster_centers_ is Spending Score (x)
# and column 1 is Annual Income (y), matching the column order of X.
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], s=250, marker='o',
            c="white", alpha=1, edgecolors='k')

for i, c in enumerate(kmeans.cluster_centers_):
    plt.scatter(c[0], c[1], marker='$%d$' % i, alpha=1, s=100, edgecolor='k')

plt.title('Clusters \n', fontsize=22, fontweight='bold')
plt.xlabel('Spending Score', fontsize=14, fontweight='bold')
plt.ylabel('Annual Income (k$)', fontsize=14, fontweight='bold')
# The scatter calls set label=..., but the labels only become visible once the legend is drawn
plt.legend();

In [None]:
# Evaluating the fitted model using the Silhouette Coefficient,
# computed from X and the cluster labels stored on the estimator
score = silhouette_score(X, kmeans.labels_)
# Printed with two decimal places
print('Silhouette Score: %.2f' % score)

In [None]:
# Showing the silhouette plot for the fitted model using yellowbrick's silhouette_visualizer
# (the trailing ';' hides the returned object's repr in the cell output)
silhouette_visualizer(kmeans, X, colors='yellowbrick');

All the clusters have approximately the same size, and their thickness follows a consistent pattern, which indicates that 5 clusters was the best option to sort the data.
Based on the results of the model, the clients can be grouped as:
- **First Group:** Low Annual Income and Low Spending Score
- **Second Group:** High Annual Income and High Spending Score
- **Third Group:** Average Annual Income and Average Spending Score (the largest group according to the graph above)
- **Fourth Group:** High Annual Income and Low Spending Score
- **Fifth Group:** Low Annual Income and High Spending Score

Now it will be possible to create more effective marketing strategies for each group.