In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Load the Mall Customers data from the CSV file next to the notebook and preview the first ten rows.
dataset = pd.read_csv(filepath_or_buffer='Mall_Customers.csv')
dataset.head(n=10)

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100)
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40
5,6,Female,22,17,76
6,7,Female,35,18,6
7,8,Female,23,18,94
8,9,Male,64,19,3
9,10,Female,30,19,72


In [3]:
# Clustering features: annual income and spending score.
# The columns are selected by name (rather than by position) so the feature matrix stays correct
# even if the column order of the CSV changes.
X = dataset[['Annual Income (k$)', 'Spending Score (1-100)']].to_numpy()

In [4]:
from sklearn.cluster import KMeans
import warnings

# NOTE(review): this installs a global "ignore" filter for every warning category, not only scikit-learn's.
warnings.simplefilter("ignore")

#---------------------------------------------------------------------------------------------------------------------------------------------#
# Elbow method, step 1: fit one K-Means model per candidate number of clusters (1 to 14) and record its
# Within-Cluster-Sum-of-Squares (WCSS), which is read from the fitted model's `inertia_` attribute.
#---------------------------------------------------------------------------------------------------------------------------------------------#
wcss = []
for i in range(1, 15):
    # k-means++ initialization with a fixed random_state; fit() returns the fitted model itself
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=0).fit(X)
    wcss.append(kmeans.inertia_)
#---------------------------------------------------------------------------------------------------------------------------------------------#
# The WCSS values are typically plotted against the number of clusters and inspected visually. The optimal number of clusters is often
# chosen at the "elbow" point, where the reduction in WCSS starts to diminish.
#---------------------------------------------------------------------------------------------------------------------------------------------#

In [5]:
#---------------------------------------------------------------------------------------------------------------------------------------------#
# Estimating the elbow by eye is generally difficult. The kneed library determines the knee/elbow point of the WCSS curve automatically,
# which gives a more objective and repeatable way of selecting the number of clusters for the dataset.
#---------------------------------------------------------------------------------------------------------------------------------------------#
from kneed import KneeLocator

# Derive the x values from wcss itself so they always match the number of models that were fitted.
cluster_counts = range(1, len(wcss) + 1)
kneedle = KneeLocator(cluster_counts, wcss, curve='convex', direction='decreasing')

# KneeLocator reports None when it cannot find an elbow. Fail here with a clear message, because the
# following cells use elbow_point both as a list index and as n_clusters.
if kneedle.elbow is None:
    raise ValueError("KneeLocator found no elbow in the WCSS curve; choose the number of clusters manually.")

elbow_point = int(kneedle.elbow)
elbow_point

5

In [6]:
#---------------------------------------------------------------------------------------------------------------------------------------------#
# Elbow Method plot: WCSS against the number of clusters, with dashed guide lines marking the elbow point selected above.
#---------------------------------------------------------------------------------------------------------------------------------------------#
# x values are derived from wcss so the plot always matches the number of models that were fitted
cluster_counts = list(range(1, len(wcss) + 1))
# wcss[0] belongs to 1 cluster, hence the -1 offset
elbow_wcss = wcss[elbow_point - 1]
# One shared style for both guide lines and their legend entry
elbow_line = dict(color="#12BF80", width=2, dash='dash')

fig1 = px.line(x=cluster_counts, y=wcss, markers=True, line_shape='linear')

# Style the WCSS curve. This runs before the legend-only trace is added, so it only affects the curve.
fig1.update_traces(
    line=dict(color='#6331C5', width=2),
    marker=dict(size=12, line=dict(color='#F2F2F2', width=0.8), color='#6331C5'),
    hovertemplate="<b>Nº Clusters:</b> %{x}<br><b>WCSS:</b> %{y}",
)

# Vertical guide: from the x-axis up to the elbow
fig1.add_shape(
    dict(
        type="line",
        x0=elbow_point, x1=elbow_point,
        y0=0, y1=elbow_wcss,
        line=elbow_line,
        layer='below',
        name='Best Elbow',
        legendgroup='elbow'
    )
)

# Horizontal guide: from the elbow back to x = 0
fig1.add_shape(
    dict(
        type="line",
        x0=elbow_point, x1=0,
        y0=elbow_wcss, y1=elbow_wcss,
        line=elbow_line,
        layer='below'
    )
)

# Empty trace styled like the guide lines, used to show a "Best Elbow" entry in the legend
fig1.add_trace(
    go.Scatter(x=[None], y=[None], mode='lines', line=elbow_line,
               name='<b>Best Elbow</b>', legendgroup='elbow')
)

fig1.update_layout(
    title='<b style="font-size:24px">The Elbow Method</b>',
    xaxis=dict(title='Number of Clusters'),
    yaxis=dict(title='WCSS'),
    font=dict(family='Montserrat', color='#F2F2F2'),
    template='plotly_dark',
    legend=dict(x=0.005, y=0.31, bgcolor="rgba(255, 255, 255, 0.5)", orientation="h", font=dict(color="#262626"))
)

fig1.show()

#---------------------------------------------------------------------------------------------------------------------------------------------#
# This plot visually represents the Elbow Method, making it easier to see where the knee/elbow of the WCSS curve lies.
# The dashed "Best Elbow" lines highlight the number of clusters that was identified as optimal.
#---------------------------------------------------------------------------------------------------------------------------------------------#

In [7]:
#---------------------------------------------------------------------------------------------------------------------------------------------#
# Final model: K-Means with the number of clusters given by the elbow point.
# fit_predict fits the model and returns one cluster index per row of X, so each element of y_kmeans is the cluster to which the
# corresponding data point belongs. y_kmeans is used below to visualize and analyze the clustering results.
#---------------------------------------------------------------------------------------------------------------------------------------------#
kmeansmodel = KMeans(
    n_clusters=elbow_point,
    init='k-means++',
    random_state=0,
)
y_kmeans = kmeansmodel.fit_predict(X)



In [8]:
#---------------------------------------------------------------------------------------------------------------------------------------------#
# Scatter plot of the customers (annual income vs. spending score), one trace and one color per cluster.
#---------------------------------------------------------------------------------------------------------------------------------------------#
cluster_colors = ['#6331C5', '#3F7AD8', '#12BF80', '#B715B7', '#F6CB53']

# Plot-ready frame: the two clustering features plus the assigned cluster, ordered by cluster
df = pd.DataFrame({'x': X[:, 0], 'y': X[:, 1], 'cluster': y_kmeans})
df_clustersort = df.sort_values(by='cluster').reset_index(drop=True)

fig2 = go.Figure()

# Add one scatter trace per cluster
for position, cluster in enumerate(df_clustersort['cluster'].unique()):
    # The number of clusters comes from the elbow point and may exceed the palette size.
    # Wrap around the palette so no cluster is silently left out of the figure.
    color = cluster_colors[position % len(cluster_colors)]
    cluster_data = df_clustersort[df_clustersort['cluster'] == cluster]
    fig2.add_trace(go.Scatter(
        x=cluster_data['x'],
        y=cluster_data['y'],
        mode='markers',
        marker=dict(color=color, size=10, line=dict(color='#F2F2F2', width=0.5)),
        name=f'Cluster {cluster}',
        hovertemplate=f'<extra></extra><b>Cluster {cluster}</b><br>' +
                      '<b>Annual Income (k$):</b> %{x}<br>' +
                      '<b>Spending Score (1-100)</b>: %{y}'
    ))

fig2.update_layout(
    title='<b style="font-size:20px;">Clusters of customers</b><br><span style="font-size:12px;">from Mall Customers Dataset</span>',
    xaxis=dict(title='Annual Income (k$)'),
    yaxis=dict(title='Spending Score (1-100)'),
    font=dict(family='Montserrat', color='#F2F2F2'),
    legend=dict(orientation="h", x=0.63, y=1.13, bgcolor="rgba(255, 255, 255, 0.5)", font=dict(color="#262626")),
    template='plotly_dark'
)

fig2.show()

#---------------------------------------------------------------------------------------------------------------------------------------------#
# The final figure shows the clusters of customers based on their annual income and spending score.
# This type of visualization helps identify patterns and behaviors of customer segments, enabling targeted marketing or sales strategies
# for the different customer groups.
# Here it relates consumption to annual income: low income does not always mean low consumption, and high income is not always
# reflected in high consumption.
#---------------------------------------------------------------------------------------------------------------------------------------------#

In [9]:
#---------------------------------------------------------------------------------------------------------------------------------------------#
# Attach the cluster label of each customer to a copy of the original data. A step like this can be important for identifying the age
# groups or gender associated with each cluster, facilitating decision-making in marketing campaigns.
#---------------------------------------------------------------------------------------------------------------------------------------------#
labels = kmeansmodel.labels_
# assign() returns a new DataFrame, so `dataset` itself is left unchanged
b = dataset.assign(cluster=labels)
b.head()
#---------------------------------------------------------------------------------------------------------------------------------------------#
# DataFrame b now contains the original data along with the assigned cluster labels, making it easier to analyze and interpret the
# characteristics of each cluster. This information can be used for targeted marketing strategies, such as tailoring campaigns to the
# specific preferences or behaviors of each customer segment.
# It could enable businesses to adapt their sales strategies to the characteristics of different customer groups, potentially leading
# to cost reductions and more effective campaigns.
#---------------------------------------------------------------------------------------------------------------------------------------------#

Unnamed: 0,CustomerID,Genre,Age,Annual Income (k$),Spending Score (1-100),cluster
0,1,Male,19,15,39,3
1,2,Male,21,15,81,4
2,3,Female,20,16,6,3
3,4,Female,23,16,77,4
4,5,Female,31,17,40,3


In [10]:
# Per-cluster mean of income, spending score and age
profile_columns = ['Annual Income (k$)', 'Spending Score (1-100)', 'Age']
c = (
    b.groupby(['cluster'])[profile_columns]
    .mean()
    .reset_index()
)
c.head()
#---------------------------------------------------------------------------------------------------------------------------------------------#
# DataFrame c contains the average annual income, spending score, and age for each cluster. This type of analysis is useful for gaining
# insights into the distinct characteristics of the customer segments identified by the clustering algorithm. The information can be used
# to tailor strategies to the preferences and behaviors of each cluster, improving marketing campaigns and customer engagement.
# The same approach can be applied to various types of data analysis, including demographic studies and medical research, for example
# identifying age groups where the adverse effects of a vaccine may be concentrated.
#---------------------------------------------------------------------------------------------------------------------------------------------#

Unnamed: 0,cluster,Annual Income (k$),Spending Score (1-100),Age
0,0,55.296296,49.518519,42.716049
1,1,86.538462,82.128205,32.692308
2,2,88.2,17.114286,41.114286
3,3,26.304348,20.913043,45.217391
4,4,25.727273,79.363636,25.272727


In [11]:
# Number of customers in each cluster (count of CustomerID per cluster label)
customers_by_cluster = b.groupby(['cluster'])[['CustomerID']]
d = customers_by_cluster.count().reset_index()
d.head()
#---------------------------------------------------------------------------------------------------------------------------------------------#
# DataFrame d contains the number of customers in each cluster. By calculating the percentage of customers in each cluster, one could
# prioritize efforts and resources towards the most significant customer segments. This knowledge aids targeted marketing, customer
# retention strategies, and overall business decision-making.
#---------------------------------------------------------------------------------------------------------------------------------------------#

Unnamed: 0,cluster,CustomerID
0,0,81
1,1,39
2,2,35
3,3,23
4,4,22


In [12]:
# Combine the per-cluster averages (c) with the per-cluster customer counts (d)
segcustomers = pd.merge(c, d, on='cluster', how='left')
# Add each cluster's share of all customers (in percent) and round age and spending score to whole numbers
segcustomers = segcustomers.assign(**{
    '%CustomerID': segcustomers['CustomerID'] * 100 / segcustomers['CustomerID'].sum(),
    'Age': segcustomers['Age'].round(),
    'Spending Score (1-100)': segcustomers['Spending Score (1-100)'].round(),
})
segcustomers.head()
#---------------------------------------------------------------------------------------------------------------------------------------------#
# "segcustomers" is a consolidated view of the average annual income, spending score, and age, along with the count and percentage of
# customers in each cluster. This information is valuable for strategic decision-making, as it allows focusing on the clusters with the
# most significant impact on the customer base.
#---------------------------------------------------------------------------------------------------------------------------------------------#

Unnamed: 0,cluster,Annual Income (k$),Spending Score (1-100),Age,CustomerID,%CustomerID
0,0,55.296296,50.0,43.0,81,40.5
1,1,86.538462,82.0,33.0,39,19.5
2,2,88.2,17.0,41.0,35,17.5
3,3,26.304348,21.0,45.0,23,11.5
4,4,25.727273,79.0,25.0,22,11.0


In [13]:
# Count the customers in each (cluster, gender) combination
genre = b.groupby(['cluster', 'Genre'])['CustomerID'].count().reset_index()
# Pivot to one row per cluster and one column per gender.
# - aggfunc is given as the string 'sum': recent pandas versions emit a FutureWarning when np.sum is passed as the callable.
# - fill_value=0 reports a cluster without customers of one gender as 0 instead of NaN.
genre = pd.pivot_table(
    genre,
    values='CustomerID',
    index=['cluster'],
    columns=['Genre'],
    aggfunc='sum',
    fill_value=0,
)
genre = genre.reset_index()
# Drop the leftover "Genre" label on the column axis
genre.columns.name = None
genre = genre[['cluster', 'Female', 'Male']]
genre.head()
#---------------------------------------------------------------------------------------------------------------------------------------------#
# "genre" contains a breakdown of the customer count by gender within each cluster. This information is useful for understanding gender
# distribution patterns in the different customer segments, making it possible to tailor marketing strategies to specific subgroups
# within each cluster.
#---------------------------------------------------------------------------------------------------------------------------------------------#

Unnamed: 0,cluster,Female,Male
0,0,48,33
1,1,21,18
2,2,16,19
3,3,14,9
4,4,13,9


In [14]:
# Bring the per-cluster gender counts alongside the per-cluster summary
segcusbygenre = pd.merge(segcustomers, genre, on='cluster', how='left')
# Express the female and male counts as a percentage of the customers in each cluster
segcusbygenre = segcusbygenre.assign(**{
    '%Female': (segcusbygenre['Female'] * 100) / segcusbygenre['CustomerID'],
    '%Male': (segcusbygenre['Male'] * 100) / segcusbygenre['CustomerID'],
})
# Keep only the columns of the final report, in display order
report_columns = ['cluster', 'Annual Income (k$)', 'Spending Score (1-100)', 'Age', '%CustomerID', '%Female', '%Male']
segcusbygenre = segcusbygenre[report_columns]
segcusbygenre.head()
#---------------------------------------------------------------------------------------------------------------------------------------------#
# "segcusbygenre" contains the average annual income, spending score, and age, along with the percentage of female and male customers
# in each cluster. This information can be useful for refining marketing strategies based on both the demographic and the behavioral
# characteristics of each customer segment.
#---------------------------------------------------------------------------------------------------------------------------------------------#

Unnamed: 0,cluster,Annual Income (k$),Spending Score (1-100),Age,%CustomerID,%Female,%Male
0,0,55.296296,50.0,43.0,40.5,59.259259,40.740741
1,1,86.538462,82.0,33.0,19.5,53.846154,46.153846
2,2,88.2,17.0,41.0,17.5,45.714286,54.285714
3,3,26.304348,21.0,45.0,11.5,60.869565,39.130435
4,4,25.727273,79.0,25.0,11.0,59.090909,40.909091


## Saving Graphics

Inject a custom stylesheet into each exported HTML file so that the page uses the Montserrat font.

In [15]:
# HTML snippet for the <head> of the exported pages: it links the Montserrat font from Google Fonts
# and sets it as the font family of the page body.
styles = """
        <link href="https://fonts.googleapis.com/css2?family=Montserrat:wght@400;700&display=swap" rel="stylesheet">
        <style>
            body {
                font-family: 'Montserrat', sans-serif;
            }
        </style>
    """

In [16]:
def save_figure_html(fig, path, head_extra):
    """Export a Plotly figure as a standalone HTML file with extra markup in its <head>.

    Parameters
    ----------
    fig : plotly figure
        Figure to export; it is rendered with ``fig.to_html()`` (full HTML document, default options).
    path : str or Path
        Destination file. Missing parent directories are created.
    head_extra : str
        Markup inserted immediately before the closing ``</head>`` tag.
    """
    path = Path(path)
    # The output folder may not exist yet on a fresh checkout
    path.parent.mkdir(parents=True, exist_ok=True)

    html = fig.to_html()
    # Only the first "</head>" is the document's real head; the count of 1 keeps any later occurrence
    # of that text (e.g. inside the embedded JavaScript) untouched.
    html = html.replace("</head>", head_extra + "</head>", 1)
    path.write_text(html, encoding="utf-8")


save_figure_html(fig1, "./HTMLs/TheElbowMethod_MLCustomers.html", styles)
save_figure_html(fig2, "./HTMLs/ClustersMallCustomers.html", styles)