In [1]:
pip install plotly --upgrade

Note: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np 
import pandas as pd 
import plotly.io as pio
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Global plotly settings: default renderer and default figure template.
# NOTE(review): the 'colab' renderer is selected here while the data is later
# read from a local relative path — confirm the intended environment.
pio.renderers.default = 'colab'
pio.templates.default = 'plotly'

# Initialise plotly's offline notebook mode (with connected=True).
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# Silence every warning for the rest of the session.
import warnings
warnings.filterwarnings('ignore')


# Print the full path of every file found below /kaggle/input.
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


In [3]:
# Load the customer data set from the Excel workbook.
# The relative path is assembled with os.path.join instead of a hard-coded
# backslash, so the same location resolves on Windows, Linux and macOS.
df = pd.read_excel(os.path.join('Downloads', 'Customer_Segmentation_Data.xlsx'))

In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   ID                   2240 non-null   int64         
 1   Year_Birth           2240 non-null   int64         
 2   Education            2240 non-null   object        
 3   Marital_Status       2240 non-null   object        
 4   Income               2216 non-null   float64       
 5   Kidhome              2240 non-null   int64         
 6   Teenhome             2240 non-null   int64         
 7   Dt_Customer          2240 non-null   datetime64[ns]
 8   Recency              2240 non-null   int64         
 9   MntWines             2240 non-null   int64         
 10  MntFruits            2240 non-null   int64         
 11  MntMeatProducts      2240 non-null   int64         
 12  MntFishProducts      2240 non-null   int64         
 13  MntSweetProducts     2240 non-nul

In [5]:
# Frequency = web purchases + catalog purchases + store purchases, per row.
df['Frequency'] = (
    df['NumWebPurchases']
    .add(df['NumCatalogPurchases'])
    .add(df['NumStorePurchases'])
)

In [6]:
# Monetary = row-wise total of the six Mnt* amount columns.
# skipna=False keeps the behaviour of plain `+`: a missing amount yields NaN.
df['Monetary'] = df[
    ['MntWines', 'MntFruits', 'MntMeatProducts',
     'MntFishProducts', 'MntSweetProducts', 'MntGoldProds']
].sum(axis=1, skipna=False)

In [7]:
# Keep only the three RFM columns and display their summary statistics.
df_rfm = df.loc[:, ['Recency', 'Frequency', 'Monetary']]
df_rfm.describe()

Unnamed: 0,Recency,Frequency,Monetary
count,2240.0,2240.0,2240.0
mean,49.109375,12.537054,605.798214
std,28.962453,7.205741,602.249288
min,0.0,0.0,5.0
25%,24.0,6.0,68.75
50%,49.0,12.0,396.0
75%,74.0,18.0,1045.5
max,99.0,32.0,2525.0


In [8]:
# Colour sequence used by the RFM box plots.
colors_rfm = [
    'rgb(183, 9, 76)',
    'rgb(92, 77, 125)',
    'rgb(0, 145, 173)',
]

In [9]:
# Box plot of the unscaled Recency / Frequency / Monetary values, one box per
# feature with every observation drawn as a point.
# The plotly.offline import and init_notebook_mode(connected=True) call are
# already executed in the setup cell at the top of the notebook, so they are
# not repeated here.
fig = px.box(pd.melt(df_rfm), x='variable', y='value',
             title='<b>RFM Data Distribution Before Scaling</b>',
             color='variable', color_discrete_sequence=colors_rfm,
             boxmode='overlay', points='all')

fig.update_layout(showlegend=False,paper_bgcolor='rgb(229, 236, 246)',
                  title_font_size=22)

fig.show()

In [10]:
# Fit a StandardScaler on the RFM frame, then rebuild a DataFrame from the
# transformed array using the original column names.
std = StandardScaler().fit(df_rfm)
df_rfm_scaled = pd.DataFrame(std.transform(df_rfm), columns=df_rfm.columns)

In [11]:
# Box plot of the scaled RFM values (long format: one row per feature/value pair).
fig = px.box(
    df_rfm_scaled.melt(),
    x='variable',
    y='value',
    color='variable',
    color_discrete_sequence=colors_rfm,
    boxmode='overlay',
    points='all',
    title='<b>RFM Data Distribution After Scaling</b>',
)
fig.update_layout(
    showlegend=False,
    paper_bgcolor='rgb(229, 236, 246)',
    title_font_size=22,
)
fig.show()



In [12]:
# NOTE(review): this cell draws the same scaled-RFM box plot as the cell
# directly before it — confirm whether both are needed.
fig = px.box(
    df_rfm_scaled.melt(),
    x='variable',
    y='value',
    color='variable',
    color_discrete_sequence=colors_rfm,
    boxmode='overlay',
    points='all',
    title='<b>RFM Data Distribution After Scaling</b>',
)
fig.update_layout(
    showlegend=False,
    paper_bgcolor='rgb(229, 236, 246)',
    title_font_size=22,
)
fig.show()

In [13]:
# 3D scatter of the scaled data: Recency on x, Frequency on y, Monetary on z.
fig = px.scatter_3d(
    df_rfm_scaled,
    x='Recency',
    y='Frequency',
    z='Monetary',
    opacity=0.5,
    color_discrete_sequence=['rgb(5, 60, 94)'],
    title='<b>RFM Mapping</b>',
)
fig.update_traces(marker=dict(size=5))
fig.update_layout(title_font_size=22, paper_bgcolor='rgb(229, 236, 246)')
fig.show()

In [14]:
#calculating inertia for each k for elbow method
# Only the three RFM columns are used as features: df_rfm_scaled later gains a
# 'Clusters' column, and fitting on the whole frame when this cell is re-run
# would feed those labels back into the model.
# random_state is fixed (same seed as the final k=4 model) so the inertia curve
# is identical on every run.
rfm_features = df_rfm_scaled[['Recency', 'Frequency', 'Monetary']]

inertias = []
K = range(2,10)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=2022)
    kmeans.fit(rfm_features)
    inertias.append(kmeans.inertia_)

In [15]:
# Line chart of inertia against k, with k=4 marked by a dashed vertical line,
# an arrow annotation and a circle. The annotation and circle coordinates are
# fixed values.
fig = px.line(
    x=K,
    y=inertias,
    color_discrete_sequence=['rgb(5, 60, 94)'],
    title='<b>Optimal Number of Clusters by Elbow Method</b>',
)
fig.update_traces(line=dict(width=4))
fig.update_layout(
    paper_bgcolor='rgb(229, 236, 246)',
    title_font_size=22,
    xaxis_title='k (No. of Clusters)',
    yaxis_title='Inertia',
)

fig.add_vline(x=4, line_width=3, line_dash='dash', line_color='rgb(183, 9, 76)')
fig.add_annotation(
    x=4.1, y=1800,
    text='<i>optimal k=4</i>',
    font=dict(size=16),
    showarrow=True,
    ax=60, ay=-30,
    arrowhead=2, arrowsize=1, arrowwidth=2,
)
fig.add_shape(
    type='circle',
    xref='x', yref='y',
    x0=3.9, y0=1650,
    x1=4.1, y1=1914,
    line_color='rgb(183, 9, 76)',
)

fig.show()

In [16]:
#calculating silhouette score for each k
# Only the three RFM columns are used as features: df_rfm_scaled later gains a
# 'Clusters' column, and using the whole frame when this cell is re-run would
# feed those labels into both the clustering and the score.
# random_state is fixed (same seed as the final k=4 model) so the scores are
# identical on every run.
rfm_features = df_rfm_scaled[['Recency', 'Frequency', 'Monetary']]

silhouette = []
K = range(2,10)
for k in K:
    kmeans = KMeans(n_clusters=k, random_state=2022)
    # fit_predict returns the labels of the fitted model; use them directly.
    labels = kmeans.fit_predict(rfm_features)
    score = silhouette_score(rfm_features, labels)
    silhouette.append(score)

In [17]:
# Line chart of silhouette score against k, with k=2 marked by a dashed
# vertical line, an arrow annotation and a circle. The annotation and circle
# coordinates are fixed values.
fig = px.line(
    x=K,
    y=silhouette,
    color_discrete_sequence=['rgb(5, 60, 94)'],
    title='<b>Optimal Number of Clusters by Silhouette Score</b>',
)
fig.update_traces(line=dict(width=4))
fig.update_layout(
    paper_bgcolor='rgb(229, 236, 246)',
    title_font_size=22,
    xaxis_title='k (No. of Clusters)',
    yaxis_title='Silhouette Score',
)

fig.add_vline(x=2, line_width=3, line_dash='dash', line_color='rgb(183, 9, 76)')
fig.add_annotation(
    x=2.1, y=0.44,
    text='<i>optimal k=2</i>',
    font=dict(size=16),
    showarrow=True,
    ax=60, ay=-30,
    arrowhead=2, arrowsize=1, arrowwidth=2,
)
fig.add_shape(
    type='circle',
    xref='x', yref='y',
    x0=1.9, y0=0.432,
    x1=2.1, y1=0.442,
    line_color='rgb(183, 9, 76)',
)

fig.show()

In [18]:
#k-means with k=4
# Fit on the three RFM columns only. df_rfm_scaled receives a 'Clusters' column
# after this cell, so fitting on the whole frame would include the previous
# labels as a feature whenever this cell is run again.
kmeans = KMeans(n_clusters=4, random_state=2022)
kmeans.fit(df_rfm_scaled[['Recency', 'Frequency', 'Monetary']])

#sorting cluster orders to get same label results in every run
# idx lists the fitted cluster ids in ascending order of the sum of their
# centroid coordinates; luv maps each fitted id to its rank in that order.
idx = np.argsort(kmeans.cluster_centers_.sum(axis=1))
luv = np.zeros_like(idx)
luv[idx] = np.arange(kmeans.n_clusters) #lookup values, sized from the model instead of a repeated literal

y_pred = luv[kmeans.labels_] #use label according to lookup values

In [19]:
# Store the cluster labels as a 'Clusters' column on both the scaled RFM frame
# and the original dataframe (assigned left to right).
df_rfm_scaled["Clusters"] = df["Clusters"] = y_pred

In [20]:
# Colours for the four clusters, indexed by cluster label.
colors_cluster = [
    'rgb(183, 9, 76)',
    'rgb(137, 43, 100)',
    'rgb(69, 94, 137)',
    'rgb(0, 145, 173)',
]

In [21]:
# 3D scatter of the scaled data coloured by the 'Clusters' column
# (Recency on x, Monetary on y, Frequency on z).
fig = px.scatter_3d(
    df_rfm_scaled,
    x='Recency',
    y='Monetary',
    z='Frequency',
    color='Clusters',
    color_continuous_scale=colors_cluster,
    opacity=0.5,
    title='<b>Cluster Results</b>',
)
fig.update_traces(marker=dict(size=5))
fig.update_layout(
    showlegend=False,
    paper_bgcolor='rgb(229, 236, 246)',
    title_font_size=22,
)
fig.show()

In [22]:
# Reshape to long format, keeping 'Clusters' as the identifier column.
df_rfm_long = df_rfm_scaled.melt(id_vars='Clusters')

# Grouped box plot: one group per cluster, one box per melted variable.
fig = px.box(
    df_rfm_long,
    x='Clusters',
    y='value',
    color='variable',
    color_discrete_sequence=colors_rfm,
    boxmode='group',
    title='<b>RFM Distribution by Cluster</b>',
)
fig.update_layout(title_font_size=22, paper_bgcolor='rgb(229, 236, 246)')
fig.show()

In [23]:
# Histogram of row counts per cluster, bars ordered 0..3 and labelled with
# their values.
fig = px.histogram(
    df,
    x='Clusters',
    color='Clusters',
    color_discrete_sequence=colors_cluster,
    category_orders={'Clusters': [0, 1, 2, 3]},
    text_auto=True,
    title='<b>Number of Customers in Each Cluster</b>',
)
fig.update_layout(
    bargap=0.4,
    paper_bgcolor='rgb(229, 236, 246)',
    title_font_size=22,
)
fig.show()

In [24]:
# Per-cluster mean of the unscaled Recency, Frequency and Monetary columns.
centroid = (
    df.groupby('Clusters')[['Recency', 'Frequency', 'Monetary']]
    .mean()
)
centroid

Unnamed: 0_level_0,Recency,Frequency,Monetary
Clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,23.520194,7.056543,151.368336
1,73.570523,7.012678,146.244057
2,22.950644,19.770386,1184.530043
3,73.145038,19.230916,1181.335878


In [25]:
# Scatter of Income against Monetary, coloured by cluster.
# Only rows with Income below 500000 are plotted; the original note says one
# customer with an income of 666666 was obscuring the pattern. Rows with a
# missing Income also fail the comparison and are left out.
fig = px.scatter(
    df.loc[df['Income'] < 500000],
    x='Monetary',
    y='Income',
    color='Clusters',
    color_continuous_scale=colors_cluster,
    opacity=0.5,
    title='<b>Clusters by Income & Monetary</b>',
)
fig.update_traces(marker=dict(size=10))
fig.update_layout(title_font_size=22, paper_bgcolor='rgb(229, 236, 246)')
fig.show()

In [26]:
# Deals purchases per customer together with the assigned cluster.
# .copy() makes `deals` an independent frame, so adding a column below is a
# plain assignment instead of a write onto a selection of df.
deals = df[['NumDealsPurchases','Clusters']].copy()

# DealsAccpt is 1 when NumDealsPurchases is greater than 0, otherwise 0.
deals['DealsAccpt'] = np.where(deals['NumDealsPurchases']>0,1,0)
deals = deals.sort_values('DealsAccpt')

In [27]:
# Two-colour sequence used for the binary acceptance flags.
colors_bin = [
    'rgb(183, 9, 76)',
    'rgb(0, 145, 173)',
]

In [28]:
# Percent-normalised histogram per cluster, split by the DealsAccpt flag.
fig = px.histogram(
    deals,
    x='Clusters',
    color='DealsAccpt',
    barnorm='percent',
    color_discrete_sequence=colors_bin,
    category_orders={'Clusters': [0, 1, 2, 3]},
    text_auto=True,
    title='<b>Deals Acceptance Rate</b>',
)
fig.update_layout(
    bargap=0.4,
    yaxis_title='Percent (%)',
    paper_bgcolor='rgb(229, 236, 246)',
    title_font_size=22,
)
fig.show()

In [29]:
# Histogram of NumDealsPurchases aggregated per cluster, bars ordered 0..3.
fig = px.histogram(
    df,
    x='Clusters',
    y='NumDealsPurchases',
    color='Clusters',
    color_discrete_sequence=colors_cluster,
    category_orders={'Clusters': [0, 1, 2, 3]},
    text_auto=True,
    title='<b>Number of Deals Purchases by Clusters</b>',
)
fig.update_layout(
    bargap=0.4,
    paper_bgcolor='rgb(229, 236, 246)',
    title_font_size=22,
)
fig.show()

In [30]:
#getting all the campaign related features
# .copy() makes `cmpgn` an independent frame, so adding a column below is a
# plain assignment instead of a write onto a selection of df.
cmpgn = df[['AcceptedCmp1','AcceptedCmp2','AcceptedCmp3','AcceptedCmp4',
               'AcceptedCmp5','Response','Clusters']].copy()

# CmpAccpt is 1 when the row mean of the first five columns
# (AcceptedCmp1..AcceptedCmp5) is greater than 0, otherwise 0.
# NOTE(review): 'Response' (renamed to 'CmpLast' below) is not part of this
# flag — confirm whether it should count as an accepted campaign.
cmpgn['CmpAccpt'] = np.where((cmpgn.iloc[:,0:5].mean(axis=1))>0,1,0)
cmpgn = cmpgn.sort_values('CmpAccpt')

#shortening column name
new_col = dict(AcceptedCmp1='Cmp1',AcceptedCmp2='Cmp2',
               AcceptedCmp3='Cmp3',AcceptedCmp4='Cmp4',
               AcceptedCmp5='Cmp5',Response='CmpLast')

# Rebind the renamed frame rather than mutating it in place.
cmpgn = cmpgn.rename(columns=new_col)

In [31]:
#plotting campaigns acceptance rate
fig = px.histogram(cmpgn, x='Clusters', barnorm='percent',
                   color='CmpAccpt',
                   color_discrete_sequence=colors_bin,
                   category_orders=dict(Clusters=[0,1,2,3]),
                   title='<b>Campaigns Acceptance Rate</b>',
                   text_auto=True)

fig.update_layout(paper_bgcolor='rgb(229, 236, 246)',title_font_size=22,
                  bargap=0.4, yaxis_title='Percent (%)')

fig.show()

In [32]:
# For each cluster 0..3, select that cluster's rows and the six campaign
# columns, then melt them into long format (variable / value).
cmpgn_long_0, cmpgn_long_1, cmpgn_long_2, cmpgn_long_3 = (
    cmpgn.loc[
        cmpgn['Clusters'] == cluster_id,
        ['Cmp1', 'Cmp2', 'Cmp3', 'Cmp4', 'Cmp5', 'CmpLast'],
    ].melt()
    for cluster_id in range(4)
)

In [33]:
# 2x2 grid of histograms, one per cluster; each sums the campaign values per
# campaign column.
fig = make_subplots(
    rows=2,
    cols=2,
    shared_yaxes=True,
    subplot_titles=('Cluster 0', 'Cluster 1', 'Cluster 2', 'Cluster 3'),
)

# Clusters fill the grid row by row: 0 -> (1,1), 1 -> (1,2), 2 -> (2,1), 3 -> (2,2).
for cluster_id, long_df in enumerate(
        (cmpgn_long_0, cmpgn_long_1, cmpgn_long_2, cmpgn_long_3)):
    grid_row, grid_col = divmod(cluster_id, 2)
    fig.add_trace(
        go.Histogram(
            histfunc='sum',
            x=long_df['variable'],
            y=long_df['value'],
            marker_color=colors_cluster[cluster_id],
            texttemplate='%{y}',
        ),
        row=grid_row + 1,
        col=grid_col + 1,
    )

fig.update_layout(
    showlegend=False,
    paper_bgcolor='rgb(229, 236, 246)',
    title_text='<b>Campaigns Performance on Each Cluster</b>',
    title_font_size=22,
    yaxis_range=[0,150],
    yaxis3_range=[0,150],
)