In [34]:
# import required libraries for dataframe and visualization

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
import plotly.express as px


# import required libraries for clustering
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from scipy.cluster.hierarchy import linkage
from scipy.cluster.hierarchy import dendrogram
from scipy.cluster.hierarchy import cut_tree
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

#### Load Data

In [35]:
retail = pd.read_excel('./data/Online Retail.xlsx')
retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [36]:
retail.shape

(541909, 8)

In [37]:
retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    541909 non-null  object        
 1   StockCode    541909 non-null  object        
 2   Description  540455 non-null  object        
 3   Quantity     541909 non-null  int64         
 4   InvoiceDate  541909 non-null  datetime64[ns]
 5   UnitPrice    541909 non-null  float64       
 6   CustomerID   406829 non-null  float64       
 7   Country      541909 non-null  object        
dtypes: datetime64[ns](1), float64(2), int64(1), object(4)
memory usage: 33.1+ MB


In [38]:
retail.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


#### Data Cleaning

In [39]:
# Calculating the Missing Values % contribution in DF

df_null = round(100*(retail.isnull().sum())/len(retail), 2)
df_null

InvoiceNo       0.00
StockCode       0.00
Description     0.27
Quantity        0.00
InvoiceDate     0.00
UnitPrice       0.00
CustomerID     24.93
Country         0.00
dtype: float64

In [40]:
# Droping rows having missing values

retail = retail.dropna()
retail.shape

(406829, 8)

In [41]:
# Changing the datatype of Customer Id as per Business understanding

retail['CustomerID'] = retail['CustomerID'].astype(str)

#### Feature Engineering

In [42]:
# New Attribute : Monetary

retail['Amount'] = retail['Quantity']*retail['UnitPrice']
retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Amount
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


In [43]:
# This step groups the data in the retail DataFrame by the 'CustomerID' column 
# to calculates the sum of the 'Amount' column for each customer. 
rfm_m = retail.groupby('CustomerID')['Amount'].sum().reset_index()
rfm_m.head()

Unnamed: 0,CustomerID,Amount
0,12346.0,0.0
1,12347.0,4310.0
2,12348.0,1797.24
3,12349.0,1757.55
4,12350.0,334.4


In [44]:
# New Attribute : Frequency
# count the number of unique invoice numbers for each customer
rfm_f = retail.groupby('CustomerID')['InvoiceNo'].count().reset_index()
rfm_f.head()

Unnamed: 0,CustomerID,InvoiceNo
0,12346.0,2
1,12347.0,182
2,12348.0,31
3,12349.0,73
4,12350.0,17


In [45]:
rfm_f.columns

Index(['CustomerID', 'InvoiceNo'], dtype='object')

In [46]:
rfm_f.rename(columns={'InvoiceNo': 'Frequency'}, inplace=True)
rfm_f.head()

Unnamed: 0,CustomerID,Frequency
0,12346.0,2
1,12347.0,182
2,12348.0,31
3,12349.0,73
4,12350.0,17


In [47]:
# Merging the two dfs

merged_rfm = rfm_m.merge(rfm_f, on='CustomerID')
# another way 
# rfm = pd.merge(rfm_m, rfm_f, on='CustomerID', how='inner')
merged_rfm.head()

Unnamed: 0,CustomerID,Amount,Frequency
0,12346.0,0.0,2
1,12347.0,4310.0,182
2,12348.0,1797.24,31
3,12349.0,1757.55,73
4,12350.0,334.4,17


In [48]:
retail.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 406829 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype         
---  ------       --------------   -----         
 0   InvoiceNo    406829 non-null  object        
 1   StockCode    406829 non-null  object        
 2   Description  406829 non-null  object        
 3   Quantity     406829 non-null  int64         
 4   InvoiceDate  406829 non-null  datetime64[ns]
 5   UnitPrice    406829 non-null  float64       
 6   CustomerID   406829 non-null  object        
 7   Country      406829 non-null  object        
 8   Amount       406829 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(5)
memory usage: 31.0+ MB


In [49]:
# Convert to datetime to proper datatype

retail['InvoiceDate'] = pd.to_datetime(retail['InvoiceDate'],format='%d-%m-%Y %H:%M')
retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Amount
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34


In [50]:
# Compute the maximum date to know the last transaction date in our dataset

max_date = max(retail['InvoiceDate'])
max_date

Timestamp('2011-12-09 12:50:00')

In [51]:
# calculate the time difference between the maximum date in the dataset and the transaction date for each record

retail['Diff'] = max_date - retail['InvoiceDate']
retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Amount,Diff
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3,373 days 04:24:00
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,373 days 04:24:00
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0,373 days 04:24:00
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,373 days 04:24:00
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,373 days 04:24:00


In [52]:
# Compute last transaction date to get the recency of customers 
# As each customer may have multiple invonces that has been purshsed 
# So we need to create a indicator that tells us when he made the last transaction to find wather he is active or not
rfm_p = retail.groupby('CustomerID')['Diff'].min().reset_index()
rfm_p.head()

Unnamed: 0,CustomerID,Diff
0,12346.0,325 days 02:33:00
1,12347.0,1 days 20:58:00
2,12348.0,74 days 23:37:00
3,12349.0,18 days 02:59:00
4,12350.0,309 days 20:49:00


In [53]:
# Extract number of days only

rfm_p['Diff'] = rfm_p['Diff'].dt.days
rfm_p.head()

Unnamed: 0,CustomerID,Diff
0,12346.0,325
1,12347.0,1
2,12348.0,74
3,12349.0,18
4,12350.0,309


In [54]:
merged_rfm.head()

Unnamed: 0,CustomerID,Amount,Frequency
0,12346.0,0.0,2
1,12347.0,4310.0,182
2,12348.0,1797.24,31
3,12349.0,1757.55,73
4,12350.0,334.4,17


In [55]:
# Merge tha dataframes to get the final RFM dataframe

merged_rfm = merged_rfm.merge(rfm_p, on='CustomerID')

merged_rfm.rename(columns={'Diff': 'Recency'}, inplace=True)
merged_rfm.head()

Unnamed: 0,CustomerID,Amount,Frequency,Recency
0,12346.0,0.0,2,325
1,12347.0,4310.0,182,1
2,12348.0,1797.24,31,74
3,12349.0,1757.55,73,18
4,12350.0,334.4,17,309


In [56]:
# Create a DataFrame containing the selected attributes
attributes = ['Amount', 'Frequency', 'Recency']
data = merged_rfm[attributes]

# Create a box plot with Plotly
fig = px.box(data, y=attributes, labels={'variable': 'Attributes', 'value': 'Range'},
             title="Outliers Variable Distribution")
fig.update_layout(
    xaxis=dict(title="Attributes"),
    yaxis=dict(title="Range"),
    showlegend=False,
    boxmode='group',  # Display box plots side by side
    width=800, height=600
)
fig.show()

In [57]:
# Removing (statistical) outliers for Amount
Q1 = merged_rfm.Amount.quantile(0.05)
Q3 = merged_rfm.Amount.quantile(0.95)
IQR = Q3 - Q1
merged_rfm = merged_rfm[(merged_rfm.Amount >= Q1 - 1.5*IQR) & (merged_rfm.Amount <= Q3 + 1.5*IQR)]

In [58]:
# Removing (statistical) outliers for Recency
Q1 = merged_rfm.Recency.quantile(0.05)
Q3 = merged_rfm.Recency.quantile(0.95)
IQR = Q3 - Q1
merged_rfm = merged_rfm[(merged_rfm.Recency >= Q1 - 1.5*IQR) & (merged_rfm.Recency <= Q3 + 1.5*IQR)]

In [59]:
# Removing (statistical) outliers for Frequency
Q1 = merged_rfm.Frequency.quantile(0.05)
Q3 = merged_rfm.Frequency.quantile(0.95)
IQR = Q3 - Q1
merged_rfm = merged_rfm[(merged_rfm.Frequency >= Q1 - 1.5*IQR) & (merged_rfm.Frequency <= Q3 + 1.5*IQR)]

In [60]:
attributes = ['Amount', 'Frequency', 'Recency']

fig = px.box(merged_rfm, y=attributes, title="Outliers Variable Distribution", 
             labels={'variable': 'Attributes', 'value': 'Range'},
             boxmode='group', points='outliers')

fig.update_layout(
    xaxis=dict(title="Attributes", title_font=dict(size=14)),
    yaxis=dict(title="Range", title_font=dict(size=14)),
    showlegend=False,
    width=800,
    height=600
)

fig.show()

#### Rescaling the Attributes

In [61]:
# Rescaling the attributes

merged_rfm = merged_rfm[['Amount', 'Frequency', 'Recency']]

# Instantiate
scaler = StandardScaler()

# fit_transform
rfm_df_scaled = scaler.fit_transform(merged_rfm)
rfm_df_scaled.shape

(4293, 3)

In [62]:
rfm_df_scaled = pd.DataFrame(rfm_df_scaled)
rfm_df_scaled.columns = ['Amount', 'Frequency', 'Recency']
rfm_df_scaled.head()

Unnamed: 0,Amount,Frequency,Recency
0,-0.723738,-0.752888,2.301611
1,1.731617,1.042467,-0.906466
2,0.300128,-0.463636,-0.183658
3,0.277517,-0.04472,-0.738141
4,-0.533235,-0.603275,2.143188


#### Building the Model

K-Means Clustering\
K-means clustering is one of the simplest and popular unsupervised machine learning algorithms.\
The algorithm works as follows:\

1-First we initialize k points, called means, randomly.\

2-We categorize each item to its closest mean and we update the mean’s coordinates, which are the averages of the items categorized in that mean so far.\

3-We repeat the process for a given number of iterations and at the end, we have our clusters.\

In [63]:
#pip install threadpoolctl==3.1.0

In [64]:
# k-means with some arbitrary k

kmeans = KMeans(n_clusters=4, max_iter=50)
kmeans.fit(rfm_df_scaled)

In [65]:
kmeans.labels_

array([2, 0, 1, ..., 2, 1, 1])

#### Finding the Optimal Number of Clusters

#### Elbow Curve to get the right number of Clusters¶
A fundamental step for any unsupervised algorithm is to determine the optimal number of clusters into which the data may be clustered. The Elbow Method is one of the most popular methods to determine this optimal value of k.

In [66]:
ssd = []
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for num_clusters in range_n_clusters:
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50)
    kmeans.fit(rfm_df_scaled)
    
    ssd.append(kmeans.inertia_)
    print("For n_clusters={0}, the Elbow score is {1}".format(num_clusters, kmeans.inertia_))

fig = px.line(x=range_n_clusters, y=ssd, 
              title="Elbow Curve for K-Means Clustering",
              labels={'x': 'Number of Clusters', 'y': 'Sum of Squared Distances (SSD)'})

fig.update_layout(
    xaxis=dict(title_font=dict(size=14)),
    yaxis=dict(title_font=dict(size=14)),
    showlegend=False,
    width=800,
    height=600
)

fig.show()

For n_clusters=2, the Elbow score is 7661.705869320753
For n_clusters=3, the Elbow score is 4433.58061930212
For n_clusters=4, the Elbow score is 3360.719102577021
For n_clusters=5, the Elbow score is 2802.8241284834658
For n_clusters=6, the Elbow score is 2392.1409779616197
For n_clusters=7, the Elbow score is 2038.9157610440066
For n_clusters=8, the Elbow score is 1828.3350601558518


#### Silhouette Analysis

silhouette score=p−q/max(p,q)

1-p is the mean distance to the points in the nearest cluster that the data point is not a part of

2-q is the mean intra-cluster distance to all the points in its own cluster.

3-The value of the silhouette score range lies between -1 to 1.

A score closer to 1 indicates that the data point is very similar to other data points in the cluster,

A score closer to -1 indicates that the data point is not similar to the data points in its cluster.

Silhouette analysis\
helps you assess the quality of clustering and find the number of clusters that best fits your data. Let's break down the code and understand its purpose.\

Purpose of Silhouette Analysis:\

Silhouette analysis helps you assess the quality of clustering for different numbers of clusters.\

The silhouette score ranges from -1 to 1, with higher values indicating better cluster separation and cohesion.\

By examining silhouette scores for various cluster numbers, you can determine the optimal number of clusters that best fit your data.\

In summary, this code performs silhouette analysis for a range of cluster numbers using K-Means clustering on the standardized RFM data. It helps you select the number of clusters that provides the best clustering quality for your data.\

In [67]:
# Silhouette analysis
range_n_clusters = [2, 3, 4, 5, 6, 7, 8]

for num_clusters in range_n_clusters:
    
    # intialise kmeans
    kmeans = KMeans(n_clusters=num_clusters, max_iter=50)
    kmeans.fit(rfm_df_scaled)
    
    cluster_labels = kmeans.labels_
    
    # silhouette score
    silhouette_avg = silhouette_score(rfm_df_scaled, cluster_labels)
    print("For n_clusters={0}, the silhouette score is {1}".format(num_clusters, silhouette_avg))

For n_clusters=2, the silhouette score is 0.5411246404292333
For n_clusters=3, the silhouette score is 0.5084896296141937
For n_clusters=4, the silhouette score is 0.4805351555840963
For n_clusters=5, the silhouette score is 0.46458018311256893
For n_clusters=6, the silhouette score is 0.41754956215303557
For n_clusters=7, the silhouette score is 0.4159203051748714
For n_clusters=8, the silhouette score is 0.4066069158968891


##### Silhouette Score Meaning: 
The silhouette score measures how similar each data point in a cluster is to the data points in the same cluster compared to other clusters. The score ranges from -1 to 1:**\

A higher score indicates better cluster separation and cohesion.\

A score close to 0 suggests overlapping or ambiguous clusters.\

A negative score means that data points might be assigned to the wrong clusters.\

Choosing the Best Number of Clusters: To determine the best number of clusters, you typically look for the number that gives the highest silhouette score.\

In your provided silhouette scores:\

For 2 clusters, the silhouette score is 0.5416.\

For 3 clusters, the silhouette score is 0.5085.\

For 4 clusters, the silhouette score is 0.4816.\

For 5 clusters, the silhouette score is 0.4646.\

For 6 clusters, the silhouette score is 0.4176.\

For 7 clusters, the silhouette score is 0.4148.\

For 8 clusters, the silhouette score is 0.4077.\

==> You chose "3" as the number of clusters because it has the second-highest silhouette score (0.5085),\

==> and it indicates reasonably good cluster separation and cohesion.\

==> While the silhouette score for 2 clusters is slightly higher, the difference is not substantial,\

==> and having 3 clusters may provide a more detailed and meaningful separation of your data.\

==> Keep in mind that the choice of the number of clusters may also depend on the specific problem and the insights you want to gain from the clustering analysis.\

It's often a balance between finding a simple model (few clusters) and capturing meaningful patterns (more clusters).

##### Silhouette Analysis Vs Elbow:
Silhouette analysis helps you figure out how good the groups are.\
The elbow method helps you find the right number of groups.

The elbow method looks at a graph and finds the point where it bends like an elbow. That point tells us how many teams to create for our use case.\

Both methods help us to figure out the best way to group data, on the basis of data organization\

When choosing the number of clusters for a clustering problem, it's important to consider both the silhouette score and the elbow method results, as they provide complementary insights.

##### Silhouette Score:¶
The silhouette score measures the quality of the clusters.

Higher silhouette scores indicate better separation and cohesion of clusters.

In your case, the highest silhouette score is for 2 clusters (0.5416), but the second-highest score is for 3 clusters (0.5085), which is still quite good.\

##### Elbow Method:
The elbow method looks at the reduction in error (WCSS) as you increase the number of clusters.

The point where adding more clusters doesn't significantly reduce the error is often considered the "elbow" and a reasonable choice for the number of clusters.

##### Decision:
Silhouette Score: The silhouette score suggests that 2 clusters or 3 clusters are both reasonable choices. Both options have good silhouette scores, with 2 clusters having a slightly higher score.

Elbow Method: The elbow method suggests that the "elbow" point occurs at 3 clusters. This is where the reduction in error starts to slow down significantly.


##### Conclusion:
Based on the silhouette score, you could choose either 2 or 3 clusters, and both are valid choices.

In [68]:
# Final model with k=3
kmeans = KMeans(n_clusters=3, max_iter=50)
kmeans.fit(rfm_df_scaled)

In [69]:
 kmeans.labels_

array([2, 1, 0, ..., 2, 0, 0])

In [70]:
# assign the label
merged_rfm['Cluster_Id'] = kmeans.labels_
merged_rfm.head()
Amount	Frequency	Recency	C

SyntaxError: invalid syntax (<ipython-input-70-a6ad124d846d>, line 4)

In [None]:
fig = px.box(merged_rfm, x='Cluster_Id', y='Amount',
             title="Cluster Id vs Amount Box Plot",
             labels={'Cluster_Id': 'Cluster ID', 'Amount': 'Amount'},
             color='Cluster_Id')

fig.update_layout(
    xaxis=dict(title="Cluster ID", title_font=dict(size=14)),
    yaxis=dict(title="Amount", title_font=dict(size=14)),
    showlegend=False,
    width=800,
    height=600
)

fig.show()

In [None]:
fig = px.scatter(merged_rfm, x='Cluster_Id', y='Amount', color='Frequency',
                 title='Cluster ID vs Amount (Color Encoded by Frequency)',
                 labels={'Cluster_Id': 'Cluster ID', 'Amount': 'Amount', 'Frequency': 'Frequency'})

fig.update_layout(
    xaxis=dict(title="Cluster ID", title_font=dict(size=14)),
    yaxis=dict(title="Amount", title_font=dict(size=14)),
    showlegend=True,
    width=800,
    height=600
)

fig.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Create a custom color palette with Red, Green, and Blue
custom_palette = sns.color_palette(["#FF0000", "#00FF00", "#0000FF"])

# Create a scatter plot matrix with separate plots for each cluster, custom palette, and a larger size
sns.set(style="ticks")
sns.pairplot(merged_rfm, hue='Cluster_Id', vars=['Amount', 'Frequency'], palette=custom_palette, height=4, aspect=1.5)
plt.suptitle('Cluster ID vs Amount & Frequency', y=1.02)
plt.show()

In [None]:
fig = px.box(merged_rfm, x='Cluster_Id', y='Frequency',
             title="Cluster Id vs Frequency Box Plot",
             labels={'Cluster_Id': 'Cluster ID', 'Frequency': 'Frequency'})

fig.update_layout(
    xaxis=dict(title="Cluster ID", title_font=dict(size=14)),
    yaxis=dict(title="Frequency", title_font=dict(size=14)),
    showlegend=False,
    width=800,
    height=600
)

fig.show()

In [None]:
fig = px.box(merged_rfm, x='Cluster_Id', y='Recency',
             title="Cluster Id vs Recency Box Plot",
             labels={'Cluster_Id': 'Cluster ID', 'Recency': 'Recency'})

fig.update_layout(
    xaxis=dict(title="Cluster ID", title_font=dict(size=14)),
    yaxis=dict(title="Recency", title_font=dict(size=14)),
    showlegend=False,
    width=800,
    height=600
)

fig.show()

In [None]:
# Selecting the two features for clustering
data_for_clustering = merged_rfm[['Amount', 'Frequency']]

# Specify the number of clusters (e.g., 3)
n_clusters = 3

# Apply K-Means clustering
kmeans = KMeans(n_clusters=n_clusters)
merged_rfm['Cluster_2D'] = kmeans.fit_predict(data_for_clustering)

# Create a scatter plot with Plotly
fig = px.scatter(merged_rfm, x='Amount', y='Frequency', color='Cluster_2D',
                 title='Clustering by Amount and Frequency',
                 labels={'Amount': 'Amount', 'Frequency': 'Frequency', 'Cluster_2D': 'Cluster'})

fig.update_layout(
    xaxis=dict(title="Amount", title_font=dict(size=14)),
    yaxis=dict(title="Frequency", title_font=dict(size=14)),
    width=800,
    height=600
)

fig.show()

In [None]:
# Selecting the three features for clustering
data_for_clustering = merged_rfm[['Amount', 'Frequency', 'Recency']]

# Apply K-Means clustering
kmeans = KMeans(n_clusters=n_clusters)
merged_rfm['Cluster_3D'] = kmeans.fit_predict(data_for_clustering)

# Visualize the clusters (scatter plot matrix)
sns.set(style="ticks")
sns.pairplot(merged_rfm, hue='Cluster_3D', vars=['Amount', 'Frequency', 'Recency'], palette='Set1')
plt.suptitle('Clustering by Amount, Frequency, and Recency', y=1.02)
plt.show()

In [None]:
import plotly.express as px

# Apply K-Means clustering
kmeans = KMeans(n_clusters=n_clusters)
merged_rfm['Cluster_3D'] = kmeans.fit_predict(data_for_clustering)

# Create a 3D scatter plot matrix with Plotly
fig = px.scatter_3d(merged_rfm, x='Amount', y='Frequency', z='Recency', color='Cluster_3D',
                     labels={'Amount': 'Amount', 'Frequency': 'Frequency', 'Recency': 'Recency', 'Cluster_3D': 'Cluster'})

fig.update_layout(
    scene=dict(
        xaxis_title='Amount',
        yaxis_title='Frequency',
        zaxis_title='Recency',
    ),
    title='Clustering by Amount, Frequency, and Recency',
    width=800,
    height=600
    )

fig.show()

### Final Conclusion
K-Means Clustering with 3 Cluster Ids¶
Customers with Cluster Id 1 are the customers with high amount of transactions as compared to other customers.

Customers with Cluster Id 1 are frequent buyers.

Customers with Cluster Id 2 are not recent buyers and hence least of importance from business point of view.

Hierarchical Clustering with 3 Cluster Labels

Customers with Cluster_Labels 2 are the customers with high amount of transactions as compared to other customers.

Customers with Cluster_Labels 2 are frequent buyers.

Customers with Cluster_Labels 0 are not recent buyers and hence least of importance from business point of view.