# Authors: Parth Sathiya, Mohammed TaherKhan, Krishkumar Patel

In [None]:
import os
import kagglehub
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from datetime import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Downloading the dataset

#Dataset source: Kaggle (2024).
#Reference:
#https://www.kaggle.com/datasets/vishakhdapat/customer-segmentation-clustering

# Importing all required libraries for loading data, cleaning, visualization, and clustering.
# Downloading the Kaggle dataset and loading it as a DataFrame.
# Also keeping a full copy (df_copy) so we can always compare with original data.


In [None]:
# Download latest version
path = kagglehub.dataset_download("vishakhdapat/customer-segmentation-clustering")
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'customer-segmentation-clustering' dataset.
Path to dataset files: /kaggle/input/customer-segmentation-clustering


In [None]:
print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/customer-segmentation-clustering


In [None]:
# path = "/root/.cache/kagglehub/datasets/vishakhdapat/customer-segmentation-clustering/versions/1"
print(os.listdir(path))

['customer_segmentation.csv']


# Loading the Dataset

In [None]:
df = pd.read_csv(os.path.join(path, os.listdir(path)[0] ))
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


In [None]:
df_copy = df.copy()
df_copy.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,7,0,0,0,0,0,0,3,11,1
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,5,0,0,0,0,0,0,3,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,4,0,0,0,0,0,0,3,11,0
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,6,0,0,0,0,0,0,3,11,0
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,5,0,0,0,0,0,0,3,11,0


In [None]:
# df.to_csv("customer_segmentation.csv", index=False)

# Exploratory Data Analysis

In [None]:
print("Dataset has",df.shape[0], "Records and", df.shape[1], "Attribute or Features")

Dataset has 2240 Records and 29 Attribute or Features


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2240 entries, 0 to 2239
Data columns (total 29 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   ID                   2240 non-null   int64  
 1   Year_Birth           2240 non-null   int64  
 2   Education            2240 non-null   object 
 3   Marital_Status       2240 non-null   object 
 4   Income               2216 non-null   float64
 5   Kidhome              2240 non-null   int64  
 6   Teenhome             2240 non-null   int64  
 7   Dt_Customer          2240 non-null   object 
 8   Recency              2240 non-null   int64  
 9   MntWines             2240 non-null   int64  
 10  MntFruits            2240 non-null   int64  
 11  MntMeatProducts      2240 non-null   int64  
 12  MntFishProducts      2240 non-null   int64  
 13  MntSweetProducts     2240 non-null   int64  
 14  MntGoldProds         2240 non-null   int64  
 15  NumDealsPurchases    2240 non-null   i

In [None]:
df.isnull().sum()

Unnamed: 0,0
ID,0
Year_Birth,0
Education,0
Marital_Status,0
Income,24
Kidhome,0
Teenhome,0
Dt_Customer,0
Recency,0
MntWines,0


#Checking dataset size and data types.
#As there are only 24 records with missing income value and we beleive Income is significant feature therefore, we are dropping this few records

In [None]:
df.dropna(inplace = True)

In [None]:
df.shape

(2216, 29)

Checking redundancy

In [None]:
df.duplicated().sum()

np.int64(0)

In [None]:
df.isnull().sum()

Unnamed: 0,0
ID,0
Year_Birth,0
Education,0
Marital_Status,0
Income,0
Kidhome,0
Teenhome,0
Dt_Customer,0
Recency,0
MntWines,0


In [None]:
df.dtypes.value_counts()

Unnamed: 0,count
int64,25
object,3
float64,1


In [None]:
df.describe(include="object")

Unnamed: 0,Education,Marital_Status,Dt_Customer
count,2216,2216,2216
unique,5,8,662
top,Graduation,Married,31-08-2012
freq,1116,857,12


In [None]:
df.describe(exclude='object')

Unnamed: 0,ID,Year_Birth,Income,Kidhome,Teenhome,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
count,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,...,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0,2216.0
mean,5588.353339,1968.820397,52247.251354,0.441787,0.505415,49.012635,305.091606,26.356047,166.995939,37.637635,...,5.319043,0.073556,0.074007,0.073105,0.064079,0.013538,0.009477,3.0,11.0,0.150271
std,3249.376275,11.985554,25173.076661,0.536896,0.544181,28.948352,337.32792,39.793917,224.283273,54.752082,...,2.425359,0.261106,0.261842,0.260367,0.24495,0.115588,0.096907,0.0,0.0,0.357417
min,0.0,1893.0,1730.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
25%,2814.75,1959.0,35303.0,0.0,0.0,24.0,24.0,2.0,16.0,3.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
50%,5458.5,1970.0,51381.5,0.0,0.0,49.0,174.5,8.0,68.0,12.0,...,6.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
75%,8421.75,1977.0,68522.0,1.0,1.0,74.0,505.0,33.0,232.25,50.0,...,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
max,11191.0,1996.0,666666.0,2.0,2.0,99.0,1493.0,199.0,1725.0,259.0,...,20.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,11.0,1.0


# Dropping rows with missing Income.
# Checking duplicates to confirm each record is unique.


In [None]:
# sns.pairplot(df)

In [None]:
corr_matrix = df.corr(numeric_only=True)

fig = px.imshow(
    corr_matrix,
    text_auto=True,
    color_continuous_scale='RdBu_r',
    title='Correlation Heatmap (Plotly)',
    aspect='auto'
)

fig.update_layout(
    xaxis_title="Features",
    yaxis_title="Features",
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    width=1600,
    height=1600
)

fig.show()


# Correlation heatmap to understand relationships.
# Spending features are strongly correlated with each other.
# All purchase-related features also move together.
# Campaign acceptance features correlate with Response.


In [None]:
# px.histogram(df, x="Income", nbins=100, title="Income Distribution")
def plot_income_distribution(df):
    fig = px.histogram(df, x="Income", nbins=100, title="Income Distribution", text_auto=True)
    fig.update_layout(xaxis_title="Income", yaxis_title="Frequency", bargap=0.2)
    fig.show()

plot_income_distribution(df)

In [None]:
df[df['Income']>600000]

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
2233,9432,1977,Graduation,Together,666666.0,1,0,02-06-2013,23,9,...,6,0,0,0,0,0,0,3,11,0


Here we can observe that there is one record which has 666666$ income, we would like to consider this customer as Extreme Rich customer, so we would like to categorised them in different category. In future it may be sensitive to clustering algorithm, therefore we are considering it as outlier, although we know and we accept that the income value is legiteimate and could be achievable.

# Plotting income distribution to identify extreme outliers.
# Removing the 666k income and incomes >120k so clustering is not distorted.


In [None]:
mask = df['Income'] > 600000
removed = int(mask.sum())
df = df.loc[~mask].copy()

In [None]:
# px.histogram(df, x="Income", nbins=100, title="Income Distribution")
def plot_income_distribution(df):
    fig = px.histogram(df, x="Income", nbins=100, title="Income Distribution", text_auto=True)
    fig.update_layout(xaxis_title="Income", yaxis_title="Frequency", bargap=0.2)
    fig.show()

plot_income_distribution(df)

After removing the ultra-high income record (666,666), we plot the income distribution again. The histogram now looks much more realistic and smoother. Most customers still fall in the 20k–80k range, which represents the main purchasing population.

We can still see a few high-income customers above 120k–160k, but these values are much closer to the rest of the dataset and do not distort the shape of the distribution.

By removing the one extreme outlier, we make the dataset more stable for clustering. This prevents the algorithm from forming unnecessary clusters around a single unusual customer and keeps the segmentation focused on meaningful income groups that represent the majority of our customers.

In [None]:
mask = df['Income'] > 120000
removed = int(mask.sum())
df = df.loc[~mask].copy()

In [None]:
# px.histogram(df, x="Income", nbins=100, title="Income Distribution")
def plot_income_distribution(df):
    fig = px.histogram(df, x="Income", nbins=100, title="Income Distribution", text_auto=True)
    fig.update_layout(xaxis_title="Income", yaxis_title="Frequency", bargap=0.2)
    fig.show()

plot_income_distribution(df)

In this step, we remove all customers with income above 120,000. These values are still valid in real life, but they are very rare in this dataset and behave as high-influence outliers. If we keep them, the clustering algorithm may try to form a separate cluster for a very small number of customers, which reduces the quality of the segmentation.

After removing these high-income outliers, the histogram becomes much smoother and more balanced. Almost all customers now fall between 20k and 90k, which represents a realistic middle-to-upper-middle income group.

This cleaned income distribution is much more suitable for clustering because:

It avoids extreme values pulling the centroids,

It allows clusters to form around meaningful spending-power groups,

It reflects the behaviour of the majority of the customer base.

This step ensures that our segmentation will focus on realistic customer groups rather than a few exceptional high-income individuals.

In [None]:
df['Age'] = datetime.now().year - df['Year_Birth']

fig = px.histogram(df, x='Age',nbins=20, title='Age Distribution',labels={'Age': 'Age', 'count': 'Frequency'},text_auto=True,color_discrete_sequence=['indigo'])
fig.update_layout(xaxis_title="Age", yaxis_title="Frequency", bargap=0.2)
fig.show()


Here we can see that there are 3 records with age over 120, which is suspicious.

We calculate each customer’s age from the Year_Birth column and plot the distribution. We can see that most customers fall between 40 and 70 years old, with the highest concentration around their 50s. This suggests that the company’s customer base is mainly made up of middle-aged adults.

We also notice a few extreme age values above 100, including two customers aged around 120+. These ages are unrealistic and clearly represent data entry errors. Such outliers can mislead the clustering algorithm because KMeans might try to create a separate group for these invalid values.

Identifying this pattern is important because it tells us that we need to clean unrealistic age values before performing segmentation.

In [None]:
df[df['Age']>120]

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Age
192,7829,1900,2n Cycle,Divorced,36640.0,1,0,26-09-2013,99,15,...,0,0,0,0,0,1,3,11,0,125
239,11004,1893,2n Cycle,Single,60182.0,0,1,17-05-2014,23,8,...,0,0,0,0,0,0,3,11,0,132
339,1150,1899,PhD,Together,83532.0,0,0,26-09-2013,36,755,...,0,0,1,0,0,0,3,11,0,126


In [None]:
df['Age'].dtypes

dtype('int64')

In [None]:
mask = df['Age'] > 120
removed = int(mask.sum())
df = df.loc[~mask].copy()

print(f"Removed {removed} rows with Age > 120. New shape: {df.shape}")

Removed 3 rows with Age > 120. New shape: (2205, 30)


In [None]:
def age_type(a):
    if a is None:
        return 'Unknown'
    if a <= 12:  return 'Child'
    if a <= 19:  return 'Teen'
    if a <= 24:  return 'Young'
    if a <= 60:  return 'Adult'
    if a <= 90:  return 'Senior'
    return 'Old'

# ensure numeric
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')
df['Cust_AgeCatg'] = df['Age'].apply(age_type)


Here we identify customers with an age greater than 120 years, which is unrealistic and clearly the result of data entry errors. We find 3 such records, and we remove them to keep our dataset clean and suitable for clustering. Keeping invalid ages would create misleading patterns and could force the model to form a separate cluster around these incorrect values.

After cleaning the outliers, we create an Age Category feature using a custom function. This new feature groups customers into categories such as Child, Teen, Young, Adult, Senior, and Old.

This step does not directly affect the numerical clustering yet, but it helps us later when interpreting clusters in human terms. Instead of saying a cluster has an average age of 55, we can describe it more meaningfully—such as “this cluster mainly contains senior and adult customers.”

In [None]:
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Age,Cust_AgeCatg
0,5524,1957,Graduation,Single,58138.0,0,0,04-09-2012,58,635,...,0,0,0,0,0,3,11,1,68,Senior
1,2174,1954,Graduation,Single,46344.0,1,1,08-03-2014,38,11,...,0,0,0,0,0,3,11,0,71,Senior
2,4141,1965,Graduation,Together,71613.0,0,0,21-08-2013,26,426,...,0,0,0,0,0,3,11,0,60,Adult
3,6182,1984,Graduation,Together,26646.0,1,0,10-02-2014,26,11,...,0,0,0,0,0,3,11,0,41,Adult
4,5324,1981,PhD,Married,58293.0,1,0,19-01-2014,94,173,...,0,0,0,0,0,3,11,0,44,Adult


In [None]:
df['Cust_AgeCatg'].value_counts()

Unnamed: 0_level_0,count
Cust_AgeCatg,Unnamed: 1_level_1
Adult,1434
Senior,771


In [None]:
df['Education'].value_counts()

Unnamed: 0_level_0,count
Education,Unnamed: 1_level_1
Graduation,1113
PhD,476
Master,364
2n Cycle,198
Basic,54


After creating the Cust_AgeCatg feature, we check how many customers fall into each age group. Most customers are in the Adult and Senior categories, which matches what we saw earlier in the age distribution. This tells us that the company mainly attracts middle-aged and older customers, which may influence spending behaviour and campaign responsiveness.

We also look at the distribution of customer Education levels. The majority of customers have Graduation or PhD/Master’s level education, while only a small number fall into lower education categories such as Basic or 2n Cycle.

Knowing the education distribution helps us understand the type of customer base we are working with. Higher education levels often correlate with higher income and more informed purchasing decisions, which will be useful when interpreting the segments later.

In [None]:
fig = px.pie(
    df,
    names='Education',
    title='Education Level Proportion'
)
fig.show()


This pie chart shows the proportion of customers based on their education level. We observe that more than half of the customers (around 50%) have completed Graduation, followed by PhD and Master’s degree holders. Only a very small proportion belong to the 2n Cycle or Basic education categories.

This tells us that the customer base is mostly well-educated, which often correlates with higher and more stable income levels. This information is useful for segmentation because education can influence spending habits, product preferences, and how customers respond to marketing campaigns.

Understanding this distribution helps us interpret clusters later—for example, higher-education groups may align more with higher-spending or more responsive segments.

In [None]:
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
       'Age', 'Cust_AgeCatg'],
      dtype='object')

In [None]:
fig = px.pie(
    df,
    names='Marital_Status',
    title='Marital_Status Proportion'
)
fig.show()


This pie chart shows the distribution of customers based on their marital status. The majority of customers are Married (38.7%) or Together (25.8%), which means more than 64% of the customer base lives with a partner. Around 21.3% are Single, and about 10% are Divorced.

There are also very small categories such as Widow, Alone, Absurd, and YOLO, each representing less than 1% of the data. These values are too rare to contribute meaningfully to clustering and may introduce noise.

Understanding this distribution is useful because marital status often influences household size, spending habits, and purchasing priorities. Later, when building clusters, we will group all these very small categories into “Others” so that the clustering algorithm focuses on the major groups (Married, Together, Single, Divorced) without being distracted by tiny or inconsistent categories.

# Features

In [None]:
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
       'Age', 'Cust_AgeCatg'],
      dtype='object')

Creating important feature which could be relavance

In [None]:
print(df['Dt_Customer'].dtypes)

object


In [None]:
df['Dt_Customer'] = pd.to_datetime(df['Dt_Customer'], format = '%d-%m-%Y')
print(df['Dt_Customer'].dtypes)

datetime64[ns]


In [None]:
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Age,Cust_AgeCatg
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,0,0,0,0,0,3,11,1,68,Senior
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,0,0,0,0,0,3,11,0,71,Senior
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,0,0,0,0,0,3,11,0,60,Adult
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,0,0,0,0,0,3,11,0,41,Adult
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,0,0,0,0,0,3,11,0,44,Adult


In this step, we convert the Dt_Customer column from a string format into a proper datetime format. This allows us to perform date calculations accurately. The join date is important because it helps us understand how long each customer has been connected with the company.

After converting the column, we create a new feature called Tenure, which measures how many years a customer has been with the organisation. We calculate tenure by taking the difference between today’s date and the customer’s join date, and then converting the number of days into years.

This feature is valuable for segmentation because customers who have been with the company for longer often behave differently than new customers. Long-tenure customers may show higher loyalty, different spending patterns, or higher response rates to campaigns. Including Tenure allows the clustering model to capture these long-term behavioural differences.

Need to determine since how long the customer is conected with the organization

In [None]:
df['Tenure'] = (((datetime.now() - df['Dt_Customer']).dt.days) / 365.25).round(1)

In [None]:
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response,Age,Cust_AgeCatg,Tenure
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,0,0,0,0,3,11,1,68,Senior,13.2
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,0,0,0,0,3,11,0,71,Senior,11.7
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,0,0,0,0,3,11,0,60,Adult,12.3
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,0,0,0,0,3,11,0,41,Adult,11.8
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,0,0,0,0,3,11,0,44,Adult,11.9


Creating Family size feature

In [None]:
df['Family_Members'] = df['Kidhome'] + df['Teenhome'] + 2 #Assuming 2 Adults
# df.drop(['Kidhome','Teenhome'], axis=1, inplace=True)

Combining spending related features

In [None]:
spending_cols = ['MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
 'MntSweetProducts', 'MntGoldProds']

df['Total_Spending'] = df[spending_cols].sum(axis=1)
# df.drop(spending_cols, axis=1,inplace=True)

Combining Purchase related features

In [None]:
purchase_cols = ['NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
                 'NumStorePurchases', 'NumWebVisitsMonth']
df['Total_Purchase'] = df[purchase_cols].sum(axis=1)



In [None]:
# df['Online_Purchase'] = df['NumWebPurchases'] + df['NumCatalogPurchases']
# df.drop(purchase_cols, axis=1, inplace=True )

Looking at Campaign Success Rate

In [None]:
camp_cols = ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
             'AcceptedCmp4', 'AcceptedCmp5']

df['Total_AcceptedCmp'] = df[camp_cols].sum(axis=1)
# df.drop(camp_cols, axis=1, inplace=True)

In [None]:
df['Marital_Status'].value_counts()

Unnamed: 0_level_0,count
Marital_Status,Unnamed: 1_level_1
Married,854
Together,568
Single,470
Divorced,230
Widow,76
Alone,3
Absurd,2
YOLO,2


More Generalizing the Marital_Status feature

In [None]:
rare_categories = ['Widow','Alone','Absurd','YOLO']

df['Marital_Status'] = df['Marital_Status'].replace(rare_categories,'Others')

In [None]:
df['Marital_Status'].value_counts()

Unnamed: 0_level_0,count
Marital_Status,Unnamed: 1_level_1
Married,854
Together,568
Single,470
Divorced,230
Others,83


In [None]:
df.head()

Unnamed: 0,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,...,Z_CostContact,Z_Revenue,Response,Age,Cust_AgeCatg,Tenure,Family_Members,Total_Spending,Total_Purchase,Total_AcceptedCmp
0,5524,1957,Graduation,Single,58138.0,0,0,2012-09-04,58,635,...,3,11,1,68,Senior,13.2,2,1617,32,0
1,2174,1954,Graduation,Single,46344.0,1,1,2014-03-08,38,11,...,3,11,0,71,Senior,11.7,4,27,11,0
2,4141,1965,Graduation,Together,71613.0,0,0,2013-08-21,26,426,...,3,11,0,60,Adult,12.3,2,776,25,0
3,6182,1984,Graduation,Together,26646.0,1,0,2014-02-10,26,11,...,3,11,0,41,Adult,11.8,3,53,14,0
4,5324,1981,PhD,Married,58293.0,1,0,2014-01-19,94,173,...,3,11,0,44,Adult,11.9,3,422,24,0


In this step, we create several new features that help describe customer behaviour more clearly. These engineered features will later help the clustering algorithm detect meaningful patterns.

1. Family Size
We calculate Family_Members by adding the number of kids (Kidhome) and teenagers (Teenhome) and then adding 2 adults. This gives us an estimate of the total household size. Family size is important because it often influences spending levels and product preferences.

2. Total Spending
We combine all spending-related columns such as wines, fruits, meat, fish, sweets, and gold products into one feature called Total_Spending. This single value helps us identify low-spending, moderate-spending, and high-spending customers more easily.

3. Total Purchase Activity
We sum all purchase-related columns (web, store, deals, catalog, and web visits) to create Total_Purchase. This feature reflects how active a customer is. High-activity customers behave differently from occasional buyers, so this feature is important for segmentation.

4. Total Campaign Acceptance
We combine the five campaign acceptance columns into one feature, Total_AcceptedCmp. This shows how responsive a customer is to marketing campaigns. Customers who accept more campaigns may be easier to target with promotions.

5. Cleaning and Generalising Marital Status
We check the counts of each marital status category and notice some rare or unusual labels such as Widow, Alone, Absurd, and YOLO. These categories are too small to be meaningful for clustering.

To avoid noise, we group all these rare categories into a single label called “Others”. This keeps the feature clean and ensures that the clustering focuses on major customer groups like Married, Together, Single, and Divorced.

After creating all these new features, we update the dataset and can now see meaningful behavioural variables such as Tenure, Family_Members, Total_Spending, Total_Purchase, and Total_AcceptedCmp, which will be important inputs for our customer segmentation model.

#Aspect 1

# Feature Selection


In [None]:
selected_features = ['Income', 'Recency', 'Age', 'Tenure', 'Family_Members',
       'Total_Spending', 'Total_Purchase', 'Total_AcceptedCmp']

In [None]:
df[selected_features]

Unnamed: 0,Income,Recency,Age,Tenure,Family_Members,Total_Spending,Total_Purchase,Total_AcceptedCmp
0,58138.0,58,68,13.2,2,1617,32,0
1,46344.0,38,71,11.7,4,27,11,0
2,71613.0,26,60,12.3,2,776,25,0
3,26646.0,26,41,11.8,3,53,14,0
4,58293.0,94,44,11.9,3,422,24,0
...,...,...,...,...,...,...,...,...
2235,61223.0,46,58,12.5,3,1341,23,0
2236,64014.0,56,79,11.5,5,444,29,1
2237,56981.0,91,44,11.8,2,1241,25,1
2238,69245.0,8,69,11.9,3,843,26,0


In this step, we selected only the features that are meaningful for customer segmentation. Not every column in the dataset helps in understanding customer behaviour, so we kept the most important ones:

Income – tells us the financial strength of the customer.

Recency – how recently the customer made a purchase.

Age – helps understand age-based purchasing patterns.

Tenure – how long the customer has been with the company.

Family_Members – useful because family size can change spending behaviour.

Total_Spending – measures total money spent across all product categories.

Total_Purchase – how many purchases they made across all channels.

Total_AcceptedCmp – measures campaign responsiveness.

These features capture income level, purchasing behaviour, loyalty, and responsiveness, which are the core indicators needed to cluster customers into meaningful groups.

Scaling the features

In [None]:
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(df[selected_features])

In [None]:
# df_scaled = pd.DataFrame(X_scaled, columns = df[selected_features].columns)
df_scaled = df[selected_features]

In [None]:
df_scaled

Unnamed: 0,Income,Recency,Age,Tenure,Family_Members,Total_Spending,Total_Purchase,Total_AcceptedCmp
0,58138.0,58,68,13.2,2,1617,32,0
1,46344.0,38,71,11.7,4,27,11,0
2,71613.0,26,60,12.3,2,776,25,0
3,26646.0,26,41,11.8,3,53,14,0
4,58293.0,94,44,11.9,3,422,24,0
...,...,...,...,...,...,...,...,...
2235,61223.0,46,58,12.5,3,1341,23,0
2236,64014.0,56,79,11.5,5,444,29,1
2237,56981.0,91,44,11.8,2,1241,25,1
2238,69245.0,8,69,11.9,3,843,26,0


Calc Silhoutte Score

In order to determine the optimal number of clusters we need to use Elbow method or Silhoutte score method.

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import plotly.graph_objects as go
from plotly.subplots import make_subplots

k_values = list(range(2, 21))

wcss = []
silhouette_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(df_scaled[selected_features])

    # WCSS
    wcss.append(kmeans.inertia_)

    # Silhouette Score
    sil_score = silhouette_score(df_scaled[selected_features], labels)
    silhouette_scores.append(sil_score)


fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Elbow Method (WCSS)", "Silhouette Score vs k")
)

# Elbow curve (WCSS)
fig.add_trace(
    go.Scatter(
        x=k_values,
        y=wcss,
        mode="lines+markers",
        name="WCSS"
    ),
    row=1, col=1
)

# Silhouette curve
fig.add_trace(
    go.Scatter(
        x=k_values,
        y=silhouette_scores,
        mode="lines+markers",
        name="Silhouette Score"
    ),
    row=1, col=2
)

fig.update_xaxes(title_text="Number of clusters (k)", row=1, col=1)
fig.update_yaxes(title_text="WCSS", row=1, col=1)

fig.update_xaxes(title_text="Number of clusters (k)", row=1, col=2)
fig.update_yaxes(title_text="Silhouette Score", row=1, col=2)

fig.update_layout(
    title="KMeans Cluster Evaluation: Elbow & Silhouette",
    width=1000,
    height=400
)

fig.show()


In this step, we test different values of k (number of clusters) from 2 to 20 and evaluate each KMeans model using two metrics:

WCSS (Within-Cluster Sum of Squares) – used in the Elbow method. Lower WCSS means points are closer to their cluster centres.

Silhouette Score – measures how well-separated the clusters are (values closer to 1 are better).

In the Elbow plot (left), WCSS drops quickly when k increases from 2 to around 5, and after that the curve becomes flatter. This “elbow” shape suggests that adding more than 5 clusters does not give a big improvement.

In the Silhouette plot (right), the best score is at k = 2, but that would give us only “high-level” separation and would be too simple for rich customer segmentation. For k between 4 and 6, the silhouette score is still reasonably good and stable.

Based on both plots together, we decide to use k = 5 clusters for this aspect. This gives:

enough groups to capture different customer behaviours,

a reasonable silhouette score,

and a clear elbow in the WCSS curve.

So, k = 5 is a good balance between model quality and business interpretability for our customer segmentation.

In [None]:
kmeans = KMeans(
  n_clusters=5,
  init='k-means++',
  n_init=100,
  random_state=82)

df['Clusters'] = kmeans.fit_predict(df_scaled[selected_features])

In [None]:
cluster_summary = df.groupby('Clusters')[selected_features].mean()

In [None]:
cluster_summary

Unnamed: 0_level_0,Income,Recency,Age,Tenure,Family_Members,Total_Spending,Total_Purchase,Total_AcceptedCmp
Clusters,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,66613.836364,48.90101,58.539394,12.428283,2.743434,1023.09697,24.949495,0.282828
1,20946.409222,50.086455,50.14121,12.427666,2.936599,72.216138,14.492795,0.069164
2,51535.380457,49.580042,59.559252,12.385447,3.278586,439.365904,21.715177,0.203742
3,82167.552486,49.610497,57.519337,12.344475,2.303867,1441.765193,23.679558,0.944751
4,36637.103846,47.446154,53.548077,12.3825,3.296154,140.953846,15.769231,0.107692


In [None]:
df['Clusters'].value_counts()

Unnamed: 0_level_0,count
Clusters,Unnamed: 1_level_1
4,520
0,495
2,481
3,362
1,347


In this step, we run KMeans with 5 clusters using the selected features (Income, Recency, Age, Tenure, Family size, Total_Spending, Total_Purchase, and Total_AcceptedCmp).
We store the cluster label for each customer in a new column called Clusters, and then calculate the average value of each feature per cluster to understand the profile of every group.

From the cluster_summary table we can see clear differences between the clusters:

Cluster 3 has the highest Income, highest Total_Spending and the highest campaign acceptance, so this looks like our VIP / premium customers.

Cluster 1 has low Income, very low Total_Spending and low purchases, so this group represents low-value, price-sensitive customers.

Clusters 0, 2 and 4 sit in between: they have medium incomes and spending, with different family sizes and slightly different campaign responses. These can be seen as middle-tier customer segments with varying value and engagement levels.

This shows that the clustering is not random – it is separating customers into groups that make sense in terms of value and behaviour.

In [None]:
import plotly.express as px

counts = (
    df['Clusters'].value_counts()
      .sort_index()
      .rename_axis('cluster')
      .reset_index(name='count')
)
counts['percent'] = (counts['count'] / counts['count'].sum() * 100).round(1)

fig = px.bar(
    counts, x='cluster', y='count',
    text=counts['percent'].astype(str) + '%',
    title='Number of points per cluster'
)
fig.update_traces(textposition='outside')
fig.update_layout(xaxis_title='Cluster', yaxis_title='Count')
fig.show()


The bar chart shows how many customers fall into each of the 5 clusters, along with the percentage of the total dataset. All clusters have a similar size (around 15–24% each), and there is no extremely small or extremely large cluster.

This is good for customer segmentation because each group is large enough to be interesting for marketing actions, and the model is not dominated by one huge cluster with a few tiny ones. It means our 5-cluster solution is well balanced and practical for designing different strategies for each segment.

Here we can observe that the data points has be assigned to the appropriate clusters and all the cluster has proper data points appears to be balanced result.

**Cluster 0 – “Upper-Middle Income, High Spenders”**
| Feature            | Value | Interpretation           |
| ------------------ | ----- | ------------------------ |
| Income             | 66k   | Middle-high income       |
| Total Spending     | 1023  | High spender             |
| Total Purchase     | 24.9  | Active customer          |
| Accepted Campaigns | 0.28  | Willing to accept offers |

These are strong customers. Good for targeted campaigns.

**Cluster 1 – “Low Income, Low Spending, Low Response”**
| Feature            | Value | Interpretation                 |
| ------------------ | ----- | ------------------------------ |
| Income             | 20k   | Low income                     |
| Total Spending     | 72    | Very low spender               |
| Total Purchase     | 14.4  | Lower purchasing activity      |
| Accepted Campaigns | 0.06  | Almost never accepts marketing |

This is a low-value, cost-sensitive customer group.

**Cluster 2 – “Mid-Income, Moderate Spending, Large Families”**
| Feature        | Value | Interpretation  |
| -------------- | ----- | --------------- |
| Income         | 51k   | Moderate income |
| Total Spending | 439   | Mid spender     |
| Family Size    | 3.27  | Larger families |

Family-oriented buyers who spend moderately.

**Cluster 3 – “High-Income Power Spenders”**

| Feature            | Value | Interpretation                  |
| ------------------ | ----- | ------------------------------- |
| Income             | 82k   | High income                     |
| Total Spending     | 1441  | Extremely high spender          |
| Total Purchases    | 23.7  | Very active                     |
| Accepted Campaigns | 0.94  | Responds strongly to promotions |

Your BEST, premium customers.
High value + high engagement.

**Cluster 4 – “Young-to-Mid Adults, Moderate Income, Low Spending”**

| Feature        | Value | Interpretation  |
| -------------- | ----- | --------------- |
| Income         | 36k   | Mid-low income  |
| Total Spending | 140   | Low spender     |
| Age            | 53    | Mixed age group |
| Family Size    | 3.29  | Larger families |

Limited spending, possibly price-conscious families.

In [None]:
import plotly.express as px

cluster_heat = cluster_summary.copy()
cluster_heat.index = cluster_heat.index.astype(str)

fig_heat = px.imshow(
    cluster_heat,
    labels=dict(x="Features", y="Clusters", color="Mean Value"),
    x=cluster_heat.columns,
    y=cluster_heat.index,
    title="Cluster Centroid Heatmap",
    aspect="auto"
)

fig_heat.update_xaxes(side="top")
fig_heat.show()


This heatmap shows the mean value of each feature inside every cluster. Each row represents one customer segment, and each column is one behaviour metric such as income, spending, recency, or campaign response.

The colour intensity makes it easy to compare clusters side-by-side:

Cluster 3 clearly stands out with the highest income and highest total spending, confirming it is our premium / high-value group.

Cluster 1 has the lowest income and lowest total spending, which represents budget-sensitive customers.

The other clusters fall between these two extremes, each with different combinations of age, tenure, and purchase behaviour.

This plot gives a quick visual summary of how each segment is different and helps us understand what makes every cluster unique.

In [None]:
import plotly.express as px

df_box = df.copy()
df_box['Cluster'] = df_box['Clusters'].astype(str)

fig_box_spend = px.box(
    df_box,
    x='Cluster',
    y='Total_Spending',
    title='Total Spending Distribution by Cluster',
    points='outliers'
)

fig_box_spend.show()


This box plot shows how customer spending differs across the five clusters. Each box represents the spread of yearly total spending for that segment.

We can clearly see strong differences:

Cluster 3 has the highest spending, with a large interquartile range and many customers spending above 1500. This confirms it as our top-value / premium customer segment.

Cluster 0 also shows high and consistent spending, but slightly lower than Cluster 3.

Cluster 2 and Cluster 4 have moderate spending, indicating mid-value customers.

Cluster 1 has the lowest spending, with a very small range and low median, showing this segment is mostly low-engagement, low-value customers.

The box plot helps validate that the clusters based on behaviour are meaningful: high-value customers naturally fall into their own groups, and low spenders form separate clusters. This supports the usefulness of the segmentation for marketing strategies.

In [None]:
# Cluster summary (centroids)
cluster_summary = df.groupby('Clusters')[selected_features].mean()

# Normalize values for radar chart
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
cluster_norm = pd.DataFrame(
    scaler.fit_transform(cluster_summary),
    columns=cluster_summary.columns
)
cluster_norm['Cluster'] = cluster_norm.index


In [None]:
import plotly.graph_objects as go

categories = selected_features

fig = go.Figure()

for i, row in cluster_norm.iterrows():
    fig.add_trace(go.Scatterpolar(
        r=row[categories].values,
        theta=categories,
        fill='toself',
        name=f'Cluster {row["Cluster"]}'
    ))

fig.update_layout(
    polar=dict(
        radialaxis=dict(visible=True, range=[0, 1])
    ),
    showlegend=True,
    title='Radar Chart of Cluster Profiles',
    width=900,
    height=700
)

fig.show()


The radar chart gives a side-by-side comparison of all five clusters across the key behavioural features we used for segmentation (income, age, spending, purchases, family size, recency, etc.).

Because all values are normalised between 0 and 1, this plot makes it easy to see how each cluster is different:

Cluster 3 stands out with very high income, high spending, high purchase counts, and strong campaign responsiveness. This confirms it as our top-value premium customer group.

Cluster 1 shows low values across almost every feature, especially spending and purchases. This segment represents low-engagement, low-spending customers.

Cluster 0 has moderately high income and good purchase activity, making it a mid-to-high value group.

Cluster 2 has older customers with larger families, moderate income, and medium spending.

Cluster 4 has the largest families, lower income, and low spending, suggesting a budget-conscious household segment.

The radar chart summarises all clusters visually in one place, helping us quickly understand the strengths and weaknesses of each customer segment.

#Evaluation

In [None]:
from sklearn.metrics import silhouette_score

sil_score = silhouette_score(df_scaled[selected_features], df['Clusters'])
sil_score


np.float64(0.5374206089948406)

| Score           | Meaning                                   |
| --------------- | ----------------------------------------- |
| **0.70 – 1.00** | Excellent, very distinct clusters         |
| **0.50 – 0.70** | **Very good, well-separated clusters**    |
| **0.30 – 0.50** | Good / acceptable                         |
| **0.10 – 0.30** | Weak clustering                           |
| **< 0.00**      | Bad clustering                            |


In [None]:
from sklearn.metrics import davies_bouldin_score

db_score = davies_bouldin_score(df_scaled[selected_features], df['Clusters'])
db_score


np.float64(0.530370151358658)

To evaluate how good our customer segmentation is, we use two standard clustering metrics:

1. Silhouette Score ≈ 0.537
A silhouette score above 0.50 indicates well-separated, meaningful clusters.
Our score of ~0.53 shows that customers inside each cluster behave similarly, while customers across clusters behave differently.
This confirms that our 5-cluster solution is strong and reliable.

2. Davies–Bouldin Score ≈ 0.53
For this metric, lower is better (0 = perfect clustering).
A value around 0.5 indicates good cluster separation, meaning the clusters are compact and not heavily overlapping.

Together, these metrics confirm that our segmentation is valid, stable, and well-structured, making it suitable for real marketing strategies and customer profiling.

#Aspect 2


Anlaysing the customer based on their interest shown in the marketing campaign

In [None]:
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
       'Age', 'Cust_AgeCatg', 'Tenure', 'Family_Members', 'Total_Spending',
       'Total_Purchase', 'Total_AcceptedCmp', 'Clusters'],
      dtype='object')

In [None]:
campaign_features = [
    'AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3',
    'AcceptedCmp4', 'AcceptedCmp5', 'Response',
    'Total_AcceptedCmp'
]

behavior_features = [
    'Income', 'Total_Spending', 'Tenure','Family_Members'
]

selected_features_mkt = campaign_features + behavior_features

In [None]:
corr_matrix = df[selected_features_mkt].corr(numeric_only=True)

fig = px.imshow(
    corr_matrix,
    text_auto=True,
    color_continuous_scale='RdBu_r',
    title='Correlation Heatmap (Plotly)',
    aspect='auto'
)

fig.update_layout(
    xaxis_title="Features",
    yaxis_title="Features",
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    width=700,
    height=700

)

fig.show()


In this part we focus on customers’ interest in marketing campaigns.

First, we group all the campaign-related variables (AcceptedCmp1–AcceptedCmp5 and Response) and create Total_AcceptedCmp as the total number of campaigns accepted. Then we combine these with key behavioural features: Income, Total_Spending, Tenure, and Family_Members.

We then build a correlation heatmap to see how campaign acceptance is related to customer behaviour:

All the AcceptedCmp* variables and Response are strongly positively correlated with Total_AcceptedCmp. This confirms that Total_AcceptedCmp is a good single measure of how responsive a customer is to marketing campaigns.

Total_AcceptedCmp has a moderate positive correlation with both Income and Total_Spending. This means customers who earn more and spend more are also more likely to accept marketing campaigns. These are high-value, marketing-responsive customers.

The correlation between Tenure and campaign acceptance is very weak, which suggests that just being with the company for a long time does not automatically mean the customer responds more to campaigns.

Family_Members shows a negative correlation with both Total_AcceptedCmp and Total_Spending. Larger families seem to be more budget-conscious and less likely to accept campaigns, probably because they are more careful with spending.

Overall, campaign responsiveness is strongly linked to spending power and income, and less to tenure, while family size tends to reduce both spending and campaign acceptance. This gives us a clear picture of which customer types are most attractive for targeted marketing.

In [None]:
X = df[selected_features_mkt].dropna().copy()
X_index = X.index

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# X_scaled = X

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

k_values = list(range(2, 11))

wcss = []
sil_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
    labels = kmeans.fit_predict(X_scaled)

    wcss.append(kmeans.inertia_)                     # elbow
    sil_scores.append(silhouette_score(X_scaled, labels))  # quality

# Plot side-by-side
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Elbow Method (WCSS) – KMeans",
                    "Silhouette Score vs k – KMeans")
)

fig.add_trace(
    go.Scatter(x=k_values, y=wcss, mode="lines+markers", name="WCSS"),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=k_values, y=sil_scores, mode="lines+markers", name="Silhouette"),
    row=1, col=2
)

fig.update_xaxes(title_text="Number of clusters (k)", row=1, col=1)
fig.update_yaxes(title_text="WCSS", row=1, col=1)

fig.update_xaxes(title_text="Number of clusters (k)", row=1, col=2)
fig.update_yaxes(title_text="Silhouette Score", row=1, col=2)

fig.update_layout(
    title="KMeans Evaluation for Marketing-Interest Features",
    width=1000,
    height=450
)

fig.show()


curve starts to flatten. This tells us that adding more than 4–5 clusters gives smaller and smaller improvement in compactness.

Silhouette score (right): The best score is at k = 2, but that would only give a very coarse split (e.g., “responders vs non-responders”). For k between 4 and 6, the silhouette score is lower but stays relatively stable, meaning the clusters are still reasonably well separated.

So, from a pure metric view, the elbow suggests around 4–5 clusters, while the silhouette curve tells us that more than 6 clusters does not bring clear quality gains. In our project we choose k = 5 as a good compromise:

it keeps enough detail to distinguish different campaign-interest patterns,

the quality is still acceptable, and

it also matches the 5-cluster structure we used in the main segmentation, which makes the results easier to interpret and compare.

In [None]:
best_k = k_values[int(np.argmax(sil_scores))]
print("Best k by silhouette:", best_k, "score:", max(sil_scores))


Best k by silhouette: 2 score: 0.5179445995121839


In [None]:
final_k = best_k

kmeans_final = KMeans(n_clusters=final_k, random_state=42, n_init=50)
final_labels = kmeans_final.fit_predict(X_scaled)

df['CampaignCluster_KM'] = np.nan
df.loc[X_index, 'CampaignCluster_KM'] = final_labels


In [None]:
cluster_summary_mkt = (
    df.loc[X_index]
      .groupby('CampaignCluster_KM')[selected_features_mkt]
      .mean()
)


cluster_summary_mkt


Unnamed: 0_level_0,AcceptedCmp1,AcceptedCmp2,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,Response,Total_AcceptedCmp,Income,Total_Spending,Tenure,Family_Members
CampaignCluster_KM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0.0,0.002554,0.001021,0.065884,0.039326,0.0,0.106742,0.108784,48143.335036,489.122574,12.395914,3.039326
1.0,0.554656,0.11336,0.137652,0.352227,0.651822,0.502024,1.809717,79198.659919,1539.838057,12.381377,2.230769


Since the silhouette score was highest for k = 2, the model splits customers into two distinct marketing-interest groups.
Below is what each cluster means based on the centroid values you shared.

# Cluster 0 – “Low-Engagement / Non-Responders”

| Feature                      | Value       | Interpretation                                                                       |
| ---------------------------- | ----------- | ------------------------------------------------------------------------------------ |
| **Income**                   | 48,143      | Low–middle income; limited purchasing power                                          |
| **Total Spending**           | 489         | Moderate spending; buys occasionally but not heavily                                 |
| **Tenure**                   | 12.39 years | Long-time customers; stable but low-engagement                                       |
| **Family Members**           | 3.04        | Larger families → higher financial responsibilities                                  |
| **Total Accepted Campaigns** | 0.11        | Very low responsiveness; rarely accepts marketing promotions                         |
| **AcceptedCmp1–5**           | Mostly ~0   | Almost never accepts any type of campaign                                            |
| **Response (latest)**        | 0.106       | Very low interest in the latest campaign                                             |
| **Overall Pattern**          | —           | Price-sensitive, low-engagement customers who buy only essential or occasional items |


**Interpretation**

This cluster represents customers who:

Rarely accept any campaigns

Spend less

Have stable tenure but very low marketing interest

Show weak response probability

These are low-value, low-engagement customers.
Marketing effort here will have very low ROI.



# Cluster 1 – “High-Engagement / Campaign-Responsive Customers”

| Feature                      | Value       | Interpretation                                                                |
| ---------------------------- | ----------- | ----------------------------------------------------------------------------- |
| **Income**                   | 79,198      | High income; strong purchasing power                                          |
| **Total Spending**           | 1,539       | Heavy spenders; significantly higher buying activity                          |
| **Tenure**                   | 12.38 years | Similar tenure to Cluster 0 (loyal customers)                                 |
| **Family Members**           | 2.23        | Smaller families → more disposable income                                     |
| **Total Accepted Campaigns** | 1.81        | Very high responsiveness; this group accepts many campaigns                   |
| **AcceptedCmp1**             | 0.55        | Strong engagement with promotions                                             |
| **AcceptedCmp4**             | 0.35        | Good interest in mid-term campaigns                                           |
| **AcceptedCmp5**             | 0.65        | Excellent acceptance rate                                                     |
| **Response (latest)**        | 0.50        | Actively responds to recent marketing                                         |
| **Overall Pattern**          | —           | High-value customers who respond well to promotions and generate more revenue |


**Interpretation**

This cluster represents:

Customers who consistently respond to nearly all marketing campaigns

High spenders

Higher income

Higher repeat-purchase activity

These are prime marketing targets — high ROI, loyal, and very campaign-sensitive.

In [None]:
import plotly.express as px

counts = (
    df['CampaignCluster_KM'].value_counts()
      .sort_index()
      .rename_axis('cluster')
      .reset_index(name='count')
)
counts['percent'] = (counts['count'] / counts['count'].sum() * 100).round(1)

fig = px.bar(
    counts, x='cluster', y='count',
    text=counts['percent'].astype(str) + '%',
    title='Number of points per cluster'
)
fig.update_traces(textposition='outside')
fig.update_layout(xaxis_title='Cluster', yaxis_title='Count')
fig.show()


This bar chart shows how many customers fall into each campaign-interest cluster created by KMeans.

Cluster 0 contains about 88.8% of all customers.

Cluster 1 contains about 11.2% of customers.

This tells us that most customers behave in a similar way with respect to campaign interest and behaviour (Cluster 0), while a smaller but still meaningful group has a clearly different pattern (Cluster 1).

From a segmentation point of view, this is useful because:

We can treat Cluster 0 as the “baseline” group and design standard campaigns for them.

We can focus special marketing actions on Cluster 1, since this minority segment behaves differently and may need a different message, frequency, or offer.

Even though there are only two clusters here, the fact that around one in ten customers belongs to a distinct campaign-interest segment is important for targeted marketing decisions.

This chart shows how many customers fall into each marketing segment.

In [None]:
import plotly.graph_objects as go

# Use a subset of most important features for readability
radar_features = [
    'Total_AcceptedCmp',
    'Income',
    'Total_Spending',
    'Tenure',
    'Family_Members'
]

cent_radar = cluster_summary_mkt[radar_features].copy()
cent_radar.index = cent_radar.index.astype(int).astype(str)

# Normalize for radar
scaler_rad = MinMaxScaler()
cent_radar_norm = pd.DataFrame(
    scaler_rad.fit_transform(cent_radar),
    columns=cent_radar.columns,
    index=cent_radar.index
)

fig_radar = go.Figure()
for cluster_id, row in cent_radar_norm.iterrows():
    fig_radar.add_trace(go.Scatterpolar(
        r=row[radar_features].values,
        theta=radar_features,
        fill='toself',
        name=f'Cluster {cluster_id}'
    ))

fig_radar.update_layout(
    polar=dict(radialaxis=dict(visible=True, range=[0, 1])),
    title='Radar Chart – Cluster Profiles',
    showlegend=True,
    width=800,
    height=600
)
fig_radar.show()


Radar chart clearly shows:

**Cluster 1 dominates in:**

Income

Spending

Campaign acceptance

**Cluster 0 dominates in:**

Tenure

Family size

Cluster 1 = High value customers → focus your campaigns and premium offers here.

Cluster 0 = Low campaign responsiveness → don’t waste marketing budget here.

#Evalution

In [None]:
from sklearn.metrics import silhouette_score

labels = df.loc[X_index, 'CampaignCluster_KM'].astype(int)

sil_score = silhouette_score(X_scaled, labels)
print("Silhouette score:", sil_score)


Silhouette score: 0.5179445995121839


Our silhouette score of 0.52 indicates good clustering quality, meaning the customer segments represent real behavioral differences and can be used confidently for targeted marketing strategies.”

#Aspect 3

Analysing how the customer has purchase the product whether they purchase through 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth'.

In [None]:
df.columns

Index(['ID', 'Year_Birth', 'Education', 'Marital_Status', 'Income', 'Kidhome',
       'Teenhome', 'Dt_Customer', 'Recency', 'MntWines', 'MntFruits',
       'MntMeatProducts', 'MntFishProducts', 'MntSweetProducts',
       'MntGoldProds', 'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth',
       'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1',
       'AcceptedCmp2', 'Complain', 'Z_CostContact', 'Z_Revenue', 'Response',
       'Age', 'Cust_AgeCatg', 'Tenure', 'Family_Members', 'Total_Spending',
       'Total_Purchase', 'Total_AcceptedCmp', 'Clusters',
       'CampaignCluster_KM'],
      dtype='object')

In [None]:
purchase_features = [
    'NumDealsPurchases', 'NumWebPurchases',
       'NumCatalogPurchases', 'NumStorePurchases', 'NumWebVisitsMonth','Total_Purchase'
]

behavior_features = [
    'Income', 'Total_Spending', 'Tenure'
]

selected_features_purchase = purchase_features + behavior_features

In [None]:
corr_matrix = df[selected_features_purchase].corr(numeric_only=True)

fig = px.imshow(
    corr_matrix,
    text_auto=True,
    color_continuous_scale='RdBu_r',
    title='Correlation Heatmap (Plotly)',
    aspect='auto'
)

fig.update_layout(
    xaxis_title="Features",
    yaxis_title="Features",
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    width=1000,
    height=1000
)

fig.show()


In [None]:
X = df[selected_features_purchase].dropna().copy()
X_index = X.index

# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
X_scaled = X

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go

k_values = list(range(2, 11))

wcss = []
sil_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=20)
    labels = kmeans.fit_predict(X_scaled)

    wcss.append(kmeans.inertia_)                     # elbow
    sil_scores.append(silhouette_score(X_scaled, labels))  # quality

# Plot side-by-side
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=("Elbow Method (WCSS) – KMeans",
                    "Silhouette Score vs k – KMeans")
)

fig.add_trace(
    go.Scatter(x=k_values, y=wcss, mode="lines+markers", name="WCSS"),
    row=1, col=1
)

fig.add_trace(
    go.Scatter(x=k_values, y=sil_scores, mode="lines+markers", name="Silhouette"),
    row=1, col=2
)

fig.update_xaxes(title_text="Number of clusters (k)", row=1, col=1)
fig.update_yaxes(title_text="WCSS", row=1, col=1)

fig.update_xaxes(title_text="Number of clusters (k)", row=1, col=2)
fig.update_yaxes(title_text="Silhouette Score", row=1, col=2)

fig.update_layout(
    title="KMeans Evaluation for Purchase-Interest Features",
    width=1000,
    height=450
)

fig.show()


In [None]:
best_k = k_values[int(np.argmax(sil_scores))]
print("Best k by silhouette:", best_k, "score:", max(sil_scores))


Best k by silhouette: 2 score: 0.6110461584875907


In [None]:
final_k = best_k

kmeans_final = KMeans(n_clusters=final_k, random_state=42, n_init=50)
final_labels = kmeans_final.fit_predict(X_scaled)

df['PurchaseCluster_KM'] = np.nan
df.loc[X_index, 'PurchaseCluster_KM'] = final_labels


In [None]:
cluster_summary_purchase = (
    df.loc[X_index]
      .groupby('PurchaseCluster_KM')[selected_features_purchase]
      .mean()
)


cluster_summary_purchase


Unnamed: 0_level_0,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,Total_Purchase,Income,Total_Spending,Tenure
PurchaseCluster_KM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0.0,2.22366,5.528651,4.525878,8.112754,3.96488,24.355823,69527.325323,1070.47597,12.398706
1.0,2.409617,2.724844,0.833482,3.617988,6.658949,16.24488,34370.572573,160.09528,12.390027


In [None]:
import plotly.express as px

counts = (
    df['PurchaseCluster_KM'].value_counts()
      .sort_index()
      .rename_axis('cluster')
      .reset_index(name='count')
)
counts['percent'] = (counts['count'] / counts['count'].sum() * 100).round(1)

fig = px.bar(
    counts, x='cluster', y='count',
    text=counts['percent'].astype(str) + '%',
    title='Number of points per cluster'
)
fig.update_traces(textposition='outside')
fig.update_layout(xaxis_title='Cluster', yaxis_title='Count')
fig.show()


#Cluster 0 (Premium Multichannel Buyers)

Most profitable segment

High frequency & high spending

Active across web, store, and catalog

Best candidates for:

✔ Premium offers

✔ Bundles

✔ Exclusive membership

✔ Personalised campaigns



| Feature                 | Value       | Interpretation                                              |
| ----------------------- | ----------- | ----------------------------------------------------------- |
| **NumDealsPurchases**   | 2.22        | Occasionally buys using deals, not heavily discount-driven  |
| **NumWebPurchases**     | 5.53        | Strong web purchasing activity                              |
| **NumCatalogPurchases** | 4.53        | Uses catalog frequently; comfortable with multiple channels |
| **NumStorePurchases**   | 8.11        | Very active store shoppers; prefers in-person buying        |
| **NumWebVisitsMonth**   | 3.96        | Moderate digital engagement                                 |
| **Total_Purchase**      | 24.36       | High overall buying frequency                               |
| **Income**              | 69,527      | High income; strong purchasing power                        |
| **Total_Spending**      | 1070        | Heavy spender; contributes strong revenue                   |
| **Tenure**              | 12.39 years | Long-term loyal customers                                   |


#Cluster 1 (Low-Spending Occasional Buyers)

Browse often but purchase rarely

Income & spending are low

Price-sensitive

Best targeted with:

✔ Discounts

✔ Deal-based offers

✔ Low-cost online ads

✔ Basic essentials

| Feature                 | Value       | Interpretation                                            |
| ----------------------- | ----------- | --------------------------------------------------------- |
| **NumDealsPurchases**   | 2.40        | Slightly deal-driven; responds to promotions occasionally |
| **NumWebPurchases**     | 2.72        | Low online purchasing; less digitally active              |
| **NumCatalogPurchases** | 0.83        | Very low catalog purchases                                |
| **NumStorePurchases**   | 3.62        | Low in-store buying; limited shopping activity            |
| **NumWebVisitsMonth**   | 6.66        | High visits but low purchases → "browsers but not buyers" |
| **Total_Purchase**      | 16.24       | Low buying frequency                                      |
| **Income**              | 34,370      | Low income; restricted purchasing power                   |
| **Total_Spending**      | 160         | Minimal spending; low revenue contribution                |
| **Tenure**              | 12.39 years | Same tenure but lower engagement                          |


In [None]:
from sklearn.metrics import silhouette_score

labels = df.loc[X_index, 'CampaignCluster_KM'].astype(int)

sil_score = silhouette_score(X_scaled, labels)
print("Silhouette score:", sil_score)


Silhouette score: 0.24758903568499832
