Kaggle Link: https://www.kaggle.com/datasets/dev0914sharma/customer-clustering?select=segmentation+data.csv

# Build model for:
    1. Kmeans
    2. DBScan
    3. OPTICS
    4. AffinityPropagation

In [1]:
# libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler, normalize

from sklearn.cluster import KMeans, AffinityPropagation, OPTICS, cluster_optics_dbscan, DBSCAN
from sklearn.metrics import silhouette_score

In [2]:
raw_df = pd.read_csv("segmentation data.csv")

In [3]:
raw_df.drop('ID', axis=1, inplace=True)

In [4]:
raw_df

Unnamed: 0,Sex,Marital status,Age,Education,Income,Occupation,Settlement size
0,0,0,67,2,124670,1,2
1,1,1,22,1,150773,1,2
2,0,0,49,1,89210,0,0
3,0,0,45,1,171565,1,1
4,0,0,53,1,149031,1,1
...,...,...,...,...,...,...,...
1995,1,0,47,1,123525,0,0
1996,1,1,27,1,117744,1,0
1997,0,0,31,0,86400,0,0
1998,1,1,24,1,97968,0,0


In [5]:
cols = raw_df.columns.tolist()
cols

['Sex',
 'Marital status',
 'Age',
 'Education',
 'Income',
 'Occupation',
 'Settlement size']

In [6]:
cat_cols = ['Sex','Marital status','Education','Occupation','Settlement size']
num_cols = ['Age', 'Income']

In [7]:
# convert categrorical columns data types to category
for col in cat_cols:
    raw_df[col] = raw_df[col].astype('category')

In [8]:
# Check for missing values
raw_df.isna().sum()

Sex                0
Marital status     0
Age                0
Education          0
Income             0
Occupation         0
Settlement size    0
dtype: int64

In [9]:
# Remove duplicates
raw_df.drop_duplicates(inplace=True)

In [10]:
# Scale the numeric cols
scaler = StandardScaler()
raw_df[num_cols] = scaler.fit_transform(raw_df[num_cols])

# Normalize the numeric cols
raw_df[num_cols] = normalize(raw_df[num_cols])

In [11]:
# One Hot Encoding for Categorical Cols
df = pd.get_dummies(raw_df[cat_cols])
df[num_cols] = raw_df[num_cols]

# 1. KMeans

In [12]:
for cluster in range(5,15):
    kmeans = KMeans(n_clusters=cluster)
    kmeans.fit(df)
    pred = kmeans.predict(df)
    score = silhouette_score(df, pred, sample_size=1000, random_state=42)
    print(f"For cluster {cluster}, score is {score}")

For cluster 5, score is 0.25903720785269124
For cluster 6, score is 0.24854572356325608
For cluster 7, score is 0.2658592597399612
For cluster 8, score is 0.2741774200537067
For cluster 9, score is 0.2803042597350791
For cluster 10, score is 0.2849556164379811
For cluster 11, score is 0.2982991518102257
For cluster 12, score is 0.31773087997423294
For cluster 13, score is 0.30594847841348677
For cluster 14, score is 0.3253385210890725


# 2. DBSCAN

In [13]:
for ep in np.arange(0.6, 2, 0.2):
    ep = round(ep, 1)
    dbscan = DBSCAN(min_samples=10, eps=ep)
    dbscan.fit(df)
    pred = dbscan.labels_
    score = silhouette_score(df, pred, sample_size=1000, random_state=42)
    clusters = len(set(pred)) - (1 if -1 in pred else 0)
    print(f"eps: {ep}, score={score}, clusters: {clusters}")

eps: 0.6, score=0.46613693834733283, clusters: 38
eps: 0.8, score=0.4683020343413329, clusters: 37
eps: 1.0, score=0.4637075043391468, clusters: 36
eps: 1.2, score=0.46351192593649454, clusters: 36
eps: 1.4, score=0.46779981958388794, clusters: 37
eps: 1.6, score=0.11985859395819799, clusters: 1
eps: 1.8, score=0.11985859395819799, clusters: 1


# 3. OPTICS

In [14]:
for ep in np.arange(1, 2, 0.2):
    ep = round(ep, 1)
    optics = OPTICS(min_samples=10, xi=0.05, min_cluster_size=0.05)
    optics.fit(df)
    pred = cluster_optics_dbscan(
        reachability=optics.reachability_,
        core_distances=optics.core_distances_,
        ordering=optics.ordering_,
        eps=ep,
    )
    score = silhouette_score(df, pred, sample_size=1000, random_state=42)
    clusters = len(set(pred))
    print(f"eps: {ep}, score={score}, clusters: {clusters}")

eps: 1.0, score=0.4637075043391468, clusters: 37
eps: 1.2, score=0.46351192593649454, clusters: 37
eps: 1.4, score=0.46779981958388794, clusters: 38
eps: 1.6, score=0.11985859395819799, clusters: 2
eps: 1.8, score=0.11985859395819799, clusters: 2


# 4. AffinityPropagation

In [15]:
for damping in np.arange(0.5, 1, 0.1):
    damping = round(damping, 1)
    ap = AffinityPropagation(damping=damping)
    ap.fit(df)
    pred = ap.predict(df)
    score = silhouette_score(df, pred, sample_size=1000, random_state=42)
    clusters = len(set(pred))
    print(f"damping: {damping}, score: {score}, clusters: {clusters}")



damping: 0.5, score: 0.4813342209857105, clusters: 188




damping: 0.6, score: 0.5001537788791665, clusters: 273
damping: 0.7, score: 0.5807693283940689, clusters: 84
damping: 0.8, score: 0.5843712687785615, clusters: 84
damping: 0.9, score: 0.5836650845989586, clusters: 84
