In [None]:
# Customer Segmentation
# This notebook demonstrates customer segmentation with K-Means clustering and chooses the number of clusters by minimizing the Davies-Bouldin Index.


In [None]:
## Data Preparation
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw customer and transaction tables from disk.
customers = pd.read_csv('data/Customers.csv')
transactions = pd.read_csv('data/Transactions.csv')

# Per-customer purchase summary: summed TotalValue, summed Quantity, and the
# count of distinct ProductID values (exposed as NumProductsPurchased).
transaction_agg = (
    transactions
    .groupby('CustomerID')
    .agg({
        'TotalValue': 'sum',
        'Quantity': 'sum',
        'ProductID': 'nunique',
    })
    .reset_index()
    .rename(columns={'ProductID': 'NumProductsPurchased'})
)

# Left-join the summary onto the customer ID list so every row of `customers`
# is kept; IDs with no matching transactions get NaN, which is replaced by 0.
customer_agg = (
    customers[['CustomerID']]
    .merge(transaction_agg, on='CustomerID', how='left')
    .fillna(0)
)
display(customer_agg.head())


In [None]:
## K-Means Clustering
# Standardize the aggregated features before clustering so that no single
# column dominates the distance computation because of its scale.
features = ['TotalValue', 'Quantity', 'NumProductsPurchased']
scaler = StandardScaler()
customer_agg_scaled = scaler.fit_transform(customer_agg[features])

# Candidate cluster counts are 2..10, capped at n_samples - 1 because
# davies_bouldin_score requires strictly fewer clusters than samples.
max_k = min(10, len(customer_agg_scaled) - 1)
if max_k < 2:
    raise ValueError(
        "At least 3 customers are required to evaluate a clustering "
        f"(got {len(customer_agg_scaled)})."
    )

# Fit one model per candidate k and record its Davies-Bouldin index
# (lower is better). n_init is pinned explicitly: scikit-learn changed the
# default from 10 to 'auto' in version 1.4, so relying on the default makes
# the clustering version-dependent and emits a FutureWarning on 1.2/1.3.
db_scores = []
models_by_k = {}
for k in range(2, max_k + 1):
    model = KMeans(n_clusters=k, n_init=10, random_state=42)
    model.fit(customer_agg_scaled)
    models_by_k[k] = model
    db_scores.append((k, davies_bouldin_score(customer_agg_scaled, model.labels_)))

# Select best k: the candidate with the smallest Davies-Bouldin index.
best_k = min(db_scores, key=lambda x: x[1])[0]
print(f"Optimal number of clusters: {best_k}")

# Reuse the model already fitted for best_k instead of refitting; with the
# same data and random_state a refit would reproduce the same labels.
kmeans = models_by_k[best_k]
customer_agg['Cluster'] = kmeans.labels_


In [None]:
## Visualizing Clusters
# Project the scaled feature matrix onto its first two principal components
# so the clusters can be drawn on a 2-D scatter plot.
pca = PCA(n_components=2)
reduced_data = pca.fit_transform(customer_agg_scaled)

# Explicit figure/axes pair rather than the implicit pyplot "current axes".
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x=reduced_data[:, 0],
    y=reduced_data[:, 1],
    hue=customer_agg['Cluster'],  # color each point by its assigned cluster
    palette='Set2',
    ax=ax,
)
ax.set_title('Customer Segments')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
plt.show()
