# In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import davies_bouldin_score
from sklearn.decomposition import PCA

# Load customer data
cust = pd.read_csv('Customers.csv')

# Prepare data
# Normalise the row index WITHOUT materialising it as a column. A plain
# `reset_index(inplace=True)` inserts an 'index' column that is not in the
# drop list below, so the row number would be scaled and clustered as if it
# were a customer attribute.
cust.reset_index(drop=True, inplace=True)

# Exclude the identifier, name and signup-date columns from the feature
# matrix (presumably non-numeric / non-informative -- confirm against the CSV).
data = cust.drop(columns=['CustomerID', 'CustomerName', 'SignupDate'])

# One-hot encode any remaining categorical columns; drop_first removes one
# level per category so the dummies are not perfectly collinear.
data = pd.get_dummies(data, drop_first=True)

# Normalize
# Standardise every feature to zero mean / unit variance so that no single
# column dominates the Euclidean distances used by K-Means.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Find optimal clusters using DB Index
# The candidate cluster counts are defined once and reused by the scan, the
# plot and the final selection so the three can never drift apart.
# davies_bouldin_score accepts between 2 and n_samples - 1 distinct labels,
# so the upper end is capped for small datasets.
max_k = min(10, len(scaled_data) - 1)
if max_k < 2:
    raise ValueError(
        "Need at least 3 samples to evaluate clusterings with the DB Index"
    )
k_values = range(2, max_k + 1)

db_scores = []
for k in k_values:
    # n_init is pinned explicitly: the library default changed between
    # scikit-learn releases, which would otherwise make the scores depend on
    # the installed version.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    labels = kmeans.fit_predict(scaled_data)
    db_scores.append(davies_bouldin_score(scaled_data, labels))
    print(f"DB Index for k={k}: {db_scores[-1]:.4f}")

# Plot DB Index
plt.figure(figsize=(8, 5))
plt.plot(k_values, db_scores, marker='o')
plt.title("DB Index vs Number of Clusters")
plt.xlabel("Clusters")
plt.ylabel("DB Index")
plt.show()

# Choose optimal clusters
# Lower Davies-Bouldin index is better, hence argmin.
optimal_k = k_values[int(np.argmin(db_scores))]
print(f"Optimal k: {optimal_k}")

# Final K-Means
# Refit with the chosen k (same n_init / random_state as the scan) so that
# `kmeans` and `labels` refer to the selected model.
kmeans = KMeans(n_clusters=optimal_k, n_init=10, random_state=42)
labels = kmeans.fit_predict(scaled_data)
cust['Cluster'] = labels

# Final DB Index
final_db = davies_bouldin_score(scaled_data, labels)
print(f"Final DB Index: {final_db:.4f}")

# PCA for Visualization
# Project the standardised features onto their first two principal
# components purely for plotting; the clustering itself is not affected.
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)
first_component, second_component = pca_data.T
cust['PCA1'] = first_component
cust['PCA2'] = second_component

# Scatter of the 2-D projection, one colour per cluster label.
fig, ax = plt.subplots(figsize=(10, 7))
sns.scatterplot(
    data=cust,
    x='PCA1',
    y='PCA2',
    hue='Cluster',
    palette='Set1',
    s=100,
    ax=ax,
)
ax.set_title("Clusters Visualization")
ax.set_xlabel("PCA1")
ax.set_ylabel("PCA2")
ax.legend(title='Cluster')
plt.show()

# Cluster Summary
# Average only the numeric columns per cluster. `cust` presumably still holds
# text columns (e.g. CustomerName); on pandas >= 2.0 a bare `.mean()` raises
# TypeError for those instead of silently skipping them, so numeric_only is
# requested explicitly.
summary = cust.groupby('Cluster').mean(numeric_only=True)
print("\nCluster Summary:")
print(summary)