# In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize features for clustering.
# The first column is skipped (presumably the customer identifier -- confirm
# against where customer_features is built). 'Cluster' is also dropped when
# present: a later step writes that column onto customer_features, so
# re-running this cell would otherwise feed old labels back in as a feature.
feature_frame = customer_features.iloc[:, 1:].drop(columns=['Cluster'], errors='ignore')
scaler = StandardScaler()
scaled_features = scaler.fit_transform(feature_frame)

print(scaled_features[:5])  # Verify scaled features

from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score

# Fit k-means for each candidate cluster count and record its
# Davies-Bouldin (DB) index as (k, db_index) pairs.
# davies_bouldin_score only accepts 2 <= n_labels <= n_samples - 1, so the
# upper bound of 10 is capped for small datasets instead of crashing mid-loop.
max_k = min(10, scaled_features.shape[0] - 1)
if max_k < 2:
    raise ValueError(
        "At least 3 samples are required to evaluate the Davies-Bouldin index; "
        f"got {scaled_features.shape[0]}"
    )

db_indices = []
for k in range(2, max_k + 1):
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(scaled_features)

    # Calculate DB Index for this clustering
    db_index = davies_bouldin_score(scaled_features, labels)
    db_indices.append((k, db_index))

# Find the best cluster count (lowest DB Index); best_k is a (k, db_index) pair
best_k = min(db_indices, key=lambda x: x[1])
print(f"Optimal number of clusters: {best_k[0]}, DB Index: {best_k[1]}")

# Refit k-means with the selected cluster count (best_k[0]) and store each
# row's cluster label in a new 'Cluster' column on customer_features.
kmeans = KMeans(n_clusters=best_k[0], random_state=42).fit(scaled_features)
customer_features['Cluster'] = kmeans.labels_

# Preview the first rows to check which cluster each customer belongs to
print(customer_features.head())

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Project the scaled features onto their first two principal components
# so the clusters can be drawn in 2-D.
pca = PCA(n_components=2)
pca_features = pca.fit_transform(scaled_features)

# One scatter series per cluster label, drawn on a single 10x6 figure
fig, ax = plt.subplots(figsize=(10, 6))
for cluster in range(best_k[0]):
    in_cluster = customer_features['Cluster'] == cluster  # boolean row mask
    cluster_points = pca_features[in_cluster]
    ax.scatter(cluster_points[:, 0], cluster_points[:, 1], label=f'Cluster {cluster}')

ax.set_title('Customer Clusters')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.legend()
plt.show()

# Write the CustomerID -> Cluster mapping to a CSV (row index omitted)
cluster_assignments = customer_features[['CustomerID', 'Cluster']]
cluster_assignments.to_csv('Sahil_Raj_Clustering.csv', index=False)

# Report summary: chosen cluster count and its Davies-Bouldin index,
# both read from the (k, db_index) pair in best_k
print(f"Number of clusters: {best_k[0]}")
print(f"Davies-Bouldin Index: {best_k[1]}")

