# Comparing K-means (Lloyd) vs K-means++
**Dataset**: Synthetic 

K-means++ improves upon standard K-means by using a smarter initialization method that spreads out the initial cluster centers, often leading to better and more consistent clustering results.


In [None]:
# Imports
import numpy as np
import pandas as pd
from sklearn.datasets import make_blobs # Import make_blobs
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA

# Use seaborn's "whitegrid" theme for the plots drawn in this notebook.
sns.set(style="whitegrid")

## 1 Generate Synthetic Dataset and Data Preprocessing

- Create a synthetic dataset using make_blobs from sklearn
- Standardize features
- Project onto 2 principal components with PCA for visualization (the data is already 2D here, so this only re-orients the axes)

In [None]:
# Create a synthetic dataset with make_blobs
n_samples = 500
n_features = 2
# One standard deviation per blob; varying them makes some clusters wider/overlapping.
cluster_stds = [1.5, 0.5, 2.0]
# Derive the number of clusters from the std list so the two can never disagree
# (make_blobs expects one std per center when a sequence is given).
n_clusters = len(cluster_stds)
X, y_true = make_blobs(
    n_samples=n_samples,
    n_features=n_features,
    centers=n_clusters,
    cluster_std=cluster_stds,
    random_state=42,  # For reproducibility of the dataset
)

# Create a DataFrame for convenience (optional, but handy for hue-based plotting).
# The same naming scheme works for any number of features.
feature_names = [f'Feature {i + 1}' for i in range(n_features)]
df = pd.DataFrame(X, columns=feature_names)
df['True Label'] = y_true  # Store true labels in the DataFrame

# Standardize each feature to zero mean and unit variance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA for visualization (2 components; used as the plotting coordinates below)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Plot true labels
plt.figure(figsize=(8, 6))
# Using 'True Label' column for hue
sns.scatterplot(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    hue=df['True Label'],
    palette='tab10',
    legend='full',
    s=50,
    alpha=0.7,
)
plt.title('True Clusters (PCA Reduced)')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.grid(True, linestyle='--', alpha=0.7)
plt.show()

## 2 Model Definitions

- Initialize K-means (Lloyd) with random initialization
- Initialize K-means++ (default in sklearn)

In [None]:
# Both models run the same iterative K-means procedure; they differ only in how
# the initial centroids are chosen. n_init=1 gives each model a single
# initialization, and n_clusters is reused from the dataset cell so the model
# always matches the number of generated blobs.

# K-means with randomly chosen initial centroids
kmeans_random = KMeans(n_clusters=n_clusters, init='random', n_init=1, random_state=1234)

# K-means++ (smart initialization that spreads out the initial centroids)
kmeans_plus = KMeans(n_clusters=n_clusters, init='k-means++', n_init=1, random_state=1234)

## 3 Train & Evaluate Once

- Fit both models
- Visualize clusters
- Compute silhouette scores

**Silhouette Score**: A clustering evaluation metric that measures both *Cohesion* (how close points are to their own cluster) and *Separation* (how distinct clusters are from each other).
  
**Range of Silhouette Score**: $[-1, 1]$, where $+1$ indicates dense, well-separated clusters, $0$ indicates overlapping clusters, and $-1$ indicates points assigned to the wrong cluster.

**Formula**: Silhouette score of $i$-th point is
  $$
  s(i) = \frac{b(i) - a(i)}{\max\{a(i), b(i)\}},
  $$
  where $a(i)$ is the mean distance from point $i$ to the other points in its own cluster and $b(i)$ is the mean distance from point $i$ to the points in the nearest neighboring cluster.

  The **mean Silhouette score** is the average of all pointwise scores:
  $$
   S = \frac{1}{n}\sum_{i=1}^{n} s(i),
  $$
   where $n$ is the total number of points in the dataset.

In [None]:
# Fit both models on the standardized features
kmeans_random.fit(X_scaled)
kmeans_plus.fit(X_scaled)

# Get cluster assignments
random_labels = kmeans_random.labels_
plus_labels = kmeans_plus.labels_

# Silhouette scores (computed in the same standardized space the models were fit on)
random_score = silhouette_score(X_scaled, random_labels)
plus_score = silhouette_score(X_scaled, plus_labels)

# Plot results side by side: random initialization on the left, K-means++ on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

panels = [
    (ax1, random_labels, f'K-means (Random Init)\nSilhouette Score: {random_score:.3f}'),
    (ax2, plus_labels, f'K-means++\nSilhouette Score: {plus_score:.3f}'),
]
for ax, labels, title in panels:
    sns.scatterplot(x=X_pca[:, 0], y=X_pca[:, 1], hue=labels, palette='tab10', ax=ax)
    ax.set_title(title)
    ax.set_xlabel('PCA Component 1')
    # The two axes are independent (not shared), so label the y-axis on both.
    ax.set_ylabel('PCA Component 2')

plt.show()

## 4 Mean Performance Over 50 Experiments

- Compare both methods across multiple runs
- Track silhouette scores and inertia

In [None]:
# Per-run metrics for each initialization strategy
random_scores = []
plus_scores = []
random_inertia = []
plus_inertia = []

num_runs = 50

# Use the run index as the seed so both strategies are evaluated on the same
# sequence of seeds and the experiment is reproducible.
for seed in range(num_runs):
    # Random init
    km_random = KMeans(n_clusters=n_clusters, init='random', n_init=1,
                       random_state=seed, max_iter=300)
    km_random.fit(X_scaled)
    random_scores.append(silhouette_score(X_scaled, km_random.labels_))
    random_inertia.append(km_random.inertia_)

    # K-means++
    km_plus = KMeans(n_clusters=n_clusters, init='k-means++', n_init=1,
                     random_state=seed, max_iter=300)
    km_plus.fit(X_scaled)
    plus_scores.append(silhouette_score(X_scaled, km_plus.labels_))
    plus_inertia.append(km_plus.inertia_)

# Create results DataFrame (long format: one row per method and run)
results = pd.DataFrame({
    'Method': ['Random'] * num_runs + ['K-means++'] * num_runs,
    'Silhouette': random_scores + plus_scores,
    'Inertia': random_inertia + plus_inertia,
})

# Report the actual number of runs rather than a hard-coded count
print(f"\nMean Performance over {num_runs} runs:")
print(f"Random Init - Mean Silhouette: {np.mean(random_scores):.3f}, Mean Inertia: {np.mean(random_inertia):.1f}")
print(f"K-means++   - Mean Silhouette: {np.mean(plus_scores):.3f}, Mean Inertia: {np.mean(plus_inertia):.1f}")

# Plot comparison: distribution of each metric per method
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
sns.boxplot(x='Method', y='Silhouette', data=results)
plt.title('Silhouette Scores Comparison')

plt.subplot(1, 2, 2)
sns.boxplot(x='Method', y='Inertia', data=results)
plt.title('Inertia Comparison')

plt.tight_layout()
plt.show()

## Key Findings

1. **Initialization Matters**: K-means++ consistently finds better starting points than random initialization
2. **Better Scores**: Higher silhouette scores indicate more well-defined clusters with k-means++
3. **Lower Inertia**: K-means++ typically achieves lower within-cluster variance
4. **More Consistent**: K-means++ shows less variability across different random seeds