# K-Means demo with 3 features

In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn import datasets
from sklearn.cluster import KMeans

In [2]:
# Load the Iris dataset and keep three of its measurements so every sample
# can be drawn as a point in a 3-D scatter plot.
iris = datasets.load_iris()

# Column positions of iris.data that are kept (index 1 is skipped).
feature_indices = [0, 2, 3]
# Look the names up from the dataset itself so the labels can never drift
# out of sync with the selected columns.
feature_names = [iris.feature_names[i] for i in feature_indices]

X = iris.data[:, feature_indices]
y = iris.target

df = pd.DataFrame(X, columns=feature_names)
# Translate the integer targets into species names as a categorical column.
df['species'] = pd.Categorical.from_codes(y, iris.target_names)
df.head(3)

Unnamed: 0,sepal length (cm),petal length (cm),petal width (cm),species
0,5.1,1.4,0.2,setosa
1,4.9,1.4,0.2,setosa
2,4.7,1.3,0.2,setosa


## Explore the data

In [3]:
# 3-D scatter of the raw measurements, colored and marked by true species.
# Axis assignment: x = third selected feature, y = first, z = second.
x_col, y_col, z_col = feature_names[2], feature_names[0], feature_names[1]

fig = px.scatter_3d(
    df, x=x_col, y=y_col, z=z_col,
    color='species', symbol='species',
    opacity=0.8,
    title='Ground Truth: Iris Species',
)
# Remove the outer margins, leaving room at the top for the title.
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 40})
fig.show()

## K-Means Clustering

In [4]:
# One entry per K-Means run: the number of clusters and a display name
# that is reused as the dictionary key and subplot title further down.
kmeans_configs = [
    {'n_clusters': k, 'name': label}
    for k, label in (
        (2, '2 clusters'),
        (3, '3 clusters'),
        (4, '4 clusters'),
    )
]

In [5]:
# Fit one K-Means model per configuration. Keep the fitted estimator
# (keyed by its display name) and the label it gave to every sample.
kmeans_models = {}
cluster_results = []
for cfg in kmeans_configs:
    config_name = cfg['name']
    kmeans = KMeans(
        n_clusters=cfg['n_clusters'],
        n_init='auto',
        random_state=42,  # fixed seed so reruns give the same clusters
    )
    labels = kmeans.fit_predict(X)
    kmeans_models[config_name] = kmeans
    cluster_results.append((config_name, labels))

In [6]:
import plotly.subplots as sp

# Side-by-side 3-D scatter plots, one panel per K-Means configuration.
# Axis assignment shared by every panel (same orientation as the
# ground-truth plot: x = third feature, y = first, z = second).
axis_columns = {'x': feature_names[2], 'y': feature_names[0], 'z': feature_names[1]}

fig = sp.make_subplots(
    rows=1, cols=len(cluster_results),
    specs=[[{'type': 'scatter3d'}] * len(cluster_results)],
    subplot_titles=[name for name, _ in cluster_results]
)

for col, (name, labels) in enumerate(cluster_results, start=1):
    panel = px.scatter_3d(
        df.assign(cluster=labels),  # assign() returns a copy; df itself is untouched
        color='cluster',
        opacity=0.8,
        **axis_columns
    )
    # Copy every trace of the express figure, not just the first one, so
    # nothing is silently dropped if it ever contains more than one trace.
    for trace in panel.data:
        fig.add_trace(trace, row=1, col=col)

# Only the traces are copied from the express figures, not their layout,
# so the axis titles have to be set on the subplot scenes explicitly.
fig.update_scenes(
    xaxis_title_text=axis_columns['x'],
    yaxis_title_text=axis_columns['y'],
    zaxis_title_text=axis_columns['z'],
)
fig.update_layout(height=500, width=1300, margin=dict(l=0, r=0, b=0, t=40))
fig.show()

# K-Means in depth

In [7]:
# Select the model fitted with the '3 clusters' configuration for a closer look.
kmeans_model = kmeans_models['3 clusters']

In [8]:
# Predicted cluster index for every sample in X, using the selected model.
y_hat = kmeans_model.predict(X)

In [9]:
# Attach the predicted cluster to each sample as a readable category
# ('cluster 0', 'cluster 1', ...), then show a reproducible random sample.
cluster_categories = [f'cluster {k}' for k in range(kmeans_model.n_clusters)]
df['cluster'] = pd.Categorical.from_codes(y_hat, cluster_categories)
df.sample(5, random_state=42)

Unnamed: 0,sepal length (cm),petal length (cm),petal width (cm),species,cluster
73,6.1,4.7,1.2,versicolor,cluster 2
18,5.7,1.7,0.3,setosa,cluster 1
118,7.7,6.9,2.3,virginica,cluster 0
78,6.0,4.5,1.5,versicolor,cluster 2
76,6.8,4.8,1.4,versicolor,cluster 2


## KMeans Hyperparameters

In [10]:
# Print the hyperparameters stored on the selected model, one per line.
for param in ('n_clusters', 'init', 'n_init', 'max_iter', 'random_state'):
    print(f'{param}:', getattr(kmeans_model, param))

n_clusters: 3
init: k-means++
n_init: auto
max_iter: 300
random_state: 42


## Cluster Centroids

In [11]:
# Cluster centers of the selected model: one row per cluster, one column
# per feature used for fitting.
centroids = kmeans_model.cluster_centers_
centroids

array([[6.81      , 5.7075    , 2.075     ],
       [5.006     , 1.462     , 0.246     ],
       [5.89666667, 4.37166667, 1.41      ]])

In [12]:
# Same centroids as a table with the feature names as column headers;
# the row index is the cluster number.
centroids_df = pd.DataFrame(centroids, columns=feature_names)
centroids_df

Unnamed: 0,sepal length (cm),petal length (cm),petal width (cm)
0,6.81,5.7075,2.075
1,5.006,1.462,0.246
2,5.896667,4.371667,1.41


# Soft clustering: distances to every centroid vs. hard labels

In [13]:
# Three views of the first five samples: transform() output (one value per
# centroid), the predicted label, and the matching entries of labels_.
first_five = X[:5]
(
    kmeans_model.transform(first_five),
    kmeans_model.predict(first_five),
    kmeans_model.labels_[:5],
)

(array([[4.99942809, 0.12163881, 3.30599161],
        [5.07131948, 0.13113352, 3.35979869],
        [5.23390688, 0.34927926, 3.51158756],
        [5.1090881 , 0.41036082, 3.37519094],
        [5.03450904, 0.07743384, 3.33150325]]),
 array([1, 1, 1, 1, 1], dtype=int32),
 array([1, 1, 1, 1, 1], dtype=int32))

## Distance to Centroids

In [14]:
from scipy.spatial.distance import cdist

# Pairwise distances between every sample (rows) and every centroid (columns).
distances = cdist(X, centroids)

# One column per centroid, named centroid_0, centroid_1, ...
centroid_columns = [f'centroid_{idx}' for idx, _ in enumerate(centroids)]
distances_df = pd.DataFrame(distances, columns=centroid_columns)
distances_df.head(5)

Unnamed: 0,centroid_0,centroid_1,centroid_2
0,4.999428,0.121639,3.305992
1,5.071319,0.131134,3.359799
2,5.233907,0.349279,3.511588
3,5.109088,0.410361,3.375191
4,5.034509,0.077434,3.331503


## Cluster Assignments and Closest Distance

In [15]:
# Pair every sample with the cluster it was assigned to and the distance
# from the sample to that cluster's centroid.
assigned_labels = kmeans_model.labels_

# Fancy indexing: for row r, pick the column of the cluster assigned to r.
sample_rows = np.arange(distances.shape[0])
min_distances = distances[sample_rows, assigned_labels]

assignments_df = pd.DataFrame(
    data={
        'assigned_cluster': assigned_labels,
        'distance_to_centroid': min_distances,
    }
)
assignments_df.head()

Unnamed: 0,assigned_cluster,distance_to_centroid
0,1,0.121639
1,1,0.131134
2,1,0.349279
3,1,0.410361
4,1,0.077434


# Evaluate the clustering (WCSS)


In [16]:
# The model's inertia_ attribute, reported here as the WCSS of the fit.
print('Within-Cluster Sum of Squares (WCSS):', kmeans_model.inertia_)

Within-Cluster Sum of Squares (WCSS): 63.48411666666667


In [17]:
# Recompute the WCSS by hand, one cluster at a time, to show how the
# model's inertia_ is composed. (numpy is already imported as np in the
# first cell, so it is not imported again here.)
wcss_per_cluster = []
for i, centroid in enumerate(kmeans_model.cluster_centers_):
    cluster_points = X[kmeans_model.labels_ == i]  # rows assigned to cluster i
    # Summing the squared coordinate differences over all rows and columns
    # gives the sum of squared Euclidean distances to the centroid.
    wcss_per_cluster.append(np.sum((cluster_points - centroid) ** 2))

# Sanity check: the per-cluster parts must add up to the model's inertia.
assert np.isclose(sum(wcss_per_cluster), kmeans_model.inertia_)

print("WCSS per cluster:", wcss_per_cluster)
print("Total WCSS (sum):", sum(wcss_per_cluster))

WCSS per cluster: [np.float64(23.158750000000005), np.float64(8.110200000000003), np.float64(32.21516666666666)]
Total WCSS (sum): 63.484116666666665
