# $K$-means clustering 

In [None]:
import pandas as pd
import numpy as np
from numpy.linalg import svd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from scipy.cluster import hierarchy

%matplotlib inline

In [None]:
# Simulate 50 two-dimensional standard-normal observations, then shift the
# first 25 rows by (+3, -4) so the data contain two separated groups.
np.random.seed(21)
X = np.random.standard_normal((50, 2))
X[:25] += np.array([3, -4])  # broadcast: +3 on column 0, -4 on column 1

In [None]:
# Fit K-means on the simulated data with K=2 and K=3.
# n_init=20 runs the algorithm from 20 random initialisations per model.
# random_state pins those initialisations so that re-running this cell on its
# own (not only Restart & Run All) reproduces the same labels and centroids,
# instead of depending on the current state of NumPy's global RNG.
n_clusters = 2
km1 = KMeans(n_clusters=n_clusters, n_init=20, random_state=21)
km1.fit(X)

n_clusters = 3
km2 = KMeans(n_clusters=n_clusters, n_init=20, random_state=21)
km2.fit(X)

In [None]:
# Show the cluster label of every observation, followed by the names of all
# attributes and methods available on the fitted estimator (via dir).
print(km1.labels_, dir(km1), sep="\n")

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

# One panel per fitted model: observations coloured by assigned cluster,
# with the cluster centres drawn on top as black "+" markers.
panels = (
    (ax1, km1, "K-Means Clustering Results with K=2"),
    (ax2, km2, "K-Means Clustering Results with K=3"),
)
for axis, model, title in panels:
    axis.scatter(X[:, 0], X[:, 1], s=40, c=model.labels_, cmap=plt.cm.prism)
    axis.set_title(title)
    centers = model.cluster_centers_
    axis.scatter(
        centers[:, 0], centers[:, 1], marker="+", s=100, c="k", linewidth=2
    )
plt.show()

## Hierarchical Clustering

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, figsize=(15, 18))

# Linkage matrices for the simulated data under three linkage rules,
# each paired with the axis it is drawn on and that panel's title.
panels = (
    (ax1, "Complete Linkage", hierarchy.complete(X)),
    (ax2, "Average Linkage", hierarchy.average(X)),
    (ax3, "Single Linkage", hierarchy.single(X)),
)
for axis, title, Z in panels:
    # color_threshold=0 is passed so no height-based cluster colouring is applied.
    cluster = hierarchy.dendrogram(Z, ax=axis, color_threshold=0)
    axis.set_title(title)
plt.show()

## Clustering the Observations of the NCI60 Data

In [None]:
# Download the NCI60 measurements (X) and their labels (y) from the
# transparentML repository.
# NOTE: this rebinds X, which previously held the simulated data.
data_url = "https://github.com/pykale/transparentML/raw/main/data/NCI60_data.csv"
labs_url = "https://github.com/pykale/transparentML/raw/main/data/NCI60_labs.csv"
X = pd.read_csv(data_url)
y = pd.read_csv(labs_url)

In [None]:
# Standardise every column of X, then wrap the result in a DataFrame that keeps
# the original column names and is indexed by the first column of y, so the
# dendrogram leaves can be labelled from the index.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_values, index=y.iloc[:, 0], columns=X.columns)

In [None]:
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 20))

# Compare complete, average and single linkage on the SAME standardised data.
# The average-linkage panel previously used the raw X, so it was built from
# unscaled features and was not comparable with the other two panels.
panels = (
    (ax1, "Complete Linkage", hierarchy.complete(X_scaled)),
    (ax2, "Average Linkage", hierarchy.average(X_scaled)),
    (ax3, "Single Linkage", hierarchy.single(X_scaled)),
)
for ax, title, linkage in panels:
    # `cluster` is kept bound to the most recent dendrogram result, as before.
    cluster = hierarchy.dendrogram(
        linkage,
        labels=X_scaled.index,  # leaf labels come from the DataFrame index
        orientation="right",
        color_threshold=0,
        leaf_font_size=10,
        ax=ax,
    )
    ax.set_title(title)
plt.show()

In [None]:
# Height at which the complete-linkage tree is cut: it is both the colouring
# threshold of the dendrogram and the position of the dashed red guide line.
cut_height = 140

plt.figure(figsize=(10, 20))
cut4 = hierarchy.dendrogram(
    hierarchy.complete(X_scaled),
    labels=X_scaled.index,
    orientation="right",
    color_threshold=cut_height,
    leaf_font_size=10,
)
# Draw the guide line from 0 up to the upper end of the y-axis data interval.
y_top = plt.gca().yaxis.get_data_interval()[1]
plt.vlines(cut_height, 0, y_top, colors="r", linestyles="dashed")
plt.show()

### $K$-means

In [None]:
# Reseed NumPy's global RNG immediately before fitting, then fit a 4-cluster
# K-means model (50 initialisations) on the standardised NCI60 data.
np.random.seed(21)
km3 = KMeans(n_clusters=4, n_init=50).fit(X_scaled)
km3

In [None]:
# Cluster assignment of every observation under the 4-cluster K-means fit;
# as the last expression of the cell it is displayed as the cell output.
km3.labels_

### Combine with PCA

In [None]:
# Standardise X again (as a plain array this time), fit a PCA with default
# settings on it, and store the transformed data in a DataFrame with one
# column per component.
pca = PCA()
X_scaled_ = StandardScaler().fit_transform(X)
pc_scores = pca.fit_transform(X_scaled_)
df_plot = pd.DataFrame(pc_scores)

In [None]:
# Complete-linkage hierarchy on the full standardised data, coloured at
# height 100, for comparison with a hierarchy built on principal components.
complete_full = hierarchy.complete(X_scaled)

plt.figure(figsize=(10, 20))
pca_cluster = hierarchy.dendrogram(
    complete_full,
    labels=X_scaled.index,
    orientation="right",
    color_threshold=100,
    leaf_font_size=10,
)

In [None]:
# Hierarchy based on Principal Components 1 to 5
plt.figure(figsize=(10, 20))
pca_cluster = hierarchy.dendrogram(
    hierarchy.complete(df_plot.iloc[:, :5]),
    labels=X_scaled.index,
    orientation="right",
    color_threshold=100,
    leaf_font_size=10,
)

## Exercises

**TODO:** add exercises here (minimum 3, maximum 5).

