## Clustering Algorithms: k-means and DBSCAN

### k-means

In [1]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, y = make_blobs()

k = 5 
kmeans = KMeans(n_clusters=k, random_state=42)
y_pred = kmeans.fit_predict(X)

In [2]:
y_pred

array([1, 4, 3, 1, 1, 2, 3, 1, 3, 3, 1, 3, 1, 4, 1, 0, 2, 0, 0, 2, 3, 3,
       0, 0, 1, 3, 2, 4, 4, 0, 3, 4, 3, 3, 2, 0, 0, 3, 4, 4, 4, 2, 1, 3,
       3, 2, 4, 0, 1, 4, 4, 4, 4, 2, 2, 3, 0, 1, 0, 1, 4, 0, 4, 3, 4, 3,
       3, 2, 0, 3, 2, 0, 2, 2, 3, 4, 2, 2, 3, 1, 4, 4, 1, 2, 3, 1, 3, 0,
       4, 2, 1, 3, 0, 2, 4, 3, 1, 3, 4, 3], dtype=int32)

In [3]:
kmeans.cluster_centers_

array([[-7.96256673,  5.23936128],
       [-1.29342104, -1.60302729],
       [-8.7033894 ,  6.82907789],
       [ 0.54634645, -2.7186582 ],
       [-2.83594001, -0.73738638]])

In [4]:
import numpy as np
X_new = np.array([[0,2], [3,2], [-3,3], [-3, 2.5]])
kmeans.predict(X_new)

array([1, 3, 4, 4], dtype=int32)

In [5]:
kmeans.transform(X_new).round(2)

array([[ 8.6 ,  3.83,  9.95,  4.75,  3.94],
       [11.43,  5.6 , 12.66,  5.32,  6.45],
       [ 5.44,  4.91,  6.87,  6.73,  3.74],
       [ 5.67,  4.44,  7.16,  6.31,  3.24]])

### Centroid initialization methods

In [6]:
good_init = np.array([[-3, 3], [-3,2], [-3,1], [-1,2], [0,2]])
kmeans = KMeans(n_clusters=5, init=good_init, n_init=1, random_state=42)
kmeans.fit(X)

KMeans(init=array([[-3,  3],
       [-3,  2],
       [-3,  1],
       [-1,  2],
       [ 0,  2]]),
       n_clusters=5, n_init=1, random_state=42)

In [7]:
kmeans.inertia_

116.5311909765365

In [8]:
kmeans.score(X)

-116.5311909765365

### Accelerated k-mans and mini-batch k-means

In [11]:
from sklearn.cluster import MiniBatchKMeans

minibatch_kmeans = MiniBatchKMeans(n_clusters=5, random_state=42)
minibatch_kmeans.fit(X)

MiniBatchKMeans(n_clusters=5, random_state=42)

In [13]:
from sklearn.metrics import silhouette_score
silhouette_score(X, kmeans.labels_)

0.36728521742370396

### Using Clustering for Semi-Supervised Learning

In [15]:
from sklearn.datasets import load_digits

X_digits, y_digits = load_digits(return_X_y=True)
X_train, y_train = X_digits[:1400], y_digits[:1400]
X_test, y_test = X_digits[1400:], y_digits[1400:]

In [16]:
# pretending we only have 50 labels
from sklearn.linear_model import LogisticRegression

n_labeled=50
log_reg = LogisticRegression(max_iter=10_000)
log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])
log_reg.score(X_test, y_test)

0.7481108312342569

In [18]:
k = 50
kmeans = KMeans(n_clusters=k, random_state=42)
X_digits_dist = kmeans.fit_transform(X_train)
representative_digit_idx = np.argmin(X_digits_dist, axis=0)
X_representative_digits = X_train[representative_digit_idx]

In [19]:
y_representative_digits = np.array([1,3,6,0,7,9,2,4,8,9,5,4,7,1,2,6,8,2,5,1,
                                    4,9,3,3,8,8,2,5,6,9,1,4,0,6,8,3,4,6,7,2,6,1,0,7,5,1,9,9,3,7])

In [23]:
log_reg = LogisticRegression(max_iter=10_000)
log_reg.fit(X_representative_digits, y_representative_digits)
log_reg.score(X_test, y_test)

0.8161209068010076

In [25]:
y_train_propegated = np.empty(len(X_train), dtype=np.int64)
for i in range(k):
    y_train_propegated[kmeans.labels_ == i] = y_representative_digits[i]

In [27]:
log_reg = LogisticRegression(max_iter = 10_000)
log_reg.fit(X_train, y_train_propegated)
log_reg.score(X_test, y_test)

0.8614609571788413

In [31]:
percentile_closest = 99

X_cluster_dist = X_digits_dist[np.arange(len(X_train)), kmeans.labels_]
for i in range(k):
    in_cluster = (kmeans.labels_==i)
    cluster_dist = X_cluster_dist[in_cluster]
    cutoff_distance = np.percentile(cluster_dist, percentile_closest)
    above_cutoff = (X_cluster_dist > cutoff_distance)
    X_cluster_dist[in_cluster & above_cutoff] = -1

partially_propageted = (X_cluster_dist != -1)
X_train_paritally_propegated = X_train[partially_propageted]
y_train_partially_propegated = y_train_propegated[partially_propageted]

In [34]:
log_reg = LogisticRegression(max_iter=10_000)
log_reg.fit(X_train_paritally_propegated, y_train_partially_propegated)
log_reg.score(X_test, y_test)

0.871536523929471

In [35]:
(y_train_partially_propegated == y_train[y_train_partially_propegated]).mean()

1.0