## Clustering Algorithms: k-means and DBSCAN

### k-means

In [1]:
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, y = make_blobs()

k = 5 
kmeans = KMeans(n_clusters=k, random_state=42)
y_pred = kmeans.fit_predict(X)

In [2]:
y_pred

array([4, 2, 2, 1, 3, 0, 4, 2, 4, 2, 4, 2, 0, 0, 1, 2, 4, 0, 3, 2, 0, 0,
       2, 0, 3, 1, 3, 1, 2, 0, 4, 4, 2, 2, 1, 4, 4, 0, 1, 4, 4, 0, 2, 2,
       3, 2, 2, 1, 2, 3, 2, 3, 0, 3, 0, 1, 4, 0, 2, 2, 3, 0, 3, 1, 2, 1,
       3, 2, 2, 4, 2, 4, 1, 2, 3, 2, 0, 4, 4, 4, 3, 0, 2, 4, 2, 2, 4, 2,
       2, 4, 3, 2, 2, 3, 2, 1, 1, 3, 1, 3], dtype=int32)

In [3]:
kmeans.cluster_centers_

array([[-0.62956559,  3.65567994],
       [ 4.42240904, 10.81253   ],
       [-4.38676129, -2.01073874],
       [-0.56763981,  5.49711989],
       [ 4.32662148,  9.01459583]])

In [4]:
import numpy as np
X_new = np.array([[0,2], [3,2], [-3,3], [-3, 2.5]])
kmeans.predict(X_new)

array([0, 0, 0, 0], dtype=int32)

In [5]:
kmeans.transform(X_new).round(2)

array([[ 1.77,  9.86,  5.94,  3.54,  8.24],
       [ 3.99,  8.93,  8.41,  5.  ,  7.14],
       [ 2.46, 10.78,  5.2 ,  3.49,  9.48],
       [ 2.64, 11.14,  4.72,  3.86,  9.8 ]])

### Centroid initialization methods

In [6]:
good_init = np.array([[-3, 3], [-3,2], [-3,1], [-1,2], [0,2]])
kmeans = KMeans(n_clusters=5, init=good_init, n_init=1, random_state=42)
kmeans.fit(X)

KMeans(init=array([[-3,  3],
       [-3,  2],
       [-3,  1],
       [-1,  2],
       [ 0,  2]]),
       n_clusters=5, n_init=1, random_state=42)

In [7]:
kmeans.inertia_

115.76927089115591

In [8]:
kmeans.score(X)

-115.76927089115591

### Accelerated k-mans and mini-batch k-means

In [9]:
from sklearn.cluster import MiniBatchKMeans

minibatch_kmeans = MiniBatchKMeans(n_clusters=5, random_state=42)
minibatch_kmeans.fit(X)

MiniBatchKMeans(n_clusters=5, random_state=42)

In [10]:
from sklearn.metrics import silhouette_score
silhouette_score(X, kmeans.labels_)

0.541027415594199

### Using Clustering for Semi-Supervised Learning

In [11]:
from sklearn.datasets import load_digits

X_digits, y_digits = load_digits(return_X_y=True)
X_train, y_train = X_digits[:1400], y_digits[:1400]
X_test, y_test = X_digits[1400:], y_digits[1400:]

In [12]:
# pretending we only have 50 labels
from sklearn.linear_model import LogisticRegression

n_labeled=50
log_reg = LogisticRegression(max_iter=10_000)
log_reg.fit(X_train[:n_labeled], y_train[:n_labeled])
log_reg.score(X_test, y_test)

0.7481108312342569

In [13]:
k = 50
kmeans = KMeans(n_clusters=k, random_state=42)
X_digits_dist = kmeans.fit_transform(X_train)
representative_digit_idx = np.argmin(X_digits_dist, axis=0)
X_representative_digits = X_train[representative_digit_idx]

In [14]:
y_representative_digits = np.array([1,3,6,0,7,9,2,4,8,9,5,4,7,1,2,6,8,2,5,1,
                                    4,9,3,3,8,8,2,5,6,9,1,4,0,6,8,3,4,6,7,2,6,1,0,7,5,1,9,9,3,7])

In [15]:
log_reg = LogisticRegression(max_iter=10_000)
log_reg.fit(X_representative_digits, y_representative_digits)
log_reg.score(X_test, y_test)

0.8161209068010076

In [16]:
y_train_propegated = np.empty(len(X_train), dtype=np.int64)
for i in range(k):
    y_train_propegated[kmeans.labels_ == i] = y_representative_digits[i]

In [17]:
log_reg = LogisticRegression(max_iter = 10_000)
log_reg.fit(X_train, y_train_propegated)
log_reg.score(X_test, y_test)

0.8614609571788413

In [18]:
percentile_closest = 99

X_cluster_dist = X_digits_dist[np.arange(len(X_train)), kmeans.labels_]
for i in range(k):
    in_cluster = (kmeans.labels_==i)
    cluster_dist = X_cluster_dist[in_cluster]
    cutoff_distance = np.percentile(cluster_dist, percentile_closest)
    above_cutoff = (X_cluster_dist > cutoff_distance)
    X_cluster_dist[in_cluster & above_cutoff] = -1

partially_propageted = (X_cluster_dist != -1)
X_train_paritally_propegated = X_train[partially_propageted]
y_train_partially_propegated = y_train_propegated[partially_propageted]

In [19]:
log_reg = LogisticRegression(max_iter=10_000)
log_reg.fit(X_train_paritally_propegated, y_train_partially_propegated)
log_reg.score(X_test, y_test)

0.871536523929471

In [20]:
(y_train_partially_propegated == y_train[y_train_partially_propegated]).mean()

1.0

## DBSCAN
Density-based spatial clustering of applications with noise. Looks for clusters of points, "core instances" points not in clusters are anomalies.

In [29]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_moons

X, y = make_moons(n_samples=1000, noise=0.05)
dbscan = DBSCAN(eps=0.2, min_samples=5)
dbscan.fit(X)

DBSCAN(eps=0.2)

In [32]:
dbscan.labels_[:10]

array([0, 0, 1, 1, 0, 1, 1, 0, 0, 0])

In [33]:
dbscan.core_sample_indices_[:10]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [28]:
dbscan.components_

array([[ 0.85907568,  0.58734843],
       [-0.94689165,  0.25306214],
       [ 1.803421  , -0.01429474],
       ...,
       [ 0.7962933 ,  0.6928887 ],
       [ 0.37797691,  0.90569668],
       [ 1.00038746,  0.12080006]])

In [36]:
# DBSCAN does not have a predict method
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=50)
knn.fit(dbscan.components_, dbscan.labels_[dbscan.core_sample_indices_])

KNeighborsClassifier(n_neighbors=50)

In [37]:
X_new = np.array([[-0.5, 0], [0,0.5], [1, -0.1], [2,1]])
knn.predict(X_new)

array([1, 0, 1, 0])

In [39]:
knn.predict_proba(X_new)

array([[0.26, 0.74],
       [1.  , 0.  ],
       [0.22, 0.78],
       [1.  , 0.  ]])

In [40]:
y_dist, y_pred_idx = knn.kneighbors(X_new, n_neighbors=1)
y_pred = dbscan.labels_[dbscan.core_sample_indices_][y_pred_idx]
y_pred[y_dist > 0.2] = -1
y_pred.ravel()

array([-1,  0,  1, -1])

## Gaussian Mixtures
GMM assumes that instances were generated from a mixture of several Gaussian distributions with unknown parameters.

In [41]:
from sklearn.mixture import GaussianMixture

gm = GaussianMixture(n_components=3, n_init=10)
gm.fit(X)

GaussianMixture(n_components=3, n_init=10)

In [42]:
gm.weights_

array([0.61620638, 0.19203983, 0.19175379])

In [43]:
gm.means_

array([[ 0.49923034,  0.25082408],
       [-0.77364091,  0.53781476],
       [ 1.77247711, -0.03814287]])

In [44]:
gm.covariances_

array([[[ 0.18771508, -0.11889399],
        [-0.11889399,  0.29578905]],

       [[ 0.04364091,  0.05450655],
        [ 0.05450655,  0.08330451]],

       [[ 0.04410406,  0.05501956],
        [ 0.05501956,  0.08134502]]])

In [45]:
gm.converged_

True

In [46]:
gm.n_iter_

21

In [48]:
gm.predict(X)[:5]

array([2, 0, 0, 0, 0])

In [49]:
gm.predict_proba(X).round(3)

array([[0.001, 0.   , 0.999],
       [1.   , 0.   , 0.   ],
       [1.   , 0.   , 0.   ],
       ...,
       [1.   , 0.   , 0.   ],
       [0.   , 0.   , 1.   ],
       [1.   , 0.   , 0.   ]])

In [50]:
X_new, y_new = gm.sample(6)
X_new

array([[-0.21626045,  1.03406075],
       [ 0.20088498,  0.50829486],
       [ 0.92575561, -0.94928096],
       [ 0.83857889,  0.72749798],
       [-0.41577356,  0.88292968],
       [-0.89731509,  0.22801612]])

In [52]:
gm.score_samples(X).round(2)[:5]

array([-0.39, -1.61, -1.45, -1.78, -1.6 ])

### Using Gaussian Mixtures for Anomaly Detection

In [54]:
densities = gm.score_samples(X)
density_threshold = np.percentile(densities, 2)
anomalies = X[densities < density_threshold]

### Bayesian Gaussian Mixture Models
Can determine the optimal number of clusters for you.

In [55]:
from sklearn.mixture import BayesianGaussianMixture
bgm = BayesianGaussianMixture(n_components=10, n_init=10, random_state=42)
bgm.fit(X)
bgm.weights_.round(2)

array([0.12, 0.09, 0.12, 0.17, 0.12, 0.13, 0.11, 0.14, 0.  , 0.  ])