In [58]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets

# Synthetic data set: two blobs generated in one make_blobs call plus a third
# blob that is generated separately, linearly transformed and labelled 2.
n_samples = 1500
dataset = datasets.make_blobs(
    n_samples=n_samples,
    centers=2,
    center_box=(-7.0, 7.5),
    cluster_std=[1.4, 1.7],
    random_state=42,
)

# Third blob: a single centre at (-4, -3); its generated labels are discarded.
X_2, _ = datasets.make_blobs(
    n_samples=n_samples,
    centers=[[-4, -3]],
    cluster_std=[1.9],
    random_state=170,
)
transformation = [[1.2, -0.8], [-0.4, 1.7]]
X_2 = np.dot(X_2, transformation)

# Stack the features of both parts; every sample of the third blob gets label 2.
X = np.concatenate((dataset[0], X_2))
y = np.concatenate((dataset[1], np.array([2] * len(X_2))))

In [2]:
from sklearn.cluster import KMeans
from sklearn.metrics.cluster import v_measure_score

# Fit k-means with three clusters and keep the cluster label of every sample.
kmeans = KMeans(n_clusters=3, random_state=42).fit(X)
kmeans_pred = kmeans.labels_

# Compare the predicted clusters with the reference labels using V-measure.
v_measure_score(y, kmeans_pred)

0.7972774344500205

## Задание 14.1

In [60]:
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler, MinMaxScaler

# Standardise the features before clustering.
# NOTE(review): this rebinds X, so any cell executed afterwards sees the
# scaled data rather than the generated one.
X = StandardScaler().fit_transform(X)

# Three-component Gaussian mixture fitted on the scaled features,
# scored against the reference labels with V-measure.
gm = GaussianMixture(
    n_components=3,
    random_state=42,
)
y_pred = gm.fit_predict(X)
v_measure_score(y, y_pred)

0.9332760595996924

## Задание 14.2

In [4]:
from sklearn.mixture import GaussianMixture

# Gaussian mixture with three components; the predicted component of each
# sample is compared with the reference labels via V-measure.
gm = GaussianMixture(
    n_components=3,
    random_state=42,
)
y_pred = gm.fit_predict(X)
v_measure_score(y, y_pred)

0.9332760595996924

In [5]:
from sklearn.cluster import AgglomerativeClustering

# Agglomerative clustering into three clusters with the library defaults.
ac = AgglomerativeClustering(n_clusters=3).fit(X)
y_pred = ac.labels_

# V-measure of the hierarchy cut against the reference labels.
v_measure_score(y, y_pred)

0.9079037199053294

In [6]:
from sklearn.cluster import DBSCAN

# Cluster X with DBSCAN (eps=0.9, min_samples=35) and score the resulting
# labels against the reference labels with V-measure.
dbscan = DBSCAN(eps=0.9, min_samples=35)
dbscan.fit(X)
# The `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `int` is the exact drop-in replacement.
y_pred = dbscan.labels_.astype(int)
v_measure_score(labels_true=y, labels_pred=y_pred)

0.000884601531943088

## Задание 14.3

In [8]:
from sklearn.cluster import KMeans

# k-means with k-means++ initialisation and a single initialisation run.
kmeans = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=1,
    random_state=42,
).fit(X)
y_pred = kmeans.labels_

# V-measure of the predicted clusters against the reference labels.
v_measure_score(y, y_pred)

0.7892280957870129

In [9]:
from sklearn.cluster import KMeans

# k-means with random initialisation and a single initialisation run;
# the labels go to y_pred_2 so that y_pred from the k-means++ run is kept.
kmeans = KMeans(
    n_clusters=3,
    init='random',
    n_init=1,
    random_state=42,
).fit(X)
y_pred_2 = kmeans.labels_

# V-measure of the predicted clusters against the reference labels.
v_measure_score(y, y_pred_2)

0.7892280957870129

## Задание 14.4

In [11]:
from sklearn.cluster import MiniBatchKMeans

# Mini-batch variant of k-means with three clusters.
kmeans_mini_batch = MiniBatchKMeans(n_clusters=3, random_state=42).fit(X)
kmeans_mini_batch_pred = kmeans_mini_batch.labels_

# V-measure of the mini-batch clustering against the reference labels.
v_measure_score(y, kmeans_mini_batch_pred)

0.7583969521655418

In [12]:
from sklearn.cluster import KMeans

# сначала получим предсказанные кластеры при помощи метода кластеризации
# Plain k-means restricted to a single initialisation run (n_init=1).
kmeans = KMeans(
    n_clusters=3,
    n_init=1,
    random_state=42,
).fit(X)
y_pred = kmeans.labels_

# V-measure of the predicted clusters against the reference labels.
v_measure_score(y, y_pred)

0.7892280957870129

## Задание 14.5

In [13]:
from sklearn.cluster import AgglomerativeClustering
# Supported linkage values: {"ward", "complete", "average", "single"};
# the default is "ward", which is what this cell uses.
ac = AgglomerativeClustering(n_clusters=3).fit(X)
y_pred = ac.labels_

# V-measure of the resulting three clusters against the reference labels.
v_measure_score(y, y_pred)

0.9079037199053294

In [14]:
# Agglomerative clustering with complete linkage.
ac = AgglomerativeClustering(
    n_clusters=3,
    linkage='complete',
).fit(X)
y_pred = ac.labels_

# V-measure against the reference labels.
v_measure_score(y, y_pred)

0.5805530251504777

In [15]:
# Agglomerative clustering with average linkage.
ac = AgglomerativeClustering(
    n_clusters=3,
    linkage='average',
).fit(X)
y_pred = ac.labels_

# V-measure against the reference labels.
v_measure_score(y, y_pred)

0.678656551579543

In [16]:
# Agglomerative clustering with single linkage.
ac = AgglomerativeClustering(
    n_clusters=3,
    linkage='single',
).fit(X)
y_pred = ac.labels_

# V-measure against the reference labels.
v_measure_score(y, y_pred)

0.0008842106330108959

## Задание 14.6

In [18]:
from sklearn.neighbors import kneighbors_graph

# Neighbourhood graph over X: each sample is linked to 6 neighbours,
# not counting itself.
connectivity = kneighbors_graph(
    X,
    n_neighbors=6,
    include_self=False,
)
# Average the graph with its transpose so that the matrix becomes symmetric.
connectivity = (connectivity + connectivity.T) * 0.5

In [19]:
# Baseline for the connectivity experiment: agglomerative clustering
# without any connectivity constraint.
ac = AgglomerativeClustering(n_clusters=3).fit(X)
y_pred = ac.labels_

# V-measure against the reference labels.
v_measure_score(y, y_pred)

0.9079037199053294

In [20]:
# Same clustering, now constrained by the `connectivity` matrix that was
# built in an earlier cell.
ac = AgglomerativeClustering(
    n_clusters=3,
    connectivity=connectivity,
).fit(X)
y_pred = ac.labels_

# V-measure against the reference labels.
v_measure_score(y, y_pred)

0.883942992495597

## Задание 14.7

In [29]:
from sklearn.cluster import DBSCAN

# Cluster X with DBSCAN (eps=0.9, min_samples=35) and score the resulting
# labels against the reference labels with V-measure.
dbscan = DBSCAN(eps=0.9, min_samples=35)
dbscan.fit(X)
# The `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `int` is the exact drop-in replacement.
y_pred = dbscan.labels_.astype(int)
v_measure_score(labels_true=y, labels_pred=y_pred)

0.7732549110297919

In [30]:
# Repeat the DBSCAN run with a smaller eps (0.8) and the same min_samples,
# then score the labels against the reference labels with V-measure.
dbscan = DBSCAN(eps=0.8, min_samples=35)
dbscan.fit(X)
# The `np.int` alias was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin `int` is the exact drop-in replacement.
y_pred = dbscan.labels_.astype(int)
v_measure_score(labels_true=y, labels_pred=y_pred)

0.7063100442576922

In [44]:
# Count how many samples received each label in the current y_pred.
unique, counts = np.unique(y_pred, return_counts=True)
{label: size for label, size in zip(unique, counts)}

{-1: 368, 0: 656, 1: 721, 2: 1254}

In [None]:
## Задание 14.9

## Задание 14.8

## Задание 14.9

In [55]:
# Agglomerative clustering on X as it is at this point, used as the
# reference value for the standardisation comparison in the next cell.
ac = AgglomerativeClustering(n_clusters=3).fit(X)
y_pred = ac.labels_

# V-measure against the reference labels.
v_measure_score(y, y_pred)

0.703747024360433

In [56]:
# Standardise the features, then repeat the agglomerative clustering.
# NOTE(review): X is rebound to the scaled array, so re-running earlier
# cells after this one will no longer use the original data.
X = StandardScaler().fit_transform(X)
ac = AgglomerativeClustering(n_clusters=3).fit(X)
y_pred = ac.labels_

# V-measure against the reference labels.
v_measure_score(y, y_pred)

0.9079037199053294

## Задание 14.10

In [59]:
# Agglomerative clustering on X as it is at this point, used as the
# reference value for the min-max scaling comparison in the next cell.
ac = AgglomerativeClustering(n_clusters=3).fit(X)
y_pred = ac.labels_

# V-measure against the reference labels.
v_measure_score(y, y_pred)

0.703747024360433

In [61]:
# Rescale the features with MinMaxScaler, then repeat the agglomerative
# clustering.
# NOTE(review): X is rebound to the rescaled array, so re-running earlier
# cells after this one will no longer use the original data.
X = MinMaxScaler().fit_transform(X)
ac = AgglomerativeClustering(n_clusters=3).fit(X)
y_pred = ac.labels_

# V-measure against the reference labels.
v_measure_score(y, y_pred)

0.8929241488344335