# plot_kmeans.py
# forked from amueller/introduction_to_ml_with_python
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances
import matplotlib.pyplot as plt
import matplotlib as mpl
from cycler import cycler
from .tools import discrete_scatter
from .plot_2d_separator import plot_2d_classification
from .plot_helpers import cm3
def _scatter_step(ax, title, X, labels, centers):
    """Draw one panel: points of ``X`` by ``labels`` plus three centers.

    Sets ``title`` on ``ax``, scatters the rows of ``X`` with circle markers
    according to ``labels`` and the rows of ``centers`` (labelled 0, 1, 2) with
    triangle markers. Returns the result of the ``discrete_scatter`` call for
    the centers so the caller can reuse it as legend handles.
    """
    ax.set_title(title)
    discrete_scatter(X[:, 0], X[:, 1], labels, markers=['o'], ax=ax)
    return discrete_scatter(centers[:, 0], centers[:, 1], [0, 1, 2],
                            ax=ax, markers=['^'], markeredgewidth=2)


def plot_kmeans_algorithm():
    """Plot a 3x3 grid of panels walking through three k-means iterations.

    The data comes from ``make_blobs(random_state=1)`` and the first three
    samples are used as the initial centers. Panels, in order: input data,
    initialization, then alternating "assign points" / "recompute centers"
    steps for iterations 1-3. The last panel only holds the legend.

    The intermediate states are obtained by fitting ``KMeans`` with
    ``max_iter`` set to 1, 2 and 3 from the same explicit ``init``.
    """
    X, _ = make_blobs(random_state=1)
    # we don't want cyan in there
    palette = cycler('color', ['#0000aa', '#ff2020', '#50ff50'])
    with mpl.rc_context(rc={'axes.prop_cycle': palette}):
        _, axes = plt.subplots(3, 3, figsize=(10, 8),
                               subplot_kw={'xticks': (), 'yticks': ()})
        axes = axes.ravel()

        # Panel 0: raw, unlabelled points.
        axes[0].set_title("입력 데이터")
        discrete_scatter(X[:, 0], X[:, 1], ax=axes[0], markers=['o'], c='w')

        # Panel 1: unlabelled points plus the initial centers.
        init = X[:3, :]
        axes[1].set_title("초기화")
        discrete_scatter(X[:, 0], X[:, 1], ax=axes[1], markers=['o'], c='w')
        discrete_scatter(init[:, 0], init[:, 1], [0, 1, 2], ax=axes[1],
                         markers=['^'], markeredgewidth=2)

        def fit(n_iter):
            # Explicit init array + n_init=1: per the scikit-learn KMeans docs
            # random_state only affects centroid initialization, so each
            # max_iter value needs to be fitted only once.
            return KMeans(n_clusters=3, init=init, max_iter=n_iter,
                          n_init=1).fit(X)

        km_1, km_2, km_3 = fit(1), fit(2), fit(3)

        # need to compute labels by hand. scikit-learn does two e-steps for
        # max_iter=1 (and it's totally my fault)
        first_labels = np.argmin(pairwise_distances(init, X), axis=0)

        _scatter_step(axes[2], "포인트 할당 (1)", X, first_labels, init)
        _scatter_step(axes[3], "중심 재계산 (1)", X, first_labels,
                      km_1.cluster_centers_)
        # The original refitted KMeans(max_iter=1) here with identical
        # arguments; the first fit is reused instead.
        _scatter_step(axes[4], "포인트 재할당 (2)", X, km_1.labels_,
                      km_1.cluster_centers_)
        _scatter_step(axes[5], "중심 재계산 (2)", X, km_1.labels_,
                      km_2.cluster_centers_)
        # Keep the center handles of this panel for the legend.
        markers = _scatter_step(axes[6], "포인트 재할당 (3)", X, km_2.labels_,
                                km_2.cluster_centers_)
        _scatter_step(axes[7], "중심 재계산 (3)", X, km_2.labels_,
                      km_3.cluster_centers_)

        # Panel 8 shows only the legend.
        axes[8].set_axis_off()
        axes[8].legend(markers, ["클러스터 0", "클러스터 1", "클러스터 2"],
                       loc='best')
def plot_kmeans_boundaries():
    """Scatter a 3-cluster k-means result with its classification regions.

    Fits ``KMeans`` (two iterations, first three samples as initial centers)
    on ``make_blobs(random_state=1)``, draws the points by assigned label and
    the centers as triangles on the current axes, then passes the fitted
    model to ``plot_2d_classification`` with the ``cm3`` colormap.
    """
    X, _ = make_blobs(random_state=1)
    start_centers = X[:3, :]

    model = KMeans(n_clusters=3, init=start_centers, max_iter=2, n_init=1)
    model.fit(X)
    fitted_centers = model.cluster_centers_

    # Points as circles, coloured by the cluster they were assigned to.
    discrete_scatter(X[:, 0], X[:, 1], model.labels_, markers=['o'])
    # Cluster centers as triangles, labelled 0..2.
    discrete_scatter(fitted_centers[:, 0], fitted_centers[:, 1], [0, 1, 2],
                     markers=['^'], markeredgewidth=2)
    plot_2d_classification(model, X, cm=cm3, alpha=.4)
def plot_kmeans_faces(km, pca, X_pca, X_people, y_people, target_names):
    """Plot, per cluster, its center and its nearest / farthest member images.

    Draws a 10 x 11 grid. For each of the first 10 clusters of ``km`` the row
    shows: the cluster center mapped back through ``pca.inverse_transform``
    (column 0), the 5 members closest to the center (columns 1-5) and the 5
    members farthest from it (columns 6-10, most distant last).

    Parameters
    ----------
    km : fitted clusterer exposing ``cluster_centers_`` and ``labels_``.
        NOTE(review): the grid is hard-coded to 10 rows, so this assumes at
        least 10 clusters -- confirm against the caller.
    pca : object with an ``inverse_transform`` method applied to a center.
    X_pca : array of samples in the space of ``km.cluster_centers_``; used to
        compute squared Euclidean distances to each center.
    X_people : array whose rows are reshaped to ``(87, 65)`` images.
        Displayed with ``vmin=0, vmax=1`` (presumably pixel values in [0, 1]).
    y_people : integer labels indexing into ``target_names``.
    target_names : sequence of name strings; only the last
        whitespace-separated token is used as the image title.

    Only cluster members are considered. A cluster with fewer than 5 members
    leaves the remaining cells empty; the original +/-inf masking would fill
    them with samples from other clusters.
    """
    n_clusters = 10
    image_shape = (87, 65)
    n_shown = 5
    _, axes = plt.subplots(n_clusters, 11,
                           subplot_kw={'xticks': (), 'yticks': ()},
                           figsize=(10, 15), gridspec_kw={"hspace": .3})

    for cluster in range(n_clusters):
        center = km.cluster_centers_[cluster]
        # Rank only the samples assigned to this cluster, nearest first.
        members = np.flatnonzero(km.labels_ == cluster)
        sq_dists = np.sum((X_pca[members] - center) ** 2, axis=1)
        ranked = members[np.argsort(sq_dists)]

        axes[cluster, 0].imshow(
            pca.inverse_transform(center).reshape(image_shape),
            vmin=0, vmax=1)

        # Nearest go to columns 1-5, farthest to columns 6-10, so the two
        # groups stay aligned with the annotation boxes for small clusters.
        groups = ((ranked[:n_shown], axes[cluster, 1:1 + n_shown]),
                  (ranked[-n_shown:], axes[cluster, 1 + n_shown:]))
        for inds, group_axes in groups:
            for image, label, ax in zip(X_people[inds], y_people[inds],
                                        group_axes):
                ax.imshow(image.reshape(image_shape), vmin=0, vmax=1)
                surname = target_names[label].split()[-1]
                ax.set_title("%s" % surname, fontdict={'fontsize': 9})

    # add some boxes to illustrate which are similar and which dissimilar
    # Each box is anchored in a top-row axes and drawn unclipped so it can
    # extend past that axes. Widths/heights are the original fixed values,
    # presumably hand-tuned for this 10-row, 87x65-pixel layout.
    annotations = ((0, 73, "중심"),
                   (1, 385, "중심에서 가까운 이미지"),
                   (6, 385, "중심에서 먼 이미지"))
    for column, width, caption in annotations:
        anchor_ax = axes[0, column]
        box = plt.Rectangle([-5, -30], width, 1295, fill=False, lw=2)
        box = anchor_ax.add_patch(box)
        box.set_clip_on(False)
        anchor_ax.text(0, -40, caption)