In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
from sklearn.datasets import load_iris

# Load the bundled iris dataset and wrap its measurement matrix in a
# DataFrame whose columns carry the dataset's own feature names.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2


In [3]:
from sklearn.cluster import KMeans

# Fit on the measurement columns only. Later cells append derived columns
# ('clustering', 'silh_coef') to df, so fitting on the whole frame would
# silently cluster on those as well whenever this cell is re-run.
kmeans_3 = KMeans(n_clusters = 3, init = 'k-means++', max_iter = 15, random_state = 2045)
kmeans_3.fit(df[iris.feature_names])

KMeans(max_iter=15, n_clusters=3, random_state=2045)

## Silhouette Analysis
#### Silhouette Coefficient

- 실루엣 계수(Silhouette Coefficient) 측정 지표
  - 개별 data points가 가지는 군집화 지표
  - data points가 같은 군집 내의 다른 data points와 얼마나 가깝게 군집되어 있고,
  - 다른 군집에 있는 data points와 얼마나 멀게 분리되어 있는지 나타내는 지표

- 각 군집이 서로 얼마나 효율적으로 분리되었는지 평가
  - 다른 군집과의 거리는 멀고, 같은 군집 내 data points 간의 거리는 가까울수록 잘 분리된 군집


In [4]:
# Attach the cluster label K-Means assigned to each row.
df = df.assign(clustering=kmeans_3.labels_)
df.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),clustering
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1


In [5]:
# Compute the silhouette coefficient of every individual data point.
from sklearn.metrics import silhouette_samples

silhouette_samples(X=iris.data, labels=df['clustering'])

array([0.85295506, 0.81549476, 0.8293151 , 0.80501395, 0.8493016 ,
       0.74828037, 0.82165093, 0.85390505, 0.75215011, 0.825294  ,
       0.80310303, 0.83591262, 0.81056389, 0.74615046, 0.70259371,
       0.64377156, 0.77568391, 0.85101831, 0.70685782, 0.82030124,
       0.78418399, 0.82590584, 0.79297218, 0.7941134 , 0.77503635,
       0.79865509, 0.83346695, 0.84201773, 0.84364429, 0.81784646,
       0.81518962, 0.79899235, 0.76272528, 0.72224615, 0.82877171,
       0.83224831, 0.79415322, 0.84188954, 0.76856774, 0.85033231,
       0.84941579, 0.63900017, 0.78657771, 0.80023815, 0.74698726,
       0.80977534, 0.81340268, 0.81902059, 0.8182324 , 0.85209835,
       0.02672203, 0.38118643, 0.05340075, 0.59294381, 0.36885321,
       0.59221025, 0.28232583, 0.26525405, 0.34419223, 0.57829491,
       0.37478707, 0.58710354, 0.55107857, 0.48216686, 0.56310057,
       0.32459291, 0.55751057, 0.61072967, 0.46149897, 0.6115753 ,
       0.32909528, 0.58968904, 0.31046301, 0.49424779, 0.50004

In [6]:
# Store each data point's silhouette coefficient next to its cluster label.
df = df.assign(silh_coef=silhouette_samples(X=iris.data, labels=df['clustering']))
df.head(3)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),clustering,silh_coef
0,5.1,3.5,1.4,0.2,1,0.852955
1,4.9,3.0,1.4,0.2,1,0.815495
2,4.7,3.2,1.3,0.2,1,0.829315


#### Silhouette Score

- data points의 Silhouette Coefficient의 평균

- 권장 Silhouette Score
  - Silhouette Coefficient는 -1 ~ 1 사이의 값을 가지며, 전체 평균(Silhouette Score)이 1에 가까운 경우
  - 개별 군집의 Silhouette Coefficient 평균들이 전체 Silhouette Coefficient 평균과 크게 차이나지 않는 경우

In [7]:
from sklearn.metrics import silhouette_score

# Overall silhouette score for the clustering stored in df['clustering'].
silhouette_score(X=iris.data, labels=df['clustering'])

0.5528190123564095

In [8]:
# Mean silhouette coefficient within each cluster.
df.groupby('clustering')['silh_coef'].agg('mean')

clustering
0    0.417320
1    0.798140
2    0.451105
Name: silh_coef, dtype: float64