In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans

from sklearn.metrics import silhouette_samples, silhouette_score

In [2]:
iris = load_iris()
feature_names = ['sepal_length','sepal_width','petal_length','petal_width']

irisDF = pd.DataFrame(data=iris.data, columns=feature_names)
irisDF

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [3]:
from sklearn.metrics import silhouette_samples, silhouette_score

# iris 의 모든 개별 데이터에 실루엣 계수값을 구함. 
score_samples = silhouette_samples(iris.data, irisDF['cluster'])
print('silhouette_samples( ) return 값의 shape' , score_samples.shape)

# irisDF에 실루엣 계수 컬럼 추가
irisDF['silhouette_coeff'] = score_samples

# 모든 데이터의 평균 실루엣 계수값을 구함. 
average_score = silhouette_score(iris.data, irisDF['cluster'])
print('붓꽃 데이터셋 Silhouette Analysis Score:{0:.3f}'.format(average_score))

irisDF.head(3)

KeyError: 'cluster'

In [None]:
irisDF.groupby('cluster')['silhouette_coeff'].mean()
import warnings
# 모든 경고를 무시하도록 설정
warnings.filterwarnings('ignore')

clustering_inertia = []
##model.inertia_는 KMeans 군집화 모델에서 사용되는 속성 중 하나로, 
#군집화 결과에 대한 내부 응집도(군집내 분산)를 의미
for n in range(1, 11):
    model = KMeans(n_clusters = n, init = 'k-means++', n_init = 10, 
                  max_iter=300, random_state = 2020)
    model.fit(irisDF.iloc[:,0:4])
    clustering_inertia.append(model.inertia_)
    
plt.plot(np.arange(1,11), clustering_inertia, 'o')
plt.plot(np.arange(1,11), clustering_inertia, '-')
plt.xlabel('Number of Clusters')
plt.ylabel('inertia')
plt.show()