In [9]:
import pandas as pd

train_df = pd.read_csv('dataset/klue_ner_data/konlpy_tagged_basic.csv')

kkma_nouns_list = sum(train_df['kkma_nouns'].apply(lambda x : list(x)), [])
hannanum_nouns_list = [' '.join(nouns) for nouns in train_df['hannanum_nouns']]
komoran_nouns_list = [' '.join(nouns) for nouns in train_df['komoran_nouns']]
mecab_nouns_list = [' '.join(nouns) for nouns in train_df['mecab_nouns']]
okt_nouns_list = [' '.join(nouns) for nouns in train_df['okt_nouns']]

from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(binary=True, max_features=25000)

kkma_embeddings = tfidf_vectorizer.fit_transform(kkma_nouns_list).toarray()
hannanum_embeddings = tfidf_vectorizer.fit_transform(hannanum_nouns_list).toarray()
komoran_embeddings = tfidf_vectorizer.fit_transform(komoran_nouns_list).toarray()
mecab_embeddings = tfidf_vectorizer.fit_transform(mecab_nouns_list).toarray()
okt_embeddings = tfidf_vectorizer.fit_transform(okt_nouns_list).toarray()

# 실루엣 스코어를 측정하고, 차트를 그려주는 함수
def visualize_silhouette(cluster_lists, X_features): 

    from sklearn.datasets import make_blobs
    from sklearn.cluster import KMeans
    from sklearn.metrics import silhouette_samples, silhouette_score

    import matplotlib.pyplot as plt
    import matplotlib.cm as cm
    import math
    from tqdm import tqdm

    # 입력값으로 클러스터링 갯수들을 리스트로 받아서, 각 갯수별로 클러스터링을 적용하고 실루엣 개수를 구함
    n_cols = len(cluster_lists)

    # plt.subplots()으로 리스트에 기재된 클러스터링 수만큼의 sub figures를 가지는 axs 생성 
    fig, axs = plt.subplots(figsize=(4*n_cols, 4), nrows=1, ncols=n_cols)

    # 리스트에 기재된 클러스터링 갯수들을 차례로 iteration 수행하면서 실루엣 개수 시각화
    for ind, n_cluster in enumerate(tqdm(cluster_lists,desc='kmeans fitting')):

        # KMeans 클러스터링 수행하고, 실루엣 스코어와 개별 데이터의 실루엣 값 계산. 
        clusterer = KMeans(n_clusters = n_cluster, max_iter=500, random_state=0)
        cluster_labels = clusterer.fit_predict(X_features)

        sil_avg = silhouette_score(X_features, cluster_labels)
        sil_values = silhouette_samples(X_features, cluster_labels)

        y_lower = 10
        axs[ind].set_title('Number of Cluster : '+ str(n_cluster)+'\n' \
                          'Silhouette Score :' + str(round(sil_avg,3)) )
        axs[ind].set_xlabel("The silhouette coefficient values")
        axs[ind].set_ylabel("Cluster label")
        axs[ind].set_xlim([-0.1, 1])
        axs[ind].set_ylim([0, len(X_features) + (n_cluster + 1) * 10])
        axs[ind].set_yticks([])  # Clear the yaxis labels / ticks
        axs[ind].set_xticks([0, 0.2, 0.4, 0.6, 0.8, 1])

        # 클러스터링 갯수별로 fill_betweenx( )형태의 막대 그래프 표현. 
        for i in tqdm(range(n_cluster),desc='plotting'):
            ith_cluster_sil_values = sil_values[cluster_labels==i]
            ith_cluster_sil_values.sort()

            size_cluster_i = ith_cluster_sil_values.shape[0]
            y_upper = y_lower + size_cluster_i

            color = cm.nipy_spectral(float(i) / n_cluster)
            axs[ind].fill_betweenx(np.arange(y_lower, y_upper), 0, ith_cluster_sil_values, \
                                facecolor=color, edgecolor=color, alpha=0.7)
            axs[ind].text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
            y_lower = y_upper + 10

        axs[ind].axvline(x=sil_avg, color="red", linestyle="--")
        
# 클러스터 갯수 설정 리스트로 변경 가능
cluster_list = [2,3,4,5,6,7]


########## kkma_embed_2d ##########
# TSNE로 차원축소하면 값이 2차원 (x, y)
from sklearn.manifold import TSNE
t_sne = TSNE()
kkma_embed_2d = t_sne.fit_transform(kkma_embeddings)

# 분석기 지정해주기 
embed_2d = kkma_embed_2d
visualize_silhouette(cluster_list, embed_2d)


########## hannanum_embed_2d ##########
# TSNE로 차원축소하면 값이 2차원 (x, y)
from sklearn.manifold import TSNE
t_sne = TSNE()
hannanum_embed_2d = t_sne.fit_transform(hannanum_embeddings)

# 분석기 지정해주기 
embed_2d = hannanum_embed_2d
visualize_silhouette(cluster_list, embed_2d)


########## komoran_embed_2d ##########
# TSNE로 차원축소하면 값이 2차원 (x, y)
from sklearn.manifold import TSNE
t_sne = TSNE()
komoran_embed_2d = t_sne.fit_transform(komoran_embeddings)

# 분석기 지정해주기 
embed_2d = komoran_embed_2d
visualize_silhouette(cluster_list, embed_2d)


########## mecab_embed_2d ########## 
# TSNE로 차원축소하면 값이 2차원 (x, y)
from sklearn.manifold import TSNE
t_sne = TSNE()
mecab_embed_2d = t_sne.fit_transform(mecab_embeddings)

# 분석기 지정해주기 
embed_2d = mecab_embed_2d
visualize_silhouette(cluster_list, embed_2d)

########## okt_embed_2d ########## 
# TSNE로 차원축소하면 값이 2차원 (x, y)
from sklearn.manifold import TSNE
t_sne = TSNE()
okt_embed_2d = t_sne.fit_transform(okt_embeddings)

# 분석기 지정해주기 
embed_2d = okt_embed_2d
visualize_silhouette(cluster_list, embed_2d)

ValueError: empty vocabulary; perhaps the documents only contain stop words