In [1]:
# Mount Google Drive at /content/drive; the dataset path used by the later cells lives under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Ask for the keyword that selects which of the previously collected datasets to load.
query_keyword = input("분석을 위한 키워드를 입력하세요 : ")

# Candidate cluster counts are typed as "start,end,step" and expanded with range(),
# so `end` itself is excluded.
start, end, step = [int(x) for x in input("원하는 클러스터 갯수들을 입력하세요(e.g. 2,5,1)").split(",")]
xaxis = range(start, end, step)

# Fail here, next to the prompt, instead of deep inside the silhouette search:
# the silhouette score is undefined for fewer than 2 clusters, and an empty range
# would leave nothing to choose K from.
if len(xaxis) == 0:
    raise ValueError(f"The range ({start}, {end}, {step}) contains no candidate cluster counts.")
if min(xaxis) < 2:
    raise ValueError("Every candidate cluster count must be at least 2.")

분석을 위한 키워드를 입력하세요 : 코로나
원하는 클러스터 갯수들을 입력하세요(e.g. 2,5,1)2, 10, 2


In [3]:
import pickle
# Folder on the mounted Drive that holds the pickled dataset; the final figure is saved here too.
base_path = "/content/drive/MyDrive/Colab Notebooks/dataset/"

# Load the tokenized documents that were stored for the chosen keyword.
# NOTE(review): pickle.load can execute arbitrary code, so only open files you created yourself.
# Presumably a list with one token list per document (each entry is joined with spaces later) — confirm against the producer.
with open(base_path + f"tokenized_docs({query_keyword}).pk", "rb") as f:
    tokenized_docs = pickle.load(f)

In [4]:
# Clustering algorithm to run. The answer is trimmed and lower-cased because the cells
# below compare against the lower-case names "k-means" and "hac", while the prompt itself
# suggests typing "HAC" — without normalisation that answer would match no branch.
cluster_mode = input("사용할 클러스터링 방법을 선택하세요.\n(e.g. k-means, HAC) : ").strip().lower()

# Reject anything else right away rather than letting later cells skip silently.
if cluster_mode not in ("k-means", "hac"):
    raise ValueError(f"Unsupported clustering method {cluster_mode!r}; expected 'k-means' or 'hac'.")

사용할 클러스터링 방법을 선택하세요.
(e.g. k-means, HAC) : k-means


### 클러스터링을 위한 라이브러리 불러오기 

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

from sklearn.metrics import silhouette_score


import pickle
from time import time

### Text Encoding 

In [None]:
# Re-join every token list into one whitespace-separated string per document,
# which is the input form the vectorizer is given.
corpus = [" ".join(doc) for doc in tokenized_docs]

# TF-IDF encoding with min_df=5.
# Alternatives that were tried: CountVectorizer(), TfidfVectorizer(max_features=500).
vectorizer = TfidfVectorizer(min_df=5)

# .toarray() yields a plain ndarray. The former .todense() returned np.matrix, which
# recent scikit-learn releases refuse as estimator input.
X = vectorizer.fit_transform(corpus).toarray()

print("TF-IDF model : ", X.shape)

### Find the optimal K 

In [None]:
# Silhouette score obtained for each candidate number of clusters in xaxis (same order).
silhouettes = []

# dissimilarity would not be defined for a single cluster, thus, minimum number of clusters should be 2
for k in xaxis:
    # random_state makes the search repeatable (K-means starts from random centroids);
    # n_init is spelled out because its default differs between scikit-learn releases.
    skmeans = KMeans(n_clusters=k, n_init=10, random_state=42).fit(X)
    labels = skmeans.labels_
    silhouettes.append(silhouette_score(X, labels))

In [None]:
import matplotlib.pyplot as plt

# Line chart of the silhouette score for every candidate K.
fig, ax = plt.subplots(figsize=(12, 10))
ax.set_xticks(xaxis)
ax.set_title("Silhouette Score over K-means")
ax.set_xlabel("K")
ax.set_ylabel("Silhouette Score")
ax.plot(xaxis, silhouettes)
plt.show()

In [None]:
# Pick the candidate with the highest silhouette score. argmax returns a position inside
# `silhouettes`, which was filled in the order of `xaxis`, so the position has to be mapped
# back through xaxis. (The former "+ 2" was only right for a range starting at 2 with step 1.)
best_position = int(np.argmax(silhouettes))
K = xaxis[best_position]
print("Number of clusters :", K)

if cluster_mode == "hac":
    # For Hierarchical Clustering
    linkage = "average"  # other option noted by the author: "single"
    affinity = "cosine"


### 클러스터링 세팅 및 실행

In [None]:
# Cluster X into K groups with the selected algorithm, then report the silhouette score
# and the elapsed wall-clock time (fitting plus scoring).
if cluster_mode not in ("k-means", "hac"):
    # An unrecognised mode used to skip both branches silently, leaving the `labels`
    # of the K search in place for the cells that follow.
    raise ValueError(f"Unsupported cluster_mode {cluster_mode!r}; expected 'k-means' or 'hac'.")

start = time()

if cluster_mode == "k-means":
    print(f"Start {cluster_mode} Clustering")
    # n_init and random_state are fixed so that re-running the cell gives the same partition.
    estimator = KMeans(n_clusters=K, n_init=10, random_state=42)
else:
    print(f"Start {cluster_mode} Clustering with {linkage} linkage.")
    try:
        # Newer scikit-learn releases call the distance parameter `metric`.
        estimator = AgglomerativeClustering(n_clusters=K, metric=affinity, linkage=linkage)
    except TypeError:
        # Older releases only accept the original `affinity` keyword.
        estimator = AgglomerativeClustering(n_clusters=K, affinity=affinity, linkage=linkage)

method = estimator.fit(X)
labels = method.labels_

print(f"End {cluster_mode} Clustering")

sil_score = silhouette_score(X, labels)

end = time()

print(f"================  {cluster_mode}  =========================")
print("Silhoutte Score = %.3f" % sil_score)
print("Elapsed Time : %.2f" % (end - start))

### (OPTIONAL) Dimensionality Reduction with PCA 

In [None]:
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Fit PCA without limiting n_components so the whole explained-variance spectrum is available.
pca = PCA().fit(X)

# Running total of the explained-variance ratio: entry i covers the first i + 1 components.
ratio = np.cumsum(pca.explained_variance_ratio_)
# Despite its name, reduced_X only holds the 0-based positions 0 .. len(ratio) - 1 of `ratio`.
reduced_X = np.arange(len(ratio))
plt.plot(ratio)
# Red guide line at 90 % cumulative explained variance.
plt.axhline(y=0.9, color='r')
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');
# Interpolated (fractional) position at which the curve reaches 0.9, then the same value truncated.
# NOTE(review): this is a 0-based position, so the number of components needed to reach 90 %
# appears to be larger than the printed integer — confirm before using it as n_components.
print(np.interp(0.9, ratio, reduced_X))
print(int(np.interp(0.9, ratio, reduced_X)))

In [None]:
def pca_wv(X, D):
    """Project X onto its first D principal components.

    Args:
        X: feature matrix handed to PCA.fit_transform.
        D: number of components to keep (passed as n_components).

    Returns:
        The transformed data returned by PCA.fit_transform.

    Side effects:
        Prints the singular values of the fitted PCA.
    """
    reducer = PCA(n_components=D)
    projected = reducer.fit_transform(X)
    print(reducer.singular_values_)
    return projected

In [None]:
# Smallest number of leading components whose cumulative explained variance reaches 90 %.
# ratio[i] is the share explained by the first i + 1 components, so the count is the first
# position with ratio >= 0.9 plus one. (Truncating the interpolated 0-based position, as
# before, selected too few components to reach the threshold and could even produce 0.)
dim = min(int(np.searchsorted(ratio, 0.9)) + 1, len(ratio))
post_vectors = pca_wv(X, dim)

In [None]:
# From here on X is the PCA-reduced matrix; the clustering below and the later t-SNE cell use it.
X = post_vectors

print("After PCA : ", X.shape)

# Cluster the reduced X into K groups with the selected algorithm, then report the
# silhouette score and the elapsed wall-clock time (fitting plus scoring).
if cluster_mode not in ("k-means", "hac"):
    # An unrecognised mode used to skip both branches silently, leaving older `labels`
    # in place for the visualisation cells.
    raise ValueError(f"Unsupported cluster_mode {cluster_mode!r}; expected 'k-means' or 'hac'.")

start = time()

if cluster_mode == "k-means":
    print(f"Start {cluster_mode} Clustering")
    # n_init and random_state are fixed so that re-running the cell gives the same partition.
    estimator = KMeans(n_clusters=K, n_init=10, random_state=42)
else:
    print(f"Start {cluster_mode} Clustering with {linkage} linkage.")
    try:
        # Newer scikit-learn releases call the distance parameter `metric`.
        estimator = AgglomerativeClustering(n_clusters=K, metric=affinity, linkage=linkage)
    except TypeError:
        # Older releases only accept the original `affinity` keyword.
        estimator = AgglomerativeClustering(n_clusters=K, affinity=affinity, linkage=linkage)

method = estimator.fit(X)
labels = method.labels_

print(f"End {cluster_mode} Clustering")

sil_score = silhouette_score(X, labels)

end = time()

print(f"================  {cluster_mode}  =========================")
print("Silhoutte Score = %.3f" % sil_score)
print("Elapsed Time : %.2f" % (end - start))

### tSNE for Visualization 

In [None]:
import pandas as pd
# TSNE is otherwise imported only inside the optional PCA cell; importing it here keeps
# this cell runnable when that section is skipped.
from sklearn.manifold import TSNE

# Embed the matrix currently bound to X into 2 dimensions for plotting. X is what the most
# recent clustering cell ran on (the PCA section rebinds X to post_vectors), so the
# embedding lines up with `labels`.
# The former `if PCA_mode:` branch was dropped: PCA_mode is not defined anywhere in the
# visible notebook (NameError), and it would only have repeated the embedding on the data
# X already refers to after the PCA section.
tsne = TSNE(n_components = 2)
tsne_feature = tsne.fit_transform(X)

# One row per document: columns 0 and 1 are the t-SNE coordinates, 'label' is the cluster id.
tsne_df = pd.DataFrame(tsne_feature)
tsne_df['label'] = labels
tsne_df.info()

In [None]:
# Preview the first rows of the t-SNE table (coordinate columns 0 and 1 plus 'label').
tsne_df.head()

In [None]:
import seaborn as sns

# Scatter plot of the 2-D t-SNE embedding, one hue per cluster label, saved next to the dataset.
fig, ax = plt.subplots(figsize=(16, 16))
cluster_palette = sns.color_palette("hls", K)
sns.scatterplot(
    data=tsne_df,
    x=0,
    y=1,
    hue="label",
    palette=cluster_palette,
    alpha=0.7,
    ax=ax,
)
fig.savefig(base_path + 'tSNE_figure.png')