# **실습**

In [1]:
import os
# Pin the thread-count variables of the numeric backends (OpenMP, OpenBLAS,
# MKL, NumExpr) to "1". This cell runs before the numeric libraries are
# imported in the next cell.
os.environ.update(
    dict.fromkeys(
        (
            "OMP_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "MKL_NUM_THREADS",
            "NUMEXPR_NUM_THREADS",
        ),
        "1",
    )
)

In [2]:
import numpy as np
import pandas as pd
import re
import gensim
import matplotlib.pyplot as plt

from gensim.models.doc2vec import TaggedDocument
from gensim.models import Doc2Vec
from konlpy.tag import Okt
from tqdm import tqdm

from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.metrics.cluster import silhouette_score
from sklearn.cluster import AgglomerativeClustering
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# Load the crawl CSV ('한글 크롤링' = "Korean crawling") into a DataFrame.
# Later cells read its 'Review' column.
df = pd.read_csv('./Data/한글 크롤링.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Demo: remove special characters / digits from a sample sentence.
# Everything except Latin letters, Hangul syllables, whitespace and . ? !
# is replaced by a single space.
pattern = re.compile(r'[^a-zA-Z가-힣\s\.\?\!]')
string = pattern.sub(' ', 'DCX 프로젝트**** 중입니다.')
string

In [None]:
# Special-character / digit removal helper.
# The patterns are compiled once here instead of on every call.
_NON_TEXT_RE = re.compile(r'[^a-zA-Z가-힣\s\.\?\!]')
_WHITESPACE_RUN_RE = re.compile(r'\s+')


def re_pattern(string):
    """Remove digits and special characters, then collapse whitespace runs.

    Every character that is not a Latin letter, a Hangul syllable,
    whitespace or one of ``. ? !`` is replaced by a space; each run of
    whitespace is then reduced to one space. Leading and trailing spaces
    are not stripped.

    Args:
        string: Raw review text. Non-string values (for example the NaN
            pandas uses for an empty CSV cell, or None) are accepted.

    Returns:
        The cleaned text, or '' when ``string`` is not a ``str``.
    """
    # An empty CSV cell arrives as float NaN; re.sub would raise TypeError.
    if not isinstance(string, str):
        return ''

    cleaned = _NON_TEXT_RE.sub(' ', string)
    return _WHITESPACE_RUN_RE.sub(' ', cleaned)

In [None]:
# Clean every raw review and store the result next to the original text.
df['re_review'] = df['Review'].apply(re_pattern)

In [None]:
# Row/column count before short reviews are filtered out.
df.shape

In [None]:
# Remove short reviews: keep only rows whose raw 'Review' text has at least
# 15 characters.
# A boolean mask replaces the earlier row-by-row df.drop loop, which copied
# the whole frame on every hit and used the enumerate position as an index
# label (only valid while the index is the default 0..n-1 range).
# Missing reviews (NaN) have no length; the comparison is False for them, so
# they are dropped too instead of raising on len().
df = df[df['Review'].str.len() >= 15]

In [None]:
# Row/column count after the short reviews were removed.
df.shape

In [None]:
# Renumber the rows 0..n-1 and discard the old index. Later cells derive the
# 'document<N>' tags from positional counters (enumerate / range(len(df))).
df = df.reset_index(drop = True)

In [None]:
# Morpheme splitting: create the Okt tagger that okt_pos_tagging calls.

okt = Okt()

In [None]:
stopwords_df = pd.read_csv('./Data/ko-stopwords.csv') # stop-word dictionary

In [None]:
# Collect the 'stopwords' column into a set; okt_pos_tagging tests membership
# against it for every token.
stopwords = set(stopwords_df['stopwords'])

In [None]:
def okt_pos_tagging(string):
    """Tokenise text with Okt and keep selected parts of speech.

    Runs ``okt.pos`` with ``norm=True`` and ``stem=True`` and returns, in
    order, the word of every (word, tag) pair whose word is not in
    ``stopwords`` and whose tag is 'Noun', 'Adjective' or 'Verb'.

    Args:
        string: Text to analyse.

    Returns:
        List of the kept words.
    """
    kept_tags = {'Noun', 'Adjective', 'Verb'}
    tokens = []
    for word, tag in okt.pos(string, norm=True, stem=True):
        if word in stopwords:
            continue
        if tag in kept_tags:
            tokens.append(word)
    return tokens

In [None]:
# Enable progress_apply on pandas objects (progress bar), then tokenise every
# cleaned review.
tqdm.pandas()
df['tagged_review'] = df['re_review'].progress_apply(okt_pos_tagging)

In [None]:
# Vectorisation prep: wrap each token list in a TaggedDocument whose single
# tag is 'document<row position>'; that id is used later to look vectors up.
tagged_corpus_list = [
    TaggedDocument(words=tokens, tags=['document{}'.format(position)])
    for position, tokens in enumerate(df['tagged_review'])
]

In [None]:
# Configure an untrained Doc2Vec model: 200-dimensional vectors, a window of
# 3, every word kept (min_count = 1), and alpha / min_alpha of 0.025 / 0.01.
model_doc2vec = Doc2Vec(vector_size= 200, 
                       alpha = 0.025,
                       min_alpha = 0.01,
                       window = 3,
                       min_count = 1,
                       dm = 1 ) # dm = 1: context-based mode

In [None]:
# Build the model vocabulary from the tagged corpus before training.
model_doc2vec.build_vocab(tagged_corpus_list)

In [None]:
# Train for 100 epochs; total_examples reuses the model's own corpus_count.
model_doc2vec.train(tagged_corpus_list, total_examples=model_doc2vec.corpus_count, epochs=100)

In [None]:
# Inspect the learned vector stored under the tag of the first document.
model_doc2vec.dv["document0"]

In [None]:
# Add to the data frame: look up each row's vector by its 'document<N>' tag
# and store it in a new 'vector' column.
vector_list = [
    model_doc2vec.dv['document{}'.format(row)] for row in range(len(df))
]
df['vector'] = vector_list

In [None]:
# Preview the frame with the new 'vector' column.
df.head()

In [None]:
#pip install fastcluster 

In [None]:
import fastcluster as fc
# TruncatedSVD was used below without ever being imported (NameError).
from sklearn.decomposition import TruncatedSVD

# Stack the per-row vectors into one 2-D array and reduce it to 100
# components before the hierarchical clustering.
X = np.stack(df['vector'].to_numpy())
X_red = TruncatedSVD(n_components=100, random_state=42).fit_transform(X)

Z = fc.linkage_vector(X_red, method='ward')  # memory-saving
# Draw only the last 20 merged clusters so the plot stays readable.
plt.figure(figsize=(10,5))
dendrogram(Z, orientation='top', truncate_mode='lastp', p=20,
           distance_sort='descending', show_leaf_counts=False)
plt.show()

In [None]:
# Dendrogram on the full linkage matrix. Disabled by wrapping it in a string
# literal; presumably replaced by the fastcluster cell above.
# The distance_sort typo ('descendig') is fixed so the code works if it is
# re-enabled.
'''
model_linkage = linkage(list(df['vector']), 'ward')

plt.figure(figsize=(10, 5)) 
dendrogram(model_linkage,
          orientation='top',    
          distance_sort='descending',
          show_leaf_counts=False)
plt.show()
'''

In [None]:
# Split the documents into 3 clusters with Ward-linkage agglomerative
# clustering.
cluster_model = AgglomerativeClustering(n_clusters=3, linkage='ward')
cluster_label = cluster_model.fit_predict(list(df['vector']))
# Persist the labels on the frame right away: the TF-IDF cells below group
# rows by df['cluster'], and nothing else in the notebook creates that column.
df['cluster'] = cluster_label


In [None]:
# Check the silhouette score for each cluster count from 2 to 6.
# The sweep uses its own names so it does not overwrite cluster_model /
# cluster_label from the 3-cluster fit, and it stacks the vectors once
# instead of rebuilding the list twice per iteration.

n_cluster = []
clustering_score = []

sweep_vectors = np.stack(df['vector'].to_numpy())

for k in tqdm(range(2, 7)):
    sweep_labels = AgglomerativeClustering(n_clusters=k, linkage='ward').fit_predict(sweep_vectors)

    n_cluster.append(k)
    clustering_score.append(silhouette_score(sweep_vectors, sweep_labels))


In [None]:
# Tabulate the silhouette score obtained for each cluster count.
pd.DataFrame({'n_cluster':n_cluster, 'clustering_score':clustering_score})

In [None]:
# Plot silhouette score against the number of clusters.
plt.plot(n_cluster, clustering_score)

In [None]:
# TF-IDF input: one long document per cluster, in the order the labels first
# appear in df['cluster']. Each review contributes its tokens joined by
# spaces, followed by one trailing space.
all_documents = [
    "".join(
        " ".join(tokens) + ' '
        for tokens in df.loc[df['cluster'] == label, 'tagged_review']
    )
    for label in df['cluster'].unique()
]


In [None]:
# Fit TF-IDF over the per-cluster documents (one row per cluster).
# NOTE(review): with default settings TfidfVectorizer's token_pattern keeps
# only tokens of two or more word characters, so single-syllable Korean words
# are dropped here — confirm this is intended.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(all_documents) # matrix

In [None]:
feature_name = vectorizer.get_feature_names_out() # extract the keywords (vocabulary terms)
feature_name

In [None]:
tfidf_value = tfidf_matrix.toarray() # convert the matrix to an array

In [None]:
# One row per cluster document, one column per vocabulary term.
tfidf_df = pd.DataFrame(tfidf_value, columns = feature_name)

In [None]:
# Label the rows with the cluster ids (the same unique() order that was used
# to build all_documents), then transpose so that each column is a cluster
# and each row a term.
tfidf_df.index = df['cluster'].unique()
tfidf_df = tfidf_df.T
tfidf_df

In [None]:
# Export one CSV per cluster: all terms ranked by descending TF-IDF value.
for cluster_id in tfidf_df.columns:
    ranked = tfidf_df[cluster_id].sort_values(ascending=False)
    pd.DataFrame(
        {'tfidf_word': ranked.index, 'tfidf_value': ranked.values}
    ).to_csv(r"/content/sample_data/cluster_{}.csv".format(cluster_id), encoding = 'utf-8-sig')

In [None]:
import pickle

# Persist the whole frame (including the columns added above) with pickle.
with open('clusterin_result.pkl','wb') as f :
    pickle.dump(df, f)