# 문서 군집화 (Document Clustering)

In [1]:
import numpy as np 
import pandas as pd  

In [2]:
import os, glob 

# NOTE: machine-specific absolute path. Point this at your local copy of the
# Opinosis `topics` directory before running.
path = 'C:\\skn_17\\nlp\\01_text_analysis\\data\\OpinosisDataset1.0\\topics'

# glob returns matches in arbitrary, OS-dependent order; sorting keeps the row
# order (and therefore everything derived from it) reproducible across machines.
all_files = sorted(glob.glob(os.path.join(path, '*.data')))


def topic_name_from_path(file_path):
    """Return the topic name encoded in a review file path.

    The directory part is removed with the platform's own separator rules,
    then everything from the first '.' of the file name onward is dropped
    (so 'x.txt.data' -> 'x').

    Args:
        file_path: Path to a review file, as returned by glob.

    Returns:
        The file name without its directory and without any extensions.
    """
    return os.path.basename(file_path).split('.')[0]


filename_list = []
opinions_list = []

for file_ in all_files:
    # Each file is read as a header-less table; latin1 is the encoding used for these files.
    df = pd.read_table(file_, header=None, index_col=None, encoding='latin1')

    filename = topic_name_from_path(file_)
    filename_list.append(filename)

    # Flatten the whole table into one text blob per topic file.
    opinions = df.to_string(index=False, header=False)
    opinions_list.append(opinions)

In [3]:
# One row per topic file: the topic name next to its full review text.
document_df = pd.DataFrame(dict(filename=filename_list, opinions=opinions_list))

document_df

Unnamed: 0,filename,opinions
0,accuracy_garmin_nuvi_255W_gps,...
1,bathroom_bestwestern_hotel_sfo,...
2,battery-life_amazon_kindle,...
3,battery-life_ipod_nano_8gb,...
4,battery-life_netbook_1005ha,...
5,buttons_amazon_kindle,...
6,comfort_honda_accord_2008,...
7,comfort_toyota_camry_2007,...
8,directions_garmin_nuvi_255W_gps,...
9,display_garmin_nuvi_255W_gps,...


### 특성 벡터화 및 전처리 
- TfidfVectorizer를 이용한 벡터화 
- 불용어 처리 
- ngram 설정
- 표제어 추출 (Lemmatization)

In [4]:
import nltk

# 'wordnet' backs WordNetLemmatizer. The Punkt models back nltk.word_tokenize,
# which this notebook calls later; without them a fresh environment raises
# LookupError. Newer NLTK releases ship the models as 'punkt_tab', older ones
# as 'punkt', so both are requested (an unknown id just reports an error and
# returns False).
for resource in ('wordnet', 'punkt', 'punkt_tab'):
    nltk.download(resource)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Playdata\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
from nltk.stem import WordNetLemmatizer

# Sanity check: treated as verbs, both inflected forms reduce to the same lemma.
lemmatizer = WordNetLemmatizer()
for word in ('running', 'ran'):
    print(lemmatizer.lemmatize(word, pos='v'))

run
run


In [6]:
import string
string.punctuation    # string constant containing the ASCII punctuation characters

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [7]:
def lemmatize(text):
    """Turn raw text into a list of verb-lemmatized tokens.

    Pipeline: lowercase the text, delete every character found in
    ``string.punctuation``, tokenize with ``nltk.word_tokenize``, then
    lemmatize each token with WordNet using the verb part of speech.

    Args:
        text: Raw input string.

    Returns:
        A list of lemmatized tokens, in their original order.
    """
    # Deletion table: every punctuation code point maps to None.
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(strip_punct)

    words = nltk.word_tokenize(cleaned)

    # Every token is lemmatized as a verb (pos='v'), regardless of its actual role.
    wnl = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemmas.append(wnl.lemmatize(word, pos='v'))
    return lemmas

# Quick check on a sample sentence: punctuation is removed and the tokens come
# back lowercased and verb-lemmatized (e.g. 'is' -> 'be').
lemmatize('The Matrix is everywhere its all around us, here even in this room!!!!! You can see it out your window or on your television!!!!!')

['the',
 'matrix',
 'be',
 'everywhere',
 'its',
 'all',
 'around',
 'us',
 'here',
 'even',
 'in',
 'this',
 'room',
 'you',
 'can',
 'see',
 'it',
 'out',
 'your',
 'window',
 'or',
 'on',
 'your',
 'television']

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams, tokenized by the custom lemmatizer.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=lemmatize,
    token_pattern=None,   # ignored when a tokenizer is supplied; None silences sklearn's warning about it
    stop_words='english',
    ngram_range=(1, 2),
    max_df=0.85,          # ignore terms that appear in more than 85% of the documents
    min_df=0.05           # ignore terms that appear in fewer than 5% of the documents
)

opinions_vecs = tfidf_vectorizer.fit_transform(document_df['opinions'])
# Read the shape directly from the sparse matrix; .toarray() would allocate a
# full dense copy just to report the same numbers.
print(opinions_vecs.shape)
print(opinions_vecs)



(51, 4072)
<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 25383 stored elements and shape (51, 4072)>
  Coords	Values
  (0, 165)	0.7742542979210241
  (0, 1463)	0.07204716607359458
  (0, 3407)	0.016130297873354668
  (0, 2815)	0.03322048155310359
  (0, 1016)	0.1613798782436557
  (0, 1866)	0.020663607871398704
  (0, 1444)	0.013478481520354605
  (0, 1053)	0.007936997735176827
  (0, 2031)	0.01013758802410648
  (0, 360)	0.015122124086220925
  (0, 2358)	0.02934179604430104
  (0, 3165)	0.017183123909565296
  (0, 3550)	0.012166058278124673
  (0, 2226)	0.019655434084264962
  (0, 2686)	0.015122124086220925
  (0, 1459)	0.017322714375150577
  (0, 1799)	0.018011791518398646
  (0, 3143)	0.05196814312545173
  (0, 2540)	0.016699368276168712
  (0, 2266)	0.17743327660690134
  (0, 2803)	0.016699368276168712
  (0, 443)	0.015122124086220925
  (0, 1910)	0.04757120962562307
  (0, 1532)	0.2161414982207837
  (0, 3798)	0.017322714375150577
  :	:
  (50, 232)	0.012712026490235473
  (50, 2966)	0.0127

### KMeans 군집화

In [9]:
from sklearn.cluster import KMeans

# Partition the TF-IDF vectors into 4 clusters; the fixed seed makes the run repeatable.
kmeans = KMeans(n_clusters=4, max_iter=5000, random_state=0)

# Fit, then take the per-document cluster labels from the fitted model.
opinions_label = kmeans.fit(opinions_vecs).labels_
document_df['cluster'] = opinions_label
document_df

Unnamed: 0,filename,opinions,cluster
0,accuracy_garmin_nuvi_255W_gps,...,2
1,bathroom_bestwestern_hotel_sfo,...,1
2,battery-life_amazon_kindle,...,0
3,battery-life_ipod_nano_8gb,...,2
4,battery-life_netbook_1005ha,...,2
5,buttons_amazon_kindle,...,0
6,comfort_honda_accord_2008,...,3
7,comfort_toyota_camry_2007,...,3
8,directions_garmin_nuvi_255W_gps,...,2
9,display_garmin_nuvi_255W_gps,...,2


In [10]:
document_df[document_df['cluster'] == 3]  # show only the documents assigned to cluster 3

Unnamed: 0,filename,opinions,cluster
6,comfort_honda_accord_2008,...,3
7,comfort_toyota_camry_2007,...,3
16,gas_mileage_toyota_camry_2007,...,3
17,interior_honda_accord_2008,...,3
18,interior_toyota_camry_2007,...,3
22,mileage_honda_accord_2008,...,3
25,performance_honda_accord_2008,...,3
29,quality_toyota_camry_2007,...,3
37,seats_honda_accord_2008,...,3
47,transmission_toyota_camry_2007,...,3


In [11]:
# One centroid row per cluster, one column per TF-IDF feature.
centers = kmeans.cluster_centers_
print(centers.shape)

feature_names = tfidf_vectorizer.get_feature_names_out()

# Sort feature indices by ascending weight within each centroid, then reverse
# each row so the heaviest features come first.
centroid_arg_idx = np.flip(np.argsort(centers, axis=1), axis=1)
top_20 = centroid_arg_idx[:, :20]

# Map the indices back to terms: the 20 highest-weighted terms of each cluster centroid.
feature_names[top_20]

(4, 4072)


array([['kindle', 'price', 'page', 'button', 'font', 'battery', 'eye',
        'book', 'faster', 'navigation', 'font size', 'read', 'navigate',
        'easy', 'vista', 'screen', 'easy eye', 'kindle 2', 'hotel',
        'size'],
       ['room', 'service', 'hotel', 'staff', 'food', 'location', 'clean',
        'bathroom', 'park', 'room service', 'free', 'stay', 'friendly',
        'great location', 'wine', 'helpful', 'bed', 'breakfast', 'wharf',
        'tube'],
       ['screen', 'battery', 'keyboard', 'battery life', 'directions',
        'voice', 'map', 'life', 'video', 'feature', 'speed', 'display',
        'size', 'speed limit', 'accurate', 'satellite', 'update',
        'sound', 'performance', 'limit'],
       ['interior', 'seat', 'mileage', 'comfortable', 'gas',
        'gas mileage', 'transmission', 'car', 'performance', 'quality',
        'ride', 'comfort', 'camry', 'drive', 'toyota',
        'seat comfortable', 'accord', 'exterior', 'uncomfortable',
        'honda']], dtype=obj