##07. 문서 군집화 소개와 실습 (Opinion Review 데이터 세트)

**문서 군집화 개념**
- 문서 군집화: 비숫한 텍스트 구성의 문서를 군집화하는 방법
  - 동일한 군집에 속하는 문서를 같은 카테고리 소속으로 분류 -> 텍스트 분류 기반의 문서 분류와 유사함
  - 문서 군집화는 텍스트 분류 기반의 문서 분류와 달리 학습 데이터 세트가 필요없는 비지도학습 기반임
  

**Opinion Reviwe 데이터 세트를 이용한 문서 군집화 수행**
- 데이터세트 구성: 51개의 텍스트 파일 구성


In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# 필수 NLTK 데이터 다운로드 (필요시)
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')

def LemNormalize(text):
  lemmatizer = WordNetLemmatizer()
  return [lemmatizer.lemmatize(token) for token in word_tokenize(text)]

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
!ls "/content/drive/MyDrive/Colab Notebooks/OpinosisDataset1.0/topics"


accuracy_garmin_nuvi_255W_gps.txt.data
bathroom_bestwestern_hotel_sfo.txt.data
battery-life_amazon_kindle.txt.data
battery-life_ipod_nano_8gb.txt.data
battery-life_netbook_1005ha.txt.data
buttons_amazon_kindle.txt.data
comfort_honda_accord_2008.txt.data
comfort_toyota_camry_2007.txt.data
directions_garmin_nuvi_255W_gps.txt.data
display_garmin_nuvi_255W_gps.txt.data
eyesight-issues_amazon_kindle.txt.data
features_windows7.txt.data
fonts_amazon_kindle.txt.data
food_holiday_inn_london.txt.data
food_swissotel_chicago.txt.data
free_bestwestern_hotel_sfo.txt.data
gas_mileage_toyota_camry_2007.txt.data
interior_honda_accord_2008.txt.data
interior_toyota_camry_2007.txt.data
keyboard_netbook_1005ha.txt.data
location_bestwestern_hotel_sfo.txt.data
location_holiday_inn_london.txt.data
mileage_honda_accord_2008.txt.data
navigation_amazon_kindle.txt.data
parking_bestwestern_hotel_sfo.txt.data
performance_honda_accord_2008.txt.data
performance_netbook_1005ha.txt.data
price_amazon_kindle.txt.data
pri

In [6]:
import pandas as pd
import glob, os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

path = r'/content/drive/MyDrive/Colab Notebooks/OpinosisDataset1.0/topics'

# path로 지정한 디렉터리 밑에 있는 모든 .data 파일들의 파일명을 리스트로 취합
all_files = glob.glob(os.path.join(path, "*.txt.data"))

filename_list = []
opinion_text = []

# 개별 파일들의 파일명은 filename_list 리스트로 취합,
# 개별 파일들의 파일 내용은 DataFrame 로딩 후 다시 string으로 변환하여 opinion_text 리스트로 취합
for file_ in all_files:
    with open(file_, 'r', encoding='latin1') as f:
        text = f.read()
    # 절대경로로 주어진 파일명을 가공. Linux에서 수행시에는 아래 \\를 / 변경.
    # 맨 마지막 .data 확장자도 제거
    filename_ = file_.split('\\')[-1]
    filename = os.path.splitext(os.path.basename(file_))[0]

    # 파일명 리스트와 파일 내용 리스트에 파일명과 파일 내용을 추가
    filename_list.append(filename)
    opinion_text.append(text)

# 파일명 리스트와 파일 내용 리스트를 DataFrame으로 생성
document_df = pd.DataFrame({'filename': filename_list, 'opinion_text': opinion_text})
document_df.head()


Unnamed: 0,filename,opinion_text
0,battery-life_ipod_nano_8gb.txt,short battery life I moved up from an 8gb .\...
1,accuracy_garmin_nuvi_255W_gps.txt,", and is very, very accurate .\n but for the m..."
2,voice_garmin_nuvi_255W_gps.txt,The voice prompts and maps are wonderful esp...
3,display_garmin_nuvi_255W_gps.txt,3 quot widescreen display was a bonus .\n Thi...
4,features_windows7.txt,"I had to uninstall anti, virus and selected o..."


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(tokenizer=LemNormalize, stop_words='english', ngram_range=(1,2), min_df=0.05, max_df=0.85)

#opinion_text 컬럼값으로 feature vectorization 수행
feature_vect = tfidf_vect.fit_transform(document_df['opinion_text'])



In [8]:
from sklearn.cluster import KMeans

# 5개 집합으로 군집화 수행. 예제를 위해 동일한 클러스터링 결과 도출용 random_state=0
km_cluster = KMeans(n_clusters=5, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_
cluster_centers = km_cluster.cluster_centers_

In [9]:
document_df['cluster_label'] = cluster_label
document_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,battery-life_ipod_nano_8gb.txt,short battery life I moved up from an 8gb .\...,2
1,accuracy_garmin_nuvi_255W_gps.txt,", and is very, very accurate .\n but for the m...",3
2,voice_garmin_nuvi_255W_gps.txt,The voice prompts and maps are wonderful esp...,3
3,display_garmin_nuvi_255W_gps.txt,3 quot widescreen display was a bonus .\n Thi...,3
4,features_windows7.txt,"I had to uninstall anti, virus and selected o...",1


In [10]:
document_df[document_df['cluster_label']==0].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
37,staff_bestwestern_hotel_sfo.txt,Staff are friendly and helpful .\n The staf...,0
27,staff_swissotel_chicago.txt,The staff at Swissotel were not particularly ...,0


In [11]:
document_df[document_df['cluster_label']==1].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
39,comfort_honda_accord_2008.txt,"Drivers seat not comfortable, the car itself ...",1
26,comfort_toyota_camry_2007.txt,Ride seems comfortable and gas mileage fairly...,1
4,features_windows7.txt,"I had to uninstall anti, virus and selected o...",1
19,gas_mileage_toyota_camry_2007.txt,Ride seems comfortable and gas mileage fairly...,1
47,interior_honda_accord_2008.txt,I love the new body style and the interior is...,1
18,interior_toyota_camry_2007.txt,"First of all, the interior has way too many c...",1
14,keyboard_netbook_1005ha.txt,", I think the new keyboard rivals the great h...",1
40,mileage_honda_accord_2008.txt,"It's quiet, get good gas mileage and looks cl...",1
48,performance_honda_accord_2008.txt,"Very happy with my 08 Accord, performance is q...",1
25,performance_netbook_1005ha.txt,The Eee Super Hybrid Engine utility lets user...,1


In [12]:
document_df[document_df['cluster_label']==2].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
34,battery-life_amazon_kindle.txt,After I plugged it in to my USB hub on my com...,2
0,battery-life_ipod_nano_8gb.txt,short battery life I moved up from an 8gb .\...,2
6,battery-life_netbook_1005ha.txt,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ...",2
21,buttons_amazon_kindle.txt,I thought it would be fitting to christen my K...,2
29,eyesight-issues_amazon_kindle.txt,It feels as easy to read as the K1 but doesn'...,2
36,fonts_amazon_kindle.txt,Being able to change the font sizes is awesome...,2
20,navigation_amazon_kindle.txt,"In fact, the entire navigation structure has ...",2
31,price_amazon_kindle.txt,"If a case was included, as with the Kindle 1,...",2
11,screen_ipod_nano_8gb.txt,"As always, the video screen is sharp and brig...",2
7,screen_netbook_1005ha.txt,Keep in mind that once you get in a room full...,2


In [13]:
document_df[document_df['cluster_label']==3].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
1,accuracy_garmin_nuvi_255W_gps.txt,", and is very, very accurate .\n but for the m...",3
10,directions_garmin_nuvi_255W_gps.txt,You also get upscale features like spoken dir...,3
3,display_garmin_nuvi_255W_gps.txt,3 quot widescreen display was a bonus .\n Thi...,3
15,satellite_garmin_nuvi_255W_gps.txt,It's fast to acquire satellites .\n If you've...,3
9,screen_garmin_nuvi_255W_gps.txt,It is easy to read and when touching the scr...,3
12,speed_garmin_nuvi_255W_gps.txt,Another feature on the 255w is a display of th...,3
13,speed_windows7.txt,"Windows 7 is quite simply faster, more stable,...",3
17,updates_garmin_nuvi_255W_gps.txt,Another thing to consider was that I paid $50 ...,3
2,voice_garmin_nuvi_255W_gps.txt,The voice prompts and maps are wonderful esp...,3


In [14]:
document_df[document_df['cluster_label']==4].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
43,bathroom_bestwestern_hotel_sfo.txt,"The room was not overly big, but clean and ve...",4
35,food_holiday_inn_london.txt,The room was packed to capacity with queues a...,4
32,food_swissotel_chicago.txt,The food for our event was delicious .\n The ...,4
46,free_bestwestern_hotel_sfo.txt,The wine reception is a great idea as it is ni...,4
50,location_bestwestern_hotel_sfo.txt,"Good Value good location , ideal choice .\nGr...",4
30,location_holiday_inn_london.txt,Great location for tube and we crammed in a f...,4
42,parking_bestwestern_hotel_sfo.txt,Parking was expensive but I think this is com...,4
22,price_holiday_inn_london.txt,"All in all, a normal chain hotel on a nice loc...",4
24,room_holiday_inn_london.txt,"We arrived at 23,30 hours and they could not r...",4
38,rooms_bestwestern_hotel_sfo.txt,"Great Location , Nice Rooms , Helpless Con...",4


In [15]:
from sklearn.cluster import KMeans

# 3개의 집합으로 군집화
km_cluster = KMeans(n_clusters=3, max_iter=10000, random_state=0)
km_cluster.fit(feature_vect)
cluster_label = km_cluster.labels_

# 소속 클러스터를 cluster_label 카럼으로 할당하고 cluster_label 값으로 정렬
document_df['cluster_label'] = cluster_label
document_df.sort_values(by='cluster_label')

Unnamed: 0,filename,opinion_text,cluster_label
24,room_holiday_inn_london.txt,"We arrived at 23,30 hours and they could not r...",0
27,staff_swissotel_chicago.txt,The staff at Swissotel were not particularly ...,0
30,location_holiday_inn_london.txt,Great location for tube and we crammed in a f...,0
28,service_swissotel_hotel_chicago.txt,Mediocre room and service for a very extravag...,0
23,rooms_swissotel_chicago.txt,The Swissotel is one of our favorite hotels in...,0
22,price_holiday_inn_london.txt,"All in all, a normal chain hotel on a nice loc...",0
50,location_bestwestern_hotel_sfo.txt,"Good Value good location , ideal choice .\nGr...",0
44,service_bestwestern_hotel_sfo.txt,Both of us having worked in tourism for over ...,0
43,bathroom_bestwestern_hotel_sfo.txt,"The room was not overly big, but clean and ve...",0
42,parking_bestwestern_hotel_sfo.txt,Parking was expensive but I think this is com...,0


**군집별 핵심 단어 추출하기**
- 각 군집에 속한 문서는 핵심 단어를 주축으로 군집화가 되어 있음
  - 이를 각 군집을 구성하는 핵심 단어에 어떤 것이 있는지 확인하고자 함
- KMeans 객체는 각 군집을 구성하는 단어 피처가 군집의 중심을 기준으로 얼마나 가깝게 위치해 있는지 clusters_centers라는 속성으로 제고함
  - clusters_centers : 배열값으로 제공, 행은 개별 군집, 열은 개별 피처를 의미
    - 배열 내 값 == 좌표

    

In [16]:
cluster_centers = km_cluster.cluster_centers_
print('cluster_centers shape:', cluster_centers.shape)
print(cluster_centers)

cluster_centers shape: (3, 5872)
[[0.00062303 0.00105424 0.         ... 0.00171768 0.00137133 0.00137133]
 [0.0006491  0.0005168  0.         ... 0.         0.         0.        ]
 [0.00153507 0.00141655 0.00177951 ... 0.         0.         0.        ]]


- cluster_centers의 속성: 넘파이 ndarray

In [27]:
# 군집별 top n 핵심 단어, 그 단어의 중심 위치 상댓값, 대상 파일명을 반환함.
def get_cluster_details(cluster_model, cluster_data, feature_names, clusters_num, top_n_features=0):
  cluster_details={}

  # cluster_centers array의 값이 큰 순으로 정렬된 인덱스 값을 반환
  # 군집 중심점(centroid) 별 할당된 word 피처들의 거리값이 큰 순으로 값을 구하기 위함
  centroid_feature_ordered_ind = cluster_model.cluster_centers_.argsort()[:, ::-1]

  # 개별 군집별로 반복하면서 핵심 단어, 그 단어의 중심 위치 상댓값, 대상 파일명 입력
  for cluster_num in range(clusters_num):
    # 개별 군집별 정보를 담을 데이터 초기화.
    cluster_details[cluster_num]={}
    cluster_details[cluster_num]['cluster'] = cluster_num

    # cluster_centers_.argsort()[:,::-1]로 구한 인덱스를 이용해 top n 피처 단어를 구함.
    top_feature_indexes = centroid_feature_ordered_ind[cluster_num, :top_n_features]
    top_features = [feature_names[ind] for ind in top_feature_indexes]

    # top_feature_indexes 를 이용해 해당 피처 단어의 중심 위치 상댓값 구함.
    top_feature_values = cluster_model.cluster_centers_[cluster_num, top_feature_indexes].tolist()

    # cluster_details 딕셔너리 객체에 개별 군집별 핵심단어와 중심위치 상댓값, 해당 파일명 입력
    cluster_details[cluster_num]['top_features'] = top_features
    cluster_details[cluster_num]['top_features_value'] = top_feature_values
    filenames = cluster_data[cluster_data['cluster_label'] == cluster_num]['filename']
    filename = filenames.values.tolist()

    cluster_details[cluster_num]['filenames'] = filenames

  return cluster_details

In [28]:
def print_cluster_details(cluster_details):
  for cluster_num, cluster_detail in cluster_details.items():
    print('####### Cluster {0}'.format(cluster_num))
    print('Top features:', cluster_detail['top_features'])
    print('Reviews 파일명:', cluster_detail['filenames'][:7])
    print('============================================')

In [29]:
feature_names = tfidf_vect.get_feature_names_out()

cluster_details = get_cluster_details(cluster_model=km_cluster, cluster_data=document_df, feature_names=feature_names, clusters_num=3, top_n_features=10)
print_cluster_details(cluster_details)

####### Cluster 0
Top features: ['room', 'hotel', 'service', 'staff', 'food', 'location', '. room', 'clean', 'bathroom', 'price']
Reviews 파일명: 22           price_holiday_inn_london.txt
23            rooms_swissotel_chicago.txt
24            room_holiday_inn_london.txt
27            staff_swissotel_chicago.txt
28    service_swissotel_hotel_chicago.txt
30        location_holiday_inn_london.txt
32             food_swissotel_chicago.txt
Name: filename, dtype: object
####### Cluster 1
Top features: ['interior', 'seat', 'mileage', 'performance', 'comfortable', 'gas', 'quality', 'gas mileage', 'car', 'voice']
Reviews 파일명: 2          voice_garmin_nuvi_255W_gps.txt
3        display_garmin_nuvi_255W_gps.txt
4                   features_windows7.txt
8                 sound_ipod_nano_8gb.txt
10    directions_garmin_nuvi_255W_gps.txt
12         speed_garmin_nuvi_255W_gps.txt
15     satellite_garmin_nuvi_255W_gps.txt
Name: filename, dtype: object
####### Cluster 2
Top features: ['screen', 'battery',