In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

# 문서 군집화(Document Clustering)

- 비슷한 텍스트 구성의 문서를 군집화하는 것
- 동일한 군집에 속하는 문서를 같은 카테고리 소속으로 분류할 수 있으므로 텍스트 분류 기반의 문서 분류와 유사하나
- 텍스트 분류 기반의 문서 분류는 사전에 결정된 카테고리 값을 가진 학습 데이터 세트가 필요한 데 반해, 문서 군집화는 학습 데이터 세트가 필요 없는 비지도 학습 기반으로 동작

## 문서 군집화 실습

### 예제 데이터

- UCI머신러닝 리포지토리에 있는 Opinion Review 데이터
    - 출처 : https://archive.ics.uci.edu/dataset/191/opinosis+opinion+frasl+review
- 51개 텍스트 파일로 구성
    - Tripadvisor(호텔), Edmunds.com(자동차), Amazon.com(전자제품) 사이트에서 가져온 리뷰 문서
    - 각 문서는 약 100개 정도의 문장을 가지고 있음  

### 데이터 준비

- 파일을 읽고 데이터 프레임으로 변환

In [4]:
import os, glob

In [7]:
# Directory holding the Opinosis topic files (one '*.data' file per review topic).
# NOTE(review): machine-specific absolute path; point it at your local copy of the dataset.
path = r'C:\workspace_multi07\web\12_machine learning\data\OpinosisDataset1.0\topics'

# glob returns matches in an OS-dependent order; sorting keeps the row order reproducible.
all_files = sorted(glob.glob(os.path.join(path, '*.data')))


def load_opinion_docs(file_paths):
    """Read each file in `file_paths` and collect the results in one DataFrame.

    Parameters
    ----------
    file_paths : iterable of str
        Paths of the text files to read, in the order the rows should appear.

    Returns
    -------
    pandas.DataFrame
        One row per file with two columns: 'filename' (the file's base name up
        to its first '.') and 'opinion_text' (the file parsed with
        `pd.read_table` and rendered with `DataFrame.to_string()`).
        The frame is empty when `file_paths` is empty.
    """
    filenames = []
    texts = []
    for file_path in file_paths:
        # NOTE(review): read_table takes the first line as the header and to_string()
        # also renders the row index, so both end up in the text — confirm this is intended.
        topic_df = pd.read_table(file_path, index_col=None, encoding='latin1')
        texts.append(topic_df.to_string())
        # basename() strips the directory with the platform's own separator;
        # splitting on '\\' only works for Windows-style paths.
        filenames.append(os.path.basename(file_path).split('.')[0])

    return pd.DataFrame({'filename': filenames, 'opinion_text': texts})


doc_df = load_opinion_docs(all_files)
doc_df.head(3)

Unnamed: 0,filename,opinion_text
0,accuracy_garmin_nuvi_255W_gps,...
1,bathroom_bestwestern_hotel_sfo,...
2,battery-life_amazon_kindle,...


### 토큰화

#### 토큰화 및 어근 추출을 위한 함수 LemNormalize(text) 정의
- 소문자로 변환한 후 구두점을 제거(punct_dict 이용)
- nltk.word_tokenize() 메서드로 단어 토큰화
- LemTokens(tokens) 함수 호출하여 토큰화한 단어의 원형으로 변환
- WordNetLemmatizer클래스에서 제공하는 lemmatize(token)을 이용하여 Lemmatization을 구현

In [8]:
from nltk.stem import WordNetLemmatizer
import nltk
import string

In [13]:
# string.punctuation is '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'.
# Translation table for str.translate(): every punctuation code point maps to None,
# which deletes that character.
punct_dict = str.maketrans('', '', string.punctuation)

# Shared WordNetLemmatizer instance; it returns the base (dictionary) form of a word,
# e.g. plural -> singular.
lemmar = WordNetLemmatizer()

def LemTokens(tokens):
    """Return a list holding `lemmar.lemmatize(token)` for every token, in order."""
    lemmas = []
    for word in tokens:
        lemmas.append(lemmar.lemmatize(word))
    return lemmas

def LemNormalize(text):
    """Lower-case `text`, delete punctuation, tokenize it and lemmatize each token.

    Punctuation is removed through the `punct_dict` translation table, words are
    split with `nltk.word_tokenize`, and the tokens are handed to `LemTokens`,
    whose list is returned.
    """
    lowered = text.lower()
    without_punct = lowered.translate(punct_dict)
    words = nltk.word_tokenize(without_punct)
    return LemTokens(words)

In [14]:
# Quick check: the input is lower-cased and the trailing '!!!' is dropped.
LemNormalize('This is Good!!!')

['this', 'is', 'good']

#### TF-IDF벡터화

- tokenizer로 LemNormalize() 함수를 이용

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Passing a callable as `tokenizer` replaces the vectorizer's built-in regex tokenization.
# token_pattern is therefore unused; setting it to None states that explicitly instead of
# relying on the global warnings filter to hide scikit-learn's "token_pattern will not be
# used" warning.
# NOTE(review): min_df=0.5 keeps only terms found in at least half of the documents —
# confirm this threshold (rather than e.g. 0.05) is intended.
tf = TfidfVectorizer(
    tokenizer=LemNormalize,
    token_pattern=None,
    stop_words='english',
    ngram_range=(1, 2),   # unigrams and bigrams
    min_df=0.5,
    max_df=0.85,
)

# Learn the vocabulary and IDF weights, then return the document-term matrix.
ftr_vect = tf.fit_transform(doc_df['opinion_text'])

In [16]:
# (number of documents, number of TF-IDF features)
ftr_vect.shape

(51, 130)

In [17]:
# Convert the TF-IDF matrix to a dense NumPy array for inspection.
ftr_vect.toarray()

array([[0.29125784, 0.        , 0.        , ..., 0.06483071, 0.        ,
        0.05825157],
       [0.        , 0.        , 0.        , ..., 0.        , 0.05658632,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.31138932, 0.        ,
        0.13989448],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.17480704],
       [0.01585217, 0.01803886, 0.01803886, ..., 0.        , 0.08266723,
        0.07926083],
       [0.0388652 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.0777304 ]])

### 군집화
- 군집화 기법 : K-Means 적용

In [18]:
from sklearn.cluster import KMeans

# Partition the TF-IDF document vectors into 5 clusters; fit() returns the fitted estimator.
kmeans = KMeans(n_clusters=5, max_iter=10000, random_state=0).fit(ftr_vect)

# Centroids (one row per cluster) and the cluster index assigned to each document.
cluster_centers = kmeans.cluster_centers_
cluster_labels = kmeans.labels_

In [21]:
# Store each document's K-Means label as a new 'cluster_label' column (modifies doc_df in place).
doc_df['cluster_label'] = cluster_labels
doc_df.head()

Unnamed: 0,filename,opinion_text,cluster_label
0,accuracy_garmin_nuvi_255W_gps,...,3
1,bathroom_bestwestern_hotel_sfo,...,2
2,battery-life_amazon_kindle,...,1
3,battery-life_ipod_nano_8gb,...,1
4,battery-life_netbook_1005ha,...,1


#### 군집화 결과 확인

- 'cluster_label'=0인 문서들

In [22]:
# Documents assigned to cluster 0, ordered alphabetically by file name.
doc_df.loc[doc_df['cluster_label'] == 0].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
6,comfort_honda_accord_2008,...,0
7,comfort_toyota_camry_2007,...,0
16,gas_mileage_toyota_camry_2007,...,0
17,interior_honda_accord_2008,...,0
18,interior_toyota_camry_2007,...,0
22,mileage_honda_accord_2008,...,0
24,parking_bestwestern_hotel_sfo,...,0
25,performance_honda_accord_2008,...,0
29,quality_toyota_camry_2007,...,0
37,seats_honda_accord_2008,...,0


- 'cluster_label'=1인 문서들

In [23]:
# Documents assigned to cluster 1, ordered alphabetically by file name.
doc_df.loc[doc_df['cluster_label'] == 1].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
2,battery-life_amazon_kindle,...,1
3,battery-life_ipod_nano_8gb,...,1
4,battery-life_netbook_1005ha,...,1
5,buttons_amazon_kindle,...,1
10,eyesight-issues_amazon_kindle,...,1
11,features_windows7,...,1
12,fonts_amazon_kindle,...,1
19,keyboard_netbook_1005ha,...,1
23,navigation_amazon_kindle,...,1
26,performance_netbook_1005ha,...,1


- 'cluster_label'=2인 문서들

In [24]:
# Documents assigned to cluster 2, ordered alphabetically by file name.
doc_df.loc[doc_df['cluster_label'] == 2].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
1,bathroom_bestwestern_hotel_sfo,...,2
15,free_bestwestern_hotel_sfo,...,2
20,location_bestwestern_hotel_sfo,...,2
21,location_holiday_inn_london,...,2
28,price_holiday_inn_london,...,2
32,room_holiday_inn_london,...,2
30,rooms_bestwestern_hotel_sfo,...,2
31,rooms_swissotel_chicago,...,2
45,staff_bestwestern_hotel_sfo,...,2
46,staff_swissotel_chicago,...,2


- 'cluster_label'=3인 문서들

In [25]:
# Documents assigned to cluster 3, ordered alphabetically by file name.
doc_df.loc[doc_df['cluster_label'] == 3].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
0,accuracy_garmin_nuvi_255W_gps,...,3
8,directions_garmin_nuvi_255W_gps,...,3
9,display_garmin_nuvi_255W_gps,...,3
33,satellite_garmin_nuvi_255W_gps,...,3
34,screen_garmin_nuvi_255W_gps,...,3
43,speed_garmin_nuvi_255W_gps,...,3
48,updates_garmin_nuvi_255W_gps,...,3
50,voice_garmin_nuvi_255W_gps,...,3


- 'cluster_label'=4인 문서들

In [27]:
# Documents assigned to cluster 4, ordered alphabetically by file name.
doc_df.loc[doc_df['cluster_label'] == 4].sort_values(by='filename')

Unnamed: 0,filename,opinion_text,cluster_label
13,food_holiday_inn_london,...,4
14,food_swissotel_chicago,...,4
38,service_bestwestern_hotel_sfo,...,4
39,service_holiday_inn_london,...,4
40,service_swissotel_hotel_chicago,...,4


#### 3개 집합으로 군집화

In [29]:
# Repeat the clustering with 3 groups; this rebinds kmeans, cluster_centers and cluster_labels.
kmeans = KMeans(n_clusters=3, max_iter=10000, random_state=0).fit(ftr_vect)
cluster_centers = kmeans.cluster_centers_
cluster_labels = kmeans.labels_

# Written to its own column so the earlier assignment in 'cluster_label' is kept.
doc_df['cluster_label2'] = cluster_labels
doc_df.head()

Unnamed: 0,filename,opinion_text,cluster_label,cluster_label2
0,accuracy_garmin_nuvi_255W_gps,...,3,1
1,bathroom_bestwestern_hotel_sfo,...,2,2
2,battery-life_amazon_kindle,...,1,1
3,battery-life_ipod_nano_8gb,...,1,1
4,battery-life_netbook_1005ha,...,1,1


In [35]:
# For each of the 3 clusters, print the alphabetically sorted names of its documents.
for i in range(3):
    in_cluster = doc_df.cluster_label2 == i
    filenames = doc_df[in_cluster].sort_values('filename')['filename']
    print(f'cluster {i}:\n{filenames.values}\n')

cluster 0:
['comfort_honda_accord_2008' 'comfort_toyota_camry_2007'
 'gas_mileage_toyota_camry_2007' 'interior_honda_accord_2008'
 'interior_toyota_camry_2007' 'mileage_honda_accord_2008'
 'performance_honda_accord_2008' 'quality_toyota_camry_2007'
 'seats_honda_accord_2008' 'sound_ipod_nano_8gb'
 'transmission_toyota_camry_2007']

cluster 1:
['accuracy_garmin_nuvi_255W_gps' 'battery-life_amazon_kindle'
 'battery-life_ipod_nano_8gb' 'battery-life_netbook_1005ha'
 'buttons_amazon_kindle' 'directions_garmin_nuvi_255W_gps'
 'display_garmin_nuvi_255W_gps' 'eyesight-issues_amazon_kindle'
 'features_windows7' 'fonts_amazon_kindle' 'keyboard_netbook_1005ha'
 'navigation_amazon_kindle' 'performance_netbook_1005ha'
 'satellite_garmin_nuvi_255W_gps' 'screen_garmin_nuvi_255W_gps'
 'screen_ipod_nano_8gb' 'screen_netbook_1005ha' 'size_asus_netbook_1005ha'
 'speed_garmin_nuvi_255W_gps' 'speed_windows7'
 'updates_garmin_nuvi_255W_gps' 'video_ipod_nano_8gb'
 'voice_garmin_nuvi_255W_gps']

cluster 2:
[

### 군집(Cluster)별 핵심 단어 추출하기
- 군집에 속한 문서는 핵심 단어를 주축으로 군집화되어 있음
- 각 군집을 구성하는 핵심 단어를 추출
- KMeans 객체는 각 군집을 구성하는 단어 피처가 군집의 중심(centroid)을 기준으로 얼마나 가깝게 위치해 있는지 cluster_centers_ 속성으로 제공

**cluster_centers_**
- 배열 값으로 행은 개별 군집, 열은 개별 피처를 의미

In [36]:
# (number of clusters, number of features) of the centroid matrix.
kmeans.cluster_centers_.shape

(3, 130)

#### 군집별 핵심 단어를 찾기 위한 함수 get_cluster_details() 정의

- 함수 인수
    - cluster_model : KMeans 군집화 객체
    - cluster_data : 군집 데이터
    - feature_names : 피처 이름
    - cluster_num : 군집 수
    - top_n_feature : 군집별로 추출할 상위 핵심 단어(피처) 개수

In [45]:
def get_cluster_details(cluster_model, cluster_data, feature_names, cluster_num, top_n_feature,
                        label_col='cluster_label2'):
    """Collect the heaviest centroid features and the member files of each cluster.

    Parameters
    ----------
    cluster_model : object
        Fitted clustering model exposing a 2-D `cluster_centers_` array
        (rows are clusters, columns are features).
    cluster_data : pandas.DataFrame
        Frame with a 'filename' column and a cluster-label column named by `label_col`.
    feature_names : sequence
        Feature names, indexed by centroid column position.
    cluster_num : int
        Number of clusters to report; clusters 0 .. cluster_num - 1 are processed.
    top_n_feature : int
        How many of the highest-valued centroid features to keep per cluster.
    label_col : str, default 'cluster_label2'
        Column of `cluster_data` that holds each row's cluster label.

    Returns
    -------
    dict
        Maps each cluster index to a dict with the keys 'cluster', 'top_ftrs',
        'top_ftrs_values' and 'filenames'.
    """
    centers = cluster_model.cluster_centers_
    # argsort is ascending; reversing the columns ranks features by descending centroid value.
    ranked_idx = centers.argsort()[:, ::-1]

    details = {}
    for n in range(cluster_num):
        top_idx = ranked_idx[n, :top_n_feature]
        members = cluster_data.loc[cluster_data[label_col] == n, 'filename']
        details[n] = {
            'cluster': n,
            'top_ftrs': [feature_names[i] for i in top_idx],
            'top_ftrs_values': centers[n, top_idx].tolist(),
            'filenames': members.values.tolist(),
        }
    return details

#### print_cluster_details(cluster_details) 함수 정의

In [46]:
def print_cluster_details(cluster_details):
    """Print a short text report for every cluster in `cluster_details`.

    For each (cluster id, detail dict) pair, in the mapping's iteration order,
    writes the cluster id, the detail's 'top_ftrs' entry and its 'filenames'
    entry to stdout, followed by one blank line. Returns None.
    """
    for cluster_id, info in cluster_details.items():
        print(f'-- cluster {cluster_id}')
        top_features = info['top_ftrs']
        print(f"TOP features: {top_features}")
        member_files = info['filenames']
        print(f"Filenames: {member_files}")
        print()

- 군집별 핵심 단어 추출

In [47]:
# Feature names from the fitted vectorizer; get_cluster_details indexes them by centroid column.
ftr_names = tf.get_feature_names_out()

# Top 10 centroid features and member files for each of the 3 clusters.
details = get_cluster_details(
    cluster_model=kmeans,
    cluster_data=doc_df,
    feature_names=ftr_names,
    cluster_num=3,
    top_n_feature=10,
)
details

{0: {'cluster': 0,
  'top_ftrs': ['car',
   'comfortable',
   'quality',
   'look',
   'love',
   'new',
   'problem',
   'feel',
   'hard',
   'excellent'],
  'top_ftrs_values': [0.4191706463844818,
   0.3174262011188621,
   0.27826138137460127,
   0.14045631161785385,
   0.10741482330348945,
   0.08947573945859684,
   0.08914844645915125,
   0.08466134518634005,
   0.07048777826461795,
   0.06845535233970032],
  'filenames': ['comfort_honda_accord_2008',
   'comfort_toyota_camry_2007',
   'gas_mileage_toyota_camry_2007',
   'interior_honda_accord_2008',
   'interior_toyota_camry_2007',
   'mileage_honda_accord_2008',
   'performance_honda_accord_2008',
   'quality_toyota_camry_2007',
   'seats_honda_accord_2008',
   'sound_ipod_nano_8gb',
   'transmission_toyota_camry_2007']},
 1: {'cluster': 1,
  'top_ftrs': ['turn',
   'easy',
   'feature',
   'size',
   'use',
   'fast',
   'read',
   'new',
   'small',
   'using'],
  'top_ftrs_values': [0.21313991779008684,
   0.21105265801436454

In [48]:
# Print the top features and member file names of each cluster in a compact form.
print_cluster_details(details)

-- cluster 0
TOP features: ['car', 'comfortable', 'quality', 'look', 'love', 'new', 'problem', 'feel', 'hard', 'excellent']
Filenames: ['comfort_honda_accord_2008', 'comfort_toyota_camry_2007', 'gas_mileage_toyota_camry_2007', 'interior_honda_accord_2008', 'interior_toyota_camry_2007', 'mileage_honda_accord_2008', 'performance_honda_accord_2008', 'quality_toyota_camry_2007', 'seats_honda_accord_2008', 'sound_ipod_nano_8gb', 'transmission_toyota_camry_2007']

-- cluster 1
TOP features: ['turn', 'easy', 'feature', 'size', 'use', 'fast', 'read', 'new', 'small', 'using']
Filenames: ['accuracy_garmin_nuvi_255W_gps', 'battery-life_amazon_kindle', 'battery-life_ipod_nano_8gb', 'battery-life_netbook_1005ha', 'buttons_amazon_kindle', 'directions_garmin_nuvi_255W_gps', 'display_garmin_nuvi_255W_gps', 'eyesight-issues_amazon_kindle', 'features_windows7', 'fonts_amazon_kindle', 'keyboard_netbook_1005ha', 'navigation_amazon_kindle', 'performance_netbook_1005ha', 'satellite_garmin_nuvi_255W_gps', 's

----