### 문서 군집화 - Opinion Review Dataset

In [54]:
import os, glob
import pandas as pd

In [55]:
path = '../data/OpinosisDataset1.0/topics/'
os.path.join(path, '*.data')


'../data/OpinosisDataset1.0/topics/*.data'

In [56]:
all_files = glob.glob(os.path.join(path, '*.data'))
print(len(all_files))
print(all_files[:5])

51
['../data/OpinosisDataset1.0/topics\\accuracy_garmin_nuvi_255W_gps.txt.data', '../data/OpinosisDataset1.0/topics\\bathroom_bestwestern_hotel_sfo.txt.data', '../data/OpinosisDataset1.0/topics\\battery-life_amazon_kindle.txt.data', '../data/OpinosisDataset1.0/topics\\battery-life_ipod_nano_8gb.txt.data', '../data/OpinosisDataset1.0/topics\\battery-life_netbook_1005ha.txt.data']


In [57]:
file = all_files[0]
filename = file.split('\\')[-1].split('.')[0]
filename

'accuracy_garmin_nuvi_255W_gps'

- filename과 opnion으로 구성된 DataFrame 만들기

In [58]:
filename_list, opnion_list = [], []
for file in all_files:
    with open(file, encoding='latin1') as f:
        opnion = f.read()
    opnion_list.append(opnion)
    filename = file.split('\\')[-1].split('.')[0]
    filename_list.append(filename)

df = pd.DataFrame({'filename':filename_list, 'opnion':opnion_list})
df.head(3)
    

Unnamed: 0,filename,opnion
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n but for the m..."
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve..."
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my com...


- Feature 변환
    - simple_tokenizer() 함수 제작
    - TfidfVectorizer

In [59]:
from nltk import word_tokenize

def simple_tokenizer(text):
    return [word for word in word_tokenize(text) if len(word) > 2]

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvect = TfidfVectorizer(tokenizer=simple_tokenizer, stop_words='english', 
                        ngram_range=(1, 2), min_df=0.05, max_df=0.95)
feature = tvect.fit_transform(df.opnion) # X_data



- 군집화

In [61]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, random_state=2023, n_init='auto')
kmeans.fit(feature)

In [62]:
df['cluster_label'] = kmeans.labels_
df.head()

Unnamed: 0,filename,opnion,cluster_label
0,accuracy_garmin_nuvi_255W_gps,", and is very, very accurate .\n but for the m...",0
1,bathroom_bestwestern_hotel_sfo,"The room was not overly big, but clean and ve...",1
2,battery-life_amazon_kindle,After I plugged it in to my USB hub on my com...,3
3,battery-life_ipod_nano_8gb,short battery life I moved up from an 8gb .\...,3
4,battery-life_netbook_1005ha,"6GHz 533FSB cpu, glossy display, 3, Cell 23Wh ...",3


In [63]:
df.cluster_label.value_counts()

cluster_label
0    16
2    13
3    10
4     9
1     3
Name: count, dtype: int64

In [64]:
from sklearn.metrics import silhouette_score, silhouette_samples

silhouette_score(feature, kmeans.labels_)

0.07128403618144952

In [65]:
for n in [2, 3, 4, 5, 6]:
    kmeans = KMeans(n_clusters=n, random_state=2023, n_init='auto')
    kmeans.fit(feature)
    score = silhouette_score(feature, kmeans.labels_)
    print(f'군집갯수 : {n}, 실루엣 스코어 : {score:.4f}')

군집갯수 : 2, 실루엣 스코어 : 0.0511
군집갯수 : 3, 실루엣 스코어 : 0.0530
군집갯수 : 4, 실루엣 스코어 : 0.0639
군집갯수 : 5, 실루엣 스코어 : 0.0713
군집갯수 : 6, 실루엣 스코어 : 0.0743


- 중심 단어 추출하기

In [66]:
centers = kmeans.cluster_centers_
centers

array([[0.        , 0.00396275, 0.01014125, ..., 0.        , 0.        ,
        0.01010242],
       [0.        , 0.        , 0.0046054 , ..., 0.00328993, 0.        ,
        0.        ],
       [0.        , 0.00363082, 0.00696748, ..., 0.00127411, 0.        ,
        0.        ],
       [0.        , 0.00757668, 0.00146196, ..., 0.        , 0.        ,
        0.00277388],
       [0.00363278, 0.0004939 , 0.00030527, ..., 0.        , 0.0032441 ,
        0.        ],
       [0.        , 0.00503352, 0.00416591, ..., 0.        , 0.00114262,
        0.00455755]])

In [67]:
from cluster import get_cluster_details

In [69]:

feature_names = tvect.get_feature_names_out()
cluster_details = get_cluster_details(
    cluster_model=kmeans, cluster_data=df, feature_names=feature_names,
    clusters_num=5, top_n_features=10
)

In [72]:
for cluster_num, cluster_detail in cluster_details.items():
    print(f'##### Cluster {cluster_num}')
    print('Top features: ', cluster_detail['top_features'])
    print('Reviews 파일명 : ', cluster_detail['filenames'])
    print('==================================================')

##### Cluster 0
Top features:  ['screen', 'directions', 'kindle', 'voice', 'font', 'page', 'accurate', 'features', 'map', 'eyes']
Reviews 파일명 :  ['accuracy_garmin_nuvi_255W_gps', 'buttons_amazon_kindle', 'directions_garmin_nuvi_255W_gps', 'display_garmin_nuvi_255W_gps', 'eyesight-issues_amazon_kindle', 'features_windows7', 'fonts_amazon_kindle', 'keyboard_netbook_1005ha', 'navigation_amazon_kindle', 'price_amazon_kindle', 'satellite_garmin_nuvi_255W_gps', 'screen_garmin_nuvi_255W_gps', 'screen_ipod_nano_8gb', 'speed_garmin_nuvi_255W_gps', 'updates_garmin_nuvi_255W_gps', 'voice_garmin_nuvi_255W_gps']
##### Cluster 1
Top features:  ['rooms', 'bathroom', 'clean', 'room', 'hotel', 'rooms clean', 'small', 'sink', 'nice', 'bathrooms']
Reviews 파일명 :  ['bathroom_bestwestern_hotel_sfo', 'rooms_bestwestern_hotel_sfo', 'rooms_swissotel_chicago']
##### Cluster 2
Top features:  ['service', 'hotel', 'staff', 'food', 'room', 'location', 'parking', 'price', 'room service', 'free']
Reviews 파일명 :  ['foo