## 문서 군집화 - Opinion Review dataset

In [5]:
import pandas as pd
import glob, os

path = 'OpinosisDataset1.0/topics'
os.path.join(path, '*.data')    # Windows - \\, Mac - /

'OpinosisDataset1.0/topics/*.data'

In [6]:
all_files = glob.glob(os.path.join(path, '*.data'))
all_files[:5]

['OpinosisDataset1.0/topics/battery-life_ipod_nano_8gb.txt.data',
 'OpinosisDataset1.0/topics/gas_mileage_toyota_camry_2007.txt.data',
 'OpinosisDataset1.0/topics/room_holiday_inn_london.txt.data',
 'OpinosisDataset1.0/topics/location_holiday_inn_london.txt.data',
 'OpinosisDataset1.0/topics/staff_bestwestern_hotel_sfo.txt.data']

In [7]:
file = all_files[0]
file

'OpinosisDataset1.0/topics/battery-life_ipod_nano_8gb.txt.data'

In [8]:
file.split('\\')[-1].split('.')[0]

'OpinosisDataset1'

In [9]:
filename_list = []
opinion_text = []
for file in glob.glob(os.path.join(path, '*.data')):
    with open(file, encoding='latin1') as f:
        text = f.read()
    opinion_text.append(text)
    filename = file.split('\\')[-1].split('.')[0]
    filename_list.append(filename)

df = pd.DataFrame({'filename':filename_list, 'opinion':opinion_text})
df.head(3)

Unnamed: 0,filename,opinion
0,OpinosisDataset1,short battery life I moved up from an 8gb .\...
1,OpinosisDataset1,Ride seems comfortable and gas mileage fairly...
2,OpinosisDataset1,"We arrived at 23,30 hours and they could not r..."


- Simple tokenizer 함수를 이용해 feature 변환

In [10]:
from nltk import word_tokenize

def simple_tokenizer(text):             # 글자수가 2개 이하인 토큰은 제거
    return [word for word in word_tokenize(text) if len(word) > 2]

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

tvect = TfidfVectorizer(tokenizer=simple_tokenizer, stop_words='english',
                        ngram_range=(1,2), min_df=0.05, max_df=0.85)
feature = tvect.fit_transform(df.opinion)

- 군집화

In [12]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=5, max_iter=10000, random_state=2022)
kmeans.fit(feature)

KMeans(max_iter=10000, n_clusters=5, random_state=2022)

In [13]:
df['cluster'] = kmeans.labels_
df.head()

Unnamed: 0,filename,opinion,cluster
0,OpinosisDataset1,short battery life I moved up from an 8gb .\...,4
1,OpinosisDataset1,Ride seems comfortable and gas mileage fairly...,2
2,OpinosisDataset1,"We arrived at 23,30 hours and they could not r...",0
3,OpinosisDataset1,Great location for tube and we crammed in a f...,0
4,OpinosisDataset1,Staff are friendly and helpful .\n The staf...,0


In [14]:
df.cluster.value_counts()

0    16
3    11
1    10
2     9
4     5
Name: cluster, dtype: int64

In [15]:
# 3개의 집합으로 군집화
kmeans = KMeans(n_clusters=3, max_iter=10000, random_state=2022)
kmeans.fit(feature)
df['cluster_label'] = kmeans.labels_
df.cluster_label.value_counts()

2    25
0    16
1    10
Name: cluster_label, dtype: int64

- 군집별 핵심 단어 추출하기

In [16]:
feature.shape

(51, 4154)

In [17]:
cluster_centers = kmeans.cluster_centers_
cluster_centers.shape

(3, 4154)

In [18]:
from cluster import get_cluster_details

feature_names = tvect.get_feature_names()
cluster_details = get_cluster_details(cluster_model=kmeans, cluster_data=df,
                                      feature_names=feature_names, clusters_num=3, top_n_features=10)



In [19]:
for cluster_num, cluster_detail in cluster_details.items():
    print(f'####### Cluster {cluster_num}')
    print('Top features:', cluster_detail['top_features'])
    print('Reviews 파일명:', cluster_detail['filenames'][:7])
    print('==================================================')

####### Cluster 0
Top features: ['hotel', 'service', 'rooms', 'staff', 'room', 'food', 'location', 'clean', 'bathroom', 'parking']
Reviews 파일명: ['OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1']
####### Cluster 1
Top features: ['interior', 'mileage', 'seats', 'comfortable', 'gas', 'gas mileage', 'transmission', 'car', 'performance', 'quality']
Reviews 파일명: ['OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1']
####### Cluster 2
Top features: ['screen', 'battery', 'battery life', 'keyboard', 'kindle', 'life', 'directions', 'size', 'voice', 'speed']
Reviews 파일명: ['OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1', 'OpinosisDataset1']


In [1]:
# 왜저래