In [2]:
import json
from collections import Counter
import pandas as pd
import numpy as np
import glob
from konlpy.tag import Kkma
import re
import tqdm

import gensim
from gensim import corpora, models
from gensim.models import CoherenceModel
import warnings
warnings.filterwarnings('ignore')

import pyLDAvis
import pyLDAvis.gensim 
import matplotlib.pyplot as plt

In [4]:
glob.glob('./data_ko/*.json')

['./data_ko/ko_review_jeju.json',
 './data_ko/ko_review_jeonju.json',
 './data_ko/ko_review_daejeon.json',
 './data_ko/ko_review_daegu.json',
 './data_ko/ko_review_seogwipo.json',
 './data_ko/ko_review_yeosu.json',
 './data_ko/ko_review_gangneung.json',
 './data_ko/ko_review_seoul.json',
 './data_ko/ko_review_gwangju.json']

In [28]:
review_lst = ['./data_ko/ko_review_jeju.json',
 './data_ko/ko_review_seogwipo.json',
 './data_ko/ko_review_gangneung.json',
 './data_ko/ko_review_yeosu.json']

In [29]:
review = {}
for file in review_lst:
    with open(file,'r',encoding='utf-8') as fp:
        review.update(json.load(fp))

In [30]:
review_total = []
for home_id,rev_lst in review.items():
    for rev in rev_lst :
        if len(rev) > 1 :
            review_total.append(rev)
            
print('number of reviews : ', len(review_total))

number of reviews :  122166


# Text preprocessing

In [None]:
kkma = Kkma()
for rev in review_total :
    rev[1] = kkma.sentences(rev[1])

In [None]:
hangul = re.compile('[^ ㄱ-ㅣ가-힣]+') # 한글과 띄어쓰기를 제외한 모든 글자

for rev in review_total : 
    sen_lst = []
    for r in rev[1] :
        sen = hangul.sub('', r)
        sen = sen.strip().replace("ㅎ", "").replace("ㅠ", "").replace("ㅜ", "")
        sen_lst.append(sen)
        rev[1] = sen_lst 

In [None]:
for rev in review_total : 
    sen_lst = []
    for r in rev[1] :
        sen = r.strip()
        if len(sen) > 1 :
            sen_lst.append(sen)
        rev[1] = sen_lst 

In [None]:
with open('review_split_sea.json','w',encoding='utf-8') as fp:
        json.dump(review_total,fp)

# Pos tagging

In [4]:
with open('review_split_sea.json') as data_file:    
    review_total = json.load(data_file)

In [7]:
for rev in tqdm.tqdm(review_total) : 
    sen_lst = []
    for r in rev[1] :
        sen = kkma.pos(r)
        sen_lst.append(sen)
    rev[1] = sen_lst 

100%|██████████| 122166/122166 [46:32<00:00, 43.75it/s] 


In [8]:
for rev in review_total : 
    sen_lst = []
    for sen in rev[1] :
        w_lst = []
        for (word,pos) in sen :
            if pos in ['NN','NNG','NNP','NP','VA','VV','XR'] :
                w_lst.append(word)
        sen_lst.append(w_lst)
        rev[1] = sen_lst

In [9]:
review_total[3000]

[5,
 [['알려주', '아깝', '정도', '혼자', '알', '숙소'],
  ['방', '구조', '풍', '디테일', '소품', '아름답'],
  ['바다',
   '바라보',
   '내리',
   '마시',
   '있',
   '커피',
   '도구',
   '넷',
   '보',
   '있',
   '오',
   '책',
   '있',
   '하루',
   '종일',
   '숙소',
   '있',
   '정도',
   '들어오',
   '나가',
   '이날',
   '일정',
   '게',
   '좋',
   '같'],
  ['멋지', '숙소', '머물', '따뜻', '사장님', '좋'],
  ['제', '들르']]]

In [10]:
with open('review_NV_sea.json','w',encoding='utf-8') as fp:
        json.dump(review_total,fp)

In [15]:
token = []
for r in review_total :
    for s in r[1] :
        for w in s :
            token.append(w)
        
token_cnt = Counter(token)
sorted(token_cnt.items(),key=lambda kv:kv[1],reverse=True)[:30]

[('좋', 131252),
 ('있', 85081),
 ('숙소', 72305),
 ('하', 58458),
 ('오', 30645),
 ('깨끗하', 30024),
 ('친절', 29751),
 ('여행', 27876),
 ('호스트', 25655),
 ('곳', 25349),
 ('같', 24484),
 ('되', 23997),
 ('집', 23990),
 ('위치', 23705),
 ('가', 23493),
 ('분', 22311),
 ('없', 22239),
 ('깔끔', 20146),
 ('갈', 19366),
 ('다음', 19013),
 ('쉬', 18984),
 ('방', 18776),
 ('감사', 18038),
 ('조용', 17069),
 ('편하', 16986),
 ('보', 16467),
 ('가족', 15073),
 ('가깝', 14663),
 ('이용', 13948),
 ('만족', 13346)]

In [18]:
stopWords = ['있','잇','하','되','분','없']

for rev in review_total : 
    sen_lst = []
    for sen in rev[1] :
        no_stopword = [w for w in sen if not w in stopWords]
        sen_lst.append(no_stopword)
        rev[1] = sen_lst

# LDA

In [21]:
texts = []
for rev in review_total:
    for sen in rev[1] :
         if len(sen)>0 :
            texts.append(sen)
        
print('문장 갯수 :',len(texts))
texts[:3]

문장 갯수 : 441178


[['감사'], ['가격', '대비', '훌륭'], ['다르', '후기', '같', '곳']]

In [22]:
dic = corpora.Dictionary(texts)

In [23]:
corpus = [dic.doc2bow(text) for text in texts]

In [24]:
import time
start = time.time()

n = 5
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = n, id2word = dic) 
print('걸린시간(m) :',(time.time()-start)/60)

걸린시간(m) : 0.9044737776120504


In [25]:
ldamodel.show_topics(num_words=10) #토픽을 구성하는 주요단어들

[(0,
  '0.035*"가" + 0.031*"오" + 0.031*"좋" + 0.029*"쉬" + 0.028*"다음" + 0.026*"숙소" + 0.023*"가깝" + 0.023*"곳" + 0.021*"갈" + 0.018*"위치"'),
 (1,
  '0.027*"추천" + 0.026*"감사" + 0.022*"먹" + 0.018*"조식" + 0.017*"아침" + 0.017*"쓰" + 0.016*"맛있" + 0.016*"준비" + 0.015*"듣" + 0.014*"드리"'),
 (2,
  '0.036*"좋" + 0.020*"보" + 0.019*"방" + 0.019*"바다" + 0.019*"보이" + 0.013*"화장실" + 0.012*"크" + 0.012*"불편" + 0.012*"나오" + 0.012*"점"'),
 (3,
  '0.111*"좋" + 0.078*"숙소" + 0.047*"깨끗하" + 0.032*"깔끔" + 0.028*"조용" + 0.020*"예쁘" + 0.020*"사진" + 0.019*"위치" + 0.019*"집" + 0.018*"넓"'),
 (4,
  '0.050*"친절" + 0.045*"호스트" + 0.031*"좋" + 0.029*"여행" + 0.021*"지내" + 0.017*"시간" + 0.017*"가족" + 0.015*"오" + 0.013*"배려" + 0.012*"사장님"')]

In [27]:
# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(ldamodel, corpus, dic)
vis

In [None]:
ldamodel = gensim.models.ldamodel.LdaModel.load('lda_sea.lda')

In [None]:
ldamodel.save('lda_sea.lda')

In [None]:
pyLDAvis.save_html(vis,'LDA_sea.html')