In [1]:
import os
import json
import re
import pandas as pd
import numpy as np
from kiwipiepy import Kiwi

# data load

In [2]:
with open('crawl_result/WINE_FREE.json','r',encoding='utf-8')as f:
    WINE_FREE=json.load(f)

In [3]:
with open('crawl_result/wineQ&A_text.json','r',encoding='utf-8')as f:
    wineQA=json.load(f)

In [4]:
with open('crawl_result/wine_recommend_text.json','r',encoding='utf-8')as f:
    recommend=json.load(f)

In [6]:
free=pd.DataFrame(WINE_FREE['data'])
qa=pd.DataFrame(wineQA['data'])
rec=pd.DataFrame(recommend['data'])

In [12]:
free.제목[1]

'비비노 메모리얼데이 20프로 할인\n|\n자유게시판\n\n\n\n\n\n\n\n\n2021.05.31. 22:24'

# data cleaning

In [10]:
def clean_text(text):
    re_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),|]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    text = re.sub(re_pattern, 'url', text)
    text = re.sub('[+]', ',', text)
    text=re.sub('[^ ㄱ-ㅣ가-힣A-Za-z0-9!?.,~[]]+',' ',text)
    text=re.sub('[\s *]',' ',text)
    return text

In [13]:
DF['clean_content']=DF.본문.apply(lambda x:clean_text(x))
DF['clean_title']=DF.제목.apply(lambda x:clean_text(x))
DF['contents']=DF.clean_title+DF.clean_content

# Tokenizing

In [20]:
kiwi = Kiwi(num_workers=16)
kiwi.prepare()
temp_title = [[each_word[0] if ('NNG' in each_word[1]) or ('NNP' in each_word[1])
               else each_word[0] + '다' if ('VV' in each_word[1]) or ('VA' in each_word[1])
               else None for each_word in each_doc[0][0]]
              for each_doc in kiwi.analyze(DF['contents'], top_n=1)]

target_title = [[each_word for each_word in each_doc if each_word] for each_doc in temp_title]

In [21]:
DF['token']=target_title

In [22]:
corpus=DF.token.apply(lambda x: ' '.join(x))

# TF-IDF

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidv = TfidfVectorizer(min_df=0.01).fit(corpus)
TFIDF=tfidv.transform(corpus)

TFIDF.sum()
text=tfidv.get_feature_names()

In [74]:
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt

# wordclouds=WordCloud(width=800,height=800,background_color='white',colormap='Greens')
# from collections import Counter
# count=Counter(text)
# fig=plt.figure(figsize=(10,10))
# plt.imshow(wordclouds.to_array())
# plt.show()

# 와인 등 빈도수는 높지만 불필요한 단어 제거

In [63]:
def drop_certain_words(corpus, sparse_matrix, drop_words):
    drop_words_index = [np.where(corpus == word)[0][0] for word in drop_words]
    to_keep = sorted(set(range(sparse_matrix.shape[1])) - set(drop_words_index))
    corpus = corpus[to_keep]
    sparse_matrix = sparse_matrix[:, to_keep]
    return corpus, sparse_matrix

In [64]:
words_list, TFIDF=drop_certain_words(np.array(tfidv.get_feature_names()),TFIDF,['와인','마시다','하다','있다', 
                                                                                '댓글', '많다', '안내', '등급',
                                                                               '답변', '기본', '소통','이렇다'])

# LDA

In [50]:
from sklearn.decomposition import LatentDirichletAllocation

In [67]:
lda=LatentDirichletAllocation(n_components=5)
lda.fit(TFIDF)

LatentDirichletAllocation(n_components=5)

In [68]:
def display_topics(model, feature_names, no_top_words):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        important_words = [feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]

        print("Topic %d:" % topic_idx)
        print(" ".join(important_words))
        topics.append(important_words)
    return topics

In [69]:
display_topics(lda,words_list,10)

Topic 0:
보관 가격 괜찮다 되다 사다 셀러 어떻다 정도 비다 좋다
Topic 1:
먹다 추천 어울리다 드리다 좋다 부탁 같다 화이트 가다 레드
Topic 2:
행사 싸다 확인 방법 설정 알람 사람 구하다 사다 클릭
Topic 3:
알다 계시다 선물 받다 들다 보다 없다 정보 나오다 같다
Topic 4:
주목 대하다 감사 규정 체계 사항 싸다 클릭 코르크 리딩


[['보관', '가격', '괜찮다', '되다', '사다', '셀러', '어떻다', '정도', '비다', '좋다'],
 ['먹다', '추천', '어울리다', '드리다', '좋다', '부탁', '같다', '화이트', '가다', '레드'],
 ['행사', '싸다', '확인', '방법', '설정', '알람', '사람', '구하다', '사다', '클릭'],
 ['알다', '계시다', '선물', '받다', '들다', '보다', '없다', '정보', '나오다', '같다'],
 ['주목', '대하다', '감사', '규정', '체계', '사항', '싸다', '클릭', '코르크', '리딩']]

In [38]:
# import tomotopy as tp 

# model = tp.LDAModel(k=10, alpha=0.1, eta=0.01, min_cf=5)

# for i, line in enumerate(DF.contents.tolist()):
#     model.add_doc(tokenize(line)) 
#     if i % 10 == 0: print('Document #{} has been loaded'.format(i))
        
# model.train(0) 
# print('Total docs:', len(model.docs))
# print('Total words:', model.num_words)
# print('Vocab size:', model.num_vocabs)
 

# for i in range(200):
#     #print('Iteration {}\tLL per word: {}'.format(i, model.ll_per_word))
#     model.train(1)
    
# for i in range(model.k):
#     # 토픽 개수가 총 20개이니, 0~19번까지의 토픽별 상위 단어 10개를 뽑아봅시다.
#     res = model.get_topic_words(i, top_n=10)
#     print('Topic #{}'.format(i), end='\t')
#     #print(', '.join(w for w, p in res))
#res