In [1]:
import pyLDAvis.gensim_models as gensimvis
import pyLDAvis
from gensim.models.ldamodel import LdaModel
from gensim import corpora
from konlpy.tag import Okt
import pandas as pd
import re
import logging
from tqdm import tqdm

# 웹소설 제목 변수 설정
webnovel_title = '고수, 후궁으로 깨어나다'

# 설정 로깅
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)

# 불용어 목록 확장
stop_words = set([
    '를', '이', '은', '는', '있다', '하다', '에', 'ㅠ', 'ㅋ',
    '건가', 'ㅎ', '일이', '무슨', '대한', '슈도', '뭔가', '진짜',
    '정말', '생각', '사람', '보고', '누구', '정도', '위해', '때문', '이건',
    '어디', '가장', '아주', '제일', '그냥', '해도', '하나', '얼마나', '자기',
    '부분', '어찌', '저런', '자신', '것', '수', '등', '및', '점'
])

# 단어 치환 목록
word_mapped = {
    "11낭": "십일낭", "십일낭도": "십일낭"
}

# 데이터 로드
data = pd.read_excel(f'{webnovel_title}-Total_collected_data.xlsx', na_values=['NaN'])
data = data.dropna()  # NaN 값을 가진 행 제거
data.reset_index(drop=True, inplace=True)  # 인덱스 재설정

# 텍스트 열 추출
texts = data["Review"].astype(str).tolist()

# Okt 초기화
okt = Okt()

# 텍스트 전처리 함수
def preprocess(text):
    # 특수 문자와 숫자 제거
    text = re.sub(r"[^가-힣\s]", "", text)
    # 단어 치환
    for key, value in word_mapped.items():
        text = text.replace(key, value)
    # Okt를 사용하여 토큰화
    tokens = okt.nouns(text)
    # 불용어 제거
    tokens = [token for token in tokens if token not in stop_words and len(token) > 1]
    return tokens
  
# 전처리된 텍스트
processed_texts = [preprocess(text) for text in tqdm(texts)]

# 사전과 말뭉치 생성
dictionary = corpora.Dictionary(processed_texts)
corpus = [dictionary.doc2bow(text) for text in processed_texts]

# LDA 모델 설정
num_topics = 5
chunksize = 2000
passes = 20
iterations = 400
eval_every = None

# LDA 모델 학습
model = LdaModel(
    corpus=corpus,
    id2word=dictionary,
    chunksize=chunksize,
    alpha="auto",
    eta="auto",
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every
)

# 토픽 코히런스 계산
top_topics = model.top_topics(corpus)
avg_topic_coherence = sum([t[1] for t in top_topics]) / num_topics
print("Average topic coherence: %.4f." % avg_topic_coherence)

from pprint import pprint
pprint(top_topics)

# 시각화 준비
lda_visualization = gensimvis.prepare(model, corpus, dictionary, n_jobs=1)
pyLDAvis.save_html(lda_visualization, f"{webnovel_title}-lda.html")


100%|██████████| 20297/20297 [00:35<00:00, 565.52it/s]
2024-06-03 19:11:59,473 : INFO : adding document #0 to Dictionary<0 unique tokens: []>
2024-06-03 19:11:59,561 : INFO : adding document #10000 to Dictionary<7426 unique tokens: ['소설', '복선', '천년', '시침', '아이']...>
2024-06-03 19:11:59,614 : INFO : adding document #20000 to Dictionary<10712 unique tokens: ['소설', '복선', '천년', '시침', '아이']...>
2024-06-03 19:11:59,622 : INFO : built Dictionary<10777 unique tokens: ['소설', '복선', '천년', '시침', '아이']...> from 20297 documents (total 92740 corpus positions)
2024-06-03 19:11:59,622 : INFO : Dictionary lifecycle event {'msg': "built Dictionary<10777 unique tokens: ['소설', '복선', '천년', '시침', '아이']...> from 20297 documents (total 92740 corpus positions)", 'datetime': '2024-06-03T19:11:59.622786', 'gensim': '4.3.0', 'python': '3.11.7 | packaged by Anaconda, Inc. | (main, Dec 15 2023, 18:05:47) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.22631-SP0', 'event': 'created'}
2024-06-03 19:11:59,71

Average topic coherence: -9.2787.
[([(0.089117266, '황제'),
   (0.063569814, '천년'),
   (0.04134862, '개원'),
   (0.03242304, '돌이'),
   (0.013902246, '년비'),
   (0.012303471, '사랑'),
   (0.010620137, '언제'),
   (0.009808905, '우리'),
   (0.009067808, '오해'),
   (0.008585016, '지금'),
   (0.008429337, '마음'),
   (0.0077261804, '눈치'),
   (0.0075894915, '계속'),
   (0.007526179, '계란'),
   (0.007438358, '천천'),
   (0.007076078, '내용'),
   (0.007000434, '무림'),
   (0.0069256914, '다른'),
   (0.0056203087, '여자'),
   (0.0055422927, '반숙')],
  -3.567643301782719),
 ([(0.051325567, '작가'),
   (0.03017968, '여주'),
   (0.027259633, '다음'),
   (0.02268553, '소설'),
   (0.021167833, '쿠키'),
   (0.013179757, '처음'),
   (0.01216946, '완결'),
   (0.010441336, '댓글'),
   (0.010012954, '시기'),
   (0.009867223, '작품'),
   (0.009235487, '느낌'),
   (0.009209811, '폐하'),
   (0.008738947, '전개'),
   (0.007951317, '스토리'),
   (0.007842797, '시작'),
   (0.00757076, '한번'),
   (0.0075393193, '나중'),
   (0.0068651624, '갑자기'),
   (0.0064396784, '성격'),
  