<a href="https://colab.research.google.com/github/rhwans/rhwans/blob/main/NLP_%EB%B2%84%ED%86%A0%ED%94%BD(BERTopic).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 필요한 패키지 설치
!apt-get update -qq
!apt-get install -qq g++ openjdk-8-jdk python-dev python3-dev
!pip install konlpy               # konlpy를 먼저 설치합니다.
!pip install python-mecab-ko

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
E: Package 'python-dev' has no installation candidate


In [2]:
# bertopic 및 기타 필요한 라이브러리 설치
!pip install bertopic plotly
!pip install bertopic[visualization] plotly



In [4]:
# 필요한 라이브러리 임포트
import io
import os

import pandas as pd
from bertopic import BERTopic
from google.colab import files
from konlpy.tag import Mecab
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [5]:
# MeCab 한국어 사전 설치
!apt-get install -qq mecab libmecab-dev mecab-ipadic-utf8
!pip install mecab-python3==0.996.5
!git clone --depth 1 https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
!bash Mecab-ko-for-Google-Colab/install_mecab-ko_on_colab190912.sh

# MeCab 환경 변수 설정 (Colab 환경에서 필요)
os.environ['MECAB_CONFIG'] = '/usr/local/etc/mecabrc'  # mecabrc 파일 경로 설정

Collecting mecab-python3==0.996.5
  Using cached mecab-python3-0.996.5.tar.gz (65 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
fatal: destination path 'Mecab-ko-for-Google-Colab' already exists and is not an empty directory.
Installing konlpy.....
Done
Installing mecab-0.996-ko-0.9.2.tar.gz.....
Downloading mecab-0.996-ko-0.9.2.tar.gz.......
from https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.99

In [6]:
# Smoke test: parse one sample sentence to confirm the MeCab binding works.
import MeCab
mecab = MeCab.Tagger()
parsed_sample = mecab.parse("mecab-python3 설치 테스트")
print(parsed_sample)

mecab	SL,*,*,*,*,*,*,*
-	SY,*,*,*,*,*,*,*
python	SL,*,*,*,*,*,*,*
3	SN,*,*,*,*,*,*,*
설치	NNG,행위,F,설치,*,*,*,*
테스트	NNG,행위,F,테스트,*,*,*,*
EOS



In [7]:
# Upload a CSV file through the Colab file picker and load it into a DataFrame.
uploaded = files.upload()
if not uploaded:
    # Cancelling the picker yields an empty mapping; fail with a clear message
    # instead of a bare StopIteration from next(iter(...)).
    raise ValueError("No file was uploaded.")
filename = next(iter(uploaded))
# Parse the bytes that were just uploaded rather than a hard-coded file name:
# Colab renames colliding uploads (e.g. "name (2).csv"), so a fixed path can
# silently read a stale copy left over from an earlier upload.
df = pd.read_csv(io.BytesIO(uploaded[filename]), encoding='utf-8')  # change the encoding if the file uses another one

Saving TP 1-2028.csv to TP 1-2028 (2).csv


In [8]:
# Extract the documents from the 'text' column and drop unusable rows.
# A row is kept only if it is a string that still contains something other
# than decimal digits once ALL whitespace (spaces, tabs, newlines) is removed.
# This also rejects whitespace-only rows, which the plain truthiness check
# used before let through.
preprocessed_documents = []
for line in tqdm(df['text']):  # replace 'text' with the real text column name
    if not isinstance(line, str):
        continue  # skip NaN / non-string cells
    compact = "".join(line.split())
    if compact and not compact.isdecimal():
        preprocessed_documents.append(line)

100%|██████████| 2027/2027 [00:00<00:00, 180487.30it/s]


In [9]:
# Custom tokenizer definition
class CustomTokenizer:
    """Callable that tokenizes text with a morphological tagger.

    The tagger must expose a ``morphs(text)`` method returning a sequence of
    string tokens. Tokens shorter than ``min_token_len`` characters are
    discarded, and the input is cut to ``max_chars`` characters before tagging.
    """

    def __init__(self, tagger, min_token_len=2, max_chars=1000000):
        """Store the tagger and the filtering limits.

        Args:
            tagger: object providing ``morphs(text)``.
            min_token_len: minimum token length to keep (default 2, i.e. the
                previous ``len(word) > 1`` rule).
            max_chars: maximum number of input characters passed to the tagger
                (default 1000000, the previously hard-coded cap).
        """
        self.tagger = tagger
        self.min_token_len = min_token_len
        self.max_chars = max_chars

    def __call__(self, sent):
        """Return the tokens of ``sent`` that satisfy the length filter."""
        sent = sent[:self.max_chars]
        return [
            word
            for word in self.tagger.morphs(sent)
            if len(word) >= self.min_token_len
        ]

custom_tokenizer = CustomTokenizer(Mecab())

# Korean stop-word list (an example; extend as needed).
# NOTE(review): the single-character entries can never match, because
# CustomTokenizer already drops tokens shorter than two characters.
korean_stop_words = ['있다', '하다', '되다', '이다', '도', '만', '것', '수', '등', '를', '을', '에', '에서', '의', '습니다', '는데', '너무', '어요', '입니다', '으로', '어서', '라서', '지만', '합니다', '정말', '에게', '갑니다', '없이', '다는', '면서', '19', '50']

# token_pattern=None states explicitly that the regex tokenizer is unused when
# a custom tokenizer is supplied, which avoids scikit-learn's warning about it.
vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    token_pattern=None,
    max_features=3000,
    stop_words=korean_stop_words,
)

In [10]:
# Configure the BERTopic model and fit it on the preprocessed documents.
MODEL_NAME = "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens"
model = BERTopic(
    embedding_model=MODEL_NAME,
    vectorizer_model=vectorizer,
    nr_topics=50,
    top_n_words=10,
    calculate_probabilities=True,
)
# fit_transform returns the topic assignments and the probabilities together.
topics, probs = model.fit_transform(preprocessed_documents)

In [11]:
# Inspect the topic overview. Leaving the DataFrame as the cell's last
# expression renders it as a rich table instead of the truncated, line-wrapped
# text that print() produces.
model.get_topic_info()

    Topic  Count                   Name  \
0      -1   1015         -1_시간_마음_생각_감사   
1       0     89          0_느낌_시간_최고_편안   
2       1     19        1_한국_서울_낙산사_지리산   
3       2     12          2_경험_동안_자비_처음   
4       3     84          3_음식_시간_맛있_식사   
5       4     17          4_맛있_깨끗_윤회_연등   
6       5     68          5_명상_시간_스님_일상   
7       6    218       6_불교_마음_스님_템플스테이   
8       7     48         7_맨발_전나무_자현_느낌   
9       8     20      8_낙산사_소리_파도_템플스테이   
10      9     12         9_108_명상_생각_배우   
11     10     24    10_시간_템플스테이_지금_2022   
12     11     19         11_마음_대하_평온_스님   
13     12     18        12_감사_재미있_셔서_친절   
14     13     47        13_시간_핸드폰_생각_휴식   
15     14     11       14_소리_감사_공기_드립니다   
16     15     65     15_생각_시간_템플스테이_낙산사   
17     16     92      16_생각_마음_자신_템플스테이   
18     17     13         17_경험_시간_다고_생각   
19     18     36  18_크리스마스_백담사_템플스테이_스님   
20     19     65     19_감사_템플스테이_108_행복   
21     20     20         20_바다_보이_모든_감사   
22     21  

In [12]:
# Export each BERTopic visualization to an HTML file and download it.
def export_figure(fig, html_name):
    """Write ``fig`` to ``html_name`` with ``write_html`` and pass the file to ``files.download``."""
    fig.write_html(html_name)
    files.download(html_name)

# Index of the document whose per-topic probabilities are plotted below.
DISTRIBUTION_DOC_INDEX = 200

# Bubble chart of the fitted topics
fig = model.visualize_topics()
export_figure(fig, "topic_visualization.html")

# Per-topic probabilities for one specific document
fig = model.visualize_distribution(probs[DISTRIBUTION_DOC_INDEX], min_probability=0.015)
export_figure(fig, "topic_distribution.html")

# Hierarchy of the topics
fig = model.visualize_hierarchy(top_n_topics=50)
export_figure(fig, "topic_hierarchy.html")

# Bar charts of the most representative words, limited to 5 topics
fig = model.visualize_barchart(top_n_topics=5)
export_figure(fig, "topic_barchart.html")

# Heatmap of topic-to-topic similarity (cosine similarity)
fig = model.visualize_heatmap(n_clusters=20, width=1000, height=1000)
export_figure(fig, "topic_heatmap.html")

# Rank plot of each topic's words ordered by their c-TF-IDF importance
fig = model.visualize_term_rank()
export_figure(fig, "topic_term_rank.html")

print("모든 분석 결과가 HTML 파일로 다운로드되었습니다.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

모든 분석 결과가 HTML 파일로 다운로드되었습니다.
