<a href="https://colab.research.google.com/github/rhwans/rhwans/blob/main/NLP_%EB%B2%84%ED%86%A0%ED%94%BD(BERTopic).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 필요한 패키지 설치
!apt-get update -qq
!apt-get install -qq g++ openjdk-8-jdk python-dev python3-dev
!pip install konlpy
!pip install python-mecab-ko
!pip install mecab-python3
!pip install bertopic plotly
!pip install bertopic[visualization] plotly
!pip install wordcloud networkx matplotlib seaborn scikit-learn

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
E: Package 'python-dev' has no installation candidate


In [2]:
# 필요한 라이브러리 임포트
import io
import os

import pandas as pd
from bertopic import BERTopic
from google.colab import files
from konlpy.tag import Mecab
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm

In [3]:
# MeCab 한국어 사전 설치
!apt-get install -qq mecab libmecab-dev mecab-ipadic-utf8
!pip install mecab-python3==0.996.5
!git clone --depth 1 https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
!bash Mecab-ko-for-Google-Colab/install_mecab-ko_on_colab190912.sh

# MeCab 환경 변수 설정 (Colab 환경에서 필요)
os.environ['MECAB_CONFIG'] = '/usr/local/etc/mecabrc'  # mecabrc 파일 경로 설정

Collecting mecab-python3==0.996.5
  Using cached mecab-python3-0.996.5.tar.gz (65 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.
fatal: destination path 'Mecab-ko-for-Google-Colab' already exists and is not an empty directory.
Installing konlpy.....
Done
Installing mecab-0.996-ko-0.9.2.tar.gz.....
Downloading mecab-0.996-ko-0.9.2.tar.gz.......
from https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.99

In [4]:
# Smoke-test the MeCab installation by parsing one short sentence.
import MeCab

mecab = MeCab.Tagger()
parsed_sample = mecab.parse("mecab-python3 설치 테스트")
print(parsed_sample)

mecab	SL,*,*,*,*,*,*,*
-	SY,*,*,*,*,*,*,*
python	SL,*,*,*,*,*,*,*
3	SN,*,*,*,*,*,*,*
설치	NNG,행위,F,설치,*,*,*,*
테스트	NNG,행위,F,테스트,*,*,*,*
EOS



In [5]:
# Upload a CSV file and load it into a DataFrame.
uploaded = files.upload()
if not uploaded:
    raise ValueError("No file was uploaded; upload a CSV file and re-run this cell.")
filename = next(iter(uploaded))
# Parse the bytes that were just uploaded instead of a hard-coded path.
# Colab stores a re-uploaded file under a de-duplicated name (the recorded run
# shows "Saving TP 1-2028.csv to TP 1-2028 (4).csv"), so reading the fixed
# path 'TP 1-2028.csv' would silently load an older copy.
df = pd.read_csv(io.BytesIO(uploaded[filename]), encoding='utf-8')  # change the encoding if the file is not UTF-8

Saving TP 1-2028.csv to TP 1-2028 (4).csv


In [6]:
# Collect the usable documents from the 'text' column.
preprocessed_documents = []
for line in tqdm(df['text']):
    # Non-string cells (e.g. NaN for missing values) are skipped.
    if not isinstance(line, str):
        continue
    # Remove every kind of whitespace before testing. The previous check only
    # removed plain spaces, so whitespace-only rows (and digit rows containing
    # tabs/newlines) slipped through as documents.
    compact = "".join(line.split())
    # Skip rows that are empty or consist of digits only.
    if not compact or compact.isdecimal():
        continue
    preprocessed_documents.append(line)

100%|██████████| 2027/2027 [00:00<00:00, 187174.81it/s]


In [7]:
# Custom tokenizer definition
class CustomTokenizer:
    """Callable tokenizer built on a MeCab-style tagger.

    The tagger must provide ``parseToNode(text)`` returning the first node of
    a linked list; each node exposes ``surface`` (the token text) and ``next``
    (the following node, falsy at the end of the list).
    """

    def __init__(self, tagger):
        """Keep a reference to the tagger used for analysis."""
        self.tagger = tagger

    def __call__(self, sent):
        """Return the node surfaces of ``sent`` that are longer than one character.

        Only the first 1,000,000 characters of ``sent`` are analysed.
        """
        text = sent[:1000000]
        # Walk the node list produced by parseToNode.
        node = self.tagger.parseToNode(text)
        tokens = []
        while node:
            surface = node.surface
            # Empty surfaces and single-character surfaces are both dropped.
            if surface and len(surface) > 1:
                tokens.append(surface)
            node = node.next
        return tokens

custom_tokenizer = CustomTokenizer(MeCab.Tagger())

# Korean stop-word list (a sample; extend it as needed).
korean_stop_words = [
    '있다', '하다', '되다', '이다',
    '도', '만', '것', '수', '등',
    '를', '을', '에', '에서', '의',
    '습니다', '는데', '너무', '어요', '입니다',
    '으로', '어서', '라서', '지만', '합니다',
    '정말', '에게', '갑니다', '없이', '다는', '면서',
    '19', '50',
]

# Bag-of-words vectorizer that tokenizes with the custom tokenizer above.
vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    max_features=3000,
    stop_words=korean_stop_words,
)

In [None]:
# Configure the BERTopic model and fit it on the preprocessed documents.
MODEL_NAME = "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens"
model = BERTopic(
    embedding_model=MODEL_NAME,
    vectorizer_model=vectorizer,
    nr_topics=50,
    top_n_words=10,
    calculate_probabilities=True,
)
# fit_transform yields the per-document topic assignments and probabilities.
topics, probs = model.fit_transform(preprocessed_documents)

In [None]:
# Inspect the topic overview.
# The table is left as the cell's last expression so the notebook renders it
# with its rich display instead of the truncated plain text that print() emits.
model.get_topic_info()

In [None]:
# Each entry pairs an output file with a zero-argument builder for its figure.
# Builders are lambdas so every figure is created only when its turn comes,
# keeping the create -> write -> download order of the individual steps.
figure_builders = [
    # Visualization of the discovered topics (bubble chart)
    ("topic_visualization.html", lambda: model.visualize_topics()),
    # Per-topic probabilities for one specific document
    ("topic_distribution.html", lambda: model.visualize_distribution(probs[200], min_probability=0.015)),
    # Topic hierarchy
    ("topic_hierarchy.html", lambda: model.visualize_hierarchy(top_n_topics=50)),
    # Bar charts of the representative top words (top_n_topics=5)
    ("topic_barchart.html", lambda: model.visualize_barchart(top_n_topics=5)),
    # Topic-to-topic cosine similarity rendered as a heatmap
    ("topic_heatmap.html", lambda: model.visualize_heatmap(n_clusters=20, width=1000, height=1000)),
    # c-TF-IDF importance of each topic's representative words, ordered by rank
    ("topic_term_rank.html", lambda: model.visualize_term_rank()),
]

for html_name, build_figure in figure_builders:
    fig = build_figure()
    fig.write_html(html_name)
    files.download(html_name)

print("모든 분석 결과가 HTML 파일로 다운로드되었습니다.")

In [None]:
# MulticoreTSNE 설치
!apt-get update -qq
!apt-get install -qq g++ cmake libopenblas-dev
!git clone https://github.com/DmitryUlyanov/Multicore-TSNE.git
!cd Multicore-TSNE && pip install .

!pip install MulticoreTSNE  # 혹시 설치에 실패할 경우를 대비해 pip로도 설치

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.manifold import TSNE
from MulticoreTSNE import MulticoreTSNE
from google.colab import files

def create_tsne_visualization(model, docs, n_topics=20, use_multicore=True):
    """Project document embeddings to 2-D with t-SNE and save a topic-coloured scatter plot.

    Args:
        model: Fitted BERTopic model. Its embedding backend
            (``model.embedding_model.embedding_model``) is used to encode ``docs``.
        docs: Documents to embed, assign to topics and plot.
        n_topics: Number of rows taken from the head of ``model.get_topic_freq()``;
            only documents assigned to those topics are drawn.
        use_multicore: Use ``MulticoreTSNE`` when True, otherwise scikit-learn's ``TSNE``.

    Side effects:
        Writes ``tsne_visualization.png`` to the current working directory,
        prints a confirmation message and triggers a Colab download of the file.
    """
    import inspect

    from matplotlib.colors import ListedColormap

    # Embed the documents with the model's underlying encoder.
    embedding_model = model.embedding_model.embedding_model
    doc_embeddings = embedding_model.encode(docs)

    # Choose between MulticoreTSNE and the scikit-learn implementation.
    if use_multicore:
        tsne = MulticoreTSNE(n_components=2, random_state=42, perplexity=50, n_iter=5000, n_jobs=4)
    else:
        # scikit-learn renamed TSNE's `n_iter` to `max_iter`; pick whichever
        # name the installed version accepts so both old and new releases work.
        tsne_params = inspect.signature(TSNE).parameters
        iter_arg = "max_iter" if "max_iter" in tsne_params else "n_iter"
        tsne = TSNE(n_components=2, random_state=42, perplexity=50, **{iter_arg: 5000})

    tsne_results = tsne.fit_transform(doc_embeddings)

    # Assign each document to its topic. The embeddings computed above are
    # passed along so the documents are not encoded a second time.
    topic_labels, _ = model.transform(docs, embeddings=doc_embeddings)
    # transform() may hand back a plain list; boolean-mask indexing and the
    # element-wise `== topic` comparison below need a NumPy array.
    topic_labels = np.asarray(topic_labels)

    # Keep only the documents that belong to the first n_topics rows of the
    # topic frequency table.
    top_topics = model.get_topic_freq().head(n_topics)['Topic'].tolist()
    mask = np.isin(topic_labels, top_topics)
    tsne_results = tsne_results[mask]
    topic_labels = topic_labels[mask]

    # Plot.
    plt.figure(figsize=(20, 16))
    sns.set_style("whitegrid")
    sns.set_palette("husl", n_colors=n_topics)

    # 'husl' is a seaborn palette name, not a registered matplotlib colormap,
    # so passing cmap='husl' raises; build an equivalent colormap explicitly.
    husl_cmap = ListedColormap(sns.color_palette("husl", n_colors=max(n_topics, 1)))

    scatter = plt.scatter(tsne_results[:, 0], tsne_results[:, 1],
                          c=topic_labels, cmap=husl_cmap, alpha=0.7, s=50)

    # Label every topic at the centroid of its points.
    for topic in top_topics:
        topic_docs = tsne_results[topic_labels == topic]
        if len(topic_docs) > 0:
            center = topic_docs.mean(axis=0)
            plt.text(center[0], center[1], f"Topic {topic}", fontsize=12, fontweight='bold',
                     bbox=dict(facecolor='white', alpha=0.7, edgecolor='none'))

    plt.title("t-SNE Visualization of Document Embeddings", fontsize=24)
    plt.xlabel("t-SNE feature 0", fontsize=18)
    plt.ylabel("t-SNE feature 1", fontsize=18)

    # Add a colour bar.
    cbar = plt.colorbar(scatter)
    cbar.set_label('Topics', fontsize=18)

    plt.tight_layout()
    plt.savefig('tsne_visualization.png', dpi=300, bbox_inches='tight')
    plt.close()

    print("t-SNE 시각화가 'tsne_visualization.png' 파일로 저장되었습니다.")
    files.download("tsne_visualization.png")

# Report the current working directory and, when running on Google Colab,
# switch to /content.
print("현재 작업 디렉토리:", os.getcwd())
running_on_colab = 'google.colab' in str(get_ipython())
if running_on_colab:
    os.chdir('/content')
    print("변경된 작업 디렉토리:", os.getcwd())

# Run the t-SNE visualization on the fitted model and its documents.
create_tsne_visualization(model=model, docs=preprocessed_documents, use_multicore=True)

print("t-SNE 시각화가 완료되었습니다. 'tsne_visualization.png' 파일을 확인해 주세요.")