<a href="https://colab.research.google.com/github/rhwans/rhwans/blob/main/NLP_%EB%B2%84%ED%86%A0%ED%94%BD(BERTopic).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 필요한 패키지 설치
!apt-get update -qq
!apt-get install -qq g++ openjdk-8-jdk python-dev python3-dev
!pip install konlpy
!pip install python-mecab-ko
!pip install mecab-python3
!pip install bertopic plotly
!pip install bertopic[visualization] plotly
!pip install wordcloud networkx matplotlib seaborn scikit-learn

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
E: Package 'python-dev' has no installation candidate
Collecting konlpy
  Downloading konlpy-0.6.0-py2.py3-none-any.whl.metadata (1.9 kB)
Collecting JPype1>=0.7.0 (from konlpy)
  Downloading jpype1-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Downloading konlpy-0.6.0-py2.py3-none-any.whl (19.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m19.4/19.4 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jpype1-1.5.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (493 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.9/493.9 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: JPype1, konlpy
Successfully installed JPype1-1.5.1 konlpy-0.6.0
Collecting python-mecab-ko
  Downl

In [2]:
# 필요한 라이브러리 임포트
import os
from google.colab import files
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Mecab
from bertopic import BERTopic

In [3]:
# MeCab 한국어 사전 설치
!apt-get install -qq mecab libmecab-dev mecab-ipadic-utf8
!pip install mecab-python3==0.996.5
!git clone --depth 1 https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
!bash Mecab-ko-for-Google-Colab/install_mecab-ko_on_colab190912.sh

# MeCab 환경 변수 설정 (Colab 환경에서 필요)
os.environ['MECAB_CONFIG'] = '/usr/local/etc/mecabrc'  # mecabrc 파일 경로 설정

Selecting previously unselected package libmecab2:amd64.
(Reading database ... 124561 files and directories currently installed.)
Preparing to unpack .../0-libmecab2_0.996-14build9_amd64.deb ...
Unpacking libmecab2:amd64 (0.996-14build9) ...
Selecting previously unselected package libmecab-dev.
Preparing to unpack .../1-libmecab-dev_0.996-14build9_amd64.deb ...
Unpacking libmecab-dev (0.996-14build9) ...
Selecting previously unselected package mecab-utils.
Preparing to unpack .../2-mecab-utils_0.996-14build9_amd64.deb ...
Unpacking mecab-utils (0.996-14build9) ...
Selecting previously unselected package mecab-ipadic.
Preparing to unpack .../3-mecab-ipadic_2.7.0-20070801+main-3_all.deb ...
Unpacking mecab-ipadic (2.7.0-20070801+main-3) ...
Selecting previously unselected package mecab.
Preparing to unpack .../4-mecab_0.996-14build9_amd64.deb ...
Unpacking mecab (0.996-14build9) ...
Selecting previously unselected package mecab-ipadic-utf8.
Preparing to unpack .../5-mecab-ipadic-utf8_2.7

In [4]:
# MeCab 테스트
import MeCab
mecab = MeCab.Tagger()
print(mecab.parse("mecab-python3 설치 테스트"))

mecab	SL,*,*,*,*,*,*,*
-	SY,*,*,*,*,*,*,*
python	SL,*,*,*,*,*,*,*
3	SN,*,*,*,*,*,*,*
설치	NNG,행위,F,설치,*,*,*,*
테스트	NNG,행위,F,테스트,*,*,*,*
EOS



In [5]:
# 유사어 처리를 위한 클래스 정의
class SynonymProcessor:
    def __init__(self):
        self.synonym_dict = {
            # 스탬프 관련
            '스탬프': ['도장', '스템프', '스탬프인증', '스탬프확인', '도장찍기', '스탬프투어'],
            '배지': ['배찌', '뱃지', '뱃찌', '기념배지', '배찌', '뱃지'],

            # 보상 관련
            '보상': ['리워드', '상품', '상금', '포인트'],
            '선물': ['기념품', '증정품', '사은품', '기념선물', '기념메달'],

            # 축제 관련
            '둘레길': ['둘레길코스', '트레킹코스', '산책로', '둘레', '코스', '구간'],
            '걷기': ['워킹', '하이킹', '트레킹', '산책'],
            '인증': ['체크', '확인', '인증샷', '인증도장'],

            # 기타 활동
            '참여': ['참가', '등록', '신청', '접수'],
            '완주': ['완료', '종료', '마무리', '완보']
        }

        # 역방향 매핑 생성
        self.reverse_mapping = {}
        for standard, synonyms in self.synonym_dict.items():
            for synonym in synonyms:
                self.reverse_mapping[synonym] = standard
            self.reverse_mapping[standard] = standard

    def standardize_text(self, text):
        if not isinstance(text, str):
            return text

        words = text.split()
        standardized_words = []

        for word in words:
            if word in self.reverse_mapping:
                standardized_words.append(self.reverse_mapping[word])
            else:
                standardized_words.append(word)

        return ' '.join(standardized_words)

In [None]:
# CSV 파일 업로드
uploaded = files.upload()
filename = next(iter(uploaded))
df = pd.read_csv('걷기축제 통합(8개).csv', encoding='utf-8')  # 인코딩이 다를 경우 적절히 변경하세요

In [None]:
# 'text' 열에서 데이터 추출 및 전처리
preprocessed_documents = []
for line in tqdm(df['내용']):
 # 빈 문자열이거나 숫자로만 이루어진 줄은 제외
  if isinstance(line, str) and line and not line.replace(' ', '').isdecimal():
    preprocessed_documents.append(line)

100%|██████████| 7665/7665 [00:00<00:00, 18994.90it/s]


In [None]:
# 유사어 처리기 초기화
synonym_processor = SynonymProcessor()

# 데이터 전처리 및 유사어 처리
preprocessed_documents = []
for line in tqdm(df['내용']):
    if isinstance(line, str) and line and not line.replace(' ', '').isdecimal():
        # 유사어 처리 적용
        processed_line = synonym_processor.standardize_text(line)
        preprocessed_documents.append(processed_line)

print(f"전처리된 문서 수: {len(preprocessed_documents)}")

100%|██████████| 7665/7665 [00:03<00:00, 2516.61it/s]

전처리된 문서 수: 7665





In [None]:
#커스텀 토크나이저 정의
class CustomTokenizer:
    def __init__(self, tagger):
        self.tagger = tagger
    def __call__(self, sent):
        sent = sent[:1000000]
        # parse 메서드를 사용하여 형태소 분석 결과 문자열을 얻습니다.
        nodes = self.tagger.parseToNode(sent)
        word_tokens = []
        while nodes:
            word = nodes.surface
            if word:  # 빈 문자열이 아닌 경우에만 추가합니다.
                word_tokens.append(word)
            nodes = nodes.next
        result = [word for word in word_tokens if len(word) > 1]
        return result

custom_tokenizer = CustomTokenizer(MeCab.Tagger())

# 한국어 불용어 목록 (예시, 필요에 따라 확장하세요)
korean_stop_words = ['ㅋㅋ', '까지', '코스', '축제', '올레', '^^', '네요', '해요', 'ㅎㅎ', 'ㅇㅇ', '!!', '아요', '서울', '페스티벌', '제주', 'ㅋㅋㅋ', '한강', '...', '..', '셔서', '라는', '있다', '하다', '되다', '이다', '도', '만', '것', '수', '등', '를', '을', '에', '에서', '의', '습니다', '는데', '너무', '어요', '입니다', '으로', '어서', '라서', '지만', '합니다', '정말', '에게', '갑니다', '없이', '다는', '면서']
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, max_features=2000, stop_words=korean_stop_words)

# 중요 키워드는 불용어에서 제외
important_keywords = ['스탬프', '배지', '보상', '선물', '둘레길', '걷기', '인증', '참여', '완주']
korean_stop_words = [word for word in korean_stop_words if word not in important_keywords]

vectorizer = CountVectorizer(tokenizer=custom_tokenizer,
                           max_features=3000,
                           stop_words=korean_stop_words)

In [None]:
#BERTopic 모델 설정 및 학습
MODEL_NAME = "sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens"
model = BERTopic(embedding_model=MODEL_NAME, vectorizer_model=vectorizer, nr_topics=20, top_n_words=5, calculate_probabilities=True)
topics, probs = model.fit_transform(preprocessed_documents)

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# 토픽 정보 확인
print(model.get_topic_info())

    Topic  Count                  Name                Representation  \
0      -1   4056        -1_부산_구간_공원_km          [부산, 구간, 공원, km, 10]   
1       0   1795         0_부산_10_사랑_생명          [부산, 10, 사랑, 생명, 00]   
2       1    523        1_청산_청산도_슬로_유채         [청산, 청산도, 슬로, 유채, 여행]   
3       2    345         2_맛있_해서_시간_사람          [맛있, 해서, 시간, 사람, 생각]   
4       3    220         3_행사_참여_함께_10          [행사, 참여, 함께, 10, 공연]   
5       4    184        4_보내_이동_10_페이지         [보내, 이동, 10, 페이지, 슬로]   
6       5    115         5_러브_사랑_가족_생명         [러브, 사랑, 가족, 생명, 장길자]   
7       6     74        6_포항_호미_둘레길_해파        [포항, 호미, 둘레길, 해파, 호미곶]   
8       7     68         7_버스_공원_청산_하늘          [버스, 공원, 청산, 하늘, 시간]   
9       8     65        8_얼음_겨울_알프스_청양         [얼음, 겨울, 알프스, 청양, 분수]   
10      9     34  9_portfolio_촬영_행사_보내  [portfolio, 촬영, 행사, 보내, 컨벤션]   
11     10     32     10_민들레_투데이_아시아_치매       [민들레, 투데이, 아시아, 치매, 도박]   
12     11     29    11_장미_둘레길_corp_알프스      [장미, 둘레길, corp, 알프스,

In [None]:
# Topic을 대표하는 상위 단어 5개씩 보여줌
fig = model.visualize_barchart(top_n_topics=25)
fig.write_html("topic_barchart.html")
files.download("topic_barchart.html")

print("모든 분석 결과가 HTML 파일로 다운로드되었습니다.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

모든 분석 결과가 HTML 파일로 다운로드되었습니다.


In [None]:
# 분류된 토픽에 대한 시각화(버블 차트)
fig = model.visualize_topics()
fig.write_html("topic_visualization.html")
files.download("topic_visualization.html")

# 특정 Document에 대해서 Topic별 Probabilities를 확인
fig = model.visualize_distribution(probs[200], min_probability=0.015)
fig.write_html("topic_distribution.html")
files.download("topic_distribution.html")

# Topic 별 Hierarchy를 보여줌
fig = model.visualize_hierarchy(top_n_topics=50)
fig.write_html("topic_hierarchy.html")
files.download("topic_hierarchy.html")

# Topic을 대표하는 상위 단어 5개씩 보여줌
fig = model.visualize_barchart(top_n_topics=5)
fig.write_html("topic_barchart.html")
files.download("topic_barchart.html")

# Topic간 유사도를 Cosine Similarity로 계산 후 Heatmap으로 표현
fig = model.visualize_heatmap(n_clusters=20, width=1000, height=1000)
fig.write_html("topic_heatmap.html")
files.download("topic_heatmap.html")

# Topic내 대표하는 단어들에 대해서 c-tf-idf로 계산해서 각 단어가 Topic에서 차지하는 중요도를 계산했던 것을 Rank 순대로 보여줌
fig = model.visualize_term_rank()
fig.write_html("topic_term_rank.html")
files.download("topic_term_rank.html")

print("모든 분석 결과가 HTML 파일로 다운로드되었습니다.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

모든 분석 결과가 HTML 파일로 다운로드되었습니다.


In [None]:
import numpy as np
from umap import UMAP
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler

def create_enhanced_topic_cluster_visualization(model, docs):
    # Get the underlying SentenceTransformer model
    sentence_model = model.embedding_model.embedding_model

    # Extract document embeddings using the SentenceTransformer model
    embeddings = sentence_model.encode(docs, show_progress_bar=False)

    # UMAP을 사용하여 2차원으로 축소
    umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.1, metric='cosine')
    umap_embeddings = umap_model.fit_transform(embeddings)

    # 결과 정규화 (시각화를 위해)
    scaler = MinMaxScaler()
    umap_embeddings = scaler.fit_transform(umap_embeddings)

    # 토픽 할당
    topics, _ = model.transform(docs)

    # 토픽 정보 가져오기
    topic_info = model.get_topic_info()

    # Plotly를 사용한 산점도 생성
    fig = go.Figure()

    # 색상 팔레트 정의
    colors = [
        '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd',
        '#8c564b', '#e377c2', '#7f7f7f', '#bcbd22', '#17becf'
    ]

    # 각 토픽에 대해 점 추가
    for i, topic in enumerate(set(topics)):
        if topic != -1:  # -1은 미분류 토픽
            topic_name = topic_info[topic_info['Topic'] == topic]['Name'].values[0]
            mask = np.array(topics) == topic
            fig.add_trace(go.Scatter(
                x=umap_embeddings[mask, 0],
                y=umap_embeddings[mask, 1],
                mode='markers',
                name=f'Topic {topic}: {topic_name}',
                marker=dict(
                    size=6,
                    color=colors[i % len(colors)],
                    line=dict(width=1, color='DarkSlateGrey'),
                    opacity=0.7
                ),
                text=[f"Document {i}<br>Topic: {topic}<br>{topic_name}" for i in np.where(mask)[0]],
                hoverinfo='text'
            ))

    # 레이아웃 설정
    fig.update_layout(
        title={
            'text': "Enhanced Visualization of Topic Clusters",
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(size=24, family="Arial, sans-serif")
        },
        xaxis_title="UMAP Dimension 1",
        yaxis_title="UMAP Dimension 2",
        legend_title="Topics",
        width=1200,
        height=800,
        plot_bgcolor='rgba(240,240,240,0.5)',
        paper_bgcolor='white',
        font=dict(family="Arial, sans-serif"),
        legend=dict(
            itemsizing='constant',
            font=dict(size=10),
            borderwidth=1
        ),
        margin=dict(l=50, r=50, t=80, b=50),
    )

    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey', zeroline=False)
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey', zeroline=False)

    # HTML 파일로 저장
    fig.write_html("enhanced_topic_clusters_visualization.html")
    files.download("enhanced_topic_clusters_visualization.html")
    print("향상된 토픽 클러스터 시각화가 'enhanced_topic_clusters_visualization.html' 파일로 저장되었습니다.")

# 함수 호출
create_enhanced_topic_cluster_visualization(model, preprocessed_documents)

In [None]:
import pandas as pd
import numpy as np

# c-TF-IDF 값 추출
c_tf_idf = model.c_tf_idf_
words = model.vectorizer_model.get_feature_names_out()

# 모든 토픽에 대한 c-TF-IDF 값 합산
total_c_tf_idf = c_tf_idf.toarray().sum(axis=0)

# 결과를 데이터프레임으로 변환
c_tf_idf_df = pd.DataFrame({'단어': words, 'c-TF-IDF': total_c_tf_idf})
c_tf_idf_df = c_tf_idf_df.sort_values('c-TF-IDF', ascending=False)

# CSV 파일로 저장
c_tf_idf_df.to_csv('c_tf_idf_values.csv', index=False, encoding='utf-8-sig')
files.download('c_tf_idf_values.csv')

print("c-TF-IDF 값이 'c_tf_idf_values.csv' 파일로 저장되었습니다.")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

c-TF-IDF 값이 'c_tf_idf_values.csv' 파일로 저장되었습니다.


In [None]:
# 단어 빈도수 계산
vectorizer = CountVectorizer(tokenizer=custom_tokenizer, max_features=3000, stop_words=korean_stop_words)
X = vectorizer.fit_transform(preprocessed_documents)

# 단어 목록과 빈도수
word_freq = X.sum(axis=0).A1
words = vectorizer.get_feature_names_out()
word_count = pd.DataFrame({'단어': words, '빈도수': word_freq})

# 빈도수에 따른 퍼센트 계산
word_count['퍼센트'] = (word_count['빈도수'] / word_count['빈도수'].sum()) * 100

# 빈도수에 따라 정렬
word_count = word_count.sort_values(by='빈도수', ascending=False)

# 상위 20개의 단어 확인
print(word_count.head(20))

# 전체 단어의 빈도수와 퍼센트를 CSV로 저장해서 다운로드
word_count.to_csv('word_frequencies.csv', index=False, encoding='utf-8-sig')
files.download('word_frequencies.csv')


         단어   빈도수       퍼센트
1627     시간  2409  2.615522
959      마음  1985  2.155172
1476     생각  1899  2.061800
2728  템플스테이  1592  1.728481
122      감사  1569  1.703509
1604     스님  1362  1.478763
1037     명상   805  0.874012
671      다시   701  0.761096
217      경험   688  0.746982
2619     처음   643  0.698124
2896     행복   629  0.682924
1408     사람   593  0.643837
2633     체험   552  0.599323
997      말씀   549  0.596065
2856     함께   521  0.565665
2260     자신   460  0.499435
1557     소리   447  0.485321
2224     일상   435  0.472292
440      기회   432  0.469035
366     그리고   413  0.448406


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import numpy as np
from umap import UMAP
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
import plotly.express as px

def create_enhanced_topic_cluster_visualization(model, docs):
    # Get the underlying SentenceTransformer model
    sentence_model = model.embedding_model.embedding_model

    # Extract document embeddings using the SentenceTransformer model
    embeddings = sentence_model.encode(docs, show_progress_bar=False)

    # UMAP을 사용하여 2차원으로 축소
    umap_model = UMAP(n_neighbors=15, n_components=2, min_dist=0.1, metric='cosine')
    umap_embeddings = umap_model.fit_transform(embeddings)

    # 결과 정규화 (시각화를 위해)
    scaler = MinMaxScaler()
    umap_embeddings = scaler.fit_transform(umap_embeddings)

    # 토픽 할당
    topics, _ = model.transform(docs)

    # 토픽 정보 가져오기
    topic_info = model.get_topic_info()

    # 색상 팔레트 생성
    color_palette = px.colors.qualitative.Bold

    # Plotly를 사용한 산점도 생성
    fig = go.Figure()

    # 각 토픽에 대해 점 추가
    for i, topic in enumerate(set(topics)):
        if topic != -1:  # -1은 미분류 토픽
            topic_name = topic_info[topic_info['Topic'] == topic]['Name'].values[0]
            mask = np.array(topics) == topic
            color = color_palette[i % len(color_palette)]
            fig.add_trace(go.Scatter(
                x=umap_embeddings[mask, 0],
                y=umap_embeddings[mask, 1],
                mode='markers',
                name=f'Topic {topic}: {topic_name}',
                marker=dict(
                    size=8,
                    color=color,
                    symbol='circle',
                    line=dict(width=1, color='DarkSlateGrey')
                ),
                text=[f"Document {i}<br>Topic: {topic}<br>{topic_name}" for i in np.where(mask)[0]],
                hoverinfo='text'
            ))

    # 레이아웃 설정
    fig.update_layout(
        title={
            'text': "Visualization of Topic Clusters",
            'y':0.95,
            'x':0.5,
            'xanchor': 'center',
            'yanchor': 'top',
            'font': dict(size=24, color='DarkSlateGrey')
        },
        xaxis_title="UMAP Dimension 1",
        yaxis_title="UMAP Dimension 2",
        legend_title="Topics",
        width=1200,
        height=800,
        plot_bgcolor='rgb(250,250,250)',
        legend=dict(
            itemsizing='constant',
            font=dict(size=10),
            borderwidth=1
        ),
        margin=dict(l=50, r=50, t=80, b=50),
    )

    fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey')
    fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='LightGrey')

    # HTML 파일로 저장
    fig.write_html("enhanced_topic_clusters_visualization.html")
    files.download("enhanced_topic_clusters_visualization.html")
    print("향상된 토픽 클러스터 시각화가 'enhanced_topic_clusters_visualization.html' 파일로 저장되었습니다.")

# 함수 호출
create_enhanced_topic_cluster_visualization(model, preprocessed_documents)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

향상된 토픽 클러스터 시각화가 'enhanced_topic_clusters_visualization.html' 파일로 저장되었습니다.


In [None]:
!pip install bertopic
!pip install plotly
!pip install networkx
import networkx as nx
import plotly.graph_objects as go
from bertopic import BERTopic # Import the BERTopic class
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def create_topic_network(model, n_topics=10):
    topic_info = model.get_topic_info().head(n_topics)
    G = nx.Graph()

    for _, row in topic_info.iterrows():
        G.add_node(row['Topic'], size=row['Count'])

    # Get topic embeddings
    topic_embeddings = model.topic_embeddings_

    for i in range(len(topic_info)):
        for j in range(i+1, len(topic_info)):
            # Calculate cosine similarity between topic embeddings
            similarity = cosine_similarity(topic_embeddings[topic_info.iloc[i]['Topic']].reshape(1, -1),
                                           topic_embeddings[topic_info.iloc[j]['Topic']].reshape(1, -1))[0][0]

            if similarity > 0.2:  # 유사도 임계값 설정
                G.add_edge(topic_info.iloc[i]['Topic'], topic_info.iloc[j]['Topic'], weight=similarity)

    pos = nx.spring_layout(G)
    edge_x = []
    edge_y = []
    for edge in G.edges():
        x0, y0 = pos[edge[0]]
        x1, y1 = pos[edge[1]]
        edge_x.extend([x0, x1, None])
        edge_y.extend([y0, y1, None])

    edge_trace = go.Scatter(x=edge_x, y=edge_y, line=dict(width=0.5, color='#888'), hoverinfo='none', mode='lines')

    node_x = [pos[node][0] for node in G.nodes()]
    node_y = [pos[node][1] for node in G.nodes()]

    node_trace = go.Scatter(x=node_x, y=node_y, mode='markers+text', hoverinfo='text',
                            marker=dict(showscale=True, colorscale='YlGnBu', size=10, colorbar=dict(thickness=15, title='Node Connections')),
                            text=[f"Topic {node}" for node in G.nodes()], textposition="top center")

    fig = go.Figure(data=[edge_trace, node_trace],
                    layout=go.Layout(showlegend=False, hovermode='closest',
                                     margin=dict(b=20,l=5,r=5,t=40)))
    fig.write_html("topic_network.html")

    # Assuming 'files' is from google.colab
    try:
        from google.colab import files
        files.download("topic_network.html")
    except ImportError:
        print("Could not import 'files' from google.colab. Skipping download.")

create_topic_network(model)



<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import plotly.graph_objects as go
import pandas as pd
from google.colab import files

def visualize_c_tf_idf_html(model, n_topics=5, n_words=10):
    topic_info = model.get_topic_info()

    # 상위 n_topics 개의 토픽에 대해 상위 n_words 개의 단어와 c-TF-IDF 값 추출
    data = []
    for topic in topic_info['Topic'][:n_topics]:
        words, values = zip(*model.get_topic(topic)[:n_words])
        data.extend([(topic, word, value) for word, value in zip(words, values)])

    # DataFrame 생성
    df = pd.DataFrame(data, columns=['Topic', 'Word', 'c-TF-IDF'])

    # 피벗 테이블 생성
    pivot_df = df.pivot(index='Word', columns='Topic', values='c-TF-IDF')

    # Plotly를 사용한 히트맵 생성
    fig = go.Figure(data=go.Heatmap(
                    z=pivot_df.values,
                    x=pivot_df.columns,
                    y=pivot_df.index,
                    colorscale='YlOrRd',
                    hoverongaps = False))

    fig.update_layout(
        title='c-TF-IDF Values Heatmap',
        xaxis_title='Topics',
        yaxis_title='Words',
        width=900,
        height=700
    )

    # HTML 파일로 저장
    fig.write_html("c_tf_idf_heatmap.html")
    files.download("c_tf_idf_heatmap.html")

    print("c-TF-IDF 히트맵이 'c_tf_idf_heatmap.html' 파일로 저장되었습니다.")

# 함수 호출
visualize_c_tf_idf_html(model, n_topics=5, n_words=10)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

c-TF-IDF 히트맵이 'c_tf_idf_heatmap.html' 파일로 저장되었습니다.
