<a href="https://colab.research.google.com/github/seulmi0827/fininsight/blob/main/JACE/BERTopic.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive in the Colab runtime; the CSV paths used later in this
# notebook live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Install BERTopic, then the same package with its "visualization" extra.
!pip install -q bertopic
!pip install -q bertopic[visualization]

In [None]:
# Set up the Korean morphological analyzer used by preprocess():
# a C++ compiler and JDK 8, the konlpy and mecab-python packages, curl,
# and finally KoNLPy's mecab.sh install script fetched from GitHub.
!apt-get update
!apt-get install g++ openjdk-8-jdk -y
!pip install konlpy
!pip install mecab-python
!apt-get install curl -y
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [None]:
import pandas as pd
import numpy as np
import torch
import random
import os
from konlpy.tag import Mecab
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic import BERTopic
import plotly.graph_objects as go

In [None]:
def set_total_original_news(df):
    """Return the number of rows (articles) contained in ``df``."""
    return len(df)


def set_weekend_news(df):
    """Count the rows of ``df`` whose ``inp_date`` is a Saturday or Sunday.

    ``inp_date`` must be a datetime column (the ``.dt`` accessor is used).
    """
    # pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 are the weekend.
    is_weekend = df["inp_date"].dt.dayofweek.isin([5, 6])
    return len(df.loc[is_weekend])


def set_weekday_news(df):
    """Return how many rows of ``df`` are not weekend articles.

    Computed as the total row count minus the weekend count reported by
    ``set_weekend_news``.
    """
    total_rows = len(df)
    weekend_rows = set_weekend_news(df)
    return total_rows - weekend_rows


def load_data(file_path):
    """Load the news CSV and keep only the weekday articles.

    Args:
        file_path: Location of a CSV file (anything ``pandas.read_csv``
            accepts) containing an ``inp_date`` column that
            ``pandas.to_datetime`` can parse.

    Returns:
        A tuple ``(df, original_news_count, weekend_news_count,
        weekday_news_count)``. ``df`` contains only the rows whose
        ``inp_date`` is Monday-Friday, with ``date``, ``month`` and ``week``
        columns derived from ``inp_date``. The three counts are taken before
        the weekend rows are dropped.
    """
    df = pd.read_csv(file_path)

    # Parse the timestamp column so the .dt accessor can be used below.
    df["inp_date"] = pd.to_datetime(df["inp_date"])

    # Record article counts before any row is removed.
    original_news_count = set_total_original_news(df)
    weekend_news_count = set_weekend_news(df)
    weekday_news_count = set_weekday_news(df)

    # Drop weekend rows (dayofweek 5 = Saturday, 6 = Sunday). The explicit
    # copy makes the result an independent frame, so the column assignments
    # below do not write into a slice of the original frame
    # (which raises SettingWithCopyWarning).
    df = df[~df["inp_date"].dt.dayofweek.isin([5, 6])].copy()

    # Derived date columns used for the per-day / per-week / per-month stats.
    df["date"] = df["inp_date"].dt.date
    df["month"] = df["inp_date"].dt.to_period("M")
    df["week"] = df["inp_date"].dt.to_period("W")

    return df, original_news_count, weekend_news_count, weekday_news_count


# Module-level MeCab tagger instance; preprocess() calls mecab.pos() on it.
mecab = Mecab()


def preprocess(text):
    """Tokenize ``text`` with MeCab and keep selected morphemes.

    A morpheme is kept when its POS tag starts with ``NN`` (noun), ``VV``
    (verb) or ``VA`` (adjective), or is exactly ``MAG`` (general adverb),
    and its surface form is longer than one character.

    Returns:
        The kept surface forms as a list, in their original order.
    """
    kept_prefixes = ("NN", "VV", "VA")  # nouns, verbs, adjectives

    tokens = []
    for surface, tag in mecab.pos(text):
        # Single-character tokens are discarded regardless of their tag.
        if len(surface) <= 1:
            continue
        if tag.startswith(kept_prefixes) or tag == "MAG":
            tokens.append(surface)

    return tokens


def prepare_text_data(df):
    """Tokenize the ``content`` column for topic modeling.

    ``df`` is modified in place: ``content`` is normalized to strings
    (missing values become ``""``) and a ``preprocessed_content`` column with
    the space-joined tokens of each article is added.

    Args:
        df: DataFrame with a ``content`` column.

    Returns:
        A tuple ``(df, preprocessed_content)`` where ``preprocessed_content``
        is a list holding one token list per row, as produced by
        ``preprocess``.
    """
    df["content"] = df["content"].fillna("").astype(str)

    # Run the tagger once per article and derive both outputs from that single
    # pass; tokenizing every article a second time just to get the list form
    # doubles the preprocessing cost for no change in result.
    tokenized = df["content"].apply(preprocess)
    df["preprocessed_content"] = tokenized.apply(" ".join)

    preprocessed_content = tokenized.tolist()

    return df, preprocessed_content


# Seed setup: a new random seed is drawn every time this cell is executed.
# run_topic_modeling() prints the value it used.
SEED = random.randint(0, 1000000)


def set_seed(seed):
    """Seed Python's ``random``, NumPy and PyTorch with ``seed``.

    When CUDA is available, all CUDA devices are seeded as well and cuDNN is
    set to deterministic mode with benchmarking turned off.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    # Nothing GPU-specific to configure on a CPU-only runtime.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def initialize_topic_model(seed):
    """Assemble an unfitted BERTopic model with fixed hyper-parameters.

    Args:
        seed: Passed to UMAP as ``random_state``.

    Returns:
        A ``BERTopic`` instance wired with a UMAP reducer, an HDBSCAN
        clusterer, the ``jhgan/ko-sroberta-multitask`` sentence embedding
        model and a ``CountVectorizer`` without stop words.
    """
    reducer = UMAP(
        n_neighbors=15,
        n_components=5,
        min_dist=0.0,
        metric="cosine",
        random_state=seed,
    )
    clusterer = HDBSCAN(
        min_cluster_size=20,
        metric="euclidean",
        prediction_data=True,
        gen_min_span_tree=True,
        cluster_selection_method="eom",
    )

    return BERTopic(
        language="korean",
        nr_topics=10,  # sets the number of valid topics
        top_n_words=7,  # sets the number of keywords per topic
        calculate_probabilities=True,
        umap_model=reducer,
        hdbscan_model=clusterer,
        embedding_model=SentenceTransformer("jhgan/ko-sroberta-multitask"),
        vectorizer_model=CountVectorizer(stop_words=None),
        verbose=True,
    )


def run_topic_modeling(preprocessed_content):
    """Fit a fresh BERTopic model on the tokenized articles.

    Prints the detected torch device and the module-level ``SEED``, applies
    that seed through ``set_seed``, and fits the model returned by
    ``initialize_topic_model`` on the space-joined token lists.

    Args:
        preprocessed_content: Iterable of token lists, one per article.

    Returns:
        A tuple ``(topics, probs, topic_model)``: the two values returned by
        ``fit_transform`` followed by the fitted model.
    """
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    print("사용중인 디바이스 : ", torch.device(device_name))
    print()

    # Seed every RNG before the model is built.
    set_seed(SEED)
    print(f"사용중인 시드 : {SEED}")
    print()

    model = initialize_topic_model(SEED)
    documents = list(map(" ".join, preprocessed_content))
    assigned_topics, probabilities = model.fit_transform(documents)

    return assigned_topics, probabilities, model


def calculate_basic_stats(df, original_news_count, weekend_news_count, weekday_news_count):
    """Compute the date span of ``df`` and bundle it with the article counts.

    The span is measured in calendar days: the earliest and latest
    ``inp_date`` are truncated to midnight before counting, so the time of
    day of the first/last article cannot drop the final day from the total.

    Args:
        df: DataFrame with a datetime ``inp_date`` column.
        original_news_count: Stored unchanged under ``total_news``.
        weekend_news_count: Stored unchanged under ``weekend_news``.
        weekday_news_count: Stored unchanged under ``weekday_news``.

    Returns:
        A dict with ``min_date`` and ``max_date`` (the untruncated
        timestamps), ``total_days``, ``weekend_days``, ``weekday_days``,
        ``total_news``, ``weekend_news`` and ``weekday_news``.

    Raises:
        ValueError: If ``inp_date`` holds no valid timestamp (empty or
            all-NaT column), because no date span can be derived.
    """
    min_date = df["inp_date"].min()
    max_date = df["inp_date"].max()
    if pd.isna(min_date) or pd.isna(max_date):
        raise ValueError("inp_date에 유효한 날짜가 없어 기간을 계산할 수 없습니다.")

    # One entry per calendar day from the first to the last article day,
    # both ends inclusive.
    calendar_days = pd.date_range(
        start=min_date.normalize(), end=max_date.normalize(), freq="D"
    )
    total_days = len(calendar_days)
    # dayofweek: Monday=0 ... Sunday=6, so >= 5 selects Saturday and Sunday.
    weekend_days = int((calendar_days.dayofweek >= 5).sum())
    weekday_days = total_days - weekend_days

    return {
        "min_date": min_date,
        "max_date": max_date,
        "total_days": total_days,
        "weekend_days": weekend_days,
        "weekday_days": weekday_days,
        "total_news": original_news_count,
        "weekend_news": weekend_news_count,
        "weekday_news": weekday_news_count,
    }


def _print_count_stats(header, unit, label, counts):
    """Print mean, minimum and maximum of a per-period document count series."""
    print(f"\n== {header} 문서 수 통계 ==")
    print(f"평균 문서 수({unit}): {counts.mean():.2f}")
    print(f"최소 문서 수({unit}): {counts.min()} ({label}: {counts.idxmin()})")
    print(f"최대 문서 수({unit}): {counts.max()} ({label}: {counts.idxmax()})")


def analyze_data_distribution(df, original_news_count, weekend_news_count, weekday_news_count, topics=None, topic_model=None):
    """Print a report on the data distribution and the topic modeling result.

    Args:
        df: DataFrame with ``inp_date``, ``date``, ``week`` and ``month``
            columns. ``df.attrs["filepath"]``, when present, supplies the
            report title (file name without extension).
        original_news_count: Total article count, forwarded to
            ``calculate_basic_stats``.
        weekend_news_count: Weekend article count, forwarded likewise.
        weekday_news_count: Weekday article count, forwarded likewise.
        topics: Per-document topic assignments, or ``None``.
        topic_model: Fitted topic model exposing ``get_topic_info()``, or
            ``None``.

    Returns:
        The dict from ``calculate_basic_stats`` extended with ``title``,
        ``daily_avg``, ``weekly_avg`` and ``monthly_avg``. When both
        ``topics`` and ``topic_model`` are given it also contains
        ``filtered_count``, ``noise_count`` and ``valid_count``; otherwise
        the topic section of the report is skipped and those keys are absent.
    """
    # The title is the file name (without extension) stored in df.attrs.
    file_path = df.attrs.get("filepath", "")
    title = os.path.splitext(os.path.basename(file_path))[0]

    print()
    print("제목:", title)
    print()

    stats = calculate_basic_stats(df, original_news_count, weekend_news_count, weekday_news_count)

    # Distinct weekend dates still present in df.
    # NOTE(review): load_data() already drops weekend rows, so for frames
    # coming from it this sample is empty.
    weekend_dates = df[df["inp_date"].dt.dayofweek.isin([5, 6])]["inp_date"].dt.date.unique()

    print(f"데이터 기간: {stats['min_date']} ~ {stats['max_date']}")
    print()
    print(f"총 기간: {stats['total_days']}일")
    print(f"평일 기간: {stats['weekday_days']}일")
    print(f"주말 기간: {stats['weekend_days']}일")
    print()
    print(f"전체 기사 수: {stats['total_news']}개")
    print(f"평일 기사 수: {stats['weekday_news']}개")
    print(f"주말 기사 수: {stats['weekend_news']}개")
    print()
    print("주말 날짜 10개 까지만 샘플 출력:")
    # The heading promises up to 10 dates, so slice 10 (not 9) entries.
    print("\n".join([f"    {date}" for date in sorted(weekend_dates)[:10]]))

    # Document counts per day / week / month.
    date_counts = df["date"].value_counts().sort_index()
    week_counts = df["week"].value_counts().sort_index()
    month_counts = df["month"].value_counts().sort_index()

    topic_stats = {}
    # Both arguments default to None; only analyse topics when both are given.
    if topics is not None and topic_model is not None:
        topic_info = topic_model.get_topic_info()
        # Topic -1 is reported separately from the valid topics as noise.
        valid_topics = topic_info[topic_info["Topic"] != -1]
        noise_info = topic_info[topic_info["Topic"] == -1]

        num_topics = valid_topics.shape[0]
        filtered_count = len(topics)
        noise_count = noise_info["Count"].values[0] if not noise_info.empty else 0
        valid_count = filtered_count - noise_count

        print("\n=== 토픽 분석 결과 ===")
        print(f"유효 토픽 수: {num_topics}개")
        print(f"주말 제거된 기사 수: {filtered_count}개")
        print(f"노이즈 아웃라이너 기사 수: {noise_count}개")
        print(f"유효 토픽 기사 수: {valid_count}개")
        print()

        print("토픽별 기사 분포:")
        result_df = topic_info[["Topic", "Count", "Representation"]].copy()
        result_df["비율(%)"] = (result_df["Count"] / filtered_count * 100).round(2)
        print(result_df)

        topic_stats = {
            "filtered_count": filtered_count,
            "noise_count": noise_count,
            "valid_count": valid_count,
        }

    _print_count_stats("날짜별", "일별", "날짜", date_counts)
    _print_count_stats("주별", "주별", "주", week_counts)
    _print_count_stats("월별", "월별", "월", month_counts)

    stats.update(
        {
            "title": title,
            "daily_avg": date_counts.mean(),
            "weekly_avg": week_counts.mean(),
            "monthly_avg": month_counts.mean(),
            **topic_stats,
        }
    )

    return stats


def create_topic_timeseries(
    df,
    topics,
    topic_model,
    time_unit="day",
    custom_data=None,
    stats=None,
):
    """Build a Plotly line chart of per-topic document counts over time.

    Args:
        df: DataFrame with a datetime ``inp_date`` column, one row per
            document.
        topics: Topic assignment for each row of ``df``.
        topic_model: Fitted model exposing ``get_topic_info()`` and
            ``get_topic(topic_id)``.
        time_unit: ``"day"``, ``"week"`` or ``"month"``.
        custom_data: Optional pre-aggregated counts for the week/month views.
            After ``reset_index()`` it is read through its ``date``,
            ``topic_id`` and ``count`` columns; without a ``topic_id`` column
            no trace is drawn. Ignored for the daily view.
        stats: Optional dict as returned by ``analyze_data_distribution``;
            it supplies the numbers shown in the chart title. Missing keys
            fall back to defaults.

    Returns:
        A ``plotly.graph_objects.Figure`` with one trace per topic (topic -1
        excluded).

    Raises:
        ValueError: If ``time_unit`` is not one of the supported values.
    """
    # Per time unit: (unit name in the title, axis/hover label, marker size,
    # pandas period frequency used for bucketing; None means calendar date).
    unit_settings = {
        "day": ("일", "날짜", 6, None),
        "week": ("주", "주 시작일", 8, "W-SAT"),
        "month": ("월", "월 시작일", 10, "M"),
    }
    if time_unit not in unit_settings:
        raise ValueError(
            "지원되지 않는 시간 단위입니다. 'day', 'week' 또는 'month'를 사용하세요."
        )
    unit_name, axis_label, marker_size, period_freq = unit_settings[time_unit]

    # Topics to draw: everything except topic -1.
    topic_info = topic_model.get_topic_info()
    all_topics = topic_info[topic_info["Topic"] != -1]["Topic"].tolist()

    if stats is None:
        stats = {}

    title = stats.get("title", "")
    min_date = stats.get("min_date", df["inp_date"].min())
    max_date = stats.get("max_date", df["inp_date"].max())
    total_days = stats.get("total_days", (max_date - min_date).days + 1)
    weekday_days = stats.get("weekday_days", 0)
    weekend_days = stats.get("weekend_days", 0)
    total_news = stats.get("total_news", 0)
    weekday_news = stats.get("weekday_news", 0)
    weekend_news = stats.get("weekend_news", 0)
    filtered_count = stats.get("filtered_count", 0)
    noise_count = stats.get("noise_count", 0)
    valid_count = stats.get("valid_count", 0)

    # Color palette; traces cycle through it by topic position.
    colors = [
        "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd",
        "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf",
        "#aec7e8", "#ffbb78", "#98df8a", "#ff9896", "#c5b0d5",
        "#c49c94", "#f7b6d2", "#c7c7c7", "#dbdb8d", "#9edae5",
        "#636363", "#6baed6", "#fd8d3c", "#74c476", "#969696",
        "#3182bd", "#e6550d", "#31a354", "#756bb1", "#de2d26",
    ]

    # Count documents per (time bucket, topic).
    if period_freq is not None and custom_data is not None:
        counts = custom_data.reset_index()
    else:
        # Only inp_date is needed, so avoid copying the whole frame.
        assignments = df[["inp_date"]].copy()
        assignments["topic_id"] = topics
        if period_freq is None:
            assignments["date"] = assignments["inp_date"].dt.date
        else:
            # Each document is bucketed by the start of its week/month.
            assignments["date"] = (
                assignments["inp_date"].dt.to_period(period_freq).dt.start_time
            )
        counts = (
            assignments.groupby(["date", "topic_id"]).size().reset_index(name="count")
        )

    fig = go.Figure()

    # custom_data lacking a topic_id column yields a chart without traces.
    if "topic_id" in counts.columns:
        for color_idx, topic_id in enumerate(all_topics):
            topic_data = counts[counts["topic_id"] == topic_id]
            # Existing behavior kept: the week/month views skip topics
            # without rows, the daily view still adds the (empty) trace.
            if topic_data.empty and time_unit != "day":
                continue

            topic_words = [word for word, _ in topic_model.get_topic(topic_id)][:10]
            topic_label = f"토픽 {topic_id}: {', '.join(topic_words)}"

            fig.add_trace(
                go.Scatter(
                    x=topic_data["date"],
                    y=topic_data["count"],
                    mode="lines+markers",
                    name=topic_label,
                    # The color is chosen from the topic's position, passed
                    # explicitly rather than read from an outer loop variable.
                    line=dict(color=colors[color_idx % len(colors)]),
                    marker=dict(size=marker_size),
                    hovertemplate=f"<b>%{{text}}</b><br>{axis_label}: %{{x|%Y-%m-%d}}<br>문서 수: %{{y}}<extra></extra>",
                    text=[topic_label] * len(topic_data),
                )
            )

    # Shares of the modeled documents; 0 when the total is unknown or zero
    # (e.g. stats omitted), instead of raising ZeroDivisionError.
    valid_ratio = valid_count / filtered_count if filtered_count else 0.0
    noise_ratio = noise_count / filtered_count if filtered_count else 0.0

    title_text = f"{title} 토픽분석 ({unit_name}단위)<br>"
    title_text += f"<sup>기간: {min_date.strftime('%Y-%m-%d')} ~ {max_date.strftime('%Y-%m-%d')} (총기간 : {total_days}일 | (평일: {weekday_days}일 | 주말: {weekend_days}일)</sup><br>"
    title_text += f"<sup>문서: 전체 {total_news}개 | 평일 {weekday_news}개 | 주말 {weekend_news}개</sup><br>"
    title_text += f"<sup>문서: 유효 {valid_count}개 ({valid_ratio:.1%}) | 노이즈 아웃라이너 {noise_count}개 ({noise_ratio:.1%})</sup><br>"

    fig.update_layout(
        title={
            "text": title_text,
            "y": 0.95,
            "x": 0.5,
            "xanchor": "center",
            "yanchor": "top",
            "font": {"size": 20, "color": "#1f1f1f"},
        },
        xaxis=dict(title=axis_label, tickformat="%Y-%m-%d", gridcolor="lightgray"),
        yaxis=dict(title="문서 수", gridcolor="lightgray"),
        legend=dict(title="토픽", orientation="v"),
        hovermode="closest",
        plot_bgcolor="white",
    )

    return fig

In [None]:
# Load the two source CSV files.
path1 = "/content/drive/MyDrive/Colab Notebooks/경북산불_10000.csv"
path2 = "/content/drive/MyDrive/Colab Notebooks/경북산불_20000.csv"
df1 = pd.read_csv(path1)
df2 = pd.read_csv(path2)

# Concatenate them row-wise with a fresh 0..n-1 index.
combined_df = pd.concat([df1, df2], ignore_index=True)

# Save the merged data as an intermediate CSV on Drive (index not written).
temp_path = "/content/drive/MyDrive/Colab Notebooks/경북산불.csv"
combined_df.to_csv(temp_path, index=False)

In [None]:
# Run the full pipeline on the merged CSV: load, preprocess, fit the topic
# model, then compute and print the statistics.
path = "/content/drive/MyDrive/Colab Notebooks/경북산불.csv"

df, original_news_count, weekend_news_count, weekday_news_count = load_data(path)
# analyze_data_distribution() derives the report title from this attribute.
df.attrs["filepath"] = path
df, preprocessed_content = prepare_text_data(df)
topics, probs, topic_model = run_topic_modeling(preprocessed_content)

stats = analyze_data_distribution(
    df, original_news_count, weekend_news_count, weekday_news_count, topics, topic_model
)

# Build the time-series figures (daily, weekly, monthly).
daily_fig = create_topic_timeseries(
    df, topics, topic_model, time_unit="day", stats=stats
)
weekly_fig = create_topic_timeseries(
    df, topics, topic_model, time_unit="week", stats=stats
)
monthly_fig = create_topic_timeseries(
    df, topics, topic_model, time_unit="month", stats=stats
)

In [None]:
# Display the daily and weekly figures; the monthly figure is left disabled.
daily_fig.show()
print()
weekly_fig.show()
# print()
# monthly_fig.show()

In [None]:
# Run the same pipeline on the second dataset. This rebinds df, topics,
# topic_model, stats and the three figure variables from the previous run.
path = "/content/drive/MyDrive/Colab Notebooks/전남소멸_10000.csv"

df, original_news_count, weekend_news_count, weekday_news_count = load_data(path)
# analyze_data_distribution() derives the report title from this attribute.
df.attrs["filepath"] = path
df, preprocessed_content = prepare_text_data(df)
topics, probs, topic_model = run_topic_modeling(preprocessed_content)

stats = analyze_data_distribution(
    df, original_news_count, weekend_news_count, weekday_news_count, topics, topic_model
)

# Build the time-series figures (daily, weekly, monthly).
daily_fig = create_topic_timeseries(
    df, topics, topic_model, time_unit="day", stats=stats
)
weekly_fig = create_topic_timeseries(
    df, topics, topic_model, time_unit="week", stats=stats
)
monthly_fig = create_topic_timeseries(
    df, topics, topic_model, time_unit="month", stats=stats
)

In [None]:

# Display the weekly and monthly figures of the most recent pipeline run.
weekly_fig.show()
print()
monthly_fig.show()