In [1]:
# Install Java 1.8 or up
!sudo apt-get install g++ openjdk-8-jdk python3-dev python3-pip curl

# Set Dependency(Ubuntu) for KoNLpy
!apt-get update
!apt-get install g++ openjdk-8-jdk python-dev

# Install KoNLpy
!python3 -m pip instlall --upgrade pip
!pip install konlpy

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
g++ is already the newest version (4:11.2.0-1ubuntu1).
g++ set to manually installed.
curl is already the newest version (7.81.0-1ubuntu1.18).
python3-dev is already the newest version (3.10.6-1~22.04.1).
python3-dev set to manually installed.
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libfontenc1 libgail-common libgail18 libgtk2.0-0
  libgtk2.0-bin libgtk2.0-common libice-dev librsvg2-common libsm-dev
  libxkbfile1 libxt-dev libxtst6 libxxf86dga1 openjdk-8-jdk-headless
  openjdk-8-jre openjdk-8-jre-headless python3-setuptools python3-wheel
  x11-utils
Suggested packages:
  gvfs libice-doc libsm-doc libxt-doc openjdk-8-demo openjdk-8-source visualvm
  libnss-mdns fonts-nanum fonts-ipafont-gothic fonts-ipafont-mincho
  fonts-wqy-microhei fonts-wqy-zenhei fonts-indic python-setuptools-doc
  mesa-uti

In [2]:
# bertopic 설치
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m66.5 MB/s[0m eta [36m0:00:00[0m
[?

In [3]:
# sentence transformers 설치
!pip install sentence-transformers



In [4]:
# Install MeCab for Korean text processing, using the Mecab-ko-for-Google-Colab helper repository
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
# %cd changes the kernel's working directory to the cloned repository; it stays there for later cells
%cd Mecab-ko-for-Google-Colab
# Run the repository's installer script (its log shows it installing konlpy and mecab-0.996-ko-0.9.2)
# NOTE(review): re-running this cell will attempt the clone again from inside the repo directory — confirm this is acceptable
!bash install_mecab-ko_on_colab_light_220429.sh

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 138, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 138 (delta 26), reused 22 (delta 8), pack-reused 91 (from 1)[K
Receiving objects: 100% (138/138), 1.72 MiB | 19.51 MiB/s, done.
Resolving deltas: 100% (65/65), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Done
Installing mecab-0.996-ko-0.9.2.tar.gz.....
Downloading mecab-0.996-ko-0.9.2.tar.gz.......
from https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
--2024-10-25 07:57:06--  https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
Resolving bitbucket.org (bitbucket.org)... 13.200.41.134, 13.200.41.136, 13.200.41.135, ...
Connecting to bitbucket.org (bitbucket.org)|13.200.41.134|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://bbuseruploads.s3.amazonaws.com/eunjeon/mecab-ko/download

In [5]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

In [6]:
import pandas as pd

In [7]:
# Load the crawled documents from CSV into a DataFrame
crawled_data_df = pd.read_csv(filepath_or_buffer='/content/crawl_complete_data.csv')

In [8]:
# Display the (rows, columns) shape of the loaded DataFrame
crawled_data_df.shape

(1116, 8)

In [9]:
# Display the column names of the loaded DataFrame
crawled_data_df.columns

Index(['idx', 'text', 'files', 'url', 'title', 'published_date',
       'deadline_date', 'tag'],
      dtype='object')

In [10]:
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Mecab

In [11]:
# Custom tokenizer for Korean text
class CustomTokenizer:
    """Callable that splits a string into morphemes using a supplied tagger.

    Instances can be passed wherever a ``tokenizer`` callable taking one string
    and returning a list of tokens is expected.

    Args:
        tagger: Object exposing ``morphs(text)`` that returns a list of tokens.
        max_chars: Only the first ``max_chars`` characters of the input are
            tokenized. Defaults to 1000000, the previously hard-coded cap.
        min_token_len: Tokens shorter than this are discarded. Defaults to 2,
            i.e. single-character tokens are dropped as before.
    """

    def __init__(self, tagger, max_chars=1000000, min_token_len=2):
        self.tagger = tagger
        self.max_chars = max_chars
        self.min_token_len = min_token_len

    def __call__(self, sent):
        """Tokenize ``sent`` and return the tokens that are long enough.

        Args:
            sent: Input string.

        Returns:
            List of tokens from ``tagger.morphs`` whose length is at least
            ``min_token_len``.
        """
        # Cap the input length before handing it to the tagger
        truncated = sent[:self.max_chars]
        return [
            token
            for token in self.tagger.morphs(truncated)
            if len(token) >= self.min_token_len
        ]

In [12]:
# Build the MeCab-backed tokenizer and plug it into the bag-of-words vectorizer
custom_tokenizer = CustomTokenizer(Mecab())
vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    # token_pattern is unused once a custom tokenizer is given; passing None
    # avoids scikit-learn's "token_pattern will not be used" warning
    token_pattern=None,
    max_features=3000,
)

In [16]:
# Configure the BERTopic model with a multilingual sentence-transformers embedding model
# and the custom Korean CountVectorizer defined earlier.
# NOTE(review): per the BERTopic docs, `n_gram_range` is not used when a custom
# `vectorizer_model` is passed — if 1-3 grams are intended, set `ngram_range` on the
# CountVectorizer instead. Confirm.
model = BERTopic(language='korean', # language of the documents
                 top_n_words=10, # number of top keywords to extract per topic
                 n_gram_range=(1,3),
                 min_topic_size=2, # minimum topic size (NOTE(review): original comment called this a minimum number of topics — confirm against the BERTopic docs)
                 nr_topics=None, # number of topics to reduce to; None leaves it unspecified
                 calculate_probabilities=True, # enables per-topic probabilities, which are passed to visualize_distribution later
                 seed_topic_list=None, # list of seed words that anchor each topic
                 embedding_model="sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens",
                 vectorizer_model=vectorizer)

In [17]:
# Light preprocessing of the 'text' column:
# missing values become empty strings, then every entry is cast to str
crawled_data_df['text'] = crawled_data_df['text'].fillna('').astype(str)

In [19]:
# Fit the topic model and get the assigned topic and the topic probabilities for each document.
# The texts are passed as a plain list of strings (the documented input type) so the
# result does not depend on the DataFrame's index.
topics, probs = model.fit_transform(crawled_data_df['text'].tolist())

In [29]:
### Check that the per-topic document counts add up to the number of documents ###
# The 'Count' column holds the number of documents assigned to each topic
# -> if the column total equals the number of documents, every document received a topic label

SUM = model.get_topic_info().loc[:, "Count"].sum()
print("Current document total :", len(crawled_data_df))
print("Sum of 'Count' column :", SUM)

Current document total : 1116
Sum of 'Count' column : 1116


In [30]:
### Visualize the similarity between topics ###

model.visualize_heatmap()

In [31]:
### Visualize the extracted topics ###
# NOTE(review): the original author guessed this works on a principle similar to LDA — unconfirmed
# The plot can give insight into the topics that were found

model.visualize_topics()

In [21]:
# Plot the topic distribution for the first entry of `probs` (returned by fit_transform)
model.visualize_distribution(probs[0])

In [25]:
# Print the keyword list of every topic, numbered from 1.
# Topic ids are read from the fitted model instead of a hard-coded range: the previous
# range(1, 139) requested one id more than the topic table lists (ids 0..136), and a
# fixed bound breaks whenever a refit produces a different number of topics.
# The outlier topic -1 is skipped, as before.
topic_ids = sorted(t for t in model.get_topic_info()["Topic"] if t != -1)
for i, topic_id in enumerate(topic_ids, start=1):
  print(i,'번째 토픽 :', model.get_topic(topic_id), '\n')

1 번째 토픽 : [('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)] 

2 번째 토픽 : [('바로', 0.021181090358214125), ('24', 0.01944754530659046), ('course', 0.01584581242789336), ('분석', 0.014215388905718043), ('교육', 0.013361504499094193), ('감탄', 0.012916555883124852), ('수강', 0.012681793231654886), ('산대', 0.012021672690193953), ('데이터', 0.011804814786581038), ('로드', 0.011646522799253933)] 

3 번째 토픽 : [('+/', 0.01872328474114974), ('69', 0.016667339821188194), ('58', 0.016392036790942692), ('65', 0.016340459361676392), ('48', 0.015485204793276232), ('++', 0.01536798180771432), ('47', 0.015105599803098496), ('49', 0.014958539640899406), ('95', 0.014570794978896899), ('57', 0.014518818972770768)] 

4 번째 토픽 : [('근로', 0.04163849468564449), ('***', 0.03062396541966785), ('지명', 0.026509064804894), ('교내', 0.02574874058601801), ('교외', 0.02530610219415178), ('보성', 0.02370462772843674), ('학번', 0.02347297971624443), ('학과', 0.02290973

In [23]:
# Display the per-topic summary table (Topic, Count, Name, Representation, Representative_Docs)
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,256,-1_10_교육_00_참여,"[10, 교육, 00, 참여, 2024, 프로그램, 11, 기간, 신청, 지원]",[[ 교육비 무료 & 2.5개월 스펙 완성 ]\n - 모든 직무 필수역량 ‘데이터 ...
1,0,27,0____,"[, , , , , , , , , ]","[, , ]"
2,1,18,1_바로_24_course_분석,"[바로, 24, course, 분석, 교육, 감탄, 수강, 산대, 데이터, 로드]",[◈IT (정보처리기술)◈\n(산대특) 생성형AI활용 UI/UX웹디자인(웹퍼블리셔)...
3,2,17,2_+/_69_58_65,"[+/, 69, 58, 65, 48, ++, 47, 49, 95, 57]",[Error in summarizing: This model's maximum co...
4,3,17,3_근로_***_지명_교내,"[근로, ***, 지명, 교내, 교외, 보성, 학번, 학과, 다향, 순번]",[한 줄 소개\n 대한민국 대표 차문화 축제 보성다향대축제는 보성차의 역사·문화·경...
...,...,...,...,...,...
134,133,2,133_경남_커리어_55_영상,"[경남, 커리어, 55, 영상, 픽셀, 1920, 1080, 장르, 모든, 매력]","[[상담] 나다운 커리어 찾기 2차: 버크만커리어리포트\n 지난 4월, 모집률 35..."
135,134,2,134_한라_여수_2026_박람회,"[한라, 여수, 2026, 박람회, 세계, 단편, 페스타, 출품, 공모, 영상]",[제주한라대학교 방송영상학과가 주최‧주관하는 <2024 제1회 제주 디지털 필름 페...
136,135,2,135_작가_축제_신진_전시,"[작가, 축제, 신진, 전시, art, 자원, 갤러리, 예술, 스페이스, 공연]",[갤러리 서린 스페이스는 2012년 10월 부산 해운대 랜드마크의 상징인 마린시티에...
137,136,2,136_훈련_ybm_gpt_그린,"[훈련, ybm, gpt, 그린, 인천, 컴퓨터, 산대, 사무, 아카데미, 엔드]","[YBM 마케팅, 디자인, 경영사무 인턴즈 대규모 모집!\n\nybm 인턴즈 참여자..."


In [27]:
# Map each topic id to the list of document indices assigned to it.
# Keys come from the topic ids actually present in `topics` (sorted, so -1 comes first)
# rather than a hard-coded range(138): that range created an empty entry for a topic id
# that does not exist, and would raise KeyError if a refit produced more topics.
topic_to_docs = {topic: [] for topic in sorted(set(topics))}

# Populate the dictionary with document indices for each topic
for idx, topic in enumerate(topics):
    topic_to_docs[topic].append(idx)

# Display the indices of documents for each topic
for topic, doc_indices in topic_to_docs.items():
    print(f"Topic {topic}: {doc_indices}\n\n")

Topic -1: [1, 8, 10, 22, 25, 30, 40, 41, 45, 53, 55, 59, 67, 68, 71, 75, 76, 78, 85, 89, 90, 101, 103, 117, 119, 121, 126, 127, 128, 130, 131, 133, 143, 146, 152, 153, 165, 169, 170, 173, 177, 178, 182, 183, 184, 185, 194, 195, 196, 198, 202, 209, 210, 212, 214, 219, 221, 224, 226, 230, 233, 238, 239, 245, 250, 259, 261, 268, 280, 282, 286, 289, 295, 296, 297, 298, 303, 309, 318, 320, 323, 324, 327, 339, 344, 346, 347, 348, 368, 371, 372, 373, 376, 380, 385, 388, 393, 401, 404, 408, 410, 415, 423, 426, 427, 428, 429, 462, 471, 477, 479, 480, 484, 487, 490, 491, 492, 495, 501, 507, 508, 509, 510, 512, 513, 514, 521, 529, 537, 539, 540, 541, 543, 547, 548, 551, 552, 554, 556, 557, 560, 564, 571, 573, 575, 580, 590, 593, 595, 598, 603, 606, 607, 612, 614, 625, 626, 627, 628, 635, 638, 647, 649, 650, 652, 654, 655, 661, 664, 667, 682, 685, 687, 688, 690, 698, 701, 702, 710, 713, 717, 768, 775, 782, 783, 784, 785, 787, 790, 793, 795, 796, 798, 801, 805, 809, 813, 814, 815, 817, 818, 819, 82