In [1]:
# Install Java 1.8 or up
!sudo apt-get install g++ openjdk-8-jdk python3-dev python3-pip curl

# Set Dependency(Ubuntu) for KoNLpy
!apt-get update
!apt-get install g++ openjdk-8-jdk python-dev

# Install KoNLpy
!python3 -m pip instlall --upgrade pip
!pip install konlpy

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
g++ is already the newest version (4:11.2.0-1ubuntu1).
g++ set to manually installed.
curl is already the newest version (7.81.0-1ubuntu1.18).
python3-dev is already the newest version (3.10.6-1~22.04.1).
python3-dev set to manually installed.
The following additional packages will be installed:
  fonts-dejavu-core fonts-dejavu-extra libatk-wrapper-java
  libatk-wrapper-java-jni libfontenc1 libgail-common libgail18 libgtk2.0-0
  libgtk2.0-bin libgtk2.0-common libice-dev librsvg2-common libsm-dev
  libxkbfile1 libxt-dev libxtst6 libxxf86dga1 openjdk-8-jdk-headless
  openjdk-8-jre openjdk-8-jre-headless python3-setuptools python3-wheel
  x11-utils
Suggested packages:
  gvfs libice-doc libsm-doc libxt-doc openjdk-8-demo openjdk-8-source visualvm
  libnss-mdns fonts-nanum fonts-ipafont-gothic fonts-ipafont-mincho
  fonts-wqy-microhei fonts-wqy-zenhei fonts-indic python-setuptools-doc
  mesa-uti

In [2]:
# bertopic 설치
!pip install bertopic

Collecting bertopic
  Downloading bertopic-0.16.4-py3-none-any.whl.metadata (23 kB)
Collecting hdbscan>=0.8.29 (from bertopic)
  Downloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting sentence-transformers>=0.4.1 (from bertopic)
  Downloading sentence_transformers-3.2.1-py3-none-any.whl.metadata (10 kB)
Collecting umap-learn>=0.5.0 (from bertopic)
  Downloading umap_learn-0.5.6-py3-none-any.whl.metadata (21 kB)
Collecting pynndescent>=0.5 (from umap-learn>=0.5.0->bertopic)
  Downloading pynndescent-0.5.13-py3-none-any.whl.metadata (6.8 kB)
Downloading bertopic-0.16.4-py3-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.7/143.7 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading hdbscan-0.8.39-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m42.7 MB/s[0m eta [36m0:00:00[0m
[

In [3]:
# sentence transformers 설치
!pip install sentence-transformers



In [4]:
# 한국어 임베딩을 위한 mecab 설치
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git
%cd Mecab-ko-for-Google-Colab
!bash install_mecab-ko_on_colab_light_220429.sh

Cloning into 'Mecab-ko-for-Google-Colab'...
remote: Enumerating objects: 138, done.[K
remote: Counting objects: 100% (47/47), done.[K
remote: Compressing objects: 100% (38/38), done.[K
remote: Total 138 (delta 26), reused 22 (delta 8), pack-reused 91 (from 1)[K
Receiving objects: 100% (138/138), 1.72 MiB | 33.78 MiB/s, done.
Resolving deltas: 100% (65/65), done.
/content/Mecab-ko-for-Google-Colab
Installing konlpy.....
Done
Installing mecab-0.996-ko-0.9.2.tar.gz.....
Downloading mecab-0.996-ko-0.9.2.tar.gz.......
from https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
--2024-10-25 09:44:32--  https://bitbucket.org/eunjeon/mecab-ko/downloads/mecab-0.996-ko-0.9.2.tar.gz
Resolving bitbucket.org (bitbucket.org)... 13.200.41.136, 13.200.41.135, 13.200.41.134, ...
Connecting to bitbucket.org (bitbucket.org)|13.200.41.136|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://bbuseruploads.s3.amazonaws.com/eunjeon/mecab-ko/download

In [5]:
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

In [6]:
import pandas as pd

In [9]:
# Load the crawled data CSV into a DataFrame
crawled_data_df = pd.read_csv(
    '/content/crawl_complete_data.csv',
)

In [10]:
# Display the (rows, columns) shape of the loaded data
crawled_data_df.shape

(1116, 8)

In [11]:
# Display the column labels of the loaded data
crawled_data_df.columns

Index(['idx', 'text', 'files', 'url', 'title', 'published_date',
       'deadline_date', 'tag'],
      dtype='object')

In [12]:
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Mecab

In [13]:
# Custom tokenizer for Korean text
class CustomTokenizer:
    """Callable tokenizer that splits text into morphemes and drops short ones.

    The instance wraps a tagger object and is meant to be passed wherever a
    ``tokenizer(text) -> list[str]`` callable is expected.

    Args:
        tagger: Any object exposing ``morphs(text)`` that returns a list of
            string tokens.
        max_chars: Only the first ``max_chars`` characters of the input are
            tokenized; the rest is discarded. Defaults to 1000000.
        min_token_len: Tokens shorter than this many characters are dropped.
            Defaults to 2, i.e. single-character tokens are removed.
    """

    def __init__(self, tagger, max_chars=1000000, min_token_len=2):
        self.tagger = tagger
        self.max_chars = max_chars
        self.min_token_len = min_token_len

    def __call__(self, sent):
        """Tokenize ``sent`` and return the tokens that are long enough.

        Args:
            sent: Text to tokenize. An empty string or ``None`` yields ``[]``
                without calling the tagger.

        Returns:
            List of tokens from ``tagger.morphs`` whose length is at least
            ``min_token_len``, in their original order.
        """
        # Nothing to tokenize: skip the tagger call entirely.
        if not sent:
            return []
        # Cap the input length before handing it to the tagger.
        truncated = sent[:self.max_chars]
        return [
            token
            for token in self.tagger.morphs(truncated)
            if len(token) >= self.min_token_len
        ]

In [14]:
# Wrap the MeCab tagger in the custom tokenizer and use it for the bag-of-words counts.
custom_tokenizer = CustomTokenizer(Mecab())
vectorizer = CountVectorizer(
    tokenizer=custom_tokenizer,
    # token_pattern is not used once a tokenizer is supplied; setting it to
    # None avoids scikit-learn's "token_pattern will not be used" warning.
    token_pattern=None,
    # Cap the vocabulary at 3000 terms.
    max_features=3000,
)

In [15]:
# Configure the BERTopic model (fitting happens in a later cell).
model = BERTopic(
    # Language of the documents
    language='korean',
    # Number of top keywords extracted per topic
    top_n_words=10,
    # NOTE(review): BERTopic documents that n_gram_range is not used when a
    # custom vectorizer_model is passed (as it is below) — confirm whether
    # ngram_range should be set on the CountVectorizer instead.
    n_gram_range=(1,3),
    # NOTE(review): per the BERTopic docs this is the minimum number of
    # documents in a topic, not a minimum number of topics — confirm intent.
    min_topic_size=2,
    # Target number of topics; None leaves it unspecified
    nr_topics=None,
    # Requested so the per-document topic probabilities can be visualized later
    calculate_probabilities=True,
    # Optional seed words that anchor each topic; none are given
    seed_topic_list=None,
    # Sentence-embedding model, given by name
    embedding_model="sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens",
    # CountVectorizer built on the custom Korean tokenizer
    vectorizer_model=vectorizer,
)

In [16]:
# Light preprocessing: replace missing text with '' and force every value to str.
# NOTE(review): rows whose text is empty stay in the frame and will be passed
# to the model as empty documents — confirm that is intended.
crawled_data_df['text'] = (
    crawled_data_df['text']
    .fillna('')
    .astype(str)
)

In [17]:
# Fit the topic model on the text column.
# fit_transform is documented to take a list of strings, so pass a plain list
# instead of the pandas Series.
# NOTE(review): no random seed is set anywhere visible, so topic assignments
# may differ between runs — confirm whether reproducibility is required.
topics, probs = model.fit_transform(crawled_data_df['text'].tolist())

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.09k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/731 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [18]:
### Check that every document received a topic label ###
# The 'Count' column holds the number of documents assigned to each topic.
# In the recorded run the listing includes a -1 topic, so a matching total
# means every document got a label — documents labelled -1 are counted too.

topic_counts = model.get_topic_info()["Count"]
SUM = topic_counts.sum()
print("Current document total :", len(crawled_data_df))
print("Sum of 'Count' column :", SUM)

Current document total : 1116
Sum of 'Count' column : 1116


In [33]:
# Number of regular topics, i.e. every topic id except -1.
# Counting the ids directly (instead of len(...) - 1) stays correct even when
# the topic listing contains no -1 row.
topic_length = int((model.get_topic_info()["Topic"] != -1).sum())

In [34]:
### Visualize the similarity between topics ###

model.visualize_heatmap()

In [35]:
### Visualize the extracted topics ###
# NOTE(review): the original author guessed this works on a principle similar to LDA — unconfirmed
# The visualization helps build intuition about the topics that were found

model.visualize_topics()

In [36]:
# Visualize the first entry of probs (presumably the topic probabilities of the first document — TODO confirm)
model.visualize_distribution(probs[0])

In [37]:
# Print what get_topic returns for every topic id from 0 to topic_length - 1
for topic_id in range(topic_length):
  keywords = model.get_topic(topic_id)
  print(topic_id,'번째 토픽 :', keywords, '\n')

0 번째 토픽 : [('학생', 0.021674719737263932), ('평가', 0.019758525427447986), ('kau', 0.016570088553064067), ('학부', 0.01595277128208634), ('영어', 0.015108156495705218), ('학점', 0.014920743031842347), ('강의', 0.013872869871697021), ('회화', 0.012962010637969418), ('과목', 0.012083554169037114), ('학년도', 0.012013063103250397)] 

1 번째 토픽 : [('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05), ('', 1e-05)] 

2 번째 토픽 : [('지질', 0.02305333697764422), ('사진', 0.019436535127311066), ('상금', 0.018829601643993672), ('캐릭터', 0.01753134009435833), ('자원', 0.016874658071744923), ('촬영', 0.015633383151546217), ('수상', 0.014521487070151584), ('공모전', 0.014269467298417304), ('저작', 0.01376209708258819), ('usd', 0.01266128222793185)] 

3 번째 토픽 : [('학점', 0.03999539194960865), ('교류', 0.024973761519487277), ('수강', 0.023929280471657228), ('성적', 0.021428895814861175), ('학기', 0.020049735368771932), ('실습', 0.019049556108020567), ('등록금', 0.016155380100725078), ('조회', 0.0

In [38]:
# Display the per-topic summary table (one row per topic)
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,241,-1_10_2024_참여_00,"[10, 2024, 참여, 00, 접수, 기간, 교육, 제출, 합니다, 에서]",[청년취업사관학교 은평캠퍼스에서 산업 실무 프로젝트 기반 LLM 서비스 기획 과정의...
1,0,25,0_학생_평가_kau_학부,"[학생, 평가, kau, 학부, 영어, 학점, 강의, 회화, 과목, 학년도]",[교수학습센터에서는 2024학년도 하계 KAU 오프라인 비교과 영어회화 프로그램을 ...
2,1,20,1____,"[, , , , , , , , , ]","[, , ]"
3,2,19,2_지질_사진_상금_캐릭터,"[지질, 사진, 상금, 캐릭터, 자원, 촬영, 수상, 공모전, 저작, usd]",[제27회 말 사진 공모전 「동행」\n ● 참여 대상\n - 대한민국 국민 누구나 ...
4,3,18,3_학점_교류_수강_성적,"[학점, 교류, 수강, 성적, 학기, 실습, 등록금, 조회, 장애, 신청]",[2024학년도 2학기 홍익대학교 학점교류 수강안내\n 1. 신청자격 : 2학년 ...
...,...,...,...,...,...
140,139,2,139_퐁당_패밀리_서포터_투표,"[퐁당, 패밀리, 서포터, 투표, 여행, 재단, 컨텐츠, 미션, 문화, 사랑]",[2024 나만의 정선아리랑 여행 공모전\n\n가리왕산에게 주는 행운의 편지\n나만...
141,140,2,140_아트_33_assist_that,"[아트, 33, assist, that, sorry, can, bp, 더욱, art...",[구글폼으로 신청하기\n https://forms.gle/5m5bP9EFamhpz7...
142,141,2,141_장학금_학위_석사_졸업,"[장학금, 학위, 석사, 졸업, 학년도, 참조, 추천, 위원회, 수여식, 학부]",[2023학년도 후기(제67회) 학위수여식 실시 안내\n 졸업을 축하드리며 학위수...
143,142,2,142_남산_학당_세종_체류,"[남산, 학당, 세종, 체류, 광부, 체육관, 한국, 유학, 사례, 경험]","[「지속가능한 남산」프로젝트의 첫번째 시민참여 이벤트,\n 남산에 애정과 관심을 가..."


In [39]:
# Map every topic id to the positions of the documents assigned to it.
# The -1 bucket and one bucket per regular topic are created up front so the
# keys come out in ascending order and a topic with no documents still appears.
topic_to_docs = {topic_id: [] for topic_id in range(-1, topic_length)}

# topics holds one label per document, in input order, so the enumerate index
# is the document's position. setdefault keeps the loop from raising KeyError
# if a label falls outside the pre-created range.
for idx, topic in enumerate(topics):
    topic_to_docs.setdefault(topic, []).append(idx)

# Display the indices of documents for each topic
for topic, doc_indices in topic_to_docs.items():
    print(f"Topic {topic}: {doc_indices}\n\n")

Topic -1: [1, 2, 8, 10, 11, 15, 22, 24, 35, 40, 45, 49, 53, 55, 57, 58, 64, 67, 68, 69, 71, 72, 73, 76, 78, 83, 87, 90, 91, 95, 101, 119, 121, 127, 130, 146, 152, 153, 170, 173, 177, 178, 180, 181, 184, 194, 195, 198, 202, 209, 210, 214, 220, 221, 224, 230, 232, 233, 234, 237, 238, 250, 255, 259, 261, 268, 273, 275, 286, 289, 293, 295, 296, 297, 298, 299, 303, 307, 309, 312, 320, 323, 331, 341, 347, 348, 351, 352, 355, 368, 372, 373, 380, 388, 393, 402, 404, 408, 410, 440, 452, 462, 471, 472, 484, 486, 487, 489, 492, 495, 501, 509, 513, 520, 521, 527, 528, 529, 534, 539, 541, 543, 554, 556, 557, 560, 564, 573, 577, 590, 593, 595, 598, 607, 612, 614, 625, 626, 627, 628, 635, 638, 641, 647, 650, 652, 661, 667, 675, 682, 685, 687, 688, 690, 698, 701, 702, 703, 710, 711, 733, 739, 742, 756, 760, 766, 768, 773, 777, 782, 783, 784, 785, 787, 788, 790, 793, 794, 795, 798, 800, 814, 815, 817, 819, 825, 835, 841, 843, 850, 852, 854, 855, 858, 865, 866, 870, 875, 877, 887, 896, 897, 901, 905, 90

In [40]:
# Write one CSV per topic, each listing the document indices of that topic
for topic, doc_indices in topic_to_docs.items():
    # The -1 topic gets a dedicated file name; every other topic embeds its id
    if topic == -1:
        filename = "Topic_neg_1.csv"
    else:
        filename = f"Topic_{topic}.csv"

    # Single-column frame, written without the pandas row index
    topic_index_df = pd.DataFrame({'Document Index': doc_indices})
    topic_index_df.to_csv(filename, index=False)