In [None]:
import re
import pandas as pd
from tqdm import tqdm

import torch
from pykospacing import Spacing
from konlpy.tag import Hannanum
from transformers import BertTokenizer, BertModel

tqdm.pandas() # Hook tqdm into pandas so Series/DataFrame gain `progress_apply` (used below for the embedding pass)

In [None]:
# Load the pre-filtered news articles.
data = pd.read_excel('../../../../../data/filtered_news.xlsx')

# Normalise word spacing in every article body with PyKoSpacing before
# any morphological analysis is attempted.
spacing = Spacing()
processed_features = [spacing(raw_text) for raw_text in data['feature']]
data['feature'] = processed_features

In [None]:
# Tokenize each article with the Hannanum morphological analyzer,
# keeping only the nouns it extracts.
hannanum = Hannanum()
tokens = [hannanum.nouns(article_text) for article_text in data['feature']]
data['tokens'] = tokens

# Stopword removal: read the stopword file (one entry per line) and drop
# every token that appears in it.
with open('../../../../../data/stopwords-ko.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# `stopwords` stays a list for any later consumer, but lookups go through a
# set: testing membership in a list rescans it for every single token,
# whereas a set lookup is constant-time.
stopword_set = set(stopwords)

filtered_tokens = [
    [word for word in token_list if word not in stopword_set]
    for token_list in data['tokens']
]
data['tokens'] = filtered_tokens

In [None]:
# Load the KoBERT tokenizer and encoder from the same checkpoint id so the
# two can never drift apart.
# NOTE(review): this checkpoint is commonly paired with a SentencePiece-based
# tokenizer; confirm that BertTokenizer splits text as intended (e.g. check
# the share of [UNK] tokens on a sample).
KOBERT_CHECKPOINT = 'monologg/kobert'
tokenizer = BertTokenizer.from_pretrained(KOBERT_CHECKPOINT)
model = BertModel.from_pretrained(KOBERT_CHECKPOINT)

# KoBERT embedding helper
def text_to_vector(text):
    """Encode *text* with KoBERT and return the first-token ([CLS]) embedding.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: String to embed.

    Returns:
        A 1-D NumPy array: the last hidden state at position 0 of the
        single encoded sequence.
    """
    # The encoder only has position embeddings for a fixed number of
    # positions; without truncation a long article produces more token ids
    # than that and the forward pass fails. Cap the sequence at the model's
    # own limit instead of relying on the tokenizer's default.
    inputs = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        max_length=model.config.max_position_embeddings,
    )
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)

    # Batch index 0 (the only sequence), token index 0.
    return outputs.last_hidden_state[0, 0].numpy()

# Collapse each token list into one space-separated string.
data['tokens_str'] = data['tokens'].apply(' '.join)

# Embed every joined string (with a progress bar) and store the resulting
# vector in a new column.
data['vector'] = data['tokens_str'].progress_apply(text_to_vector)