In [None]:
import re
import pandas as pd
from tqdm import tqdm

import torch
from pykospacing import Spacing
from konlpy.tag import Hannanum
from transformers import ElectraModel, ElectraTokenizer

tqdm.pandas() # Integrate tqdm with pandas (enables the progress_apply call used later in this notebook)

In [None]:
# Load the spreadsheet that holds the news data.
data = pd.read_excel('../../../../../data/filtered_news.xlsx')

# Preprocess every entry of the 'feature' column with PyKoSpacing and
# write the results back into the same column.
spacing = Spacing()
processed_features = [spacing(text) for text in data['feature']]
data['feature'] = processed_features

In [None]:
# Tokenize each 'feature' entry with the Hannanum morphological analyzer,
# keeping only what `nouns()` returns, and store the lists in 'tokens'.
hannanum = Hannanum()
tokens = [hannanum.nouns(text) for text in data['feature']]
data['tokens'] = tokens

In [None]:
# Stopword removal: drop every token that appears in the stopword file
# (one stopword per line).
with open('../../../../../data/stopwords-ko.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# Membership tests against a set are O(1) on average, whereas scanning the
# list costs O(len(stopwords)) for every single token. `stopwords` itself is
# left as a list so any other cell that uses it keeps working unchanged.
stopword_set = set(stopwords)

filtered_tokens = [
    [word for word in token_list if word not in stopword_set]
    for token_list in data['tokens']
]
data['tokens'] = filtered_tokens

In [None]:
# Load the KoELECTRA tokenizer and model; both come from the same
# pretrained checkpoint, so the identifier is spelled out only once.
koelectra_checkpoint = 'monologg/koelectra-base-v3-discriminator'
tokenizer = ElectraTokenizer.from_pretrained(koelectra_checkpoint)
model = ElectraModel.from_pretrained(koelectra_checkpoint)

# Function that returns the KoELECTRA embedding of a text.
def get_koelectra_embeddings(text):
    """Return the mean-pooled KoELECTRA last hidden state for ``text``.

    The text is encoded with the module-level ``tokenizer`` (truncating
    input that is too long for the model) and fed through the module-level
    ``model``. The last hidden state is averaged over dim 1 and squeezed,
    then converted to a NumPy array.

    Args:
        text: Input handed to the tokenizer; this notebook passes one
            space-joined token string per call.

    Returns:
        numpy.ndarray: The pooled embedding. For a single input string this
        is presumably a 1-D vector of the model's hidden size (dim 1 being
        the token axis) -- confirm against the transformers documentation.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    # Inference only: disabling autograd avoids building a computation graph
    # on every call, which saves memory and time over a whole DataFrame. The
    # resulting tensor does not require grad, so no explicit detach is needed.
    with torch.no_grad():
        outputs = model(**inputs)
        pooled = outputs.last_hidden_state.mean(dim=1).squeeze()
    return pooled.numpy()

# Turn each token list into a single space-separated string.
data['tokens_str'] = data['tokens'].apply(' '.join)

# Compute an embedding for every joined string, with a tqdm progress bar.
data['koelectra_embedding'] = data['tokens_str'].progress_apply(get_koelectra_embeddings)

# Expand the per-row embeddings into their own DataFrame for visualization.
embedding_rows = data['koelectra_embedding'].tolist()
koelectra_embeddings_df = pd.DataFrame(embedding_rows)