In [None]:
import re

import numpy as np
import pandas as pd
from tqdm import tqdm
from pykospacing import Spacing

import spacy
from gensim.models import Word2Vec

nlp = spacy.load("ko_core_news_sm")  # Load the Korean spaCy model used for tokenization
tqdm.pandas()                        # Integrate tqdm with pandas (registers `progress_apply`)

In [None]:
# Load the stopword list: one entry per line, surrounding whitespace stripped,
# blank lines ignored, duplicates collapsed by the set.
stopwords_file = '../../../../../data/stopwords-ko.txt'
with open(stopwords_file, 'r', encoding='utf-8') as f:
    stopwords = {entry for entry in map(str.strip, f) if entry}

# Stopword-filtering tokenizer
def remove_stopwords(text, stopwords):
    """Tokenize ``text`` with the module-level spaCy pipeline and filter the tokens.

    Args:
        text: The string to tokenize (passed directly to ``nlp``).
        stopwords: Container of token strings to discard; membership is tested
            with ``in``.

    Returns:
        A list of token texts, in document order, excluding tokens whose text
        is in ``stopwords`` and tokens flagged as punctuation or whitespace.
    """
    kept = []
    for token in nlp(text):
        if token.text in stopwords:
            continue
        if token.is_punct or token.is_space:
            continue
        kept.append(token.text)
    return kept

# Shared PyKoSpacing instance, created once and reused for every document.
spacing = Spacing()

def preprocessing(text):
    """Normalize raw text before tokenization.

    Steps, in order:
      1. Run the text through the shared ``spacing`` (PyKoSpacing) callable.
      2. Lowercase the result.
      3. Remove every character that is neither a word character (``\\w``,
         which includes the underscore) nor whitespace.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned string.
    """
    respaced = spacing(text)
    lowered = respaced.lower()
    return re.sub(r'[^\w\s]', '', lowered)

# Document vectorization
def get_document_vector(tokens, model):
    """Average the word vectors of a document's tokens.

    Args:
        tokens: Iterable of token strings for one document.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            token) and ``vector_size``, e.g. a trained Word2Vec model.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``. If no token is found, a zero vector of length
        ``model.vector_size`` is returned instead.
    """
    keyed_vectors = model.wv
    found = []
    for tok in tokens:
        if tok in keyed_vectors:
            found.append(keyed_vectors[tok])
    if len(found) == 0:
        # No in-vocabulary token: fall back to an all-zero vector.
        return np.zeros(model.vector_size)
    return np.mean(found, axis=0)

In [None]:
# Load the filtered news articles.
df = pd.read_excel('../../../../../data/filtered_news.xlsx')

# Preprocess and tokenize each article in the 'feature' column.
# The spacing model and the spaCy pipeline both run once per row, so use the
# tqdm-enabled `progress_apply` (registered by `tqdm.pandas()`) to show progress.
# The resulting column is identical to what plain `.apply` would produce.
df['cleaned'] = df['feature'].progress_apply(lambda x: remove_stopwords(preprocessing(x), stopwords))

# Train Word2Vec on the token lists:
#   vector_size=100 -> 100-dimensional embeddings
#   window=4        -> context window of 4 tokens
#   hs=1            -> hierarchical softmax enabled
#   min_count=2     -> ignore tokens occurring fewer than 2 times
#   sg=1            -> skip-gram architecture
model = Word2Vec(sentences=df['cleaned'], vector_size=100, window=4, hs=1, min_count=2, sg=1)

# Vectorize every document as the mean of its in-vocabulary word vectors.
df['vector'] = df['cleaned'].apply(lambda x: get_document_vector(x, model))