In [None]:
import re
import numpy as np
import pandas as pd
from pykospacing import Spacing

import spacy
import rhinoMorph
from gensim.models import Word2Vec

# Start the rhinoMorph analyzer once; the returned handle is what
# rhinoMorph.onlyMorph_list() receives as its first argument later on.
rn = rhinoMorph.startRhino()

In [None]:
# Single shared pykospacing Spacing instance; preprocessing() calls it on
# every input text, so it is created once at module level.
spacing = Spacing()

def preprocessing(text):
    """Normalize a raw text string before morphological analysis.

    The text is passed through the module-level ``spacing`` callable,
    lowercased, and then stripped of every character that is neither a
    word character (``\\w``) nor whitespace (``\\s``).

    Args:
        text: Raw input string.

    Returns:
        The normalized string.
    """
    respaced = spacing(text)
    lowered = respaced.lower()
    # Drop anything that is not a word character or whitespace.
    return re.sub(r'[^\w\s]', '', lowered)

# Load the stopword list.
def get_stopwords(path='../../../../../data/stopwords-ko.txt'):
    """Read a stopword file (one entry per line) into a set.

    Args:
        path: Location of the stopword file. Defaults to the path this
            notebook has always read from, so existing zero-argument
            calls behave as before.

    Returns:
        set[str]: Every non-blank line of the file with surrounding
        whitespace removed.

    Raises:
        OSError: If the file cannot be opened (e.g. FileNotFoundError).
    """
    # 'utf-8-sig' decodes plain UTF-8 and also discards a leading BOM, which
    # would otherwise stay attached to the first stopword and stop it matching.
    with open(path, 'r', encoding='utf-8-sig') as file:
        stripped_lines = (line.strip() for line in file)
        # Skip blank lines so the empty string never becomes a "stopword".
        return {word for word in stripped_lines if word}

# Load the stopword set once at module level; the same set is passed to
# morph_and_remove_stopwords() for every document.
stopwords = get_stopwords()

# Morphological analysis and stopword removal.
def morph_and_remove_stopwords(text, stopwords):
    """Tokenize ``text`` into morphemes and drop the ones listed as stopwords.

    The text is first normalized with ``preprocessing()``, then split into
    morphemes by ``rhinoMorph.onlyMorph_list`` using the module-level ``rn``
    handle, and finally filtered against ``stopwords``.

    Args:
        text: Raw document text. ``None`` or a float NaN (how pandas
            typically represents an empty spreadsheet cell) is treated as
            an empty document.
        stopwords: Container of tokens to exclude (membership is tested
            with ``in``).

    Returns:
        list: The remaining morphemes in their original order; an empty
        list when ``text`` is missing.
    """
    # Missing values are not strings and cannot be fed to the text pipeline;
    # treat them as documents with no tokens instead of raising mid-apply.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return []

    text = preprocessing(text)
    morphs = rhinoMorph.onlyMorph_list(rn, text)
    return [word for word in morphs if word not in stopwords]

# Document vectorization.
def get_document_vector(tokens, model):
    """Average the embeddings of a document's tokens into one vector.

    Args:
        tokens: Iterable of tokens for one document.
        model: Object exposing ``wv`` (supports ``in`` and item lookup by
            token) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``; a zero vector of length ``model.vector_size`` when
        none of the tokens are found.
    """
    found_vectors = []
    for token in tokens:
        # Tokens without an entry in model.wv are skipped.
        if token in model.wv:
            found_vectors.append(model.wv[token])

    if len(found_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(found_vectors, axis=0)

In [None]:
# Load the dataset from the filtered_news.xlsx workbook.
df = pd.read_excel('../../../../../data/filtered_news.xlsx')

# Run morphological analysis and stopword removal on every entry of the
# 'feature' column; the shared stopword set is forwarded as an extra argument.
df['processed_feature'] = df['feature'].apply(
    morph_and_remove_stopwords, args=(stopwords,)
)

In [None]:
# Train the Word2Vec model on the tokenized documents.
# Per the gensim docs, sg=1 selects skip-gram and hs=1 enables hierarchical
# softmax; tokens seen fewer than min_count times are ignored.
model = Word2Vec(
    sentences=df['processed_feature'],
    vector_size=100,
    window=4,
    min_count=2,
    sg=1,
    hs=1,
)

# Vectorize each document by averaging the embeddings of its tokens.
df['vector'] = df['processed_feature'].apply(get_document_vector, args=(model,))