In [None]:
import re

import numpy as np
import pandas as pd
from pykospacing import Spacing

from konlpy.tag import Kkma
from gensim.models import Word2Vec

In [None]:
# Module-level pykospacing Spacing instance, built once here and called by
# preprocessing() for every text it handles.
spacing = Spacing()

def preprocessing(text):
    """Normalize raw text before morphological analysis.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed, and the result is passed through
    the module-level ``spacing`` callable.

    Args:
        text: The raw input string.

    Returns:
        Whatever ``spacing`` returns for the cleaned string.
    """
    lowered = text.lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return spacing(without_punctuation)

def get_stopwords(path='../../../../../data/stopwords-ko.txt'):
    """Load stopwords from a UTF-8 text file containing one stopword per line.

    Args:
        path: Location of the stopword file. Defaults to the path this
            notebook has always used, so existing ``get_stopwords()`` calls
            behave as before.

    Returns:
        set[str]: The stopwords with surrounding whitespace stripped. Blank
        lines are skipped so the empty string never ends up in the set.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(path, 'r', encoding='utf-8') as file:
        stripped_lines = (line.strip() for line in file)
        # Filter out blank lines; an empty "stopword" carries no meaning.
        return {word for word in stripped_lines if word}

_kkma_instance = None  # shared Kkma tagger, created lazily on first use


def _get_kkma():
    """Return the shared Kkma tagger, constructing it only on the first call."""
    global _kkma_instance
    if _kkma_instance is None:
        _kkma_instance = Kkma()
    return _kkma_instance


def analyze_and_remove_stopwords(text, stopwords):
    """Split text into morphemes and drop the ones listed as stopwords.

    The text is cleaned with ``preprocessing`` and tokenized with
    ``Kkma.morphs``. The tagger is built once and reused across calls
    instead of being re-created for every row.

    Args:
        text: Raw input string. ``None`` and float NaN (missing cells) are
            treated as empty documents.
        stopwords: Container of morphemes to exclude; membership is tested
            with ``in``.

    Returns:
        list: The morphemes of ``text`` that are not in ``stopwords``, in
        their original order. Empty for missing input.
    """
    # Missing values have no text to analyze; `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return []

    cleaned = preprocessing(text)
    morphs = _get_kkma().morphs(cleaned)
    return [word for word in morphs if word not in stopwords]

In [None]:
# Load the stopword set and read the news spreadsheet into a DataFrame.
stopwords = get_stopwords()
df = pd.read_excel('../../../../../data/filtered_news.xlsx')

# Run morphological analysis and stopword removal on every 'feature' entry.
df['processed_feature'] = df['feature'].apply(
    analyze_and_remove_stopwords, args=(stopwords,)
)

In [None]:
# Train the Word2Vec model on the token lists in 'processed_feature'.
model = Word2Vec(
    df['processed_feature'],
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)
  
# Average the word vectors of a token list into one fixed-size feature vector.
def get_feature_vector(words, model):
    """Return the mean embedding of the words that the model knows.

    Args:
        words: Iterable of tokens.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookup by
            token) and ``vector_size``, such as a trained Word2Vec model.

    Returns:
        numpy.ndarray: 1-D array of length ``model.vector_size``. It is the
        element-wise mean of the vectors of all tokens found in ``model.wv``,
        or an all-zero vector when none of the tokens are found. The result
        is always an ndarray, so the resulting column has a uniform type.
    """
    vectors = [model.wv[word] for word in words if word in model.wv]
    if not vectors:
        # float32 is assumed to match the dtype of the model's vectors -- TODO confirm.
        return np.zeros(model.vector_size, dtype=np.float32)
    return np.mean(vectors, axis=0)

# Add a 'feature_vector' column holding one averaged vector per row.
df['feature_vector'] = df['processed_feature'].apply(
    get_feature_vector, args=(model,)
)