In [None]:
import pickle

import pandas as pd
from konlpy.tag import Hannanum
from pykospacing import Spacing
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the combined question set; the CSV file is cp949-encoded.
data = pd.read_csv('../../../data/combined_questions.csv', encoding='cp949')

# Pre-process with PyKoSpacing: run every 'feature' entry through the
# spacing model and overwrite the column with the results.
spacing = Spacing()
data['feature'] = [spacing(text) for text in data['feature']]

# Tokenize with the Hannanum morphological analyzer: keep only the nouns
# of each spacing-corrected text, stored as one list per row in 'tokens'.
hannanum = Hannanum()
data['tokens'] = [hannanum.nouns(text) for text in data['feature']]

In [None]:
# Stopword removal: read one stopword per line from the UTF-8 list file.
with open('../../../data/stopwords-ko.txt', 'r', encoding='utf-8') as file:
    stopwords = file.read().splitlines()

# Membership tests against a list are linear in its length; a set makes each
# lookup constant-time. `stopwords` itself stays a list for any other user.
stopword_set = set(stopwords)

# Drop every token that appears in the stopword list, row by row.
data['tokens'] = [
    [word for word in token_list if word not in stopword_set]
    for token_list in data['tokens']
]

In [None]:
# Join each token list into one space-separated string, which is the input
# form TfidfVectorizer consumes.
data['tokens_str'] = data['tokens'].apply(' '.join)

# Fit a TF-IDF model on the joined strings and vectorize every row.
# NOTE(review): the default token_pattern of TfidfVectorizer only keeps
# tokens of two or more characters, so single-character nouns are dropped.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['tokens_str'])

# Convert the sparse matrix into a dense DataFrame, one column per vocabulary
# term. Reusing data.index guarantees the concat below aligns row-for-row.
tfidf_df = pd.DataFrame(
    X.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=data.index,
)

# Append the TF-IDF features to the original columns.
df_tfidf = pd.concat([data, tfidf_df], axis=1)

# Persist the fitted vectorizer (optional). The original called
# `model.save(...)`, but no `model` is defined in this notebook and
# TfidfVectorizer has no `save` method, so the fitted object is pickled.
# Only unpickle this file from a trusted source.
with open('Hannanum_TFiDF_model.model', 'wb') as model_file:
    pickle.dump(vectorizer, model_file)