## 머신러닝 모델 이용

## 라이브러리 로드

In [1]:
# 데이터 분석을 위한 pandas, 수치계산을 위한 numpy, 시각화를 위한 seaborn, matplotlib 을 로드합니다.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import Counter

RANDOM_STATE = 110

# 경고 메시지 억제
warnings.filterwarnings('ignore')

# 학습, 예측 데이터셋을 불러옵니다.
df_train = pd.read_csv("../../data/train_df_translate_del.csv")
df_test = pd.read_csv("../../data/test_df_translate_del.csv")

In [3]:
df_train.drop_duplicates(subset=['제목', '키워드'], keep='first', inplace=True)

In [4]:
import pandas as pd
import re

# 제거할 단어 리스트
remove_words = ['http', 'www.', '.kr', '.net', '.com']

def remove_specific_words(text):
    words = text.split(',')
    filtered_words = [word for word in words if not any(remove_word in word for remove_word in remove_words)]
    return ','.join(filtered_words)

# '키워드' 열에 함수 적용
df_train['키워드'] = df_train['키워드'].apply(remove_specific_words)
df_test['키워드'] = df_test['키워드'].apply(remove_specific_words)

In [5]:
# 제거할 기호 리스트
symbols_to_remove = {'*', '“', '▷', '‧', '■', '○', '!', '⋯', '→', '㎡', ')', '·', '․', '∙', '?',
                     '㈜', '.', '(', '•', ':', '⋅', '%', ';', '/', '˙', '・', '▲', '’', '~'} # '&' '-'

# 기호를 제거하는 함수
def remove_symbols(text):
    pattern = '[' + re.escape(''.join(symbols_to_remove)) + ']'  # 제거할 기호 패턴 생성
    return re.sub(pattern, '', text)

# '키워드' 열에서 기호 제거
df_train['키워드'] = df_train['키워드'].apply(remove_symbols)
df_test['키워드'] = df_test['키워드'].apply(remove_symbols)

In [6]:
df_train.shape, df_test.shape

((54315, 4), (23405, 3))

## 3. 앙상블

In [7]:
# 텍스트와 레이블 분리
X = df_train['키워드']  # 키워드 컬럼
y = df_train['분류']  # 카테고리 컬럼

# 데이터 분할 (클래스 비율 동일하게 유지)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# 레이블 인코딩
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# Count Vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_valid_count = count_vectorizer.transform(X_valid)

# Word2Vec 벡터화 (CBOW)
w2v_model_cbow = Word2Vec(sentences=[text.split() for text in X_train], vector_size=300, window=10, min_count=2, workers=4, sg=0)
X_train_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# Word2Vec 벡터화 (Skip-gram)
w2v_model_sg = Word2Vec(sentences=[text.split() for text in X_train], vector_size=300, window=10, min_count=2, workers=4, sg=1)
X_train_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# 모델 정의
model_count = LogisticRegression(random_state=RANDOM_STATE)
model_w2v_cbow = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model_w2v_sg = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

# 모델 학습
model_count.fit(X_train_count, y_train_encoded)
model_w2v_cbow.fit(X_train_w2v_cbow, y_train_encoded)
model_w2v_sg.fit(X_train_w2v_sg, y_train_encoded)

# 하드보팅
def hard_voting(models, X_valid_list, y_valid, method_name):
    predictions = np.array([model.predict(X_valid) for model, X_valid in zip(models, X_valid_list)]).T
    final_predictions = []
    for preds in predictions:
        vote_counts = Counter(preds)
        if len(vote_counts) == len(models):  # 다수표가 없는 경우
            final_predictions.append(preds[0])  # Logistic Regression 모델의 값을 취함
        else:
            final_predictions.append(vote_counts.most_common(1)[0][0])
    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 소프트보팅
def soft_voting(models, X_valid_list, y_valid, method_name):
    predictions = np.array([model.predict_proba(X_valid) for model, X_valid in zip(models, X_valid_list)])
    avg_predictions = np.mean(predictions, axis=0)
    final_predictions = np.argmax(avg_predictions, axis=1)
    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 모델 리스트
models = [model_count, model_w2v_cbow]
X_valid_list = [X_valid_count, X_valid_w2v_cbow, X_valid_w2v_sg]

# 하드보팅
hard_voting(models, X_valid_list, y_valid_encoded, "Hard Voting")

# 소프트보팅
soft_voting(models, X_valid_list, y_valid_encoded, "Soft Voting")

Hard Voting - Accuracy: 0.8413881984718771, Macro F1-score: 0.6804965483246619
Soft Voting - Accuracy: 0.8353125287673755, Macro F1-score: 0.6729745657278795


.