## 머신러닝 모델 이용

## 라이브러리 로드

In [1]:
# 데이터 분석을 위한 pandas, 수치계산을 위한 numpy, 시각화를 위한 seaborn, matplotlib 을 로드합니다.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## 데이터 로드

In [2]:
RANDOM_STATE = 110

# Load the training and prediction datasets.
df_train = pd.read_csv("../../data/train_df_1012.csv")
df_test = pd.read_csv("../../data/test_df_1012.csv")
# Last expression of the cell, so the notebook displays both shapes.
df_train.shape, df_test.shape

((54314, 4), (23405, 3))

In [3]:
# Show the five most frequent values of the '분류' (category) column.
df_train['분류'].value_counts().head(5)

지역          26850
경제:부동산       3447
사회:사건_사고     2545
경제:반도체       2309
사회:사회일반      1457
Name: 분류, dtype: int64

## 1. 로지스틱

max_iter=100이 적절해 보임 (300, 500으로 늘려도 Accuracy와 Macro F1이 거의 달라지지 않음)

In [10]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec

RANDOM_STATE = 110

# Suppress warning messages.
# NOTE(review): this silences every warning, including any the models fitted
# below may raise (e.g. about non-convergence) - confirm that is intended.
warnings.filterwarnings('ignore')

# Load the training and prediction datasets.
df_train = pd.read_csv("../../data/train_df_1012.csv")
df_test = pd.read_csv("../../data/test_df_1012.csv")

# Separate the text from the labels.
X = df_train['키워드']  # keyword column
y = df_train['분류']  # category column

# Split the data, stratifying on the label so both parts keep the same class ratio.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# Label encoding: fit on the training labels, then reuse the mapping for validation.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# TF-IDF vectorization: vocabulary and weights come from the training texts only.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# Count vectorization: vocabulary comes from the training texts only.
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_valid_count = count_vectorizer.transform(X_valid)


def _mean_word_vectors(texts, keyed_vectors):
    """Return one averaged word vector per text as a 2-D array.

    Each text is split on whitespace and the vectors of the tokens found in
    ``keyed_vectors`` are averaged. A text with no known token gets a zero
    vector whose length is taken from ``keyed_vectors.vector_size``, so the
    fallback can never drift from the dimension the model was trained with.
    """
    dim = keyed_vectors.vector_size
    rows = []
    for text in texts:
        vectors = [keyed_vectors[word] for word in text.split() if word in keyed_vectors]
        rows.append(np.mean(vectors, axis=0) if vectors else np.zeros(dim))
    return np.array(rows)


# Tokenize the training texts once; both Word2Vec models train on the same sentences.
train_sentences = [text.split() for text in X_train]

# NOTE(review): no seed is passed to Word2Vec and workers=4, so the embeddings
# presumably differ between runs - confirm against the gensim documentation.

# Word2Vec vectorization (CBOW, sg=0)
w2v_model_cbow = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4, sg=0)
X_train_w2v_cbow = _mean_word_vectors(X_train, w2v_model_cbow.wv)
X_valid_w2v_cbow = _mean_word_vectors(X_valid, w2v_model_cbow.wv)

# Word2Vec vectorization (Skip-gram, sg=1)
w2v_model_sg = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4, sg=1)
X_train_w2v_sg = _mean_word_vectors(X_train, w2v_model_sg.wv)
X_valid_w2v_sg = _mean_word_vectors(X_valid, w2v_model_sg.wv)

# Train and evaluate a logistic regression model.
def train_and_evaluate(X_train, X_valid, y_train, y_valid, method_name, *, max_iter=10):
    """Fit a logistic regression and print validation accuracy and macro F1.

    Args:
        X_train: Training features passed to ``model.fit``.
        X_valid: Validation features passed to ``model.predict``.
        y_train: Training labels.
        y_valid: Validation labels the predictions are scored against.
        method_name: Label printed in front of the scores.
        max_iter: Iteration cap forwarded to ``LogisticRegression``. The
            default keeps the value this cell originally hard-coded.

    Returns:
        None. The scores are only printed.
    """
    model = LogisticRegression(max_iter=max_iter, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    y_valid_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    macro_f1 = f1_score(y_valid, y_valid_pred, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")
    # Uncomment for a per-class report:
    # print(classification_report(y_valid, y_valid_pred, target_names=label_encoder.classes_))

# Score each feature representation in turn: TF-IDF, count vectors, then the
# two Word2Vec variants.
feature_sets = [
    ("TF-IDF", X_train_tfidf, X_valid_tfidf),
    ("Count Vectorization", X_train_count, X_valid_count),
    ("Word2Vec (CBOW)", X_train_w2v_cbow, X_valid_w2v_cbow),
    ("Word2Vec (Skip-gram)", X_train_w2v_sg, X_valid_w2v_sg),
]
for label, train_features, valid_features in feature_sets:
    train_and_evaluate(train_features, valid_features, y_train_encoded, y_valid_encoded, label)

TF-IDF - Accuracy: 0.6211911994844886, Macro F1-score: 0.10295490748498067
Count Vectorization - Accuracy: 0.7752922765350272, Macro F1-score: 0.4820289910032497
Word2Vec (CBOW) - Accuracy: 0.7004510724477584, Macro F1-score: 0.2673183690911866
Word2Vec (Skip-gram) - Accuracy: 0.6558961612814139, Macro F1-score: 0.18534134203238484


In [11]:
# Train and evaluate a logistic regression model.
def train_and_evaluate(X_train, X_valid, y_train, y_valid, method_name, *, max_iter=100):
    """Fit a logistic regression and print validation accuracy and macro F1.

    Args:
        X_train: Training features passed to ``model.fit``.
        X_valid: Validation features passed to ``model.predict``.
        y_train: Training labels.
        y_valid: Validation labels the predictions are scored against.
        method_name: Label printed in front of the scores.
        max_iter: Iteration cap forwarded to ``LogisticRegression``. The
            default keeps the value this cell originally hard-coded.

    Returns:
        None. The scores are only printed.
    """
    model = LogisticRegression(max_iter=max_iter, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    y_valid_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    macro_f1 = f1_score(y_valid, y_valid_pred, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")
    # Uncomment for a per-class report:
    # print(classification_report(y_valid, y_valid_pred, target_names=label_encoder.classes_))

# Score each feature representation in turn: TF-IDF, count vectors, then the
# two Word2Vec variants.
feature_sets = (
    ("TF-IDF", X_train_tfidf, X_valid_tfidf),
    ("Count Vectorization", X_train_count, X_valid_count),
    ("Word2Vec (CBOW)", X_train_w2v_cbow, X_valid_w2v_cbow),
    ("Word2Vec (Skip-gram)", X_train_w2v_sg, X_valid_w2v_sg),
)
for label, train_features, valid_features in feature_sets:
    train_and_evaluate(train_features, valid_features, y_train_encoded, y_valid_encoded, label)

TF-IDF - Accuracy: 0.7949921752738655, Macro F1-score: 0.4995781881822617
Count Vectorization - Accuracy: 0.8337475835404584, Macro F1-score: 0.6715901126181764
Word2Vec (CBOW) - Accuracy: 0.7629568259228574, Macro F1-score: 0.4911300351003649
Word2Vec (Skip-gram) - Accuracy: 0.7561447114056891, Macro F1-score: 0.4671125090590625


In [12]:
# Train and evaluate a logistic regression model.
def train_and_evaluate(X_train, X_valid, y_train, y_valid, method_name, *, max_iter=300):
    """Fit a logistic regression and print validation accuracy and macro F1.

    Args:
        X_train: Training features passed to ``model.fit``.
        X_valid: Validation features passed to ``model.predict``.
        y_train: Training labels.
        y_valid: Validation labels the predictions are scored against.
        method_name: Label printed in front of the scores.
        max_iter: Iteration cap forwarded to ``LogisticRegression``. The
            default keeps the value this cell originally hard-coded.

    Returns:
        None. The scores are only printed.
    """
    model = LogisticRegression(max_iter=max_iter, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    y_valid_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    macro_f1 = f1_score(y_valid, y_valid_pred, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")
    # Uncomment for a per-class report:
    # print(classification_report(y_valid, y_valid_pred, target_names=label_encoder.classes_))

# Score each feature representation in turn: TF-IDF, count vectors, then the
# two Word2Vec variants.
feature_sets = [
    ("TF-IDF", X_train_tfidf, X_valid_tfidf),
    ("Count Vectorization", X_train_count, X_valid_count),
    ("Word2Vec (CBOW)", X_train_w2v_cbow, X_valid_w2v_cbow),
    ("Word2Vec (Skip-gram)", X_train_w2v_sg, X_valid_w2v_sg),
]
for label, train_features, valid_features in feature_sets:
    train_and_evaluate(train_features, valid_features, y_train_encoded, y_valid_encoded, label)

TF-IDF - Accuracy: 0.7945318972659486, Macro F1-score: 0.49684338301538167
Count Vectorization - Accuracy: 0.8336555279388751, Macro F1-score: 0.6691282708433699
Word2Vec (CBOW) - Accuracy: 0.7678357728067753, Macro F1-score: 0.5080390162696593
Word2Vec (Skip-gram) - Accuracy: 0.7576176010310227, Macro F1-score: 0.4739103759751612


In [13]:
# Train and evaluate a logistic regression model.
def train_and_evaluate(X_train, X_valid, y_train, y_valid, method_name, *, max_iter=500):
    """Fit a logistic regression and print validation accuracy and macro F1.

    Args:
        X_train: Training features passed to ``model.fit``.
        X_valid: Validation features passed to ``model.predict``.
        y_train: Training labels.
        y_valid: Validation labels the predictions are scored against.
        method_name: Label printed in front of the scores.
        max_iter: Iteration cap forwarded to ``LogisticRegression``. The
            default keeps the value this cell originally hard-coded.

    Returns:
        None. The scores are only printed.
    """
    model = LogisticRegression(max_iter=max_iter, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    y_valid_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    macro_f1 = f1_score(y_valid, y_valid_pred, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")
    # Uncomment for a per-class report:
    # print(classification_report(y_valid, y_valid_pred, target_names=label_encoder.classes_))

# Score each feature representation in turn: TF-IDF, count vectors, then the
# two Word2Vec variants.
feature_sets = (
    ("TF-IDF", X_train_tfidf, X_valid_tfidf),
    ("Count Vectorization", X_train_count, X_valid_count),
    ("Word2Vec (CBOW)", X_train_w2v_cbow, X_valid_w2v_cbow),
    ("Word2Vec (Skip-gram)", X_train_w2v_sg, X_valid_w2v_sg),
)
for label, train_features, valid_features in feature_sets:
    train_and_evaluate(train_features, valid_features, y_train_encoded, y_valid_encoded, label)

TF-IDF - Accuracy: 0.7945318972659486, Macro F1-score: 0.49684338301538167
Count Vectorization - Accuracy: 0.8336555279388751, Macro F1-score: 0.6689017845209749
Word2Vec (CBOW) - Accuracy: 0.7675596060020252, Macro F1-score: 0.5056452778191008
Word2Vec (Skip-gram) - Accuracy: 0.7573414342262726, Macro F1-score: 0.47352432671650263


## 2. 랜덤포레스트

n_estimators=100이 적절함 (300으로 늘려도 Accuracy와 Macro F1 차이가 미미함)

In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec

RANDOM_STATE = 110

# Load the training and prediction datasets.
df_train = pd.read_csv("../../data/train_df_1012.csv")
df_test = pd.read_csv("../../data/test_df_1012.csv")

# Separate the text from the labels.
X = df_train['키워드']  # keyword column
y = df_train['분류']  # category column

# Split the data, stratifying on the label so both parts keep the same class ratio.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# Label encoding: fit on the training labels, then reuse the mapping for validation.
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# TF-IDF vectorization: vocabulary and weights come from the training texts only.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# Count vectorization: vocabulary comes from the training texts only.
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_valid_count = count_vectorizer.transform(X_valid)


def _mean_word_vectors(texts, keyed_vectors):
    """Return one averaged word vector per text as a 2-D array.

    Each text is split on whitespace and the vectors of the tokens found in
    ``keyed_vectors`` are averaged. A text with no known token gets a zero
    vector whose length is taken from ``keyed_vectors.vector_size``, so the
    fallback can never drift from the dimension the model was trained with.
    """
    dim = keyed_vectors.vector_size
    rows = []
    for text in texts:
        vectors = [keyed_vectors[word] for word in text.split() if word in keyed_vectors]
        rows.append(np.mean(vectors, axis=0) if vectors else np.zeros(dim))
    return np.array(rows)


# Tokenize the training texts once; both Word2Vec models train on the same sentences.
train_sentences = [text.split() for text in X_train]

# NOTE(review): no seed is passed to Word2Vec and workers=4, so the embeddings
# presumably differ between runs - confirm against the gensim documentation.

# Word2Vec vectorization (CBOW, sg=0)
w2v_model_cbow = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4, sg=0)
X_train_w2v_cbow = _mean_word_vectors(X_train, w2v_model_cbow.wv)
X_valid_w2v_cbow = _mean_word_vectors(X_valid, w2v_model_cbow.wv)

# Word2Vec vectorization (Skip-gram, sg=1)
w2v_model_sg = Word2Vec(sentences=train_sentences, vector_size=100, window=5, min_count=1, workers=4, sg=1)
X_train_w2v_sg = _mean_word_vectors(X_train, w2v_model_sg.wv)
X_valid_w2v_sg = _mean_word_vectors(X_valid, w2v_model_sg.wv)

# Train and evaluate a random forest model.
def train_and_evaluate(X_train, X_valid, y_train, y_valid, method_name, *, n_estimators=100):
    """Fit a random forest and print validation accuracy and macro F1.

    Args:
        X_train: Training features passed to ``model.fit``.
        X_valid: Validation features passed to ``model.predict``.
        y_train: Training labels.
        y_valid: Validation labels the predictions are scored against.
        method_name: Label printed in front of the scores.
        n_estimators: Number of trees forwarded to ``RandomForestClassifier``.
            The default keeps the value this cell originally hard-coded.

    Returns:
        None. The scores are only printed.
    """
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    y_valid_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    macro_f1 = f1_score(y_valid, y_valid_pred, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")
    # Uncomment for a per-class report:
    # print(classification_report(y_valid, y_valid_pred, target_names=label_encoder.classes_))

# Score each feature representation in turn: TF-IDF, count vectors, then the
# two Word2Vec variants.
feature_sets = [
    ("TF-IDF", X_train_tfidf, X_valid_tfidf),
    ("Count Vectorization", X_train_count, X_valid_count),
    ("Word2Vec (CBOW)", X_train_w2v_cbow, X_valid_w2v_cbow),
    ("Word2Vec (Skip-gram)", X_train_w2v_sg, X_valid_w2v_sg),
]
for label, train_features, valid_features in feature_sets:
    train_and_evaluate(train_features, valid_features, y_train_encoded, y_valid_encoded, label)

TF-IDF - Accuracy: 0.7742796649176102, Macro F1-score: 0.5545635255259403
Count Vectorization - Accuracy: 0.7738193869096934, Macro F1-score: 0.558167010164971
Word2Vec (CBOW) - Accuracy: 0.7960968424928657, Macro F1-score: 0.5815340238828816
Word2Vec (Skip-gram) - Accuracy: 0.7953603976801988, Macro F1-score: 0.6030652481849351


In [9]:
# Train and evaluate a random forest model.
def train_and_evaluate(X_train, X_valid, y_train, y_valid, method_name, *, n_estimators=300):
    """Fit a random forest and print validation accuracy and macro F1.

    Args:
        X_train: Training features passed to ``model.fit``.
        X_valid: Validation features passed to ``model.predict``.
        y_train: Training labels.
        y_valid: Validation labels the predictions are scored against.
        method_name: Label printed in front of the scores.
        n_estimators: Number of trees forwarded to ``RandomForestClassifier``.
            The default keeps the value this cell originally hard-coded.

    Returns:
        None. The scores are only printed.
    """
    model = RandomForestClassifier(n_estimators=n_estimators, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    y_valid_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    macro_f1 = f1_score(y_valid, y_valid_pred, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")
    # Uncomment for a per-class report:
    # print(classification_report(y_valid, y_valid_pred, target_names=label_encoder.classes_))

# Score each feature representation in turn: TF-IDF, count vectors, then the
# two Word2Vec variants.
feature_sets = (
    ("TF-IDF", X_train_tfidf, X_valid_tfidf),
    ("Count Vectorization", X_train_count, X_valid_count),
    ("Word2Vec (CBOW)", X_train_w2v_cbow, X_valid_w2v_cbow),
    ("Word2Vec (Skip-gram)", X_train_w2v_sg, X_valid_w2v_sg),
)
for label, train_features, valid_features in feature_sets:
    train_and_evaluate(train_features, valid_features, y_train_encoded, y_valid_encoded, label)

TF-IDF - Accuracy: 0.7732670533001933, Macro F1-score: 0.5532150684563435
Count Vectorization - Accuracy: 0.774463776120777, Macro F1-score: 0.5597891541418811
Word2Vec (CBOW) - Accuracy: 0.7977538433213661, Macro F1-score: 0.5794650584606268
Word2Vec (Skip-gram) - Accuracy: 0.7971094541102826, Macro F1-score: 0.604222727512024


## cat, 나이브 베이즈는 성능이 별로였음

.