## 머신러닝 모델 이용

## 라이브러리 로드

In [1]:
# 데이터 분석을 위한 pandas, 수치계산을 위한 numpy, 시각화를 위한 seaborn, matplotlib 을 로드합니다.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

## 데이터 로드

In [3]:
RANDOM_STATE = 110

# 학습, 예측 데이터셋을 불러옵니다.
df_train = pd.read_csv("C:/Users/KimDongyoung/Desktop/GBT해커톤/GBT_hackerton/data/train_df_1012.csv")
df_test = pd.read_csv("C:/Users/KimDongyoung/Desktop/GBT해커톤/GBT_hackerton/data/test_df_1012.csv")
df_train.shape, df_test.shape

((54314, 4), (23405, 3))

In [4]:
df_train['분류'].value_counts().head(5)

분류
지역          26850
경제:부동산       3447
사회:사건_사고     2545
경제:반도체       2309
사회:사회일반      1457
Name: count, dtype: int64

## 1. 로지스틱

max_iter = 200 결과

In [8]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec

RANDOM_STATE = 110

# 경고 메시지 억제
warnings.filterwarnings('ignore')

# 학습, 예측 데이터셋을 불러옵니다.
df_train = pd.read_csv("C:/Users/KimDongyoung/Desktop/GBT해커톤/GBT_hackerton/data/train_df_1012.csv")
df_test = pd.read_csv("C:/Users/KimDongyoung/Desktop/GBT해커톤/GBT_hackerton/data/test_df_1012.csv")
df_train.shape, df_test.shape

# 텍스트와 레이블 분리
X = df_train['키워드']  # 키워드 컬럼
y = df_train['분류']  # 카테고리 컬럼

# 데이터 분할 (클래스 비율 동일하게 유지)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# 레이블 인코딩
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# Count Vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_valid_count = count_vectorizer.transform(X_valid)

# Word2Vec 벡터화 (CBOW)
w2v_model_cbow = Word2Vec(sentences=[text.split() for text in X_train], vector_size=256, window=10, min_count=2, workers=4, sg=0)
X_train_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# Word2Vec 벡터화 (Skip-gram)
w2v_model_sg = Word2Vec(sentences=[text.split() for text in X_train], vector_size=256, window=10, min_count=2, workers=4, sg=1)
X_train_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# 로지스틱 회귀 모델 학습 및 평가 함수
def train_and_evaluate(X_train, X_valid, y_train, y_valid, method_name):
    model = LogisticRegression(max_iter=200, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    y_valid_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    macro_f1 = f1_score(y_valid, y_valid_pred, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")
    # print(classification_report(y_valid, y_valid_pred, target_names=label_encoder.classes_))

# TF-IDF
train_and_evaluate(X_train_tfidf, X_valid_tfidf, y_train_encoded, y_valid_encoded, "TF-IDF")

# Count Vectorization
train_and_evaluate(X_train_count, X_valid_count, y_train_encoded, y_valid_encoded, "Count Vectorization")

# Word2Vec (CBOW)
train_and_evaluate(X_train_w2v_cbow, X_valid_w2v_cbow, y_train_encoded, y_valid_encoded, "Word2Vec (CBOW)")

# Word2Vec (Skip-gram)
train_and_evaluate(X_train_w2v_sg, X_valid_w2v_sg, y_train_encoded, y_valid_encoded, "Word2Vec (Skip-gram)")

TF-IDF - Accuracy: 0.7945318972659486, Macro F1-score: 0.49684338301538167
Count Vectorization - Accuracy: 0.8345760839547086, Macro F1-score: 0.6715446278197489
Word2Vec (CBOW) - Accuracy: 0.7816441130442787, Macro F1-score: 0.539596364023002
Word2Vec (Skip-gram) - Accuracy: 0.7753843321366105, Macro F1-score: 0.5078328918048641


In [5]:
# 로지스틱 회귀 모델 학습 및 평가 함수
def train_and_evaluate(X_train, X_valid, y_train, y_valid, method_name):
    model = LogisticRegression(max_iter=100, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    y_valid_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    macro_f1 = f1_score(y_valid, y_valid_pred, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")
    # print(classification_report(y_valid, y_valid_pred, target_names=label_encoder.classes_))

# TF-IDF
train_and_evaluate(X_train_tfidf, X_valid_tfidf, y_train_encoded, y_valid_encoded, "TF-IDF")

# Count Vectorization
train_and_evaluate(X_train_count, X_valid_count, y_train_encoded, y_valid_encoded, "Count Vectorization")

# Word2Vec (CBOW)
train_and_evaluate(X_train_w2v_cbow, X_valid_w2v_cbow, y_train_encoded, y_valid_encoded, "Word2Vec (CBOW)")

# Word2Vec (Skip-gram)
train_and_evaluate(X_train_w2v_sg, X_valid_w2v_sg, y_train_encoded, y_valid_encoded, "Word2Vec (Skip-gram)")

TF-IDF - Accuracy: 0.7949921752738655, Macro F1-score: 0.4995781881822617
Count Vectorization - Accuracy: 0.8337475835404584, Macro F1-score: 0.6715901126181764
Word2Vec (CBOW) - Accuracy: 0.7771333885666943, Macro F1-score: 0.5280927156285279
Word2Vec (Skip-gram) - Accuracy: 0.7779618889809445, Macro F1-score: 0.5159025245549202


## 2. 랜덤포레스트

n_estimators=100 이 적절함

In [9]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec

RANDOM_STATE = 110

# 학습, 예측 데이터셋을 불러옵니다.
df_train = pd.read_csv("C:/Users/KimDongyoung/Desktop/GBT해커톤/GBT_hackerton/data/train_df_1012.csv")
df_test = pd.read_csv("C:/Users/KimDongyoung/Desktop/GBT해커톤/GBT_hackerton/data/test_df_1012.csv")
df_train.shape, df_test.shape

# 텍스트와 레이블 분리
X = df_train['키워드']  # 키워드 컬럼
y = df_train['분류']  # 카테고리 컬럼

# 데이터 분할 (클래스 비율 동일하게 유지)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# 레이블 인코딩
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# Count Vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_valid_count = count_vectorizer.transform(X_valid)

# Word2Vec 벡터화 (CBOW)
w2v_model_cbow = Word2Vec(sentences=[text.split() for text in X_train], vector_size=256, window=10, min_count=2, workers=4, sg=0)
X_train_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# Word2Vec 벡터화 (Skip-gram)
w2v_model_sg = Word2Vec(sentences=[text.split() for text in X_train], vector_size=256, window=10, min_count=2, workers=4, sg=1)
X_train_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# 랜덤 포레스트 모델 학습 및 평가 함수
def train_and_evaluate(X_train, X_valid, y_train, y_valid, method_name):
    model = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    y_valid_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    macro_f1 = f1_score(y_valid, y_valid_pred, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")
    # print(classification_report(y_valid, y_valid_pred, target_names=label_encoder.classes_))

# TF-IDF
train_and_evaluate(X_train_tfidf, X_valid_tfidf, y_train_encoded, y_valid_encoded, "TF-IDF")

# Count Vectorization
train_and_evaluate(X_train_count, X_valid_count, y_train_encoded, y_valid_encoded, "Count Vectorization")

# Word2Vec (CBOW)
train_and_evaluate(X_train_w2v_cbow, X_valid_w2v_cbow, y_train_encoded, y_valid_encoded, "Word2Vec (CBOW)")

# Word2Vec (Skip-gram)
train_and_evaluate(X_train_w2v_sg, X_valid_w2v_sg, y_train_encoded, y_valid_encoded, "Word2Vec (Skip-gram)")

TF-IDF - Accuracy: 0.7741876093160269, Macro F1-score: 0.5559152975472867
Count Vectorization - Accuracy: 0.7740034981128602, Macro F1-score: 0.5602129538670343
Word2Vec (CBOW) - Accuracy: 0.8024486790021172, Macro F1-score: 0.5950446513823952
Word2Vec (Skip-gram) - Accuracy: 0.7979379545245329, Macro F1-score: 0.6043294228218642


In [7]:
# 랜덤 포레스트 모델 학습 및 평가 함수
def train_and_evaluate(X_train, X_valid, y_train, y_valid, method_name):
    model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
    model.fit(X_train, y_train)
    y_valid_pred = model.predict(X_valid)
    accuracy = accuracy_score(y_valid, y_valid_pred)
    macro_f1 = f1_score(y_valid, y_valid_pred, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")
    # print(classification_report(y_valid, y_valid_pred, target_names=label_encoder.classes_))

# TF-IDF
train_and_evaluate(X_train_tfidf, X_valid_tfidf, y_train_encoded, y_valid_encoded, "TF-IDF")

# Count Vectorization
train_and_evaluate(X_train_count, X_valid_count, y_train_encoded, y_valid_encoded, "Count Vectorization")

# Word2Vec (CBOW)
train_and_evaluate(X_train_w2v_cbow, X_valid_w2v_cbow, y_train_encoded, y_valid_encoded, "Word2Vec (CBOW)")

# Word2Vec (Skip-gram)
train_and_evaluate(X_train_w2v_sg, X_valid_w2v_sg, y_train_encoded, y_valid_encoded, "Word2Vec (Skip-gram)")

TF-IDF - Accuracy: 0.7742796649176102, Macro F1-score: 0.5545635255259403
Count Vectorization - Accuracy: 0.7738193869096934, Macro F1-score: 0.558167010164971
Word2Vec (CBOW) - Accuracy: 0.7999631777593667, Macro F1-score: 0.5948373426587558
Word2Vec (Skip-gram) - Accuracy: 0.7960968424928657, Macro F1-score: 0.6043990408673594


## cat, 나이브 베이즈 성능 별로 였음

## 3. 앙상블

하드보팅, 소프트보팅, 스태킹

### 3-1. 로지스틱

In [2]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import Counter

RANDOM_STATE = 110

# 경고 메시지 억제
warnings.filterwarnings('ignore')

# 학습, 예측 데이터셋을 불러옵니다.
df_train = pd.read_csv("../../data/train_df_1012.csv")
df_test = pd.read_csv("../../data/test_df_1012.csv")
df_train.shape, df_test.shape

# 텍스트와 레이블 분리
X = df_train['키워드']  # 키워드 컬럼
y = df_train['분류']  # 카테고리 컬럼

# 데이터 분할 (클래스 비율 동일하게 유지)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# 레이블 인코딩
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# Count Vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_valid_count = count_vectorizer.transform(X_valid)

# Word2Vec 벡터화 (CBOW)
w2v_model_cbow = Word2Vec(sentences=[text.split() for text in X_train], vector_size=300, window=10, min_count=2, workers=4, sg=0)
X_train_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# Word2Vec 벡터화 (Skip-gram)
w2v_model_sg = Word2Vec(sentences=[text.split() for text in X_train], vector_size=300, window=10, min_count=2, workers=4, sg=1)
X_train_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# 로지스틱 회귀 모델 정의
model_tfidf = LogisticRegression(random_state=RANDOM_STATE)
model_count = LogisticRegression(random_state=RANDOM_STATE)
model_w2v_cbow = LogisticRegression(random_state=RANDOM_STATE)
model_w2v_sg = LogisticRegression(random_state=RANDOM_STATE)

# 모델 학습
model_tfidf.fit(X_train_tfidf, y_train_encoded)
model_count.fit(X_train_count, y_train_encoded)
model_w2v_cbow.fit(X_train_w2v_cbow, y_train_encoded)
model_w2v_sg.fit(X_train_w2v_sg, y_train_encoded)

# 하드보팅
def hard_voting(models, X_valid_list, y_valid, method_name):
    predictions = np.array([model.predict(X_valid) for model, X_valid in zip(models, X_valid_list)]).T
    final_predictions = []
    for preds in predictions:
        vote_counts = Counter(preds)
        if len(vote_counts) == len(models):  # 다수표가 없는 경우
            final_predictions.append(preds[1])  # Count Vectorization 모델의 값을 취함
        else:
            final_predictions.append(vote_counts.most_common(1)[0][0])
    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 소프트보팅
def soft_voting(models, X_valid_list, y_valid, method_name):
    predictions = np.array([model.predict_proba(X_valid) for model, X_valid in zip(models, X_valid_list)])
    avg_predictions = np.mean(predictions, axis=0)
    final_predictions = np.argmax(avg_predictions, axis=1)
    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 모델 리스트
models = [model_tfidf, model_count, model_w2v_cbow, model_w2v_sg]
X_valid_list = [X_valid_tfidf, X_valid_count, X_valid_w2v_cbow, X_valid_w2v_sg]
X_train_list = [X_train_tfidf, X_train_count, X_train_w2v_cbow, X_train_w2v_sg]

# 하드보팅
hard_voting(models, X_valid_list, y_valid_encoded, "Hard Voting")

# 소프트보팅
soft_voting(models, X_valid_list, y_valid_encoded, "Soft Voting")

Hard Voting - Accuracy: 0.8086164043082021, Macro F1-score: 0.5455198213287042
Soft Voting - Accuracy: 0.8273036914296235, Macro F1-score: 0.6347485982842994


model_count 모델 2번 있다 생각하고 보팅

In [4]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import Counter

RANDOM_STATE = 110

# 경고 메시지 억제
warnings.filterwarnings('ignore')

# 학습, 예측 데이터셋을 불러옵니다.
df_train = pd.read_csv("../../data/train_df_1012.csv")
df_test = pd.read_csv("../../data/test_df_1012.csv")
df_train.shape, df_test.shape

# 텍스트와 레이블 분리
X = df_train['키워드']  # 키워드 컬럼
y = df_train['분류']  # 카테고리 컬럼

# 데이터 분할 (클래스 비율 동일하게 유지)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# 레이블 인코딩
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# Count Vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_valid_count = count_vectorizer.transform(X_valid)

# Word2Vec 벡터화 (CBOW)
w2v_model_cbow = Word2Vec(sentences=[text.split() for text in X_train], vector_size=300, window=10, min_count=2, workers=4, sg=0)
X_train_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# Word2Vec 벡터화 (Skip-gram)
w2v_model_sg = Word2Vec(sentences=[text.split() for text in X_train], vector_size=300, window=10, min_count=2, workers=4, sg=1)
X_train_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# 로지스틱 회귀 모델 정의
model_tfidf = LogisticRegression(random_state=RANDOM_STATE)
model_count = LogisticRegression(random_state=RANDOM_STATE)
model_w2v_cbow = LogisticRegression(random_state=RANDOM_STATE)
model_w2v_sg = LogisticRegression(random_state=RANDOM_STATE)

# 모델 학습
model_tfidf.fit(X_train_tfidf, y_train_encoded)
model_count.fit(X_train_count, y_train_encoded)
model_w2v_cbow.fit(X_train_w2v_cbow, y_train_encoded)
model_w2v_sg.fit(X_train_w2v_sg, y_train_encoded)

# 하드보팅
def hard_voting(models, X_valid_list, y_valid, method_name):
    predictions = np.array([model.predict(X_valid) for model, X_valid in zip(models, X_valid_list)]).T
    final_predictions = []
    for preds in predictions:
        vote_counts = Counter(preds)
        if len(vote_counts) == len(models):  # 다수표가 없는 경우
            final_predictions.append(preds[1])  # Count Vectorization 모델의 값을 취함
        else:
            final_predictions.append(vote_counts.most_common(1)[0][0])
    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 소프트보팅
def soft_voting(models, X_valid_list, y_valid, method_name):
    predictions = np.array([model.predict_proba(X_valid) for model, X_valid in zip(models, X_valid_list)])
    avg_predictions = np.mean(predictions, axis=0)
    final_predictions = np.argmax(avg_predictions, axis=1)
    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 모델 리스트 (model_count 모델을 두 번 포함)
models = [model_tfidf, model_count, model_count, model_w2v_cbow, model_w2v_sg]
X_valid_list = [X_valid_tfidf, X_valid_count, X_valid_count, X_valid_w2v_cbow, X_valid_w2v_sg]
X_train_list = [X_train_tfidf, X_train_count, X_train_count, X_train_w2v_cbow, X_train_w2v_sg]

# 하드보팅
hard_voting(models, X_valid_list, y_valid_encoded, "Hard Voting")

# 소프트보팅
soft_voting(models, X_valid_list, y_valid_encoded, "Soft Voting")

Hard Voting - Accuracy: 0.8196630764982049, Macro F1-score: 0.6010128529755605
Soft Voting - Accuracy: 0.8366013071895425, Macro F1-score: 0.6670766691395071


### 3-2. 랜포

In [3]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import Counter

RANDOM_STATE = 110

# 경고 메시지 억제
warnings.filterwarnings('ignore')

# 학습, 예측 데이터셋을 불러옵니다.
df_train = pd.read_csv("../../data/train_df_1012.csv")
df_test = pd.read_csv("../../data/test_df_1012.csv")
df_train.shape, df_test.shape

# 텍스트와 레이블 분리
X = df_train['키워드']  # 키워드 컬럼
y = df_train['분류']  # 카테고리 컬럼

# 데이터 분할 (클래스 비율 동일하게 유지)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# 레이블 인코딩
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# Count Vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_valid_count = count_vectorizer.transform(X_valid)

# Word2Vec 벡터화 (CBOW)
w2v_model_cbow = Word2Vec(sentences=[text.split() for text in X_train], vector_size=300, window=10, min_count=2, workers=4, sg=0)
X_train_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# Word2Vec 벡터화 (Skip-gram)
w2v_model_sg = Word2Vec(sentences=[text.split() for text in X_train], vector_size=300, window=10, min_count=2, workers=4, sg=1)
X_train_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# 랜덤 포레스트 모델 정의
model_tfidf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model_count = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model_w2v_cbow = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model_w2v_sg = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

# 모델 학습
model_tfidf.fit(X_train_tfidf, y_train_encoded)
model_count.fit(X_train_count, y_train_encoded)
model_w2v_cbow.fit(X_train_w2v_cbow, y_train_encoded)
model_w2v_sg.fit(X_train_w2v_sg, y_train_encoded)

# 하드보팅
def hard_voting(models, X_valid_list, y_valid, method_name):
    predictions = np.array([model.predict(X_valid) for model, X_valid in zip(models, X_valid_list)]).T
    final_predictions = []
    for preds in predictions:
        vote_counts = Counter(preds)
        if len(vote_counts) == len(models):  # 다수표가 없는 경우
            final_predictions.append(preds[3])  # Word2Vec (Skip-gram) 모델의 값을 취함
        else:
            final_predictions.append(vote_counts.most_common(1)[0][0])
    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 소프트보팅
def soft_voting(models, X_valid_list, y_valid, method_name):
    predictions = np.array([model.predict_proba(X_valid) for model, X_valid in zip(models, X_valid_list)])
    avg_predictions = np.mean(predictions, axis=0)
    final_predictions = np.argmax(avg_predictions, axis=1)
    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 모델 리스트
models = [model_tfidf, model_count, model_w2v_cbow, model_w2v_sg]
X_valid_list = [X_valid_tfidf, X_valid_count, X_valid_w2v_cbow, X_valid_w2v_sg]
X_train_list = [X_train_tfidf, X_train_count, X_train_w2v_cbow, X_train_w2v_sg]

# 하드보팅
hard_voting(models, X_valid_list, y_valid_encoded, "Hard Voting")

# 소프트보팅
soft_voting(models, X_valid_list, y_valid_encoded, "Soft Voting")

Hard Voting - Accuracy: 0.7817361686458622, Macro F1-score: 0.5680682228074191
Soft Voting - Accuracy: 0.7930590076406149, Macro F1-score: 0.5923956868842417


3-3. 성능 좋은 3개 모델 섞어서 사용

In [5]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import Counter

RANDOM_STATE = 110

# 경고 메시지 억제
warnings.filterwarnings('ignore')

# 학습, 예측 데이터셋을 불러옵니다.
df_train = pd.read_csv("../../data/train_df_1012.csv")
df_test = pd.read_csv("../../data/test_df_1012.csv")
df_train.shape, df_test.shape

# 텍스트와 레이블 분리
X = df_train['키워드']  # 키워드 컬럼
y = df_train['분류']  # 카테고리 컬럼

# 데이터 분할 (클래스 비율 동일하게 유지)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# 레이블 인코딩
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# Count Vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_valid_count = count_vectorizer.transform(X_valid)

# Word2Vec 벡터화 (CBOW)
w2v_model_cbow = Word2Vec(sentences=[text.split() for text in X_train], vector_size=300, window=10, min_count=2, workers=4, sg=0)
X_train_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# Word2Vec 벡터화 (Skip-gram)
w2v_model_sg = Word2Vec(sentences=[text.split() for text in X_train], vector_size=300, window=10, min_count=2, workers=4, sg=1)
X_train_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# 모델 정의
model_count = LogisticRegression(random_state=RANDOM_STATE)
model_w2v_cbow = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model_w2v_sg = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

# 모델 학습
model_count.fit(X_train_count, y_train_encoded)
model_w2v_cbow.fit(X_train_w2v_cbow, y_train_encoded)
model_w2v_sg.fit(X_train_w2v_sg, y_train_encoded)

# 하드보팅
def hard_voting(models, X_valid_list, y_valid, method_name):
    predictions = np.array([model.predict(X_valid) for model, X_valid in zip(models, X_valid_list)]).T
    final_predictions = []
    for preds in predictions:
        vote_counts = Counter(preds)
        if len(vote_counts) == len(models):  # 다수표가 없는 경우
            final_predictions.append(preds[0])  # Logistic Regression 모델의 값을 취함
        else:
            final_predictions.append(vote_counts.most_common(1)[0][0])
    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 소프트보팅
def soft_voting(models, X_valid_list, y_valid, method_name):
    predictions = np.array([model.predict_proba(X_valid) for model, X_valid in zip(models, X_valid_list)])
    avg_predictions = np.mean(predictions, axis=0)
    final_predictions = np.argmax(avg_predictions, axis=1)
    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 모델 리스트
models = [model_count, model_w2v_cbow, model_w2v_sg]
X_valid_list = [X_valid_count, X_valid_w2v_cbow, X_valid_w2v_sg]

# 하드보팅
hard_voting(models, X_valid_list, y_valid_encoded, "Hard Voting")

# 소프트보팅
soft_voting(models, X_valid_list, y_valid_encoded, "Soft Voting")

Hard Voting - Accuracy: 0.81349535119212, Macro F1-score: 0.6312578899765534
Soft Voting - Accuracy: 0.8390868084322931, Macro F1-score: 0.6710935194397225


In [6]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import Counter

RANDOM_STATE = 110

# 경고 메시지 억제
warnings.filterwarnings('ignore')

# 학습, 예측 데이터셋을 불러옵니다.
df_train = pd.read_csv("../../data/train_df_1012.csv")
df_test = pd.read_csv("../../data/test_df_1012.csv")
df_train.shape, df_test.shape

# 텍스트와 레이블 분리
X = df_train['키워드']  # 키워드 컬럼
y = df_train['분류']  # 카테고리 컬럼

# 데이터 분할 (클래스 비율 동일하게 유지)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# 레이블 인코딩
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# TF-IDF 벡터화
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# Count Vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_valid_count = count_vectorizer.transform(X_valid)

# Word2Vec 벡터화 (CBOW)
w2v_model_cbow = Word2Vec(sentences=[text.split() for text in X_train], vector_size=300, window=10, min_count=2, workers=4, sg=0)
X_train_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_cbow = np.array([np.mean([w2v_model_cbow.wv[word] for word in text.split() if word in w2v_model_cbow.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# Word2Vec 벡터화 (Skip-gram)
w2v_model_sg = Word2Vec(sentences=[text.split() for text in X_train], vector_size=300, window=10, min_count=2, workers=4, sg=1)
X_train_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_train])
X_valid_w2v_sg = np.array([np.mean([w2v_model_sg.wv[word] for word in text.split() if word in w2v_model_sg.wv] or [np.zeros(300)], axis=0) for text in X_valid])

# 모델 정의
model_count = LogisticRegression(random_state=RANDOM_STATE)
model_w2v_cbow = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model_w2v_sg = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

# 모델 학습
model_count.fit(X_train_count, y_train_encoded)
model_w2v_cbow.fit(X_train_w2v_cbow, y_train_encoded)
model_w2v_sg.fit(X_train_w2v_sg, y_train_encoded)

# 하드보팅
def hard_voting(models, X_valid_list, y_valid, method_name):
    predictions = np.array([model.predict(X_valid) for model, X_valid in zip(models, X_valid_list)]).T
    final_predictions = []
    for preds in predictions:
        vote_counts = Counter(preds)
        if len(vote_counts) == len(models):  # 다수표가 없는 경우
            final_predictions.append(preds[0])  # Logistic Regression 모델의 값을 취함
        else:
            final_predictions.append(vote_counts.most_common(1)[0][0])
    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 소프트보팅
def soft_voting(models, X_valid_list, y_valid, method_name):
    predictions = np.array([model.predict_proba(X_valid) for model, X_valid in zip(models, X_valid_list)])
    avg_predictions = np.mean(predictions, axis=0)
    final_predictions = np.argmax(avg_predictions, axis=1)
    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 모델 리스트
models = [model_count, model_w2v_cbow]
X_valid_list = [X_valid_count, X_valid_w2v_cbow, X_valid_w2v_sg]

# 하드보팅
hard_voting(models, X_valid_list, y_valid_encoded, "Hard Voting")

# 소프트보팅
soft_voting(models, X_valid_list, y_valid_encoded, "Soft Voting")

Hard Voting - Accuracy: 0.8337475835404584, Macro F1-score: 0.6715901126181764
Soft Voting - Accuracy: 0.8389026972291264, Macro F1-score: 0.6739752158257886


In [7]:
# 모델 리스트
models = [model_count, model_w2v_sg]
X_valid_list = [X_valid_count, X_valid_w2v_cbow, X_valid_w2v_sg]

# 하드보팅
hard_voting(models, X_valid_list, y_valid_encoded, "Hard Voting")

# 소프트보팅
soft_voting(models, X_valid_list, y_valid_encoded, "Soft Voting")

Hard Voting - Accuracy: 0.8337475835404584, Macro F1-score: 0.6715901126181764
Soft Voting - Accuracy: 0.8328270275246249, Macro F1-score: 0.6689213593541946


.