## 머신러닝 모델 이용

## 라이브러리 로드

In [1]:
# 데이터 분석을 위한 pandas, 수치계산을 위한 numpy, 시각화를 위한 seaborn, matplotlib 을 로드합니다.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [1]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec
from collections import Counter

RANDOM_STATE = 110

# Suppress warning messages.
warnings.filterwarnings('ignore')

# Load the training and prediction datasets.
df_train = pd.read_csv("C:/Users/KimDongyoung/Desktop/GBT해커톤/GBT_hackerton/data/train_df_1012.csv")
df_test = pd.read_csv("C:/Users/KimDongyoung/Desktop/GBT해커톤/GBT_hackerton/data/test_df_1012.csv")
df_train.shape, df_test.shape

# Separate the text from the labels.
X = df_train['키워드']  # keyword column
y = df_train['분류']  # category column

# Hold out 20% for validation, keeping the class ratio identical in both splits.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y)

# Label encoding (fitted on the training split only).
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_valid_encoded = label_encoder.transform(y_valid)

# TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_valid_tfidf = tfidf_vectorizer.transform(X_valid)

# Count vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_valid_count = count_vectorizer.transform(X_valid)


def _mean_word_vectors(texts, w2v_model):
    """Embed each text as the mean of its in-vocabulary Word2Vec vectors.

    Args:
        texts: Iterable of strings; each is tokenized with ``str.split()``.
        w2v_model: Trained gensim ``Word2Vec`` model.

    Returns:
        Array with one row per text and ``w2v_model.wv.vector_size`` columns.
        A text with no in-vocabulary token gets an all-zero row.
    """
    keyed_vectors = w2v_model.wv
    # The fallback must have the model's own dimensionality. A hard-coded size
    # that differs from vector_size makes the rows ragged as soon as a single
    # text contains no known token.
    fallback = np.zeros(keyed_vectors.vector_size)
    return np.array([
        np.mean(
            [keyed_vectors[token] for token in text.split() if token in keyed_vectors]
            or [fallback],
            axis=0,
        )
        for text in texts
    ])


# Both Word2Vec models are trained on the same whitespace-tokenized training texts.
train_sentences = [text.split() for text in X_train]

# Word2Vec vectorization (CBOW, sg=0)
w2v_model_cbow = Word2Vec(sentences=train_sentences, vector_size=200, window=10, min_count=2, workers=4, sg=0)
X_train_w2v_cbow = _mean_word_vectors(X_train, w2v_model_cbow)
X_valid_w2v_cbow = _mean_word_vectors(X_valid, w2v_model_cbow)

# Word2Vec vectorization (Skip-gram, sg=1)
w2v_model_sg = Word2Vec(sentences=train_sentences, vector_size=200, window=10, min_count=2, workers=4, sg=1)
X_train_w2v_sg = _mean_word_vectors(X_train, w2v_model_sg)
X_valid_w2v_sg = _mean_word_vectors(X_valid, w2v_model_sg)

# Model definitions: one classifier per feature representation.
model_count = LogisticRegression(random_state=RANDOM_STATE)
model_w2v_cbow = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)
model_w2v_sg = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)

# Model training
model_count.fit(X_train_count, y_train_encoded)
model_w2v_cbow.fit(X_train_w2v_cbow, y_train_encoded)
model_w2v_sg.fit(X_train_w2v_sg, y_train_encoded)

# 하드보팅
def hard_voting(models, X_valid_list, y_valid, method_name):
    """Evaluate a majority-vote ensemble and print its accuracy and macro F1.

    Args:
        models: Fitted classifiers exposing ``predict``.
        X_valid_list: Feature matrices, paired with ``models`` by position.
        y_valid: True labels for the validation samples.
        method_name: Label used as the prefix of the printed line.

    Returns:
        None. The scores are printed only.
    """
    per_model_preds = [clf.predict(features) for clf, features in zip(models, X_valid_list)]
    n_models = len(models)

    final_predictions = []
    # Transposing yields one row per sample holding every model's vote.
    for sample_votes in np.array(per_model_preds).T:
        tally = Counter(sample_votes)
        if len(tally) != n_models:
            # At least two models agree: take the most common label.
            chosen = tally.most_common(1)[0][0]
        else:
            # Every model voted differently: fall back to the first model.
            chosen = sample_votes[0]
        final_predictions.append(chosen)

    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# 소프트보팅
def soft_voting(models, X_valid_list, y_valid, method_name):
    """Evaluate a probability-averaging ensemble and print accuracy and macro F1.

    Args:
        models: Fitted classifiers exposing ``predict_proba``.
        X_valid_list: Feature matrices, paired with ``models`` by position.
        y_valid: True labels for the validation samples.
        method_name: Label used as the prefix of the printed line.

    Returns:
        None. The scores are printed only.
    """
    stacked_probas = np.array(
        [clf.predict_proba(features) for clf, features in zip(models, X_valid_list)]
    )
    # Average over the model axis, then pick the highest-probability column.
    # The column index is used directly as the predicted label.
    mean_probas = stacked_probas.mean(axis=0)
    final_predictions = mean_probas.argmax(axis=1)

    accuracy = accuracy_score(y_valid, final_predictions)
    macro_f1 = f1_score(y_valid, final_predictions, average='macro')
    print(f"{method_name} - Accuracy: {accuracy}, Macro F1-score: {macro_f1}")

# Ensemble members and their validation features, paired position by position.
# Both lists must have the same length: zip() inside the voting helpers silently
# drops any unpaired trailing entry, so an extra feature matrix is never used.
# NOTE(review): model_w2v_sg is trained above but is not part of this ensemble.
# To let it vote, append model_w2v_sg here and X_valid_w2v_sg to X_valid_list together.
models = [model_count, model_w2v_cbow]
X_valid_list = [X_valid_count, X_valid_w2v_cbow]

# Hard voting. With exactly two models, every disagreement is resolved in favour
# of models[0], so this score equals that of model_count alone.
hard_voting(models, X_valid_list, y_valid_encoded, "Hard Voting")

# Soft voting
soft_voting(models, X_valid_list, y_valid_encoded, "Soft Voting")

Hard Voting - Accuracy: 0.8339316947436252, Macro F1-score: 0.6719817800644726
Soft Voting - Accuracy: 0.8387185860259597, Macro F1-score: 0.6734375287778674


## 모든 데이터로 학습하여 예측

In [2]:
import pandas as pd
import numpy as np
import warnings
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from gensim.models import Word2Vec

RANDOM_STATE = 110

# Suppress warning messages.
warnings.filterwarnings('ignore')

# Load the training and prediction datasets plus the submission template.
df_train = pd.read_csv("C:/Users/KimDongyoung/Desktop/GBT해커톤/GBT_hackerton/data/train_df_1012.csv")
df_test = pd.read_csv("C:/Users/KimDongyoung/Desktop/GBT해커톤/GBT_hackerton/data/test_df_1012.csv")
sample_submission = pd.read_csv("C:/Users/KimDongyoung/Desktop/GBT해커톤/GBT_hackerton/data/sample_submission.csv")
df_train.shape, df_test.shape

# Separate the text from the labels; the whole training set is used this time.
X_train = df_train['키워드']  # keyword column
y_train = df_train['분류']  # category column
X_test = df_test['키워드']

# Label encoding
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)

# Count vectorization
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(X_train)
X_test_count = count_vectorizer.transform(X_test)


def _mean_word_vectors(texts, w2v_model):
    """Embed each text as the mean of its in-vocabulary Word2Vec vectors.

    Args:
        texts: Iterable of strings; each is tokenized with ``str.split()``.
        w2v_model: Trained gensim ``Word2Vec`` model.

    Returns:
        Array with one row per text and ``w2v_model.wv.vector_size`` columns.
        A text with no in-vocabulary token gets an all-zero row.
    """
    keyed_vectors = w2v_model.wv
    # The fallback must have the model's own dimensionality. A hard-coded size
    # that differs from vector_size makes the rows ragged as soon as a single
    # text contains no known token.
    fallback = np.zeros(keyed_vectors.vector_size)
    return np.array([
        np.mean(
            [keyed_vectors[token] for token in text.split() if token in keyed_vectors]
            or [fallback],
            axis=0,
        )
        for text in texts
    ])


# Word2Vec vectorization (CBOW, sg=0), trained on whitespace-tokenized training texts.
w2v_model_cbow = Word2Vec(sentences=[text.split() for text in X_train], vector_size=200, window=10, min_count=2, workers=4, sg=0)
X_train_w2v_cbow = _mean_word_vectors(X_train, w2v_model_cbow)
X_test_w2v_cbow = _mean_word_vectors(X_test, w2v_model_cbow)

# Model definitions. LogisticRegression has no n_estimators parameter; everything
# except random_state is left at its default.
model_count = LogisticRegression(random_state=RANDOM_STATE)
model_w2v_cbow = RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE)

# Model training
model_count.fit(X_train_count, y_train_encoded)
model_w2v_cbow.fit(X_train_w2v_cbow, y_train_encoded)

# 소프트보팅
def soft_voting(models, X_test_list, method_name):
    """Predict class indices by averaging the models' class probabilities.

    Args:
        models: Fitted classifiers exposing ``predict_proba``.
        X_test_list: Feature matrices, paired with ``models`` by position.
        method_name: Accepted for signature compatibility; not used here.

    Returns:
        1-D array holding, per sample, the column index with the highest
        mean probability across the models.
    """
    stacked_probas = np.array(
        [clf.predict_proba(features) for clf, features in zip(models, X_test_list)]
    )
    # Collapse the model axis first, then choose the best column for each sample.
    mean_probas = stacked_probas.mean(axis=0)
    return mean_probas.argmax(axis=1)

# Ensemble members and their test-set features, paired position by position.
models = [model_count, model_w2v_cbow]
X_test_list = [X_test_count, X_test_w2v_cbow]

# Soft-voting prediction (encoded class indices).
soft_voting_predictions = soft_voting(models, X_test_list, "Soft Voting")

# Decode the indices back to the original category names and pair them with the test IDs.
predicted_labels = label_encoder.inverse_transform(soft_voting_predictions)
soft_voting_results = pd.DataFrame({'ID': df_test['ID'], '분류': predicted_labels})

# Copy the predictions into the submission template.
# NOTE(review): this assignment aligns on the row index, not on ID. It assumes
# sample_submission lists rows in the same order as df_test — confirm.
sample_submission['분류'] = soft_voting_results['분류']

# Save the result as a CSV file.
sample_submission.to_csv("soft_voting_logNrf_DY.csv", encoding='UTF-8-sig', index=False)

# Show the class distribution of the saved predictions.
print("CSV 파일로 저장된 결과:")
print(sample_submission['분류'].value_counts())

CSV 파일로 저장된 결과:
분류
지역               12250
경제:부동산            1464
사회:사건_사고          1152
경제:반도체             993
사회:사회일반            480
사회:교육_시험           438
사회:의료_건강           421
정치:국회_정당           401
스포츠:올림픽_아시안게임      366
경제:취업_창업           344
경제:자동차             280
스포츠:골프             272
경제:산업_기업           268
문화:전시_공연           266
정치:선거              266
경제:유통              243
IT_과학:모바일          234
사회:장애인             230
경제:경제일반            202
사회:여성              191
사회:노동_복지           179
경제:서비스_쇼핑          177
경제:무역              160
경제:금융_재테크          142
문화:방송_연예           136
사회:환경              134
정치:행정_자치           130
스포츠:축구             129
국제                 119
정치:청와대             115
문화:출판               99
IT_과학:과학            91
문화:미술_건축            88
IT_과학:IT_과학일반       82
문화:학술_문화재           76
IT_과학:인터넷_SNS       76
문화:요리_여행            75
문화:문화일반             71
경제:자원               70
정치:정치일반             64
문화:종교               53
사회:날씨               52
IT_과학:콘텐츠      

.