In [1]:
from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

# 벡터화 함수
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline

# 머신러닝 모델들
from sklearn.naive_bayes import MultinomialNB #다항분포 나이브 베이즈 모델
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC

# 모델 검증
from sklearn.metrics import accuracy_score #정확도 계산
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
import tensorflow as tf

## 2. 데이터 불러오기

In [2]:
num_words = 20000
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=num_words, test_split=0.2)
print('훈련 샘플의 수: {}'.format(len(x_train)))
print('테스트 샘플의 수: {}'.format(len(x_test)))

훈련 샘플의 수: 8982
테스트 샘플의 수: 2246


In [3]:
# 단어 복원
word_index = reuters.get_word_index()
reverse_word_index = {v: k for k, v in word_index.items()}

In [4]:
def decode_review(text):
    return " ".join([reverse_word_index.get(i - 3, "?") for i in text])  # '-3'은 예약된 인덱스

In [5]:
x_train_texts = [decode_review(x) for x in x_train]
x_test_texts = [decode_review(x) for x in x_test]

In [6]:
# TF-IDF 변환
vectorizer = TfidfVectorizer(max_features=20000)
X_train_tfidf = vectorizer.fit_transform(x_train_texts)
X_test_tfidf = vectorizer.transform(x_test_texts)

In [7]:
# LSA 적용
svd = TruncatedSVD(n_components=100)  # 100차원으로 축소
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

In [8]:
X_train_lsa = lsa.fit_transform(X_train_tfidf)
X_test_lsa = lsa.transform(X_test_tfidf)

## 4. 머신러닝 모델들 성능 비교하기

### 4.1 MultinomialNB 모델
- MultinomialNB(다항 분포 나이브 베이즈)는 양수의 정수형 카운트 데이터(예: 단어 등장 횟수)를 기대하지만,
- LSA(TruncatedSVD)를 적용하면 음수가 포함된 실수값이 나오기 때문에 MultinomialNB가 이를 처리할 수 없음.

In [10]:
# model = MultinomialNB()
# model.fit(X_train_lsa, y_train)

# predicted = model.predict(X_test_lsa) #테스트 데이터에 대한 예측
# print("정확도:", accuracy_score(y_test, predicted))
# print("F1-score:", f1_score(y_test, predicted, average='weighted'))
# print(classification_report(y_test, predicted)) #예측값과 실제값 비교

### 4.2 ComplementNB 모델

In [12]:
# cb = ComplementNB()
# cb.fit(X_train_lsa, y_train)

# predicted = cb.predict(X_test_lsa) #테스트 데이터에 대한 예측
# print("정확도:", accuracy_score(y_test, predicted))
# print("F1-score:", f1_score(y_test, predicted, average='weighted'))

### 4.3 Logistic Regression 모델

In [13]:
lr = LogisticRegression(C=10000, penalty='l2')
lr.fit(X_train_lsa, y_train)

predicted = lr.predict(X_test_lsa) #테스트 데이터에 대한 예측
print("정확도:", accuracy_score(y_test, predicted))
print("F1-score:", f1_score(y_test, predicted, average='weighted'))

정확도: 0.8054318788958148
F1-score: 0.7989859476360002


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### 4.4 선형 서포트 벡터 머신

In [14]:
lsvc = LinearSVC(C=1000, penalty='l1', max_iter=500, dual=False)
lsvc.fit(X_train_lsa, y_train)

predicted = lsvc.predict(X_test_lsa) #테스트 데이터에 대한 예측
print("정확도:", accuracy_score(y_test, predicted))
print("F1-score:", f1_score(y_test, predicted, average='weighted'))

정확도: 0.7920747996438112
F1-score: 0.7854158503425255




### 4.5 의사결정나무

In [15]:
tree = DecisionTreeClassifier(max_depth=10, random_state=42)
tree.fit(X_train_lsa, y_train)

predicted = tree.predict(X_test_lsa) #테스트 데이터에 대한 예측
print("정확도:", accuracy_score(y_test, predicted))
print("F1-score:", f1_score(y_test, predicted, average='weighted'))

정확도: 0.6825467497773821
F1-score: 0.6518254536498507


### 4.6 랜덤포레스트

In [16]:
forest = RandomForestClassifier(n_estimators=5, random_state=42)
forest.fit(X_train_lsa, y_train)

predicted = forest.predict(X_test_lsa) #테스트 데이터에 대한 예측
print("정확도:", accuracy_score(y_test, predicted))
print("F1-score:", f1_score(y_test, predicted, average='weighted'))

정확도: 0.7235084594835263
F1-score: 0.7042786648375011


### 4.7 그래디언트 부스팅

In [17]:
grbt = GradientBoostingClassifier(random_state=42, verbose=0) # 
grbt.fit(X_train_lsa, y_train)

predicted = grbt.predict(X_test_lsa) #테스트 데이터에 대한 예측
print("정확도:", accuracy_score(y_test, predicted))
print("F1-score:", f1_score(y_test, predicted, average='weighted'))

정확도: 0.7257346393588602
F1-score: 0.7145644257630139


In [None]:
LR	20,000	100	0.8054	0.7989
LSVC			0.7920	0.7854
DT			0.6825	0.6518
RF			0.7235	0.7043
GBT			0.7257	0.7145