In [1]:
from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB #다항분포 나이브 베이즈 모델
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score #정확도 계산
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
# Build the id -> word lookup. Every id from the Reuters word index is
# shifted by 3 so that ids 0-2 stay free for the special tokens added below.
word_index = reuters.get_word_index(path="reuters_word_index.json")
index_to_word = {token_id + 3: word for word, token_id in word_index.items()}
# Ids 0, 1 and 2 map to <pad>, <sos> and <unk> respectively.
index_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))
# print(' '.join([index_to_word[index] for index in x_train[0]]))

In [4]:

def reuters_load_ml(num_words, mode = True):
    """Load Reuters, decode it back to text and vectorize it.

    Args:
        num_words: Forwarded to ``reuters.load_data`` as ``num_words``.
        mode: When truthy the count matrix (DTM) is returned; otherwise the
            counts are additionally converted to TF-IDF.

    Returns:
        ``(x_train, y_train, x_test, y_test)`` where the ``x_*`` entries are
        the vectorized feature matrices and the ``y_*`` entries are the
        labels exactly as returned by ``reuters.load_data``.
    """
    (train_ids, y_train), (test_ids, y_test) = reuters.load_data(num_words=num_words, test_split=0.2)

    # Rebuild each article as a single space-separated string of words using
    # the module-level ``index_to_word`` lookup.
    train_texts = [' '.join(index_to_word[token_id] for token_id in sequence)
                   for sequence in train_ids]
    test_texts = [' '.join(index_to_word[token_id] for token_id in sequence)
                  for sequence in test_ids]

    # The vectorizer is fitted on the training texts only; the test texts are
    # transformed with that fitted vectorizer.
    count_vectorizer = CountVectorizer()
    train_counts = count_vectorizer.fit_transform(train_texts)
    test_counts = count_vectorizer.transform(test_texts)

    if mode:
        return train_counts, y_train, test_counts, y_test

    # TF-IDF weights are likewise fitted on the training counts only.
    tfidf = TfidfTransformer()
    train_tfidf = tfidf.fit_transform(train_counts)
    test_tfidf = tfidf.transform(test_counts)
    return train_tfidf, y_train, test_tfidf, y_test

In [5]:
def fit_ml(x_train, y_train, x_test, y_test) :
    """Fit a fixed set of classifiers and print each one's test accuracy.

    Each estimator is fitted on ``(x_train, y_train)``, used to predict
    ``x_test``, and its ``accuracy_score`` against ``y_test`` is printed on
    its own line. Nothing is returned.

    Args:
        x_train: Training feature matrix.
        y_train: Training labels.
        x_test: Test feature matrix.
        y_test: Test labels.
    """
    # These three are kept in named variables because the soft-voting
    # ensemble at the end of the list is built from the same instances.
    cb = ComplementNB()
    lr = LogisticRegression(C=10000, penalty='l2', max_iter=3000)
    grbt = GradientBoostingClassifier(random_state=0)

    # (printed label, estimator) pairs, evaluated in this order.
    candidates = (
        ("NB 정확도:", MultinomialNB()),
        ("CNB 정확도:", cb),
        ("로지스틱회귀 정확도:", lr),
        ("SVC 정확도:", LinearSVC(C=1000, penalty='l1', max_iter=3000, dual=False)),
        ("tree 정확도:", DecisionTreeClassifier(max_depth=10, random_state=0)),
        ("RandomForest 정확도:", RandomForestClassifier(n_estimators=5, random_state=0)),
        ("GradientBoosting 정확도:", grbt),
        ("보팅 정확도:", VotingClassifier(estimators=[('lr', lr), ('cb', cb), ('gnb', grbt)], voting='soft')),
    )

    for label, estimator in candidates:
        estimator.fit(x_train, y_train)
        test_predictions = estimator.predict(x_test)
        # Compare the predictions with the true test labels.
        print(label, accuracy_score(y_test, test_predictions))

In [6]:
# Each row: (header to print, num_words for the loader, mode flag where
# True selects DTM features and False selects TF-IDF features).
experiments = [
    ("num_words=None, DTM을 활용한 정확도", None, True),
    ("num_words=None, TFIDF을 활용한 정확도", None, False),
    ("num_words=10000, DTM을 활용한 정확도", 10000, True),
    ("num_words=10000, TFIDF을 활용한 정확도", 10000, False),
    ("num_words=5000, DTM을 활용한 정확도", 5000, True),
    ("num_words=5000, TFIDF을 활용한 정확도", 5000, False),
]

for header, vocab_limit, use_dtm in experiments:
    print(header)
    x_train, y_train, x_test, y_test = reuters_load_ml(vocab_limit, use_dtm)
    fit_ml(x_train, y_train, x_test, y_test)

num_words=None, DTM을 활용한 정확도
NB 정확도: 0.7226179875333927
CNB 정확도: 0.7782724844167409


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


로지스틱회귀 정확도: 0.7867319679430098




SVC 정확도: 0.7520035618878005
tree 정확도: 0.6277827248441674
RandomForest 정확도: 0.655387355298308
GradientBoosting 정확도: 0.7711487088156723


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


보팅 정확도: 0.8116651825467498
num_words=None, TFIDF을 활용한 정확도
NB 정확도: 0.5997328584149599
CNB 정확도: 0.7649154051647373
로지스틱회귀 정확도: 0.8165627782724845




SVC 정확도: 0.7969723953695459
tree 정확도: 0.6211041852181657
RandomForest 정확도: 0.6544968833481746
GradientBoosting 정확도: 0.7702582368655387
보팅 정확도: 0.8156723063223509
num_words=10000, DTM을 활용한 정확도
NB 정확도: 0.7711487088156723
CNB 정확도: 0.7773820124666073


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


로지스틱회귀 정확도: 0.780053428317008




SVC 정확도: 0.7466607301869991
tree 정확도: 0.6273374888691006
RandomForest 정확도: 0.6709706144256455
GradientBoosting 정확도: 0.7724844167408726


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


보팅 정확도: 0.807212822796082
num_words=10000, TFIDF을 활용한 정확도
NB 정확도: 0.6567230632235085
CNB 정확도: 0.7707034728406055
로지스틱회귀 정확도: 0.8107747105966162




SVC 정확도: 0.7818343722172751
tree 정확도: 0.6202137132680321
RandomForest 정확도: 0.674087266251113
GradientBoosting 정확도: 0.7662511130899377
보팅 정확도: 0.8165627782724845
num_words=5000, DTM을 활용한 정확도
NB 정확도: 0.7773820124666073
CNB 정확도: 0.7689225289403384


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


로지스틱회귀 정확도: 0.7778272484416741




SVC 정확도: 0.7252894033837934
tree 정확도: 0.6242208370436332
RandomForest 정확도: 0.6941228851291185
GradientBoosting 정확도: 0.7702582368655387


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


보팅 정확도: 0.8107747105966162
num_words=5000, TFIDF을 활용한 정확도
NB 정확도: 0.6731967943009796
CNB 정확도: 0.7707034728406055
로지스틱회귀 정확도: 0.8036509349955476




SVC 정확도: 0.7707034728406055
tree 정확도: 0.6179875333926982
RandomForest 정확도: 0.701246660730187
GradientBoosting 정확도: 0.767586821015138
보팅 정확도: 0.8103294746215495


In [7]:
# print(classification_report(y_test, model.predict(x_test_dtm), zero_division=0))


In [8]:
def graph_confusion_matrix(model, x_test, y_test):#, classes_name):
  """Draw the confusion matrix of ``model`` on the test data as a heatmap.

  Args:
    model: Fitted estimator; only its ``predict`` method is used.
    x_test: Test features passed to ``model.predict``.
    y_test: True labels compared against the predictions.
  """
  predictions = model.predict(x_test)
  matrix = confusion_matrix(y_test, predictions)
  df_cm = pd.DataFrame(matrix)#, index=classes_name, columns=classes_name)

  fig, ax = plt.subplots(figsize=(15,15))
  # Integer-formatted cell annotations.
  sns.heatmap(df_cm, annot=True, fmt="d", ax=ax)
  ax.yaxis.set_ticklabels(ax.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=12)
  ax.xaxis.set_ticklabels(ax.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=12)
  ax.set_ylabel('label')
  ax.set_xlabel('predicted value')

# graph_confusion_matrix(model, x_test_dtm, y_test)

In [9]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

# Load the Reuters dataset. VOCAB_SIZE is used both as the loader's
# ``num_words`` and as the embedding's ``input_dim`` so the two cannot drift.
VOCAB_SIZE = 10000
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=VOCAB_SIZE, test_split=0.2)

# One-hot encode the labels. The binarizer is fitted on the training labels
# only and then re-used (``transform``, not ``fit_transform``) for the test
# labels, so both matrices share the same column <-> class mapping and the
# same width even if some class is absent from the test split.
label_binarizer = LabelBinarizer()
y_train = label_binarizer.fit_transform(y_train)
y_test = label_binarizer.transform(y_test)
num_classes = y_train.shape[1]  # width of the one-hot target matrix

# Pad / truncate every sequence to a fixed length.
max_sequence_length = 100  # maximum sequence length
x_train = pad_sequences(x_train, maxlen=max_sequence_length)
x_test = pad_sequences(x_test, maxlen=max_sequence_length)

# Build the model: embedding -> LSTM -> softmax over the classes.
model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=100, input_length=max_sequence_length))
model.add(LSTM(units=128))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile the model.
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train the model.
# NOTE(review): the test split is also passed as validation data here, so the
# per-epoch validation metrics are not independent of the final evaluation.
model.fit(x_train, y_train, batch_size=32, epochs=30, validation_data=(x_test, y_test))

# Predict class probabilities for the test data.
y_pred = model.predict(x_test)

# Print the classification report on the arg-max class indices.
y_pred_labels = np.argmax(y_pred, axis=1)
y_test_labels = np.argmax(y_test, axis=1)
report = classification_report(y_test_labels, y_pred_labels)
print(report)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
              precision    recall  f1-score   support

           0       0.45      0.42      0.43        12
           1       0.71      0.60      0.65       105
           2       0.64      0.45      0.53        20
           3       0.86      0.92      0.89       813
           4       0.80      0.76      0.78       474
           5       0.33      0.20      0.25         5
           6       0.57      0.57      0.57        14
           7       1.00      0.33      0.50         3
           8       0.61      0.61      0.61        38
           9       0.70      0.64      0.67        25
          10       0.92      0.80      0.86        30
