In [1]:
from tensorflow.keras.datasets import reuters
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB #다항분포 나이브 베이즈 모델
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score #정확도 계산
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

In [3]:
# Build the inverse vocabulary (integer id -> word). Every id is offset by 3
# so that ids 0, 1 and 2 stay free for the special tokens registered below.
word_index = reuters.get_word_index(path="reuters_word_index.json")
index_to_word = {word_id + 3: word for word, word_id in word_index.items()}
# Ids 0, 1 and 2 map to the <pad>, <sos> and <unk> tokens respectively.
index_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))
# Example: ' '.join(index_to_word[i] for i in x_train[0]) decodes one article.

In [4]:
dtmvector = CountVectorizer()


def reuters_load_ml(num_words, mode=True):
    """Load Reuters and vectorize the articles for classical ML models.

    The integer-encoded articles are decoded back to text with
    ``index_to_word`` and then converted to a document-term matrix by the
    module-level ``dtmvector`` (which is re-fitted on every call).

    Args:
        num_words: Forwarded to ``reuters.load_data`` as ``num_words``.
        mode: Truthy -> return raw document-term counts (DTM);
            falsy -> additionally apply a TF-IDF transform fitted on the
            training DTM.

    Returns:
        Tuple ``(x_train, y_train, x_test, y_test)`` where the ``x_*``
        entries are the vectorized articles and the ``y_*`` entries are the
        labels exactly as returned by ``reuters.load_data``.
    """
    (train_seqs, y_train), (test_seqs, y_test) = reuters.load_data(
        num_words=num_words, test_split=0.2
    )

    def to_text(sequence):
        # Turn one list of word ids back into a space-separated string.
        return ' '.join(index_to_word[word_id] for word_id in sequence)

    train_docs = [to_text(sequence) for sequence in train_seqs]
    test_docs = [to_text(sequence) for sequence in test_seqs]

    # Both modes start from the DTM: vocabulary is learned on train only,
    # then reused to transform the test articles.
    x_train = dtmvector.fit_transform(train_docs)
    x_test = dtmvector.transform(test_docs)

    if not mode:
        # TF-IDF weights are likewise fitted on train and applied to test.
        tfidf_transformer = TfidfTransformer()
        x_train = tfidf_transformer.fit_transform(x_train)
        x_test = tfidf_transformer.transform(x_test)

    return x_train, y_train, x_test, y_test

In [5]:
def fit_ml(x_train, y_train, x_test, y_test):
    """Fit a fixed suite of classifiers and print each one's test accuracy.

    Models are trained on ``(x_train, y_train)`` and scored on
    ``(x_test, y_test)`` in this order: multinomial NB, complement NB,
    logistic regression, linear SVC, decision tree, random forest,
    gradient boosting, and finally a soft-voting ensemble built from the
    logistic regression, complement NB and gradient boosting estimators.

    Nothing is returned; results are written to stdout.
    """
    def fit_and_report(label, estimator):
        # Train, predict on the held-out set, and print the accuracy line.
        estimator.fit(x_train, y_train)
        print(label, accuracy_score(y_test, estimator.predict(x_test)))

    # These three are kept in named variables because the voting ensemble
    # below is assembled from them.
    complement_nb = ComplementNB()
    logistic = LogisticRegression(C=10000, penalty='l2', max_iter=3000)
    boosting = GradientBoostingClassifier(random_state=0)

    standalone_models = [
        ("NB 정확도:", MultinomialNB()),
        ("CNB 정확도:", complement_nb),
        ("로지스틱회귀 정확도:", logistic),
        ("SVC 정확도:", LinearSVC(C=1000, penalty='l1', max_iter=3000, dual=False)),
        ("tree 정확도:", DecisionTreeClassifier(max_depth=10, random_state=0)),
        ("RandomForest 정확도:", RandomForestClassifier(n_estimators=5, random_state=0)),
        ("GradientBoosting 정확도:", boosting),
    ]
    for label, estimator in standalone_models:
        fit_and_report(label, estimator)

    # Soft voting averages the predicted class probabilities of its members.
    ensemble = VotingClassifier(
        estimators=[('lr', logistic), ('cb', complement_nb), ('gnb', boosting)],
        voting='soft',
    )
    fit_and_report("보팅 정확도:", ensemble)

In [None]:
# Run the full model suite for every vocabulary size, first on raw
# document-term counts (DTM) and then on TF-IDF features.
for vocab_limit in (None, 10000, 5000):
    for feature_name, use_dtm in (("DTM", True), ("TFIDF", False)):
        print(f"num_words={vocab_limit}, {feature_name}을 활용한 정확도")
        x_train, y_train, x_test, y_test = reuters_load_ml(vocab_limit, use_dtm)
        fit_ml(x_train, y_train, x_test, y_test)

num_words=None, DTM을 활용한 정확도
NB 정확도: 0.7226179875333927
CNB 정확도: 0.7782724844167409


In [None]:
# print(classification_report(y_test, model.predict(x_test_dtm), zero_division=0))


In [None]:
def graph_confusion_matrix(model, x_test, y_test, classes_name=None):
    """Plot the confusion matrix of ``model`` on the test set as a heatmap.

    Args:
        model: Fitted estimator exposing ``predict``.
        x_test: Test features, passed unchanged to ``model.predict``.
        y_test: True class labels for ``x_test`` (1-D).
        classes_name: Optional display names for the rows/columns, one per
            class that occurs in ``y_test`` or in the predictions, in sorted
            label order. When omitted, the class labels themselves are used.

    Raises:
        ValueError: Raised by pandas if ``classes_name`` does not have one
            entry per row/column of the confusion matrix.
    """
    predicted = model.predict(x_test)

    # Pin the row/column order explicitly and reuse it for the axis labels.
    # Without this the DataFrame is labelled 0..k-1 by position, which is
    # wrong whenever some class id is absent from both vectors.
    present_labels = np.unique(
        np.concatenate([np.asarray(y_test), np.asarray(predicted)])
    )
    if classes_name is None:
        classes_name = present_labels

    df_cm = pd.DataFrame(
        confusion_matrix(y_test, predicted, labels=present_labels),
        index=classes_name,
        columns=classes_name,
    )

    plt.figure(figsize=(15, 15))
    heatmap = sns.heatmap(df_cm, annot=True, fmt="d")
    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=12)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=12)
    plt.ylabel('label')
    plt.xlabel('predicted value')

# graph_confusion_matrix(model, x_test_dtm, y_test)

In [None]:
import numpy as np
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report

# Load the Reuters dataset, restricted to the most frequent words.
(x_train, y_train), (x_test, y_test) = reuters.load_data(num_words=10000, test_split=0.2)

# One-hot encode the labels. The encoder is fitted on the training labels
# only and then reused for the test labels, so both matrices share the same
# column <-> class mapping (re-fitting on the test labels could change it).
label_binarizer = LabelBinarizer()
y_train = label_binarizer.fit_transform(y_train)
y_test = label_binarizer.transform(y_test)
num_classes = y_train.shape[1]  # width of the one-hot encoding

# Pad/truncate every article to a fixed number of tokens.
max_sequence_length = 100  # maximum sequence length
x_train = pad_sequences(x_train, maxlen=max_sequence_length)
x_test = pad_sequences(x_test, maxlen=max_sequence_length)

# Build the model: embedding -> LSTM -> softmax over the classes.
model = Sequential()
model.add(Embedding(input_dim=10000, output_dim=100, input_length=max_sequence_length))
model.add(LSTM(units=128))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile the model.
model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy'])

# Train the model.
# NOTE(review): the test split doubles as validation data here, so the
# per-epoch validation metrics are not an independent estimate.
model.fit(x_train, y_train, batch_size=32, epochs=30, validation_data=(x_test, y_test))

# Predict class probabilities for the test data.
y_pred = model.predict(x_test)

# Print the classification report. Column indices are mapped back to the
# original class labels through the encoder's class list.
y_pred_labels = label_binarizer.classes_[np.argmax(y_pred, axis=1)]
y_test_labels = label_binarizer.classes_[np.argmax(y_test, axis=1)]
report = classification_report(y_test_labels, y_pred_labels)
print(report)