In [None]:
# tensorflow.keras의 데이터셋 중 reuters 데이터셋을 사용해서 분류 모델 개발
# 1. 데이터 가져오기
# 2. 데이터의 특성 파악
# 3. 입력 데이터 구성
# 4. 모델 개발
# 5. 모델 평가

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras as tf_keras

In [2]:
# 데이터 준비
(X_train, y_train), (X_test, y_test) = tf_keras.datasets.reuters.load_data(num_words=10000) # 10000 개의 단어 집합 사용

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters.npz
[1m2110848/2110848[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [3]:
# 단어 사전 확인
word_to_index = tf_keras.datasets.reuters.get_word_index()

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/reuters_word_index.json
[1m550378/550378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [4]:
# 단어 번호로 단어를 찾기 위한 맵 만들기
index_to_word = { value: key for key, value in word_to_index.items() }

In [5]:
# 각 문장을 BOW 형식으로 변환 -> (1, 10000)
def vectorize_sentences(sentences, dimension=10000, bow=True): # dimension : column, 전체단어갯수, sentences : 행, 문장들
    results = np.zeros((len(sentences), dimension))

    for i, sentence in enumerate(sentences):
        for word in sentence:
            if bow:
              results[i, word] += 1.  # 단어 위치에 발생 빈도 encoding
            else:
              results[i, word] = 1.   # 단어 위치에 1 encoding

    return results

In [6]:
# 입력 데이터 변환
X_train2 = vectorize_sentences(X_train, bow=False)
X_test2 = vectorize_sentences(X_test, bow=False)

X_train3 = vectorize_sentences(X_train, bow=True)
X_test3 = vectorize_sentences(X_test, bow=True)

In [9]:
# target 데이터 특성 확인
print(y_train.shape)
np.unique(y_train, return_counts=True)

(8982,)


(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
        34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45]),
 array([  55,  432,   74, 3159, 1949,   17,   48,   16,  139,  101,  124,
         390,   49,  172,   26,   20,  444,   39,   66,  549,  269,  100,
          15,   41,   62,   92,   24,   15,   48,   19,   45,   39,   32,
          11,   50,   10,   49,   19,   19,   24,   36,   30,   13,   21,
          12,   18]))

In [11]:
# 모델 구조 설계 1

base_model = tf_keras.models.Sequential()
base_model.add(tf_keras.layers.Input(shape=(10000,)))
base_model.add(tf_keras.layers.Dense(512, activation='relu'))
base_model.add(tf_keras.layers.Dense(256, activation='relu'))
base_model.add(tf_keras.layers.Dense(128, activation='relu'))
base_model.add(tf_keras.layers.Dense(46, activation='softmax')) # 다중분류인 경우 activation=softmax, 유닛갯수는 분류 갯수 사용

base_model.summary()

In [18]:
# 모델 구조 설계 2

model1 = tf_keras.models.Sequential()
model1.add(tf_keras.layers.Input(shape=(10000,)))
model1.add(tf_keras.layers.Dense(512, activation='relu', kernel_initializer="he_uniform"))
model1.add(tf_keras.layers.Dense(256, activation='relu', kernel_initializer="he_uniform"))
model1.add(tf_keras.layers.Dense(128, activation='relu', kernel_initializer=tf_keras.initializers.HeUniform(seed=42)))
model1.add(tf_keras.layers.Dense(46, activation='softmax')) # 다중분류인 경우 activation=softmax, 유닛갯수는 분류 갯수 사용

model1.summary()

In [23]:
# 모델 구조 설계 3

model2 = tf_keras.models.Sequential()
model2.add(tf_keras.layers.Input(shape=(10000,)))
model2.add(tf_keras.layers.Dense(512))
model2.add(tf_keras.layers.BatchNormalization())
model2.add(tf_keras.layers.Activation('relu'))

model2.add(tf_keras.layers.Dense(256))
model2.add(tf_keras.layers.BatchNormalization())
model2.add(tf_keras.layers.Activation('relu'))

model2.add(tf_keras.layers.Dense(128))
model2.add(tf_keras.layers.BatchNormalization())
model2.add(tf_keras.layers.Activation('relu'))

model2.add(tf_keras.layers.Dense(46, activation='softmax')) # 다중분류인 경우 activation=softmax, 유닛갯수는 분류 갯수 사용

model2.summary()

In [25]:
# 모델 훈련 설계

base_model.compile(loss='sparse_categorical_crossentropy',
                   optimizer='adam',
                   metrics=['accuracy'])
model1.compile(loss='sparse_categorical_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])
model2.compile(loss='sparse_categorical_crossentropy',
               optimizer='adam',
               metrics=['accuracy'])

In [26]:
# fit_history_base = base_model.fit(X_train2, y_train, epochs=10, batch_size=256, validation_split=0.2)
# fit_history_1 = model1.fit(X_train2, y_train, epochs=10, batch_size=256, validation_split=0.2)
fit_history_1 = model2.fit(X_train2, y_train, epochs=10, batch_size=256, validation_split=0.2)

Epoch 1/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 231ms/step - accuracy: 0.5112 - loss: 2.2533 - val_accuracy: 0.7234 - val_loss: 2.8654
Epoch 2/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 168ms/step - accuracy: 0.8842 - loss: 0.5733 - val_accuracy: 0.6127 - val_loss: 2.2247
Epoch 3/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 172ms/step - accuracy: 0.9533 - loss: 0.2534 - val_accuracy: 0.5938 - val_loss: 1.7535
Epoch 4/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 194ms/step - accuracy: 0.9598 - loss: 0.1458 - val_accuracy: 0.6004 - val_loss: 1.5092
Epoch 5/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 189ms/step - accuracy: 0.9615 - loss: 0.1228 - val_accuracy: 0.6600 - val_loss: 1.3079
Epoch 6/10
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 176ms/step - accuracy: 0.9665 - loss: 0.1011 - val_accuracy: 0.7028 - val_loss: 1.1642
Epoch 7/10
[1m29/29[0m [

In [27]:
print( base_model.evaluate(X_train2, y_train), base_model.evaluate(X_test2, y_test) )
print( model1.evaluate(X_train2, y_train), model1.evaluate(X_test2, y_test) )
print( model2.evaluate(X_train2, y_train), model2.evaluate(X_test2, y_test) )

[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.9714 - loss: 0.0758
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.8005 - loss: 1.4693
[0.3115396201610565, 0.9393230676651001] [1.4638079404830933, 0.7938557267189026]
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 14ms/step - accuracy: 0.9694 - loss: 0.0830
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.8008 - loss: 1.0494
[0.2573699951171875, 0.9387664198875427] [1.0756884813308716, 0.7934104800224304]
[1m281/281[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.9680 - loss: 0.1122
[1m71/71[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.7902 - loss: 0.9400
[0.2614251375198364, 0.9344244003295898] [0.9644649028778076, 0.7831701040267944]
