In [1]:
from tensorflow import keras

from tensorflow.keras.datasets import imdb

from tensorflow.keras.layers import Embedding
from tensorflow.keras import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Embedding

# Show the version of the Keras API bundled with TensorFlow so the
# environment that produced the outputs below is recorded.
keras.__version__

'2.2.4-tf'

In [2]:
# Vocabulary size: only the max_features most frequent words are kept as features.
max_features = 10000

# Number of tokens kept per review after padding / truncation.
maxlen = 20

# Load the reviews as lists of integer word indices.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

# Convert the variable-length lists into 2D integer tensors of shape (samples, maxlen).
# The input tuple is built before the unpacking assignment runs, so both
# calls receive the raw (unpadded) sequences.
x_train, x_test = (
    preprocessing.sequence.pad_sequences(sequences, maxlen=maxlen)
    for sequences in (x_train, x_test)
)

In [3]:
# Inspect the data: decode one encoded review back into its original text.
# Word ranks are shifted by 3 because ids 0-3 are reserved for the special
# tokens registered below.
word_to_idx = {word: rank + 3 for word, rank in imdb.get_word_index().items()}
word_to_idx.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2, "<UNUSED>": 3})

# Reverse mapping: integer id -> word.
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Index of the training review to display.
review_idx = 0
print('label :', y_train[review_idx])
print('text  :', ' '.join(idx_to_word[token_id] for token_id in x_train[review_idx]))
print('\ntext to tensor (first 10 items) :', x_train[review_idx][:10])

label : 1
text  : story was so lovely because it was true and was someone's life after all that was shared with us all

text to tensor (first 10 items) : [  65   16   38 1334   88   12   16  283    5   16]


In [4]:
# The Embedding layer is given input_length so that the Flatten layer
# after it can unroll the embedded inputs into a fixed-size vector.
model = Sequential([
    # Output shape: (samples, maxlen, 8).
    Embedding(max_features, 8, input_length=maxlen),
    # Flatten the 3D embedding tensor into a 2D tensor of shape (samples, maxlen * 8).
    Flatten(),
    # Binary classifier on top of the flattened embeddings.
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 20, 8)             80000     
_________________________________________________________________
flatten (Flatten)            (None, 160)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [5]:
# Train for 10 epochs in mini-batches of 32, holding out 20% of the
# training data for validation; the returned History object is kept
# in `history`.
history = model.fit(
    x=x_train,
    y=y_train,
    batch_size=32,
    epochs=10,
    validation_split=0.2,
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
