In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as tf_keras

In [5]:
# Load the aclImdb train/test directories as batched (review, label) datasets.
train_dataset = tf_keras.utils.text_dataset_from_directory(
    "data-files/aclImdb/train",
    batch_size=32,
)
test_dataset = tf_keras.utils.text_dataset_from_directory(
    "data-files/aclImdb/test",
    batch_size=32,
)

# Text-only view of the training split: keep the review, drop the label.
review_only_dataset = train_dataset.map(lambda text, _label: text)

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [4]:
# Peek at a single training batch: batch shapes, then the first review and its label.
for X, y in train_dataset.take(1):
    print(X.shape, y.shape)
    print(X[0], y[0], sep="\n")

(32,) (32,)
tf.Tensor(b'COME ON!!! They did that on purpose!! Two of my current faves on TV (Meloni from "Oz" and "L and O-SVU" and Janel from "West Wing") hook up for a nice little sleeper/character study. Plot\'s nothing fancy, but the acting is right on the mark. Tim Busfield shows up for some neat bits. Worth a look.', shape=(), dtype=string)
tf.Tensor(1, shape=(), dtype=int32)


In [6]:
# 문장(단어 집합) -> 숫자 집합 : encoding
text_vectorizer = \
    tf_keras.layers.TextVectorization(max_tokens=20000, # 사전 크기, 총 단어 갯수
                                      output_mode="int",
                                      output_sequence_length=300) # 한 문장의 단어 갯수 

text_vectorizer.adapt(review_only_dataset) # 변환기에 단어 사전 만들기





In [8]:
# Try the (text -> list of integer ids) vectorizer on one batch.
for X, y in train_dataset.take(1):
    d = text_vectorizer(X)  # run the conversion: X (32,) strings -> d (32, 300) ids
    print(d.shape, d, sep="\n")

(32, 300)
tf.Tensor(
[[ 414   10  118 ...   13   11   18]
 [  32   10   26 ...    0    0    0]
 [   4 1603   64 ...    0    0    0]
 ...
 [  48   24  106 ...    0    0    0]
 [   9    7  264 ...    0    0    0]
 [ 156   11   91 ...  109   11  139]], shape=(32, 300), dtype=int64)


In [11]:
# Inspect the vocabulary learned by the vectorizer.
dictionary = text_vectorizer.get_vocabulary()
vocabulary_size = len(dictionary)
print(vocabulary_size)
dictionary[10:20]

20000


['i', 'this', 'that', 'br', 'was', 'as', 'for', 'with', 'movie', 'but']

In [18]:
# Restore an integer-encoded review to its words.
first_review_ids = d[0].numpy()
print(first_review_ids[:13])
for t in first_review_ids:
    if t == 0:
        # id 0 is skipped (TextVectorization uses it for padding positions)
        continue
    print(dictionary[t], end=" ")

[ 414   10  118  142   29    5    2  164   83 6649    6    1   11]
yes i know im one of the few people longing to [UNK] this movie into the dust of [UNK] let me me tell you why i feel this way in [UNK] it been [UNK] as a zombie film or the [UNK] might have enjoyed itbut right [UNK] totally [UNK] br [UNK] im not sure whats to spoil lets start with the first huge flaw if i did not know that the movie is called darkness the vampire version and had i not seen some sequences where some individuals seem to be sucking blood i would not have seen the connection with vampires i mean [UNK] give me a breakbr br second bad point whats with the metal it appears that all young people but mainly those socalled vampires are into various kinds of [UNK] mainly by their shirts dont get me wrong ive been into the more extreme forms of music for almost 15 years but nobody s going to scare me by showing me some ridiculous teenagers in iron maiden of all bands tshirts running [UNK] to be vampires pathetic is

In [20]:
# Embedding model: maps each token id to a dense vector.
# The tensors are named `inputs`/`outputs` (not `input`) so that Python's
# builtin `input()` is not shadowed in the notebook namespace.

inputs = tf_keras.layers.Input(shape=(None,))  # variable-length sequences of token ids
# input_dim must cover every id the vectorizer can emit (its max_tokens is 20000);
# each id is mapped to a 100-dimensional vector.
outputs = tf_keras.layers.Embedding(input_dim=20000, output_dim=100)(inputs)

embedding_model = tf_keras.models.Model(inputs, outputs)

In [33]:
# Push one batch of raw reviews through the vectorizer, then the embedding model.
for review in review_only_dataset.take(1):
    vectorized_review = text_vectorizer(review)           # one word -> one integer
    embedded_review = embedding_model(vectorized_review)  # one integer -> 100 numbers

In [34]:
# Shapes after vectorizing and after embedding the same batch.
(vectorized_review.shape, embedded_review.shape)

(TensorShape([32, 300]), TensorShape([32, 300, 100]))

In [35]:
# Convert every training review (string) into integer ids; labels pass through.
def _vectorize_batch(review, label):
    """Return the batch with its reviews vectorized and its labels unchanged."""
    return text_vectorizer(review), label


vectorized_train_dataset = train_dataset.map(_vectorize_batch)

In [36]:
# Verify the conversion by printing the features of one batch.
for X, y in vectorized_train_dataset.take(1):
    print(X)

tf.Tensor(
[[  45   23  731 ...    0    0    0]
 [ 150    9   59 ...    0    0    0]
 [  10   41  284 ...    0    0    0]
 ...
 [  22  593    4 ...    0    0    0]
 [  10  604  369 ...    4  738    5]
 [   2  838 3632 ...    0    0    0]], shape=(32, 300), dtype=int64)


In [42]:
# Model architecture: a recurrent neural network (LSTM) for text data.
# The tensors are named `inputs`/`outputs` (not `input`) so that Python's
# builtin `input()` is not shadowed in the notebook namespace.

inputs = tf_keras.layers.Input(shape=(None,))  # variable-length sequences of token ids
# mask_zero=True marks id 0 (the padding that fills short reviews up to 300
# tokens) as masked, so the LSTM skips those steps instead of updating its
# state on trailing padding. input_dim must match the vectorizer's max_tokens.
x = tf_keras.layers.Embedding(input_dim=20000, output_dim=100, mask_zero=True)(inputs)  # (batch, sequence, 100)
x = tf_keras.layers.LSTM(units=16)(x)  # final hidden state: (batch, 16)
outputs = tf_keras.layers.Dense(units=1, activation="sigmoid")(x)  # one sigmoid output per review

model = tf_keras.models.Model(inputs, outputs)

model.summary()

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding_4 (Embedding)     (None, None, 100)         2000000   
                                                                 
 lstm_2 (LSTM)               (None, 16)                7488      
                                                                 
 dense_2 (Dense)             (None, 1)                 17        
                                                                 
Total params: 2007505 (7.66 MB)
Trainable params: 2007505 (7.66 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [44]:
# Training configuration: binary cross-entropy loss, Adam optimizer, accuracy metric.

model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

In [45]:
# Train for 10 epochs on the vectorized training set and keep the returned history.

history = model.fit(x=vectorized_train_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
