In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as tf_keras




In [2]:
# base_dir = "data-files/aclImdb"
base_dir = "D:\\instructor-och\\data-files\\aclImdb"
train_dataset = tf_keras.utils.text_dataset_from_directory(base_dir + "\\train", batch_size=32)
test_dataset = tf_keras.utils.text_dataset_from_directory(base_dir + "\\test", batch_size=32)
review_only_dataset = train_dataset.map(lambda review, label: review)

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [5]:
for X, y in train_dataset:
    print(X.shape, y.shape)
    print(X[0], y[0])
    break

(32,) (32,)
tf.Tensor(b"... mainly because Ju-on 2 boasts an outrageous FORTY minutes' worth of material literally taken straight out of the first Ju-on - and when you consider that the sequel only runs for 76 minutes, that leaves you with 36 original minutes' worth of film. Ho-hum. I found that deeply irritating - as if viewers simply wouldn't remember the same stuff! - not to mention dull, having to watch it all over again.<br /><br />OK, that complaint aside, the byline for Ju-on 2 was that it was supposed to explain a lot of the unanswered questions from the first movie, which frankly, over 36 minutes, simply doesn't go far enough to making any kind of sense of the original's highly convoluted storyline.<br /><br />There are, however, some really nice new horror sequences which show how good the film might have been, had it had some time to develop; and some of the questions raised by the original - some, but not all - are answered.<br /><br />So in conclusion - if you loved the fi

In [7]:
# 텍스트 -> 숫자 인코딩 (BOW, 단어번호벡터, ...)
text_vectorizer = tf_keras.layers.TextVectorization(max_tokens=20000,  # 사용할 단어 갯수
                                                    output_mode="int", # 출력은 단어 사전의 번호
                                                    output_sequence_length=300) # 각 문장의 길이
text_vectorizer.adapt(review_only_dataset)

In [9]:
for X, y in train_dataset:    
    d = text_vectorizer(X)
    print(d.shape)    
    print( text_vectorizer(X) )
    break

(32, 300)
tf.Tensor(
[[  1 216  18 ...   0   0   0]
 [ 32  10  69 ...   0   0   0]
 [ 11  18  14 ...   0   0   0]
 ...
 [ 86 127  10 ...   0   0   0]
 [ 74  10  67 ...   0   0   0]
 [ 86   5  32 ...   0   0   0]], shape=(32, 300), dtype=int64)


In [None]:
text_vectorizer.get_vocabulary()

In [14]:
# 단어 값 -> 압축된 단어 벡터 ( 과정 학습 ), 한 행의 문장 -> 여러 행의 단어 벡터
# input_dim : 총 단어 갯수, output_dim : 한 단어를 표현하는 vector
input = tf_keras.layers.Input(shape=(None,))
output = tf_keras.layers.Embedding(input_dim=20000, output_dim=100)(input)

embedding_model = tf_keras.models.Model(input, output)

In [None]:
# train 데이터셋 전체에대해 text_vectorization 적용
input_dataset = train_dataset.map(lambda review, label: (text_vectorizer(review), label) )

for X, y in input_dataset:
    print(X, y)

In [None]:
all_data = []
for x, y in input_dataset.as_numpy_iterator(): # tensorflow tensor -> numpy ndarray
    # print(type(x))
    # break
    all_data.append(x)

review_only_input_dataset = np.concatenate(all_data)

In [20]:
print(review_only_input_dataset.shape)
review_only_input_dataset[:5]

(25000, 300)


array([[   11,    18,    14, ...,     0,     0,     0],
       [16226, 15814,     1, ...,     0,     0,     0],
       [  140,    43,  4135, ...,     0,     0,     0],
       [ 1424,   887,    15, ..., 11254,   380,    11],
       [ 5785,     7,     4, ...,     0,     0,     0]], dtype=int64)

In [21]:
# 배치크기 * 단어 갯수 -> 배치크기 * 단어 갯수 * 단어표현크기
embeded_dataset = embedding_model(review_only_input_dataset) 

In [22]:
embeded_dataset.shape

TensorShape([25000, 300, 100])

In [23]:
embeded_dataset

<tf.Tensor: shape=(25000, 300, 100), dtype=float32, numpy=
array([[[ 0.01187342, -0.01256521,  0.00752597, ..., -0.04648696,
         -0.01624123,  0.02742126],
        [-0.00443424,  0.01624299, -0.0347303 , ..., -0.04067074,
          0.02249147,  0.01066671],
        [ 0.03618124, -0.00179081, -0.02350274, ...,  0.04206263,
          0.01330247, -0.02644373],
        ...,
        [ 0.04079923, -0.04126881,  0.0022082 , ..., -0.02891971,
          0.0403269 ,  0.0318062 ],
        [ 0.04079923, -0.04126881,  0.0022082 , ..., -0.02891971,
          0.0403269 ,  0.0318062 ],
        [ 0.04079923, -0.04126881,  0.0022082 , ..., -0.02891971,
          0.0403269 ,  0.0318062 ]],

       [[-0.01873448,  0.00226523, -0.00522152, ...,  0.04537462,
         -0.04577348, -0.02152989],
        [ 0.02371925,  0.04375834,  0.04883577, ..., -0.0227849 ,
         -0.04484495,  0.03489175],
        [ 0.00600901,  0.00932712, -0.00301258, ...,  0.04149136,
          0.01192706, -0.00964012],
        

In [25]:
input = tf_keras.layers.Input(shape=(None,))
x = tf_keras.layers.Embedding(input_dim=20000, output_dim=100)(input)
x = tf_keras.layers.LSTM(16)(x)
output = tf_keras.layers.Dense(units=1, activation="sigmoid")(x)
model = tf_keras.models.Model(input, output)

In [26]:
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])




In [28]:
history = model.fit(input_dataset, epochs=10)

Epoch 1/10

Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
