In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as tf_keras

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# base_dir = "data-files/aclImdb"
base_dir = "D:\\instructor-och\\data-files\\aclImdb"
train_dataset = tf_keras.utils.text_dataset_from_directory(base_dir + "\\train", batch_size=32)
test_dataset = tf_keras.utils.text_dataset_from_directory(base_dir + "\\test", batch_size=32)
review_only_dataset = train_dataset.map(lambda review, label: review)

Found 25000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


In [3]:
for X, y in train_dataset:
    print(X.shape, y.shape)
    print(X[0], y[0])
    break

(32,) (32,)
tf.Tensor(b'A singularly unfunny musical comedy that artificially tries to marry the then-cutting edge rock \'n\' roll explosion with the middle-class sensibilities of a suburban sitcom. The result is a jarringly dated mish-mash that will satisfy none of the audience that went for the music, but will at least keep their parents sated.<br /><br />A quick glance at the promo write-up on the back of the video release should give some idea of the content. Tom Ewell is a drunken agent, overplayed with so little comic ability you almost expect him to bellow "hi honey, I\'m home!" The blurb sites him as "So funny in \'The 7 Year Itch\'". It sounds almost like an excuse. What other film would sell itself on the fact that a leading player was good in something else? It reads like "So funny in \'The 7 Year Itch\' ... but he\'s rubbish in this".<br /><br />Mansfield, a beautiful girl with rumoured 50-inch assets, is, unfortunately, a bargain basement Monroe with all the acting ability

In [4]:
# 텍스트 -> 숫자 인코딩 (BOW, 단어번호벡터, ...)
text_vectorizer = tf_keras.layers.TextVectorization(max_tokens=20000,  # 사용할 단어 갯수
                                                    output_mode="int", # 출력은 단어 사전의 번호
                                                    output_sequence_length=300) # 각 문장의 길이
text_vectorizer.adapt(review_only_dataset)

In [5]:
for X, y in train_dataset:    
    d = text_vectorizer(X)
    print(d.shape)    
    print( text_vectorizer(X) )
    break

(32, 300)
tf.Tensor(
[[  10   26  108 ...    0    0    0]
 [  11  120    7 ...    0    0    0]
 [  51    2 1208 ...    0    0    0]
 ...
 [   8 4963  362 ...    0    0    0]
 [ 101 1214  141 ...    0    0    0]
 [4107 4912   44 ...    0    0    0]], shape=(32, 300), dtype=int64)


In [6]:
text_vectorizer.get_vocabulary()

['',
 '[UNK]',
 'the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this',
 'that',
 'br',
 'was',
 'as',
 'for',
 'with',
 'movie',
 'but',
 'film',
 'on',
 'not',
 'you',
 'are',
 'his',
 'have',
 'he',
 'be',
 'one',
 'its',
 'at',
 'all',
 'by',
 'an',
 'they',
 'from',
 'who',
 'so',
 'like',
 'her',
 'just',
 'or',
 'about',
 'has',
 'if',
 'out',
 'some',
 'there',
 'what',
 'good',
 'when',
 'more',
 'very',
 'even',
 'she',
 'my',
 'no',
 'up',
 'would',
 'which',
 'only',
 'time',
 'really',
 'story',
 'their',
 'were',
 'had',
 'see',
 'can',
 'me',
 'than',
 'we',
 'much',
 'well',
 'been',
 'get',
 'will',
 'into',
 'also',
 'because',
 'other',
 'do',
 'people',
 'bad',
 'great',
 'first',
 'how',
 'most',
 'him',
 'dont',
 'made',
 'then',
 'movies',
 'make',
 'films',
 'could',
 'way',
 'them',
 'any',
 'too',
 'after',
 'characters',
 'think',
 'watch',
 'two',
 'many',
 'being',
 'seen',
 'character',
 'never',
 'little',
 'acting',
 'where',
 'plot',
 'be

In [7]:
# 단어 값 -> 압축된 단어 벡터 ( 과정 학습 ), 한 행의 문장 -> 여러 행의 단어 벡터
# input_dim : 총 단어 갯수, output_dim : 한 단어를 표현하는 vector
input = tf_keras.layers.Input(shape=(None,))
output = tf_keras.layers.Embedding(input_dim=20000, output_dim=100)(input)

embedding_model = tf_keras.models.Model(input, output)

In [8]:
# train 데이터셋 전체에대해 text_vectorization 적용
input_dataset = train_dataset.map(lambda review, label: (text_vectorizer(review), label) )

for X, y in input_dataset:
    print(X, y)
    break

tf.Tensor(
[[    9    44    75 ...   495    35   624]
 [   15   283    37 ...     0     0     0]
 [11402  7874  4668 ...     0     0     0]
 ...
 [  131    22   176 ...     0     0     0]
 [ 1198 18028    13 ...     2  2437 13752]
 [    1    13    45 ...     0     0     0]], shape=(32, 300), dtype=int64) tf.Tensor([1 0 0 1 1 1 1 0 0 0 0 1 1 0 1 1 1 0 1 1 1 1 1 1 0 0 0 0 1 0 1 0], shape=(32,), dtype=int32)


In [9]:
all_data = []
for x, y in input_dataset.as_numpy_iterator(): # tensorflow tensor -> numpy ndarray
    # print(type(x))
    # break
    all_data.append(x)

review_only_input_dataset = np.concatenate(all_data)

In [10]:
print(review_only_input_dataset.shape)
print( review_only_input_dataset[:5] )
review_only_input_dataset2 = review_only_input_dataset.astype('int16')

(25000, 300)
[[   10   232 12917 ...     0     0     0]
 [   51    10   208 ...    77  1101     2]
 [  731   108    99 ...     0     0     0]
 [   11    20    44 ...     0     0     0]
 [  252    75  2659 ...   501  9611   193]]


In [11]:
review_only_input_dataset2.dtype

dtype('int16')

In [12]:
# 배치크기 * 단어 갯수 -> 배치크기 * 단어 갯수 * 단어표현크기
embeded_dataset = embedding_model(review_only_input_dataset2) 

ResourceExhaustedError: Exception encountered when calling layer "embedding" "                 f"(type Embedding).

{{function_node __wrapped__ResourceGather_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[25000,300,100] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:ResourceGather]

Call arguments received by layer "embedding" "                 f"(type Embedding):
  • inputs=tf.Tensor(shape=(25000, 300), dtype=float32)

In [None]:
embeded_dataset.shape

TensorShape([25000, 300, 100])

In [None]:
embeded_dataset

<tf.Tensor: shape=(25000, 300, 100), dtype=float32, numpy=
array([[[ 0.01187342, -0.01256521,  0.00752597, ..., -0.04648696,
         -0.01624123,  0.02742126],
        [-0.00443424,  0.01624299, -0.0347303 , ..., -0.04067074,
          0.02249147,  0.01066671],
        [ 0.03618124, -0.00179081, -0.02350274, ...,  0.04206263,
          0.01330247, -0.02644373],
        ...,
        [ 0.04079923, -0.04126881,  0.0022082 , ..., -0.02891971,
          0.0403269 ,  0.0318062 ],
        [ 0.04079923, -0.04126881,  0.0022082 , ..., -0.02891971,
          0.0403269 ,  0.0318062 ],
        [ 0.04079923, -0.04126881,  0.0022082 , ..., -0.02891971,
          0.0403269 ,  0.0318062 ]],

       [[-0.01873448,  0.00226523, -0.00522152, ...,  0.04537462,
         -0.04577348, -0.02152989],
        [ 0.02371925,  0.04375834,  0.04883577, ..., -0.0227849 ,
         -0.04484495,  0.03489175],
        [ 0.00600901,  0.00932712, -0.00301258, ...,  0.04149136,
          0.01192706, -0.00964012],
        

In [None]:
input = tf_keras.layers.Input(shape=(None,))
x = tf_keras.layers.Embedding(input_dim=20000, output_dim=100)(input)
x = tf_keras.layers.LSTM(16)(x)
output = tf_keras.layers.Dense(units=1, activation="sigmoid")(x)
model = tf_keras.models.Model(input, output)

In [None]:
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=["accuracy"])

In [None]:
history = model.fit(input_dataset, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
