## 텍스트를 위한 딥러닝#2

In [1]:
import numpy as np
import os, pathlib, shutil, random
from tensorflow import keras

## 시퀀스 모델 방식
- 단어를 시퀀스로 처리하기

### 1. 데이터 준비하기

#### 예제 데이터셋: IMDB 영화리뷰
- 앤드류 마스(Andrew Mass)의 스탠포드 페이지에서 다운로드

**데이터 다운로드**

In [2]:
!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

--2024-06-19 06:33:15--  http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
Resolving ai.stanford.edu (ai.stanford.edu)... 171.64.68.10
Connecting to ai.stanford.edu (ai.stanford.edu)|171.64.68.10|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 84125825 (80M) [application/x-gzip]
Saving to: ‘aclImdb_v1.tar.gz’


2024-06-19 06:33:28 (6.56 MB/s) - ‘aclImdb_v1.tar.gz’ saved [84125825/84125825]



**데이터 준비**

In [3]:
!tar -xf aclImdb_v1.tar.gz
!rm -rf aclImdb/train/unsup

In [4]:
import os, pathlib, random, shutil
from sklearn.model_selection import train_test_split

base_dir = pathlib.Path('aclImdb')
train_dir = base_dir / 'train'
val_dir = base_dir / 'val'

for category in ('neg','pos'):
    os.makedirs(val_dir / category)
    files = os.listdir(train_dir / category)
    random.Random(1237).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    val_files = files[-num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)

batch_size= 32
train_ds = keras.utils.text_dataset_from_directory('aclImdb/train',batch_size=batch_size)
val_ds = keras.utils.text_dataset_from_directory('aclImdb/val',batch_size=batch_size)
test_ds = keras.utils.text_dataset_from_directory('aclImdb/test',batch_size=batch_size)

Found 20000 files belonging to 2 classes.
Found 5000 files belonging to 2 classes.
Found 25000 files belonging to 2 classes.


**정수 시퀀스 데이터셋 준비하기**

In [5]:
from keras import Input, Model, layers

In [7]:
max_tokens = 20000
maxlen = 600
text_vect = layers.TextVectorization(max_tokens= max_tokens, output_mode='int',
                                     output_sequence_length= maxlen)

text_only_train_ds = train_ds.map(lambda x, y: x)
text_vect.adapt(text_only_train_ds)

# num_parallel_calls = 4 -> 다중 CPU 코어 활용을 위한 매개변수
int_train_ds = train_ds.map(lambda x, y: (text_vect(x), y),num_parallel_calls=4)
int_val_ds = val_ds.map(lambda x, y: (text_vect(x), y),num_parallel_calls=4)
int_test_ds = test_ds.map(lambda x, y: (text_vect(x), y),num_parallel_calls=4)

### 2. 모델 구성하고 훈련하기

#### 1) 원-핫 인코딩된 벡터 시퀀스로 시퀀스 모델 만들기

In [8]:
import tensorflow as tf

inputs = Input(shape=(None,), dtype='int64')
embedded = tf.one_hot(inputs, depth = max_tokens)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = Model(inputs, outputs)
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 tf.one_hot (TFOpLambda)     (None, None, 20000)       0         
                                                                 
 bidirectional (Bidirection  (None, 64)                5128448   
 al)                                                             
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 5128513 (19.56 MB)
Trainable params: 5128513 (19.56 MB)
Non-trainable params: 0 (0.00 Byte)
_____________________

**첫 번째 시퀀스 모델 훈련하기**

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
model_path = '/content/drive/MyDrive/Colab Notebooks/model/'
model_name = 'aclImdb_one_hot_bidir_lstm.h5'

model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['accuracy'])
callbacks = [keras.callbacks.ModelCheckpoint(model_name)]
history = model.fit(int_train_ds.cache(),
                    validation_data= int_val_ds.cache(),
                    epochs=10,
                    callbacks= callbacks)
best_model = keras.models.load_model(model_name)
print(f'정수형 임베딩 적용 -> 테스트 정확도: {best_model.evaluate(int_test_ds)[1]:.4f}')

Epoch 1/10
Epoch 2/10


  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
정수형 임베딩 적용 -> 테스트 정확도: 0.8687


- 정수형 임베딩 적용 -> 테스트 정확도: 0.8687

#### 2) Embedding 층으로 단어 임베딩 학습하기

#### 케라스의 Embedding layer

**임베딩 층은 lookup table**
- 임베딩 층의 입력을 사용하기 위해 입력시퀀스의 각 단어들은 모두 정수 인코딩이 되어있어야 함
- 어떤 단어 -> 단어에 부여된 고유한 정수값 -> 임베딩 층 통과 -> 밀집벡터(dense vector)
- 임베딩 층은 입력 정수에 대해 밀집 벡터로 맵핑하고 이 밀집벡터는 인공신경망의 학습과정에서 가중치가 학습되는 것과 같은 방식으로 훈련됨
- 훈련과정에서 단어는 모델이 풀고자하는 작업에 맞는 값으로 업데이트되는데 이 밀집벡터를 **임베딩 벡터** 라고 함

![image.png](attachment:8dd8c015-aa4d-478f-b5bc-dfba98688728.png)

**tf.keras.layers.Embedding()**
- https://www.tensorflow.org/api_docs/python/tf/keras/layers/Embedding
```python
tf.keras.layers.Embedding(input_dim,
                          output_dim,
                          embeddings_initializer='uniform',
                          embeddings_regularizer=None,
                          embeddings_constraint=None,
                          mask_zero=False,
                          weights=None,
                          lora_rank=None,
                          **kwargs
                         )
- input_dim : Integer. Size of vocabulary, maximum integer index+1
- output_dim : Integer, Dimension of dense embedding
```


**`Embedding` 층 만들기**

In [13]:
embedding_layer = layers.Embedding(input_dim=max_tokens, output_dim=256)

**밑바닥부터 훈련하는 `Embedding` 층을 사용한 모델**

In [19]:
inputs = Input(shape=(None,), dtype='int64')
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = Model(inputs, outputs)

model_path = '/content/drive/MyDrive/Colab Notebooks/model/'
model_name = model_path + 'aclImdb_embedding_bidir_lstm.h5'

model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['accuracy'])
callbacks = [keras.callbacks.ModelCheckpoint(model_name)]
history = model.fit(int_train_ds.cache(),
                    validation_data= int_val_ds.cache(),
                    epochs=10,
                    callbacks= callbacks)
best_model = keras.models.load_model(model_name)
print(f'Embedding layer 적용 -> 테스트 정확도: {best_model.evaluate(int_test_ds)[1]:.4f}')

Epoch 1/10

  saving_api.save_model(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Embedding layer 적용 -> 테스트 정확도: 0.8388


- 정수형 임베딩 적용 -> 테스트 정확도: 0.8687
- Embedding layer 적용 -> 테스트 정확도: 0.8388

#### 패딩과 마스킹 이해하기

**마스킹을 활성화한 `Embedding` 층 사용하기**

In [20]:
inputs = Input(shape=(None,), dtype='int64')
embedded = layers.Embedding(input_dim=max_tokens, output_dim=256, mask_zero=True)(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = Model(inputs, outputs)

model_path = '/content/drive/MyDrive/Colab Notebooks/model/'
model_name = model_path + 'aclImdb_embedding_bidir_lstm_with_masking.h5'

model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['accuracy'])
callbacks = [keras.callbacks.ModelCheckpoint(model_name)]
history = model.fit(int_train_ds.cache(),
                    validation_data= int_val_ds.cache(),
                    epochs=10,
                    callbacks= callbacks)
best_model = keras.models.load_model(model_name)
print(f'Embedding layer with masking 적용 -> 테스트 정확도: {best_model.evaluate(int_test_ds)[1]:.4f}')

Epoch 1/10
Epoch 2/10
  1/625 [..............................] - ETA: 25s - loss: 0.2297 - accuracy: 0.9375

  saving_api.save_model(


Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Embedding layer with masking 적용 -> 테스트 정확도: 0.8618


- 정수형 임베딩 적용 -> 테스트 정확도: 0.8687
- Embedding layer 적용 -> 테스트 정확도: 0.8388
- Embedding layer with masking 적용 -> 테스트 정확도: 0.8618

#### 3) 사전 훈련된 단어 임베딩 사용하기

In [15]:
# glove 임베딩 모델 사용: glove.6B
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2024-06-19 07:20:18--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-06-19 07:20:19--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-06-19 07:20:19--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

**GloVe 단어 임베딩 파일 파싱하기**

In [16]:
import numpy as np
path_glovefile = 'glove.6B.100d.txt'

embedding_idx= {}
with open(path_glovefile) as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embedding_idx[word] = coefs
print(f'단어 벡터 개수: {len(embedding_idx)}')

단어 벡터 개수: 400000


**GloVe 단어 임베딩 행렬 준비하기**

In [21]:
embedding_dim = 100

vocabs = text_vect.get_vocabulary()
word_idx = dict(zip(vocabs, range(len(vocabs))))
embedding_matrix = np.zeros((max_tokens, embedding_dim))
for word, i in word_idx.items():
    if i < max_tokens:
        embedding_vector = embedding_idx.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [23]:
embedding_layer = layers.Embedding(input_dim= max_tokens,
                                   output_dim=embedding_dim,
                                   embeddings_initializer=keras.initializers.Constant(embedding_matrix),
                                   trainable=False,
                                   mask_zero=True)

**사전 훈련된 임베딩을 사용하는 모델**

In [None]:
inputs = Input(shape=(None,), dtype='int64')
embedded = embedding_layer(inputs)
x = layers.Bidirectional(layers.LSTM(32))(embedded)
x = layers.Dropout(0.5)(x)
outputs = layers.Dense(1, activation='sigmoid')(x)
model = Model(inputs, outputs)

model_path = '/content/drive/MyDrive/Colab Notebooks/model/'
model_name = model_path + 'aclImdb_glove_bidir_lstm_with_masking.h5'

model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['accuracy'])
callbacks = [keras.callbacks.ModelCheckpoint(model_name)]
history = model.fit(int_train_ds.cache(),
                    validation_data= int_val_ds.cache(),
                    epochs=10,
                    callbacks= callbacks)
best_model = keras.models.load_model(model_name)
print(f'Glove with masking 적용 -> 테스트 정확도: {best_model.evaluate(int_test_ds)[1]:.4f}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


- 정수형 임베딩 적용 -> 테스트 정확도: 0.8687
- Embedding layer 적용 -> 테스트 정확도: 0.8388
- Embedding layer with masking 적용 -> 테스트 정확도: 0.8618
- Glove with masking 적용 -> 테스트 정확도: 0.8980

- 작은 데이터셋을 다룰 때는 사전 훈련된 임베딩을 사용하는 것이 도움이 됨
- 훈련 샘플 개수와 샘플 당 평균단어 개수의 비율
    - (샘플개수 / 평균샘플길이) > 1500 -> 시퀀스 모델
    - (샘플개수 / 평균샘플길이) < 1500 -> 바이그램 모델


---