# RNN과 어텐션을 사용한 자연어 처리

먼저 몇 개의 모듈을 임포트한다. 맷플롯립 그림을 저장하는 함수를 준비한다.

In [1]:
# 공통 모듈 임포트
import os
import matplotlib.pyplot as plt

# Where to save the figures
PROJECT_ROOT_DIR = '.'
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, 'images')
# Create the directory up front; exist_ok avoids an error when it already exists.
os.makedirs(IMAGES_PATH, exist_ok=True)


def save_fig(fig_id, tight_layout=True, fig_extension='png', resolution=300):
    """Save the current Matplotlib figure under IMAGES_PATH.

    The file is named ``<fig_id>.<fig_extension>``. A short message with the
    figure id is printed before saving.

    Args:
        fig_id: Base name of the output file (without extension).
        tight_layout: When true, call ``plt.tight_layout()`` before saving.
        fig_extension: File extension, also passed to ``savefig`` as the format.
        resolution: Value passed to ``savefig`` as ``dpi``.
    """
    filename = f'{fig_id}.{fig_extension}'
    target = os.path.join(IMAGES_PATH, filename)
    print(f'그림 저장 {fig_id}')
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, dpi=resolution, format=fig_extension)

## Char-RNN을 사용해 셰익스피어 문체 생성하기

예를 들어, 0~14까지의 시퀀스를 2개씩 이동하면서 길이가 5인 윈도로 나누어 본다(가령, `[0, 1, 2, 3, 4]`, `[2, 3, 4, 5, 6]` 등). 그다음 이를 섞고 입력(처음 네 개의 스텝)과 타깃(마지막 네 개의 스텝)으로 나눈다(즉, `[2, 3, 4, 5, 6]`를 `[[2, 3, 4, 5], [3, 4, 5, 6]]`로 나눈다). 그다음 입력/타깃 쌍 세 개로 구성된 배치를 만든다:

In [2]:
import tensorflow as tf

# Toy example: cut the sequence 0..14 into windows and turn them into (input, target) batches.
n_steps = 5
dataset = tf.data.Dataset.from_tensor_slices(tf.range(15))
# Windows of n_steps elements, shifted by 2 each time; incomplete trailing windows are dropped.
dataset = dataset.window(n_steps, 2, drop_remainder=True)
# Batch every window by its full length and flatten, so each element is one tensor of n_steps values.
dataset = dataset.flat_map(lambda window: window.batch(n_steps))
# Shuffle, then split each window into input (all but the last step) and target (all but the first).
dataset = dataset.shuffle(10).map(lambda window: (window[:-1], window[1:]))
# Group the input/target pairs three at a time.
dataset = dataset.batch(3).prefetch(tf.data.AUTOTUNE)
for index, (X_batch, Y_batch) in enumerate(dataset):
    print(f'{"_" * 20} Batch {index}\nX_batch\n{X_batch.numpy()}\n{"=" * 5}\nY_batch\n{Y_batch.numpy()}')

____________________ Batch 0
X_batch
[[10 11 12 13]
 [ 8  9 10 11]
 [ 0  1  2  3]]
=====
Y_batch
[[11 12 13 14]
 [ 9 10 11 12]
 [ 1  2  3  4]]
____________________ Batch 1
X_batch
[[2 3 4 5]
 [4 5 6 7]
 [6 7 8 9]]
=====
Y_batch
[[ 3  4  5  6]
 [ 5  6  7  8]
 [ 7  8  9 10]]


### 훈련 데이터셋 만들기

In [3]:
from tensorflow import keras

# Fetch the Tiny Shakespeare corpus and load it into memory as a single string.
shakespeare_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
filepath = keras.utils.get_file('shakespeare.txt', shakespeare_url)
# Pass the encoding explicitly so decoding does not depend on the platform's locale default.
with open(filepath, encoding='utf-8') as f:
    shakespeare_text = f.read()

In [4]:
print(shakespeare_text[:148])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?



In [5]:
''.join(sorted(set(shakespeare_text.lower())))

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [6]:
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [7]:
tokenizer.texts_to_sequences(['First'])

[[20, 6, 9, 8, 3]]

In [8]:
tokenizer.sequences_to_texts([[20, 6, 9, 8, 3]])

['f i r s t']

In [9]:
max_id = len(tokenizer.word_index)  # number of distinct characters
dataset_size = tokenizer.document_count  # total number of characters

In [10]:
import numpy as np

[encoded] = np.array(tokenizer.texts_to_sequences([shakespeare_text])) - 1

### 순차 데이터셋을 나누는 방법

In [11]:
train_size = dataset_size * 90 // 100
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

### 순차 데이터를 윈도 여러 개로 자르기

In [12]:
n_steps = 100
window_length = n_steps + 1  # target = input shifted one character ahead
# Slide the window one element at a time; windows shorter than window_length are dropped.
dataset = dataset.window(window_length, 1, drop_remainder=True)

In [13]:
dataset = dataset.flat_map(lambda window: window.batch(window_length))

In [14]:
batch_size = 32
dataset = dataset.shuffle(10000).batch(batch_size)
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))

In [15]:
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, max_id), Y_batch))

In [16]:
dataset = dataset.prefetch(tf.data.AUTOTUNE)

In [17]:
for X_batch, Y_batch in dataset.take(1):
    print(X_batch.shape, Y_batch.shape)

(32, 100, 39) (32, 100)


### Char-RNN 모델 만들고 훈련하기

**경고**: 다음 코드는 하드웨어에 따라 실행하는 데 24시간이 걸릴 수 있다. GPU를 사용하면 1~2시간 정도 걸릴 수 있다.

**노트**: `GRU` 클래스는 다음 매개변수에서 기본값을 사용할 때에만 GPU를 사용한다: `activation`, `recurrent_activation`, `recurrent_dropout`, `unroll`, `use_bias`, `reset_after`. 이 때문에 `recurrent_dropout=0.2`를 주석 처리했다.

In [18]:
model = keras.models.Sequential(
    [
        keras.layers.GRU(128, dropout=0.2, return_sequences=True, input_shape=[None, max_id]),  # recurrent_dropout=0.2
        keras.layers.GRU(128, dropout=0.2, return_sequences=True),  # recurrent_dropout=0.2
        keras.layers.TimeDistributed(keras.layers.Dense(max_id, 'softmax'))
    ]
)
model.compile('adam', 'sparse_categorical_crossentropy')
# history = model.fit(dataset, epochs=10)
history = model.fit(dataset.take(2222))



### Char-RNN 모델 사용하기

In [19]:
def preprocess(texts):
    """Encode a list of strings as one-hot character tensors.

    The strings are converted to ID sequences with the fitted ``tokenizer``,
    every ID is shifted down by one, and the result is one-hot encoded with
    ``max_id`` classes.
    """
    sequences = tokenizer.texts_to_sequences(texts)
    # Shift IDs down by one; callers add 1 back before decoding with the tokenizer.
    char_ids = np.array(sequences) - 1
    return tf.one_hot(char_ids, depth=max_id)

In [20]:
X_new = preprocess(['How are yo'])
Y_pred = np.argmax(model(X_new), -1)
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]  # 1st sentence, last char

'u'

### 가짜 셰익스피어 텍스트를 생성하기

In [21]:
tf.random.categorical([[np.log(0.5), np.log(0.4), np.log(0.1)]], 40).numpy()

array([[1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0,
        0, 0, 0, 1, 1, 2, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 2, 0]],
      dtype=int64)

In [22]:
def next_char(text, temperature=1):
    """Sample one character to follow ``text`` from the model's prediction.

    Args:
        text: The text generated so far.
        temperature: Divisor applied to the log-probabilities before sampling.

    Returns:
        The sampled character, decoded by the tokenizer.
    """
    encoded_text = preprocess([text])
    # Prediction for the last time step of the single sequence; the -1: slice keeps that axis.
    last_step_proba = model(encoded_text)[0, -1:, :]
    logits = tf.math.log(last_step_proba) / temperature
    sampled = tf.random.categorical(logits, num_samples=1)
    # Add 1 to undo the ID shift applied in preprocess() before decoding.
    return tokenizer.sequences_to_texts((sampled + 1).numpy())[0]

In [23]:
next_char('How are yo')

'u'

In [24]:
def complete_text(text, n_chars=50, temperature=1):
    """Extend ``text`` by ``n_chars`` characters, sampling them one at a time.

    Each new character is drawn by ``next_char`` from the text built so far,
    using the given ``temperature``. Returns the extended text.
    """
    for _step in range(n_chars):
        sampled_char = next_char(text, temperature)
        text = text + sampled_char
    return text

In [25]:
print(complete_text('t', temperature=0.2))

the country's send ender that i have beard the cous


In [26]:
print(complete_text('t'))

til'd so,
as i lord, fair.
my lear me dreat toke, t


In [27]:
print(complete_text('t', temperature=2))

thaovsemt-libtamudh
of jups betoulm
it'surlf-
osiri


### 상태가 있는 RNN

In [28]:
# Dataset for a stateful RNN with one sequence per batch.
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])
# Shift by n_steps (not 1): the inputs of each window start right where the previous window's inputs ended.
dataset = dataset.window(window_length, n_steps, drop_remainder=True)
dataset = dataset.flat_map(lambda window: window.batch(window_length))
# No shuffling here, so windows stay in text order; each batch holds a single window.
dataset = dataset.batch(1)
# Inputs are all but the last character, targets are all but the first.
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, max_id), Y_batch))
dataset = dataset.prefetch(tf.data.AUTOTUNE)

In [29]:
batch_size = 32
encoded_parts = np.array_split(encoded[:train_size], batch_size)
datasets = []
for encoded_part in encoded_parts:
    dataset = tf.data.Dataset.from_tensor_slices(encoded_part)
    dataset = dataset.window(window_length, n_steps, drop_remainder=True)
    dataset = dataset.flat_map(lambda window: window.batch(window_length))
    datasets.append(dataset)
dataset = tf.data.Dataset.zip(tuple(datasets)).map(lambda *windows: tf.stack(windows))
dataset = dataset.map(lambda windows: (windows[:, :-1], windows[:, 1:]))
dataset = dataset.map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, max_id), Y_batch))
dataset = dataset.prefetch(tf.data.AUTOTUNE)

**노트**: 여기에서도 GPU 가속을 위해 `recurrent_dropout=0.2`을 주석 처리한다.

In [30]:
model = keras.models.Sequential(
    [
        keras.layers.GRU(
            128, dropout=0.2, return_sequences=True, stateful=True, batch_input_shape=[batch_size, None, max_id]
        ),  # recurrent_dropout=0.2
        keras.layers.GRU(128, dropout=0.2, return_sequences=True, stateful=True),  # recurrent_dropout=0.2
        keras.layers.TimeDistributed(keras.layers.Dense(max_id, 'softmax'))
    ]
)

In [31]:
class ResetStatesCallback(keras.callbacks.Callback):
    """Callback that resets the model's states at the beginning of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the states of ``self.model``; ``epoch`` and ``logs`` are unused."""
        # `logs=None` matches the base-class hook, so the method also works when logs is omitted.
        self.model.reset_states()

In [32]:
model.compile('adam', 'sparse_categorical_crossentropy')
# history = model.fit(dataset, epochs=50, callbacks=[ResetStatesCallback()])
history = model.fit(dataset, epochs=7, callbacks=[ResetStatesCallback()])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


모델에 다른 크기의 배치를 사용하려면 상태가 없는 복사본을 만들어야 한다. 드롭아웃은 훈련에만 사용되기 때문에 삭제한다:

In [33]:
stateless_model = keras.models.Sequential(
    [
        keras.layers.GRU(128, return_sequences=True, input_shape=[None, max_id]),
        keras.layers.GRU(128, return_sequences=True),
        keras.layers.TimeDistributed(keras.layers.Dense(max_id, 'softmax'))
    ]
)

가중치를 복사하려면 먼저 (가중치를 만들기 위해) 모델을 빌드한다:

In [34]:
stateless_model.build(tf.TensorShape([None, None, max_id]))

In [35]:
stateless_model.set_weights(model.get_weights())
model = stateless_model

In [36]:
print(complete_text('t'))

t,
'tis have like anfless, i can strenghher vanch'd


## 감성 분석

In [37]:
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()

In [38]:
X_train[0][:10]

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [39]:
word_index = keras.datasets.imdb.get_word_index()
# Shift every ID by 3 so that 0, 1 and 2 are free for the special tokens added below.
id_to_word = {id_ + 3: word for word, id_ in word_index.items()}
for id_, token in enumerate(('<pad>', '<sos>', '<unk>')):
    id_to_word[id_] = token
# Decode the first ten token IDs of the first training review.
' '.join([id_to_word[id_] for id_ in X_train[0][:10]])

'<sos> this film was just brilliant casting location scenery story'

In [40]:
import tensorflow_datasets as tfds

datasets, info = tfds.load('imdb_reviews', as_supervised=True, with_info=True)

In [41]:
datasets.keys()

dict_keys(['train', 'test', 'unsupervised'])

In [42]:
train_size = info.splits['train'].num_examples
test_size = info.splits['test'].num_examples

In [43]:
train_size, test_size

(25000, 25000)

In [44]:
for X_batch, y_batch in datasets['train'].batch(2).take(1):
    for review, label in zip(X_batch.numpy(), y_batch.numpy()):
        print(f'Review: {review.decode("utf-8")[:200]}...\nLabel: {label} = {"Positive" if label else "Negative"}\n')

Review: This was an absolutely terrible movie. Don't be lured in by Christopher Walken or Michael Ironside. Both are great actors, but this must simply be their worst role in history. Even their great acting ...
Label: 0 = Negative

Review: I have been known to fall asleep during films, but this is usually due to a combination of things including, really tired, being warm and comfortable on the sette and having just eaten a lot. However ...
Label: 0 = Negative



In [45]:
def preprocess(X_batch, y_batch):
    """Tokenize a batch of raw review strings into a padded tensor of words.

    Each review is truncated with ``tf.strings.substr(X_batch, 0, 300)``,
    ``<br>``-style tags and every character other than letters and
    apostrophes are replaced by spaces, and the text is split on whitespace.
    The split result is converted to a dense tensor padded with ``b'<pad>'``.

    Args:
        X_batch: Batch of review strings.
        y_batch: Batch of labels, returned unchanged.

    Returns:
        A ``(words, y_batch)`` tuple.
    """
    X_batch = tf.strings.substr(X_batch, 0, 300)
    # Raw bytes literal: `\s` must reach the regex engine as-is instead of being an invalid Python escape.
    X_batch = tf.strings.regex_replace(X_batch, rb'<br\s*/?>', b' ')
    X_batch = tf.strings.regex_replace(X_batch, b"[^a-zA-Z']", b' ')
    X_batch = tf.strings.split(X_batch)
    return X_batch.to_tensor(b'<pad>'), y_batch

In [46]:
preprocess(X_batch, y_batch)

(<tf.Tensor: shape=(2, 53), dtype=string, numpy=
 array([[b'This', b'was', b'an', b'absolutely', b'terrible', b'movie',
         b"Don't", b'be', b'lured', b'in', b'by', b'Christopher',
         b'Walken', b'or', b'Michael', b'Ironside', b'Both', b'are',
         b'great', b'actors', b'but', b'this', b'must', b'simply', b'be',
         b'their', b'worst', b'role', b'in', b'history', b'Even',
         b'their', b'great', b'acting', b'could', b'not', b'redeem',
         b'this', b"movie's", b'ridiculous', b'storyline', b'This',
         b'movie', b'is', b'an', b'early', b'nineties', b'US',
         b'propaganda', b'pi', b'<pad>', b'<pad>', b'<pad>'],
        [b'I', b'have', b'been', b'known', b'to', b'fall', b'asleep',
         b'during', b'films', b'but', b'this', b'is', b'usually', b'due',
         b'to', b'a', b'combination', b'of', b'things', b'including',
         b'really', b'tired', b'being', b'warm', b'and', b'comfortable',
         b'on', b'the', b'sette', b'and', b'having', b'j

In [47]:
from collections import Counter

vocabulary = Counter()
for X_batch, _ in datasets['train'].batch(32).map(preprocess):
    for review in X_batch:
        vocabulary.update(list(review.numpy()))

In [48]:
vocabulary.most_common()[:3]

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [49]:
len(vocabulary)

53893

In [50]:
vocab_size = 10000
truncated_vocabulary = [word for word, _ in vocabulary.most_common()[:vocab_size]]

In [51]:
# Map every vocabulary word to its position in truncated_vocabulary.
word_to_id = {word: index for index, word in enumerate(truncated_vocabulary)}
for word in b'This movie was faaaaaantastic'.split():
    # Use the dict default instead of `or`: ID 0 is falsy and would wrongly be reported as vocab_size.
    print(word_to_id.get(word, vocab_size))

22
12
11
10000


In [52]:
# Build a lookup table from word to ID; a word's ID is its index in truncated_vocabulary.
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids)
# NOTE(review): unknown words presumably get IDs at or above len(truncated_vocabulary),
# spread over num_oov_buckets buckets - confirm against the StaticVocabularyTable docs.
num_oov_buckets = 1000
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets)

In [53]:
table.lookup(tf.constant([b'This movie was faaaaaantastic'.split()]))

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]], dtype=int64)>

In [54]:
def encode_words(X_batch, y_batch):
    """Replace the words in ``X_batch`` with their IDs from ``table``; labels pass through unchanged."""
    ids = table.lookup(X_batch)
    return ids, y_batch


train_set = datasets['train'].batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(tf.data.AUTOTUNE)

In [55]:
for X_batch, y_batch in train_set.take(1):
    print(X_batch)
    print(y_batch)

tf.Tensor(
[[  22   11   28 ...    0    0    0]
 [   6   21   70 ...    0    0    0]
 [4099 6881    1 ...    0    0    0]
 ...
 [  22   12  118 ...  331 1047    0]
 [1757 4101  451 ...    0    0    0]
 [3365 4392    6 ...    0    0    0]], shape=(32, 60), dtype=int64)
tf.Tensor([0 0 0 1 1 1 0 0 0 0 0 1 1 0 1 0 1 1 1 0 1 1 1 1 1 0 0 0 1 0 0 0], shape=(32,), dtype=int64)


In [56]:
embed_size = 128
model = keras.models.Sequential(
    [
        keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size, mask_zero=True, input_shape=[None]),
        keras.layers.GRU(128, return_sequences=True),
        keras.layers.GRU(128),
        keras.layers.Dense(1, 'sigmoid')
    ]
)
model.compile('adam', 'binary_crossentropy', ['accuracy'])
# history = model.fit(train_set, epochs=5)
history = model.fit(train_set, epochs=2)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


또는 직접 마스킹을 한다:

In [66]:
# Same architecture built with the functional API, passing the mask to the GRU layers explicitly.
K = keras.backend
embed_size = 128
inputs = keras.layers.Input([None])
# Boolean mask that is False wherever the token ID is 0.
mask = keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)
# No mask_zero here: the mask computed above is handed to each GRU layer instead.
z = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
z = keras.layers.GRU(128, return_sequences=True)(z, mask=mask)
z = keras.layers.GRU(128)(z, mask=mask)
outputs = keras.layers.Dense(1, 'sigmoid')(z)
model = keras.models.Model([inputs], [outputs])
model.compile('adam', 'binary_crossentropy', ['accuracy'])
# history = model.fit(train_set, epochs=5)
history = model.fit(train_set, epochs=2)

Epoch 1/2
Epoch 2/2
