# 라이브러리

In [None]:
# 파이썬 표준 라이브러리
import random
from pathlib import Path

# 파이썬 서드파티 라이브러리
import numpy as np
import tensorflow as tf

# 클래스

# 함수

In [None]:
# 영어와 스페인어 데이터셋을 다운로드하는 함수
def download_en_es() -> tuple[list[str], list[str]]:
    """Download the English-Spanish sentence pairs and return them shuffled.

    The ``spa-eng.zip`` archive is fetched and extracted with
    ``tf.keras.utils.get_file`` (using ``cache_dir="datasets"``), the inverted
    Spanish punctuation marks ``¡`` and ``¿`` are removed, and the
    tab-separated sentence pairs are shuffled with ``np.random.shuffle``, so
    the resulting order depends on NumPy's global random state.

    Returns:
        ``(sentences_en, sentences_es)``: two lists of equal length in which
        index ``i`` of each list holds the two halves of the same pair.
    """
    url = "https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip"
    path = tf.keras.utils.get_file(
        "spa-eng.zip", origin=url, cache_dir="datasets", extract=True)
    # The corpus contains accented characters; decode it explicitly as UTF-8
    # rather than relying on the platform's default text encoding.
    text = (Path(path).with_name("spa-eng") / "spa.txt").read_text(encoding="utf-8")

    text = text.replace("¡", "").replace("¿", "")
    pairs = [line.split("\t") for line in text.splitlines()]
    np.random.shuffle(pairs)  # in-place shuffle of the pair list
    sentences_en, sentences_es = zip(*pairs)
    # zip() yields tuples; convert so the result matches the declared type.
    return list(sentences_en), list(sentences_es)

In [None]:
# 영어를 스페인어로 번역하는 함수
def translate(sentence_en: str) -> str:
    """Translate an English sentence into Spanish, one word at a time.

    At each step the model is run on the English sentence plus the Spanish
    words produced so far (prefixed with ``"startofseq "``); the most probable
    word at the current position is appended. Decoding stops when the model
    predicts ``"endofseq"`` or after ``max_length`` steps.

    Reads the module-level ``model``, ``text_vec_layer_es`` and ``max_length``.

    Args:
        sentence_en: English text to translate.

    Returns:
        The predicted Spanish words separated by single spaces, without the
        start/end markers.
    """
    # Loop invariants: the encoder input and the id -> word table are the same
    # at every decoding step, so build them once instead of per iteration.
    X = np.array([sentence_en])  # encoder input
    vocabulary = text_vec_layer_es.get_vocabulary()

    translation = ""
    for word_idx in range(max_length):
        X_dec = np.array(["startofseq " + translation])  # decoder input
        # [0, word_idx]: the only batch item, output at the current position
        y_proba = model.predict((X, X_dec))[0, word_idx]
        predicted_word_id = np.argmax(y_proba)
        predicted_word = vocabulary[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()

# 변수

In [None]:
SEED = 1234
# Seed every random generator this notebook draws from, not only TensorFlow's:
# np.random.shuffle decides the order of the sentence pairs (and therefore the
# train/validation split) and random.choice picks the sentences to translate.
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 데이터 불러오기 및 텍스트 벡터화

## 영어와 스페인어 텍스트 데이터 불러오기

In [None]:
# Load the shuffled corpus; index i of both sequences belongs to the same pair.
sentences_en, sentences_es = download_en_es()

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


## 영어와 스페인어 텍스트를 벡터화

In [None]:
# Vocabulary cap and fixed output length, shared by both languages.
vocab_size, max_length = 1000, 50
# Markers placed around Spanish sentences.
start_token, end_token = "startofseq", "endofseq"

text_vec_layer_en = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)
text_vec_layer_es = tf.keras.layers.TextVectorization(
    max_tokens=vocab_size,
    output_sequence_length=max_length,
)

text_vec_layer_en.adapt(sentences_en)
# Adapt the Spanish layer on sentences wrapped in the start and end markers.
text_vec_layer_es.adapt(
    [" ".join((start_token, sentence, end_token)) for sentence in sentences_es]
)

# 모델

## 데이터셋 생성

In [None]:
# The first n sentence pairs are used for training, the remainder for validation.
n = 100000

# Encoder inputs: raw English sentences.
X_train = tf.constant(sentences_en[:n])
X_valid = tf.constant(sentences_en[n:])

# Decoder inputs: Spanish sentences preceded by the start marker.
X_train_dec = tf.constant([start_token + " " + es for es in sentences_es[:n]])
X_valid_dec = tf.constant([start_token + " " + es for es in sentences_es[n:]])

# Targets: the same Spanish sentences followed by the end marker instead,
# converted to token ids.
Y_train = text_vec_layer_es([es + " " + end_token for es in sentences_es[:n]])
Y_valid = text_vec_layer_es([es + " " + end_token for es in sentences_es[n:]])

## 모델 아키텍처

In [None]:
# Two scalar string inputs (one raw sentence per example): the first feeds the
# encoder, the second feeds the decoder.
encoder_inputs, decoder_inputs = [
    tf.keras.layers.Input(shape=[], dtype=tf.string) for _ in range(2)
]

In [None]:
embed_size = 128  # width of every token embedding

# Raw strings -> token ids, using the language-specific vectorizers.
encoder_input_ids = text_vec_layer_en(encoder_inputs)
decoder_input_ids = text_vec_layer_es(decoder_inputs)

# Separate embedding tables per language; mask_zero=True makes id 0 a masked value.
encoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embed_size,
    mask_zero=True,
)
decoder_embedding_layer = tf.keras.layers.Embedding(
    input_dim=vocab_size,
    output_dim=embed_size,
    mask_zero=True,
)

# Token ids -> embedding vectors.
encoder_embeddings = encoder_embedding_layer(encoder_input_ids)
decoder_embeddings = decoder_embedding_layer(decoder_input_ids)

In [None]:
# NOTE(review): these two constants duplicate earlier definitions in this file
# (same values); keep them in sync if either is changed.
max_length = 50
embed_size = 128
# Trainable positional embeddings: one embed_size-wide vector per position
# index below max_length. The same layer is used for encoder and decoder.
pos_embed_layer = tf.keras.layers.Embedding(input_dim=max_length, output_dim=embed_size)
# Size of axis 1 of the embeddings — assumes they are (batch, seq, embed), so
# this is the sequence length and the additions below broadcast over the
# batch axis. TODO confirm.
batch_max_len_enc = tf.shape(encoder_embeddings)[1]
encoder_in = encoder_embeddings + pos_embed_layer(tf.range(batch_max_len_enc))
batch_max_len_dec = tf.shape(decoder_embeddings)[1]
decoder_in = decoder_embeddings + pos_embed_layer(tf.range(batch_max_len_dec))

In [None]:
N = 2               # number of stacked blocks
num_heads = 8       # attention heads per attention layer
dropout_rate = 0.1
n_units = 128       # width of the first feed-forward Dense layer

# True wherever the English token id is not 0 (the id the embedding layers
# mask); a new axis is inserted at position 1.
encoder_pad_mask = tf.math.not_equal(encoder_input_ids, 0)[:, tf.newaxis]

Z = encoder_in
for _ in range(N):
    # Self-attention sub-layer with a residual connection and layer norm.
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads,
        key_dim=embed_size,
        dropout=dropout_rate,
    )
    Z = attn_layer(Z, value=Z, attention_mask=encoder_pad_mask)
    Z = tf.keras.layers.Add()([Z, skip])
    Z = tf.keras.layers.LayerNormalization()(Z)

    # Feed-forward sub-layer (two Dense layers + dropout), again followed by
    # a residual connection and layer norm.
    skip = Z
    Z = tf.keras.layers.Dense(
        units=n_units, activation="swish", kernel_initializer="he_normal"
    )(Z)
    Z = tf.keras.layers.Dense(
        units=embed_size, activation="swish", kernel_initializer="he_normal"
    )(Z)
    Z = tf.keras.layers.Dropout(rate=dropout_rate)(Z)
    Z = tf.keras.layers.Add()([Z, skip])
    Z = tf.keras.layers.LayerNormalization()(Z)

In [None]:
# True wherever the Spanish token id is not 0; a new axis is inserted at
# position 1.
decoder_pad_mask = tf.math.not_equal(decoder_input_ids, 0)[:, tf.newaxis]

# Square boolean matrix keeping only the lower triangle (diagonal included):
# band_part(..., -1, 0) keeps all sub-diagonals and no super-diagonals.
mask_shape = (batch_max_len_dec, batch_max_len_dec)
causal_mask = tf.linalg.band_part(tf.ones(mask_shape, tf.bool), -1, 0)

In [None]:
encoder_outputs = Z  # keep the encoder's final output for cross-attention
Z = decoder_in
for _ in range(N):
    # Masked self-attention: the causal mask and the padding mask are combined.
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    Z = attn_layer(Z, value=Z, attention_mask=causal_mask & decoder_pad_mask)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    # Cross-attention: decoder states attend to the encoder outputs.
    skip = Z
    attn_layer = tf.keras.layers.MultiHeadAttention(
        num_heads=num_heads, key_dim=embed_size, dropout=dropout_rate)
    Z = attn_layer(Z, value=encoder_outputs, attention_mask=encoder_pad_mask)
    # LayerNormalization, as in every other residual branch. The preprocessing
    # layer `Normalization` used here before is meant to be adapted to data
    # and is not a layer-norm.
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))
    # Feed-forward sub-layer with residual connection and layer norm.
    skip = Z
    Z = tf.keras.layers.Dense(units=n_units, activation="swish", kernel_initializer="he_normal")(Z)
    Z = tf.keras.layers.Dense(units=embed_size, activation="swish", kernel_initializer="he_normal")(Z)
    Z = tf.keras.layers.LayerNormalization()(tf.keras.layers.Add()([Z, skip]))

## 모델 훈련

In [None]:
# Output head: a softmax over the vocabulary applied to the decoder output.
Y_proba = tf.keras.layers.Dense(units=vocab_size, activation="softmax")(Z)

model = tf.keras.Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=[Y_proba],
)
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="nadam",
    metrics=["accuracy"],
)
# Train on (English, start-prefixed Spanish) inputs against the Spanish token
# ids, evaluating on the validation split after each epoch.
model.fit(
    (X_train, X_train_dec),
    Y_train,
    epochs=10,
    validation_data=((X_valid, X_valid_dec), Y_valid),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7c4bf39d8c40>

# 번역

In [None]:
# Spot-check the model on 20 randomly drawn English sentences.
for _ in range(20):
    # Drawn from the full corpus, so a sample may come from either split.
    input_sentence = random.choice(sentences_en)
    print("-")  # separator between samples
    print(input_sentence)
    print(translate(input_sentence))

-
Tom showed Mary his new tattoo.
tom le [UNK] su nuevo [UNK]
-
He was wrong in thinking that she'd come to see him.
Él estaba equivocado en el [UNK] [UNK] venir a [UNK]
-
I like white wine better than red wine.
me gusta el vino más [UNK] que el vino de vino
-
It's sweltering.
es [UNK]
-
The king once lived in that palace.
la [UNK] una vez [UNK] en esa [UNK]
-
That's pseudoscience.
eso es [UNK]
-
I don't have enough money to travel.
no tengo suficiente dinero para viajar
-
He isn't perfect.
Él no es [UNK]
-
Your second button is coming off.
tu [UNK] [UNK] está [UNK]
-
This is never going to end.
esto nunca va a terminar
-
He was killed by his own brother.
Él fue [UNK] por su propio hermano
-
Do Tom and Mary know?
tom y mary [UNK]
-
Everyone is friendly to her.
todos se [UNK] con ella
-
You came home at 5:00.
[UNK] a casa a las [UNK]
-
I've never been to Paris.
nunca he estado en parís
-
It was really cheap.
era realmente [UNK]
-
Keep the money in a safe place.
[UNK] el dinero en un lug