In [1]:
import re
from konlpy.tag import Mecab

# Mecab 객체 생성
mecab = Mecab()

# 데이터 읽기
with open("korean-english-park.train.ko", "r", encoding="utf-8") as f:
    kor_lines = f.readlines()

with open("korean-english-park.train.en", "r", encoding="utf-8") as f:
    eng_lines = f.readlines()

# 한글 전처리
def preprocess_kor(text):
    text = re.sub(r"[^ㄱ-ㅎ가-힣0-9\s]", "", text)  # 한글, 숫자, 공백만 남김
    return mecab.morphs(text)  # 형태소 분석

# 영어 전처리
def preprocess_eng(text):
    text = re.sub(r"[^a-zA-Z0-9\s]", "", text.lower())  # 영어, 숫자, 공백만 남김
    return ["<start>"] + text.split() + ["<end>"]  # 토큰화


# 병렬 데이터 정제 및 중복 제거
cleaned_corpus = set()
for kor, eng in zip(kor_lines, eng_lines):
    kor_tokens = preprocess_kor(kor.strip())
    eng_tokens = preprocess_eng(eng.strip())
    if len(kor_tokens) <= 40 and len(eng_tokens) <= 40:
        cleaned_corpus.add((tuple(kor_tokens), tuple(eng_tokens)))

# 병렬 데이터 리스트 변환
cleaned_corpus = list(cleaned_corpus)

# 병렬 데이터 분리
kor_corpus = [" ".join(kor) for kor, _ in cleaned_corpus]
eng_corpus = [" ".join(eng) for _, eng in cleaned_corpus]

In [2]:
print("English:", eng_corpus[100])
print("Spanish:", kor_corpus[100])

English: <start> the environmental consequences would be regional but the social and economic consequences would be global <end>
Spanish: 환경 적 인 영향 은 지역 적 이 겠 지만 사회 및 경제 적 인 영향 은 전 세계 에 미치 게 될 겁니다


In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# 토큰화 함수
def tokenize(corpus, vocab_size=10000):
    tokenizer = Tokenizer(num_words=vocab_size, filters="", oov_token="<unk>")
    tokenizer.fit_on_texts(corpus)
    tensor = tokenizer.texts_to_sequences(corpus)
    tensor = pad_sequences(tensor, padding="post")  # 패딩 추가
    return tensor, tokenizer


# 데이터 토큰화
kor_tensor, kor_tokenizer = tokenize(kor_corpus, vocab_size=20000)
eng_tensor, eng_tokenizer = tokenize(eng_corpus, vocab_size=20000)

2025-01-11 09:53:22.924095: I tensorflow/core/util/port.cc:111] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-01-11 09:53:22.944596: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-11 09:53:22.944614: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-11 09:53:22.944627: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-11 09:53:22.948565: I tensorflow/core/platform/cpu_feature_g

In [4]:
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense


class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        score = self.V(tf.nn.tanh(
            self.W1(query_with_time_axis) + self.W2(values)))
        attention_weights = tf.nn.softmax(score, axis=1)
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)
        return context_vector, attention_weights


class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(enc_units, return_sequences=True, return_state=True)


    def call(self, x, initial_state):
        x = self.embedding(x)
        output, h, c = self.lstm(x, initial_state=initial_state)
        return output, h, c


class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.embedding = Embedding(vocab_size, embedding_dim)
        self.lstm = LSTM(dec_units, return_sequences=True, return_state=True)
        self.fc = Dense(vocab_size)
        self.attention = BahdanauAttention(dec_units)

    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(
            hidden[0], enc_output)
        x = self.embedding(x)
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)
        output, h, c = self.lstm(x)
        x = self.fc(output)
        return x, h, c, attention_weights

In [5]:
# 옵티마이저
optimizer = tf.keras.optimizers.Adam()

# 손실 함수 정의
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction="none")


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))  # 패딩 제외
    loss_ = loss_object(real, pred)
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask
    return tf.reduce_mean(loss_)

2025-01-11 09:54:52.071338: E tensorflow/compiler/xla/stream_executor/cuda/cuda_driver.cc:268] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2025-01-11 09:54:52.071349: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:168] retrieving CUDA diagnostic information for host: 11be54d0c487
2025-01-11 09:54:52.071351: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:175] hostname: 11be54d0c487
2025-01-11 09:54:52.071374: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:199] libcuda reported version is: 535.183.1
2025-01-11 09:54:52.071379: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:203] kernel reported version is: 535.183.1
2025-01-11 09:54:52.071380: I tensorflow/compiler/xla/stream_executor/cuda/cuda_diagnostics.cc:309] kernel version seems to match DSO: 535.183.1


In [6]:
embedding_dim = 256
units = 512
batch_size = 64
vocab_inp_size = len(kor_tokenizer.word_index) + 1
vocab_tar_size = len(eng_tokenizer.word_index) + 1

# 모델 초기화
encoder = Encoder(vocab_inp_size, embedding_dim, units, batch_size)
decoder = Decoder(vocab_tar_size, embedding_dim, units, batch_size)

In [7]:
# 훈련 루프
import tensorflow as tf


@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0

    with tf.GradientTape() as tape:
        enc_output, enc_hidden_h, enc_hidden_c = encoder(inp, enc_hidden)
        dec_hidden = [enc_hidden_h, enc_hidden_c]
        dec_input = tf.expand_dims(
            [eng_tokenizer.word_index['<start>']] * batch_size, 1)

        for t in range(1, targ.shape[1]):
            predictions, dec_hidden_h, dec_hidden_c, _ = decoder(
                dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)

            dec_input = tf.expand_dims(targ[:, t], 1)

        batch_loss = loss / int(targ.shape[1])
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss

In [8]:
from sklearn.model_selection import train_test_split

# 훈련 데이터 분리
dataset = tf.data.Dataset.from_tensor_slices((kor_tensor, eng_tensor))
dataset = dataset.shuffle(len(kor_tensor)).batch(
    batch_size, drop_remainder=True)

# 에포크 설정
EPOCHS = 10

for epoch in range(EPOCHS):
    total_loss = 0
    enc_hidden = [tf.zeros((batch_size, units)) for _ in range(2)]

    for (batch, (inp, targ)) in enumerate(dataset):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 10 == 0:
            print(f'Epoch {epoch + 1} Batch {batch} Loss {batch_loss.numpy()}')

    print(f'Epoch {epoch + 1} Loss {total_loss / batch:.4f}')

Epoch 1 Batch 0 Loss 5.395641803741455
Epoch 1 Batch 10 Loss 4.617795467376709
Epoch 1 Batch 20 Loss 3.3545758724212646
Epoch 1 Batch 30 Loss 3.4984796047210693
Epoch 1 Batch 40 Loss 3.392894744873047
Epoch 1 Batch 50 Loss 3.289454698562622
Epoch 1 Batch 60 Loss 3.0413544178009033
Epoch 1 Batch 70 Loss 3.175771713256836
Epoch 1 Batch 80 Loss 3.5031380653381348
Epoch 1 Batch 90 Loss 3.1396377086639404
Epoch 1 Batch 100 Loss 3.352759599685669
Epoch 1 Batch 110 Loss 3.335084915161133
Epoch 1 Batch 120 Loss 3.2961952686309814
Epoch 1 Batch 130 Loss 3.4688851833343506
Epoch 1 Batch 140 Loss 3.2198104858398438
Epoch 1 Batch 150 Loss 3.514211654663086
Epoch 1 Batch 160 Loss 2.956819534301758
Epoch 1 Batch 170 Loss 2.9151599407196045
Epoch 1 Batch 180 Loss 3.126091241836548
Epoch 1 Batch 190 Loss 3.133451461791992
Epoch 1 Batch 200 Loss 3.436023712158203
Epoch 1 Batch 210 Loss 3.1658456325531006
Epoch 1 Batch 220 Loss 3.500621795654297
Epoch 1 Batch 230 Loss 3.0166022777557373
Epoch 1 Batch 24

In [9]:
def evaluate(sentence):
    sentence = preprocess_kor(sentence)
    inputs = [kor_tokenizer.word_index.get(
        word, kor_tokenizer.word_index['<unk>']) for word in sentence]
    inputs = pad_sequences(
        [inputs], maxlen=kor_tensor.shape[1], padding='post')
    inputs = tf.convert_to_tensor(inputs)

    result = []
    hidden = tf.zeros((1, 512))  # 초기 hidden state
    enc_output, enc_hidden_h, enc_hidden_c = encoder(inputs, [hidden, hidden])

    dec_input = tf.expand_dims([eng_tokenizer.word_index['<start>']], 0)

    for t in range(40):  # 최대 번역 길이
        predictions, dec_hidden_h, dec_hidden_c, _ = decoder(
            dec_input, [enc_hidden_h, enc_hidden_c], enc_output)
        predicted_id = tf.argmax(predictions[0][0]).numpy()
        if eng_tokenizer.index_word[predicted_id] == '<end>':
            break
        result.append(eng_tokenizer.index_word[predicted_id])
        dec_input = tf.expand_dims([predicted_id], 0)

    return " ".join(result)


# 평가
test_sentences = [
    "오바마는 대통령이다.",
    "시민들은 도시 속에 산다.",
    "커피는 필요 없다.",
    "일곱 명의 사망자가 발생했다."
]

for sentence in test_sentences:
    print(f"Input: {sentence}")
    print(f"Output: {evaluate(sentence)}")

Input: 오바마는 대통령이다.
Output: obama has been a candidate
Input: 시민들은 도시 속에 산다.
Output: the <unk>
Input: 커피는 필요 없다.
Output: its not going to be a lot of the world
Input: 일곱 명의 사망자가 발생했다.
Output: the us military service
