In [159]:
import pandas as pd
import numpy as np
import glob, os, re, jieba
import time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

from keras.utils import np_utils
from keras.models import Model, save_model, load_model
from keras.layers import Input, LSTM, Dense, Flatten
from keras.callbacks import EarlyStopping
import json

In [160]:
def preprocess_kr(w):
    w = re.sub(r"([?'!¿\-·\"])", r" \1 ", w)
    w = re.sub(r'[ |ㄱ-ㅎ|ㅏ-ㅣ]+', r" ", w)
    w = re.sub(r"\,(?=[0-9])", r"", w)
    w = w[:-1].strip()
    w = '<start> ' + w + ' <end>'
    return w  

In [161]:
# JSON 파일을 읽고 필요한 데이터를 추출하는 함수
def extract_data_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    standard_forms = []
    dialect_forms = []
    
    for utterance in data['utterance']:
        standard_forms.append(utterance['standard_form'])
        dialect_forms.append(utterance['dialect_form'])
    
    return standard_forms, dialect_forms

In [162]:
# 표준어와 제주어 데이터를 전처리하고 크기를 제한하는 함수
def preprocess(path, num_data):
    files = glob.glob(os.path.join(path, '*.json'))
    std, jej = [], []  # 빈 리스트 초기화
    
    for f in files:
        std_forms, dial_forms = extract_data_from_json(f)
        std.extend(std_forms)
        jej.extend(dial_forms)
    
    std_series = pd.Series(std)
    jej_series = pd.Series(jej)
    
    df = pd.concat([std_series, jej_series], axis=1)
    df.columns = ['표준어', '제주어']
    
    df['표준어'] = df['표준어'].apply(preprocess_kr)
    df['제주어'] = df['제주어'].apply(preprocess_kr)
    
    df = df.sample(num_data, random_state=2)
    
    return df

In [163]:
def tokenize(train_df):
    std_vocab, jej_vocab = set(), set()

    for line in train_df['표준어']:
        for c in line:
            std_vocab.add(c)

    for line in train_df['제주어']:
        for c in line:
            jej_vocab.add(c)
    
    std_vocab.add('<start>')
    std_vocab.add('<end>')
    jej_vocab.add('<start>')
    jej_vocab.add('<end>')
    
    std_vocab_size = len(std_vocab) + 1
    jej_vocab_size = len(jej_vocab) + 1
    
    std_vocab = sorted(list(std_vocab))
    jej_vocab = sorted(list(jej_vocab))
    
    std_to_index = {c: i+1 for i, c in enumerate(std_vocab)}
    jej_to_index = {c: i+1 for i, c in enumerate(jej_vocab)}
    
    encoder_input = []
    for line in train_df['표준어']:
        encoded_line = [std_to_index[c] for c in line]
        encoder_input.append(encoded_line)
        
    decoder_input = []
    for line in train_df['제주어']:
        encoded_line = [jej_to_index[c] for c in line]
        decoder_input.append(encoded_line)
        
    decoder_target = []
    for line in train_df['제주어']:
        encoded_line = [jej_to_index[c] for c in line[1:]]
        decoder_target.append(encoded_line)
    
    max_len_std = max(len(seq) for seq in encoder_input)
    max_len_jej = max(len(seq) for seq in decoder_input)
    
    encoder_input = pad_sequences(encoder_input, maxlen=max_len_std, padding='post')
    decoder_input = pad_sequences(decoder_input, maxlen=max_len_jej, padding='post')
    decoder_target = pad_sequences(decoder_target, maxlen=max_len_jej, padding='post')
    
    encoder_input = to_categorical(encoder_input, num_classes=std_vocab_size)
    decoder_input = to_categorical(decoder_input, num_classes=jej_vocab_size)
    decoder_target = to_categorical(decoder_target, num_classes=jej_vocab_size)
    
    return encoder_input, decoder_input, decoder_target, std_vocab_size, jej_vocab_size, std_to_index, jej_to_index


In [164]:
train_df = pd.read_csv('./train_df.csv')

In [165]:
df = train_df[:10000]

encoder_input, decoder_input, decoder_ko, std_vocab_size, jej_vocab_size, std_to_index, jej_to_index = tokenize(df)

# 표준어 인코딩
tmp_dict = dict((i, c) for c, i in std_to_index.items())

for i in tmp_dict:
    try:
        tmp_dict[i] = tmp_dict[i].encode('utf-8')  # 'EUC_CN'을 'utf-8'로 변경
    except:
        pass
    
index_to_std = dict((i, c) for c, i in tmp_dict.items())


In [171]:
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.models import Model

# 트레이닝 시 이전 상태의 실제값을 현재 상태의 디코더 입력으로 해야 함 (예측값으로 하면 안 됨)
encoder_inputs = Input(shape=(None, std_vocab_size), name='encoder_input')
decoder_inputs = Input(shape=(None, jej_vocab_size), name='decoder_input')

# 인코더 LSTM 셀
encoderLSTM = LSTM(units=512, return_state=True, name='encoderLSTM')  # return_state : 인코더의 마지막 상태 정보를 디코더의 입력 상태 정보로 전달
decoderLSTM = LSTM(units=512, return_sequences=True, return_state=True, name='decoderLSTM')

# 인코더 LSTM셀의 입력 정의
encoder_outputs, stateH, stateC = encoderLSTM(encoder_inputs)  # _, 히든 상태(위), 셀 상태(오른쪽)
encoder_state = [stateH, stateC]  # 컨텍스트 벡터

decoder_output, _, _ = decoderLSTM(decoder_inputs, initial_state=encoder_state)
decoder_softmax = Dense(jej_vocab_size, activation="softmax")
decoder_output = decoder_softmax(decoder_output)

model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_output)

model.summary()


Model: "model_26"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_input (InputLayer)     [(None, None, 1199)  0           []                               
                                ]                                                                 
                                                                                                  
 decoder_input (InputLayer)     [(None, None, 1246)  0           []                               
                                ]                                                                 
                                                                                                  
 encoderLSTM (LSTM)             [(None, 512),        3506176     ['encoder_input[0][0]']          
                                 (None, 512),                                              

In [172]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.models import save_model

# 모델 컴파일
model.compile(optimizer="adam", loss="categorical_crossentropy")

# 모델 훈련
history = model.fit(
    x=[encoder_input, decoder_input],
    y=decoder_ko,
    batch_size=128,
    epochs=50,
    validation_split=0.2,
)

# 모델 저장
save_model(model, 'std_to_jej.h5', overwrite=True)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [179]:
from tensorflow.keras.models import load_model

# 데이터프레임을 2500개씩 분할하여 모델을 훈련
for i in range(1, len(train_df) // 5000):
    df = train_df[i * 5000:(i + 1) * 5000]
    
    encoder_input, decoder_input, decoder_ko, std_vocab_size, jej_vocab_size = tokenize(df)

    model = load_model('std_to_jej.h5')

    early_stopping = EarlyStopping(monitor='val_loss', patience=3)
    model.fit(x=[encoder_input, decoder_input], y=decoder_ko, batch_size=64, epochs=3, validation_split=0.2, callbacks=[early_stopping])
    save_model(model, 'std_to_jej.h5', overwrite=True)


NameError: name 'tokenize_dataset' is not defined

In [None]:
# 인코더 모델 정의
encoder_model = Model(inputs=encoder_inputs, outputs=encoder_state)

# 인덱스와 단어의 매핑을 변경
std_to_index = dict((i, c) for c, i in std_to_index.items())
jej_to_index = dict((i, c) for c, i in jej_to_index.items())


In [None]:
encoder_model.summary()


Model: "model_22"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 encoder_input (InputLayer)  [(None, None, 13)]        0         
                                                                 
 encoderLSTM (LSTM)          [(None, 1024),            4251648   
                              (None, 1024),                      
                              (None, 1024)]                      
                                                                 
Total params: 4,251,648
Trainable params: 4,251,648
Non-trainable params: 0
_________________________________________________________________


In [None]:
# 디코더
decoder_state_input_hidden = Input(shape=(1024,))
decoder_state_input_cell = Input(shape=(1024,))
decoder_state_input = [decoder_state_input_hidden, decoder_state_input_cell]

decoder_output, state_hidden, state_cell = decoderLSTM(decoder_inputs, initial_state = decoder_state_input)
decoder_state = [state_hidden, state_cell]
decoder_outputs = decoder_softmax(decoder_output)

decoder_model = Model(inputs=[decoder_inputs]+decoder_state_input, outputs=[decoder_output]+decoder_state)


In [None]:
decoder_model.summary()

Model: "model_23"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 decoder_input (InputLayer)     [(None, None, 13)]   0           []                               
                                                                                                  
 input_17 (InputLayer)          [(None, 1024)]       0           []                               
                                                                                                  
 input_18 (InputLayer)          [(None, 1024)]       0           []                               
                                                                                                  
 decoderLSTM (LSTM)             [(None, None, 1024)  4251648     ['decoder_input[0][0]',          
                                , (None, 1024),                   'input_17[0][0]',        

In [None]:
def decode_seq(input_seq): 
    state_value = encoder_model.predict(input_seq)
    print('encoder_model의 예상 state_value :', np.shape(state_value))
    
    target_seq = np.zeros((1, 1, jej_vocab_size))  #(1, 1, 제주어 어휘 크기)
    target_seq[0, 0, jej_to_index['<start>']] = 1  # 원핫인코딩
    
    stop = False
    decoded_sent = ""
    while not stop:  # "<end>" 문자를 만날 때까지 반복
        
        output, state_hidden, state_cell = decoder_model.predict([target_seq, state_value[0], state_value[1]])
        # 예측값을 제주어 문자로 변환
        token_index = np.argmax(output[0, -1, :]) 
        pred_char = index_to_jej[token_index]
        
        # 현시점 예측문자가 예측문장에 추가
        decoded_sent += pred_char
        
        if (pred_char == "<end>" or len(decoded_sent) > 373):
            stop = True
            
        # 현시점 예측결과가 다음 시점에 입력으로 
        target_seq = np.zeros((1, 1, jej_vocab_size))
        target_seq[0, 0, token_index] = 1
        
        # 현시점 상태를 다음 시점 상태로 사용
        state_value = [h, c]
    
    return decoded_sent  # 번역 결과


In [None]:
for seq_index in [1, 50, 100, 200, 300]:
    input_seq = encoder_input[seq_index:seq_index+1]  # (1, 117, 표준어 어휘 크기)
    decoded_seq = decode_seq(input_seq)
    
    print("입력문장:", train_df['표준어'][seq_index])
    print("정답:", train_df['제주어'][seq_index][1:len(train_df['제주어'][seq_index])-1])  # "<start>", "<end>" 제거
    print("번역기:", decoded_seq[:len(decoded_seq)-1])
    print("\n")


encoder_model의 예상 state_value : (2, 1, 1024)


ValueError: in user code:

    File "c:\anaconda3\envs\trans\lib\site-packages\keras\engine\training.py", line 2137, in predict_function  *
        return step_function(self, iterator)
    File "c:\anaconda3\envs\trans\lib\site-packages\keras\engine\training.py", line 2123, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\anaconda3\envs\trans\lib\site-packages\keras\engine\training.py", line 2111, in run_step  **
        outputs = model.predict_step(data)
    File "c:\anaconda3\envs\trans\lib\site-packages\keras\engine\training.py", line 2079, in predict_step
        return self(x, training=False)
    File "c:\anaconda3\envs\trans\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "c:\anaconda3\envs\trans\lib\site-packages\keras\engine\input_spec.py", line 217, in assert_input_compatibility
        f'Layer "{layer_name}" expects {len(input_spec)} input(s),'

    ValueError: Exception encountered when calling layer 'model_23' (type Functional).
    
    Layer "decoderLSTM" expects 3 input(s), but it received 1 input tensors. Inputs received: [<tf.Tensor 'IteratorGetNext:0' shape=(None, 1, 13) dtype=float32>]
    
    Call arguments received by layer 'model_23' (type Functional):
      • inputs=('tf.Tensor(shape=(None, 1, 13), dtype=float32)', 'tf.Tensor(shape=(None, 1024), dtype=float32)', 'tf.Tensor(shape=(None, 1024), dtype=float32)')
      • training=False
      • mask=None
