In [2]:
import pandas as pd
import numpy as np
import glob, os, re, jieba
import time
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import np_utils
from keras.models import Model, save_model, load_model
from keras.layers import Input, LSTM, Dense, Flatten
from keras.callbacks import EarlyStopping
import json

In [3]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 1089907746353116705
xla_global_id: -1
]


In [4]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.experimental.set_memory_growth(gpus[0], True)
        print('GPU Running')
    except RuntimeError as e:
        # 프로그램 시작시에 메모리 증가가 설정되어야만 합니다
        (e)

In [5]:
def preprocess_kr(w):
    w = re.sub(r"([?'!¿\-·\"])", r" \1 ", w)
    w = re.sub(r'[ |ㄱ-ㅎ|ㅏ-ㅣ]+', r" ", w)
    w = re.sub(r"\,(?=[0-9])", r"", w)
    w = w[:-1].strip()
    w = '<start> ' + w + ' <end>'
    return w  

In [6]:
# JSON 파일을 읽고 필요한 데이터를 추출하는 함수
def extract_data_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    standard_forms = []
    dialect_forms = []
    
    for utterance in data['utterance']:
        standard_forms.append(utterance['standard_form'])
        dialect_forms.append(utterance['dialect_form'])
    
    return standard_forms, dialect_forms

In [7]:
# 표준어와 제주어 데이터를 전처리하고 크기를 제한하는 함수
def preprocess(path, num_data):
    files = glob.glob(os.path.join(path, '*.json'))
    std, jej = [], []  # 빈 리스트 초기화
    
    for f in files:
        std_forms, dial_forms = extract_data_from_json(f)
        std.extend(std_forms)
        jej.extend(dial_forms)
    
    std_series = pd.Series(std)
    jej_series = pd.Series(jej)
    
    df = pd.concat([std_series, jej_series], axis=1)
    df.columns = ['표준어', '제주어']
    
    df['표준어'] = df['표준어'].apply(preprocess_kr)
    df['제주어'] = df['제주어'].apply(preprocess_kr)
    
    df = df.sample(num_data, random_state=2)
    
    return df

In [8]:
def tokenize(texts):
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(texts)
    tensor = tokenizer.texts_to_sequences(texts)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    return tensor, tokenizer

In [9]:
# 데이터셋을 토크나이즈하고 텐서로 변환하는 함수
def tokenize_dataset(path, num_data):
    df = preprocess(path, num_data)
    
    std_tensor, std_tokenizer = tokenize(df['표준어'].values)
    jej_tensor, jej_tokenizer = tokenize(df['제주어'].values)
    
    return std_tensor, jej_tensor, std_tokenizer, jej_tokenizer

In [10]:
# 설정한 경로와 데이터 크기 제한
num_data = 3000  # 이 이상이면 OOM error 발생...
path = os.getcwd() + '\\Training'

# 약 5분 소요
std_tensor, jej_tensor, std_lang, jej_lang = tokenize_dataset(path, num_data)

# 입력 텐서와 타겟 텐서의 최대 길이 계산
max_length_std = std_tensor.shape[1]
max_length_jej = jej_tensor.shape[1]

print('표준어 tensor 최장 길이 : {}'.format(max_length_std))
print('제주어 tensor 최장 길이 : {}'.format(max_length_jej))

표준어 tensor 최장 길이 : 26
제주어 tensor 최장 길이 : 26


In [11]:
# 표준어와 제주어 텐서를 훈련 데이터셋과 검증 데이터셋으로 나누기
std_tensor_train, std_tensor_val, jej_tensor_train, jej_tensor_val = train_test_split(std_tensor, jej_tensor, test_size=0.2)

print('훈련 데이터셋 크기 : {}, {}'.format(len(std_tensor_train), len(jej_tensor_train)))
print('검증 데이터셋 크기 : {}, {}'.format(len(std_tensor_val), len(jej_tensor_val)))

훈련 데이터셋 크기 : 2400, 2400
검증 데이터셋 크기 : 600, 600


In [12]:
def convert(tokenizer, tensor):
    for t in tensor:
        if t != 0:
            print("%12d ----> %s" % (t, tokenizer.index_word[t]))

print('표준어 index ----> token')
convert(std_lang, std_tensor_train[0])
print()
print('제주어 index ----> token')
convert(jej_lang, jej_tensor_train[0])


표준어 index ----> token
           1 ----> <start>
         146 ----> 삼
        4305 ----> 사일전에
        4306 ----> 시내에서
        1282 ----> 카
           2 ----> <end>

제주어 index ----> token
           1 ----> <start>
         124 ----> 삼
        4497 ----> 사일전에
        4498 ----> 시내에서
        1261 ----> 카
           2 ----> <end>


In [13]:
BUFFER_SIZE = len(std_tensor_train)
BATCH_SIZE = 4  # Out of Memory 에러 주의
steps_per_epoch = len(std_tensor_train) // BATCH_SIZE
embedding_size = 256
units = 1024

vocab_input_size = len(std_lang.word_index) + 1
vocab_target_size = len(jej_lang.word_index) + 1

print('표준어 토큰 개수 : {}'.format(vocab_input_size))
print('제주어 토큰 개수 : {}'.format(vocab_target_size))


표준어 토큰 개수 : 6610
제주어 토큰 개수 : 6960


In [14]:
dataset = tf.data.Dataset.from_tensor_slices((std_tensor_train, jej_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)

In [15]:
example_input_batch, example_target_batch = next(iter(dataset))

print(example_input_batch.shape, example_target_batch.shape)

(4, 26) (4, 26)


In [16]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, enc_units, batch_size):
        super(Encoder, self).__init__()
        self.batch_size = batch_size
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')

    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state=hidden)
        return output, state

    def initialize_hidden_state(self):
        return tf.zeros((self.batch_size, self.enc_units))

In [17]:
encoder = Encoder(vocab_input_size, embedding_size, units, BATCH_SIZE)

#샘플 입력
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

print(f'Encoder output (batch size, sequence length, units) = {sample_output.shape}')   
print(f'Encoder Hidden state  (batch size, units) = {sample_hidden.shape}')   

Encoder output (batch size, sequence length, units) = (4, 26, 1024)
Encoder Hidden state  (batch size, units) = (4, 1024)


In [18]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, query, values):
        #query hidden state는 (batch_size, hidden_size)로 구성
        #query_with_time_axis는 (batch_size, 1, hidden_size)로 구성
        #values는 (batch_size, max_len, hidden_size)로 구성
        query_with_time_axis = tf.expand_dims(query, 1)

        #score는 (batch_size, max_len, units)로 구성
        #score를 self.V에 적용하기 때문에 마지막 축에 1을 얻어 (batch_size, max_len, 1)로 구성되게 됨
        score = self.V(tf.nn.tanh(self.W1(values) + self.W2(query_with_time_axis)))

        #attention_weights는 (batch_size, max_len, 1)로 구성
        attention_weights = tf.nn.softmax(score, axis=1)

        #병합 이후 context_vector는 (batch_size, hidden_size)로 구성
        context_vector = attention_weights * values
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

In [19]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_output)

print(f"Attention result shape: (batch size, units) {attention_result.shape}")
print(f"Attention weights shape: (batch_size, sequence_length, 1) {attention_weights.shape}")

Attention result shape: (batch size, units) (4, 1024)
Attention weights shape: (batch_size, sequence_length, 1) (4, 26, 1)


In [20]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_size):
        super(Decoder, self).__init__()
        self.batch_size = batch_size
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.dec_units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        self.attention = BahdanauAttention(self.dec_units)

    def call(self, x, hidden, enc_output):    # 단어 하나하나 해석 진행 
        #            hidden (batch_size, units),    enc_output (batch_size, max_length_inp, enc_units)
        # =>context_vector (batch_size, enc_units), attention_weights (batch_size, max_length_inp, 1)
        context_vector, attention_weights = self.attention(hidden, enc_output)

        # 임베딩 층 통과 후 x는 (batch_size, 1, embedding_dim)로 구성
        x = self.embedding(x)
        
        #context vector과 임베딩 결과를 결합한 후 x는 (batch_size, 1, embedding_dim+hidden_size)로 구성
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        #위에서 결합된 벡터를 GRU에 전달
        output, state = self.gru(x)
        
        #output은 (batch_size*1, hidden_size)로 구성
        output = tf.reshape(output, (-1, output.shape[2]))
        
        #FC(완전연결층)을 지난 x는 (batch_size, vocab)으로 구성
        x = self.fc(output)

        return x, state, attention_weights

In [21]:
decoder = Decoder(vocab_target_size, embedding_size, units, BATCH_SIZE)

sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE, 1)), sample_hidden, sample_output)

print(f'Decoder output shape: (batch_size, vocab size) {sample_decoder_output.shape}')

Decoder output shape: (batch_size, vocab size) (4, 6960)


In [22]:
optimizer = tf.keras.optimizers.Adam()
loss_objects = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,
                                                             reduction='none')


def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = loss_objects(real, pred)

    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(loss_)

In [23]:
#체크포인트(객체 기반 저장)
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "cpkt")

checkpoint = tf.train.Checkpoint(optimizer=optimizer, 
                                 encoder=encoder,
                                 decoder=decoder)

manager = tf.train.CheckpointManager(checkpoint, directory=checkpoint_dir,
                                     checkpoint_name='model.ckpt',
                                     max_to_keep=3)

In [24]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([jej_lang.word_index['<start>']] * BATCH_SIZE, 1)

        for t in range(1, targ.shape[1]):
            # enc_output을 디코더에 전달
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:, t], predictions)
            dec_input = tf.expand_dims(targ[:, t], 1)  # teacher forcing

    batch_loss = (loss / int(targ.shape[1]))
    variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(gradients, variables))

    return batch_loss


In [25]:

EPOCHS = 50

for epoch in range(EPOCHS):
    start = time.time()

    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0

    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss

        if batch % 100 == 0:
            print("Epoch {} Batch {} Loss {:.4f}".format(epoch+1, batch, batch_loss.numpy()))

    print('Epoch {} Loss {:.4f}'.format(epoch+1, total_loss / steps_per_epoch))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
    
    checkpoint.save(file_prefix=checkpoint_prefix)
    manager.save()


Epoch 1 Batch 0 Loss 2.0419
Epoch 1 Batch 100 Loss 0.9824
Epoch 1 Batch 200 Loss 1.6082
Epoch 1 Batch 300 Loss 2.3268
Epoch 1 Batch 400 Loss 1.2970
Epoch 1 Batch 500 Loss 1.6157
Epoch 1 Loss 1.7871
Time taken for 1 epoch 389.8684298992157 sec

Epoch 2 Batch 0 Loss 1.5070
Epoch 2 Batch 100 Loss 2.6486
Epoch 2 Batch 200 Loss 0.9572
Epoch 2 Batch 300 Loss 1.5941
Epoch 2 Batch 400 Loss 0.6741
Epoch 2 Batch 500 Loss 2.1042
Epoch 2 Loss 1.6007
Time taken for 1 epoch 372.1045353412628 sec

Epoch 3 Batch 0 Loss 1.9647
Epoch 3 Batch 100 Loss 1.3118
Epoch 3 Batch 200 Loss 0.9728
Epoch 3 Batch 300 Loss 1.8008
Epoch 3 Batch 400 Loss 0.8979
Epoch 3 Batch 500 Loss 1.4724
Epoch 3 Loss 1.4097
Time taken for 1 epoch 375.95343351364136 sec

Epoch 4 Batch 0 Loss 0.5231
Epoch 4 Batch 100 Loss 1.2778
Epoch 4 Batch 200 Loss 0.6188
Epoch 4 Batch 300 Loss 1.5558
Epoch 4 Batch 400 Loss 1.3813
Epoch 4 Batch 500 Loss 1.4697
Epoch 4 Loss 1.2222
Time taken for 1 epoch 377.79545307159424 sec

Epoch 5 Batch 0 Loss 1

In [None]:
def evaluate(sentence):
    attention_plot = np.zeros((max_length_jej, max_length_std))

    sentence = preprocess_kr(sentence)
    
    inputs = [std_lang.word_index.get(i, 0) for i in sentence.split(' ')]
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs], maxlen=max_length_std, padding='post')
    
    inputs = tf.convert_to_tensor(inputs)

    result = ''
    
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)

    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([jej_lang.word_index['<start>']], 0)
    
    for t in range(max_length_jej):
        predictions, dec_hidden, attention_weights = decoder(
            dec_input, dec_hidden, enc_out)
        
        # 나중에 attention 가중치를 시각화하기 위해 저장해두기
        attention_weights = tf.reshape(attention_weights, (-1, ))
        attention_plot[t] = attention_weights.numpy()
        
        predicted_id = tf.argmax(predictions[0]).numpy()
        
        result += jej_lang.index_word.get(predicted_id, '') + ' '

        if jej_lang.index_word.get(predicted_id) == '<end>':
            return result, sentence, attention_plot

        # 예측된 id를 모델에 다시 feeding
        dec_input = tf.expand_dims([predicted_id], 0)

    return result, sentence, attention_plot


In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

def plot_attention(attention, sentence, predicted_sentence):
    fig = plt.figure(figsize=(20, 20))
    ax = fig.add_subplot(1, 1, 1)
    cax = ax.matshow(attention, cmap='viridis')

    fontdict = {'fontsize': 16}
    ax.set_xticklabels([''] + sentence.split(' '), fontdict=fontdict, rotation=90)
    ax.set_yticklabels([''] + predicted_sentence.split(' '), fontdict=fontdict)
    
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    fig.colorbar(cax)
    plt.show()


In [None]:
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)

    print("Input : %s" % (sentence))
    print("Translation : {}".format(result))
    


In [None]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x286daa13fc8>

In [None]:
val_df = pd.read_csv('val_df.csv', index_col=0)
val_sample = val_df.sample(10)
val_sample

Unnamed: 0,표준어,제주어
253834,{laughing},{laughing}
182268,그러니까 여기는,그러니까 여기는
153813,응.,응.
319043,그게 큰일이 아니고 피임 같은 거 없었지 않았잖아,그게 큰일이 아니고 피임 같은 거 없었지 않안게이
243571,걔네도 귤 따서,가이네 미깡 타네
82106,그냥 맞아 걍 걍 대한항공 아무 데나 이렇게 띡 누르고,그냥 맞아 걍 걍 대한항공 아무 데나 이렇게 띡 누르고
77858,누구?,누구?
250733,오빠 봐라 (()),오라방 보라 (())
162596,돈도 안 쓰고 밥하고 깨끗하게 청소하고 이런 거는 하는데,돈도 안 쓰고 밥하고 깨끗하게 청소하고 이런 거는 허는디
30430,거기는 진짜 개 맛 있어 껍데기도 서비스로 주고,거기는 진짜 개 맛 인 껍데기도 서비스로 주고


In [None]:
for idx, i in enumerate(val_sample['표준어'].values):
    try:
        translate(u'{}'.format(i))
        print("Intended Output : %s" % (val_sample.iloc[idx,1]))
        print(" ")
    except:
        print(i, '=> 데이터셋에 없는 단어 포함')
        print(" ")

Input : <start> {laughing <end>
Translation : {laughing <end> 
Intended Output : {laughing}
 
Input : <start> 그러니까 여기 <end>
Translation : 종류가 여기 <end> 
Intended Output : 그러니까 여기는
 
Input : <start> 응 <end>
Translation : 응 <end> 
Intended Output : 응.
 
Input : <start> 그게 큰일이 아니고 피임 같은 거 없었지 않았잖 <end>
Translation : 그게 <end> 
Intended Output : 그게 큰일이 아니고 피임 같은 거 없었지 않안게이
 
Input : <start> 걔네도 귤 따 <end>
Translation : <end> 
Intended Output : 가이네 미깡 타네
 
Input : <start> 그냥 맞아 걍 걍 대한항공 아무 데나 이렇게 띡 누르 <end>
Translation : 그냥 가이 그냥 (()) 멜 이게 이게 <end> 
Intended Output : 그냥 맞아 걍 걍 대한항공 아무 데나 이렇게 띡 누르고
 
Input : <start> 누구 ? <end>
Translation : 누구 ? <end> 
Intended Output : 누구?
 
Input : <start> 오빠 봐라 (() <end>
Translation : 바닥이 지나서 학교를 (() <end> 
Intended Output : 오라방 보라 (())
 
Input : <start> 돈도 안 쓰고 밥하고 깨끗하게 청소하고 이런 거는 하는 <end>
Translation : 해녀가 안 오는 <end> 
Intended Output : 돈도 안 쓰고 밥하고 깨끗하게 청소하고 이런 거는 허는디
 
Input : <start> 거기는 진짜 개 맛 있어 껍데기도 서비스로 주 <end>
Translation : 그디는 진짜 개라그네 개라그네 안가고 있 <en