In [1]:
import pandas as pd
import os
import numpy as np
import tensorflow as tf

os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [2]:
data_path = "../sugo3/OneDrive/바탕 화면/AIHUB_dataset/"
print(os.listdir(data_path))

['1_구어체(1)_200226.xlsx', '1_구어체(2)_200226.xlsx', '2_대화체_200226.xlsx', '3_문어체_뉴스(1)_200226.zip', '3_문어체_뉴스(2)_200226.xlsx', '3_문어체_뉴스(3)_200226.xlsx', '3_문어체_뉴스(4)_200226.xlsx', '4_문어체_한국문화_200226.xlsx', '5_문어체_조례_200226.xlsx', '6_문어체_지자체웹사이트_200226.xlsx', '불용어.xlsx']


In [3]:
lines = pd.read_excel(data_path+"1_구어체(1)_200226.xlsx", sheets = 0)

In [4]:
lines = lines.loc[:,"src":"tar"]
lines = lines[0:60000]
lines.sample(5)

Unnamed: 0,src,tar
16023,귀사의 웹사이트에 나와있는 주소지로 방문하면 이미 제작되어진 상품을 볼 수 있습니까?,Can I see the products that have already been ...
53333,"나는 솔직히 그들이 3,000달러를 요구하는 것 자체가 이해가 안 가.",I honestly don't understand why they are deman...
49634,나는 네가 한 달에 얼마 버는지 궁금해.,I am curious about your monthly salary.
4266,6 학년은 초등학교에서 가장 높은 학년입니다.,The sixth grade is the highest grade in an ele...
4542,8위는 EXO 멤버였던 크리스입니다.,"The 8th place is Chris, a former member of EXO."


# EOS, SOS 추가

In [5]:
lines.tar = lines.tar.apply(lambda x : 'sos '+x+' eos')
lines.sample(5)

Unnamed: 0,src,tar
32907,그래서 계산이 되지 않았으니 다시 계산해줄게요.,"sos So the payment was not made, let me help y..."
29273,그는 원래 안경을 쓰고 다녀요.,sos He basically wears glasses. eos
53432,나는 순식간에 그들의 팬이 되어 버렸어.,sos I became their fan immediately. eos
29584,그는 자신이 결근하길 원합니다.,sos He wants to be absent. eos
48885,나는 너를 집까지 데려다줘야 했어.,sos I had to take you to your home. eos


# 토큰화 및 임베딩 진행

In [6]:
import re
import MeCab

mecab = MeCab.Tagger()

stopfeatures = ['SF', 'SE', 'SS', 'SP', 'SY', 'SN']
stopwords = pd.read_excel(data_path+"불용어.xlsx",sheets = 0)
stopwords = np.array(stopwords) 

def preprocessing(text):
    preprocessed_text=[]
    tokens = mecab.parseToNode(text)
    
    tokens = tokens.next
    
    while tokens:
        token = tokens.surface
        tokens_feature = tokens.feature.split(',')[0]
        
        if token in stopwords:
            tokens = tokens.next
            continue                 
        
        if tokens_feature in stopfeatures:
            tokens = tokens.next
            continue
            
        if(tokens_feature=='BOS/EOS'):
            break;
        else:
            #print(token, "\t", tokens_feature)            
            preprocessed_text.append(str(token))
            tokens = tokens.next        
        
    return preprocessed_text


In [7]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize

In [8]:
def kor_tokenize(line):
    kor = []
    for line in lines.src:
        tokens = preprocessing(line)
        kor.append(tokens)
    return kor

def eng_tokenize(line):
    eng = []
    for line in lines.tar:
        tokens = word_tokenize(line)
        eng.append(tokens)
    return eng

def tokenize(lang):
    lang_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    lang_tokenizer.fit_on_texts(lang)
    tensor = lang_tokenizer.texts_to_sequences(lang)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding='post')
    
    return tensor, lang_tokenizer

In [9]:
def load_dataset(line): # src와 tar을 읽어와 전처리 시키는 과정
    src_t = kor_tokenize(line)
    tar_t = eng_tokenize(line)
    
    input_tensor, inp_lan_tokenizer = tokenize(src_t)
    targ_tensor, targ_lan_tokenizer = tokenize(tar_t)
    
    return input_tensor, targ_tensor, inp_lan_tokenizer, targ_lan_tokenizer

In [10]:
input_tensor, target_tensor, input_lan_tokenizer, target_lan_tokenizer = load_dataset(lines)

In [11]:
max_length_targ, max_length_inp = target_tensor.shape[1], input_tensor.shape[1]

In [12]:
print(f"English Tensor의 MAX 길이 : {max_length_targ}")
print(f"Kerean Tensor의 MAX 길이 : {max_length_inp}")

English Tensor의 MAX 길이 : 48
Kerean Tensor의 MAX 길이 : 34


In [13]:
from sklearn.model_selection import train_test_split
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size = 0.2)
print(f"Kor train : {len(input_tensor_train)}, Kor val : {len(input_tensor_val)}")  
print(f"Eng train : {len(target_tensor_train)}, Eng val : {len(target_tensor_val)}")

Kor train : 48000, Kor val : 12000
Eng train : 48000, Eng val : 12000


In [14]:
def convert(lang, tensor):
    for t in tensor:
        if t!=0:
            print(f"{t} ---> {lang.index_word[t]}")

In [15]:
print("Input Language; index to  word mapping")
convert(target_lan_tokenizer, target_tensor_train[0])

Input Language; index to  word mapping
2 ---> sos
5 ---> i
23 ---> was
723 ---> born
12 ---> in
12200 ---> frankfurt
8 ---> ,
1569 ---> germany
3 ---> .
1 ---> eos


# tf.data 데이터셋 생성하기

In [16]:
BUFFER_SIZE = len(input_tensor_train)
BATCH_SIZE = 32
steps_per_epoch = len(input_tensor_train)//BATCH_SIZE
embedding_dim = 256
units = 1024
vocab_inp_size = len(input_lan_tokenizer.word_index)+1
vocab_tar_size = len(target_lan_tokenizer.word_index)+1

In [17]:
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder = True)

In [18]:
example_input_batch, example_target_batxh = next(iter(dataset))
print(example_input_batch.shape)
print(example_target_batxh.shape)

(32, 34)
(32, 48)


# MODELING - Encoder

In [19]:
class Encoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim,enc_units, batch_sz):
        super(Encoder, self).__init__()
        self.batch_sz = batch_sz
        self.enc_units = enc_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.enc_units,
                                     return_sequences = True,
                                     return_state = True,
                                     recurrent_initializer = 'glorot_uniform')
        
    def call(self, x, hidden):
        x = self.embedding(x)
        output, state = self.gru(x, initial_state = hidden)
        return output, state
    
    def initialize_hidden_state(self):
        return tf.zeros((self.batch_sz, self.enc_units))
    

In [20]:
encoder = Encoder(vocab_inp_size, embedding_dim, units, BATCH_SIZE)

In [21]:
sample_hidden = encoder.initialize_hidden_state()
sample_output, sample_hidden = encoder(example_input_batch, sample_hidden)

In [22]:
print(sample_output.shape)
print(sample_hidden.shape)

(32, 34, 1024)
(32, 1024)


# MODELING - Attention

In [23]:
class BahdanauAttention(tf.keras.layers.Layer):
    def __init__(self, units):
        super(BahdanauAttention, self).__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)
        
    def call(self, query, values):
        query_with_time_axis = tf.expand_dims(query, 1)
        
        score = self.V(tf.nn.tanh(self.W1(query_with_time_axis)+self.W2(values)))
        
        attention_weights = tf.nn.softmax(score, axis = 1) 
        
        context_vector = attention_weights*values
        context_vector = tf.reduce_sum(context_vector, axis =1)
        
        return context_vector, attention_weights
            

In [24]:
attention_layer = BahdanauAttention(10)
attention_result, attention_weights = attention_layer(sample_hidden, sample_hidden)

In [25]:
print(attention_result.shape)
print(attention_weights.shape)

(32, 1024)
(32, 32, 1)


# MODELING - Decoder

In [26]:
class Decoder(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim, dec_units, batch_sz):
        super(Decoder, self).__init__()
        self.batch_sz = batch_sz
        self.dec_units = dec_units
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru=tf.keras.layers.GRU(self.dec_units,
                                    return_sequences = True,
                                    return_state = True,
                                    recurrent_initializer = 'glorot_uniform')
        self.fc = tf.keras.layers.Dense(vocab_size)
        
        #using Attention
        self.attention = BahdanauAttention(self.dec_units)
        
    def call(self, x, hidden, enc_output):
        context_vector, attention_weights = self.attention(hidden, enc_output)
        
        x = self.embedding(x)
        
        x = tf.concat([tf.expand_dims(context_vector,1),x],axis = -1)
        output, state = self.gru(x)
        
        output = tf.reshape(output, (-1, output.shape[2]))
        
        x = self.fc(output)
        
        return x, state, attention_weights

In [27]:
decoder = Decoder(vocab_tar_size, embedding_dim, units, BATCH_SIZE)

In [28]:
sample_decoder_output, _, _ = decoder(tf.random.uniform((BATCH_SIZE,1 )),
                                     sample_hidden, sample_output)
print(sample_decoder_output.shape)

(32, 24699)


# 최적화 및 Loss Function 정의

In [29]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction ='none')

def loss_function(real, pred):
    mask = tf.math.logical_not(tf.math.equal(real,0))
    loss_ = loss_object(real, pred)
    
    mask = tf.cast(mask, dtype = loss_.dtype)
    loss_ *= mask
    
    return tf.reduce_mean(loss_)

# 체크 포인트

In [30]:
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer = optimizer,
                                encoder = encoder,
                                decoder = decoder)

# 훈련

In [31]:
@tf.function
def train_step(inp, targ, enc_hidden):
    loss = 0
    
    with tf.GradientTape() as tape:
        enc_output, enc_hidden = encoder(inp, enc_hidden)
        dec_hidden = enc_hidden
        dec_input = tf.expand_dims([target_lan_tokenizer.word_index['sos']]*BATCH_SIZE,1)
        
        for t in range(1, targ.shape[1]):
            predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
            loss += loss_function(targ[:,t], predictions)
            
            dec_input = tf.expand_dims(targ[:,t], 1)
            
        batch_loss = (loss/int(targ.shape[1]))
        variables = encoder.trainable_variables + decoder.trainable_variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        
        return batch_loss

In [32]:
import time
EPOCHS = 10

for epoch in range(EPOCHS):
    start = time.time()
    
    enc_hidden = encoder.initialize_hidden_state()
    total_loss = 0
    
    for (batch, (inp, targ)) in enumerate(dataset.take(steps_per_epoch)):
        batch_loss = train_step(inp, targ, enc_hidden)
        total_loss += batch_loss
        
        if batch % 100 == 0:
            print(f"Epoch : {epoch+1}, Batch : {batch}. Loss : {batch_loss.numpy()}")
    
    if (epoch + 1)%2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
        
    print(f'Epoch : {epoch+1}, Loss : {total_loss/steps_per_epoch}')
    print(f'Time taken for 1 epoch {time.time()-start} sec\n')

Epoch : 1, Batch : 0. Loss : 2.620814323425293
Epoch : 1, Batch : 100. Loss : 1.9371814727783203
Epoch : 1, Batch : 200. Loss : 1.523059606552124
Epoch : 1, Batch : 300. Loss : 1.3313764333724976
Epoch : 1, Batch : 400. Loss : 1.810246229171753
Epoch : 1, Batch : 500. Loss : 1.5048942565917969
Epoch : 1, Batch : 600. Loss : 1.2774181365966797
Epoch : 1, Batch : 700. Loss : 1.4247686862945557
Epoch : 1, Batch : 800. Loss : 1.4597079753875732
Epoch : 1, Batch : 900. Loss : 1.272698998451233
Epoch : 1, Batch : 1000. Loss : 1.2617205381393433
Epoch : 1, Batch : 1100. Loss : 1.222309947013855
Epoch : 1, Batch : 1200. Loss : 1.3153375387191772
Epoch : 1, Batch : 1300. Loss : 1.272398829460144
Epoch : 1, Batch : 1400. Loss : 1.3401367664337158
Epoch : 1, Loss : 1.4608796834945679
Time taken for 1 epoch 507.9321892261505 sec

Epoch : 2, Batch : 0. Loss : 1.2547104358673096
Epoch : 2, Batch : 100. Loss : 1.2105871438980103
Epoch : 2, Batch : 200. Loss : 1.1035583019256592
Epoch : 2, Batch : 300

Epoch : 10, Batch : 1400. Loss : 0.24913248419761658
Epoch : 10, Loss : 0.2255912870168686
Time taken for 1 epoch 475.42292857170105 sec



In [37]:
def evaluate(sentence):
    sentence_tokens = preprocessing(sentence)
    attention_plot = np.zeros((max_length_targ, max_length_inp))
    inputs = [input_lan_tokenizer.word_index[i] for i in sentence_tokens]
    #print(inputs)
    inputs = tf.keras.preprocessing.sequence.pad_sequences([inputs],
                                                          maxlen = max_length_inp,
                                                          padding = 'post')
    #print(inputs)   
    inputs = tf.convert_to_tensor(inputs)
    
    result = ''
    
    hidden = [tf.zeros((1, units))]
    enc_out, enc_hidden = encoder(inputs, hidden)
    
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_lan_tokenizer.word_index['sos']],0)
    #print(max_length_targ)
    for t in range(max_length_targ):
        predictions, dec_hidden, attention_weights = decoder(dec_input,
                                                            dec_hidden,
                                                            enc_out)
        
        attention_weights = tf.reshape(attention_weights, (-1,))
        attention_plot[t] = attention_weights.numpy()
        
        predicted_id = tf.argmax(predictions[0]).numpy()
        
        result += target_lan_tokenizer.index_word[predicted_id]+' '
        #print(target_lan_tokenizer.index_word[predicted_id])
        if target_lan_tokenizer.index_word[predicted_id] == 'eos':
            return result, sentence, attention_plot
        
        dec_input = tf.expand_dims([predicted_id], 0)
        
    return result, sentence, attention_plot

In [77]:
def translate(sentence):
    result, sentence, attention_plot = evaluate(sentence)
    
    print(f'Input : {sentence}')
    print(f'Predicted translation : {result[:-4]}')

In [81]:
translate(u'저랑 영화보러 갈까요?')

Input : 저랑 영화보러 갈까요?
Predicted translation : will you go to the movie ? 


In [89]:
translate(u'바빠요')

Input : 바빠요
Predicted translation : i am too busy . 


In [87]:
 translate(u'같이 저녁식사는 먹죠?')

Input : 같이 저녁식사는 먹죠?
Predicted translation : are we going on the dinner ? 


In [88]:
 translate(u'오늘 좀 바쁜일이 있네요')

Input : 오늘 좀 바쁜일이 있네요
Predicted translation : i have been busy today . 


In [105]:
translate(u'바쁘다니까')

Input : 바쁘다니까
Predicted translation : i was so busy , i 've been busy , i 've been busy , i 've been busy , i 've been busy , i 've been busy , i 've been busy , i 've been busy , i 've been busy , i 've b
