In [1]:
import os
import shutil
import zipfile

import pandas as pd
import tensorflow as tf
import urllib3
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [2]:
# Download the English-French sentence-pair archive into the working directory and unpack it.
http = urllib3.PoolManager()
url ='http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path,filename)
with http.request('GET', url, preload_content=False) as r:
    # Fail fast on an HTTP error instead of saving the error page as a corrupt zip.
    if r.status != 200:
        raise RuntimeError('Download of {} failed with HTTP status {}'.format(url, r.status))
    # Stream the response body to disk rather than holding it in memory.
    with open(zipfilename,'wb') as out_file:
        shutil.copyfileobj(r,out_file)
with zipfile.ZipFile(zipfilename,'r') as zip_ref:
    zip_ref.extractall(path)

In [3]:
# Load the tab-separated corpus and keep only the source/target columns;
# the third column ('lic') is discarded right away.
lines = pd.read_csv(
    'fra.txt',
    sep='\t',
    names=['src', 'tar', 'lic'],
).drop(columns='lic')
print('전체 샘플의 개수 :', len(lines))

전체 샘플의 개수 : 194513


In [4]:
# Restrict the frame to the src..tar columns and to its first 60000 rows,
# then show ten random rows as a sanity check.
lines = lines.loc[:, 'src':'tar'].iloc[:60000]
lines.sample(10)

Unnamed: 0,src,tar
39580,Why are you with me?,Pourquoi êtes-vous avec moi ?
25500,She's still young.,Elle est encore jeune.
14521,I'll allow this.,Je le permettrai.
8831,Tom liked you.,Tom t'aimait bien.
36031,I stole it from Tom.,Je l'ai volé à Tom.
9201,We're touched.,Nous sommes touchés.
41933,I eat here every day.,Je mange ici tous les jours.
37956,The lights went out.,Les lumières se sont éteintes.
36439,"I'm fine, thank you.","Ça va, merci."
44450,That doesn't help me.,Ça ne m'aide pas.


In [5]:
# Surround every target sentence with a tab (start symbol) and a newline (end symbol).
lines['tar'] = ['\t' + sentence + '\n' for sentence in lines['tar']]
lines.sample(10)

Unnamed: 0,src,tar
11746,That's the one.,\tC'est celui-là.\n
27740,You're disgusting.,\tVous êtes dégoûtant.\n
31808,They came together.,\tElles sont venues ensemble.\n
17135,You're the best.,\tVous êtes le meilleur.\n
18011,He shares a room.,\tIl partage une chambre.\n
36328,I'll do my homework.,\tJe ferai mes devoirs.\n
48742,I could never do that.,\tJe ne pourrais jamais faire cela.\n
8533,That's stupid.,\tC'est bête.\n
19990,Please phone him.,\tVeuillez l'appeler.\n
45118,This place is a dump.,\tCet endroit est une décharge.\n


In [6]:
# Collect the distinct characters that occur in the source and target sentences.
src_vocab = {char for line in lines.src for char in line}
tar_vocab = {char for line in lines.tar for char in line}

In [7]:
# Vocabulary sizes are one larger than the number of distinct characters,
# which leaves index 0 unused by any character.
src_vocab_size = len(src_vocab)+1
tar_vocab_size = len(tar_vocab)+1
print('source 문장의 char 집합 :',src_vocab_size)
print('target 문장의 char 집합 :',tar_vocab_size)

source 문장의 char 집합 : 80
target 문장의 char 집합 : 105


In [8]:
# Turn each character set into a sorted list so that index assignment is deterministic,
# then peek at a slice of each list.
src_vocab = sorted(src_vocab)
tar_vocab = sorted(tar_vocab)
print(src_vocab[45:75])
print(tar_vocab[45:75])

['W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
['T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w']


In [9]:
# Map every character to a 1-based integer id following the sorted vocabulary order.
src_to_index = {char: position for position, char in enumerate(src_vocab, start=1)}
tar_to_index = {char: position for position, char in enumerate(tar_vocab, start=1)}
print(src_to_index)
print(tar_to_index)

{' ': 1, '!': 2, '"': 3, '$': 4, '%': 5, '&': 6, "'": 7, ',': 8, '-': 9, '.': 10, '/': 11, '0': 12, '1': 13, '2': 14, '3': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, '?': 23, 'A': 24, 'B': 25, 'C': 26, 'D': 27, 'E': 28, 'F': 29, 'G': 30, 'H': 31, 'I': 32, 'J': 33, 'K': 34, 'L': 35, 'M': 36, 'N': 37, 'O': 38, 'P': 39, 'Q': 40, 'R': 41, 'S': 42, 'T': 43, 'U': 44, 'V': 45, 'W': 46, 'X': 47, 'Y': 48, 'Z': 49, 'a': 50, 'b': 51, 'c': 52, 'd': 53, 'e': 54, 'f': 55, 'g': 56, 'h': 57, 'i': 58, 'j': 59, 'k': 60, 'l': 61, 'm': 62, 'n': 63, 'o': 64, 'p': 65, 'q': 66, 'r': 67, 's': 68, 't': 69, 'u': 70, 'v': 71, 'w': 72, 'x': 73, 'y': 74, 'z': 75, '°': 76, 'é': 77, '’': 78, '€': 79}
{'\t': 1, '\n': 2, ' ': 3, '!': 4, '"': 5, '$': 6, '%': 7, '&': 8, "'": 9, '(': 10, ')': 11, ',': 12, '-': 13, '.': 14, '0': 15, '1': 16, '2': 17, '3': 18, '4': 19, '5': 20, '6': 21, '7': 22, '8': 23, '9': 24, ':': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 3

In [14]:
# Integer-encode every source sentence, one id per character.
encoder_input = [
    [src_to_index[char] for char in line]
    for line in lines.src
]
print('source 문장의 정수 인코딩 :',encoder_input[:5])

source 문장의 정수 인코딩 : [[30, 64, 10], [30, 64, 10], [30, 64, 10], [31, 58, 10], [31, 58, 10]]


In [12]:
# Integer-encode every target sentence in full (start and end symbols included);
# this is what the decoder receives as input.
decoder_input = [
    [tar_to_index[char] for char in line]
    for line in lines.tar
]
print('target 문장의 정수 인코딩 :',decoder_input[:5])

target 문장의 정수 인코딩 : [[1, 48, 53, 3, 4, 2], [1, 39, 53, 70, 55, 60, 57, 14, 2], [1, 28, 67, 73, 59, 57, 3, 4, 2], [1, 45, 53, 64, 73, 72, 3, 4, 2], [1, 45, 53, 64, 73, 72, 14, 2]]


In [13]:
# The decoder's labels are the target sentence without its first character,
# i.e. the decoder input shifted left by one position.
decoder_target = [
    [tar_to_index[char] for char in line[1:]]
    for line in lines.tar
]
print('target 문장 레이블의 정수 인코딩 :',decoder_target[:5])

target 문장 레이블의 정수 인코딩 : [[48, 53, 3, 4, 2], [39, 53, 70, 55, 60, 57, 14, 2], [28, 67, 73, 59, 57, 3, 4, 2], [45, 53, 64, 73, 72, 3, 4, 2], [45, 53, 64, 73, 72, 14, 2]]


In [15]:
# Longest source / target sentence, measured in characters.
max_src_len = max(map(len, lines.src))
max_tar_len = max(map(len, lines.tar))
print('source 문장의 최대 길이 :',max_src_len)
print('target 문장의 최대 길이 :',max_tar_len)

source 문장의 최대 길이 : 23
target 문장의 최대 길이 : 74


In [16]:
# Pad every sequence at the end (padding='post') up to the longest sentence length,
# so that each set of sequences becomes a rectangular array.
encoder_input = pad_sequences(encoder_input,maxlen=max_src_len,padding='post')
decoder_input = pad_sequences(decoder_input,maxlen=max_tar_len,padding='post')
decoder_target = pad_sequences(decoder_target,maxlen=max_tar_len,padding='post')

In [17]:
# One-hot encode the integer ids. No num_classes is passed, so the width of the
# one-hot axis is inferred by to_categorical from the data itself.
encoder_input = to_categorical(encoder_input)
decoder_input = to_categorical(decoder_input)
decoder_target = to_categorical(decoder_target)

In [18]:
from tensorflow.keras.layers import Input,LSTM,Embedding,Dense
from tensorflow.keras.models import Model
import numpy as np

In [19]:
# Training encoder: takes variable-length sequences of vectors of width src_vocab_size.
encoder_inputs = Input(shape=(None,src_vocab_size))
# return_state=True makes the LSTM also return its final hidden and cell states.
encoder_lstm = LSTM(units=256,return_state=True)

encoder_outputs,state_h,state_c = encoder_lstm(encoder_inputs)

# Only the two final states are collected here; encoder_outputs is not stored in this list.
encoder_states = [state_h,state_c]

In [20]:
# Training decoder: takes variable-length sequences of vectors of width tar_vocab_size.
decoder_inputs = Input(shape=(None,tar_vocab_size))
# return_sequences=True yields an output at every time step; return_state=True
# additionally exposes the final states (ignored during training below).
decoder_lstm = LSTM(units=256,return_sequences=True,return_state=True)

# The decoder starts from the encoder's final hidden and cell states.
decoder_outputs,_,_ = decoder_lstm(decoder_inputs,initial_state=encoder_states)

# Softmax over the target vocabulary, applied to the decoder output sequence.
decoder_softmax_layer = Dense(tar_vocab_size,activation='softmax')
decoder_outputs = decoder_softmax_layer(decoder_outputs)

# The full training model maps [encoder input, decoder input] to the per-step distributions.
model = Model([encoder_inputs,decoder_inputs],decoder_outputs)
model.compile(optimizer='rmsprop',loss="categorical_crossentropy")

In [21]:
# Train the model: the decoder is fed decoder_input and is trained to output
# decoder_target; 20% of the samples are held out for validation.
model.fit(x=[encoder_input,decoder_input],y=decoder_target,batch_size=64,epochs=40,validation_split=0.2)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x1ff86d37d60>

In [22]:
# Inference encoder: maps an input sequence to the LSTM's final [hidden, cell] states.
encoder_model = Model(inputs=encoder_inputs,outputs=encoder_states)

In [25]:
# Inference decoder: the previous step's states are fed in as explicit inputs
# (width 256, matching the LSTM units) so decoding can proceed one step at a time.
decoder_state_input_h = Input(shape=(256,))
decoder_state_input_c = Input(shape=(256,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

# Reuse the trained decoder LSTM, this time keeping the states it returns.
decoder_outputs,state_h,state_c = decoder_lstm(decoder_inputs,initial_state=decoder_states_inputs)

decoder_states = [state_h,state_c]
# Reuse the trained softmax layer on the LSTM output.
decoder_outputs = decoder_softmax_layer(decoder_outputs)
# Inputs: [decoder input, h, c]; outputs: [token distribution, new h, new c].
decoder_model = Model(inputs=[decoder_inputs] + decoder_states_inputs,outputs=[decoder_outputs] + decoder_states)

In [26]:
# Reverse lookups: integer id -> character, for both vocabularies.
index_to_src = {index: char for char, index in src_to_index.items()}
index_to_tar = {index: char for char, index in tar_to_index.items()}

In [27]:
def decode_sequence(input_seq):
    """Greedily decode one encoded source sequence into a target string.

    The encoder's final states seed the decoder. The decoder is first fed the
    one-hot vector of the tab start symbol and afterwards the one-hot vector of
    the character it has just predicted. Decoding ends as soon as a newline is
    predicted or the decoded text is longer than ``max_tar_len`` characters.

    Args:
        input_seq: Encoder input passed straight to ``encoder_model.predict``;
            the target buffers are built for a batch of one.

    Returns:
        The decoded characters as a single string (a predicted newline is included).
    """
    def one_hot(token_index):
        # Single-step, single-sample decoder input with one active position.
        step = np.zeros((1,1,tar_vocab_size))
        step[0,0,token_index] = 1.
        return step

    decoder_states = encoder_model.predict(input_seq)
    decoder_step = one_hot(tar_to_index['\t'])
    decoded_sentence = ""

    while True:
        output_tokens,h,c = decoder_model.predict([decoder_step] + decoder_states)

        # Greedy choice: the most probable character at the last time step.
        best_index = np.argmax(output_tokens[0,-1,:])
        best_char = index_to_tar[best_index]
        decoded_sentence += best_char

        if best_char == '\n' or len(decoded_sentence) > max_tar_len:
            break

        # Feed the prediction and the updated states back in for the next step.
        decoder_step = one_hot(best_index)
        decoder_states = [h,c]

    return decoded_sentence

In [31]:
# Translate a handful of sentences and print source, reference and model output.
for seq_index in (3, 50, 100, 300, 1000):
    input_seq = encoder_input[seq_index:seq_index+1]
    decoded_sentence = decode_sequence(input_seq)
    reference = lines.tar[seq_index]
    print('-' * 35)
    print("입력 문장:", lines.src[seq_index])
    # Drop the leading tab and trailing newline that wrap every target sentence.
    print("정답 문장:", reference[1:-1])
    # Drop the final decoded character.
    print("번역 문장:", decoded_sentence[:-1])

-----------------------------------
입력 문장: Hi.
정답 문장: Salut !
번역 문장: Salut !
-----------------------------------
입력 문장: Hello!
정답 문장: Salut !
번역 문장: Bonjour !
-----------------------------------
입력 문장: Hop in.
정답 문장: Montez.
번역 문장: Monte les filles !
-----------------------------------
입력 문장: Help me!
정답 문장: Aide-moi !
번역 문장: Aidez-moi !
-----------------------------------
입력 문장: How's Tom?
정답 문장: Comment va Tom ?
번역 문장: Comment Tom va-t-il ?


In [23]:
import os
import re
import shutil
import zipfile

import numpy as np
import pandas as pd
import tensorflow as tf
import unicodedata
import urllib3
from tensorflow.keras.layers import Embedding, GRU, Dense
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [38]:
# Download the English-French sentence-pair archive into the working directory and unpack it.
http = urllib3.PoolManager()
url = 'http://www.manythings.org/anki/fra-eng.zip'
filename = 'fra-eng.zip'
path = os.getcwd()
zipfilename = os.path.join(path, filename)
with http.request('GET', url, preload_content=False) as r:
    # Fail fast on an HTTP error instead of saving the error page as a corrupt zip.
    if r.status != 200:
        raise RuntimeError('Download of {} failed with HTTP status {}'.format(url, r.status))
    # Stream the response body to disk rather than holding it in memory.
    with open(zipfilename, 'wb') as out_file:
        shutil.copyfileobj(r, out_file)
with zipfile.ZipFile(zipfilename, 'r') as zip_ref:
    zip_ref.extractall(path)

In [46]:
# Number of sentence pairs to read from the corpus file.
num_samples = 33000

In [40]:
def to_ascii(s):
    """Remove combining accent marks from a string.

    The text is decomposed with Unicode NFD normalisation and every character
    in category 'Mn' (non-spacing mark) is dropped. Characters that have no
    such decomposition are returned unchanged, so the result is not guaranteed
    to be pure ASCII.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)
def preprocess_sentence(sent):
    """Normalise a sentence before tokenisation.

    Steps, in order: lower-case and strip accents via ``to_ascii``; insert a
    space before each of ``? . ! , ¿``; replace every run of characters other
    than letters and ``! . ?`` with one space; collapse whitespace runs.

    Returns:
        The normalised sentence (leading/trailing spaces are not trimmed).
    """
    normalized = to_ascii(sent.lower())
    substitutions = (
        (r"([?.!,¿])", r" \1"),
        (r"[^a-zA-Z!.?]+", r" "),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized

In [41]:
# Show the effect of preprocess_sentence on one English and one French example.
en_sent = u"Have you had dinner?"
fr_sent = u"Avez-vous déjà diné?"

print("전처리 전 영어 문장 :",en_sent)
print("전처리 후 영어 문장 :",preprocess_sentence(en_sent))
print("전처리 전 프랑스어 문장 :",fr_sent)
print("전처리 후 프랑스어 문장 :",preprocess_sentence(fr_sent))

전처리 전 영어 문장 : Have you had dinner?
전처리 후 영어 문장 : have you had dinner ?
전처리 전 프랑스어 문장 : Avez-vous déjà diné?
전처리 후 프랑스어 문장 : avez vous deja dine ?


In [47]:
def load_preprocessed_data(path="fra.txt", max_samples=None):
    """Read tab-separated sentence pairs and tokenise them for the seq2seq model.

    Args:
        path: Corpus file. The first two tab-separated fields of every line are
            the source and target sentence; any further fields are ignored.
        max_samples: Maximum number of lines to read. ``None`` (the default)
            falls back to the notebook-level ``num_samples``. Zero or a negative
            value reads nothing (previously such a value never matched the stop
            condition, so the whole file was loaded).

    Returns:
        ``(encoder_input, decoder_input, decoder_target)``: three parallel lists
        of token lists. Decoder inputs start with ``<sos>`` and decoder targets
        end with ``<eos>``.

    Raises:
        ValueError: If a line has fewer than two tab-separated fields.
    """
    if max_samples is None:
        max_samples = num_samples

    encoder_input, decoder_input, decoder_target = [], [], []

    with open(path, "r", encoding='UTF8') as corpus:
        for i, line in enumerate(corpus):
            # Check the limit first so that a limit of 0 really reads nothing.
            if i >= max_samples:
                break

            # Only the first two fields are needed; extra fields are dropped.
            src_line, tar_line = line.strip().split('\t')[:2]

            src_tokens = preprocess_sentence(src_line).split()

            tar_line = preprocess_sentence(tar_line)
            # Decoder input is prefixed with <sos>; the label is suffixed with <eos>.
            tar_tokens_in = ("<sos> " + tar_line).split()
            tar_tokens_out = (tar_line + " <eos>").split()

            encoder_input.append(src_tokens)
            decoder_input.append(tar_tokens_in)
            decoder_target.append(tar_tokens_out)

    return encoder_input, decoder_input, decoder_target

In [48]:
# Load the tokenised corpus and show the first five entries of each list.
sents_en_in, sents_fra_in, sents_fra_out = load_preprocessed_data()
print('인코더의 입력 :',sents_en_in[:5])
print('디코더의 입력 :',sents_fra_in[:5])
print('디코더의 레이블 :',sents_fra_out[:5])

인코더의 입력 : [['go', '.'], ['go', '.'], ['go', '.'], ['hi', '.'], ['hi', '.']]
디코더의 입력 : [['<sos>', 'va', '!'], ['<sos>', 'marche', '.'], ['<sos>', 'bouge', '!'], ['<sos>', 'salut', '!'], ['<sos>', 'salut', '.']]
디코더의 레이블 : [['va', '!', '<eos>'], ['marche', '.', '<eos>'], ['bouge', '!', '<eos>'], ['salut', '!', '<eos>'], ['salut', '.', '<eos>']]


In [49]:
# English side: the sentences are already tokenised and lower-cased, so the
# tokenizer is told not to filter characters or change case.
tokenizer_en = Tokenizer(filters="",lower=False)
tokenizer_en.fit_on_texts(sents_en_in)
encoder_input = tokenizer_en.texts_to_sequences(sents_en_in)
encoder_input = pad_sequences(encoder_input,padding="post")

# French side: one tokenizer is fitted on both the decoder inputs and the labels
# so that <sos> and <eos> both end up in the same vocabulary.
tokenizer_fra = Tokenizer(filters="",lower=False)
tokenizer_fra.fit_on_texts(sents_fra_in)
tokenizer_fra.fit_on_texts(sents_fra_out)

decoder_input = tokenizer_fra.texts_to_sequences(sents_fra_in)
decoder_input = pad_sequences(decoder_input,padding='post')

decoder_target = tokenizer_fra.texts_to_sequences(sents_fra_out)
decoder_target = pad_sequences(decoder_target,padding='post')

In [50]:
# Report the shapes of the padded integer arrays.
print('인코더의 입력의 크기(shape) :',encoder_input.shape)
print('디코더의 입력의 크기(shape) :',decoder_input.shape)
print('디코더의 레이블의 크기(shape) :',decoder_target.shape)

인코더의 입력의 크기(shape) : (33000, 8)
디코더의 입력의 크기(shape) : (33000, 16)
디코더의 레이블의 크기(shape) : (33000, 16)


In [52]:
# Vocabulary sizes: number of words known to each tokenizer, plus one.
src_vocab_size = len(tokenizer_en.word_index) + 1
tar_vocab_size = len(tokenizer_fra.word_index) + 1
print("영어 단어 집합의 크기 : {:d}, 프랑스어 단어 집합의 크기 : {:d}".format(src_vocab_size, tar_vocab_size))

영어 단어 집합의 크기 : 4672, 프랑스어 단어 집합의 크기 : 8153


In [53]:
# Short aliases for the tokenizers' word -> id and id -> word dictionaries.
src_to_index = tokenizer_en.word_index
index_to_src = tokenizer_en.index_word
tar_to_index = tokenizer_fra.word_index
index_to_tar = tokenizer_fra.index_word

In [54]:
# Build a random permutation of the sample indices.
# NOTE(review): no random seed is set, so the permutation differs on every run.
indices = np.arange(encoder_input.shape[0])
np.random.shuffle(indices)
print('랜덤 시퀀스 :',indices)

랜덤 시퀀스 : [12613 21471 31803 ... 23026 28642  3315]


In [55]:
# Reorder all three arrays with the same permutation so the rows stay aligned.
# NOTE(review): this overwrites the arrays in place, so re-running the cell shuffles again.
encoder_input = encoder_input[indices]
decoder_input = decoder_input[indices]
decoder_target = decoder_target[indices]

In [56]:
# Spot-check one shuffled sample: encoder input row.
encoder_input[30997]

array([ 43,  56, 237,   1,   0,   0,   0,   0])

In [57]:
# Same sample: decoder input row.
decoder_input[30997]

array([  2, 814,  40, 349,   9,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0])

In [58]:
# Same sample: decoder label row.
decoder_target[30997]

array([814,  40, 349,   9,   3,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0])

In [59]:
# Hold out 10% of the samples for validation. The count is derived from the
# loaded array instead of a hard-coded 33000, so it stays correct when the
# number of loaded samples changes.
n_of_val = int(encoder_input.shape[0]*0.1)
print('검증 데이터의 개수 :',n_of_val)

검증 데이터의 개수 : 3300


In [60]:
# Everything except the last n_of_val rows is used for training ...
encoder_input_train = encoder_input[:-n_of_val]
decoder_input_train = decoder_input[:-n_of_val]
decoder_target_train = decoder_target[:-n_of_val]

# ... and the last n_of_val rows form the held-out set.
encoder_input_test = encoder_input[-n_of_val:]
decoder_input_test = decoder_input[-n_of_val:]
decoder_target_test = decoder_target[-n_of_val:]

In [61]:
# Report the shapes of the training and held-out arrays.
print('훈련 source 데이터의 크기 :',encoder_input_train.shape)
print('훈련 target 데이터의 크기 :',decoder_input_train.shape)
print('훈련 target 레이블의 크기 :',decoder_target_train.shape)
print('테스트 source 데이터의 크기 :',encoder_input_test.shape)
print('테스트 target 데이터의 크기 :',decoder_input_test.shape)
print('테스트 target 레이블의 크기 :',decoder_target_test.shape)

훈련 source 데이터의 크기 : (29700, 8)
훈련 target 데이터의 크기 : (29700, 16)
훈련 target 레이블의 크기 : (29700, 16)
테스트 source 데이터의 크기 : (3300, 8)
테스트 target 데이터의 크기 : (3300, 16)
테스트 target 레이블의 크기 : (3300, 16)


In [62]:
from tensorflow.keras.layers import Input,LSTM,Embedding,Dense,Masking
from tensorflow.keras.models import Model

In [63]:
# Model hyperparameters: embedding vector width and LSTM state width.
embedding_dim =64
hidden_units = 64

In [64]:
# Training encoder over integer word ids.
encoder_inputs = Input(shape=(None,))
# mask_zero=True makes the Embedding layer mark id 0 (the padding value) as masked.
# The previous Masking(mask_value=0.0) layer was applied to the embedding *output*,
# where a time step is masked only if its whole embedding vector equals 0.0 - which
# learned embeddings practically never do - so padding was not actually masked.
enc_emb = Embedding(src_vocab_size,embedding_dim,mask_zero=True)(encoder_inputs)
# return_state=True exposes the final hidden and cell states.
encoder_lstm = LSTM(hidden_units,return_state=True)
encoder_outputs,state_h,state_c = encoder_lstm(enc_emb)
# Only the final states are kept as the encoder's result.
encoder_states = [state_h,state_c]

In [65]:
# Training decoder over integer word ids.
decoder_inputs = Input(shape=(None,))
# mask_zero=True makes the Embedding layer mark id 0 (the padding value) as masked.
# The previous Masking(mask_value=0.0) layer looked at the embedding *output*, whose
# vectors are practically never all 0.0, so padded steps were not masked at all.
# The width now uses embedding_dim (same value as before) to mirror the encoder.
dec_emb_layer = Embedding(tar_vocab_size,embedding_dim,mask_zero=True)
dec_emb = dec_emb_layer(decoder_inputs)

# return_sequences=True gives one output per time step; the returned states are
# ignored during training.
decoder_lstm = LSTM(hidden_units,return_sequences=True,return_state=True)

# The decoder starts from the encoder's final hidden and cell states.
decoder_outputs,_,_ = decoder_lstm(dec_emb,initial_state=encoder_states)

# Softmax over the target vocabulary, applied to the decoder output sequence.
decoder_dense = Dense(tar_vocab_size,activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Labels are integer ids, hence the sparse categorical cross-entropy loss.
model = Model([encoder_inputs,decoder_inputs],decoder_outputs)
model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['acc'])

In [66]:
# Train on the training split and monitor loss/accuracy on the held-out split.
model.fit(
    x=[encoder_input_train, decoder_input_train],
    y=decoder_target_train,
    validation_data=([encoder_input_test, decoder_input_test], decoder_target_test),
    batch_size=128,
    epochs=50,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x1eba4b719a0>

In [68]:
# Inference encoder: maps an input sequence to the final [hidden, cell] states.
encoder_model = Model(encoder_inputs,encoder_states)

# Inference decoder: the previous step's states come in as explicit inputs.
decoder_state_input_h = Input(shape=(hidden_units,))
decoder_state_input_c = Input(shape=(hidden_units,))
decoder_states_inputs = [decoder_state_input_h,decoder_state_input_c]

# Reuse the trained embedding layer.
dec_emb2 = dec_emb_layer(decoder_inputs)

# Reuse the trained LSTM, keeping the states it returns so they can be fed back in.
decoder_outputs2,state_h2,state_c2 = decoder_lstm(dec_emb2,initial_state=decoder_states_inputs)
decoder_states2 = [state_h2,state_c2]

# Reuse the trained output layer.
decoder_outputs2 = decoder_dense(decoder_outputs2)
# Inputs: [decoder input, h, c]; outputs: [token distribution, new h, new c].
decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs2] + decoder_states2)

In [90]:
def decode_sequence(input_seq):
    """Greedily decode one encoded source sentence into a string of target words.

    The encoder's final states seed the decoder, which is first fed the id of
    ``<sos>`` and afterwards the id of the word it has just predicted. Decoding
    stops when ``<eos>`` is predicted or the decoded text exceeds 50 characters.

    Args:
        input_seq: Encoder input passed straight to ``encoder_model.predict``;
            the target buffer is built for a batch of one.

    Returns:
        The predicted words, each preceded by a single space (a predicted
        ``<eos>`` is included).
    """
    states_value = encoder_model.predict(input_seq)
    target_seq = np.zeros((1,1))
    target_seq[0,0] = tar_to_index['<sos>']
    stop_condition = False
    decoded_sentence = ''

    while not stop_condition:
        output_tokens,h,c = decoder_model.predict([target_seq] + states_value)
        # Greedy choice: the most probable word at the last time step.
        sampled_token_index = np.argmax(output_tokens[0,-1,:])
        sampled_char = index_to_tar[sampled_token_index]
        # Append to the running sentence. The previous plain assignment discarded
        # everything decoded so far, so only the last word was ever returned and
        # the length guard below could not fire.
        decoded_sentence += " "+sampled_char
        if (sampled_char == '<eos>' or len(decoded_sentence) > 50):
            stop_condition =True
        # Feed the prediction and the updated states back in for the next step.
        target_seq = np.zeros((1,1))
        target_seq[0,0] = sampled_token_index
        states_value = [h,c]

    return decoded_sentence

In [93]:
def decode_sequence(input_seq):
    """Greedily translate one encoded source sentence, word by word.

    For simplicity the function assumes a batch size of 1. Decoding starts
    from ``<sos>`` and stops when ``<eos>`` is predicted or the decoded text
    grows beyond 50 characters.

    Returns:
        The predicted words, each preceded by a single space (a predicted
        ``<eos>`` is included).
    """
    # The encoder's final states become the decoder's initial states.
    decoder_states = encoder_model.predict(input_seq)

    # The first decoder input is the integer id of <sos>.
    next_input = np.full((1, 1), tar_to_index['<sos>'], dtype=float)

    pieces = []
    decoded_length = 0

    # Loop until one of the stop conditions below is met.
    while True:
        # Use the previous step's states as this step's initial state.
        output_tokens, h, c = decoder_model.predict([next_input] + decoder_states)

        # Turn the prediction into a word.
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_word = index_to_tar[sampled_token_index]

        # Add the predicted word to the sentence decoded so far.
        pieces.append(' ' + sampled_word)
        decoded_length += 1 + len(sampled_word)

        # Stop on <eos> or once the text is longer than the fixed limit.
        if sampled_word == '<eos>' or decoded_length > 50:
            break

        # The current prediction is the next step's input ...
        next_input = np.full((1, 1), sampled_token_index, dtype=float)
        # ... and the current states are the next step's initial states.
        decoder_states = [h, c]

    return ''.join(pieces)

In [87]:
def seq_to_src(input_seq):
    """Turn a sequence of source word ids back into text.

    Ids equal to 0 are skipped; every remaining word is followed by one space.
    """
    return ''.join(
        index_to_src[word_id] + ' '
        for word_id in input_seq
        if word_id != 0
    )
def seq_to_tar(input_seq):
    """Turn a sequence of target word ids back into text.

    Ids equal to 0 and the ids of ``<sos>`` / ``<eos>`` are skipped; every
    remaining word is followed by one space.
    """
    return ''.join(
        index_to_tar[word_id] + ' '
        for word_id in input_seq
        if word_id != 0
        and word_id != tar_to_index['<sos>']
        and word_id != tar_to_index['<eos>']
    )

In [94]:
# Translate a few sentences from the training split and compare with the reference.
for seq_index in (3, 50, 100, 300, 1001):
    input_seq = encoder_input_train[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)

    source_text = seq_to_src(encoder_input_train[seq_index])
    reference_text = seq_to_tar(decoder_input_train[seq_index])

    print("입력문장 :", source_text)
    print("정답문장 :", reference_text)
    # Slice off the leading space and the last five characters of the decoded text.
    print("번역문장 :", decoded_sentence[1:-5])
    print("-" * 50)

입력문장 : i am very pleased . 
정답문장 : je suis tres content . 
번역문장 : je suis tres heureux . 
--------------------------------------------------
입력문장 : i have immunity . 
정답문장 : je dispose de l immunite . 
번역문장 : je dispose de l acquisition . 
--------------------------------------------------
입력문장 : she knew the teen . 
정답문장 : elle connaissait l adolescente . 
번역문장 : elle connaissait l adolescent . 
--------------------------------------------------
입력문장 : you re great . 
정답문장 : t assures . 
번역문장 : t es incroyable . 
--------------------------------------------------
입력문장 : tom walked in . 
정답문장 : tom entra . 
번역문장 : tom est en train de parler . 
--------------------------------------------------


In [97]:
# Translate a few sentences from the held-out split and compare with the reference.
for seq_index in (3, 50, 100, 300, 1001):
    input_seq = encoder_input_test[seq_index: seq_index + 1]
    decoded_sentence = decode_sequence(input_seq)

    source_text = seq_to_src(encoder_input_test[seq_index])
    reference_text = seq_to_tar(decoder_input_test[seq_index])

    print("입력문장 :", source_text)
    print("정답문장 :", reference_text)
    # Slice off the leading space and the last five characters of the decoded text.
    print("번역문장 :", decoded_sentence[1:-5])
    print("-" * 50)

입력문장 : rest here . 
정답문장 : reposez vous ici . 
번역문장 : reviens a l argent . 
--------------------------------------------------
입력문장 : let it go . 
정답문장 : laisse tomber ! 
번역문장 : laissez tomber ! 
--------------------------------------------------
입력문장 : are you ready now ? 
정답문장 : etes vous prete maintenant ? 
번역문장 : etes vous maintenant prets ? 
--------------------------------------------------
입력문장 : she made him happy . 
정답문장 : elle le rendit heureux . 
번역문장 : elle l a fait mal . 
--------------------------------------------------
입력문장 : i m saying no . 
정답문장 : je dis non . 
번역문장 : je moi ! 
--------------------------------------------------


In [1]:
import numpy as np
from collections import Counter
from nltk import ngrams

In [2]:
def simple_count(tokens,n):
    """Count how often each n-gram occurs in ``tokens``.

    Returns:
        A ``Counter`` keyed by n-gram tuples as produced by ``nltk.ngrams``.
    """
    n_gram_counts = Counter()
    for n_gram in ngrams(tokens, n):
        n_gram_counts[n_gram] += 1
    return n_gram_counts

In [3]:
# Unigram counts of a whitespace-tokenised example sentence.
candidate = "It is a guide to action which ensures that the military always obeys the commands of the party."
tokens = candidate.split()
result = simple_count(tokens,1)
print('유니그램 카운트 :',result)

유니그램 카운트 : Counter({('the',): 3, ('It',): 1, ('is',): 1, ('a',): 1, ('guide',): 1, ('to',): 1, ('action',): 1, ('which',): 1, ('ensures',): 1, ('that',): 1, ('military',): 1, ('always',): 1, ('obeys',): 1, ('commands',): 1, ('of',): 1, ('party.',): 1})


In [5]:
# Unigram counts of a candidate that repeats a single word seven times.
candidate = 'the the the the the the the'
tokens = candidate.split()
result = simple_count(tokens,1)
print('유니그램 카운트 :',result)

유니그램 카운트 : Counter({('the',): 7})


In [8]:
def count_clip(candidate,reference_list,n):
    """Clip the candidate's n-gram counts by the references.

    For every n-gram of the candidate the result holds the smaller of its
    count in the candidate and its highest count in any single reference
    (0 when no reference contains it).

    Args:
        candidate: Token sequence of the candidate sentence.
        reference_list: Iterable of reference token sequences.
        n: n-gram order.

    Returns:
        dict mapping each candidate n-gram to its clipped count.
    """
    candidate_counts = simple_count(candidate, n)

    # Highest count of each n-gram over all references.
    max_ref_counts = {}
    for reference in reference_list:
        for n_gram, count in simple_count(reference, n).items():
            max_ref_counts[n_gram] = max(count, max_ref_counts.get(n_gram, count))

    return {
        n_gram: min(count, max_ref_counts.get(n_gram, 0))
        for n_gram, count in candidate_counts.items()
    }

In [9]:
# Clipped unigram counts for a degenerate candidate against two references.
candidate = 'the the the the the the the'
references = [
    'the cat is on the mat',
    'there is a cat on the mat'
]
result = count_clip(candidate.split(), [ref.split() for ref in references], 1)
print('보정된 유니그램 카운트 :',result)

보정된 유니그램 카운트 : {('the',): 2}


In [10]:
def modified_precision(candidate,reference_list,n):
    """Modified n-gram precision of a candidate against its references.

    This is the sum of the clipped n-gram counts divided by the total number
    of candidate n-grams. When the candidate has no n-grams of order ``n``
    the denominator is taken to be 1.
    """
    clipped_total = sum(count_clip(candidate, reference_list, n).values())
    candidate_total = sum(simple_count(candidate, n).values())
    # `or 1` guards against division by zero for candidates shorter than n.
    return clipped_total / (candidate_total or 1)

In [11]:
# Modified unigram precision for the candidate/references defined above.
result = modified_precision(candidate.split(), [ref.split() for ref in references], 1)
print('보정된 유니그램 정밀도 :',result)

보정된 유니그램 정밀도 : 0.2857142857142857


In [12]:
def closest_ref_length(candidate,reference_list):
    """Return the reference length that is closest to the candidate's length.

    When two references are equally close, the shorter length wins.
    """
    target_len = len(candidate)

    def distance(length):
        # Primary key: distance to the candidate length; tie-break: the shorter reference.
        return (abs(length - target_len), length)

    return min(map(len, reference_list), key=distance)

In [14]:
def brevity_penalty(candidate,reference_list):
    """Brevity penalty of a candidate relative to its closest reference length.

    Returns 1 when the candidate is longer than that reference, 0 for an empty
    candidate, and ``exp(1 - ref_len / ca_len)`` otherwise.
    """
    candidate_length = len(candidate)
    reference_length = closest_ref_length(candidate, reference_list)

    if candidate_length > reference_length:
        return 1
    if candidate_length == 0:
        return 0
    return np.exp(1 - reference_length / candidate_length)

In [18]:
def bleu_score(candidate,reference_list,weights=(0.25,0.25,0.25,0.25)):
    """Sentence-level BLEU: brevity penalty times the weighted geometric mean
    of the modified n-gram precisions.

    Args:
        candidate: Token sequence of the candidate sentence.
        reference_list: Iterable of reference token sequences.
        weights: One weight per n-gram order, starting at unigrams. The default
            is a tuple rather than a list so the default cannot be mutated.

    Returns:
        The BLEU score. It is 0.0 whenever an n-gram order with a non-zero
        weight has zero precision.
    """
    bp = brevity_penalty(candidate,reference_list)
    p_n = [modified_precision(candidate,reference_list,n=n) for n,_ in enumerate(weights,start=1)]
    # Only orders with a non-zero weight contribute to the geometric mean.
    weighted = [(w_i,p_i) for w_i,p_i in zip(weights,p_n) if w_i != 0]
    # A zero precision makes the geometric mean zero. The previous code skipped
    # such terms (added 0 to the log-sum), which treated them as a perfect match
    # and could give a candidate with no overlap at all a score of 1.0.
    if any(p_i == 0 for _,p_i in weighted):
        return 0.0
    score = np.sum([w_i*np.log(p_i) for w_i,p_i in weighted])
    return bp *np.exp(score)

In [20]:
import nltk.translate.bleu_score as bleu
candidate = 'It is a guide to action which ensures that the military always obeys the commands of the party'
references = [
    'It is a guide to action that ensures that the military will forever heed Party commands',
    'It is the guiding principle which guarantees the military forces always being under the command of the Party',
    'It is the practical guide for the army always to heed the directions of the party'
]
# Tokenise once and score the same tokens with both implementations.
candidate_tokens = candidate.split()
reference_tokens = [ref.split() for ref in references]
print('실습 코드의 BLEU :', bleu_score(candidate_tokens, reference_tokens))
print('패키지 NLTK의 BLEU :', bleu.sentence_bleu(reference_tokens, candidate_tokens))

실습 코드의 BLEU : 0.5045666840058485
패키지 NLTK의 BLEU : 0.5045666840058485


In [1]:
from tensorflow.keras.datasets import imdb
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Load the IMDB review dataset, passing vocab_size as the num_words limit.
vocab_size = 10000
(X_train,y_train),(X_test,y_test) = imdb.load_data(num_words= vocab_size)

In [3]:
# Longest and average training review length, measured in tokens.
print('리뷰의 최대 길이 : {}'.format(max(len(l) for l in X_train)))
print('리뷰의 평균 길이 : {}'.format(sum(map(len,X_train))/len(X_train)))

리뷰의 최대 길이 : 2494
리뷰의 평균 길이 : 238.71364


In [4]:
# Bring every review to exactly max_len tokens (pad_sequences pads shorter
# reviews and truncates longer ones, using its default padding/truncating sides).
max_len = 500
X_train = pad_sequences(X_train,maxlen=max_len)
X_test = pad_sequences(X_test,maxlen=max_len)

In [5]:
import tensorflow as tf

In [6]:
class BahdanauAttention(tf.keras.Model):
    """Additive attention: scores every time step of ``values`` against a query.

    The score of a time step is ``V(tanh(W1(values) + W2(query)))``; the scores
    are turned into weights with a softmax over axis 1 and used to form a
    weighted sum of ``values``.
    """

    def __init__(self, units):
        """Create the three dense projections; ``units`` is the width of W1 and W2."""
        super().__init__()
        self.W1 = Dense(units)
        self.W2 = Dense(units)
        self.V = Dense(1)

    def call(self, values, query):
        """Return ``(context_vector, attention_weights)``.

        Keys and values are the same tensor here. Per the original notes,
        ``query`` is (batch_size, hidden size) and ``values`` carries the time
        steps on axis 1.
        """
        # Insert a time axis into the query -> (batch_size, 1, hidden size) so
        # that it can be added to the projected values in the score below.
        query_with_time_axis = tf.expand_dims(query, 1)

        # Before self.V the tensor is (batch_size, max_length, units); self.V
        # maps it to one score per time step -> (batch_size, max_length, 1).
        projected = tf.nn.tanh(self.W1(values) + self.W2(query_with_time_axis))
        score = self.V(projected)

        # Normalise the scores over the time axis -> (batch_size, max_length, 1).
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weighted sum of the values over the time axis -> (batch_size, hidden_size).
        context_vector = tf.reduce_sum(attention_weights * values, axis=1)

        return context_vector, attention_weights

In [7]:
from tensorflow.keras.layers import Dense,Embedding,Bidirectional,LSTM,Concatenate,Dropout
from tensorflow.keras import Input,Model
from tensorflow.keras import optimizers
import os

In [8]:
# Model input: max_len integer word ids per review, embedded into 128-dimensional
# vectors; mask_zero=True asks the Embedding layer to mask id 0.
sequence_input = Input(shape=(max_len,),dtype='int32')
embedded_sequences = Embedding(vocab_size,128,input_length=max_len,mask_zero=True)(sequence_input)

In [9]:
# First bidirectional LSTM; return_sequences=True keeps the output of every time step.
lstm = Bidirectional(LSTM(64,dropout=0.5,return_sequences=True))(embedded_sequences)

In [10]:
# Second bidirectional LSTM stacked on the first; return_state=True additionally
# returns the final hidden/cell states of the forward and backward directions.
lstm, forward_h,forward_c,backward_h,backward_c = Bidirectional \
  (LSTM(64, dropout=0.5, return_sequences=True, return_state=True))(lstm)

In [11]:
# Inspect the shapes of the sequence output and of the four final states.
print(lstm.shape,forward_h.shape,forward_c.shape,backward_h.shape,backward_c.shape)

(None, 500, 128) (None, 64) (None, 64) (None, 64) (None, 64)


In [12]:
# Join the forward and backward final states into one hidden and one cell state.
state_h = Concatenate()([forward_h,backward_h])
state_c = Concatenate()([forward_c,backward_c])

In [13]:
# Attend over the LSTM output sequence (values) using the joined hidden state as the query.
attention = BahdanauAttention(64)
context_vector, attention_weights = attention(lstm,state_h)

In [14]:
# Classification head on the context vector: dense ReLU layer, dropout, then a
# single sigmoid unit.
dense1 = Dense(20,activation='relu')(context_vector)
dropout = Dropout(0.5)(dense1)
output = Dense(1,activation='sigmoid')(dropout)
model = Model(inputs=sequence_input,outputs=output)

In [15]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [16]:
# Print the layer-by-layer summary of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 500)]        0           []                               
                                                                                                  
 embedding (Embedding)          (None, 500, 128)     1280000     ['input_1[0][0]']                
                                                                                                  
 bidirectional (Bidirectional)  (None, 500, 128)     98816       ['embedding[0][0]']              
                                                                                                  
 bidirectional_1 (Bidirectional  [(None, 500, 128),  98816       ['bidirectional[0][0]']          
 )                               (None, 64),                                                  

In [17]:
# Train for 3 epochs and keep the returned History object.
# NOTE(review): the test split is passed as validation data here and is also
# the split evaluated afterwards - confirm that this is intended.
history = model.fit(X_train, y_train, epochs = 3, batch_size = 256, validation_data=(X_test, y_test), verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [18]:
# model.evaluate returns [loss, accuracy] for this compile configuration; index 1 is the accuracy.
print('\n 테스트 정확도: %.4f'%(model.evaluate(X_test,y_test)[1]))


 테스트 정확도: 0.8802
