<a href="https://colab.research.google.com/github/serereuk/Information_retrieval/blob/master/seq2seq.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%tensorflow_version 2.x

TensorFlow 2.x selected.


In [0]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
%cd '/content/drive/My Drive/information/CNN, RNN/RNN dataset'

/content/drive/My Drive/information/CNN, RNN/RNN dataset


In [0]:
import re
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [0]:
def Reader(file_path):
    data = open(file_path, 'r').readlines()
    en = []; fr = []
    for i in data:
        temp = i.strip().split('\t')
        en.append('<시작> ' + temp[0] + ' <끝>')
        fr.append('<시작> ' + temp[1] + ' <끝>')
    return en, fr

def tokenizer(data, max_len=30):
    tok = Tokenizer(filters='')
    tok.fit_on_texts(data)
    tensor = tok.texts_to_sequences(data)
    tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, maxlen=max_len, padding='post')
    return tensor, tok

def loss_function(real, pred):
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  loss_ = loss_object(real, pred)

  mask = tf.cast(mask, dtype=loss_.dtype)
  loss_ *= mask

  return tf.reduce_mean(loss_)

In [0]:
train_en, train_fr = Reader('eng-fra_train.txt')
test_en, test_fr = Reader('eng-fra_test.txt')
train_en_tokened, en_tok = tokenizer(train_en)
train_fr_tokened, fr_tok = tokenizer(train_fr)

In [0]:
Buffer_size = len(train_en_tokened) + 1
Epoch = 50
Batch_size = 512
embed_dimension = 300
steps_per_epoch = len(train_en_tokened)//Batch_size
hidden_dimension = 512
learning_rate = 0.005
vocab_size = len(en_tok.word_index) + 1
tf.random.set_seed(1)
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

train_dataset = tf.data.Dataset.from_tensor_slices((train_en_tokened, train_fr_tokened)).shuffle(Buffer_size)
train_dataset = train_dataset.batch(Batch_size)

In [0]:
class Encoder(Model):

    def __init__(self, vocab_size, dimension, hidden_dimension):
        super(Encoder, self).__init__()
        self.embed = Embedding(vocab_size, dimension)
        self.encode = LSTM(hidden_dimension, return_sequences=True, return_state=True)

    def call(self, x, state):
        x = self.embed(x)
        output, hidden_state, cell_state = self.encode(x, initial_state=state)
        return output, [hidden_state, cell_state]
        
encoder = Encoder(vocab_size, 300, 512)

In [0]:
class Decoder(Model):

    def __init__(self, vocab_size, dimension, hidden_dimension):
        super(Decoder, self).__init__()
        self.embed = Embedding(vocab_size, dimension)
        self.decode = LSTM(hidden_dimension, return_state=True)
        self.fc = Dense(vocab_size)

    def call(self, x, state, encoder_output):
        x = self.embed(x)
        [_, hidden_state, cell_state] = self.decode(x, initial_state = state)
        hidden_state_cal = tf.expand_dims(hidden_state, 1)
        attention = tf.nn.softmax(tf.reduce_sum(encoder_output*hidden_state_cal, 2), axis=1)
        attention = tf.reduce_sum(encoder_output * tf.expand_dims(attention, 2), 2)
        x = self.fc(tf.concat([attention, hidden_state], axis=1))
        return x, [hidden_state, cell_state]

decoder = Decoder(len(fr_tok.word_index)+1, 300, 512)

In [0]:
@tf.function
def train_step(inp, targ, enc_state):
  loss = 0

  with tf.GradientTape() as tape:
    enc_output, enc_state = encoder(inp, enc_state)

    dec_state = enc_state

    dec_input = tf.expand_dims([fr_tok.word_index['<시작>']] * Batch_size, 1)

    # Teacher forcing - feeding the target as the next input
    for t in range(1, targ.shape[1]):
      # passing enc_output to the decoder
      predictions, dec_state  = decoder(dec_input, dec_state, enc_output)
      loss += loss_function(targ[:, t], predictions)
      dec_input = tf.expand_dims(targ[:, t], 1)

  batch_loss = (loss / int(targ.shape[1]))

  variables = encoder.trainable_variables + decoder.trainable_variables

  gradients = tape.gradient(loss, variables)

  optimizer.apply_gradients(zip(gradients, variables))

  return batch_loss

In [0]:
def train_result(inp):
    enc_state = [tf.zeros((1, 512)), tf.zeros((1,512))]
    enc_output, enc_state = encoder(inp, enc_state)
    dec_state = enc_state
    dec_input = tf.expand_dims([fr_tok.word_index['<시작>']], 0)
    ins = ''
    for t in range(30):
        if inp.numpy()[0][t] == 0:
            break
        ins += en_tok.index_word[inp.numpy()[0][t]] + ' '
    print(ins)
    result = ''
    for _ in range(30):
        prediction, dec_state = decoder(dec_input, dec_state, enc_output)
        predicted_id = tf.argmax(prediction[0]).numpy()
        a = fr_tok.index_word[predicted_id]
        result += a + ' '
        if a == '<끝>':
            break
        dec_input = tf.expand_dims([fr_tok.word_index[a]], 0)
    return result

In [0]:
EPOCHS = 10

for epoch in range(EPOCHS):
  
  enc_state = [tf.zeros((Batch_size, 512)), tf.zeros((Batch_size, 512))]
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(train_dataset.take(steps_per_epoch)):
    batch_loss = train_step(inp, targ, enc_state)
    total_loss += batch_loss

    if batch % 100 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                   batch,
                                                   batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  
  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / steps_per_epoch))
  

Epoch 1 Batch 0 Loss 1.9323
Epoch 1 Batch 100 Loss 0.5687
Epoch 1 Loss 0.8455
Epoch 2 Batch 0 Loss 0.5552
Epoch 2 Batch 100 Loss 0.4596
Epoch 2 Loss 0.5021
Epoch 3 Batch 0 Loss 0.4190
Epoch 3 Batch 100 Loss 0.3790
Epoch 3 Loss 0.4072
Epoch 4 Batch 0 Loss 0.3591
Epoch 4 Batch 100 Loss 0.3247
Epoch 4 Loss 0.3486
Epoch 5 Batch 0 Loss 0.2865
Epoch 5 Batch 100 Loss 0.3292
Epoch 5 Loss 0.3078
Epoch 6 Batch 0 Loss 0.2888
Epoch 6 Batch 100 Loss 0.2934
Epoch 6 Loss 0.2778
Epoch 7 Batch 0 Loss 0.2339
Epoch 7 Batch 100 Loss 0.2530
Epoch 7 Loss 0.2555
Epoch 8 Batch 0 Loss 0.2181
Epoch 8 Batch 100 Loss 0.2538
Epoch 8 Loss 0.2373
Epoch 9 Batch 0 Loss 0.2044
Epoch 9 Batch 100 Loss 0.2221
Epoch 9 Loss 0.2202
Epoch 10 Batch 0 Loss 0.1981
Epoch 10 Batch 100 Loss 0.2218
Epoch 10 Loss 0.2088


In [0]:
train_result(tf.expand_dims(inp[5], 0))

<시작> tom didn t help me . <끝> 


'tom n y peut rien . <끝> '