In [60]:
import sys
import os
import pandas as pd
import sentencepiece as spm
import tensorflow as tf
import warnings
import numpy as np
from tqdm import tqdm 
from sklearn.model_selection import train_test_split
warnings.filterwarnings('ignore')


# Make the project's training, utils and baseline-model source folders importable.
# They are inserted at positions 0, 1 and 2 of sys.path, in that order.
_SRC_ROOT = os.getenv('HOME') + '/saturi_lab_multi_nmt_low_resource/src'
for _position, _subdir in enumerate(('/training/', '/utils', '/models/baseline/')):
    sys.path.insert(_position, _SRC_ROOT + _subdir)

from dataset_util import CustomDatasetforTranslation
import utils

In [2]:
# Project root under $HOME; later cells build their paths from `main_path`.
main_path = os.getenv('HOME') + '/saturi_lab_multi_nmt_low_resource'
data_path = main_path + '/data/processed/translated_train_data.csv'

# Read the processed CSV and discard the 'Unnamed: 0' column
# (presumably an index column written by an earlier export — verify).
df = pd.read_csv(data_path).drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,text,dial,reg,pair,eng
0,그 부주를 하기에는 조금 그렇지 않아?,그 부주를 하기에는 조금 그렇지 안?,jj,(안?)/(않아?),Isnt it a little bit like that carelessness
1,그리고 거기서 밥을 먹을지도 모르겠어.,그리고 거기서 밥을 먹을지도 모르겐.,jj,(모르겐.)/(모르겠어.),And I dont even know if Ill eat rice there
2,왜냐하면은 밥이 안 나온다고 들었어.,왜냐하면은 밥이 안 나온다고 들언.,jj,(들언.)/(들었어.),Because I heard that the rice isnt served
3,토요일 날에 토요일날에 잔치를 하고 일요일 날에 식을 올린대.,토요일 날에 토요일날에 잔치를 하고 일요일 날에 식을 올린댄.,jj,(올린댄.)/(올린대.),They have a party on Saturday and a ceremony o...
4,그럼 어떻게 혼자 아 혼자래.,그럼 어떻게 혼자 아 혼자랜.,jj,(혼자랜.)/(혼자래.),Then how are you alone Oh are you alone


In [3]:
# Locations of the SentencePiece model files for the source and target tokenizers.
_TOKENIZER_DIR = main_path + '/saved_models/tokenizer/old'
src_tok_path = _TOKENIZER_DIR + '/spm_enc_spm16000.model'
tgt_tok_path = _TOKENIZER_DIR + '/spm_dec_spm16000.model'

In [4]:
# Load tokenizer models

def _load_tokenizer(model_path):
    """Create a SentencePieceProcessor and load the model file at `model_path`."""
    processor = spm.SentencePieceProcessor()
    processor.Load(model_path)
    return processor


src_tokenizer = _load_tokenizer(src_tok_path)
tgt_tokenizer = _load_tokenizer(tgt_tok_path)

# Sanity check: report each vocabulary size and tokenize one sample sentence.
print('source tokenizer vocab size :', src_tokenizer.vocab_size())
print(src_tokenizer.EncodeAsPieces('Here is an example of source tokenization.'))
print('target tokenizer vocab size :', tgt_tokenizer.vocab_size())
print(tgt_tokenizer.EncodeAsPieces('이것은 토큰화 예시입니다.'))

source tokenizer vocab size : 16009
['▁Here', '▁is', '▁an', '▁example', '▁of', '▁source', '▁token', 'ization', '.']
target tokenizer vocab size : 16009
['▁이것', '은', '▁토', '큰', '화', '▁예', '시', '입니다', '.']


In [37]:
# Hold out 20% of the rows for validation, stratified on the 'reg' column,
# with a fixed seed so the split is reproducible.
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=2,
    shuffle=True,
    stratify=df['reg'],
)

In [39]:
# Create datasets for training; the CustomDatasetforTranslation class lives in the src folder.

def _build_dataset(frame):
    """Wrap the 'eng', 'dial' and 'reg' columns of `frame` in a CustomDatasetforTranslation.

    NOTE(review): 128 and True are passed positionally; their meaning
    (presumably a length limit and a flag) should be confirmed against dataset_util.
    """
    return CustomDatasetforTranslation(
        frame['eng'].to_numpy(),
        frame['dial'].to_numpy(),
        frame['reg'].to_numpy(),
        128,
        src_tokenizer,
        tgt_tokenizer,
        True,
    )


train_dataset = _build_dataset(train)
valid_dataset = _build_dataset(test)

In [40]:
# Learning-rate schedule and optimizer

# NOTE(review): 512 presumably corresponds to the model dimension — confirm against config.
learningrate = utils.LearningRateScheduler(512)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learningrate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

In [41]:
# loss
# Per-element cross-entropy on raw logits; reduction is done manually in
# loss_function so that masked positions can be excluded from the average.
criterion = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

def loss_function(real, pred):
    """Return the mean cross-entropy over positions where `real` is non-zero.

    Args:
        real: integer target ids; positions equal to 0 are excluded from the loss.
        pred: logits passed to `criterion` together with `real`.

    Returns:
        Scalar tensor: sum of the unmasked per-position losses divided by the
        number of unmasked positions. Returns 0 (instead of NaN) when every
        position is masked.
    """
    mask = tf.math.logical_not(tf.math.equal(real, 0))
    loss_ = criterion(real, pred)

    # Scale by the number of unmasked positions rather than by the full size.
    mask = tf.cast(mask, dtype=loss_.dtype)
    loss_ *= mask

    # divide_no_nan yields 0 when the denominator is 0, so a fully masked
    # batch cannot inject NaN into the gradients.
    return tf.math.divide_no_nan(tf.reduce_sum(loss_), tf.reduce_sum(mask))

In [42]:
# define train function

@tf.function()
def train_step(src, tgt, model, optimizer):
    """Run one optimization step on a batch and return its loss.

    Masks are built with `generate_masks`, the model is called on the full
    `tgt`, and the loss compares `tgt` shifted left by one position against
    the predictions with their last step dropped.

    NOTE(review): `generate_masks` is not defined or imported in the visible
    part of this notebook — confirm it is in scope before this cell runs.
    """
    enc_mask, dec_enc_mask, dec_mask = generate_masks(src, tgt)
    targets = tgt[:, 1:]

    # Record the forward pass so gradients of the loss can be computed.
    with tf.GradientTape() as tape:
        logits, _enc_attns, _dec_attns, _dec_enc_attns = model(
            src, tgt, enc_mask, dec_enc_mask, dec_mask
        )
        loss = loss_function(targets, logits[:, :-1])

    # Apply the gradients to the model's trainable variables.
    variables = model.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

    return loss

In [43]:
# Validating the model
@tf.function
def model_validate(src, tgt, model):
    """Compute the loss for one batch without updating the model.

    Returns:
        A `(v_loss, predictions)` tuple: the masked loss for the batch and the
        first output of the model call.

    NOTE(review): `generate_masks` is not defined or imported in the visible
    part of this notebook — confirm it is in scope before this cell runs.
    """
    masks = generate_masks(src, tgt)
    enc_mask, dec_enc_mask, dec_mask = masks

    predictions, _enc_attns, _dec_attns, _dec_enc_attns = model(
        src, tgt, enc_mask, dec_enc_mask, dec_mask
    )

    # Targets are `tgt` shifted left by one; the last prediction step has no target.
    expected = tgt[:, 1:]
    v_loss = loss_function(expected, predictions[:, :-1])

    return v_loss, predictions

In [46]:
#train function
def _pad_batch(src, tgt):
    """Post-pad `src` and `tgt` to one common length covering the longest sequence in either."""
    # Using the longest sequence of BOTH sides prevents pad_sequences from
    # truncating targets that are longer than every source sentence
    # (its default truncating='pre' would drop the leading tokens).
    max_len = max(len(max(src, key=len)), len(max(tgt, key=len)))
    pad = tf.keras.preprocessing.sequence.pad_sequences
    return (pad(src, padding='post', maxlen=max_len),
            pad(tgt, padding='post', maxlen=max_len))


# NOTE(review): this function name rebinds `train`, which an earlier cell uses
# for the training DataFrame; re-running that dataset cell afterwards will fail.
def train(transformer,train_dataset, valid_dataset,optimizer,EPOCHS):
    """Train `transformer` for `EPOCHS` epochs, validating after each epoch.

    Args:
        transformer: model passed through to `train_step` / `model_validate`.
        train_dataset: iterable yielding `(src, tgt)` batches of token-id sequences.
        valid_dataset: iterable yielding `(src, tgt)` batches used for validation.
        optimizer: optimizer passed through to `train_step`.
        EPOCHS: number of passes over `train_dataset`.

    Progress bars show the running mean training loss and the running mean
    validation loss. Nothing is returned.
    """
    for epoch in range(EPOCHS):
        t = tqdm(train_dataset)
        total_loss = 0

        for i, pairs in enumerate(t):
            src, tgt = pairs
            enc_train, dec_train = _pad_batch(src, tgt)

            batch_loss = train_step(enc_train,
                                    dec_train,
                                    transformer,
                                    optimizer)

            total_loss += batch_loss

            # The original referenced undefined `bucket` / `batch`; use the
            # epoch counter and the loop index instead.
            t.set_description_str('Epoch %2d' % (epoch + 1))
            t.set_postfix_str('Loss %.4f' % (total_loss.numpy() / (i + 1)))

        # validation
        total_loss_val = 0
        tv = tqdm(valid_dataset)

        for k, vpairs in enumerate(tv):
            src, tgt = vpairs
            enc_val, dec_val = _pad_batch(src, tgt)

            # model_validate returns (loss, predictions); only the loss is accumulated.
            val_loss, _ = model_validate(enc_val,
                                         dec_val,
                                         transformer)
            total_loss_val += val_loss
            tv.set_postfix_str('val_Loss %.4f' % (total_loss_val.numpy() / (k + 1)))

In [52]:
# Read the 'model' section of the project's JSON config file.
import json

config_path = main_path + '/src/utils/config.json'
with open(config_path, 'r') as config_file:
    full_config = json.load(config_file)
config = full_config['model']

In [62]:
# Take the vocabulary sizes from the loaded tokenizers rather than from the config file.
config.update(
    src_vocab_size=src_tokenizer.vocab_size(),
    tgt_vocab_size=tgt_tokenizer.vocab_size(),
)

In [63]:
from vanilla_transformer import Transformer

# model init: every constructor argument is read from `config` under the same name.
_MODEL_KEYS = (
    'n_layers',
    'd_model',
    'n_heads',
    'd_ff',
    'src_vocab_size',
    'tgt_vocab_size',
    'pos_len',
    'dropout',
    'shared',
)
transformer = Transformer(**{key: config[key] for key in _MODEL_KEYS})