# 作業 : 調整 Bert 模型的不同訓練參數

# [作業目標]
- 調整 Bert 模型的不同參數, 分別觀察 loss 數據並比較

# [作業重點]
- 調整 Bert 模型的不同訓練參數, 分別觀察 loss 數據並比較

# [參數說明]
- MODEL_DIM : Attention 特徵維度，即每個單字 (token) 的嵌入向量與各層隱藏狀態的維度（並非輸入單字的個數；輸入序列長度由 max_len 決定）
- N_LAYER : Attention 堆疊的層數
- LEARNING_RATE : 學習速率，影響收斂的快慢
- MASK_RATE : 掩碼比例(介於 0 到 0.5 間, 建議值 0.15)

# 程式說明
- 程式採用 tensorflow2 / keras 寫作, 執行前請先安裝 tensorflow 2.0
- 本程式執行時, 請將 utils.py / transformer.py / GPT.py 等三個檔案與執行檔放置於同一目錄下
- 程式來源 : 莫煩Python-BERT:雙向語言模型 https://mofanpy.com/tutorials/machine-learning/nlp/bert/

In [1]:
# [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/pdf/1810.04805.pdf)
import numpy as np
import tensorflow as tf
import utils    # this refers to utils.py in my [repo](https://github.com/MorvanZhou/NLP-Tutorials/)
import time
from GPT import GPT
import os
import pickle


class BERT(GPT):
    """BERT-style model trained with masked-LM and next-sentence objectives.

    All layers, the optimizer (``self.opt``), the loss (``self.cross_entropy``)
    and ``self.call`` are inherited from ``GPT``; this subclass only supplies
    the training step and the attention mask.
    """

    def __init__(self, model_dim, max_len, n_layer, n_head, n_vocab, lr, max_seg=3, drop_rate=0.1, padding_idx=0):
        """Forward every hyper-parameter unchanged to ``GPT.__init__``."""
        super().__init__(model_dim, max_len, n_layer, n_head, n_vocab, lr, max_seg, drop_rate, padding_idx)
        # I think task emb is not necessary for pretraining,
        # because the aim of all tasks is to train a universal sentence embedding
        # the body encoder is the same across all tasks,
        # and different output layer defines different task just like transfer learning.
        # finetuning replaces output layer and leaves the body encoder unchanged.

        # self.task_emb = keras.layers.Embedding(
        #     input_dim=n_task, output_dim=model_dim,  # [n_task, dim]
        #     embeddings_initializer=tf.initializers.RandomNormal(0., 0.01),
        # )

    def step(self, seqs, segs, seqs_, loss_mask, nsp_labels):
        """Run one optimization step and return ``(loss, mlm_logits)``.

        Args:
            seqs: corrupted input token ids fed to the model.
            segs: segment ids fed to the model alongside ``seqs``.
            seqs_: uncorrupted token ids, used as masked-LM targets.
            loss_mask: boolean mask selecting the positions whose
                cross-entropy contributes to the masked-LM loss.
            nsp_labels: targets for the next-sentence-prediction head.

        Returns:
            The combined scalar loss and the masked-LM logits of this batch.
        """
        # Only the forward pass and the loss need to be recorded on the tape.
        with tf.GradientTape() as tape:
            mlm_logits, nsp_logits = self.call(seqs, segs, training=True)
            mlm_loss_batch = tf.boolean_mask(self.cross_entropy(seqs_, mlm_logits), loss_mask)
            mlm_loss = tf.reduce_mean(mlm_loss_batch)
            nsp_loss = tf.reduce_mean(self.cross_entropy(nsp_labels, nsp_logits))
            # The next-sentence loss is down-weighted relative to the masked-LM loss.
            loss = mlm_loss + 0.2 * nsp_loss
        # Differentiate and update outside the tape context so the backward
        # pass and the optimizer ops are not themselves recorded.
        grads = tape.gradient(loss, self.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.trainable_variables))
        return loss, mlm_logits

    def mask(self, seqs):
        """Return a float mask that is 1.0 where ``seqs`` equals the padding id.

        Only padding is masked here, so every non-padding position stays
        visible in both directions.
        """
        mask = tf.cast(tf.math.equal(seqs, self.padding_idx), tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]  # [n, 1, 1, step]


def _get_loss_mask(len_arange, seq, pad_id, mask_rate=None):
    """Pick random positions to predict and build the matching boolean mask.

    Args:
        len_arange: 1-D array of candidate position indices into ``seq``.
        seq: 1-D token-id array; only its shape is used here.
        pad_id: unused; kept so existing positional callers keep working.
            (The mask is always initialised to ``False``; previously a
            non-zero ``pad_id`` produced an all-``True`` mask.)
        mask_rate: fraction of candidate positions to select. When ``None``
            the module-level ``MASK_RATE`` is used.

    Returns:
        ``(loss_mask, rand_id)`` where ``loss_mask`` has shape
        ``(1, len(seq))`` and is ``True`` at the selected positions, and
        ``rand_id`` holds the selected position indices.
    """
    rate = MASK_RATE if mask_rate is None else mask_rate
    n_candidates = len(len_arange)
    # Select at least two positions, but never more than are available:
    # sampling without replacement would raise otherwise.
    n_pick = min(n_candidates, max(2, int(rate * n_candidates)))
    rand_id = np.random.choice(len_arange, size=n_pick, replace=False)
    # The ``np.bool`` alias was removed from NumPy; the builtin ``bool`` is equivalent.
    loss_mask = np.zeros_like(seq, dtype=bool)
    loss_mask[rand_id] = True
    return loss_mask[None, :], rand_id


def do_mask(seq, len_arange, pad_id, mask_id):
    """Overwrite randomly chosen positions of ``seq`` with ``mask_id`` in place.

    The positions are drawn from ``len_arange`` by ``_get_loss_mask``.
    Returns the boolean loss mask marking the overwritten positions.
    """
    selection_mask, chosen = _get_loss_mask(len_arange, seq, pad_id)
    # In-place edit: the caller's array is modified.
    seq[chosen] = mask_id
    return selection_mask


def do_replace(seq, len_arange, pad_id, word_ids):
    """Overwrite randomly chosen positions of ``seq`` with random word ids in place.

    The positions are drawn from ``len_arange`` by ``_get_loss_mask``; each
    one receives an id sampled from ``word_ids``.
    Returns the boolean loss mask marking the overwritten positions.
    """
    selection_mask, chosen = _get_loss_mask(len_arange, seq, pad_id)
    substitutes = np.random.choice(word_ids, size=len(chosen))
    # In-place edit: the caller's array is modified.
    seq[chosen] = substitutes
    return selection_mask


def do_nothing(seq, len_arange, pad_id):
    """Select random positions for the loss but leave ``seq`` untouched.

    Returns the boolean loss mask produced by ``_get_loss_mask``; the chosen
    indices themselves are discarded.
    """
    return _get_loss_mask(len_arange, seq, pad_id)[0]


def random_mask_or_replace(data, arange, batch_size):
    """Sample a batch and corrupt it with one randomly chosen strategy.

    A single random draw decides the strategy for the whole batch:
    below 0.7 the selected tokens become ``<MASK>``, from 0.7 up to 0.85 the
    tokens are left unchanged, and otherwise they are replaced by random
    word ids.

    Args:
        data: dataset object providing ``sample``, ``pad_id``, ``v2i`` and
            ``word_ids``.
        arange: array of position indices that is sliced to build the
            candidate positions of every sequence.
        batch_size: number of examples requested from ``data.sample``.

    Returns:
        ``(seqs, segs, seqs_, loss_mask, xlen, nsp_labels)`` where ``seqs``
        is the corrupted batch, ``seqs_`` is a copy taken before corruption,
        and ``loss_mask`` stacks the per-sequence loss masks along axis 0.
    """
    seqs, segs, xlen, nsp_labels = data.sample(batch_size)
    seqs_ = seqs.copy()  # pristine targets; ``seqs`` is edited in place below
    p = np.random.random()

    if p < 0.7:
        # mask
        def corrupt(row, positions):
            return do_mask(row, positions, data.pad_id, data.v2i["<MASK>"])
    elif p < 0.85:
        # do nothing
        def corrupt(row, positions):
            return do_nothing(row, positions, data.pad_id)
    else:
        # replace
        def corrupt(row, positions):
            return do_replace(row, positions, data.pad_id, data.word_ids)

    per_row_masks = []
    for i in range(len(seqs)):
        first_len = xlen[i, 0]
        total_len = xlen[i].sum()
        # Candidate positions skip index ``first_len``.
        # NOTE(review): presumably that index is the separator token - confirm
        # against the layout produced by ``data.sample``.
        positions = np.concatenate((arange[:first_len], arange[first_len + 1:total_len + 1]))
        per_row_masks.append(corrupt(seqs[i], positions))

    loss_mask = np.concatenate(per_row_masks, axis=0)
    return seqs, segs, seqs_, loss_mask, xlen, nsp_labels


def train(model, data, step=10000, name="bert"):
    """Train ``model`` on batches of 16 examples and save its weights.

    Every 100 steps a report is printed for the first example of the batch:
    elapsed time since the previous report, the loss, the corrupted input,
    the model's prediction, and the target / predicted words at the
    positions selected by the loss mask.

    Args:
        model: object exposing ``step(...)`` and ``save_weights(path)``.
        data: dataset providing ``max_len``, ``i2v``, ``v2i`` and whatever
            ``random_mask_or_replace`` needs.
        step: number of training iterations.
        name: sub-directory of ``./visual/models`` the weights are saved to.
    """
    t0 = time.time()
    arange = np.arange(0, data.max_len)

    def words_at_masked(ids):
        # Multiplying by the mask zeroes unselected positions; they are then
        # dropped by comparing against the <PAD> id.
        return [data.i2v[i] for i in ids * loss_mask[0] if i != data.v2i["<PAD>"]]

    for t in range(step):
        seqs, segs, seqs_, loss_mask, xlen, nsp_labels = random_mask_or_replace(data, arange, 16)
        loss, pred = model.step(seqs, segs, seqs_, loss_mask, nsp_labels)
        if t % 100 != 0:
            continue

        pred = pred[0].numpy().argmax(axis=1)
        t1 = time.time()
        shown_len = xlen[0].sum() + 1
        print(
            "\n\nstep: ", t,
            "| time: %.2f" % (t1 - t0),
            "| loss: %.3f" % loss.numpy(),
            "\n| tgt: ", " ".join([data.i2v[i] for i in seqs[0][:shown_len]]),
            "\n| prd: ", " ".join([data.i2v[i] for i in pred[:shown_len]]),
            "\n| tgt word: ", words_at_masked(seqs_[0]),
            "\n| prd word: ", words_at_masked(pred),
            )
        t0 = t1

    os.makedirs("./visual/models/%s" % name, exist_ok=True)
    model.save_weights("./visual/models/%s/model.ckpt" % name)


def export_attention(model, data, name="bert"):
    """Reload saved weights and pickle attention matrices for visualization.

    Loads the checkpoint written under ``./visual/models/<name>``, runs the
    model once on 32 sampled examples with the training flag off, and writes
    a dict with the source tokens (``"src"``) and ``model.attentions``
    (``"attentions"``) to ``./visual/tmp/<name>_attention_matrix.pkl``.
    """
    model.load_weights("./visual/models/%s/model.ckpt" % name)

    # save attention matrix for visualization
    seqs, segs, xlen, nsp_labels = data.sample(32)
    model.call(seqs, segs, False)
    tokens = [[data.i2v[i] for i in row] for row in seqs]
    payload = {"src": tokens, "attentions": model.attentions}

    path = "./visual/tmp/%s_attention_matrix.pkl" % name
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(payload, f)

In [2]:
if __name__ == "__main__":
    utils.set_soft_gpu(True)

    # Hyper-parameters of this run. MASK_RATE is read as a module-level
    # global by _get_loss_mask.
    MODEL_DIM = 256
    N_LAYER = 4
    LEARNING_RATE = 1e-4
    MASK_RATE = 0.15

    d = utils.MRPCData("./MRPC", 2000)
    print("num word: ", d.num_word)

    m = BERT(
        model_dim=MODEL_DIM,
        max_len=d.max_len,
        n_layer=N_LAYER,
        n_head=4,
        n_vocab=d.num_word,
        lr=LEARNING_RATE,
        max_seg=d.num_seg,
        drop_rate=0.2,
        padding_idx=d.v2i["<PAD>"],
    )

    # Train, then dump attention matrices from the saved checkpoint.
    train(m, d, step=10000, name="bert")
    export_attention(m, d, name="bert")

downloading from https://mofanpy.com/static/files/MRPC/msr_paraphrase_train.txt
completed
downloading from https://mofanpy.com/static/files/MRPC/msr_paraphrase_test.txt
completed
num word:  12880


step:  0 | time: 2.22 | loss: 9.734 
| tgt:  <GO> white house officials say iran <MASK> one last chance to comply <MASK> iaea <MASK> demands . <SEP> a white house spokesman added <MASK> iran had <MASK> one last chance <quote> to comply with its disarmament obligations 
| prd:  lbany crowd lbany change administered cbs.marketwatch.com schoof schoof stewart lbany stewart lbany suggesting quadrupled rated compete blair strom cbs.marketwatch.com lackawanna hills-based lbany six-pack convey six-pack six-pack stewart stewart comes lackawanna schoof six-pack lbany messenger.msn.com blindness leaderboard patrols 
| tgt word:  ['has', 'with', 'inspection', 'that', '<quote>'] 
| prd word:  ['schoof', 'suggesting', 'rated', 'convey', 'stewart']


step:  100 | time: 64.22 | loss: 7.771 
| tgt:  <GO> ent



step:  1400 | time: 69.18 | loss: 6.250 
| tgt:  <GO> the decision was among the most significant steps toward deregulation undertaken during the bush <MASK> . <SEP> the decision <MASK> <MASK> the far-reaching deregulatory actions made <MASK> the bush administration 
| prd:  <GO> the the , the the the the the the the <SEP> the the the the , <SEP> the the the the the the the the the the the the the 
| tgt word:  ['administration', 'is', 'among', 'during'] 
| prd word:  ['the', 'the', 'the', 'the']


step:  1500 | time: 64.61 | loss: 7.060 
| tgt:  <GO> tony blair has taken a hardline <MASK> <MASK> nothing should <MASK> done to lessen the pressure on <MASK> <MASK> the gathering in the capital abuja . <SEP> the prime minister has taken a hardline stance arguing nothing should be <MASK> to lessen the pressure on mugabe 
| prd:  <GO> the a the a a the a the a the a a to a the a on the a the a in the a a the <SEP> the a a a a a a a a a a a a to a the a a the 
| tgt word:  ['stance', 'argui



step:  2900 | time: 65.64 | loss: 5.939 
| tgt:  <GO> thousands of people in the south of england caught a glimpse of fowler lunar eclipse as they gazed up gender-specific the sky on saturday boyfriend . <SEP> thousands rodents people in the south of england caught a glimpse of feith lunar wa as they gazed up at the night sky publicist today 
| prd:  <GO> the of the in the the of the the a the of the the the of the the the the the the on the on , <SEP> the the the in the the of the the a the of the of the , of the the at the the the the the 
| tgt word:  ['a', 'at', 'night', 'of', 'the', 'eclipse', 'early'] 
| prd word:  ['the', 'the', 'on', 'the', 'the', 'the', 'the']


step:  3000 | time: 65.65 | loss: 5.787 
| tgt:  <GO> sco has also alleged that <MASK> generic linux kernel contains code <MASK> is from its unix property . <SEP> sco claims that the linux kernel <MASK> unix <MASK> property owned by sco 
| prd:  <GO> he has the that that the that that that <SEP> that that is <SEP> <S



step:  4200 | time: 66.46 | loss: 5.563 
| tgt:  <GO> the us federal trade <MASK> has also filed a lawsuit challenging rambus . <SEP> still pending against <MASK> <MASK> a lawsuit <MASK> by the federal trade commission 
| prd:  <GO> the company in to the has to on a the the the . <SEP> the is against the the a lawsuit the by the on company the 
| tgt word:  ['commission', 'rambus', 'is', 'brought'] 
| prd word:  ['the', 'the', 'the', 'the']


step:  4300 | time: 66.50 | loss: 5.002 
| tgt:  <GO> there is , however <MASK> no photo of peter hollingworth in the june issue examined <MASK> <MASK> herald yesterday . <SEP> there is , <MASK> , no photograph of dr hollingworth in the june <MASK> of the magazine examined by the age <MASK> 
| prd:  <GO> there is , the , is said of the june in the <NUM> <NUM> in by the in , . <SEP> there is , the , is first of the in in the june <NUM> of the first said by the first in 
| tgt word:  [',', 'by', 'the', 'however', 'issue', 'yesterday'] 
| prd word:



step:  5600 | time: 67.03 | loss: 3.266 
| tgt:  <GO> defense secretary donald h. rumsfeld and others argued that saddam hussein possessed chemical and biological weapons and was hiding them . <SEP> defense secretary donald h. rumsfeld and others argued that saddam hussein ( news - web sites ) possessed chemical , biological and other weapons and was hiding them 
| prd:  <GO> he , , and and and the and that and hussein and and and and and and was and . . <SEP> there , and there and and and and that and hussein ( news - and and ) and there , and and other and and was and . 
| tgt word:  ['chemical', 'biological', 'weapons', 'others', ')', ',', 'biological'] 
| prd word:  ['and', 'and', 'and', 'and', ')', ',', 'and']


step:  5700 | time: 67.08 | loss: 2.480 
| tgt:  <GO> the two men were allegedly trying to engage russian exiles in britain in the assassination plot . <SEP> the informant alleged that the two arrested men were trying to engage russian exiles in britain in the conspiracy



step:  6800 | time: 66.01 | loss: 4.173 
| tgt:  <GO> shares gordon microsoft fell <NUM> cent to close at $ <NUM> ceremonies the nasdaq metal market . <SEP> microsoft shares ( msft : news , chart , profile ) fell <NUM> cent to basis at $ above 
| prd:  <GO> shares of microsoft fell <NUM> cent to rose at $ <NUM> percent the nasdaq stock stock . <SEP> microsoft shares ( <NUM> : news , percent , ( ) fell <NUM> cents to <NUM> at $ <NUM> 
| tgt word:  ['of', 'on', 'stock', 'close', '<NUM>'] 
| prd word:  ['of', 'percent', 'stock', '<NUM>', '<NUM>']


step:  6900 | time: 66.69 | loss: 3.869 
| tgt:  <GO> however , scientists led by physicist frank mcdonald of the <MASK> <MASK> maryland disagree . <SEP> it 's <MASK> a matter of time , said frank <MASK> , of the university of maryland 
| prd:  <GO> the , charged this by university frank university of the university of frank suspect . <SEP> it 's , a matter of time , said frank of , of the university of frank 
| tgt word:  ['university', 'of'



step:  8100 | time: 76.26 | loss: 4.338 
| tgt:  <GO> in next week 's drill presided <quote> our objective is to improve the nation resources capacity to platforms lives in ... a terrorist gate , <quote> ridge said . <SEP> <quote> our objective is to improve the nation killing capacity to burning lives in ... a terrorist event , <quote> including use of cbs of mass destruction successor ruled said 
| prd:  <GO> in the week 's our , <quote> our need is to improve the new 's to to our lives in a a statement group , <quote> ridge said . <SEP> <quote> our lives is to be the nation , our to be , in our a terrorist of , <quote> including our of new of our statement , rights said 
| tgt word:  [',', "'s", 'save', 'event', "'s", 'save', 'weapons', ',', 'ridge'] 
| prd word:  [',', "'s", 'our', 'group', ',', 'be', 'new', ',', 'rights']


step:  8200 | time: 71.89 | loss: 3.519 
| tgt:  <GO> <MASK> talked with the boy <MASK> about an hour <MASK> a half , bragdon said . <SEP> negotiators talked



step:  9500 | time: 70.90 | loss: 2.737 
| tgt:  <GO> the dow jones industrial average ended the <MASK> down <NUM> at <NUM> , <MASK> <MASK> <NUM> wednesday . <SEP> the <MASK> jones industrial average fell <NUM> points , <MASK> <NUM> percent , to <NUM> 
| prd:  <GO> the dow jones industrial average ended the dow down <NUM> at <NUM> , or at <NUM> on . <SEP> the dow jones industrial average fell <NUM> points , or <NUM> percent , to <NUM> 
| tgt word:  ['day', 'after', 'advancing', 'dow', 'or'] 
| prd word:  ['dow', 'or', 'at', 'dow', 'or']


step:  9600 | time: 71.09 | loss: 2.738 
| tgt:  <MASK> a <MASK> psa test has to be followed up <MASK> a biopsy or other procedures before cancer can be confirmed . <SEP> before confirming <MASK> diagnosis of cancer , a positive <MASK> test must be followed up <MASK> a biopsy or other procedures 
| prd:  <GO> a psa psa psa has to be up up to a psa or other cancer before cancer can be cancer . <SEP> before psa a psa of cancer , a life psa first would

In [2]:
if __name__ == "__main__":
    utils.set_soft_gpu(True)

    # Hyper-parameters of this run. MASK_RATE is read as a module-level
    # global by _get_loss_mask.
    MODEL_DIM = 128
    N_LAYER = 4
    LEARNING_RATE = 1e-4
    MASK_RATE = 0.15

    d = utils.MRPCData("./MRPC", 2000)
    print("num word: ", d.num_word)

    m = BERT(
        model_dim=MODEL_DIM,
        max_len=d.max_len,
        n_layer=N_LAYER,
        n_head=4,
        n_vocab=d.num_word,
        lr=LEARNING_RATE,
        max_seg=d.num_seg,
        drop_rate=0.2,
        padding_idx=d.v2i["<PAD>"],
    )

    # Train, then dump attention matrices from the saved checkpoint.
    train(m, d, step=10000, name="bert")
    export_attention(m, d, name="bert")

num word:  12880


step:  0 | time: 1.40 | loss: 9.648 
| tgt:  <GO> but close wondered <MASK> the package would be <MASK> the cost of licensing the third-party software , along with salesforce.com 's rental price . <MASK> close also questions whether <MASK> would be worth the cost <MASK> licensing third-party software , along with salesforce.com 's <MASK> price 
| prd:  4th serving embedded neiman embedded leslie serving outperformed outperformed embedded scourge rallied serving cam serving flagship boy scourge hynix serving vail scourge scourge scourge cucamonga neiman physically schematic boy lpga insider unwittingly scourge sbc flagship embedded flagship developmentally schaumburg boy sbc gansas island-wide cavalry cucamonga sprayregen plunder 
| tgt word:  ['whether', 'worth', '<SEP>', 'it', 'of', 'rental'] 
| prd word:  ['embedded', 'embedded', 'neiman', 'insider', 'flagship', 'sprayregen']


step:  100 | time: 40.04 | loss: 8.511 
| tgt:  <GO> <MASK> left the ship after the <MAS



step:  1400 | time: 39.80 | loss: 6.926 
| tgt:  <GO> <quote> <MASK> is a benign web where <MASK> hope to <MASK> investors and where each of us can <MASK> our enlightened <MASK> interest through cooperation with others . <quote> <SEP> it is a benign web <MASK> <MASK> hope to catch investors <MASK> where each of us can advance our enlightened self-interest through cooperation with <MASK> , ' said mr goh 
| prd:  <GO> the , , , , the , , the , , the , , the <SEP> <SEP> <SEP> the the <SEP> <SEP> <SEP> <SEP> <SEP> <SEP> <SEP> <SEP> <SEP> <SEP> the , the the , , the the , the the the , the the the the the the the the the the the the , the the , the 
| tgt word:  ['it', 'we', 'catch', 'advance', 'self', 'where', 'we', 'and', 'others'] 
| prd word:  [',', ',', ',', 'the', '<SEP>', ',', 'the', 'the', 'the']


step:  1500 | time: 39.78 | loss: 6.645 
| tgt:  <GO> meanwhile <MASK> rival contender , general electric <MASK> nbc , submitted a letter of interest <MASK> <MASK> source <MASK> <MASK> 



step:  2700 | time: 40.34 | loss: 6.123 
| tgt:  <MASK> germany 's foreign ministry said it believed <MASK> passengers were from the northern states of lower <MASK> and schleswig-holstein , but <MASK> no further details . <SEP> germany said most of the passengers were from the northern states of lower saxony <MASK> <MASK> 
| prd:  <GO> the , of the the the the the the the the the of , of the the the the , the the , the the the <SEP> the the the of the the the the the the the of , , of , 
| tgt word:  ['<GO>', 'the', 'saxony', 'had', 'and', 'schleswig-holstein'] 
| prd word:  ['<GO>', 'the', 'the', 'the', 'of', ',']


step:  2800 | time: 42.46 | loss: 6.337 
| tgt:  <GO> the delegates said <MASK> and distributing funds has <MASK> complicated by <MASK> u.s. <MASK> on jihadi charitable <MASK> , bank accounts of terror-related organizations and <MASK> transfers . <SEP> bin laden â s men <MASK> out that raising and distributing funds has been <MASK> by the u.s. crackdown on jihadi chari



step:  4000 | time: 39.65 | loss: 5.869 
| tgt:  <GO> <quote> it <MASK> probably not the easiest time to take over the shuttle program <MASK> <quote> <MASK> <MASK> , <MASK> but i look forward <MASK> the <MASK> . <SEP> <quote> it is probably not the easiest time to come in and take <MASK> the shuttle program , but then again , i look forward to the <MASK> , <quote> he said 
| prd:  <GO> <quote> it , , , the the the to to , the <quote> <quote> , <quote> <quote> <quote> , , the <quote> <quote> <quote> said the said <SEP> <SEP> <quote> <quote> , <quote> , the <quote> , to <quote> in and to , the said <quote> , the <quote> , , <quote> <quote> <quote> to the <quote> , <quote> <quote> said 
| tgt word:  ["'s", ',', 'he', 'added', '<quote>', 'to', 'challenge', 'over', 'challenge'] 
| prd word:  [',', ',', '<quote>', '<quote>', ',', 'said', 'said', ',', '<quote>']


step:  4100 | time: 39.63 | loss: 6.150 
| tgt:  <GO> burns <MASK> <MASK> confessing a crime <MASK> did not commit was the only 



step:  5400 | time: 39.65 | loss: 6.109 
| tgt:  <GO> those in their twenties who ejaculated more than five times a adequate atop then-president less likely to develop aggressive prostate cancer later in life , they say . <SEP> those who cashing more than recipients times a week were a third chechen likely to develop serious prostate cancer in later self-interest 
| prd:  <GO> a in a a a a a a a a a a a a a a to a a a a , in a , a a in <SEP> a a a a <NUM> a a a a were a a a a to a a a of in a a 
| tgt word:  ['week', 'were', 'one-third', 'ejaculated', 'five', 'less', 'life'] 
| prd word:  ['a', 'a', 'a', 'a', 'a', 'a', 'a']


step:  5500 | time: 39.71 | loss: 5.877 
| tgt:  <GO> prince <MASK> said , <MASK> the kingdom <MASK> saudi arabia has been wrongfully and morbidly accused of <MASK> in the tragic terrorist attacks of <MASK> <NUM> , <NUM> . <quote> <SEP> <quote> the kingdom of saudi <MASK> <MASK> <MASK> wrongfully and morbidly accused of complicity in the tragic terrorist attacks



step:  6700 | time: 43.56 | loss: 5.446 
| tgt:  <GO> <MASK> <MASK> televised interview on wednesday , <MASK> president wim <MASK> said <MASK> was too soon <MASK> discuss further interest rate cuts in the <NUM>-nation euro zone . <SEP> european central bank president wim duisenberg said in <MASK> televised interview that it was too soon to discuss <MASK> interest rate cuts in the euro zone 
| prd:  <GO> the a a a on a , a a a a said a was a as a a in a a a in the a a a . <SEP> a in a a a a said in a a a that it was a as to a a a a as in the a in 
| tgt word:  ['in', 'a', 'ecb', 'duisenberg', 'it', 'to', 'a', 'further'] 
| prd word:  ['the', 'a', 'a', 'a', 'a', 'a', 'a', 'a']


step:  6800 | time: 40.19 | loss: 3.525 
| tgt:  <GO> al qaeda , the terror network led by saudi-born osama bin laden and blamed for the sept . <NUM> , <NUM> , attacks , has been linked to both cases . <SEP> al-qaida , the terror network led by saudi-born bin laden and blamed for the sept . <NUM> , <NUM> attack



step:  8000 | time: 40.88 | loss: 5.096 
| tgt:  <GO> she said the store ordered <NUM> copies and reserved another <MASK> . <SEP> in one johannesburg store , <NUM> copies <MASK> <MASK> ordered 
| prd:  <GO> she said the are was <NUM> index and points <NUM> <NUM> . <SEP> in the <NUM> <NUM> , <NUM> <NUM> <NUM> <NUM> was 
| tgt word:  ['<NUM>', 'have', 'been'] 
| prd word:  ['<NUM>', '<NUM>', '<NUM>']


step:  8100 | time: 39.85 | loss: 5.794 
| tgt:  <GO> the preceded nasdaq composite index .ixic was off <NUM> restatement , or <NUM> percent jacques at <NUM> . <SEP> the broader standard discussing poor 's <NUM> index morrison up <NUM> points , or <NUM> percent , at <NUM> 
| prd:  <GO> the broader index index index <NUM> was , <NUM> points , or <NUM> percent , at <NUM> . <SEP> the nasdaq , , , 's <NUM> index , , <NUM> points , or <NUM> percent , at <NUM> 
| tgt word:  ['technology-laced', 'points', ',', '&', 'edged'] 
| prd word:  ['broader', 'points', ',', ',', ',']


step:  8200 | time



step:  9200 | time: 42.18 | loss: 4.865 
| tgt:  <MASK> <MASK> , <MASK> tropical storm henri doused an already soaked florida friday , <MASK> powerful storms which further tested already-swollen lakes and rivers . <SEP> slow-moving , drenching tropical storm henri doused an already soaked florida on friday , pushing heavy rains into areas <MASK> lakes and <MASK> were full <MASK> overflowing 
| prd:  <GO> also , also tropical are were are an are and and friday , , are and , and and and and and . . <SEP> doused , also and are doused are an are be for on to , and and and and are and and and and were <SEP> and international 
| tgt word:  ['<GO>', 'slow-moving', 'drenching', 'bringing', 'where', 'rivers', 'to'] 
| prd word:  ['<GO>', 'also', 'also', ',', 'and', 'and', 'and']


step:  9300 | time: 40.77 | loss: 5.298 
| tgt:  <GO> <quote> we 're still confident that <MASK> will get the lion 's share of union support <MASK> <quote> <SEP> <MASK> or not <MASK> get <MASK> the two-thirds , we <

將 MODEL_DIM 從 256 調為 128 後，loss差異不大，但訓練時間下降許多，對於此資料集，MODEL_DIM 應該用 128即可