In [49]:
import torch
import torch.nn as nn
import torch.nn.functional as F

from collections import namedtuple

基于PyTorch实现Seq2Seq + Attention的英汉Neural Machine Translation

<!-- from collections import namedtuple
# 命名元组对象student_info
student_info = namedtuple('stud_info','name, id, gender, age, score')
# 使用student_info对象对studinf进行赋值
studinf = student_info(name = 'xiaowang', id = '00001', gender = 'male', age = 22, score = 99)
print("name:{}, id:{}, gender:{}, age:{}, score:{}".format(studinf[0],studinf[1],studinf[2],studinf[3],studinf[4])) -->

Pack_padded_sequence

在编码句子向量的时候，通常会遇到 pack_padded_sequence 这样的函数，搞得一头雾水，重点理解下pack_padded_sequence 和 pad_packed_sequence 函数,他们是一对反函数

实际含有函数pad_sequence + pack_padded_sequence = pack_sequence， 最后使用pad_packed_sequence 进行还原。

为什么要填充：

在进行mini-batch 喂入模型数据的时候要保证mini-batch中句子长度一致，但是对于较短的句子，就需要使用特定的字符进行填充到统一的句子长度。但是我们不希望其填充的pad数据（一般为0）进入GRU或是LSTM模块，一是浪费资源，二是可能造成句子表征不准确。所以pack_padded_sequence 类应运而生。主要是对填充过的数据进行压缩。

In [50]:
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])
class Encoder(nn.Module):
    def __init__(self,vocab_size,embed_size,enc_hidden_size,dec_hidden_size,dropout=0.2):
        super(Encoder,self).__init__()
        self.embed = nn.Embedding(vocab_size,embed_size)

        self.rnn = nn.GRU(embed_size,enc_hidden_size,batch_first=True,bidirectional=True)
        self.dropout = nn.Dropout(dropout)
        # 将encoder的输出转为decoder的输入，* 2 是使用了bidirectional
        self.fc = nn.Linear(enc_hidden_size*2, dec_hidden_size)

    def forward(self,x,lengths):
        embedded = self.dropout(self.embed(x))

        # 新版pytorch增加了batch里的排序功能，默认需要强制倒序
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded,lengths,batch_first=True)
        # hid 【2, batch, enc_hidden_size】
        packed_out, hid = self.rnn(packed_embedded)
        # 【batch, seq, 2 * enc_hidden_size】
        out,_ = nn.utils.rnn.pad_packed_sequence(packed_out,batch_first=True,total_length=max(lengths))

        # 将hid双向叠加 【batch, 2*enc_hidden_size】
        hid = torch.cat([hid[-2],hid[-1]],dim=1)
        # 转为decoder输入hidden state 【1,batch,dec_hidden_size】
        hid = torch.tanh(self.fc(hid)).unsqueeze(0)

        return out,hid


class Attention(nn.Module):
    """  """
    def __init__(self,enc_hidden_size,dec_hidden_size):
        super(Attention,self).__init__()

        self.enc_hidden_size = enc_hidden_size
        self.dec_hidden_size = dec_hidden_size

        self.liner_in = nn.Linear(2*enc_hidden_size,dec_hidden_size)
        self.liner_out = nn.Linear(2*enc_hidden_size+dec_hidden_size,dec_hidden_size)

    def forward(self,output,context,mask):
        # context 上下文输出，即encoder的gru hidden state 【batch,enc_seq,enc_hidden*2】
        # output  decoder的gru hidden state  【batch,dec_seq, dec_hidden】
        # mask 【batch, dec_seq, enc_seq】mask在decoder中创建

        batch_size = context.shape[0]
        enc_seq = context.shape[1]
        dec_seq = output.shape[1]

        # score计算公式使用双线性模型 h*w*s
        context_in = self.liner_in(context.reshape(batch_size*enc_seq,-1).contiguous())
        context_in = context_in.view(batch_size,enc_seq,-1).contiguous()
        atten = torch.bmm(output,context_in.transpose(1,2))
        # 【batch,dec_seq,enc_seq】

        atten.data.masked_fill(mask,-1e6)  # mask置零
        atten = F.softmax(atten,dim=2)

        # 将score和value加权求和，得到输出
        # 【batch, dec_seq, 2*enc_hidden】
        context = torch.bmm(atten,context)
        # 将attention + output 堆叠获取融合信息
        output = torch.cat((context,output),dim=2)

        # 最终输出 batch,dec_seq,dec_hidden_size
        output = torch.tanh(self.liner_out(output.view(batch_size*dec_seq,-1))).view(batch_size,dec_seq,-1)

        return output,atten


class Decoder(nn.Module):
    """"""
    def __init__(self,vocab_size,embedded_size,enc_hidden_size,dec_hidden_size,dropout=0.2):
        super(Decoder,self).__init__()
        self.embed = nn.Embedding(vocab_size,embedded_size)
        self.atten = Attention(enc_hidden_size,dec_hidden_size)
        # decoder不使用bidirectional
        self.rnn = nn.GRU(embedded_size,dec_hidden_size,batch_first=True)
        self.out = nn.Linear(dec_hidden_size,vocab_size)
        self.dropout = nn.Dropout(dropout)

    def create_mask(self,x_len,y_len):
        # 最长句子的长度
        max_x_len = x_len.max()
        max_y_len = y_len.max()
        # 句子batch
        batch_size = len(x_len)

        # 将超出自身序列长度的元素设为False
        x_mask = (torch.arange(max_x_len.item())[None, :] < x_len[:, None]).float()  # [batch,max_x_len]
        y_mask = (torch.arange(max_y_len.item())[None, :] < y_len[:, None]).float()  # [batch,max_y_len]

        # y_mask[:, :, None] size: [batch,max_y_len,1]
        # x_mask[:, None, :] size:  [batch,1,max_x_len]
        # 需要mask的地方设置为true
        mask = (1 - y_mask[:, :, None] * x_mask[:, None, :]) != 0

        # [batch_size, max_y_len, max_x_len]
        return mask

    def forward(self,ctx,ctx_lengths,y,y_lengths,hid):
        '''
        :param ctx:encoder层的输出 ： 【batch, enc_seq, 2*enc_hidden】
        :param ctx_lengths: encoder层输入句子的长度list
        :param y: decoder层的输入 【batch, dec_seq, dec_hidden】
        :param y_lengths: decoder输入的句子长度
        :param hid: encoder层输出的最后一个hidden state 【1, batch, dec_hidden】
        :return:
        '''
        y_embed = self.dropout(self.embed(y))
        # 这里没法保证译文也是排倒序
        y_packed = nn.utils.rnn.pack_padded_sequence(y_embed,y_lengths,batch_first=True,enforce_sorted=False)
        # 将emcoder的hidden state作为decoder的第一个hidden state
        pack_output, hid = self.rnn(y_packed,hid)
        output_seq,_ = nn.utils.rnn.pad_packed_sequence(pack_output,batch_first=True,total_length=max(y_lengths))

        # 做attention之前需要创建mask
        mask = self.create_mask(ctx_lengths,y_lengths)
        # annention处理
        output,atten = self.atten(output_seq,ctx,mask)
        # 将输出转为vocab_size的softmax概率分布并取对数
        output = F.log_softmax(self.out(output),dim=-1)

        return output,atten,hid

In [51]:
class seq2seq(nn.Module):
    '''
        模型架构
    '''
    def __init__(self,encoder,decoder):
        super(seq2seq,self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self,x,x_lengths,y,y_lengths):
        context,hid = self.encoder(x,x_lengths)

        output,atten,hid = self.decoder(
            context,x_lengths,
            y,y_lengths,
            hid
        )
        # output: 【batch,output_len,vocab_size】
        # atten   【batch,output_len,input_len】
        return output,atten

    def beam_search(self,x,x_lengths,y,EOS_id,topk=5,max_length=100):
        encoder_out,hid = self.encoder(x,x_lengths)

        # batch_size = x.shape[0]
        # preds = []
        # attens = []
        # for i in range(max_length):
        #
        #     output,atten,hid = self.decoder(
        #         encoder_out,x_lengths,
        #         y,torch.ones(batch_size).long().to(y.device),
        #         hid
        #     )
        #     # 取出预测概率最大index
        #     y = output.argmax(2).view(batch_size,1)
        #     preds.append(y)
        #     attens.append(atten)

        BOS_id = y[0][0].item()
        hypotheses = [[BOS_id]]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=y.device)
        completed_hypotheses = []
        t = 0
        while len(completed_hypotheses) < topk and t < max_length:
            t+=1
            hyp_num = len(hypotheses)
            # 扩展成batch
            exp_src_encodings = encoder_out.expand(hyp_num,encoder_out.shape[1],encoder_out.shape[2])
            exp_x_lengths = x_lengths.expand(hyp_num)
            exp_hid = hid.expand(hid.shape[0],hyp_num,hid.shape[2])
            output_t,atten_t,exp_hid = self.decoder(
                exp_src_encodings,exp_x_lengths,
                torch.tensor(hypotheses).long().to(y.device),torch.ones(hyp_num).long().to(y.device) * t,
                exp_hid
            )
            live_hyp_num = topk - len(completed_hypotheses)

            # 这里把num * vocab 展开来方便取topk
            contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand(hyp_num,output_t.shape[-1]) + output_t[:,-1,:].squeeze(1)).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores,k=live_hyp_num)

            # 标记当前概率最大的k个，其是跟在哪个单词的后面
            prev_hyp_ids = top_cand_hyp_pos // (output_t.shape[-1])
            hyp_word_ids = top_cand_hyp_pos % (output_t.shape[-1])

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                # 将当前最大概率的k个，拼接在正确的prev单词后面
                new_hyp_sent =  hypotheses[prev_hyp_id]  + [hyp_word_id]
                if hyp_word_id == EOS_id:
                    # 搜寻终止
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == topk:
                break

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=y.device)

        # 若搜寻了max_len后还没有一个到达EOS则取第一个
        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))
        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

class LanguageModelCriterion(nn.Module):
    def __init__(self):
        '''损失函数'''
        super(LanguageModelCriterion,self).__init__()

    def forward(self,inuptY,target,mask):
        # inputY batch,seq_len, vocab_size
        # target/mask: batch, seq_len
        inuptY = inuptY.contiguous().view(-1,inuptY.shape[2])
        target = target.contiguous().view(-1,1)
        mask = mask.contiguous().view(-1,1)
        # 模型seq2seq的输出已经经过log-softmax了，只需将target对应index值收集后在mask
        output = -inuptY.gather(1,target) * mask
        return torch.sum(output) / torch.sum(mask)

In [52]:
import random
import numpy as np
import pkuseg
import nltk
from nltk.translate.bleu_score import corpus_bleu
import argparse
import os

from tqdm import trange,tqdm
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
try:
    from torch.utils.tensorboard import SummaryWriter
except ImportError:
    from tensorboardX import SummaryWriter

from transformers import AdamW,get_linear_schedule_with_warmup

In [53]:
# !pip install pkuseg

<!-- ahshare, -->

In [54]:
# from model import Encoder,Attention,Decoder,seq2seq,LanguageModelCriterion

def setseed():
    random.seed(2020)
    np.random.seed(2020)
    torch.manual_seed(2020)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(2020)

def load_file(path,tgt_add_bos=True):
    en = []
    cn = []
    seg = pkuseg.pkuseg()
    with open(path,'r') as f:
        for line in f.readlines():
            line = line.strip().split('\t')
            en.append(["BOS"] + nltk.word_tokenize(line[0].lower()) + ["EOS"])
            # test时tgt不加开头结束，用于BLEU计算
            if tgt_add_bos:
                cn.append(["BOS"] + seg.cut(line[1]) + ["EOS"])
            else:
                cn.append(seg.cut(line[1]))

    return en,cn

def build_tokenizer(sentences,args):
    word_count = Counter()
    for sen in sentences:
        for word in sen:
            word_count[word] += 1
    ls = word_count.most_common(args.max_vocab_size)
    word2idx = {word:idx+2 for idx,(word,_) in enumerate(ls)}
    word2idx['UNK'] = args.UNK_IDX
    word2idx['PAD'] = args.PAD_IDX

    id2word = {v:k for k,v in word2idx.items()}
    total_vocab = len(ls) + 2

    return word2idx,id2word,total_vocab

def tokenize2num(en_sentences,cn_sentences,en_word2idx,cn_word2idx, sort_reverse = True):
    length = len(en_sentences)

    out_en_sents = [[en_word2idx.get(word,1) for word in sen] for sen in en_sentences]
    out_cn_sents = [[cn_word2idx.get(word, 1) for word in sen] for sen in cn_sentences]

    def sort_sents(sents):
        return sorted(range(len(sents)),key = lambda x : len(sents[x]),reverse=True)
    if sort_reverse:
        sorted_index =  sort_sents(out_en_sents)
        out_en_sents = [out_en_sents[idx] for idx in sorted_index]
        out_cn_sents = [out_cn_sents[idx] for idx in sorted_index]

    return out_en_sents,out_cn_sents

class Tokenizer(object):
    def __init__(self,word2idx,id2word,vocab_size):
        self.word2idx = word2idx
        self.id2word = id2word
        self.vocab_size = vocab_size

In [55]:
class DataProcessor(object):
    def __init__(self,args):
#         data_save
        if not os.path.exists(args.data_save):os.makedirs(args.data_save)
        
        cached_en_tokenizer = os.path.join(args.data_save,"cached_{}".format("en_tokenizer"))
        cached_cn_tokenizer = os.path.join(args.data_save, "cached_{}".format("cn_tokenizer"))

        if not os.path.exists(cached_en_tokenizer) or not os.path.exists(cached_cn_tokenizer):
            
            en_sents, cn_sents = load_file(args.data_dir + "train.txt")
            en_word2idx, en_id2word, en_vocab_size = build_tokenizer(en_sents,args)
            cn_word2idx, cn_id2word, cn_vocab_size = build_tokenizer(cn_sents, args)

            torch.save([en_word2idx, en_id2word, en_vocab_size],cached_en_tokenizer)
            torch.save([cn_word2idx, cn_id2word, cn_vocab_size],cached_cn_tokenizer)
        else:
            en_word2idx, en_id2word, en_vocab_size = torch.load(cached_en_tokenizer)
            cn_word2idx, cn_id2word, cn_vocab_size = torch.load(cached_cn_tokenizer)

        self.en_tokenizer = Tokenizer(en_word2idx, en_id2word, en_vocab_size)
        self.cn_tokenizer = Tokenizer(cn_word2idx, cn_id2word, cn_vocab_size)

    def get_train_examples(self,args):
        return self._create_examples(os.path.join(args.data_dir,"train.txt"),"train",args)


    def get_dev_examples(self,args):
        return self._create_examples(os.path.join(args.data_dir,"dev.txt"),"dev",args)

    def _create_examples(self,path,set_type,args):
        en_sents,cn_sents = load_file(path)
        out_en_sents,out_cn_sents = tokenize2num(en_sents,cn_sents,
                                                 self.en_tokenizer.word2idx,self.cn_tokenizer.word2idx)
        minibatches = getminibatches(len(out_en_sents),args.batch_size)

        all_examples = []
        for minibatch in minibatches:
            mb_en_sentences = [out_en_sents[i] for i in minibatch]
            mb_cn_sentences = [out_cn_sents[i] for i in minibatch]

            mb_x,mb_x_len = prepare_data(mb_en_sentences)
            mb_y,mb_y_len = prepare_data(mb_cn_sentences)

            all_examples.append((mb_x,mb_x_len,mb_y,mb_y_len))

        return all_examples

def prepare_data(seqs):
    # 处理每个batch句子（一个batch中句子长度可能不一致，需要pad）
    batch_size = len(seqs)
    lengthes = [len(seq) for seq in seqs]  # 每个句子的长度列表

    max_length = max(lengthes)  # 句子最大长度
    # 初始化句子矩阵都为0
    x = np.zeros((batch_size, max_length)).astype("int32")
    for idx in range(batch_size):
        # 按行将每行句子赋值进去
        x[idx, :lengthes[idx]] = seqs[idx]

    x_lengths = np.array(lengthes).astype("int32")
    return x, x_lengths

def getminibatches(n,batch_size,shuffle=True):
    minibatches = np.arange(0,n,batch_size)
    if shuffle:
        np.random.shuffle(minibatches)

    result = []
    for idx in minibatches:
        result.append(np.arange(idx,min(n,idx+batch_size)))
    return result


def train(args,model, data,loss_fn,eval_data):
    LOG_FILE = "translation_model.log"
    tb_writer = SummaryWriter('./runs')

    t_total = args.num_epoch * len(data)
    optimizer = AdamW(model.parameters(), lr=args.learnning_rate, eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=args.warmup_steps,
                                                num_training_steps=t_total)
    global_step = 0
    total_num_words = total_loss = 0.
    logg_loss = 0.
    logg_num_words = 0.
    val_losses = []
    train_iterator = trange(args.num_epoch,desc='epoch')
    for epoch in train_iterator:
        model.train()
        epoch_iteration = tqdm(data, desc='iteration')
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(epoch_iteration):
            # （英文batch，英文长度，中文batch，中文长度）
            mb_x = torch.from_numpy(mb_x).to(args.device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(args.device).long()
            # 前n-1个单词作为输入，后n-1个单词作为输出，因为输入的前一个单词要预测后一个单词
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(args.device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(args.device).long()
            mb_y_len = torch.from_numpy(mb_y_len - 1).to(args.device).long()
            # 输入输出的长度都减一。
            mb_y_len[mb_y_len <= 0] = 1#?

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)
            mb_out_mask = torch.arange(mb_y_len.max().item(), device=args.device)[None, :] < mb_y_len[:, None]
            # batch,seq_len . 其中每行长度超过自身句子长度的为false
            mb_out_mask = mb_out_mask.float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)
            # 损失函数

            # 更新模型
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.GRAD_CLIP)
            # 为了防止梯度过大，设置梯度的阈值
            optimizer.step()
            scheduler.step()

            global_step += 1
            num_words = torch.sum(mb_y_len).item()
            # 一个batch里多少个单词
            total_loss += loss.item() * num_words
            # 总损失，loss计算的是均值损失，每个单词都是都有损失，所以乘以单词数
            total_num_words += num_words
            # 总单词数

            if (it+1) % 100 == 0:
                loss_scalar = (total_loss - logg_loss) / (total_num_words-logg_num_words)
                logg_num_words = total_num_words
                logg_loss = total_loss

                with open(LOG_FILE, "a") as fout:
                    fout.write("epoch: {}, iter: {}, loss: {},learn_rate: {}\n".format(epoch, it, loss_scalar,
                                                                                       scheduler.get_lr()[0]))
                print("epoch: {}, iter: {}, loss: {}, learning_rate: {}".format(epoch, it, loss_scalar,
                                                                                scheduler.get_lr()[0]))
                tb_writer.add_scalar("learning_rate", scheduler.get_lr()[0], global_step)
                tb_writer.add_scalar("loss", loss_scalar, global_step)

        print("Epoch", epoch, "Training loss", total_loss / total_num_words)
        eval_loss = evaluate(args, model, eval_data, loss_fn)  # 评估模型
        with open(LOG_FILE, "a") as fout:
            fout.write("===========" * 20)
            fout.write("EVALUATE: epoch: {}, loss: {}\n".format(epoch, eval_loss))
        if len(val_losses) == 0 or eval_loss < min(val_losses):
            # 如果比之前的loss要小，就保存模型
            print("best model, val loss: ", eval_loss)
            torch.save(model.state_dict(), "translate-best.th")
        val_losses.append(eval_loss)


def evaluate(args,model, data,loss_fn):
    model.eval()
    total_num_words = total_loss = 0.
    eval_iteration = tqdm(data, desc='eval iteration')
    with torch.no_grad():#不需要更新模型，不需要梯度
        for it, (mb_x, mb_x_len, mb_y, mb_y_len) in enumerate(eval_iteration):
            mb_x = torch.from_numpy(mb_x).to(args.device).long()
            mb_x_len = torch.from_numpy(mb_x_len).to(args.device).long()
            mb_input = torch.from_numpy(mb_y[:, :-1]).to(args.device).long()
            mb_output = torch.from_numpy(mb_y[:, 1:]).to(args.device).long()
            mb_y_len = torch.from_numpy(mb_y_len-1).to(args.device).long()
            mb_y_len[mb_y_len<=0] = 1

            mb_pred, attn = model(mb_x, mb_x_len, mb_input, mb_y_len)

            mb_out_mask = torch.arange(mb_y_len.max().item(), device=args.device)[None, :] < mb_y_len[:, None]
            mb_out_mask = mb_out_mask.float()

            loss = loss_fn(mb_pred, mb_output, mb_out_mask)

            num_words = torch.sum(mb_y_len).item()
            total_loss += loss.item() * num_words
            total_num_words += num_words
    print("Evaluation loss", total_loss/total_num_words)
    return total_loss/total_num_words

def test(args,model,processor):
    model.eval()
    en_sents, cn_sents = load_file(args.data_dir+'test.txt',tgt_add_bos=False)
    en_sents, _ = tokenize2num(en_sents, cn_sents,
                               processor.en_tokenizer.word2idx,
                               processor.cn_tokenizer.word2idx,
                               sort_reverse=False)

    top_hypotheses = []
    test_iteration = tqdm(en_sents, desc='test bleu')
    with torch.no_grad():
        for idx, en_sent in enumerate(test_iteration):
            mb_x = torch.from_numpy(np.array(en_sent).reshape(1, -1)).long().to(args.device)
            mb_x_len = torch.from_numpy(np.array([len(en_sent)])).long().to(args.device)
            bos = torch.Tensor([[processor.cn_tokenizer.word2idx['BOS']]]).long().to(args.device)
            completed_hypotheses = model.beam_search(mb_x, mb_x_len,
                                                     bos, processor.cn_tokenizer.word2idx['EOS'],
                                                     topk=args.beam_size,
                                                     max_length=args.max_beam_search_length)
            top_hypotheses.append([processor.cn_tokenizer.id2word[id] for id in completed_hypotheses[0].value])

    bleu_score = corpus_bleu([[ref] for ref in cn_sents],
                             top_hypotheses)

    print('Corpus BLEU: {}'.format(bleu_score * 100))

    return bleu_score

In [56]:
parse = argparse.ArgumentParser()
#     default='./nmt/en-cn/'
parse.add_argument("--data_dir",default='/kaggle/input/encndata/data/',type=str,required=False,
    help="The input data dir. Should contain the .tsv files (or other data files) for the task.",)
parse.add_argument("--batch_size", default=16, type=int)
parse.add_argument("--do_train",default=True, action="store_true", help="Whether to run training.")
parse.add_argument("--do_test",default=True, action="store_true", help="Whether to run test.")
parse.add_argument("--do_translate",default=True, action="store_true", help="Whether to run training.")
parse.add_argument("--learnning_rate", default=5e-4, type=float)
parse.add_argument("--dropout", default=0.2, type=float)
parse.add_argument("--num_epoch", default=10, type=int)
parse.add_argument("--max_vocab_size",default=50000,type=int)
parse.add_argument("--embed_size",default=300,type=int)
parse.add_argument("--enc_hidden_size", default=512, type=int)
parse.add_argument("--dec_hidden_size", default=512, type=int)
parse.add_argument("--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps.")
parse.add_argument("--GRAD_CLIP", default=1, type=float)
parse.add_argument("--UNK_IDX",default=1,type=int)
parse.add_argument("--PAD_IDX", default=0, type=int)
parse.add_argument("--beam_size", default=5, type=int)
parse.add_argument("--max_beam_search_length", default=100, type=int)

parse.add_argument("--data_save",default='/kaggle/working/data/',type=str)

#     args = parse.parse_args()
args = parse.parse_args(args=[]) 
print(args)

Namespace(data_dir='/kaggle/input/encndata/data/', batch_size=16, do_train=True, do_test=True, do_translate=True, learnning_rate=0.0005, dropout=0.2, num_epoch=10, max_vocab_size=50000, embed_size=300, enc_hidden_size=512, dec_hidden_size=512, warmup_steps=0, GRAD_CLIP=1, UNK_IDX=1, PAD_IDX=0, beam_size=5, max_beam_search_length=100, data_save='/kaggle/working/data/')


In [57]:
# def main():
#     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device="cpu"
args.device = device
print(device)

setseed()

processor = DataProcessor(args)

encoder = Encoder(processor.en_tokenizer.vocab_size,args.embed_size,
                  args.enc_hidden_size,args.dec_hidden_size,args.dropout)
decoder = Decoder(processor.cn_tokenizer.vocab_size,args.embed_size,
                  args.enc_hidden_size,args.dec_hidden_size,args.dropout)
model = seq2seq(encoder,decoder)
if os.path.exists("translate-best.th"):
    model.load_state_dict(torch.load("translate-best.th"))
model.to(device)
loss_fn = LanguageModelCriterion().to(device)

train_data = processor.get_train_examples(args)
eval_data = processor.get_dev_examples(args)

cpu


In [58]:
#     if args.do_train:
# train(args,model,train_data,loss_fn,eval_data)

In [59]:
#     if args.do_test:
test(args,model,processor)

test bleu: 100%|██████████| 1817/1817 [02:40<00:00, 11.29it/s]


Corpus BLEU: 11.079267220946853


0.11079267220946853

In [60]:
#     if args.do_translate:
model.load_state_dict(torch.load("translate-best.th"))
model.to(device)
while True:
    title = input("请输入要翻译的英文句子:\n")
    if len(title.strip()) == 0:
        continue
    title = ['BOS'] + nltk.word_tokenize(title.lower()) + ['EOS']
    title_num = [processor.en_tokenizer.word2idx.get(word,1) for word in title]
    mb_x = torch.from_numpy(np.array(title_num).reshape(1,-1)).long().to(device)
    mb_x_len = torch.from_numpy(np.array([len(title_num)])).long().to(device)

    bos = torch.Tensor([[processor.cn_tokenizer.word2idx['BOS']]]).long().to(device)

    completed_hypotheses = model.beam_search(mb_x, mb_x_len,
                                             bos,processor.cn_tokenizer.word2idx['EOS'],
                                             topk=args.beam_size,
                                             max_length=args.max_beam_search_length)

    for hypothes in completed_hypotheses:
        result = "".join([processor.cn_tokenizer.id2word[id] for id in hypothes.value])
        score = hypothes.score
        print("翻译后的中文结果为:{},score:{}".format(result,score))

请输入要翻译的英文句子:
 i love you china


翻译后的中文结果为:我愛你。,score:-2.6588497161865234
翻译后的中文结果为:我愛你了。,score:-3.492776870727539
翻译后的中文结果为:我想你愛。,score:-5.427981376647949
翻译后的中文结果为:我对你愛。,score:-5.788558483123779
翻译后的中文结果为:我对你愛了。,score:-6.098419189453125


请输入要翻译的英文句子:
 long long ago there lived in hangzhou a girl named aqiao


翻译后的中文结果为:那本书在那裡有一個大超市。,score:-12.923996925354004
翻译后的中文结果为:那本书在那裡有一個大問題。,score:-13.353044509887695
翻译后的中文结果为:那本书在那裡有一個有趣的書。,score:-14.243057250976562
翻译后的中文结果为:那本书在那裡有一本關於鳥類的書。,score:-16.189922332763672
翻译后的中文结果为:那本书在那裡有一本關於鳥類的書在这里。,score:-18.45274543762207


请输入要翻译的英文句子:
 i love you more than i can say


翻译后的中文结果为:我可以跟你说法语。,score:-6.410440921783447
翻译后的中文结果为:我比你想说得好嗎？,score:-7.209282398223877
翻译后的中文结果为:我可以跟你说得好嗎？,score:-7.415818691253662
翻译后的中文结果为:我比你想说得好吗？,score:-7.553323745727539
翻译后的中文结果为:我比你想知道我的好嗎？,score:-8.470877647399902


请输入要翻译的英文句子:
 
请输入要翻译的英文句子:
 hello everyone


翻译后的中文结果为:停火。,score:-3.715897560119629
翻译后的中文结果为:鬼魂。,score:-4.276006698608398
翻译后的中文结果为:停火是谁。,score:-5.814101696014404
翻译后的中文结果为:停火是谁的。,score:-5.817572593688965
翻译后的中文结果为:鬼魂是谁。,score:-5.86874532699585


KeyboardInterrupt: Interrupted by user

In [None]:
# code ,github.