# seq2seq translation batched

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import time
import math

import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F

### utils

In [3]:
def get_variable(tensor):
    '''Wrap a tensor in a Variable, moving it to the GPU when enabled.

    Centralises the CPU/GPU decision so callers never test USE_CUDA
    themselves.

    Args:
        tensor: the tensor to wrap.
    Returns:
        The Variable; `.cuda()` is applied when the module-level
        USE_CUDA flag is truthy at call time.
    '''
    wrapped = Variable(tensor)
    return wrapped.cuda() if USE_CUDA else wrapped

### 配置

In [4]:
# Whether to run on the GPU; get_variable() reads this flag on every call.
USE_CUDA = False

### 语言辅助类

In [5]:
# Reserved vocabulary ids; Lang.index2word maps them to "PAD", "SOS", "EOS".
PAD_token = 0  # filler id appended by pad_seq()
SOS_token = 1  # first decoder input built in train()
EOS_token = 2  # appended to every sentence by indexes_from_sentence()

class Lang(object):
    '''Vocabulary helper for one language.

    Keeps word -> index, index -> word and word -> occurrence count maps.
    Indices 0, 1 and 2 are reserved for "PAD", "SOS" and "EOS".
    '''

    def __init__(self, name):
        self.name = name
        self.init_params()

    def init_params(self, trimmed = False):
        '''Reset the vocabulary so it holds only the three reserved tokens.

        Args:
            trimmed: stored as self.trimmed; trim() passes True so that a
                second trim() call becomes a no-op.
        '''
        self.trimmed = trimmed
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0:"PAD", 1:"SOS", 2:"EOS"}
        self.n_words = 3

    def index_word(self, word):
        '''Register one occurrence of word, assigning the next id if new.'''
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

    def index_sentence(self, sentence, split_str=' '):
        '''Register every word of a sentence.

        Args:
            sentence: string whose words are separated by split_str.
            split_str: word separator, a single space by default.
        '''
        self.index_words(sentence.split(split_str))

    def index_words(self, words):
        '''Register every word of an iterable.

        Args:
            words: iterable of words.
        '''
        for word in words:
            self.index_word(word)

    def trim(self, min_count):
        '''Drop words seen fewer than min_count times.

        The vocabulary is rebuilt from the surviving words, so their ids are
        reassigned and their counts restart at 1. Does nothing once the
        vocabulary has already been trimmed.

        Args:
            min_count: minimum number of occurrences a word needs to stay.
        '''
        if self.trimmed:
            return
        keep_words = [word for word, count in self.word2count.items()
                      if count >= min_count]
        # n_words includes the three reserved tokens
        total = self.n_words
        print ("keep words: %s / %s = %.4f" % (len(keep_words), total,
              len(keep_words) / total))

        # start over and re-add only the survivors
        self.init_params(True)
        self.index_words(keep_words)

### 转码和规整化字符串

In [6]:
def unicode_to_ascii(s):
    '''Strip combining accent marks from a unicode string.

    The string is NFD-decomposed and every character of category 'Mn' is
    dropped. Characters that are not combining marks are kept as they are,
    including non-ASCII ones.
    '''
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)


def normalize_str(s):
    '''Lower-case s and reduce it to letters and . ! ? separated by spaces.

    Steps, in order: lower-case, strip and remove accents; put a space in
    front of each . ! ?; replace every run of other characters with one
    space; collapse whitespace and strip the ends.
    '''
    cleaned = unicode_to_ascii(s.lower().strip())
    substitutions = (
        (r'([.!?])', r' \1'),       # space before . ! ?
        (r'[^a-zA-Z.!?]+', r' '),   # anything else becomes a space
        (r'\s+', r" "),             # collapse repeated whitespace
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

def test_normalize_str():
    '''Print a mixed sample string before and after normalize_str.'''
    raw = 'Hello你好 %!。 .?!'
    # the tuple is built first, so normalize_str runs before anything prints
    for label, text in (('raw:', raw), ('now:', normalize_str(raw))):
        print (label, text)
test_normalize_str()

raw: Hello你好 %!。 .?!
now: hello ! . ? !


### 读取数据

In [7]:
def read_lines(filename):
    '''Read filename as UTF-8 and return its lines with accents stripped.

    Args:
        filename: path of the text file.
    Returns:
        List of lines, each passed through unicode_to_ascii.
    '''
    # `open` is io.open (imported at the top of the file), so encoding=
    # also works on Python 2; `with` closes the handle once the file is read.
    with open(filename, encoding = 'utf-8') as f:
        lines = f.read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]

def read_langs(filename, input_name = 'en', target_name ='french', reverse=False):
    '''Read tab-separated sentence pairs and create the two Lang objects.

    Restored from commented-out code: prepare_data() calls this function,
    so leaving it commented out makes that call fail with NameError.

    Args:
        filename: path of the pair file; each line is "left<TAB>right".
        input_name: name of the language in the left column.
        target_name: name of the language in the right column.
        reverse: when True, swap each pair and the two language names.
    Returns:
        input_lang: Lang for the input side; only its name is set.
        target_lang: Lang for the target side; only its name is set.
        pairs: [[i1, o1], [i2, o2], ...] of normalized strings.
    Raises:
        ValueError: if a line does not hold exactly two tab-separated fields.
    '''
    pairs = []
    for line in read_lines(filename):
        left, right = line.split('\t')
        pairs.append([normalize_str(left), normalize_str(right)])
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(target_name)
        target_lang = Lang(input_name)
    else:
        input_lang = Lang(input_name)
        target_lang = Lang(target_name)
    return input_lang, target_lang, pairs


# def test_read_langs():
#     '''看下有几个pairs'''
#     filename = 'trans_data/en-french.txt'
#     input_lang, target_lang, pairs = read_langs(filename)
#     print (len(pairs))

# # test_read_langs()

### 筛选数据 pairs

In [125]:
# Inclusive bounds on len(sentence) enforced by legal().
MIN_LENGTH = 3
MAX_LENGTH = 25

def legal(sentence):
    '''Return True when len(sentence) is within [MIN_LENGTH, MAX_LENGTH].

    len() is applied to the argument as given, so for the sentence strings
    passed by filter_pairs this counts characters, not words.
    '''
    return MIN_LENGTH <= len(sentence) <= MAX_LENGTH


def filter_pairs(pairs):
    '''Keep the pairs whose two sentences both pass legal().

    Args:
        pairs: iterable of [input_sentence, target_sentence].
    Returns:
        New list holding the accepted pairs in their original order.
    '''
    return [pair for pair in pairs if legal(pair[0]) and legal(pair[1])]

### 准备数据pairs

In [126]:
def prepare_data(filename, src_name='english', dst_name='french', reverse = False):
    '''Load the pair file, filter it by length and build both vocabularies.

    Args:
        filename: path of the sentence-pair file.
        src_name: name of the language in the left column.
        dst_name: name of the language in the right column.
        reverse: False keeps (src, dst) order; True swaps to (dst, src).
    Returns:
        input_lang: Lang of the input side (src_name, or dst_name if reversed).
        target_lang: Lang of the target side (dst_name, or src_name if reversed).
        pairs: [[i1, o1], [i2, o2], ...] of strings that passed filter_pairs.
    '''
    input_lang, target_lang, all_pairs = read_langs(filename, src_name, dst_name, reverse)
    pairs = filter_pairs(all_pairs)
    print ('read: %s, remain:%s' % (len(all_pairs), len(pairs)))

    # only sentences that survived the length filter enter the vocabularies
    for pair in pairs:
        input_lang.index_sentence(pair[0])
        target_lang.index_sentence(pair[1])

    summary = (input_lang.name, input_lang.n_words, target_lang.name, target_lang.n_words)
    print ('%s:%s, %s:%s' % summary)
    return input_lang, target_lang, pairs

def test_read_langs():
    '''Smoke test: load the reversed eng/fra data and print one random pair.'''
    data_path = 'trans_data/en-french.txt'
    _, _, loaded_pairs = prepare_data(data_path, 'eng', 'fra', True)
    sample = random.choice(loaded_pairs)
    print (sample)

# test_read_langs()

read: 142787, remain:27065
fra:7205, eng:4447
[u'est ce ta chambre ?', u'is this your room ?']


### 读取数据

In [127]:
# Load the corpus used by the rest of the notebook. reverse=True swaps each
# pair and the two language names, which is why the recorded output lists
# "fra" as the input language and "eng" as the target.
filename = 'trans_data/en-french.txt'
input_lang, target_lang, pairs = prepare_data(filename, 'eng', 'fra', True)
# print one random pair as a sanity check
print (random.choice(pairs))

read: 142787, remain:27065
fra:7205, eng:4447
[u'est ce ton parapluie ?', u'is this your umbrella ?']


### 删除出现次数太少的词语

便于早点训练完

In [128]:
# Shrink both vocabularies: Lang.trim keeps only words whose count is at
# least MIN_COUNT and prints how many were kept.
MIN_COUNT = 5
input_lang.trim(MIN_COUNT)
target_lang.trim(MIN_COUNT)

keep words: 1775 / 7205 = 0.2464
keep words: 1571 / 4447 = 0.3533


### 删除包含unknown单词的句子

In [129]:
# Keep only the pairs whose every word is still present in the (trimmed)
# vocabulary of its own language.
keep_pairs = [
    p for p in pairs
    if all(word in input_lang.word2index for word in p[0].split(' '))
    and all(word in target_lang.word2index for word in p[1].split(' '))
]

raw_total, kept_total = len(pairs), len(keep_pairs)
info = 'Pairs raw:%s, now:%s, %.4f remain' % (raw_total, kept_total, kept_total / raw_total)
print (info)
# keep the unfiltered list reachable, then switch `pairs` to the filtered one
old_pairs, pairs = pairs, keep_pairs
print (len(old_pairs), len(pairs))

Pairs raw:27065, now:16947, 0.6262 remain
27065 16947


### 把数据转换成Tensor和Variable

In [130]:
'''
为了更好使用GPU，可以一次放多条数据去训练，即一个batch。
但是每个句子的长度却是不一样的，比如[2, 3, 4]和[3, 5, 6, 9, 7]。
所以需要填充短句子，使每一个batch中所有句子长度一样。计算loss的时候，忽略这些PAD_token
有2种解决方案。
1. 固定句子的长度，设置MAX_LENGTH，全部句子填充到一样长。
2. 短批，长批。短和短为一个batch，长和长为1个batch。
这里采用方案1
'''

def indexes_from_sentence(lang, sentence):
    '''Return the word ids of a space-separated sentence, ending with EOS_token.

    Args:
        lang: Lang object whose word2index holds every word of the sentence.
        sentence: string of words separated by single spaces.
    Raises:
        KeyError: if a word is missing from lang.word2index.
    '''
    ids = [lang.word2index[token] for token in sentence.split(' ')]
    ids.append(EOS_token)
    return ids

def pad_seq(seq, max_length):
    '''Return a copy of seq right-padded with PAD_token up to max_length.

    The argument is left untouched. The earlier `seq += ...` form extended
    the caller's list in place, so the caller's data silently changed length.

    Args:
        seq: sentence as a sequence of word ids.
        max_length: length to pad to; a seq already this long or longer is
            returned unpadded (never truncated).
    Returns:
        A new list: the ids of seq followed by the padding.
    '''
    # a negative repeat count yields an empty list, so no explicit clamp
    padding = [PAD_token] * (max_length - len(seq))
    return list(seq) + padding

### random batch

In [131]:
def random_batch(batch_size, pairs, input_lang, target_lang):
    '''Draw a random batch (with replacement) and return padded id tensors.

    The batch is ordered by input length, longest first, and each side is
    padded to the longest sequence on that side.

    Args:
        batch_size: number of pairs to draw.
        pairs: sentence pairs to draw from.
        input_lang, target_lang: Lang helpers of the two languages.
    Returns:
        input_var: [s, b], s = padded sentence length, b = number of sentences.
        input_lengths: unpadded input lengths (EOS included), [b].
        target_var: [s, b]
        target_lengths: unpadded target lengths (EOS included), [b].
    '''
    chosen = [random.choice(pairs) for _ in range(batch_size)]
    encoded = [
        (indexes_from_sentence(input_lang, p[0]),
         indexes_from_sentence(target_lang, p[1]))
        for p in chosen
    ]

    # sort both sides together on input length, then split them again
    encoded.sort(key = lambda pair: len(pair[0]), reverse=True)
    input_seqs, target_seqs = zip(*encoded)

    # record the true lengths before padding, [b, maxlen] after padding
    input_lengths = [len(seq) for seq in input_seqs]
    target_lengths = [len(seq) for seq in target_seqs]
    longest_input = max(input_lengths)
    longest_target = max(target_lengths)
    input_padded = [pad_seq(seq, longest_input) for seq in input_seqs]
    target_padded = [pad_seq(seq, longest_target) for seq in target_seqs]

    # LongTensor [b, maxlen] -> (seq_len, batch_size)
    input_var = get_variable(torch.LongTensor(input_padded)).transpose(0, 1)
    target_var = get_variable(torch.LongTensor(target_padded)).transpose(0, 1)
    return input_var, input_lengths, target_var, target_lengths

def test_random_batch(pairs, input_lang, target_lang):
    '''Draw a batch of two pairs and print tensor sizes and true lengths.'''
    batch = random_batch(2, pairs, input_lang, target_lang)
    input_var, in_lens, target_var, t_lens = batch
    report = (('input:', input_var, in_lens), ('target:', target_var, t_lens))
    for label, var, lens in report:
        print (label, var.size(), lens)

test_random_batch(pairs, input_lang, target_lang)

input: torch.Size([5, 2]) [5, 5]
target: torch.Size([6, 2]) [6, 5]


# 网络模型

### Encoder

In [132]:
class EncoderRNN(nn.Module):
    '''Sentence encoder: word ids -> embedding -> GRU -> per-step outputs.

    [s, b] -> [s, b, h], i.e. [sentence length, number of sentences] ->
    [sentence length, number of sentences, hidden size].
    '''
    def __init__(self, vocab_size, hidden_size, n_layers=1, dropout_p=0.1, bidir=False):
        super(EncoderRNN, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.bidir = bidir
        # the embedding width equals the GRU hidden size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers,
                          dropout=dropout_p, bidirectional=bidir)

    def forward(self, input_seqs, input_lengths, hidden=None):
        '''Encode a padded batch of sentences with the GRU.

        1. input_seqs > embedded
        2. embedded - packed > GRU > outputs - unpacked/padded > outputs
        Args:
            input_seqs: [s, b] word ids.
            input_lengths: list[int], true length of each sentence, as
                required by pack_padded_sequence.
            hidden: optional initial GRU state.
        Returns:
            outputs: [s, b, h]
            hidden: [n_layer*n_dir, b, h]
        '''
        packed_in = nn.utils.rnn.pack_padded_sequence(
            self.embedding(input_seqs), input_lengths)
        packed_out, hidden = self.gru(packed_in, hidden)
        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_out)
        if self.bidir is True:
            # bidirectional: add the two direction halves so the width stays h
            first_half = outputs[:, :, :self.hidden_size]
            second_half = outputs[:, :, self.hidden_size:]
            outputs = first_half + second_half
        return outputs, hidden

### 测试Encoder

In [134]:
# Smoke test for EncoderRNN: push one tiny random batch through a small
# two-layer, unidirectional encoder and print the resulting sizes.
small_batch_size = 2
input_batches, input_lengths, target_batches, target_lengths \
    = random_batch(small_batch_size, pairs, input_lang, target_lang)
print ('input:', input_batches.size(), input_lengths)
print ('target:', target_batches.size(), target_lengths)

small_hidden_size = 8
small_n_layers = 2
encoder_test = EncoderRNN(input_lang.n_words, small_hidden_size, small_n_layers, bidir=False)
print (encoder_test)
# no initial hidden state is passed, so forward() uses its default of None
encoder_outputs, encoder_hidden = encoder_test(input_batches, input_lengths)
print ('outputs:', encoder_outputs.size(), 'hidden:', encoder_hidden.size())


input: torch.Size([6, 2]) [6, 4]
target: torch.Size([6, 2]) [6, 5]
EncoderRNN (
  (embedding): Embedding(1778, 8)
  (gru): GRU(8, 8, num_layers=2, dropout=0.1)
)
outputs: torch.Size([6, 2, 8]) hidden: torch.Size([2, 2, 8])


### Attn

In [None]:
class Attn(nn.Module):
    '''Attention weights of one decoder step over all encoder outputs.

    Implements the 'dot', 'general' and 'concat' score functions for a whole
    batch at once.
    '''
    def __init__(self, score_type, hidden_size):
        '''
        Args:
            score_type: how the score is computed: 'dot', 'general' or 'concat'.
            hidden_size: hidden size shared by the encoder and the decoder.
        Raises:
            ValueError: if score_type is not one of the three known names.
        '''
        super(Attn, self).__init__()
        if score_type not in ('dot', 'general', 'concat'):
            raise ValueError("unknown score_type %r, expected 'dot', 'general' or 'concat'"
                             % (score_type,))
        self.score_type = score_type
        self.hidden_size = hidden_size
        if score_type == 'general':
            self.attn = nn.Linear(hidden_size, hidden_size)
        elif score_type == 'concat':
            self.attn = nn.Linear(hidden_size * 2, hidden_size)
            # torch.FloatTensor(1, h) alone is uninitialised memory and may
            # hold NaN/inf, so fill it with small random values explicitly.
            bound = 1.0 / math.sqrt(hidden_size)
            self.v = nn.Parameter(torch.FloatTensor(1, hidden_size).uniform_(-bound, bound))

    def score(self, hidden, encoder_output):
        ''' Score decoder state(s) against encoder state(s).

        Works on any pair of equally shaped tensors whose last dimension is
        the hidden size; that last dimension is reduced away.
        Args:
            hidden: decoder state(s), [..., h]
            encoder_output: encoder state(s), [..., h]
        Returns:
            energy: unnormalised scores, shape [...]
        '''
        if self.score_type == 'dot':
            return (hidden * encoder_output).sum(-1)
        if self.score_type == 'general':
            return (hidden * self.attn(encoder_output)).sum(-1)
        # 'concat': v^T tanh(W [h_t; h_s]). Without the tanh the score is
        # linear in h_t and h_s separately, so the h_t term is the same for
        # every source position and cancels in the softmax.
        joined = torch.cat((hidden, encoder_output), -1)
        energy = torch.tanh(self.attn(joined))
        return (self.v.view(-1) * energy).sum(-1)

    def forward(self, rnn_output, encoder_outputs):
        ''' Compute the alignment vector for decoding step t.

        Args:
            rnn_output: decoder GRU output of this step, [1, b, h]
            encoder_outputs: encoder outputs, [s_i, b, h]
        Returns:
            align_vec: [b, 1, s_i]; each row sums to 1 over s_i. The shape
                lets the caller do align_vec.bmm(encoder_outputs.transpose(0, 1)).
        '''
        # NOTE: sentence lengths are not passed in, so padded encoder
        # positions take part in the softmax like any other position.
        # repeat the single decoder step along the source axis: [s_i, b, h]
        query = rnn_output.expand_as(encoder_outputs)
        # [s_i, b] -> [b, s_i]
        attn_energies = self.score(query, encoder_outputs).transpose(0, 1)
        # normalise over the source positions
        align_vec = F.softmax(attn_energies, dim=1)
        return align_vec.unsqueeze(1)

### AttnDecoder

In [None]:
class AttnDecoderRNN(nn.Module):
    '''Single-step GRU decoder with optional attention over encoder outputs.'''
    def __init__(self, hidden_size, output_size, score_method='general', n_layers=1, dropout_p=0.1):
        '''
        Args:
            hidden_size: hidden size shared with the encoder.
            output_size: size of the target vocabulary.
            score_method: attention score, 'dot', 'general' or 'concat';
                'none' disables attention.
            n_layers: number of GRU layers.
            dropout_p: dropout used on the embedding and between GRU layers.
        '''
        super(AttnDecoderRNN, self).__init__()
        self.score_method = score_method
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p

        self.embedding = nn.Embedding(output_size, hidden_size)
        self.embedding_dropout = nn.Dropout(dropout_p)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, dropout=dropout_p)
        # [rnn_output; context] -> h (was misspelled `contcat`)
        self.concat = nn.Linear(hidden_size * 2, hidden_size)
        # consumes the h-sized result of self.concat, hence h inputs, not 2h
        self.out = nn.Linear(hidden_size, output_size)

        # choose the attention; None means "no attention"
        if score_method != 'none':
            self.attn = Attn(score_method, hidden_size)
        else:
            self.attn = None

    def forward(self, input_seq, last_hidden, encoder_outputs):
        '''Run one decoding step for the whole batch.

        1. input > embedded
        2. embedded, last_hidden -GRU- rnn_output, hidden
        3. rnn_output, encoder_outputs -Attn- attn_weights
        4. attn_weights, encoder_outputs -bmm- context
        5. rnn_output, context -linear, tanh, linear- output
        Args:
            input_seq: [b] ids of the previously produced words
            last_hidden: [n_layers, b, h]
            encoder_outputs: [s_i, b, h]
        Returns:
            output: [b, output_size] unnormalised scores over the vocabulary
            hidden: [n_layers, b, h] new GRU state
            attn_weights: [b, 1, s_i], or None when attention is disabled
        '''
        batch_size = input_seq.size()[0]
        embedded = self.embedding(input_seq)
        embedded = self.embedding_dropout(embedded)
        embedded = embedded.view(1, batch_size, self.hidden_size)

        # (1, b, h), (n_l, b, h)
        rnn_output, hidden = self.gru(embedded, last_hidden)

        if self.attn is None:
            # no attention: an all-zero context keeps the layer shapes intact
            attn_weights = None
            context = torch.zeros_like(rnn_output.squeeze(0))
        else:
            # (b, 1, s_i) x (b, s_i, h) -> (b, 1, h) -> (b, h)
            attn_weights = self.attn(rnn_output, encoder_outputs)
            context = attn_weights.bmm(encoder_outputs.transpose(0, 1)).squeeze(1)

        # (b, 2h) -> (b, h) -> (b, output_size)
        concat_input = torch.cat((rnn_output.squeeze(0), context), 1)
        concat_output = torch.tanh(self.concat(concat_input))
        output = self.out(concat_output)
        return output, hidden, attn_weights

    def init_outputs(self, seq_len, batch_size):
        '''Return a zero [seq_len, batch_size, output_size] Variable for collecting step outputs.'''
        outputs = torch.zeros(seq_len, batch_size, self.output_size)
        return get_variable(outputs)
    

# 训练

In [None]:
def train(input_batches, input_lengths, target_batches, target_lengths, 
         encoder, decoder, encoder_optimizer, decoder_optimizer,
         loss_func, batch_size, max_length = MAX_LENGTH):
    '''Set up one training step on a batch (unfinished).

    Only the setup is implemented: gradients are cleared, the batch is
    encoded and the first decoder input/state are built. The decoding loop,
    loss computation, backward pass and optimizer steps are still missing,
    so the function currently returns None.

    Args:
        input_batches: [s, b] input word ids.
        input_lengths: true input lengths, [b].
        target_batches: [s, b] target word ids.
        target_lengths: true target lengths, [b].
        encoder, decoder: the two networks.
        encoder_optimizer, decoder_optimizer: their optimizers.
        loss_func: loss function (not used yet).
        batch_size: number of sentences in the batch.
        max_length: not used yet.
    '''
    # zero grad
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0

    # batch_size = input_batches.size()[1]
    encoder_outputs, encoder_hidden = encoder(input_batches, input_lengths, None)
    # every sentence starts decoding from SOS
    decoder_input = get_variable(torch.LongTensor([SOS_token] * batch_size))

    tar_seq_len = max(target_lengths)
    # Start the decoder from the first decoder.n_layers slices of the encoder
    # state. This was written as a call, `encoder_hidden(:...)`, which is a
    # SyntaxError; slicing is what is meant.
    # NOTE(review): with a bidirectional encoder the first axis holds
    # n_layers * 2 entries, so this slice is not simply "the first layers"
    # - confirm before enabling bidir.
    decoder_hidden = encoder_hidden[:decoder.n_layers]
    decoder_outputs = decoder.init_outputs(tar_seq_len, batch_size)

    # TODO: run the decoder step by step, accumulate the loss with loss_func,
    # call backward() and step both optimizers.
    
    

In [136]:
# Scratch cell: quick checks of dict membership, str.split, zip and list
# repetition as used by the code above.
d = {"a":"aa"}
# `in` on a dict tests the keys
print ("b" in d)
s = 'a1b1c1d'
print (s.split('1'))

# zip
a = ['a', 'b', 'c']
b = [1, 2, 3]
# pair the elements up
c = zip(a, b)
# the recorded output shows a list here (Python 2); on Python 3 zip returns
# an iterator, so this prints a zip object instead
print (c)
# unzip back into the two sequences
print (zip(*c))
print (['a'] * 3)

False
[u'a', u'b', u'c', u'd']
[(u'a', 1), (u'b', 2), (u'c', 3)]
[(u'a', u'b', u'c'), (1, 2, 3)]
[u'a', u'a', u'a']


In [138]:
# Scratch cell: squeeze() inside a function only rebinds the local name;
# the recorded output shows the caller's `a` keeps its [1, 1, 2] size.
a = get_variable(torch.randn(1, 1, 2))
def test(a):
    # rebinding the parameter leaves the caller's variable untouched
    a = a.squeeze()
    print (a.size())
test(a)
print (a.size())
    

torch.Size([2])
torch.Size([1, 1, 2])
