**神经机器翻译**
一种简单方法：每种语言，只用几千个单词，并且使用one-hot编码

In [2]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import string
import re
import random
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

# 数据处理
**配置信息**

In [17]:
use_cuda = torch.cuda.is_available()
SOS_token = 0
EOS_token = 1
data_dir = 'trans_data'

**语言类**

In [18]:
class Lang(object):
    '''语言的类'''
    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: 'SOS', 1: "EOS"}
        self.n_words = 2
    
    def add_sentence(self, sentence):
        '''添加一句话
        Args:
            sentence: 话，字符串，经过处理后的
        '''
        for word in sentence.split( ):
            self.add_word(word)
    
    def add_word(self, word):
        '''添加一个词汇'''
        if word not in self.word2index:
            idx = self.n_words
            self.word2index[word] = idx
            self.word2count[word] = 1
            self.index2word[idx] = word
            self.n_words += 1
        else:
            self.word2count[word] += 1

**转码和字符串归整化函数**

In [19]:
def unicode_to_ascii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

def normalize_str(s):
    '''小写化，去首尾空格，去掉特殊字符，给标点符号加上空格'''
    s = unicode_to_ascii(s.lower().strip())
    # 给.!? 前面加上空格
    s = re.sub(r"([.!?])", r" \1", s)
    # 把非字母和.!?的字符串用空格代替
    s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
    return s

def test():
    s = 'Hello %! .?!'
    sn = normalize_str(s)
    print (s)
    print (sn)
test()

Hello %! .?!
hello ! . ? !


**读取数据**

In [20]:
def read_lines(filename):
    '''读取filename中的内容，一行一行，转换为ascii码'''
    # io.open
    lines = open(filename, encoding = 'utf-8').read().strip().split('\n')
    return [unicode_to_ascii(line) for line in lines]

def read_langs(filename, src_name = 'en', dst_name ='french', reverse=False):
    ''' 读取
    Args:
        filename: 文件的路径
        src_name: 源语言名称
        dst_name: 目标语言名称
        reverse: 是否翻转
    Returns:
        src_lang: 输入语言的对象，只初始化了名字
        dst_lang: 输出语言的对象，只初始化了名字
        pairs: [[in_str1, out_str1], [i2, o2], [i3, o3], ...]
    '''
    lines = read_lines(filename)
    # 每一行以'\t'分隔两种语言
    pairs = []
    for i, line in enumerate(lines):
        l, r = line.split('\t')
        l, r = normalize_str(l), normalize_str(r)
        pairs.append([l, r])
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        src_lang = Lang(dst_name)
        dst_lang = Lang(src_name)
    else:
        src_lang = Lang(src_name)
        dst_lang = Lang(dst_name)
    return src_lang, dst_lang, pairs
filename = 'trans_data/en-french.txt'
src_lang, dst_lang, pairs = read_langs(filename)
print (len(pairs))

142787


In [24]:
# 句子长度
MAX_LENGTH = 10
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def is_simple(pair, en_idx):
    ''' 选择简单的句子
    长度小于MAX_LENGTH。格式，英语满足上面的前缀格式
    Args:
        pair: [en_sentence, otherlang_sentence]
    '''
    src, dst = pair[0].split(' '), pair[1].split(' ')
    return (len(src) < MAX_LENGTH and len(dst) < MAX_LENGTH 
            and pair[en_idx].startswith(eng_prefixes))


def filter_pairs(pairs, en_idx):
    return [p for p in pairs if is_simple(p, en_idx)]
    

def prepare_data(filename, src_name='english', dst_name='french', reverse = False):
    ''' 准备数据
    Args:
        filename: 数据对的文件
        src_name: 数据左边的语言
        dst_name: 数据右边的语言
        reverse: 默认(src-dst), 翻转则(dst-src)
    Returns:
        src_lang: 源语言 lang对象，name= src_name 或 dst_name (反转) 
        dst_lang: 目标语言 lang对象，name= dst_name 或 src_name (反转) 
        pairs: [[i1, o1], [i2, o2], [i3, o3], ...]，都是字符串格式
    '''
    src_lang, dst_lang, pairs = read_langs(filename, src_name, dst_name, reverse)
    en_idx = 0 if reverse is not True else 1
    print ("read %s lines" % len(pairs))
    pairs = filter_pairs(pairs, en_idx)
    print ("remain %s lines" % len(pairs))
    for p in pairs:
        src_lang.add_sentence(p[0])
        dst_lang.add_sentence(p[1])
    print (src_lang.name, src_lang.n_words)
    print (dst_lang.name, dst_lang.n_words)
    return src_lang, dst_lang, pairs

src_lang, dst_lang, pairs = prepare_data(filename, 'eng', 'fra', True)
print (random.choice(pairs))

read 142787 lines
remain 11182 lines
fra 4559
eng 2967
[u'tu es grande .', u'you re big .']


# 网络结构
**Encoder网络**

In [21]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1):
        ''' init
        embed(num=input_size, embed_dim=hidden_size)
        GRU(input_size=hidden_size, hidden_size=hidden_size)
        Args:
            input_size: 总的词汇的id的数量，用于embedding的第一个参数
            hidden_size: embedding_dim, GRU的input_size和hidden_size
            n_layers: GRU的层数
        '''
        super(EncoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.word_embeddings = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
    
    def forward(self, input, hidden):
        '''forward
        Args:
            input: 单词id
            hidden: 上一时刻的状态
        '''
        # (seq_len, batch, input_size) = (1, 1, hidden_size)
        embedded = self.word_embeddings(input).view(1, 1, -1)
        output = embedded
        for i in range(self.n_layers):
            output, hidden = self.gru(output, hidden)
        return output, hidden
    
    def init_hidden(self):
        hidden = Variable(torch.zeros(1, 1, self.hidden_size))
        hidden = hidden if use_cuda is not True else hidden.cuda()
        return hidden

**DecoderRNN**

In [22]:
class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1):
        '''初始化
        Embedding(embed_num=output_size, embed_dim=hidden_size)
        GRU(input_size=output_size, hidden_size=hidden_size)
        Args:
            hidden_size: embed的embed_dim, GRU的input_size和hidden_size, Linear的input_size
            output_size: embed的embed_num, Linear的output_size
            n_layers: GRU的层数
        '''
        super(DecoderRNN, self).__init__()
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.word_embeddings = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax()
    
    def forward(self, input):
        ''' GRU+softmax
        Args:
            input: 单词id
        Returns:
            output: GRU的output再softmax，[1, hidden_size]
            hidden: GRU的hidden (n_layer*n_dir, batch, hidden_size)
        '''
        output = self.word_embeddings(input).view(1, 1, -1)
        for i in range(self.n_layers):
            output = F.relu(output)
            output, hidden = self.gru(output, hidden)
        # output(seq_len, batch, hidden_size)=(1,1,hidden_size), output[0]=(1, hidden_size)
        output = self.softmax(self.out(output[0]))
        return output, hidden
    
    def init_hidden(self):
        hidden = Variable(torch.zeros(1, 1, self.hidden_size))
        hidden = hidden if use_cuda is not True else hidden.cuda()
        return hidden

**Attention**

In [25]:
class AttnDecoderRNN(nn.Module):
    
    def __init__(self, hidden_size, output_size,
                n_layers = 1, dropout_p = 0.1, max_length = MAX_LENGTH):
        '''
        Args:
            hidden_size: embed_dim, GRU input_size, hidden_size
            output_size: num_embed, Lienar: output_size
            n_layers: GRU的层数
            dropout_p: word_embedd时丢弃的概率
            max_length: 句子长度
        '''
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        self.max_length = max_length
        
        self.word_embeddings = nn.Embedding(self.output_size, self.hidden_size)
        # why *2 (2h, max_len)
        self.attn = nn.Linear(self.hidden_size * 2, self.max_length)
        # (2h, h)
        self.attn_combile = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dropout_p)
        #(h, h)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        # (h, o)
        self.out = nn.Linear(self.hidden_size, self.output_size)
    
    def forward(self, input, hidden, encoder_output, encoder_outputs):
        '''
        Args:
            input:
            hidden: [1, 1, h_size]
            encoder_output:
            encoder_outputs: (max_len, encoder.hidden_size)
        Returns:
            output: [1, out_size]
            hidden: [1, 1, h_size]
            attn_weights: [1, max_len]
        '''
        # (1, 1, h)
        input_embed = self.word_embeddings(input).view(1, 1, -1)
        input_embed = self.dropout(embedded)
        
        # (1, 2h)
        i_h = torch.cat((input_embed[0], hidden[0]), 1)
        # (1, max_len) input和hidden经过线性变化，得到给各个单词的注意力权值
        attn_weights = F.softmax(self.attn(i_h))
        
        # 相乘 (1, 1, max_len) (1, max_len, h) > (1, 1, h)
        attn_applied = torch.bmm(attn_weights.unseqeeze(0), encoder_outputs.unseqeeze(0))
        
        # (1, h) (1, h) > (1, 2h)
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        # (1, 2h) > (1, h) > (1, 1, h)
        output = self.attn_compile(output).unsqueeze(0)
        
        for i in range(self.n_layers):
            output = F.relu(output)
            output, hidden = self.gru(output, hidden)
        # (1, 1, h) > (1, h) > (1, o)
        output = self.log_softmax(self.out(output[0]))
        return output, hidden, attn_weights
    
    def init_hidden(self):
        hidden = Variable(torch.zeros(1, 1, self.hidden_size))
        return hidden

# 训练

**准备训练数据**

In [26]:
def indics_from_sentence(lang, sentence):
    ''' 获得句子的词汇的id列表
    Args:
        lang: sentence所在的语言对象
        sentence: 一句话
    Returns:
        [] 句子中单词的id列表
    '''
    return [lang.word2index[word] for word in sentence.split(' ')]


def variable_from_sentence(lang, sentence):
    ''' 获得一个句子的单词id组成的Variable
    Rertuns:
        indics_variable: 句子的单词的id列表，由Variable表示，(len, 1)
    '''
    indics = indics_from_sentence(lang, sentence)
    indics.append(EOS_token)
    indics_variable = Variable(torch.LongTensor(indics).view(-1, 1))
    return indics_variable


def variables_from_pair(pair):
    input_variable = variable_from_sentence(pair[0])
    target_variable = variable_from_sentence()


IndentationError: expected an indented block (<ipython-input-26-b2df0c1e06bc>, line 11)

# API总结
**正则表达式**[RE.Sub](https://www.crifan.com/python_re_sub_detailed_introduction/)

In [18]:
'''
1. 正则表达式
[] 里面是任意一个 [.!?] 任意一个
() 是一个group，如([.!?])
{} 是数量词，如{2,} 至少2个
'''

s = "hello 123 world 456"
# 把匹配到的 \d+ 用222来替换
s1 = re.sub(r'\d+', "222", s)
print (s1)

# \1 是和前面的()内匹配到的字符串完全一样
s1 = re.sub(r'hello (\w+), nihao \1', 'AAAA', "hello re, nihao reBBBB")
print (s1)
# 不相同，不能匹配到
print (re.sub(r'hello (\w+), nihao \1', 'AAAA', "hello re, nihao rEe"))
# 提取出组的信息
print (re.sub(r'hello (\w+), nihao \1', r'\g<1>', 'hello plm, nihao plm'))
print (re.sub(r'hello (\w+), nihao \1', r'\1', 'hello plm, nihao plm'))
res = re.search(r'hello (\w+), nihao \1', 'hello plmsss, nihao plmsss')
print (res.group(1))

print (re.sub(r'hello (?P<pname>\w+), nihao (?P=pname)', r'\g<pname>', 'hello plm, nihao plm'))
print (re.sub(r"([.!?])", r" \1", 'a.b'))
print (re.sub(r"([.!?])", r" \g<1>", 'a.b'))
# 一般用search或者match

hello 222 world 222
AAAABBBB
hello re, nihao rEe
plm
plm
plmsss
plm
a .b
a .b


In [25]:
'''2. reversed'''
p = ['hello', 'you']
print (reversed(p),list(reversed(p)))

reverse = False
en_idx = 0 if reverse is True else 1
print (en_idx)

<listreverseiterator object at 0x7f7485bf2490> [u'you', u'hello']
1


In [10]:
'''3. torch.cat 把多个tensor组合在一起。形状要统一一样
'''
x = torch.randn(2, 3)
# 按行排列 6*3
print (torch.cat((x, x, x), 0).size())
# 按列排 2*9
print (torch.cat((x, x, x), 1).size())
y = torch.randn(3, 3)

torch.Size([6, 3])
torch.Size([2, 9])


In [8]:
'''4. torch.bmm 矩阵相乘的Tensor
bach1 b*n*m, batch2: b*m*p, 结果：b*n*p
'''
batch1 = torch.randn(10, 3, 4)
batch2 = torch.randn(10, 4, 5)
res = torch.bmm(batch1, batch2)
print (res.size())

torch.Size([10, 3, 5])


In [15]:
'''5 torch.unsqueeze 解缩，增加维数.0在前面加维数，1在后面加维数'''
#[4]
x = torch.Tensor([1, 2, 3, 4])
print (x.size())

# [1, 4]
y1 = torch.unsqueeze(x, 0)
print (y1.size())

# [4, 1]
y2 = torch.unsqueeze(x, 1)
print (y2.size())

# [1, 1, 4]
print (y1.unsqueeze(0).size())
# [1, 4, 1]
print (y2.unsqueeze(0).size())

torch.Size([4])
torch.Size([1, 4])
torch.Size([4, 1])
torch.Size([1, 1, 4])
torch.Size([1, 4, 1])


In [16]:
# 6. softmax
print (F.softmax(torch.Tensor([1, 2, 1])))
print (F.log_softmax(torch.Tensor([1, 2, 1])))

Variable containing:
 0.2119
 0.5761
 0.2119
[torch.FloatTensor of size 3]

Variable containing:
-1.5514
-0.5514
-1.5514
[torch.FloatTensor of size 3]



In [29]:
# 7. view -1在前面，保留在前面，后面维数+1；-1在后面，保留在后面，前面维数+1
x = torch.Tensor([1, 2, 3])
print (x.view(-1, 1).size())
print (x.view(1, -1).size())

torch.Size([3, 1])
torch.Size([1, 3])
