In [1]:
import torch as t
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import jieba
from collections import Counter

In [2]:
# Root directory of the corpus. It must end with '/' because file names are
# appended to it by plain string concatenation.
# Alternative dataset location: '../data/cmn-eng/'
path = '../data/'

In [3]:
# Read the Chinese side of the corpus, one sentence per line.
# The encoding is pinned so decoding does not depend on the platform locale
# (assumes the corpus is stored as UTF-8 -- TODO confirm).
# The `with` block closes the file on exit; no explicit close() is needed.
with open(path + 'train/train.zh', encoding='utf-8') as f:
    line_zh = f.readlines()

In [4]:
# Read the English side of the corpus, one sentence per line.
# The encoding is pinned so decoding does not depend on the platform locale
# (assumes the corpus is stored as UTF-8 -- TODO confirm).
# The `with` block closes the file on exit; no explicit close() is needed.
with open(path + 'train/train.en', encoding='utf-8') as f:
    line_en = f.readlines()

In [5]:
def deal_en_sen(raw):
    """Normalise an English sentence to lower-case words joined by single spaces.

    Every character outside ``a-z`` / ``A-Z`` (digits, punctuation, newlines,
    apostrophes, ...) is replaced by a space, the text is lower-cased, and the
    remaining words are re-joined with exactly one space between them.

    Args:
        raw: The sentence as read from the corpus file.

    Returns:
        The cleaned sentence; an empty string if ``raw`` contains no letters.
    """
    # split() already discards leading/trailing whitespace, so no separate
    # strip() is required (the former `raw.strip()` discarded its result).
    letters_only = re.sub("[^a-zA-Z]", " ", raw)
    return " ".join(letters_only.lower().split())

def deal_zh_sen(raw):
    """Strip everything except CJK ideographs from a Chinese sentence.

    All characters outside the range U+4E00..U+9FA5 (whitespace, punctuation,
    Latin letters, digits, ...) are deleted; nothing is inserted in their place.

    Args:
        raw: The sentence as read from the corpus file.

    Returns:
        The sentence reduced to its ideographs; an empty string if there are none.
    """
    # The substitution removes whitespace too, so no strip() is required
    # (the former `raw.strip()` discarded its result).
    return re.sub("[^\u4e00-\u9fa5]", "", raw)

In [6]:
# Build the list of [english, chinese] sentence pairs from the aligned corpus
# lines, cleaning each side with its language-specific normaliser.
# zip() stops at the shorter input if the two files differ in length.
pairs = [[deal_en_sen(en), deal_zh_sen(zh)] for en, zh in zip(line_en, line_zh)]

In [7]:
# Preview the first three cleaned sentence pairs.
pairs[:3]

[['a pair of red crowned cranes have staked out their nesting territory',
  '一对丹顶鹤正监视着它们的筑巢领地'],
 ['a pair of crows had come to nest on our roof as if they had come for lhamo',
  '一对乌鸦飞到我们屋顶上的巢里它们好像专门为拉木而来的'],
 ['a couple of boys driving around in daddy s car', '一对乖乖仔开着老爸的车子']]

In [8]:
# Word-frequency tables for the English and Chinese sides (filled in below).
en_counts, zh_counts = Counter(), Counter()

In [9]:
# Tally token frequencies: English is split on single spaces, Chinese is
# segmented into words with jieba.
for en_sen, zh_sen in pairs:
    en_counts.update(str(en_sen).split(' '))
    zh_counts.update(jieba.cut(zh_sen))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.709 seconds.
Prefix dict has been built succesfully.


In [10]:
# The vocabularies are the distinct tokens seen while counting.
vocab_en = set(en_counts.keys())
vocab_zh = set(zh_counts.keys())

In [11]:
# Vocabulary sizes: English first, then Chinese, one per line.
print(len(vocab_en), len(vocab_zh), sep='\n')

384495
609044


In [13]:
# Map every English word to an integer id.
# Iterating a set of strings directly yields a different order on each
# interpreter run (string hashing is randomised), which would silently change
# the ids between kernel restarts. Sorting first makes the mapping reproducible.
word2index_en = {word: i for i, word in enumerate(sorted(vocab_en))}

In [14]:
# Map every Chinese word to an integer id.
# Iterating a set of strings directly yields a different order on each
# interpreter run (string hashing is randomised), which would silently change
# the ids between kernel restarts. Sorting first makes the mapping reproducible.
word2index_zh = {word: i for i, word in enumerate(sorted(vocab_zh))}

In [15]:
# Convert every sentence pair into [english_ids, chinese_ids], using the same
# tokenisation that was used to build the vocabularies.
pairs_to_vec = []

for sentence_pair in pairs:
    en_ids = [word2index_en[token] for token in str(sentence_pair[0]).split(' ')]
    zh_ids = [word2index_zh[token] for token in jieba.cut(sentence_pair[1])]
    pairs_to_vec.append([en_ids, zh_ids])

In [91]:
# Settings
# Global switch read by the model code below: when true, modules and tensors
# are moved to the GPU with .cuda().
USE_CUDA = False

In [92]:
# Helper functions
def iterate_minibatches(data, batchsize, shuffle=False):
    """Yield consecutive, full-size mini-batches taken from ``data``.

    Args:
        data: The items to batch. Must support ``len()`` and slicing. Built-in
            lists/tuples and numpy-style arrays are both accepted.
        batchsize: Number of items per batch.
        shuffle: When true, items are visited in a random order drawn with
            ``np.random.shuffle`` (seed numpy for reproducible batches).

    Yields:
        Without shuffling, ``data[start:start + batchsize]``.
        With shuffling, the selected items: a list (or tuple) for built-in
        sequences, otherwise ``data[index_array]``.

    Trailing items that do not fill a complete batch are not yielded.
    """
    length = len(data)
    indices = None
    if shuffle:
        indices = np.arange(length)
        np.random.shuffle(indices)

    for start_idx in range(0, length - batchsize + 1, batchsize):
        if not shuffle:
            yield data[start_idx:start_idx + batchsize]
            continue

        chosen = indices[start_idx:start_idx + batchsize]
        if isinstance(data, (list, tuple)):
            # Built-in sequences cannot be indexed with an index array
            # (TypeError), so gather the selected items one by one.
            picked = [data[i] for i in chosen]
            yield picked if isinstance(data, list) else tuple(picked)
        else:
            yield data[chosen]
    
    
def gen_minibatch(data, batch_size, shuffle=True):
    """Yield mini-batches of ``data``; shuffling is on by default.

    Thin wrapper that forwards its arguments to ``iterate_minibatches`` and
    re-yields every batch it produces.
    """
    yield from iterate_minibatches(data, batch_size, shuffle)
        
def s(name, val):
    """Debug helper: print ``<name>'s size is <val>`` on one line."""
    suffix = "'s size is {}".format(val)
    print(name + suffix)

In [195]:
# Model
class EncoderRNN(nn.Module):
    """Embedding + GRU encoder whose embedding width equals ``hidden_size``.

    Args:
        input_size: Number of rows in the embedding table (source vocabulary size).
        hidden_size: Width of both the embeddings and the GRU hidden state.
        n_layers: Number of stacked GRU layers.
        batch_size: Batch dimension used by ``init_hidden``.
        bidirectional: Whether the GRU runs in both directions.
    """

    def __init__(self, input_size, hidden_size, n_layers=1, batch_size=1, bidirectional=False):
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.batch_size = batch_size
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(input_size, hidden_size)
        # Honour the constructor argument (it used to be hard-coded to False).
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, batch_first=True, bidirectional=bidirectional)

    def forward(self, sens_vec, hidden):
        """Embed ``sens_vec`` and run it through the GRU.

        Returns:
            ``(output, hidden)`` exactly as produced by ``nn.GRU``.
        """
        s("sens_vec", sens_vec.size())
        embedded = self.embedding(sens_vec)
        s("embedded", embedded.size())
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def init_hidden(self):
        """Return an all-zero initial hidden state for the GRU.

        The shape is ``(n_layers * num_directions, batch_size, hidden_size)``,
        which is what ``nn.GRU`` expects for ``h_0``.
        """
        num_directions = 2 if self.bidirectional else 1
        hidden = Variable(t.zeros(self.n_layers * num_directions, self.batch_size, self.hidden_size))
        if USE_CUDA:
            hidden = hidden.cuda()
        return hidden

# Attention layer
class Attn(nn.Module):
    """Attention scored as ``hidden . Linear(encoder_output)``.

    Only the first batch element of ``encoder_outputs`` is scored, so the layer
    effectively handles one sequence at a time.
    """

    def __init__(self, hidden_size):
        super(Attn, self).__init__()

        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size, self.hidden_size)

    def forward(self, hidden, encoder_outputs):
        """Return softmax-normalised attention weights, one per encoder position.

        Args:
            hidden: Decoder state; it must reduce to a 1-D vector after its two
                leading dimensions are squeezed (see ``score``).
            encoder_outputs: Encoder outputs; dimension 1 is the sequence axis.

        Returns:
            1-D tensor of length ``encoder_outputs.size(1)`` that sums to 1.
        """
        s("encoder_outputs", encoder_outputs.size())
        seq_len = encoder_outputs.size()[1]
        attn_energies = Variable(t.zeros(seq_len))

        if USE_CUDA:
            # .cuda() returns a copy; the result must be rebound to take effect.
            attn_energies = attn_energies.cuda()

        for i in range(seq_len):
            attn_energies[i] = self.score(hidden, encoder_outputs[0][i])

        # Explicit dim (PyTorch >= 0.3): normalise over the sequence positions.
        return F.softmax(attn_energies, dim=0)

    def score(self, hidden, encoder_output):
        """Return the scalar energy between ``hidden`` and one encoder output."""
        energy = self.attn(encoder_output)
        # t.dot needs two 1-D tensors, hence the two squeezes on ``hidden``
        # (assumes hidden is shaped (1, 1, hidden_size) -- TODO confirm).
        energy = t.dot(hidden.squeeze(0).squeeze(0), energy)
        return energy
# Improved decoder layer (with attention)
class AttnDecoderRNN(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        hidden_size: Width of the embeddings, GRU state and attention context.
        output_size: Target vocabulary size.
        n_layers: Number of stacked GRU layers.
        dropout_p: Dropout probability passed to ``nn.GRU``.
    """

    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=.1):
        super(AttnDecoderRNN, self).__init__()
        # Hyper-parameters
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p

        # Layers
        self.embedding = nn.Embedding(output_size, hidden_size)
        # The GRU input is the word embedding concatenated with the previous context.
        self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p, batch_first=True)
        # The output layer sees the GRU output concatenated with the new context.
        self.out = nn.Linear(hidden_size * 2, output_size)
        self.attn = Attn(hidden_size)

    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        """Decode one target token.

        Args:
            word_input: Index tensor of the previous target token.
            last_context: Attention context from the previous step; it is
                concatenated with the embedding along dimension 2.
            last_hidden: Previous GRU hidden state.
            encoder_outputs: Encoder outputs to attend over.

        Returns:
            ``(output, context, hidden, attn_weights)`` where ``output`` holds
            log-probabilities over the target vocabulary (last dimension).
        """
        s("word_input", word_input.size())
        word_embedded = self.embedding(word_input)
        s("word_embedded", word_embedded.size())

        s("last_context", last_context.size())
        rnn_input = t.cat((word_embedded, last_context), 2)
        s("rnn_input", rnn_input.size())
        s("last_hidden", last_hidden.size())
        rnn_output, hidden = self.gru(rnn_input, last_hidden)
        s("rnn_output", rnn_output.size())

        attn_weights = self.attn(rnn_output, encoder_outputs)
        s("encoder_outputs", encoder_outputs.size())
        s("attn_weights", attn_weights.unsqueeze(0).unsqueeze(1).size())
        # (1, 1, seq_len) x (1, seq_len, hidden) -> (1, 1, hidden)
        context = attn_weights.unsqueeze(0).unsqueeze(1).bmm(encoder_outputs)
        s("context", context.size())

        # Normalise over the vocabulary axis. Without an explicit dim a 3-D
        # input is normalised over dimension 0, which has size 1 here and
        # therefore produced all-zero "log-probabilities". (PyTorch >= 0.3.)
        output = F.log_softmax(self.out(t.cat((rnn_output, context), 2)), dim=2)

        return output, context, hidden, attn_weights

In [237]:
class EncoderRNN(nn.Module):
    """Embedding + GRU encoder with independent embedding and hidden sizes.

    Args:
        num_words: Number of rows in the embedding table (source vocabulary size).
        embedding_size: Width of the word embeddings.
        hidden_size: Width of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        batch_size: Batch dimension used by ``init_hidden``.
        batch_first: Passed through to ``nn.GRU``.
        bidirectional: Whether the GRU runs in both directions.
    """

    def __init__(self, num_words, embedding_size, hidden_size, num_layers=1, batch_size=1, batch_first=True, bidirectional=False):
        super(EncoderRNN, self).__init__()

        self.num_words = num_words
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.bidirectional = bidirectional

        self.embedded = nn.Embedding(num_words, embedding_size)
        self.cell_layer = nn.GRU(
            embedding_size,
            hidden_size,
            num_layers,
            batch_first=batch_first,
            bidirectional=bidirectional
        )

    def forward(self, sens, hidden):
        """Embed ``sens`` and run it through the GRU.

        Returns:
            ``(output, state)`` exactly as produced by ``nn.GRU``.
        """
        embedded = self.embedded(sens)
        output, state = self.cell_layer(embedded, hidden)

        return output, state

    def init_hidden(self):
        """Return an all-zero initial hidden state for the GRU.

        ``nn.GRU`` expects ``h_0`` shaped
        ``(num_layers * num_directions, batch_size, hidden_size)``. Only the
        first dimension grows for a bidirectional GRU; the last one stays
        ``hidden_size``.
        """
        num_directions = 2 if self.bidirectional else 1

        # The file imports torch as `t`; the bare name `torch` is not defined.
        hidden = Variable(t.zeros(self.num_layers * num_directions, self.batch_size, self.hidden_size))

        if USE_CUDA:
            hidden = hidden.cuda()
        return hidden

In [261]:
class Attn(nn.Module):
    """Attention scored as ``rnn_output . Linear(encoder_output)``.

    Only the first batch element of ``encoder_outputs`` is scored, so the layer
    effectively handles one sequence at a time.
    """

    def __init__(self, hidden_size):
        super(Attn, self).__init__()

        self.hidden_size = hidden_size
        self.attn = nn.Linear(hidden_size, hidden_size)

    def forward(self, rnn_output, encoder_outputs):
        """Return softmax-normalised attention weights, one per encoder position.

        Args:
            rnn_output: Decoder output for the current step; it must reduce to
                a 1-D vector after its two leading dimensions are squeezed.
            encoder_outputs: Encoder outputs; dimension 1 is the sequence axis.

        Returns:
            1-D tensor of length ``encoder_outputs.size(1)`` that sums to 1.
        """
        seq_len = encoder_outputs.size()[1]

        # The file imports torch as `t`; the bare name `torch` is not defined.
        attn_energies = Variable(t.zeros(seq_len))
        if USE_CUDA:
            # .cuda() returns a copy; the result must be rebound to take effect.
            attn_energies = attn_energies.cuda()

        for i in range(seq_len):
            attn_energies[i] = self.score(rnn_output, encoder_outputs[0][i])

        # Explicit dim (PyTorch >= 0.3): normalise over the sequence positions.
        return F.softmax(attn_energies, dim=0)

    def score(self, rnn_output, encoder_output):
        """Return the scalar energy between ``rnn_output`` and one encoder output."""
        energy = self.attn(encoder_output)
        # t.dot needs two 1-D tensors, hence the two squeezes
        # (assumes rnn_output is shaped (1, 1, hidden_size) -- TODO confirm).
        energy = t.dot(rnn_output.squeeze(0).squeeze(0), energy)

        return energy

In [262]:
class DecoderRNN(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        output_size: Target vocabulary size.
        embedding_size: Width of the target word embeddings.
        hidden_size: Width of the GRU hidden state.
        num_layers: Number of stacked GRU layers.
        batch_size: Batch dimension used by ``init_hidden``.
        batch_first: Passed through to ``nn.GRU``.
        bidirectional: Passed through to ``nn.GRU``.
            NOTE(review): with ``bidirectional=True`` the GRU output is
            ``2 * hidden_size`` wide, which does not appear to match the sizes
            of ``self.attn`` / ``self.out`` -- confirm before using it.
    """

    def __init__(self, output_size, embedding_size, hidden_size, num_layers=1, batch_size=1, batch_first=True, bidirectional=False):
        super(DecoderRNN, self).__init__()

        self.output_size = output_size
        self.embedding_size = embedding_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.bidirectional = bidirectional

        self.embedded = nn.Embedding(output_size, embedding_size)
        self.cell_layer = nn.GRU(
            # The GRU input is the word embedding concatenated with the previous
            # context. The former `embedding_size * 2` was only right when both
            # widths were equal (assumes the context is hidden_size wide -- TODO confirm).
            embedding_size + hidden_size,
            hidden_size,
            num_layers,
            batch_first=batch_first,
            bidirectional=bidirectional
        )
        self.attn = Attn(hidden_size)
        # The output layer sees the GRU output concatenated with the new context.
        self.out = nn.Linear(hidden_size * 2, output_size)

    def forward(self, sen_word, last_context, hidden, encoder_outputs):
        """Decode one target token.

        Args:
            sen_word: Index tensor of the previous target token.
            last_context: Attention context from the previous step; it is
                concatenated with the embedding along dimension 2.
            hidden: Previous GRU hidden state.
            encoder_outputs: Encoder outputs to attend over.

        Returns:
            ``(output, context, state, attn_weights)`` where ``output`` holds
            log-probabilities over the target vocabulary (last dimension).
        """
        embedded = self.embedded(sen_word)

        input_rnn = t.cat((embedded, last_context), 2)
        s("rnn_input",  input_rnn.size())
        s("last_hidden", hidden.size())
        output, state = self.cell_layer(input_rnn, hidden)

        attn_weights = self.attn(output, encoder_outputs)

        # (1, 1, seq_len) x (1, seq_len, hidden) -> (1, 1, hidden)
        context = attn_weights.unsqueeze(0).unsqueeze(1).bmm(encoder_outputs)

        # Normalise over the vocabulary axis. Without an explicit dim a 3-D
        # input is normalised over dimension 0, which has size 1 here and
        # therefore produced all-zero "log-probabilities". (PyTorch >= 0.3.)
        output = F.log_softmax(self.out(t.cat((output, context), 2)), dim=2)

        return output, context, state, attn_weights

    def init_hidden(self):
        """Return an all-zero initial hidden state for the GRU.

        ``nn.GRU`` expects ``h_0`` shaped
        ``(num_layers * num_directions, batch_size, hidden_size)``. Only the
        first dimension grows for a bidirectional GRU.
        """
        num_directions = 2 if self.bidirectional else 1

        # The file imports torch as `t`; the bare name `torch` is not defined.
        hidden = Variable(t.zeros(self.num_layers * num_directions, self.batch_size, self.hidden_size))

        if USE_CUDA:
            hidden = hidden.cuda()
        return hidden

In [263]:
# Smoke-test the models on a toy vocabulary of 10 tokens
encoder_test = EncoderRNN(10, 10, 10, 1)
decoder_test = DecoderRNN(10, 10, 10, 1)

print(encoder_test)
print(decoder_test)

encoder_hidden = encoder_test.init_hidden()
# One source sentence of four token ids, shaped (batch=1, seq_len=4).
word_input = Variable(t.LongTensor([[1, 9, 3, 4]]))

if USE_CUDA:
    encoder_test.cuda()
    # .cuda() on a tensor returns a copy, so the result has to be rebound
    # (the bare call used to leave word_input on the CPU).
    word_input = word_input.cuda()

encoder_outputs, encoder_hidden = encoder_test(word_input, encoder_hidden)

# Five target token ids, fed to the decoder one step at a time.
word_inputs = Variable(t.LongTensor([1, 2, 6, 6, 8]))
# (Original author's note: "not fully understood".)
# Buffer for the attention weights of every step:
# (batch=1, 5 decoding steps, 4 source positions).
decoder_attns = t.zeros(1, 5, 4)
# The decoder starts from the encoder's final hidden state and a zero context.
decoder_hidden = encoder_hidden
decoder_context = Variable(t.zeros(1, 1, decoder_test.hidden_size))

if USE_CUDA:
    decoder_test.cuda()
    word_inputs = word_inputs.cuda()
    decoder_context = decoder_context.cuda()

for i in range(5):
    decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[i].view(1, -1), decoder_context, decoder_hidden, encoder_outputs)
    decoder_attns[0, i] = decoder_attn.squeeze(0).cpu().data

EncoderRNN (
  (embedded): Embedding(10, 10)
  (cell_layer): GRU(10, 10, batch_first=True)
)
DecoderRNN (
  (embedded): Embedding(10, 10)
  (cell_layer): GRU(20, 10, batch_first=True)
  (attn): Attn (
    (attn): Linear (10 -> 10)
  )
  (out): Linear (20 -> 10)
)
rnn_input's size is torch.Size([1, 1, 20])
last_hidden's size is torch.Size([1, 1, 10])
rnn_input's size is torch.Size([1, 1, 20])
last_hidden's size is torch.Size([1, 1, 10])
rnn_input's size is torch.Size([1, 1, 20])
last_hidden's size is torch.Size([1, 1, 10])
rnn_input's size is torch.Size([1, 1, 20])
last_hidden's size is torch.Size([1, 1, 10])
rnn_input's size is torch.Size([1, 1, 20])
last_hidden's size is torch.Size([1, 1, 10])
