In [86]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import jieba
from collections import Counter
import time

In [87]:
path = '../data/cmn-eng/'

In [88]:
with open(path + 'cmn.txt') as f:
    lines = f.readlines()
    f.close()

In [89]:
def deal_en_sen( raw ):
    raw.strip()
    letters_only = re.sub("[^a-zA-Z]", " ", raw) 
    words = letters_only.lower().split()                             
    
    return(" ".join(words )) 

def deal_zh_sen( raw ):
    raw.strip()
    letters_only = re.sub("[^\u4e00-\u9fa5]", "", raw)                        
    
    return(letters_only) 

In [90]:
pairs = []
pair = []
for line in lines:
    nen = deal_en_sen(line.split('\t')[0])
    nzh = deal_zh_sen(line.split('\t')[1]) 
    pair.append(nen)
    pair.append(nzh)
    pairs.append(pair)
    pair = []

In [91]:
pairs[0:3]

[['hi', '嗨'], ['hi', '你好'], ['run', '你用跑的']]

In [92]:
en_counts = Counter()
zh_counts = Counter()

In [93]:
for i in range(len(pairs)):
    for word in str(pairs[i][0]).split(' '):
        en_counts[word] += 1
    for word in list(jieba.cut(pairs[i][1])):
        zh_counts[word] += 1

In [94]:
vocab_en = set(en_counts)
vocab_zh = set(zh_counts)

In [95]:
print(len(vocab_en))
print(len(vocab_zh))

5955
13018


In [96]:
word2index_en = {}
for i, word in enumerate(vocab_en):
    word2index_en[word] = i
    
word2index_zh = {}
for i, word in enumerate(vocab_zh):
    word2index_zh[word] = i

In [97]:
pairs_to_vec = []

for i in range(len(pairs)):
    pair_to_vec = []
    pair_en_to_vec = []
    pair_zh_to_vec = []
    for word in str(pairs[i][0]).split(' '):
        pair_en_to_vec.append(word2index_en[word])
    for word in list(jieba.cut(pairs[i][1])):
        pair_zh_to_vec.append(word2index_zh[word])
    pair_to_vec.append(pair_en_to_vec)
    pair_to_vec.append(pair_zh_to_vec)    
    pairs_to_vec.append(pair_to_vec)

In [98]:
# 参数
USE_CUDA = False
MAX_LENGTH = 10
# 功能函数
def iterate_minibatches(data, batchsize, shuffle=False):
    length = len(data)
    if shuffle:
        indices = np.arange(length)
        np.random.shuffle(indices)
        
    for start_idx in range(0, length - batchsize + 1, batchsize):
        if shuffle:
            ran = indices[start_idx:start_idx + batchsize]
        else:
            ran = slice(start_idx, start_idx + batchsize)
        yield data[ran]
    
    
def gen_minibatch(data, batch_size, shuffle=True):
    for pair in iterate_minibatches(data, batch_size, shuffle):
        yield pair
        
def s(name, val):
    print(name + "'s size is {}".format(val))

In [105]:
# Model
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, n_layers=1, batch_size=1, bidirectional=False):
        super(EncoderRNN, self).__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers
        self.batch_size = batch_size
        self.bidirectional = bidirectional

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers, batch_first=True, bidirectional=False)

    def forward(self, sens_vec, hidden):
        s("sens_vec", sens_vec.size())
        embedded = self.embedding(sens_vec)
        s("embedded", embedded.size())
        s("hidden", hidden.size())
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def init_hidden(self):
        hidden = Variable(t.zeros(self.n_layers, 1, self.hidden_size))
        if USE_CUDA: 
            hidden = hidden.cuda()
        return hidden

# Attn 层
class Attn(nn.Module):
    def __init__(self, hidden_size):
        super(Attn, self).__init__()
        
        self.hidden_size = hidden_size
        self.attn = nn.Linear(self.hidden_size, self.hidden_size)
        
    def forward(self, hidden, encoder_outputs):
        s("encoder_outputs", encoder_outputs.size())
        seq_len = encoder_outputs.size()[1]
        attn_energies = Variable(t.zeros(seq_len))
        
        if USE_CUDA:
            attn_energies.cuda()

        for i in range(seq_len):
            attn_energies[i] = self.score(hidden, encoder_outputs[0][i])

        return F.softmax(attn_energies)
    
    def score(self, hidden, encoder_output):
#         s("encoder_output", encoder_output.size())
        energy = self.attn(encoder_output)
        # 矩阵维度有些不理解
#         s("enenrgy", energy.size())
#         s("hidden", hidden.squeeze(0).squeeze(0).size())
        
        energy = t.dot(hidden.squeeze(0).squeeze(0), energy)
#         s("new energy", energy.size())
        return energy
# 改进的解码层
class AttnDecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size, n_layers=1, dropout_p=.1):
        super(AttnDecoderRNN, self).__init__()
        # 定义参数
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.n_layers = n_layers
        self.dropout_p = dropout_p
        
        # 定义层
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p, batch_first=True)
        self.out = nn.Linear(hidden_size * 2, output_size)
        self.attn = Attn(hidden_size)

    def forward(self, word_input, last_context, last_hidden, encoder_outputs):
        s("word_input", word_input.size())
        word_embedded = self.embedding(word_input)
        s("word_embedded", word_embedded.size())
        
        s("last_context", last_context.size())
        rnn_input = t.cat((word_embedded, last_context), 2)
        s("rnn_input", rnn_input.size())
        s("last_hidden", last_hidden.size())
        rnn_output, hidden = self.gru(rnn_input, last_hidden)
        s("rnn_output", rnn_output.size())
        
        attn_weights = self.attn(rnn_output, encoder_outputs)
        s("encoder_outputs", encoder_outputs.size())
        s("attn_weights", attn_weights.unsqueeze(0).unsqueeze(1).size())
        context = attn_weights.unsqueeze(0).unsqueeze(1).bmm(encoder_outputs) 
        s("context", context.size())
        
        output = F.log_softmax(self.out(t.cat((rnn_output, context), 2)))

        return output, context, hidden, attn_weights

In [106]:
teacher_forcing_ratio = 0.5
clip = 5.0
MAX_LENGTH = 10

def train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):

    # Zero gradients of both optimizers
    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()
    loss = 0 # Added onto for each word

    # Get size of input and target sentences
    input_length = input_variable.size()[0]
    target_length = target_variable.size()[0]

    # Run words through encoder
    encoder_hidden = encoder.init_hidden()
    encoder_outputs, encoder_hidden = encoder(input_variable, encoder_hidden)
    
    # Prepare input and output variables
    decoder_input = Variable(torch.LongTensor([[SOS_token]]))
    decoder_context = Variable(torch.zeros(1, decoder.hidden_size))
    decoder_hidden = encoder_hidden # Use last hidden state from encoder to start decoder
    if USE_CUDA:
        decoder_input = decoder_input.cuda()
        decoder_context = decoder_context.cuda()

    # Choose whether to use teacher forcing
    use_teacher_forcing = random.random() < teacher_forcing_ratio
    if use_teacher_forcing:
        
        # Teacher forcing: Use the ground-truth target as the next input
        for di in range(target_length):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output[0], target_variable[di])
            decoder_input = target_variable[di] # Next target is next input

    else:
        # Without teacher forcing: use network's own prediction as the next input
        for di in range(target_length):
            decoder_output, decoder_context, decoder_hidden, decoder_attention = decoder(decoder_input, decoder_context, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output[0], target_variable[di])
            
            # Get most likely word index (highest value) from output
            topv, topi = decoder_output.data.topk(1)
            ni = topi[0][0]
            
            decoder_input = Variable(torch.LongTensor([[ni]])) # Chosen word is next input
            if USE_CUDA: decoder_input = decoder_input.cuda()

            # Stop at end of sentence (not necessary when using known targets)
            if ni == EOS_token: break

    # Backpropagation
    loss.backward()
    torch.nn.utils.clip_grad_norm(encoder.parameters(), clip)
    torch.nn.utils.clip_grad_norm(decoder.parameters(), clip)
    encoder_optimizer.step()
    decoder_optimizer.step()
    
    return loss.data[0] / target_length

In [107]:
def as_minutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

def time_since(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (- %s)' % (as_minutes(s), as_minutes(rs))

In [108]:
# Configuring training
n_epochs = 50000
plot_every = 200
print_every = 1000

# Keep track of time elapsed and running averages
start = time.time()
plot_losses = []
print_loss_total = 0 # Reset every print_every
plot_loss_total = 0 # Reset every plot_every

In [109]:
hidden_size = 500
n_layers = 2
dropout_p = 0.05

# Initialize models
encoder = EncoderRNN(len(vocab_en), hidden_size, n_layers)
decoder = AttnDecoderRNN(hidden_size, len(vocab_zh), n_layers, dropout_p=dropout_p)

# Move models to GPU
if USE_CUDA:
    encoder.cuda()
    decoder.cuda()

# Initialize optimizers and criterion
learning_rate = 0.0001
encoder_optimizer = optim.Adam(encoder.parameters(), lr=learning_rate)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

In [110]:
# Begin!
for epoch in range(1, n_epochs + 1):
    
    # Get training data for this cycle
    indices = np.arange(len(np.array(pairs_to_vec)))
    np.random.shuffle(indices)

    training_pair = pairs_to_vec[indices[0]]
    input_variable = Variable(torch.LongTensor(training_pair[0]))
    target_variable = Variable(torch.LongTensor(training_pair[1]))

    # Run the train function
    loss = train(input_variable, target_variable, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion)

    # Keep track of loss
    print_loss_total += loss
    plot_loss_total += loss

    if epoch == 0: continue

    if epoch % print_every == 0:
        print_loss_avg = print_loss_total / print_every
        print_loss_total = 0
        print_summary = '%s (%d %d%%) %.4f' % (time_since(start, epoch / n_epochs), epoch, epoch / n_epochs * 100, print_loss_avg)
        print(print_summary)

    if epoch % plot_every == 0:
        plot_loss_avg = plot_loss_total / plot_every
        plot_losses.append(plot_loss_avg)
        plot_loss_total = 0

sens_vec's size is torch.Size([5])
embedded's size is torch.Size([5, 500])
hidden's size is torch.Size([2, 1, 500])


RuntimeError: size mismatch, m1: [1 x 5], m2: [500 x 1500] at /pytorch/torch/lib/TH/generic/THTensorMath.c:1293