In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch import optim
# 导入 SRU
from cuda_functional import SRU, SRUCell

import re
import tqdm
import jieba
import random

# 1.数据处理部分

In [2]:
# Use the GPU only when one is actually available, so the notebook also runs
# on CPU-only machines instead of failing at the first `.cuda()` call.
USE_CUDA = torch.cuda.is_available()
# Directory prefix of the parallel corpus; file names are appended to it as-is,
# so it must end with a path separator.
path = './data/cmn-eng/'

# Vocabulary ids reserved for the start-of-sentence / end-of-sentence markers.
SOS_token = 0
EOS_token = 1
# NOTE(review): not referenced in the visible cells; presumably the maximum
# sentence length used later for filtering/decoding -- confirm.
MAX_LENGTH = 10

In [3]:
def isChinese(sen):
    """Search `sen` for a run of characters in the range U+4E00..U+9FA5.

    Returns the `re` match object for the first such run (truthy), or None
    when the string contains no character from that range.
    """
    return re.search(u'[\u4e00-\u9fa5]+', sen)
# Simplify a sentence so it is easier to tokenize.
def normalize_string(s):
    """Replace punctuation in `s` with single spaces.

    Every occurrence of one of the listed ASCII / full-width punctuation
    characters is replaced by one space; nothing is stripped or collapsed, so
    trailing punctuation leaves a trailing space.

    The pattern is written as one triple-quoted raw string so that the ASCII
    double quote is really part of the character class. Previously the `""`
    in the middle of the pattern ended one string literal and started the
    next, which silently dropped `"` from the set.

    Args:
        s: the sentence to normalize.

    Returns:
        The sentence with each matched punctuation character replaced by ' '.
    """
    return re.sub(r"""[!！？.()（）"?。“”，,']""", " ", s)

In [4]:
class Lang:
    """Vocabulary of one language: word <-> id mappings plus word counts.

    Ids 0 and 1 are pre-assigned to the "SOS" and "EOS" markers in
    `index2word`; real words receive ids starting at 2 in order of first
    appearance.
    """

    def __init__(self, name):
        self.name = name
        self.word2index = {}
        self.word2count = {}
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2  # next free id; 0 and 1 are taken by SOS / EOS

    def index_words(self, sentence):
        """Tokenize `sentence` and register every token in the vocabulary.

        Sentences containing Chinese characters are segmented with jieba;
        all others are split on single spaces.
        """
        tokens = jieba.cut(sentence) if isChinese(sentence) else sentence.split(' ')
        for token in tokens:
            self.index_word(token)

    def index_word(self, word):
        """Count one occurrence of `word`, assigning it a new id if unseen."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        new_id = self.n_words
        self.word2index[word] = new_id
        self.word2count[word] = 1
        self.index2word[new_id] = word
        self.n_words = new_id + 1

In [5]:
def read_sen(path, lang1, lang2, reverse=False):
    """Load the tab-separated sentence pairs of a language pair.

    Reads the file ``path + '<lang1>-<lang2>.txt'``. Each non-empty line is
    stripped, split on tabs, and every field is passed through
    `normalize_string`.

    Args:
        path: directory prefix; concatenated with the file name as-is, so it
            must end with a path separator.
        lang1: first language name (also the first part of the file name).
        lang2: second language name (also the second part of the file name).
        reverse: when True, the fields of every line are reversed and the
            roles of the two languages are swapped.

    Returns:
        A tuple ``(input_lang, output_lang, pairs)``. The two `Lang` objects
        are freshly created and still empty (no words are indexed here);
        `pairs` is a list of lists of normalized sentences.
    """
    filename = path + '{}-{}.txt'.format(lang1, lang2)
    pairs = []
    # Decode explicitly as UTF-8: the corpus contains Chinese text and the
    # platform default encoding is not UTF-8 everywhere.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A blank line would yield a one-element "pair" and break
                # callers that index pair[1].
                continue
            fields = line.split('\t')
            if reverse:
                fields.reverse()
            pairs.append([normalize_string(sen) for sen in fields])

    if reverse:
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

In [6]:
def data_preprocess(path, lang1, lang2, reverse=False):
    """Read the corpus and build the vocabularies of both languages.

    Delegates loading to `read_sen`, then feeds the first sentence of every
    pair to the input vocabulary and the second one to the output vocabulary.
    Progress messages are printed along the way.

    Returns:
        ``(input_lang, output_lang, pairs)`` with both `Lang` objects populated.
    """
    print("Read lines......")
    src_lang, tgt_lang, sentence_pairs = read_sen(path, lang1, lang2, reverse)
    n_pairs = len(sentence_pairs)
    print("Trimmed  to {} sentence pairs".format(n_pairs))

    print("Indexing words......")
    for sentence_pair in sentence_pairs:
        src_lang.index_words(sentence_pair[0])
        tgt_lang.index_words(sentence_pair[1])

    return src_lang, tgt_lang, sentence_pairs
    
# Build the notebook-wide vocabularies and sentence pairs, with 'eng' as lang1
# and 'cmn' as lang2 (reverse is left at its default of False).
input_lang, output_lang, pairs = data_preprocess(path, 'eng', 'cmn')
# Print five randomly chosen pairs as a quick sanity check; no random seed is
# set in the visible cells, so the samples differ from run to run.
for i in range(5):
    print(random.choice(pairs))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache


Read lines......
Trimmed  to 19056 sentence pairs
Indexing words......


Loading model cost 0.718 seconds.
Prefix dict has been built succesfully.


['Their house is being remodeled ', '他們的房子正在裝潢 ']
['We sometimes swim in the lake ', '我們偶爾在湖裡游泳 ']
['They are my brothers ', '他们是我的兄弟 ']
['You look as healthy as ever ', '你看起來健康如昔 ']
['Dad bought me a camera ', '爸爸给我买了一个照相机 ']


# 2.PyTorch 搭建模型

## 2.1.数据部分

In [7]:
# Word ids of a sentence.
def ixs_from_sen(lang, sen):
    """Convert a sentence into the list of its word ids in `lang`.

    The sentence is tokenized the same way `Lang.index_words` does it: jieba
    segmentation when it contains Chinese characters, a split on single
    spaces otherwise.

    Fixes two defects: the language check was applied to the `Lang` object
    instead of the sentence (a TypeError inside the regex search), and the
    Chinese branch tokenized an empty string instead of `sen`.

    Args:
        lang: vocabulary object exposing a `word2index` mapping.
        sen: the sentence to convert.

    Returns:
        A list of integer ids, one per token.

    Raises:
        KeyError: if a token is not present in `lang.word2index`.
    """
    if isChinese(sen):
        words = jieba.cut(sen)
    else:
        words = sen.split(' ')

    return [lang.word2index[word] for word in words]

def var_from_sen(lang, sen):
    """Turn a sentence into a column Variable of word ids ending with EOS.

    The ids come from `ixs_from_sen`, `EOS_token` is appended, and the result
    is reshaped to (number_of_ids, 1). The Variable is moved to the GPU when
    `USE_CUDA` is set.
    """
    token_ids = ixs_from_sen(lang, sen) + [EOS_token]
    column = torch.LongTensor(token_ids).view(-1, 1)
    var = Variable(column)
    return var.cuda() if USE_CUDA else var
    

def var_from_pair(pair):
    """Convert a sentence pair into (input, output) id Variables.

    `pair[0]` is encoded with the module-level `input_lang` vocabulary and
    `pair[1]` with `output_lang`.

    Fixes a defect: the second result was assigned to `input_variable` again,
    discarding the encoded input and leaving `output_variable` undefined
    (NameError on return).

    Args:
        pair: sequence whose first two items are the source and the target
            sentence.

    Returns:
        A tuple ``(input_variable, output_variable)``.
    """
    input_variable = var_from_sen(input_lang, pair[0])
    output_variable = var_from_sen(output_lang, pair[1])

    return (input_variable, output_variable)

## 2.2.模型搭建

In [8]:
# Encoder layer
class EncoderRNN(nn.Module):
    """Embedding followed by a (multi-layer) GRU, run with batch size 1."""

    def __init__(self, input_size, hidden_size, n_layers=1):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # The embedding maps each of the `input_size` ids to a `hidden_size`
        # vector, which is also the GRU's input and state width.
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, n_layers)

    def forward(self, X, hidden):
        """Encode the id sequence `X`, starting from the state `hidden`.

        Returns the ``(output, hidden)`` pair produced by the GRU.
        """
        steps = len(X)
        # Only the ids are passed here: the embedding width (hidden_size) was
        # already fixed when self.embedding was constructed.
        embedded = self.embedding(X)
        gru_input = embedded.view(steps, 1, -1)
        return self.gru(gru_input, hidden)

    def init_hidden(self):
        """Return an all-zero initial state of shape (n_layers, 1, hidden_size)."""
        zeros = torch.zeros(self.n_layers, 1, self.hidden_size)
        hidden = Variable(zeros)
        return hidden.cuda() if USE_CUDA else hidden

In [9]:
# 带有注意力机制的解码层

## 原始的 decoder with attention
?class BahdanauAttnDecoderRNN(nn.Module):
## attention layer
?class Attn(nn.Module):
## 改进的 decoder with attention
?class AttnDecoderRNN(nn.MOdule):

Object `class` not found.
Object `class` not found.
Object `class` not found.


## 2.3.训练

In [10]:
def train():
    """Placeholder for the training step; not implemented yet, returns None."""
    return None

## 2.4.模型评估

In [11]:
def evaluate():
    """Placeholder for model evaluation; not implemented yet, returns None."""
    return None

def evaluate_randomly():
    """Placeholder for evaluating a random sample; not implemented yet, returns None."""
    return None

# Visualize attention
def show_attention():
    """Placeholder for the attention visualization; not implemented yet, returns None."""
    return None

def eva_and_show_attention():
    """Placeholder for evaluating and then showing attention; not implemented yet, returns None."""
    return None

## 2.5.模型提高