In [None]:
import numpy as np
import mxnet as mx
import collections
import datetime
import os

from mxnet import autograd, gluon, nd
from mxnet.gluon import nn, rnn, Block
from mxnet.contrib import text
from sklearn import metrics
from sklearn.model_selection import train_test_split
from io import open

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Filler token appended to word sequences shorter than max_seq_len.
PAD = '<PAD>'
# Label appended to label sequences at the padded positions.
NOT = 'N'
# Part-of-speech tag appended to tag sequences at the padded positions.
PAD_NATURE = 'r'

In [None]:
# Optimisation settings read by the training cell.
epochs = 5
drop_prob = 0.2
batch_size = 256
learning_rate = 0.01

# Every sequence is padded or truncated to this many tokens.
max_seq_len = 30

# Embedding widths of the three per-token features (word, part of speech,
# relative distance).
word_vec_size = 200
nature_vec_size = 50
distance_vec_size = 50
# Output channels of each convolution branch.
num_channels = 10
# The three embeddings are concatenated per token, so each kernel spans the
# full concatenated width.
conv_width = word_vec_size + nature_vec_size + distance_vec_size
# One (height, width) kernel per convolution branch: windows of 2, 3 and 4 tokens.
kernels_size_ls = [(2, conv_width), (3, conv_width), (4, conv_width)]
# Passed to CNN_Model, which accepts it but does not read it.
padding_ls = None
# Pooling window applied after each convolution.
pool_size = (2, 1) 
# Width of the final dense layer (number of scores produced per prediction).
output_size = 6
# Number of distinct relative offsets between two positions of a
# max_seq_len-long sequence: -(max_seq_len - 1) .. +(max_seq_len - 1).
distance_size = 2 * max_seq_len - 1

# Device on which model parameters and training data are placed.
ctx = mx.gpu()

In [None]:
def read_data(max_seq_len):
    """Load the parallel word / label / part-of-speech files as fixed-length sequences.

    The three files are read line by line in lockstep. Lines whose label
    sequence contains an empty token are skipped. Every remaining sequence is
    truncated to ``max_seq_len`` tokens and then padded to exactly
    ``max_seq_len`` tokens with ``PAD`` (words), ``NOT`` (labels) or
    ``PAD_NATURE`` (part-of-speech tags).

    Each list is truncated and padded on its own, so the three sequences of a
    sample always have length ``max_seq_len`` even if the source lines
    disagree in token count.

    Padding symbols are not added to the token counts used to build the
    vocabularies; ``PAD`` is registered as a reserved token of the word
    vocabulary instead.

    Args:
        max_seq_len: Number of tokens every returned sequence has.

    Returns:
        Tuple ``(word_vocab, label_vocab, nature_vocab, input_seqs,
        output_seqs, nature_seqs)`` where the ``*_seqs`` are lists of token
        lists, each of length ``max_seq_len``.
    """
    input_tokens = []   # every word seen (with repeats), excluding padding
    output_tokens = []  # every label seen (with repeats), excluding padding
    nature_tokens = []  # every part-of-speech tag seen (with repeats), excluding padding
    input_seqs = []     # one fixed-length token list per kept line
    output_seqs = []    # same layout as input_seqs
    nature_seqs = []    # same layout as input_seqs

    # NOTE(review): no encoding is given, so the platform default is used.
    with open("../data_for_seq2seq/re_cut_lines_word.txt", 'r') as fx, open("../data_for_seq2seq/re_cut_lines_label.txt", 'r') as fy, open("../data_for_seq2seq/re_cut_lines_nature.txt", 'r') as fn:
        for word_line, label_line, nature_line in zip(fx, fy, fn):
            cur_output_tokens = label_line.strip().split(' ')

            # An empty label token (blank line or doubled space) marks a
            # malformed sample; drop it.
            if '' in cur_output_tokens:
                continue

            cur_input_tokens = word_line.strip().split(' ')[:max_seq_len]
            cur_output_tokens = cur_output_tokens[:max_seq_len]
            cur_nature_tokens = nature_line.strip().split(' ')[:max_seq_len]

            # Count tokens before padding so filler symbols do not skew the
            # vocabulary frequencies.
            input_tokens.extend(cur_input_tokens)
            output_tokens.extend(cur_output_tokens)
            nature_tokens.extend(cur_nature_tokens)

            # Pad each list independently; a non-positive repeat count yields
            # an empty list, so full-length sequences are left unchanged.
            input_seqs.append(
                cur_input_tokens + [PAD] * (max_seq_len - len(cur_input_tokens)))
            output_seqs.append(
                cur_output_tokens + [NOT] * (max_seq_len - len(cur_output_tokens)))
            nature_seqs.append(
                cur_nature_tokens + [PAD_NATURE] * (max_seq_len - len(cur_nature_tokens)))

    fr_vocab = text.vocab.Vocabulary(collections.Counter(input_tokens), reserved_tokens=[PAD])

    output_counter = collections.Counter(output_tokens)
    # Show the label distribution.
    print(output_counter)
    en_vocab = text.vocab.Vocabulary(output_counter)

    nature_vocab = text.vocab.Vocabulary(collections.Counter(nature_tokens))

    return fr_vocab, en_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs

In [None]:
# Build the three vocabularies and the fixed-length token sequences from the corpus files.
input_vocab, output_vocab, nature_vocab, input_seqs, output_seqs, nature_seqs = read_data(max_seq_len)

In [None]:
# Word vocabulary size; later passed to CNN_Model as the word embedding's input dimension.
len(input_vocab)

In [None]:
# Inspect the label vocabulary's index-to-token table.
output_vocab.idx_to_token

In [None]:
# Part-of-speech vocabulary size; later passed to CNN_Model as the tag embedding's input dimension.
len(nature_vocab)

In [None]:
# Convert the token sequences to index matrices of shape
# (num_sequences, max_seq_len), cached on disk as .npy files.
# Both branches leave X, Y and nature as float32 NumPy arrays, so the later
# cells see the same types on a fresh run as on a cached run.
# NOTE: the cache is keyed only by file existence. Delete the .npy files after
# changing max_seq_len or the corpus, otherwise stale indices are loaded.
cache_dir = "../data_for_cnn_lstm"
cache_paths = [os.path.join(cache_dir, file_name)
               for file_name in ("X.npy", "Y.npy", "nature.npy")]

if all(os.path.exists(path) for path in cache_paths):
    print("Loading...")
    X, Y, nature = [np.load(path) for path in cache_paths]
    print("End")
else:
    print("Converting...")
    # The reshape pins the (N, max_seq_len) layout even for an empty corpus
    # and fails loudly if any sequence has the wrong length.
    X = np.array([input_vocab.to_indices(seq) for seq in input_seqs],
                 dtype=np.float32).reshape(-1, max_seq_len)
    Y = np.array([output_vocab.to_indices(seq) for seq in output_seqs],
                 dtype=np.float32).reshape(-1, max_seq_len)
    nature = np.array([nature_vocab.to_indices(seq) for seq in nature_seqs],
                      dtype=np.float32).reshape(-1, max_seq_len)

    # np.save does not create missing directories.
    if not os.path.isdir(cache_dir):
        os.makedirs(cache_dir)
    for path, array in zip(cache_paths, (X, Y, nature)):
        np.save(path, array)
    print("End")

In [None]:
# Sanity check: the tag matrix and the word matrix should have the same shape.
nature.shape, X.shape

In [None]:
# Hold out 10% of the samples for testing. The three arrays are split with the
# same row selection, and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test, nature_train, nature_test = train_test_split(X, Y, nature, test_size=0.1, random_state=33)
# Display the shapes of the training and test partitions.
((X_train.shape, Y_train.shape, nature_train.shape), (X_test.shape, Y_test.shape, nature_test.shape))

In [None]:
# Copy the training arrays to the target device and serve them as
# (words, labels, tags) triples in shuffled mini-batches.
dataset_train = gluon.data.ArrayDataset(nd.array(X_train, ctx=ctx), nd.array(Y_train, ctx=ctx), nd.array(nature_train, ctx=ctx))
# last_batch='rollover': see the Gluon DataLoader docs; leftover samples that do
# not fill a batch are expected to be carried into the next epoch.
data_iter_train = gluon.data.DataLoader(dataset_train, batch_size, shuffle=True, last_batch='rollover')

In [None]:
# The training data now lives in dataset_train; drop the original arrays to free memory.
del X_train, Y_train, nature_train

***
## 定义模型

In [None]:
class CNN_Model(nn.Block):
    """Multi-width text CNN over concatenated word, tag and distance embeddings.

    Each token is represented by the concatenation of its word embedding, its
    part-of-speech embedding and the embedding of a per-token distance index.
    The resulting (sequence, features) matrix is treated as a one-channel
    image. Several convolutions with different kernel heights are applied,
    each followed by max pooling. The pooled maps are flattened, concatenated,
    passed through dropout and projected to ``output_size`` scores.

    Args:
        vocab_size: Input dimension of the word embedding.
        word_vec_size: Output dimension of the word embedding.
        nature_size: Input dimension of the part-of-speech embedding.
        nature_vec_size: Output dimension of the part-of-speech embedding.
        distance_size: Input dimension of the distance embedding.
        distance_vec_size: Output dimension of the distance embedding.
        num_channels: Output channels of every convolution.
        kernels_size_ls: Iterable of kernel sizes, one convolution per entry.
        padding_ls: Accepted for interface compatibility; not used.
        pool_size: Window of the max-pooling layer.
        output_size: Number of units of the final dense layer.
        drop_prob: Dropout rate applied before the dense layer.
        **kwargs: Forwarded to ``nn.Block``.
    """

    def __init__(self, vocab_size, word_vec_size, nature_size, nature_vec_size, distance_size, distance_vec_size,
                 num_channels, kernels_size_ls, padding_ls, pool_size, output_size,
                 drop_prob=0.2,  **kwargs):
        super(CNN_Model, self).__init__(**kwargs)
        with self.name_scope():
            self.word_embedding = nn.Embedding(vocab_size, word_vec_size)
            self.nature_embedding = nn.Embedding(nature_size, nature_vec_size)
            self.distance_embedding = nn.Embedding(distance_size, distance_vec_size)
            self.num_channels = num_channels
            self.kernels_size_ls = kernels_size_ls
            self.conv_ls = []
            for kernel_size in kernels_size_ls:
                conv = nn.Conv2D(channels=num_channels, kernel_size=kernel_size, activation='relu',
                                 weight_initializer="normal")
                # Gluon only auto-registers Blocks assigned directly as
                # attributes. Blocks kept in a plain list must be registered
                # explicitly, otherwise collect_params() skips them and they
                # are never initialised or trained.
                self.register_child(conv)
                self.conv_ls.append(conv)
            self.max_pool = nn.MaxPool2D(pool_size=pool_size)
            self.flatten = nn.Flatten()
            self.dense = nn.Dense(output_size)
            self.drop = nn.Dropout(drop_prob)

    def forward(self, x_input, nature_input, distance_input):
        """Score one batch.

        Args:
            x_input: Word indices.
            nature_input: Part-of-speech indices, same layout as ``x_input``.
            distance_input: Distance indices, same layout as ``x_input``.

        Returns:
            Output of the final dense layer, one row of ``output_size`` scores
            per sample.
        """
        batch_words_embed = self.word_embedding(x_input)
        batch_nature_embed = self.nature_embedding(nature_input)
        batch_distance_embed = self.distance_embedding(distance_input)

        # (batch_size, height, width): concatenate the three embeddings along
        # the feature axis.
        batch_data_x = nd.concat(batch_words_embed, batch_nature_embed, batch_distance_embed, dim=2)
        # (batch_size, 1, height, width): add the channel axis Conv2D expects.
        batch_data_x = nd.expand_dims(batch_data_x, axis=1)

        conv_pool_result = []
        for conv in self.conv_ls:
            conv_result = conv(batch_data_x)    # (batch_size, num_channels, out_height, out_width)
            pool_result = self.max_pool(conv_result)    # (batch_size, num_channels, new_height, new_width)
            pool_result = self.flatten(pool_result)     # (batch_size, num_channels * new_height * new_width)
            conv_pool_result.append(pool_result)
        # (batch_size, sum over branches of num_channels * new_height * new_width)
        conv_pool_result_concated = nd.concat(*conv_pool_result, dim=1)
        conv_pool_result_concated = self.drop(conv_pool_result_concated)
        output = self.dense(conv_pool_result_concated)

        return output
        

In [None]:
# Label indices 0 .. len(output_vocab) - 1 in ascending order. They are generated
# directly rather than read from token_to_idx.values(), because dict iteration
# order is not guaranteed to follow index order on older Python versions, and a
# permuted order would permute the rows of the one-hot lookup table built from it.
dic_value = nd.arange(len(output_vocab), ctx=ctx)

In [None]:
# One-hot lookup table: row k is the one-hot encoding of dic_value[k], with as
# many classes as there are entries in dic_value. train() picks rows from it
# using label indices.
label_one_hot = nd.one_hot(dic_value, dic_value.shape[0])

In [None]:
# Inspect the one-hot lookup table.
label_one_hot

In [None]:
def generate_cnn_input(word_data, nature, batch_distance, pos):
    """Pick column ``pos`` out of each of the three batch arrays.

    Args:
        word_data: 2-D array of word indices.
        nature: 2-D array of part-of-speech indices.
        batch_distance: 2-D array of distance indices.
        pos: Column to select from every array.

    Returns:
        Tuple ``(x_input, nature_input, distance_input)``, each being the
        ``[:, pos]`` slice of the corresponding argument.
    """
    # All three inputs are sliced identically, so handle them in one pass.
    columns = tuple(batch[:, pos] for batch in (word_data, nature, batch_distance))
    return columns

In [None]:
def train(model, max_seq_len, label_one_hot, output_vocab, learning_rate, ctx):
    """Train ``model`` to label every position of each sequence and plot the epoch losses.

    For every mini-batch the model is run once per sequence position. Each run
    receives the whole word and tag sequences plus, for every token, its
    offset relative to the position currently being labelled. The mean
    cross-entropy of each position is summed into a single batch loss, which
    is back-propagated once per batch.

    Reads the notebook-level globals ``epochs`` and ``data_iter_train``.

    Args:
        model: Gluon block called as ``model(words, tags, distances)`` that
            returns one row of class scores per sample.
        max_seq_len: Maximum sequence length; used to shift relative offsets
            into the non-negative range ``[0, 2 * max_seq_len - 2]``.
        label_one_hot: Lookup table whose rows are selected by label index to
            obtain dense (one-hot) targets.
        output_vocab: Unused; kept so existing callers keep working.
        learning_rate: Learning rate of the Adam optimiser.
        ctx: Device on which parameters and temporaries are created.

    Returns:
        None. Progress is printed and the per-epoch mean batch loss is plotted.
    """
    # Initialise the model parameters and create the optimiser.
    model.collect_params().initialize(mx.init.Xavier(), ctx=ctx)

    optimizer = gluon.Trainer(model.collect_params(), 'adam',
                              {'learning_rate': learning_rate})

    # sparse_label=False: targets are the dense one-hot rows taken from label_one_hot.
    softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss(sparse_label=False)

    total_loss = []
    for epoch in range(0, epochs):
        epoch_loss = 0.0
        batch_idx = 0
        for x, y, nature in data_iter_train:
            # Use the real batch dimensions instead of the global batch_size so
            # the broadcast below always matches x.
            cur_batch_size, seq_len = x.shape

            with autograd.record():
                batch_loss = nd.array([0], ctx=ctx)

                for word_idx in range(seq_len):
                    # Offset of every token relative to the target position.
                    # Raw offsets lie in [-(seq_len - 1), seq_len - 1], and an
                    # embedding lookup needs non-negative indices, so shift
                    # them by max_seq_len - 1 into [0, 2 * max_seq_len - 2].
                    distance = nd.arange(seq_len, ctx=ctx) - word_idx + (max_seq_len - 1)
                    # batch_distance shape: (batch_size, seq_len)
                    batch_distance = nd.broadcast_axis(distance.reshape((1, -1)),
                                                       axis=0, size=cur_batch_size)

                    outputs = model(x, nature, batch_distance)
                    label = nd.take(label_one_hot, y[:, word_idx])

                    batch_loss = batch_loss + nd.mean(softmax_cross_entropy(outputs, label))

            batch_loss.backward()
            optimizer.step(cur_batch_size)

            # Fetch the scalar once; asscalar() synchronises with the device.
            batch_loss_value = batch_loss.asscalar()
            epoch_loss += batch_loss_value

            if batch_idx % 100 == 0:
                print("epoch: {0} , batch: {1}, batch_loss: {2}".format(epoch, batch_idx, batch_loss_value))
            batch_idx += 1

        epoch_loss = epoch_loss / batch_idx

        total_loss.append(epoch_loss)

        print("epoch: {0} , epoch_loss: {1}".format(epoch, epoch_loss))
        print("-----------------------------------------------------")

    plt.plot(range(epochs), total_loss)
    plt.xlabel("epoch")
    plt.ylabel("mean batch loss")
    plt.show()
    
             

In [None]:
# Instantiate the CNN with embedding table sizes taken from the vocabularies
# and the remaining dimensions taken from the hyperparameter cell.
model = CNN_Model(len(input_vocab), word_vec_size, len(nature_vocab), nature_vec_size, distance_size, distance_vec_size,
                 num_channels, kernels_size_ls, padding_ls, pool_size, output_size, drop_prob=drop_prob)

In [None]:
# Run training; prints progress and plots the per-epoch loss when finished.
train(model, max_seq_len, label_one_hot, output_vocab, learning_rate, ctx)