### 导入数据

In [1]:
import numpy as np
import pandas as pd

# All arrays are pre-computed and stored as .npy files under ./input.

# Training split: model inputs and their labels.
X_train = np.load('./input/dataset_train.npy')
y_train = np.load('./input/train_labels.npy')

# Test split: model inputs and their labels.
X_test = np.load('./input/dataset_test.npy')
y_test = np.load('./input/test_labels.npy')

# Embedding matrix handed to every model builder in this notebook
# (the file name suggests FastText vectors).
embedding_matrix = np.load('./input/embedding_fasttext.npy')

## Traditional LSTM

In [3]:
from lstm import basic_lstm

# Arguments for the model builder. The printed summary shows a (None, 200)
# input and a (None, 200, 300) embedding holding 20000 * 300 parameters.
length, max_features, embed_size = 200, 20000, 300

model = basic_lstm(length, max_features, embed_size, embedding_matrix)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 300)          6000000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 100)          140400    
_________________________________________________________________
batch_normalization_1 (Batch (None, 200, 100)          400       
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
__________

In [4]:
from train import modeltrainer

# Checkpoint file; the training log reports the model being saved here
# whenever val_loss improves.
pth1 = './input/LSTM/model_bst.h5'

# Positional arguments 256 and 15: the log prints "Epoch 1/15", so 15 is the
# epoch count; 256 is presumably the batch size — TODO confirm in train.py.
model = modeltrainer(
    model,
    X_train,
    y_train,
    256,
    15,
    pth1,
)



Train on 143613 samples, validate on 15958 samples
Epoch 1/15
 ROC-AUC - epoch: 1 - score: 0.977787
Epoch 00001: val_loss improved from inf to 0.04907, saving model to ./input/LSTM/model_bst.h5
Epoch 2/15
 ROC-AUC - epoch: 2 - score: 0.982215
Epoch 00002: val_loss improved from 0.04907 to 0.04718, saving model to ./input/LSTM/model_bst.h5
Epoch 3/15
 ROC-AUC - epoch: 3 - score: 0.983785
Epoch 00003: val_loss improved from 0.04718 to 0.04620, saving model to ./input/LSTM/model_bst.h5
Epoch 4/15
 ROC-AUC - epoch: 4 - score: 0.983984
Epoch 00004: val_loss improved from 0.04620 to 0.04607, saving model to ./input/LSTM/model_bst.h5
Epoch 5/15
 ROC-AUC - epoch: 5 - score: 0.984428
Epoch 00005: val_loss improved from 0.04607 to 0.04574, saving model to ./input/LSTM/model_bst.h5
Epoch 6/15
 ROC-AUC - epoch: 6 - score: 0.984608
Epoch 00006: val_loss did not improve
Epoch 7/15
 ROC-AUC - epoch: 7 - score: 0.984544
Epoch 00007: val_loss did not improve
Epoch 8/15
 ROC-AUC - epoch: 8 - score: 0.98

In [5]:
# Predict on the test set with the model exactly as modeltrainer returned it;
# the checkpointed weights are reloaded and scored in a later cell.
y_pred = model.predict(
    X_test,
    batch_size=1024,
    verbose=1,
)



In [6]:
from sklearn.metrics import roc_auc_score

# ROC-AUC of the current predictions against the test labels.
score = roc_auc_score(y_true=y_test, y_score=y_pred)
score

0.9760719670674948

In [7]:
from keras.models import load_model  # NOTE(review): imported but not used in this cell

# Swap in the weights checkpointed during training (the file the log reports
# saving on each val_loss improvement), then predict on the test set again.
model.load_weights('./input/LSTM/model_bst.h5')
y_pred = model.predict(
    X_test,
    batch_size=1024,
    verbose=1,
)



In [8]:
# ROC-AUC after reloading the checkpointed weights.
score = roc_auc_score(y_true=y_test, y_score=y_pred)
score

0.9772473583335124

## GRU

In [2]:
from gru import basic_gru

# Arguments for the model builder. The printed summary shows a (None, 200)
# input and a (None, 200, 300) embedding holding 20000 * 300 parameters.
length, max_features, embed_size = 200, 20000, 300

model = basic_gru(length, max_features, embed_size, embedding_matrix)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 300)          6000000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 200, 300)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 200, 160)          182880    
_________________________________________________________________
batch_normalization_1 (Batch (None, 200, 160)          640       
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                8050      
__________

In [3]:
from train import modeltrainer

# Checkpoint file; the training log reports the model being saved here
# whenever val_loss improves.
pth2 = './input/GRU/model_bst.h5'

# Positional arguments 256 and 15: the log prints "Epoch 1/15", so 15 is the
# epoch count; 256 is presumably the batch size — TODO confirm in train.py.
model = modeltrainer(
    model,
    X_train,
    y_train,
    256,
    15,
    pth2,
)



Train on 143613 samples, validate on 15958 samples
Epoch 1/15
 ROC-AUC - epoch: 1 - score: 0.976215
Epoch 00001: val_loss improved from inf to 0.05057, saving model to ./input/GRU/model_bst.h5
Epoch 2/15
 ROC-AUC - epoch: 2 - score: 0.978109
Epoch 00002: val_loss improved from 0.05057 to 0.04836, saving model to ./input/GRU/model_bst.h5
Epoch 3/15
 ROC-AUC - epoch: 3 - score: 0.981477
Epoch 00003: val_loss improved from 0.04836 to 0.04740, saving model to ./input/GRU/model_bst.h5
Epoch 4/15
 ROC-AUC - epoch: 4 - score: 0.983213
Epoch 00004: val_loss improved from 0.04740 to 0.04636, saving model to ./input/GRU/model_bst.h5
Epoch 5/15
 ROC-AUC - epoch: 5 - score: 0.983285
Epoch 00005: val_loss did not improve
Epoch 6/15
 ROC-AUC - epoch: 6 - score: 0.983505
Epoch 00006: val_loss did not improve
Epoch 7/15
 ROC-AUC - epoch: 7 - score: 0.983446
Epoch 00007: val_loss did not improve
Epoch 8/15
 ROC-AUC - epoch: 8 - score: 0.983588
Epoch 00008: val_loss did not improve
Epoch 9/15
 ROC-AUC -

In [4]:
from keras.models import load_model  # NOTE(review): imported but not used in this cell

# Swap in the weights checkpointed during training (the file the log reports
# saving on each val_loss improvement), then predict on the test set.
model.load_weights('./input/GRU/model_bst.h5')
y_pred = model.predict(
    X_test,
    batch_size=1024,
    verbose=1,
)



In [5]:
from sklearn.metrics import roc_auc_score

# ROC-AUC of the GRU predictions (checkpointed weights) against the test labels.
score = roc_auc_score(y_true=y_test, y_score=y_pred)
score

0.975045391269533

## RCNN

深度学习应用于 NLP 后，文本分类领域衍生出了很多模型，主要分为两类：卷积神经网络和循环神经网络。

卷积神经网络
- 优点：无偏的模型，利用类似于n-gram的特性学习特征提取
- 缺点：忽略了文本上下文信息，卷积核的大小难以确定，小了丢失信息，大了参数太多

循环神经网络
- 优点：可以学习文本上下文的依赖关系
- 缺点：有偏的模型，每个单词对模型影响不一样，靠后的词语权重大

Recurrent Convolutional Neural Network（RCNN）是论文《Recurrent Convolutional Neural Networks for Text Classification》中提出的深度学习模型，综合了上述两者的优点：先用 RNN 学习上下文时序信息，再用最大池化（类似卷积网络中的池化层）提取最有效的特征，模型结构如下：

![RCNN 模型结构](./input/RCNN/architecture.png)

RCNN主要由三部分组成：

- recurrent structure（word representation learning）：对每一个输入的词$w_i$，将它对应的词向量$e(w_i)$输入到双向LSTM模块中，正向学习到上文的信息$c_l(w_i)$，反向学习到下文的信息$c_r(w_i)$，与其原本的词向量拼接形成新的词嵌入:

$$ x_i = [ c_l(w_i); e(w_i); c_r(w_i) ] $$

然后对这个新的词嵌入进行线性转换来代替卷积层，所有词向量共享参数，激活函数为tanh：

$$ y_i^{(2)} = \tanh(W^{(2)}x_i + b^{(2)}) $$

- max-pooling layer(text representation learning): 最大池化提取最重要的文本特征

$$ y^{(3)} = \max \limits_{i=1}^n y_i^{(2)} $$

- output layer：最终由全连接层输出预测结果

$$ y^{(4)} = W^{(4)}y^{(3)} + b^{(4)} $$

原论文链接：https://www.aaai.org/ocs/index.php/AAAI/AAAI15/paper/view/9745/9552

In [2]:
from rcnn import rcnn_model

# Arguments for the model builder. The printed summary shows a (None, 200)
# input and a (None, 200, 300) embedding holding 20000 * 300 parameters.
length, max_features, embed_size = 200, 20000, 300

model = rcnn_model(length, max_features, embed_size, embedding_matrix)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     6000000     input_1[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_1 (SpatialDro (None, 200, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 200, 100)     105300      spatial_dropout1d_1[0][0]        
__________________________________________________________________________________________________
concatenat

In [4]:
from train import modeltrainer

# Checkpoint file; the training log reports the model being saved here
# whenever val_loss improves.
pth3 = './input/RCNN/model_bst.h5'

# Positional arguments 256 and 15: the log prints "Epoch 1/15", so 15 is the
# epoch count; 256 is presumably the batch size — TODO confirm in train.py.
model = modeltrainer(
    model,
    X_train,
    y_train,
    256,
    15,
    pth3,
)



Train on 143613 samples, validate on 15958 samples
Epoch 1/15
 ROC-AUC - epoch: 1 - score: 0.972599
Epoch 00001: val_loss improved from inf to 0.05503, saving model to ./input/RCNN/model_bst.h5
Epoch 2/15
 ROC-AUC - epoch: 2 - score: 0.976091
Epoch 00002: val_loss improved from 0.05503 to 0.05065, saving model to ./input/RCNN/model_bst.h5
Epoch 3/15
 ROC-AUC - epoch: 3 - score: 0.978637
Epoch 00003: val_loss improved from 0.05065 to 0.04816, saving model to ./input/RCNN/model_bst.h5
Epoch 4/15
 ROC-AUC - epoch: 4 - score: 0.978727
Epoch 00004: val_loss did not improve
Epoch 5/15
 ROC-AUC - epoch: 5 - score: 0.979520
Epoch 00005: val_loss improved from 0.04816 to 0.04759, saving model to ./input/RCNN/model_bst.h5
Epoch 6/15
 ROC-AUC - epoch: 6 - score: 0.979452
Epoch 00006: val_loss did not improve
Epoch 7/15
 ROC-AUC - epoch: 7 - score: 0.980069
Epoch 00007: val_loss did not improve
Epoch 8/15
 ROC-AUC - epoch: 8 - score: 0.980192
Epoch 00008: val_loss did not improve
Epoch 9/15
 ROC-A