In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import keras
from keras.models import Model
from keras.layers import Input, Dense, Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, LSTM,Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers import CuDNNLSTM, CuDNNGRU
from keras.preprocessing import text, sequence
from keras.optimizers import Adam
from keras.callbacks import Callback
from keras import optimizers
from keras.layers import Lambda
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint,EarlyStopping
import warnings
import os
os.environ['OMP_NUM_THREADS'] = '4'
from sklearn.model_selection import KFold
import time
import cfg

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# Pretrained word vectors used to initialise the embedding layer.
EMBEDDING_FILE = cfg.data_path + 'wiki_zh.vec'

# NOTE: read_pickle can execute arbitrary code -- only load files you trust.
df_all = pd.read_pickle(cfg.data_path + 'all_v2.pkl')
print('df_all.shape: ', df_all.shape)

# Take explicit copies: `test` receives a new 'label' column later on, and
# writing into a slice of df_all triggers SettingWithCopyWarning.
train = df_all.loc[df_all['type'] == 'train'].copy()
test = df_all.loc[df_all['type'] == 'test'].copy()
print('df_tr.shape: ', train.shape)

df_all.shape:  (224382, 4)
df_tr.shape:  (146341, 4)


In [3]:
# Raw text and integer class labels for training; raw text only for the test rows.
X_train, y_train = train["query"], train["label"]
X_test = test["query"]

# Quick look at what was selected.
print(X_train.head())
print(y_train[:10])
print(y_train.shape)
print(X_test.head())

0    今年22岁，即将在2014年6月毕业的西昌学院大四学生吴学娅选择不大。作为食品专业的学生，她...
1    “现在我们每天生产40万到50万个，回收4,000-5,000个包括我们生产的死灯泡。”他说...
2    。记者悄悄来到医院二楼病房。只见一名睫毛浓密、皮肤黝黑的小姑娘被捆住双脚，躺在病床上。她就是...
3    一带一路倡议被形容为针对印度。反华立场可以帮助印度政治家赢得选票。中国威胁论被夸大了一段时间...
4    如同中国的金融体系没有在这次全球的金融危机中受到重大冲击是得益于中国尚未开放的资本市场和外汇...
Name: query, dtype: object
0    3
1    2
2    0
3    2
4    0
5    3
6    0
7    2
8    1
9    2
Name: label, dtype: int64
(146341,)
0    昨天，她们还在学校旁的小摊贩那儿淘便宜的发卡；今天，她们穿上制服走上街头对小摊小贩说“请不要...
1    难题ACROSS1只非洲蛇5大肆宣扬9印度州14州1715美元的对口16沉积岩17强调讽刺2...
2    专家认为，未来对贪腐官员的惩治和预防腐败体系的构建都很关键。一方面应该提高腐败的成本，同时还...
3    英国首相特蕾莎·梅（左）和德国总理安格拉·默克尔图片：CFP英国首相特蕾莎·梅终于与北爱尔兰...
4    5月4日9时30分左右，一名中年男子来到南宁市良庆区南晓镇南晓街水果市场的一家服装店里。男子...
Name: query, dtype: object


In [6]:
max_features = 100000  # vocabulary cap handed to the Tokenizer as num_words
# max_features = 1000
maxlen = 800  # length every id sequence is padded/truncated to
embed_size = 300  # number of columns in the embedding matrix
batch_size = 1024  # used for both model.fit and model.predict
# epochs = 30
epochs = 10  # maximum number of training epochs passed to model.fit

In [7]:
# Read the raw text straight from the frames instead of rebinding X_train/X_test
# from themselves: the cell can then be re-run without feeding already-tokenized
# id lists back into the tokenizer.
# NOTE(review): Tokenizer splits on whitespace by default and the sample queries
# printed earlier contain no spaces -- confirm the text is segmented upstream,
# otherwise long character runs end up as single tokens.
tok = text.Tokenizer(num_words=max_features)
tok.fit_on_texts(list(train["query"]) + list(test["query"]))
X_train = tok.texts_to_sequences(train["query"])
X_test = tok.texts_to_sequences(test["query"])
# Fixed-length integer matrices consumed by the model.
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)

In [8]:
# Parse the pretrained vectors into {token: float32 vector}.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf8') as f:
    for line in f:
        values = line.rstrip().rsplit(' ')
        # Keep only well-formed rows (token + embed_size numbers). This drops the
        # "<vocab_size> <dim>" header that fastText-style .vec files start with,
        # as well as rows whose token itself contains spaces, which would
        # otherwise store a wrong-length vector or fail the float conversion.
        if len(values) != embed_size + 1:
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [9]:
# Build the (num_words, embed_size) lookup table; row k holds the pretrained
# vector of the token whose tokenizer id is k.
word_index = tok.word_index
num_words = min(max_features, len(word_index) + 1)
embedding_matrix = np.zeros((num_words, embed_size))
for token, token_id in word_index.items():
    # Ids at or beyond the vocabulary cap have no row in the matrix.
    if token_id < max_features:
        pretrained = embeddings_index.get(token)
        # Tokens without a pretrained vector keep their all-zero row.
        if pretrained is not None:
            embedding_matrix[token_id] = pretrained

In [10]:
# Eyeball the matrix; the repr is abbreviated to its corners, so this is only a spot check.
embedding_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [11]:
# Frozen pretrained embeddings -> BiGRU -> 1D conv -> (avg, max) pooling -> softmax.
sequence_input = Input(shape=(maxlen,))
# Size the embedding from the matrix itself: the matrix has
# min(max_features, vocab_size + 1) rows, so hard-coding max_features makes the
# supplied weights mismatch the layer whenever the vocabulary is smaller.
x = Embedding(embedding_matrix.shape[0], embed_size,
              weights=[embedding_matrix], trainable=False)(sequence_input)
x = SpatialDropout1D(0.2)(x)
x = Bidirectional(GRU(128, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = Conv1D(64, kernel_size=3, padding="valid", kernel_initializer="glorot_uniform")(x)
# Summarise the convolved sequence with both pooling views and concatenate them.
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
x = concatenate([avg_pool, max_pool])
# Four output units, one per class in the one-hot labels.
preds = Dense(4, activation="softmax")(x)
model = Model(sequence_input, preds)
model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=1e-3), metrics=['accuracy'])

In [12]:
# Encode from the source column rather than from y_train itself, so re-running
# this cell does not one-hot-encode labels that are already one-hot.
y_train = keras.utils.to_categorical(train["label"], num_classes=4)

print(y_train.shape)
print(y_train[:5])

(146341, 4)
[[0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]


In [13]:
# Hold out 10% of the padded training sequences for validation (fixed seed).
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train, y_train, train_size=0.9, random_state=0
)
for split in (X_tra, y_tra, X_val):
    print(split.shape)
X_tra

(131706, 800)
(131706, 4)
(14635, 800)


array([[    0,     0,     0, ...,     0,     0,    51],
       [    0,     0,     0, ..., 20876,  2629,  1248],
       [    0,     0,     0, ...,     0,     0,     0],
       ...,
       [    0,     0,     0, ...,     0, 18080, 48934],
       [    0,     0,     0, ...,     0,     0,     0],
       [    0,     0,     0, ...,     0,     0,     0]], dtype=int32)

In [14]:
class RocAucEvaluation(Callback):
    """Print the ROC-AUC on a held-out set at the end of selected epochs.

    Args:
        validation_data: ``(X_val, y_val)`` pair; ``X_val`` is fed to
            ``model.predict`` and ``y_val`` is compared with the predictions
            through ``roc_auc_score``.
        interval: evaluate on epochs whose zero-based index is a multiple of
            this value.

    Raises:
        ValueError: if ``validation_data`` does not unpack into exactly two items.
    """

    def __init__(self, validation_data=(), interval=1):
        # Initialise the Callback base class itself; super(Callback, self)
        # would start the lookup *after* Callback and skip its __init__.
        super(RocAucEvaluation, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Score the current model on the held-out set and print the result."""
        # `logs` is accepted for the callback interface but not read here;
        # None replaces the shared mutable default dict.
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            print("\n ROC-AUC - epoch: {:d} - score: {:.6f}".format(epoch + 1, score))

In [None]:
# Weights file written by the checkpoint callback and reloaded before prediction.
filepath = cfg.data_path + "rcnn.hdf5"

# Per-epoch ROC-AUC report on the validation split.
f1_val = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)
# Keep only the weights with the highest validation accuracy.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)
# Stop once validation accuracy has not improved for 5 epochs.
early = EarlyStopping(monitor="val_acc", mode="max", patience=5)

callbacks_list = [f1_val, checkpoint, early]

In [None]:
# Train on the 90% split, validating on the held-out 10% after every epoch.
model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=callbacks_list,
    verbose=1,
)

Train on 131706 samples, validate on 14635 samples
Epoch 1/10
  1024/131706 [..............................] - ETA: 5:38:08 - loss: 1.3863 - acc: 0.3574

In [None]:
# Restore the checkpointed weights before scoring the padded test sequences.
model.load_weights(filepath)
print('Predicting....')
y_pred = model.predict(
    x_test,
    batch_size=batch_size,
    verbose=1,
)

In [None]:
# Index of the highest-scoring class for every test row.
y_p = y_pred.argmax(axis=1)

In [None]:
# `lookupTable` is not defined anywhere in this notebook, so evaluating it raised
# NameError on a fresh kernel; the stray inspection has been dropped.

In [None]:
# y_p comes from an argmax over axis 1, so this should be a 1-D shape.
y_p.shape

In [None]:
# Class index -> label name; the position in the list is the class index.
label_revserv_dict = dict(enumerate([
    '人类作者',
    '机器作者',
    '机器翻译',
    '自动摘要',
]))

In [None]:
# Show the index -> label-name mapping that is applied to the predictions.
label_revserv_dict

In [None]:
# Look the names up directly in the dict and build a new frame with assign():
# - no write into a slice of df_all (avoids SettingWithCopyWarning);
# - no np.vectorize, which infers a fixed-width string dtype from its first
#   result and raises on empty input;
# - an unexpected class index fails loudly (KeyError) instead of becoming None.
test = test.assign(label=[label_revserv_dict[idx] for idx in y_p])
test.head()

In [None]:
# Write the submission: Id and predicted label only, without header or index.
test[['Id', 'label']].to_csv(
    cfg.data_path + 'rcnn_result.csv',
    header=False,
    index=False,
)