In [1]:
import re
import os
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import XLNetTokenizer, TFXLNetModel,TFXLNetPreTrainedModel,TFXLNetMainLayer
from transformers.modeling_tf_utils import get_initializer

In [2]:
tokenizer = XLNetTokenizer.from_pretrained('../model/xlnet/tf_zh')

In [3]:
class InputFeatures(object):
    def __init__(self,input_ids,token_type_ids,attention_mask,label):
        self.input_ids=input_ids
        self.token_type_ids=token_type_ids
        self.attention_mask=attention_mask 
        self.label=int(label)
        
class InputExample(object):
    def __init__(self,category,query1,query2,label):
        self.re_punctuation='[{}]+'.format(''';'",.!?；‘’“”，。！？''')
        self.category=category
        self.query1=re.sub(self.re_punctuation, '', query1)
        self.query2=re.sub(self.re_punctuation, '', query2 )
        self.label=int(label)
        
    def convert_to_features(self,trans=False):
        encode_data=None
        if trans:
            encode_data=tokenizer.encode_plus(self.query2,self.query1,max_length=64,pad_to_max_length=True)
        else:
            encode_data=tokenizer.encode_plus(self.query1,self.query2,max_length=64,pad_to_max_length=True)
#         return model(inputs=encode_data['input_ids'],
#                      attention_mask=encode_data['attention_mask'],
#                      token_type_ids=encode_data['token_type_ids'])[0],self.label
        return InputFeatures(encode_data['input_ids'],encode_data['token_type_ids'],encode_data['attention_mask'],self.label)

        
def read_file(data_path):
    if os.path.exists(data_path):
        return pd.read_csv(data_path).dropna()
    else:
        raise FileNotFoundError('{0} not found.'.format(data_path))

def get_examples(data_path):
    examples = []
    for i,line in read_file(data_path).iterrows():
        examples.append(InputExample(line['category'],line['query1'],line['query2'],line['label']))
    return examples

def get_features(examples):
    features=[]
    for e in examples:
        features.append(e.convert_to_features(False))
        features.append(e.convert_to_features(True))
    return features

def get_dataset(features):
    def gen():
        for ex in features:
            yield ({'input_ids': ex.input_ids,'attention_mask': ex.attention_mask,'token_type_ids': ex.token_type_ids},ex.label)
    return tf.data.Dataset.from_generator(gen,
                                          ({'input_ids': tf.int32,
                                            'attention_mask': tf.int32,
                                            'token_type_ids': tf.int32},
                                           tf.int64),
                                          ({'input_ids': tf.TensorShape([None]),
                                            'attention_mask': tf.TensorShape([None]),
                                            'token_type_ids': tf.TensorShape([None])},
                                           tf.TensorShape([])))

In [4]:
train_data = get_examples('data/train.csv')
dev_data = get_examples('data/dev.csv')

In [5]:
train_features = get_features(train_data)
dev_features = get_features(dev_data)

In [6]:
train_dataset = get_dataset(train_features)
dev_dataset = get_dataset(dev_features)

In [7]:
train_dataset = train_dataset.shuffle(256).batch(64).repeat(-1)
dev_dataset = dev_dataset.shuffle(256).batch(64).repeat(-1)

In [12]:
class TFXLNetForYiQing(TFXLNetPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLNetMainLayer(config, name="transformer")
        self.first_dropout = tf.keras.layers.Dropout(0.2)
        self.lstm = tf.keras.layers.LSTM(256,activation='relu')
        self.last_dropout = tf.keras.layers.Dropout(0.2)
        self.logits_proj = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range),activation='sigmoid', name="logits_proj")
    def call(self, inputs, **kwargs):
        output = self.transformer(inputs, **kwargs)[0]
        output = self.first_dropout(output)
        output = self.lstm(output)
        output = self.last_dropout(output)
        output = self.logits_proj(output)
        return output


model = TFXLNetForYiQing.from_pretrained('../model/xlnet/tf_zh')
model.transformer.trainable=False
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [13]:
model.summary()

Model: "tfxl_net_for_yi_qing_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
transformer (TFXLNetMainLaye multiple                  116718336 
_________________________________________________________________
dropout_76 (Dropout)         multiple                  0         
_________________________________________________________________
lstm_1 (LSTM)                multiple                  1049600   
_________________________________________________________________
dropout_77 (Dropout)         multiple                  0         
_________________________________________________________________
logits_proj (Dense)          multiple                  257       
Total params: 117,768,193
Trainable params: 1,049,857
Non-trainable params: 116,718,336
_________________________________________________________________


In [None]:
train_steps = 273
valid_steps = 62
model.fit(train_dataset,
          epochs=6,
          steps_per_epoch=train_steps,
          validation_data=dev_dataset,
          validation_steps=valid_steps,
          verbose=2)

Train for 273 steps, validate for 62 steps
Epoch 1/6
273/273 - 84s - loss: 1117.8144 - accuracy: 0.5243 - val_loss: 743.8649 - val_accuracy: 0.5854
Epoch 2/6
273/273 - 72s - loss: 511.7227 - accuracy: 0.5175 - val_loss: 544.3617 - val_accuracy: 0.5827
Epoch 3/6
273/273 - 72s - loss: 496.6515 - accuracy: 0.5192 - val_loss: 943.3679 - val_accuracy: 0.4055
Epoch 4/6


In [96]:
print(dev_features)

[({'input_ids': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 19, 3018, 3210, 23757, 955, 66, 23834, 955, 33, 5614, 7644, 4, 19, 3018, 3210, 23757, 955, 66, 23834, 955, 11948, 100, 1481, 33, 5614, 7644, 4, 3], 'token_type_ids': [3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2], 'attention_mask': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}, 1), ({'input_ids': [5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 19, 3018, 3210, 23757, 955, 66, 23834, 955, 11948, 100, 1481, 33, 5614, 7644, 4, 19, 3018, 3210, 23757, 955, 66, 23834, 955, 33, 5614, 7644, 4, 3], 'token_type_ids': [3, 3, 3




In [17]:
data=pd.read_csv('data/train.csv')

In [18]:
data.loc[7885]

category                 哮喘
query1      孩子咳嗽哮喘，坚持凉水洗澡行吗
query2       孩子咳嗽哮喘吃什么药效果好？
label                     0
Name: 7885, dtype: object

In [49]:
tokenizer.decode([  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
            5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
            5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
            5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
            5,   5,   5,   5,   5,  19, 971,   4,  19, 620,   4,   3])

'<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad> a<sep> t<sep><cls>'

In [48]:
features_data

[{'input_ids': <tf.Tensor: id=7611, shape=(1, 64), dtype=int32, numpy=
  array([[  5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
            5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
            5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
            5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,   5,
            5,   5,   5,   5,   5,  19, 971,   4,  19, 620,   4,   3]])>,
  'token_type_ids': <tf.Tensor: id=7612, shape=(1, 64), dtype=int32, numpy=
  array([[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
          3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 1, 1, 1, 2]])>,
  'attention_mask': <tf.Tensor: id=7613, shape=(1, 64), dtype=int32, numpy=
  array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
          0, 0, 0, 0, 0, 

In [32]:
new_data

Unnamed: 0,category,query1,query2,label
0,咳血,请问呕血与咯血有什么区别,请问呕血与咯血这两者之间有什么区别,1
1,咳血,请问呕血与咯血有什么区别,请问呕血与咯血异同,1
2,咳血,请问呕血与咯血有什么区别,请问呕血与咯血怎么治疗,0
3,咳血,请问呕血与咯血有什么区别,请问呕血与咯血是什么原因导致的,0
4,咳血,请问呕血与咯血有什么区别,请问呕血与咯血与其他疾病有关联吗,0
...,...,...,...,...
1997,哮喘,变应性哮喘就是过敏性哮喘吗,变应性哮喘与过敏性哮喘一样吗,1
1998,哮喘,变应性哮喘就是过敏性哮喘吗,变应性哮喘是否就是过敏性哮喘,1
1999,哮喘,变应性哮喘就是过敏性哮喘吗,变应性哮喘的饮食禁忌有哪些,0
2000,哮喘,变应性哮喘就是过敏性哮喘吗,变应性哮喘怎么治疗,0
