In [1]:
import re
import os
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import XLNetTokenizer, TFXLNetModel,TFXLNetPreTrainedModel,TFXLNetMainLayer,TFSequenceSummary,TFXLNetModel
from transformers.modeling_tf_utils import get_initializer

In [23]:
xlnet_tokenizer.encode_plus('地方根深蒂固','森岛帆高',max_length=32,pad_to_max_length=True,return_tensors='tf')

{'input_ids': <tf.Tensor: shape=(1, 32), dtype=int32, numpy=
 array([[   5,    5,    5,    5,    5,    5,    5,    5,    5,    5,    5,
            5,    5,    5,    5,    5,    5,    5,   19,  821,  648,  658,
          724, 2012,    4,   19,  417,  747, 6949,   98,    4,    3]])>,
 'token_type_ids': <tf.Tensor: shape=(1, 32), dtype=int32, numpy=
 array([[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0,
         0, 0, 0, 1, 1, 1, 1, 1, 1, 2]])>,
 'attention_mask': <tf.Tensor: shape=(1, 32), dtype=int32, numpy=
 array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])>}

In [33]:
xlnet_tokenizer = XLNetTokenizer.from_pretrained('model/xlnet')
xlnet_model = TFXLNetModel.from_pretrained('model/xlnet')
class InputFeatures(object):
    def __init__(self,input_ids,token_type_ids,attention_mask,input_b_ids,token_b_type_ids,attention_b_mask,label):
        self.input_ids=input_ids
        self.token_type_ids=token_type_ids
        self.attention_mask=attention_mask 
        
        self.input_b_ids=input_b_ids
        self.token_b_type_ids=token_b_type_ids
        self.attention_b_mask=attention_b_mask 
        
        self.label=int(label)
        
class InputExample(object):
    def __init__(self,category,query1,query2,label):
        self.re_punctuation='[{}]+'.format(''';'",.!?；‘’“”，。！？''')
        self.category=category
        self.query1=re.sub(self.re_punctuation, '', query1)
        self.query2=re.sub(self.re_punctuation, '', query2 )
        self.label=int(label)
        
    def convert_to_features(self,tokenizer,trans=False):
        encode_a=tokenizer.encode_plus(self.query1,max_length=32,pad_to_max_length=True)
        encode_b=tokenizer.encode_plus(self.query2,max_length=32,pad_to_max_length=True)
        
        return InputFeatures(encode_a['input_ids'],encode_a['token_type_ids'],encode_a['attention_mask'],
                             encode_b['input_ids'],encode_b['token_type_ids'],encode_b['attention_mask'],
                             self.label)

class DataProcess(object):
    def __init__(self,data_path,tokenizer,model):
        self.data_path=data_path
        self.tokenizer=tokenizer
        self.model=model
        
    def getTrainDataSet(self,file_name=None):
        if file_name is None:
            file_name = 'train.csv'
        examples = self._get_examples(os.path.join(self.data_path,file_name))
        features,labels = self._get_features(examples)
#         return self._get_dataset(features),len(features)
    
    def getValidDataSet(self,file_name=None):
        if file_name is None:
            file_name = 'dev.csv'
        examples = self._get_examples(os.path.join(self.data_path,file_name))
        features = self._get_features(examples,is_exchange=False)
        return self._get_dataset(features),len(features)
    
    def getTestDataSet(self,file_name=None):
        if file_name is None:
            file_name = 'test.csv'
        examples = self._get_examples(os.path.join(self.data_path,file_name))
        features,labels = self._get_features(examples)
        return features,labels
    
    def savePredictData(self,file_name=None):
        if file_name is None:
            file_name = 'result.csv'
    
    def _get_examples(self,file_name):
        if os.path.exists(file_name):
            data = pd.read_csv(file_name).dropna()
            examples = []
            for i,line in data.iterrows():
                examples.append(InputExample(line['category'],line['query1'],line['query2'],line['label']))
            return examples   
        else:
            raise FileNotFoundError('{0} not found.'.format(data_path))   
    def _get_features(self,examples):
        encode_a=[]
        encode_b=[]
        labels = []
        for e in examples:
            encode_a.append(self.tokenizer.encode_plus(e.query1,max_length=32,pad_to_max_length=True,return_tensors='tf'))
            encode_b.append(self.tokenizer.encode_plus(e.query2,max_length=32,pad_to_max_length=True,return_tensors='tf'))
            labels.append(e.label)
        encode_a = self._get_dataset(encode_a).batch(64)
        encode_b = self._get_dataset(encode_b).batch(64)
        
        steps = len(labels)//64+1
        print(encode_a)
        encode_a = self.model.predict(encode_a,steps=steps)
        encode_b = self.model.predict(encode_b,steps=steps)
        print(encode_a)
        return tf.concat([encode_a,encode_b],axis=-1),labels
    
    def _get_dataset(self,features):
        def gen():
            for ex in features:
                yield {'input_ids': ex.input_ids,'attention_mask': ex.attention_mask,'token_type_ids': ex.token_type_ids}
        return tf.data.Dataset.from_generator(gen,
                                              {'input_ids': tf.int32,
                                                'attention_mask': tf.int32,
                                                'token_type_ids': tf.int32},
                                              {'input_ids': tf.TensorShape([None]),
                                                'attention_mask': tf.TensorShape([None]),
                                                'token_type_ids': tf.TensorShape([None])})

In [34]:
data_process = DataProcess(data_path='data',tokenizer=xlnet_tokenizer,model=xlnet_model)

In [35]:
test_dataset,labels=data_process.getTestDataSet()

<BatchDataset shapes: {input_ids: (None, None), attention_mask: (None, None), token_type_ids: (None, None)}, types: {input_ids: tf.int32, attention_mask: tf.int32, token_type_ids: tf.int32}>


UnknownError: AttributeError: 'dict' object has no attribute 'input_ids'
Traceback (most recent call last):

  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow_core\python\ops\script_ops.py", line 236, in __call__
    ret = func(*args)

  File "C:\ProgramData\Anaconda3\lib\site-packages\tensorflow_core\python\data\ops\dataset_ops.py", line 789, in generator_py_func
    values = next(generator_state.get_iterator(iterator_id))

  File "<ipython-input-33-5122a0cf728e>", line 92, in gen
    yield {'input_ids': ex.input_ids,'attention_mask': ex.attention_mask,'token_type_ids': ex.token_type_ids}

AttributeError: 'dict' object has no attribute 'input_ids'


	 [[{{node PyFunc}}]]

In [7]:
data_process = DataProcess(data_path='data',tokenizer=xlnet_tokenizer)
train_dataset,train_length = data_process.getTrainDataSet()
dev_dataset,dev_length = data_process.getValidDataSet()
test_dataset,test_length=data_process.getTestDataSet()

In [8]:
train_dataset = train_dataset.shuffle(train_length).batch(64).repeat(-1)
dev_dataset = dev_dataset.shuffle(dev_length).batch(64).repeat(-1)
test_dataset = test_dataset.batch(2)

In [9]:
train_length

8747

In [14]:
class TFXLNetForYiQing(TFXLNetPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer1 = TFXLNetMainLayer(config, name="transformer1")
        self.transformer2 = TFXLNetMainLayer(config, name="transformer2")
#         self.seq_summary = TFSequenceSummary(config,name="seq_summary")
#         self.first_dropout = tf.keras.layers.Dropout(0.2)
        self.lstm1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128,return_sequences= True,dropout=0.2))
        self.lstm2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256,return_sequences= True,dropout=0.2))
        self.lstm3 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256,return_sequences= True,dropout=0.2))
        self.lstm4 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256,dropout=0.2))
        self.logits_proj = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range),activation='sigmoid', name="logits_proj")
    def call(self, inputs,
             attention_mask=None,token_type_ids=None,
             attention_b_mask=None,token_b_type_ids=None,
             **kwargs):
        input_ids = inputs.get('input_ids')
        attention_mask = inputs.get("attention_mask", attention_mask)
        token_type_ids = inputs.get("token_type_ids", token_type_ids)
        
        input_b_ids = inputs.get('input_b_ids',input_ids)
        attention_b_mask = inputs.get("attention_b_mask", attention_b_mask)
        token_b_type_ids = inputs.get("token_b_type_ids", token_b_type_ids)
        
        inputs_a = {'input_ids':input_ids,'attention_mask':attention_mask,'token_type_ids':token_type_ids}
        inputs_b = {'input_ids':input_b_ids,'attention_mask':attention_b_mask,'token_type_ids':token_b_type_ids}
        
        output_a = self.transformer1(inputs_a, **kwargs)[0]
        output_b = self.transformer2(inputs_b, **kwargs)[0]
        output =tf.concat([output_a,output_b],axis=1)
        
        output_lstm = self.lstm1(output)
        output_lstm = self.lstm2(output_lstm)
        output_lstm = self.lstm3(output_lstm)
        output_lstm = self.lstm4(output_lstm)
        output = self.logits_proj(output)
        return output


model = TFXLNetForYiQing.from_pretrained('model/xlnet')
model.transformer1.trainable=False
model.transformer2.trainable=False
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [15]:
model.summary()

Model: "tfxl_net_for_yi_qing_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
transformer1 (TFXLNetMainLay multiple                  116718336 
_________________________________________________________________
transformer2 (TFXLNetMainLay multiple                  116718336 
_________________________________________________________________
bidirectional_8 (Bidirection multiple                  918528    
_________________________________________________________________
bidirectional_9 (Bidirection multiple                  1050624   
_________________________________________________________________
bidirectional_10 (Bidirectio multiple                  1574912   
_________________________________________________________________
bidirectional_11 (Bidirectio multiple                  1574912   
_________________________________________________________________
logits_proj (Dense)          multiple       

In [16]:
train_steps = train_length//64
valid_steps = dev_length//64
model.fit(train_dataset,
          epochs=10,
          steps_per_epoch=train_steps,
          validation_data=dev_dataset,
          validation_steps=valid_steps,
          verbose=2)

Train for 136 steps, validate for 31 steps
Epoch 1/10


KeyboardInterrupt: 

In [16]:
MIN_FLOAT = -1e30
class TFXLNetForYiQing(TFXLNetPreTrainedModel):
    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.transformer = TFXLNetMainLayer(config, name="transformer")
#         self.seq_summary = TFSequenceSummary(config,name="seq_summary")
        self.lstm1 = tf.keras.layers.LSTM(256,dropout=0.2,name='lstm_a')
        self.lstm2 = tf.keras.layers.LSTM(256,dropout=0.2,name='lstm_b')
        self.logits_proj = tf.keras.layers.Dense(
            config.num_labels, kernel_initializer=get_initializer(config.initializer_range),activation='sigmoid', name="logits_proj")
        self.d_model = config.d_model
        
    def call(self, 
             inputs,
             attention_mask=None,
             token_type_ids=None,
             **kwargs):
        if isinstance(inputs, dict):
            input_ids = inputs.get("input_ids")
            attention_mask = inputs.get("attention_mask", attention_mask)
            token_type_ids = inputs.get("token_type_ids", token_type_ids)              
        else:
            input_ids = inputs
            
        if attention_mask is None:
            attention_mask = tf.ones_like(input_ids)
        if token_type_ids is None:
            token_type_ids = tf.ones_like(input_ids)
            
        token_type_ids = token_type_ids * attention_mask
        token_type_ids = tf.expand_dims(token_type_ids,axis=-1)
        token_type_ids = tf.tile(token_type_ids,multiples=[1,1,self.d_model])
        
        attention_mask = tf.expand_dims(attention_mask,axis=-1)
        attention_mask = tf.tile(attention_mask,multiples=[1,1,self.d_model])
        
        attention_mask = tf.cast(attention_mask,dtype=tf.float32)
        token_type_ids = tf.cast(token_type_ids,dtype=tf.float32)
        
        outputs = self.transformer(inputs, **kwargs)
        output = outputs[0]
#         output = output * attention_mask + MIN_FLOAT * (1.0 - attention_mask)
        
        seq_a = output * (1.0 - token_type_ids) + MIN_FLOAT * token_type_ids
        seq_b = output * token_type_ids + MIN_FLOAT * (1.0-token_type_ids)
                
        output_lstm1 = self.lstm1(seq_a+outputs[0])
        output_lstm2 = self.lstm2(seq_b+outputs[0])
        output = tf.concat([output_lstm1,output_lstm2],axis=1)
#         output = self.seq_summary(output)
        output = self.logits_proj(output)
        return output
        

model = TFXLNetForYiQing.from_pretrained('model/xlnet')
model.transformer.trainable=False
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [17]:
model.summary()

Model: "tfxl_net_for_yi_qing_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
transformer (TFXLNetMainLaye multiple                  116718336 
_________________________________________________________________
lstm_a (LSTM)                multiple                  1049600   
_________________________________________________________________
lstm_b (LSTM)                multiple                  1049600   
_________________________________________________________________
logits_proj (Dense)          multiple                  513       
Total params: 118,818,049
Trainable params: 2,099,713
Non-trainable params: 116,718,336
_________________________________________________________________


In [18]:
train_steps = 273
valid_steps = 62
model.fit(train_dataset,
          epochs=8,
          steps_per_epoch=train_steps,
          validation_data=dev_dataset,
          validation_steps=valid_steps,
          verbose=2)

Train for 273 steps, validate for 62 steps
Epoch 1/8
273/273 - 82s - loss: nan - accuracy: 0.6360 - val_loss: nan - val_accuracy: 0.5985
Epoch 2/8
273/273 - 68s - loss: nan - accuracy: 0.6001 - val_loss: nan - val_accuracy: 0.5968
Epoch 3/8
273/273 - 68s - loss: nan - accuracy: 0.5998 - val_loss: nan - val_accuracy: 0.5975
Epoch 4/8
273/273 - 68s - loss: nan - accuracy: 0.6010 - val_loss: nan - val_accuracy: 0.5983
Epoch 5/8
273/273 - 68s - loss: nan - accuracy: 0.5991 - val_loss: nan - val_accuracy: 0.5980
Epoch 6/8
273/273 - 68s - loss: nan - accuracy: 0.6005 - val_loss: nan - val_accuracy: 0.5970
Epoch 7/8
273/273 - 68s - loss: nan - accuracy: 0.6009 - val_loss: nan - val_accuracy: 0.5988
Epoch 8/8
273/273 - 68s - loss: nan - accuracy: 0.6005 - val_loss: nan - val_accuracy: 0.5988


<tensorflow.python.keras.callbacks.History at 0x2ce3efc6348>