In [1]:
import re
import os
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import XLNetTokenizer, TFXLNetModel,TFXLNetPreTrainedModel,TFXLNetMainLayer,TFSequenceSummary,XLNetConfig,XLNetModel
from transformers.modeling_tf_utils import get_initializer
from tensorflow.python.platform import tf_logging as logging

In [15]:
# xlnet_config = XLNetConfig.from_json_file('model/xlnet_mid/config.json')
# xlnet_model=TFXLNetModel.from_pretrained('model/xlnet_mid/pytorch_model.bin',config=xlnet_config,from_pt=True)

In [2]:
class InputFeatures(object):
    """Plain record holding one tokenized input together with its label.

    Args:
        input_ids: Token ids produced by the tokenizer.
        token_type_ids: Segment ids produced by the tokenizer.
        attention_mask: Attention mask produced by the tokenizer.
        label: Class label; stored after conversion with ``int()``.
    """

    def __init__(self, input_ids, token_type_ids, attention_mask, label):
        self.input_ids, self.token_type_ids, self.attention_mask = (
            input_ids,
            token_type_ids,
            attention_mask,
        )
        # Labels may arrive as strings or floats; always keep an int.
        self.label = int(label)


class InputExample(object):
    """A labelled pair of queries with punctuation removed from both.

    Args:
        category: Category value, stored unchanged.
        query1: First query text.
        query2: Second query text.
        label: Class label; stored after conversion with ``int()``.
    """

    def __init__(self, category, query1, query2, label):
        punctuation = ''';'",.!?；‘’“”，。！？'''
        # Character class matching one or more of the marks listed above.
        self.re_punctuation = '[{}]+'.format(punctuation)
        self.category = category
        self.query1, self.query2 = (
            re.sub(self.re_punctuation, '', text) for text in (query1, query2)
        )
        self.label = int(label)

    def convert_to_features(self, tokenizer, trans=False):
        """Encode the two queries as a single pair and wrap the result.

        Args:
            tokenizer: Object exposing ``encode_plus``.
            trans: When true, the queries are passed in swapped order
                (``query2`` first).

        Returns:
            An ``InputFeatures`` built from the ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` entries of the encoding.
        """
        if trans:
            first, second = self.query2, self.query1
        else:
            first, second = self.query1, self.query2
        encoded = tokenizer.encode_plus(first, second, max_length=64, pad_to_max_length=True)
        return InputFeatures(
            encoded['input_ids'],
            encoded['token_type_ids'],
            encoded['attention_mask'],
            self.label,
        )


class DataProcess(object):
    """Read labelled query pairs from CSV and turn them into encoder features.

    Each query is tokenized on its own and passed through ``model.predict``;
    the two resulting encodings are then concatenated along axis 1.

    Args:
        data_path: Directory that contains the CSV files.
        tokenizer: Object exposing ``encode_plus``.
        model: Object exposing ``predict``; it must return one row per input
            example (this is asserted in ``_get_features``).
        max_length: Length every single query is padded to. Defaults to 32.
        batch_size: Batch size used while encoding. Defaults to 64.
    """

    def __init__(self, data_path, tokenizer, model, max_length=32, batch_size=64):
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.model = model
        self.max_length = max_length
        self.batch_size = batch_size

    @staticmethod
    def _num_steps(count, batch_size):
        """Return how many batches of ``batch_size`` cover ``count`` items."""
        # Ceiling division: ``count // batch_size + 1`` would ask for one batch
        # too many whenever ``count`` is an exact multiple of ``batch_size``.
        return -(-count // batch_size)

    def getDataSet(self, file_name):
        """Build a ``(features, label)`` dataset from a CSV file in ``data_path``.

        Args:
            file_name: CSV file name, relative to ``data_path``.

        Returns:
            A tuple ``(dataset, length)`` where ``dataset`` zips the encoded
            features with their labels and ``length`` is the number of rows.

        Raises:
            FileNotFoundError: If the CSV file does not exist.
            ValueError: If the CSV file yields no usable rows.
        """
        examples = self._get_examples(os.path.join(self.data_path, file_name))
        encoded, labels = self._get_features(examples)
        length = len(encoded)
        dataset = tf.data.Dataset.zip((tf.data.Dataset.from_tensor_slices(encoded), labels))
        return dataset, length

    def savePredictData(self, file_name='result.csv'):
        """Placeholder: only resolves the default file name, writes nothing."""
        if file_name is None:
            file_name = 'result.csv'

    def _get_examples(self, file_name):
        """Load ``InputExample`` objects from ``file_name``.

        Rows containing missing values are dropped. The CSV must provide the
        columns ``category``, ``query1``, ``query2`` and ``label``.

        Raises:
            FileNotFoundError: If ``file_name`` does not exist.
        """
        if not os.path.exists(file_name):
            raise FileNotFoundError('{0} not found.'.format(file_name))
        data = pd.read_csv(file_name).dropna()
        return [
            InputExample(row['category'], row['query1'], row['query2'], row['label'])
            for _, row in data.iterrows()
        ]

    def _encode_query(self, text, label):
        """Tokenize a single query and wrap the encoding in ``InputFeatures``."""
        encoded = self.tokenizer.encode_plus(text, max_length=self.max_length, pad_to_max_length=True)
        return InputFeatures(
            encoded['input_ids'], encoded['token_type_ids'], encoded['attention_mask'], label)

    def _get_features(self, examples):
        """Encode both queries of every example with ``self.model``.

        Returns:
            A tuple ``(features, labels)``: the two encodings concatenated
            along axis 1, and a dataset of ``[label]`` rows.

        Raises:
            ValueError: If ``examples`` is empty.
        """
        if not examples:
            raise ValueError('no examples to encode.')

        features_a = [self._encode_query(e.query1, e.label) for e in examples]
        features_b = [self._encode_query(e.query2, e.label) for e in examples]
        labels = [[e.label] for e in examples]

        batches_a = self._get_dataset(features_a).batch(self.batch_size)
        batches_b = self._get_dataset(features_b).batch(self.batch_size)

        steps = self._num_steps(len(labels), self.batch_size)
        encode_a = self.model.predict(batches_a, steps=steps)
        encode_b = self.model.predict(batches_b, steps=steps)
        # Every example must have produced exactly one encoding.
        assert len(encode_a) == len(labels)
        assert len(encode_b) == len(labels)

        return tf.concat([encode_a, encode_b], axis=1), tf.data.Dataset.from_tensor_slices(labels)

    def _get_dataset(self, features):
        """Wrap a list of ``InputFeatures`` in a generator-backed dataset of int32 dicts."""
        keys = ('input_ids', 'attention_mask', 'token_type_ids')

        def gen():
            for ex in features:
                yield {key: getattr(ex, key) for key in keys}

        return tf.data.Dataset.from_generator(
            gen,
            {key: tf.int32 for key in keys},
            {key: tf.TensorShape([None]) for key in keys})

In [3]:
# Tokenizer and encoder are both loaded from the same local checkpoint directory.
xlnet_checkpoint = 'model/xlnet'
xlnet_tokenizer = XLNetTokenizer.from_pretrained(xlnet_checkpoint)
xlnet_model = TFXLNetModel.from_pretrained(xlnet_checkpoint)

In [4]:
# Encode every split once with the pretrained model; each call returns (dataset, row count).
data_process = DataProcess(data_path='data', tokenizer=xlnet_tokenizer, model=xlnet_model)
load_split = data_process.getDataSet
train_dataset, train_length = load_split('train.csv')
vaild_dataset, valid_length = load_split('dev.csv')
tests_dataset, tests_length = load_split('test.csv')

In [5]:
# Display the element spec (shapes and dtypes) of the test dataset before it is batched.
tests_dataset

<ZipDataset shapes: ((64, 768), (1,)), types: (tf.float32, tf.int32)>

In [6]:
# Shuffle over the whole split and batch by 64. The train and validation
# datasets repeat indefinitely, so fit() must be given explicit step counts;
# the test dataset is not repeated.
train_dataset = train_dataset.shuffle(buffer_size=train_length).batch(batch_size=64).repeat(count=-1)
vaild_dataset = vaild_dataset.shuffle(buffer_size=valid_length).batch(batch_size=64).repeat(count=-1)
tests_dataset = tests_dataset.shuffle(buffer_size=tests_length).batch(batch_size=64)

In [14]:
# Binary classifier over 768-wide feature sequences: one LSTM, three
# bidirectional LSTMs (the last one returns only its final state), then a
# single sigmoid unit.
model = tf.keras.Sequential([
    tf.keras.layers.LSTM(512, return_sequences=True, dropout=0.2, input_shape=(None, 768)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512, return_sequences=True, dropout=0.2)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512, return_sequences=True, dropout=0.2)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512, dropout=0.2)),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer=tf.keras.optimizers.Adam(2e-5),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Print the layer-by-layer output shapes and parameter counts of the classifier.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_12 (LSTM)               (None, None, 512)         2623488   
_________________________________________________________________
bidirectional_9 (Bidirection (None, None, 1024)        4198400   
_________________________________________________________________
bidirectional_10 (Bidirectio (None, None, 1024)        6295552   
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 1024)              6295552   
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 1025      
Total params: 19,414,017
Trainable params: 19,414,017
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Step counts use floor division, so each epoch runs train_length // 64
# training batches and valid_length // 64 validation batches.
train_steps, valid_steps = train_length // 64, valid_length // 64
fit_options = dict(
    epochs=6,
    steps_per_epoch=train_steps,
    validation_data=vaild_dataset,
    validation_steps=valid_steps,
    verbose=2,
)
# callbacks=[custom_callback] is left disabled, as in the original cell.
model.fit(train_dataset, **fit_options)

Train for 136 steps, validate for 31 steps
Epoch 1/6
136/136 - 18s - loss: 0.4162 - accuracy: 0.8061 - val_loss: 0.6993 - val_accuracy: 0.6552
Epoch 2/6
136/136 - 16s - loss: 0.3743 - accuracy: 0.8279 - val_loss: 0.7122 - val_accuracy: 0.6764
Epoch 3/6
136/136 - 16s - loss: 0.3365 - accuracy: 0.8504 - val_loss: 0.7767 - val_accuracy: 0.6809
Epoch 4/6
136/136 - 16s - loss: 0.2948 - accuracy: 0.8754 - val_loss: 0.8132 - val_accuracy: 0.6895
Epoch 5/6
136/136 - 16s - loss: 0.2798 - accuracy: 0.8810 - val_loss: 0.7877 - val_accuracy: 0.6739
Epoch 6/6
136/136 - 16s - loss: 0.2427 - accuracy: 0.8973 - val_loss: 0.8921 - val_accuracy: 0.6850


<tensorflow.python.keras.callbacks.History at 0x20b68d75388>

In [None]:
# Predict over one full pass of each split. The datasets repeat forever, so
# the step count must be exactly ceil(length / 64); `length // 64 + 1` would
# read one batch into the next repetition whenever length is a multiple of 64.
# The validation split is named `vaild_dataset` / `valid_length` in this
# notebook (`dev_dataset` is never defined, and a dataset cannot be divided).
train_steps = -(-train_length // 64)
valid_steps = -(-valid_length // 64)
query_a_datas = model.predict(train_dataset, steps=train_steps)
query_b_datas = model.predict(vaild_dataset, steps=valid_steps)

In [10]:
class CustomCallback(tf.keras.callbacks.Callback):
    """Unfreeze the model's ``transformer`` and stop early on a monitored metric.

    At the end of every epoch the monitored value is read from ``logs``:

    * once it exceeds ``baseline`` and the model has a non-trainable
      ``transformer`` attribute, that attribute is marked trainable;
    * once it exceeds ``target_accuracy`` and more than two earlier epochs
      fell short of that condition, training is stopped.

    Args:
        monitor: Key looked up in the epoch ``logs``.
        baseline: Threshold above which the transformer is unfrozen.
        target_accuracy: Threshold above which training may stop.
    """

    def __init__(self, monitor='val_accuracy', baseline=0.85, target_accuracy=0.877):
        # The Keras base class sets up its own state in __init__, so it must run.
        super().__init__()
        self.baseline = baseline
        self.monitor = monitor
        self.target_accuracy = target_accuracy
        self.count = 0

    def on_epoch_end(self, epoch, logs=None):
        """Apply the unfreeze and early-stop rules for the finished epoch."""
        current = self.get_monitor_value(logs)
        # Models without a `transformer` attribute (e.g. a plain Sequential)
        # skip the unfreeze step instead of raising AttributeError.
        transformer = getattr(self.model, 'transformer', None)
        if transformer is not None and current > self.baseline and not transformer.trainable:
            logging.info('current `%s` is %s ,begin train all params', self.monitor, current)
            # NOTE(review): Keras documents that a change to `trainable` only
            # takes effect after compile() is called again - confirm whether a
            # re-compile is needed here.
            transformer.trainable = True
            self.model.summary()
        if current > self.target_accuracy and self.count > 2:
            self.model.stop_training = True
        else:
            self.count = self.count + 1

    def get_monitor_value(self, logs):
        """Return ``logs[self.monitor]``, or 0.0 when the key is missing or None."""
        monitor_value = (logs or {}).get(self.monitor)
        return 0.0 if monitor_value is None else monitor_value


custom_callback = CustomCallback(baseline=0.86, target_accuracy=0.95)

In [17]:
# The validation split is named `vaild_dataset` / `valid_length` in this
# notebook; `dev_dataset` and `dev_length` are not defined by any cell.
train_steps = train_length//64
valid_steps = valid_length//64
model.fit(train_dataset,
          epochs=6,
          steps_per_epoch=train_steps,
          validation_data=vaild_dataset,
          validation_steps=valid_steps,
          verbose=2,
#           callbacks=[custom_callback]
          )

Train for 136 steps, validate for 31 steps
Epoch 1/6
136/136 - 68s - loss: 0.3340 - accuracy: 0.8543 - val_loss: 0.2988 - val_accuracy: 0.8740
Epoch 2/6
136/136 - 68s - loss: 0.3271 - accuracy: 0.8543 - val_loss: 0.2894 - val_accuracy: 0.8821
Epoch 3/6
136/136 - 68s - loss: 0.3128 - accuracy: 0.8620 - val_loss: 0.2759 - val_accuracy: 0.8795
Epoch 4/6
136/136 - 68s - loss: 0.3192 - accuracy: 0.8552 - val_loss: 0.3264 - val_accuracy: 0.8725
Epoch 5/6
136/136 - 68s - loss: 0.3089 - accuracy: 0.8647 - val_loss: 0.3033 - val_accuracy: 0.8765
Epoch 6/6
136/136 - 68s - loss: 0.3088 - accuracy: 0.8623 - val_loss: 0.3070 - val_accuracy: 0.8780


<tensorflow.python.keras.callbacks.History at 0x14e3cb31a08>

In [51]:
# DataProcess defines no `getTestDataSet`; load the test split through
# `getDataSet`, which yields rows in file order (unshuffled), so prediction i
# belongs to row i.
# NOTE(review): this relies on test.csv having the same columns (including
# `label`) as the other splits - confirm.
test_dataset, test_length = data_process.getDataSet('test.csv')
test_dataset = test_dataset.batch(2)
# Ceiling division: exactly as many steps as there are batches.
test_steps = -(-test_length // 2)
predict_data = model.predict(test_dataset, steps=test_steps)

In [59]:
# Flatten to 1-D and shift by 0.5 so that a later integer cast thresholds at 0.5.
# reshape(-1) is used instead of squeeze() so a single prediction stays a
# length-1 array rather than collapsing to a 0-d scalar.
predict_data = np.reshape(predict_data, -1) + 0.5


In [60]:
# Preview the integer class labels; the cast truncates the shifted scores.
predict_data.astype(int)

array([1, 1, 1, 0, 0])

In [64]:
# Pair each integer label with its positional index as [id, label].
predict_data = list(map(list, enumerate(predict_data.astype(int))))

In [65]:
# Tabulate the [id, label] pairs.
predict_data = pd.DataFrame(data=predict_data, columns=['id', 'label'])

In [66]:
# Display the id/label table.
predict_data

Unnamed: 0,id,label
0,0,1
1,1,1
2,2,1
3,3,0
4,4,0


In [None]:
# Without a path, to_csv() only returns the CSV text and writes no file.
# Write to 'result.csv' (the default name used by DataProcess.savePredictData)
# and drop the index, since the frame already has an explicit `id` column.
predict_data.to_csv('result.csv', index=False)