In [1]:
import re
import os
import tensorflow as tf
import numpy as np
import pandas as pd
from tqdm import tqdm
from transformers import TFBertPreTrainedModel,BertTokenizer,BertConfig,TFBertMainLayer,TFBertForSequenceClassification
from transformers.modeling_tf_utils import get_initializer
from tensorflow.python.platform import tf_logging as logging

In [24]:
# Sanity check: decode a hand-written id sequence to inspect the [CLS]/[SEP] layout of a sentence pair.
# NOTE(review): `bert_tokenizer` is only created in a later cell of this notebook, so this cell
# fails on a fresh kernel when the notebook is run top to bottom.
bert_tokenizer.decode([101, 7350, 3172, 5881, 5705, 102, 5855, 3791, 102])

'[CLS] 阿 斯 蒂 芬 [SEP] 萨 法 [SEP]'

In [9]:
# Load the tokenizer from the same checkpoint path the model itself is loaded from later on.
bert_tokenizer = BertTokenizer.from_pretrained("model/chinese-roberta-wwm-ext-large")
class InputFeatures(object):
    """Plain record holding ``input_ids``, ``token_type_ids``, ``attention_mask`` and ``label``."""

    def __init__(self,input_ids,token_type_ids,attention_mask,label):
        # All four values are stored exactly as given; the label is not one-hot encoded.
        (self.input_ids,
         self.token_type_ids,
         self.attention_mask,
         self.label) = (input_ids, token_type_ids, attention_mask, label)
        
        
class InputExample(object):
    """One raw sentence pair with its category and integer label.

    On construction both queries have runs of the punctuation characters listed
    in ``re_punctuation`` removed, and the label is coerced with ``int()``.
    """

    def __init__(self,category,query1,query2,label):
        # Character class covering the ASCII and full-width punctuation to strip.
        self.re_punctuation='[{}]+'.format(''';'",.!?；‘’“”，。！？''')
        self.category=category
        self.query1=re.sub(self.re_punctuation, '', query1)
        self.query2=re.sub(self.re_punctuation, '', query2 )
        self.label=int(label)

    def convert_to_features(self,tokenizer,trans=False,max_length=64):
        """Encode the pair with ``tokenizer.encode_plus`` and wrap it as ``InputFeatures``.

        Args:
            tokenizer: object exposing ``encode_plus(text_a, text_b, max_length=...,
                pad_to_max_length=...)`` and returning a mapping with the keys
                ``input_ids``, ``token_type_ids`` and ``attention_mask``.
            trans: when true the pair is encoded as (query2, query1) instead of
                (query1, query2).
            max_length: value forwarded as ``max_length`` to ``encode_plus``;
                defaults to the previously hard-coded 64.

        Returns:
            InputFeatures built from the three encoded fields and ``self.label``.
        """
        # Only the argument order differs between the two modes, so pick the
        # order first and issue a single encode call.
        first, second = (self.query2, self.query1) if trans else (self.query1, self.query2)
        encode_data=tokenizer.encode_plus(first,second,max_length=max_length,pad_to_max_length=True)
        return InputFeatures(encode_data['input_ids'],encode_data['token_type_ids'],encode_data['attention_mask'],self.label)

class DataProcess(object):
    """Reads CSV splits located under ``data_path`` and exposes them as ``tf.data`` datasets."""

    def __init__(self,data_path,tokenizer):
        self.data_path=data_path
        self.tokenizer=tokenizer

    def getTrainDataSet(self,file_name=None):
        """Return ``(dataset, feature_count)`` for the training split (default ``train.csv``)."""
        return self._load_split(file_name, 'train.csv')

    def getValidDataSet(self,file_name=None):
        """Return ``(dataset, feature_count)`` for the validation split (default ``dev.csv``)."""
        return self._load_split(file_name, 'dev.csv')

    def getTestDataSet(self,file_name=None):
        """Return ``(dataset, feature_count)`` for the test split (default ``test.csv``)."""
        return self._load_split(file_name, 'test.csv')

    def savePredictData(self,file_name=None):
        """Placeholder: resolves the default file name (``result.csv``) but writes nothing."""
        if file_name is None:
            file_name = 'result.csv'

    def _load_split(self,file_name,default_name):
        """Shared body of the three public getters: read, encode, wrap as a dataset."""
        if file_name is None:
            file_name = default_name
        examples = self._get_examples(os.path.join(self.data_path,file_name))
        features = self._get_features(examples,is_exchange=False)
        return self._get_dataset(features),len(features)

    def _get_examples(self,file_name):
        """Read ``file_name`` as CSV, drop rows with missing values, build ``InputExample`` objects.

        Raises:
            FileNotFoundError: if ``file_name`` does not exist.
        """
        if not os.path.exists(file_name):
            # The message previously referenced an undefined name, which surfaced
            # as NameError instead of the intended FileNotFoundError.
            raise FileNotFoundError('{0} not found.'.format(file_name))
        data = pd.read_csv(file_name).dropna()
        return [
            InputExample(line['category'],line['query1'],line['query2'],line['label'])
            for _, line in data.iterrows()
        ]

    def _get_features(self,examples,is_exchange=True):
        """Encode every example; with ``is_exchange`` also add the swapped-order encoding."""
        features=[]
        for e in examples:
            features.append(e.convert_to_features(self.tokenizer,False))
            if is_exchange:
                features.append(e.convert_to_features(self.tokenizer,True))
        return features

    def _get_dataset(self,features):
        """Wrap ``features`` in a generator-backed dataset of ``(inputs_dict, label)`` pairs."""
        def gen():
            for ex in features:
                yield ({'input_ids': ex.input_ids,'attention_mask': ex.attention_mask,'token_type_ids': ex.token_type_ids},ex.label)
        return tf.data.Dataset.from_generator(gen,
                                              ({'input_ids': tf.int32,
                                                'attention_mask': tf.int32,
                                                'token_type_ids': tf.int32},
                                               tf.int32),
                                              ({'input_ids': tf.TensorShape([None]),
                                                'attention_mask': tf.TensorShape([None]),
                                                'token_type_ids': tf.TensorShape([None])},
                                               tf.TensorShape([])))

In [10]:
# Build the three splits from the `data` directory; each getter returns (dataset, number_of_features).
data_process = DataProcess(data_path='data',tokenizer=bert_tokenizer)
train_dataset,train_length = data_process.getTrainDataSet()
valid_dataset,valid_length = data_process.getValidDataSet()
tests_dataset,tests_length = data_process.getTestDataSet()

In [11]:
# Train/valid: shuffle with a buffer as large as the split, batch by 64, then repeat,
# so fit() has to be given explicit step counts. Test: batched only.
# NOTE(review): `tests_dataset` is not referenced again in the cells shown; the prediction
# cell rebuilds its own `test_dataset`.
train_dataset = train_dataset.shuffle(train_length).batch(64).repeat(-1)
valid_dataset = valid_dataset.shuffle(valid_length).batch(64).repeat(-1)
tests_dataset = tests_dataset.batch(64)

In [22]:
class TFBertForYiQing(TFBertPreTrainedModel):
    """BERT main layer followed by dropout and a single-unit sigmoid classifier."""

    def __init__(self, config, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = TFBertMainLayer(config, name="bert")
        # NOTE(review): this attaches a new LSTM as `bert.pool`, but nothing in this file ever
        # calls `bert.pool`; call() below reads outputs[1] of the main layer. Confirm whether
        # the LSTM was meant to replace the encoder's pooling step.
        self.bert.pool = tf.keras.layers.LSTM(config.hidden_size,kernel_initializer=get_initializer(config.initializer_range))
        # Stored but not used below: the classifier is fixed to one output unit.
        self.num_labels = config.num_labels
        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
        self.classifier = tf.keras.layers.Dense(
            1, kernel_initializer=get_initializer(config.initializer_range), activation='sigmoid',name="classifier"
        )
        
#         self.lstm1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(256,return_sequences= True,dropout=0.2))
#         self.lstm2 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512,return_sequences= True,dropout=0.2))
#         self.lstm3 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512,return_sequences= True,dropout=0.2))
#         self.lstm4 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(512,dropout=0.2))

    def call(self, inputs, **kwargs):
        """Run the encoder, apply dropout to its second output and classify it.

        Dropout is active only when a truthy ``training`` keyword is passed.
        The returned tensor has already gone through the classifier's sigmoid
        activation, so despite the local name ``logits`` it is not a raw logit.
        """
        outputs = self.bert(inputs, **kwargs)
        # presumably the pooled sentence representation (per the variable name) -- TODO confirm
        pooled_output = outputs[1]
        pooled_output = self.dropout(pooled_output, training=kwargs.get("training", False))
        logits = self.classifier(pooled_output)
        return logits

# Instantiate the custom model from the checkpoint path and compile it for binary classification
# (sigmoid output + binary cross-entropy, Adam at 2e-5).
model = TFBertForYiQing.from_pretrained('model/chinese-roberta-wwm-ext-large')
# model = TFBertForSequenceClassification.from_pretrained('model/chinese-roberta-wwm-ext-large')
# Fine-tune the encoder weights as well, not only the classifier head.
model.bert.trainable=True
model.compile(optimizer=tf.keras.optimizers.Adam(2e-5),loss='binary_crossentropy',metrics=['accuracy'])

In [23]:
# Print the per-layer parameter counts of the model.
model.summary()

Model: "tf_bert_for_yi_qing_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  325522432 
_________________________________________________________________
dropout_369 (Dropout)        multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1025      
Total params: 325,523,457
Trainable params: 325,523,457
Non-trainable params: 0
_________________________________________________________________


In [10]:
class CustomCallback(tf.keras.callbacks.Callback):
    """Unfreezes the BERT encoder past a baseline and stops training past a target.

    Args:
        monitor: key looked up in the epoch ``logs`` dict.
        baseline: once the monitored value exceeds this and ``model.bert`` is
            not trainable, the encoder is switched to trainable.
        target_accuracy: once the monitored value exceeds this and more than
            two earlier epochs have been counted, training is stopped.
    """

    def __init__(self,monitor='val_accuracy',baseline=0.85,target_accuracy = 0.877):
        # Let the Keras base class set up its own state before adding ours.
        super().__init__()
        self.baseline=baseline
        self.monitor=monitor
        self.target_accuracy=target_accuracy
        self.count = 0

    def on_epoch_end(self, epoch, logs=None):
        """Check the monitored value after each epoch and unfreeze / stop as configured."""
        current = self.get_monitor_value(logs)
        # The encoder lives at `model.bert` (the attribute assigned two lines below);
        # the previous `model.transformer` lookup does not exist on this model.
        if current > self.baseline and not self.model.bert.trainable:
            logging.info('current `%s` is %s ,begin train all params',self.monitor,current)
            # NOTE(review): Keras documents that a `trainable` change made after compile()
            # only takes effect once the model is compiled again -- confirm whether a
            # re-compile is needed here.
            self.model.bert.trainable=True
            self.model.summary()
        if current > self.target_accuracy and self.count > 2:
            self.model.stop_training = True
        else:
            # Counts epochs that did not trigger the stop condition.
            self.count = self.count + 1

    def get_monitor_value(self, logs):
        """Return ``logs[self.monitor]``, or 0.0 when the key is absent or ``logs`` is empty."""
        logs = logs or {}
        monitor_value = logs.get(self.monitor)
        if monitor_value is None:
            monitor_value = 0.0
        return monitor_value

# Instantiate the callback; note the fit() cell currently has its `callbacks` argument commented out.
custom_callback = CustomCallback(baseline=0.86,target_accuracy=0.95)

In [18]:
# Mark the encoder as trainable (same assignment as in the model-construction cell).
model.bert.trainable=True

In [17]:
# Steps needed to cover each split exactly once. Ceiling division avoids the extra
# step that `length // 64 + 1` produced whenever the length is an exact multiple of
# the batch size (the datasets repeat, so that extra step re-used a batch).
batch_size = 64  # must match the .batch(64) applied to train_dataset / valid_dataset
train_steps = (train_length + batch_size - 1) // batch_size
valid_steps = (valid_length + batch_size - 1) // batch_size
model.fit(train_dataset,
          epochs=3,
          steps_per_epoch=train_steps,
          validation_data=valid_dataset,
          validation_steps=valid_steps,
          verbose=2,
#           callbacks=[custom_callback]
          )

Train for 137 steps, validate for 32 steps
Epoch 1/3
137/137 - 106s - loss: 0.6751 - accuracy: 0.5957 - val_loss: 0.6732 - val_accuracy: 0.5984
Epoch 2/3
137/137 - 78s - loss: 0.6745 - accuracy: 0.5991 - val_loss: 0.6716 - val_accuracy: 0.5984
Epoch 3/3
137/137 - 78s - loss: 0.6735 - accuracy: 0.6007 - val_loss: 0.6720 - val_accuracy: 0.5984


<tensorflow.python.keras.callbacks.History at 0x207549eaa08>

In [51]:
# Rebuild the test split (no shuffling) and predict in small batches.
test_dataset,test_length=data_process.getTestDataSet()
predict_batch_size = 2
test_dataset = test_dataset.batch(predict_batch_size)
# Ceiling division: this dataset is not repeated, so `test_length // 2 + 1` asked for
# one batch more than exists whenever test_length is even.
test_steps = (test_length + predict_batch_size - 1) // predict_batch_size
predict_data = model.predict(test_dataset,steps=test_steps)

In [59]:
# Flatten the predictions to 1-D and add 0.5 so that the later `.astype(int)` truncation
# rounds each probability to the nearest class. np.ravel keeps a 1-D array even for a
# single prediction, where np.squeeze would return a 0-d array that cannot be enumerated.
# NOTE: this cell is not idempotent -- re-running it adds another 0.5.
predict_data = np.ravel(predict_data)+0.5


In [60]:
# Preview the integer class labels (the result is displayed only, not stored).
predict_data.astype(int)

array([1, 1, 1, 0, 0])

In [64]:
# Pair each integer label with its position in the prediction array.
# NOTE(review): rebinds `predict_data` from an array to a list, so this cell cannot be re-run
# on its own result (a list has no `.astype`).
predict_data = [ [i,d] for i,d in enumerate(predict_data.astype(int))]

In [65]:
# Turn the [position, label] rows into a two-column frame named `id` / `label`.
predict_data = pd.DataFrame(predict_data,columns=['id','label'])

In [66]:
# Display the resulting frame.
predict_data

Unnamed: 0,id,label
0,0,1
1,1,1
2,2,1
3,3,0
4,4,0


In [None]:
# NOTE(review): no path is passed here, so no file name is specified for the output; pandas
# documents that `to_csv()` without a path returns the CSV text instead of writing a file.
# If the intent is to save the predictions, pass a path -- confirm the intended file name.
predict_data.to_csv()