In [3]:
import re
import os
import tensorflow as tf
import numpy as np
import pandas as pd
from transformers import TFBertForSequenceClassification,BertTokenizer,BertConfig

In [2]:
# Load the BERT configuration from the JSON file stored in the model directory.
config = BertConfig.from_json_file('model/roberta/bert_config.json')

In [32]:
import argparse
import logging

import torch

from transformers import BertConfig, BertForPreTraining, load_tf_weights_in_bert

def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    """Convert a TensorFlow BERT checkpoint into a PyTorch state-dict file.

    Args:
        tf_checkpoint_path: Path of the TensorFlow checkpoint to read weights from.
        bert_config_file: Path of the JSON file describing the BERT configuration.
        pytorch_dump_path: Destination path for the serialized PyTorch state dict.

    Returns:
        None. The configuration and the destination path are printed, and the
        state dict is written to ``pytorch_dump_path``.
    """
    # Build an untrained PyTorch model whose architecture follows the JSON config.
    bert_config = BertConfig.from_json_file(bert_config_file)
    print("Building PyTorch model from configuration: {}".format(str(bert_config)))
    pt_model = BertForPreTraining(bert_config)

    # Copy the TensorFlow checkpoint weights into the PyTorch model.
    load_tf_weights_in_bert(pt_model, bert_config, tf_checkpoint_path)

    # Serialize only the weights (state dict), not the whole model object.
    print("Save PyTorch model to {}".format(pytorch_dump_path))
    weights = pt_model.state_dict()
    torch.save(weights, pytorch_dump_path)

In [35]:
# Convert the TensorFlow checkpoint in model/roberta into a PyTorch state dict saved as pt.bin.
convert_tf_checkpoint_to_pytorch('model/roberta/bert_model.ckpt','model/roberta/bert_config.json','model/roberta/pt.bin')

Building PyTorch model from configuration: BertConfig {
  "architectures": null,
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "directionality": "bidi",
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_beams": 1,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "num_return_sequences": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "output_past": true,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pool

In [4]:
# Build the TF sequence-classification model from the converted PyTorch weights (from_pt=True),
# using the config loaded earlier.
# NOTE(review): pt.bin was saved from BertForPreTraining, so the classification head presumably
# starts from a fresh initialization rather than from checkpoint weights — confirm.
model = TFBertForSequenceClassification.from_pretrained('model/roberta/pt.bin',config=config,from_pt=True)

In [5]:
# Persist the TF model into the model directory so later loads can skip the PyTorch conversion.
model.save_pretrained('model/roberta')

In [6]:
# Reload the model from the directory written by save_pretrained (no from_pt flag this time).
model = TFBertForSequenceClassification.from_pretrained('model/roberta')

In [8]:
# Put a sigmoid on the classifier head so the model emits a probability; this pairs with the
# 'binary_crossentropy' loss passed to model.compile further down.
model.classifier.activation=tf.keras.activations.sigmoid
# Freeze the BERT encoder so that only the classifier head is updated during training.
model.bert.trainable=False

In [9]:
# Print the per-layer parameter counts and the trainable / non-trainable totals.
model.summary()

Model: "tf_bert_for_sequence_classification_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  102267648 
_________________________________________________________________
dropout_75 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  769       
Total params: 102,268,417
Trainable params: 769
Non-trainable params: 102,267,648
_________________________________________________________________


In [10]:
# Load the tokenizer from the model directory; InputExample.convert_to_features reads this global.
tokenizer = BertTokenizer.from_pretrained('./model/roberta')

In [11]:
class InputFeatures(object):
    """Encoded model inputs for one query pair, bundled with its label.

    Attributes:
        input_ids: Stored exactly as passed in.
        token_type_ids: Stored exactly as passed in.
        attention_mask: Stored exactly as passed in.
        label: The given label coerced with ``int()``.
    """

    def __init__(self, input_ids, token_type_ids, attention_mask, label):
        # The three encoded sequences are kept untouched.
        self.input_ids, self.token_type_ids, self.attention_mask = (
            input_ids,
            token_type_ids,
            attention_mask,
        )
        # Labels may arrive as strings or numpy scalars; normalise to a plain int.
        self.label = int(label)
        
class InputExample(object):
    """One labelled query pair with punctuation stripped from both queries.

    Attributes:
        re_punctuation: Regex pattern matching runs of the ASCII and full-width
            punctuation characters that are removed from the queries.
        category: Stored exactly as passed in.
        query1: First query with all punctuation matches removed.
        query2: Second query with all punctuation matches removed.
        label: The given label coerced with ``int()``.
    """

    def __init__(self, category, query1, query2, label):
        punctuation_pattern = '[{}]+'.format(''';'",.!?；‘’“”，。！？''')
        self.re_punctuation = punctuation_pattern
        self.category = category
        # Apply the same clean-up to both queries.
        self.query1, self.query2 = (
            re.sub(punctuation_pattern, '', text) for text in (query1, query2)
        )
        self.label = int(label)

    def convert_to_features(self, trans=False):
        """Encode the pair with the module-level ``tokenizer``.

        Args:
            trans: When true, the queries are passed to the tokenizer in swapped
                order (query2 first, then query1).

        Returns:
            An ``InputFeatures`` built from the encoded ``input_ids``,
            ``token_type_ids`` and ``attention_mask`` plus this example's label.
        """
        # Choose the order in which the two queries are handed to the tokenizer.
        if trans:
            first, second = self.query2, self.query1
        else:
            first, second = self.query1, self.query2
        encoded = tokenizer.encode_plus(first, second, max_length=64, pad_to_max_length=True)
        return InputFeatures(
            encoded['input_ids'],
            encoded['token_type_ids'],
            encoded['attention_mask'],
            self.label,
        )

        
def read_file(data_path):
    """Read a CSV file into a DataFrame, discarding rows with missing values.

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        The parsed DataFrame with every row containing a missing value dropped.

    Raises:
        FileNotFoundError: If ``data_path`` does not exist.
    """
    # Fail early with an explicit message when the path is missing.
    if not os.path.exists(data_path):
        raise FileNotFoundError('{0} not found.'.format(data_path))
    frame = pd.read_csv(data_path)
    return frame.dropna()

def get_examples(data_path):
    """Build one ``InputExample`` per row of the CSV at ``data_path``.

    Args:
        data_path: Path of a CSV file providing the columns ``category``,
            ``query1``, ``query2`` and ``label``.

    Returns:
        A list of ``InputExample`` objects in file order (rows with missing
        values are already removed by ``read_file``).
    """
    return [
        InputExample(row['category'], row['query1'], row['query2'], row['label'])
        for _, row in read_file(data_path).iterrows()
    ]

def get_features(examples):
    """Encode every example twice: in its original and in its swapped query order.

    Args:
        examples: Iterable of objects exposing ``convert_to_features(trans)``.

    Returns:
        A flat list holding, for each example in turn, the feature for the
        original order followed by the feature for the swapped order.
    """
    return [
        example.convert_to_features(swapped)
        for example in examples
        for swapped in (False, True)
    ]

def get_dataset(features):
    """Wrap a sequence of feature objects in a generator-backed ``tf.data.Dataset``.

    Args:
        features: Iterable of objects exposing ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``label``.

    Returns:
        A dataset whose elements are ``(inputs, label)`` pairs, where ``inputs``
        is a dict with int32 rank-1 entries for the three input keys and
        ``label`` is an int64 scalar.
    """
    input_keys = ('input_ids', 'attention_mask', 'token_type_ids')

    def generate_pairs():
        # One (inputs dict, label) element per feature, in the given order.
        for feature in features:
            model_inputs = {key: getattr(feature, key) for key in input_keys}
            yield (model_inputs, feature.label)

    # Declared element signature: variable-length int32 vectors plus a scalar int64 label.
    output_types = ({key: tf.int32 for key in input_keys}, tf.int64)
    output_shapes = (
        {key: tf.TensorShape([None]) for key in input_keys},
        tf.TensorShape([]),
    )
    return tf.data.Dataset.from_generator(generate_pairs, output_types, output_shapes)

In [12]:
# Parse the train and dev CSVs into lists of InputExample (read_file drops rows with missing values).
train_data = get_examples('data/train.csv')
dev_data = get_examples('data/dev.csv')

In [13]:
# Encode each example in both query orders, so every feature list is twice as long as its example list.
train_features = get_features(train_data)
dev_features = get_features(dev_data)

In [14]:
# Number of whole batches per pass; 64 matches the batch size passed to `.batch(64)` further down.
# Floor division leaves any trailing partial batch uncounted.
train_steps = len(train_features) // 64
valid_steps = len(dev_features) // 64

In [15]:
# Display the number of training steps per epoch.
train_steps

273

In [16]:
# Display the number of validation steps per epoch.
valid_steps

62

In [17]:
# Wrap the feature lists in generator-backed tf.data datasets of (inputs dict, label) elements.
train_dataset = get_dataset(train_features)
dev_dataset = get_dataset(dev_features)

In [18]:
# Build both pipelines straight from the feature lists so this cell can be re-run safely:
# chaining onto an already-batched `train_dataset` would batch the batches on a second run.
# The shuffle buffer covers the whole training set. get_features() emits each pair right next
# to its swapped twin, so a 128-element window would leave the rows close to file order.
train_dataset = get_dataset(train_features).shuffle(len(train_features)).batch(64).repeat(-1)
# The dev set is left unshuffled so every validation pass sees the same batches.
dev_dataset = get_dataset(dev_features).batch(64).repeat(-1)

In [19]:
# Display the training dataset's element shapes and dtypes.
train_dataset

<RepeatDataset shapes: ({input_ids: (None, None), attention_mask: (None, None), token_type_ids: (None, None)}, (None,)), types: ({input_ids: tf.int32, attention_mask: tf.int32, token_type_ids: tf.int32}, tf.int64)>

In [20]:
# Display the dev dataset's element shapes and dtypes.
dev_dataset

<RepeatDataset shapes: ({input_ids: (None, None), attention_mask: (None, None), token_type_ids: (None, None)}, (None,)), types: ({input_ids: tf.int32, attention_mask: tf.int32, token_type_ids: tf.int32}, tf.int64)>

In [21]:
# Smoke test: run one forward pass on an arbitrary query pair, encoded with the same
# max_length / padding settings as the training features, to inspect the output.
model(tokenizer.encode_plus('阿斯蒂芬','大师傅',max_length=64,pad_to_max_length=True,return_tensors='tf'))

(<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.5056159]], dtype=float32)>,)

In [22]:
# Binary cross-entropy pairs with the sigmoid activation set on the classifier head above;
# the optimizer is selected by name, so no learning rate is specified here.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

In [23]:
# Train for 6 epochs (the BERT encoder was frozen above, leaving the classifier head trainable).
# Both datasets repeat indefinitely, so steps_per_epoch / validation_steps define where each
# training epoch and validation pass end.
model.fit(train_dataset,
          epochs=6,
          steps_per_epoch=train_steps,
          validation_data=dev_dataset,
          validation_steps=valid_steps,
          verbose=2)

Train for 273 steps, validate for 62 steps
Epoch 1/6
273/273 - 69s - loss: 0.6584 - accuracy: 0.6128 - val_loss: 0.6384 - val_accuracy: 0.6525
Epoch 2/6
273/273 - 55s - loss: 0.6426 - accuracy: 0.6310 - val_loss: 0.6187 - val_accuracy: 0.6550
Epoch 3/6
273/273 - 55s - loss: 0.6309 - accuracy: 0.6427 - val_loss: 0.6065 - val_accuracy: 0.6678
Epoch 4/6
273/273 - 55s - loss: 0.6275 - accuracy: 0.6516 - val_loss: 0.6099 - val_accuracy: 0.6467
Epoch 5/6
273/273 - 55s - loss: 0.6269 - accuracy: 0.6472 - val_loss: 0.6101 - val_accuracy: 0.6444
Epoch 6/6
273/273 - 55s - loss: 0.6189 - accuracy: 0.6546 - val_loss: 0.5902 - val_accuracy: 0.6797


<tensorflow.python.keras.callbacks.History at 0x20e99e380c8>