# 文本相似度

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset

In [3]:
dataset = load_dataset("json", data_files="../data/train_pair_1w.json", split="train")
datasets = dataset.train_test_split(test_size=0.2)
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

In [13]:
for i in range(100):
    print(datasets["train"][i]["label"])

0
0
0
1
0
1
0
0
1
0
1
1
0
1
1
0
1
0
1
1
0
0
1
1
0
1
0
0
0
0
0
0
1
0
1
0
1
1
1
0
0
0
0
0
1
1
0
0
1
0
0
1
1
1
1
0
1
0
1
0
1
0
0
0
1
1
0
0
0
0
1
0
1
1
1
0
1
0
0
0
0
1
0
1
1
0
0
0
0
0
1
1
1
1
0
1
1
1
0
0


In [5]:
import torch

tokenizer = AutoTokenizer.from_pretrained("../hfl/chinese-macbert-base")

def process_function(examples):
    sentences = []
    labels = []

    for sen1, sen2, label in zip(examples["sentence1"], examples["sentence2"], examples["label"]):
        sentences.append(sen1)
        sentences.append(sen2)
        labels.append(1 if int(label) == 1 else -1) # 因为CosineSimilarity的输入是-1和1，所以这里需要转换一下

    tokenizer_examples = tokenizer(sentences, max_length=128, truncation=True, padding="max_length")
    tokenizer_examples = {k: [v[i:i+2] for i in range(0, len(v), 2)] for k, v in tokenizer_examples.items()} # 这里的意思是将tokenizer的输出进行拆分，每个句子有两个（sen1和sen2，所以需要拆分一下
    tokenizer_examples["labels"] = labels
    return tokenizer_examples


tokenized_datasets = datasets.map(process_function, batched=True, remove_columns=datasets["train"].column_names)
tokenized_datasets



Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

不进行预处理直接tokenized

In [7]:
tokenized_example = tokenizer(datasets["train"][0]["sentence1"], datasets["train"][0]["sentence2"], max_length=128, truncation=True, padding="max_length")
print(tokenized_example)

{'input_ids': [101, 671, 702, 7946, 1355, 1957, 782, 8024, 4959, 4708, 3340, 5292, 3683, 1825, 2225, 8024, 1762, 3691, 2094, 677, 3235, 3189, 1045, 3861, 511, 102, 6929, 702, 1957, 782, 4959, 4708, 671, 1947, 3807, 6132, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

预处理后进行tokenized

In [6]:
print(tokenized_datasets["train"][0])

{'input_ids': [[101, 671, 702, 7946, 1355, 1957, 782, 8024, 4959, 4708, 3340, 5292, 3683, 1825, 2225, 8024, 1762, 3691, 2094, 677, 3235, 3189, 1045, 3861, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 6929, 702, 1957, 782, 4959, 4708, 671, 1947, 3807, 6132, 511, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
from transformers import BertForSequenceClassification, BertPreTrainedModel, BertModel
from typing import Optional
from transformers.configuration_utils import PretrainedConfig
from torch.nn import CosineSimilarity, CosineEmbeddingLoss


class DualModel(BertPreTrainedModel):

    def __init__(self, config: PretrainedConfig, *inputs, **kwargs):
        super().__init__(config, *inputs, **kwargs)
        self.bert = BertModel(config)
        self. post_init()

    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        token_type_ids: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        inputs_embeds: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ):
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # Step1 分别获取sentenceA 和 sentenceB的输入
        senA_input_ids, senB_input_ids = input_ids[:, 0], input_ids[:, 1] # 在对数据进行预处理时，已经将input_ids进行了拼接，所以这里只需要获取前半部分和后半部分即可
        senA_attention_mask, senB_attention_mask = attention_mask[:, 0], attention_mask[:, 1]
        senA_token_type_ids, senB_token_type_ids = token_type_ids[:, 0], token_type_ids[:, 1]

        # Step2 分别获取sentenceA 和 sentenceB的向量表示
        senA_outputs = self.bert(
            senA_input_ids,
            attention_mask=senA_attention_mask,
            token_type_ids=senA_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        senA_pooled_output = senA_outputs[1] # [batch, hidden]

        senB_outputs = self.bert(
            senB_input_ids,
            attention_mask=senB_attention_mask,
            token_type_ids=senB_token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        senB_pooled_output = senB_outputs[1] # [batch, hidden]
        
        # step3 计算相似度
        cos = CosineSimilarity()(senA_pooled_output, senB_pooled_output) # [batch, ]

        # step4 计算loss
        loss = None
        if labels is not None:
            loss_fct = CosineEmbeddingLoss(0.3)
            loss = loss_fct(senA_pooled_output, senB_pooled_output, labels)

        output = (cos,) # [batch, ]
        return ((loss,) + output) if loss is not None else output
    
model = DualModel.from_pretrained("../hfl/chinese-macbert-base")
model

loading configuration file ../hfl/chinese-macbert-base\config.json
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_token_transform",
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 21128
}

loading weights file ../hfl/chinese-macbert-base\pytorch_model.bin
  return torch.load(checkpoint_file, map_location="cpu

DualModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=

In [14]:
import evaluate

acc_metric = evaluate.load("./metric_accuracy.py")
f1_metirc = evaluate.load("./metric_f1.py")

In [15]:
def eval_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = [int(p > 0.7) for p in predictions]
    labels = [int(l > 0) for l in labels] 
    # predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metirc.compute(predictions=predictions, references=labels)
    acc.update(f1)
    return acc

In [16]:
train_args = TrainingArguments(output_dir="./dual_model",      # 输出文件夹
                               per_device_train_batch_size=32,  # 训练时的batch_size
                               per_device_eval_batch_size=32,   # 验证时的batch_size
                               logging_steps=10,                # log 打印的频率
                               evaluation_strategy="epoch",           # 评估策略
                               save_strategy="epoch",           # 保存策略
                               save_total_limit=3,              # 最大保存数
                               learning_rate=2e-5,              # 学习率
                               weight_decay=0.01,               # weight_decay
                               metric_for_best_model="f1",      # 设定评估指标
                               load_best_model_at_end=True)     # 训练完成后加载最优模型

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [17]:
trainer = Trainer(model=model, 
                  args=train_args, 
                  tokenizer=tokenizer,
                  train_dataset=tokenized_datasets["train"], 
                  eval_dataset=tokenized_datasets["test"], 
                  compute_metrics=eval_metric)

In [18]:
trainer.train()

***** Running training *****
  Num examples = 8,000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 750
  Number of trainable parameters = 102,267,648


  0%|          | 0/750 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.3569, 'learning_rate': 1.9733333333333336e-05, 'epoch': 0.04}
{'loss': 0.2905, 'learning_rate': 1.9466666666666668e-05, 'epoch': 0.08}
{'loss': 0.3001, 'learning_rate': 1.9200000000000003e-05, 'epoch': 0.12}
{'loss': 0.2964, 'learning_rate': 1.8933333333333334e-05, 'epoch': 0.16}
{'loss': 0.2788, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.2}
{'loss': 0.2594, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.24}
{'loss': 0.2508, 'learning_rate': 1.8133333333333335e-05, 'epoch': 0.28}
{'loss': 0.2603, 'learning_rate': 1.7866666666666666e-05, 'epoch': 0.32}
{'loss': 0.2322, 'learning_rate': 1.76e-05, 'epoch': 0.36}
{'loss': 0.2282, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.4}
{'loss': 0.2359, 'learning_rate': 1.706666666666667e-05, 'epoch': 0.44}
{'loss': 0.24, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.48}
{'loss': 0.2486, 'learning_rate': 1.6533333333333333e-05, 'epoch': 0.52}
{'loss': 0.2453, 'learning_rate': 1.6266666666666668e-05, 'epoch': 0.56

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32


{'loss': 0.2044, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}


  0%|          | 0/63 [00:00<?, ?it/s]

Saving model checkpoint to ./dual_model\checkpoint-250
Configuration saved in ./dual_model\checkpoint-250\config.json


{'eval_loss': 0.19514742493629456, 'eval_accuracy': 0.769, 'eval_f1': 0.7444690265486725, 'eval_runtime': 14.5196, 'eval_samples_per_second': 137.745, 'eval_steps_per_second': 4.339, 'epoch': 1.0}


Model weights saved in ./dual_model\checkpoint-250\pytorch_model.bin
tokenizer config file saved in ./dual_model\checkpoint-250\tokenizer_config.json
Special tokens file saved in ./dual_model\checkpoint-250\special_tokens_map.json


{'loss': 0.1746, 'learning_rate': 1.3066666666666668e-05, 'epoch': 1.04}
{'loss': 0.1787, 'learning_rate': 1.2800000000000001e-05, 'epoch': 1.08}
{'loss': 0.1722, 'learning_rate': 1.2533333333333336e-05, 'epoch': 1.12}
{'loss': 0.1781, 'learning_rate': 1.2266666666666667e-05, 'epoch': 1.16}
{'loss': 0.1596, 'learning_rate': 1.2e-05, 'epoch': 1.2}
{'loss': 0.1851, 'learning_rate': 1.1733333333333335e-05, 'epoch': 1.24}
{'loss': 0.1544, 'learning_rate': 1.1466666666666668e-05, 'epoch': 1.28}
{'loss': 0.1464, 'learning_rate': 1.1200000000000001e-05, 'epoch': 1.32}
{'loss': 0.1953, 'learning_rate': 1.0933333333333334e-05, 'epoch': 1.36}
{'loss': 0.1545, 'learning_rate': 1.0666666666666667e-05, 'epoch': 1.4}
{'loss': 0.1805, 'learning_rate': 1.04e-05, 'epoch': 1.44}
{'loss': 0.1699, 'learning_rate': 1.0133333333333335e-05, 'epoch': 1.48}
{'loss': 0.1825, 'learning_rate': 9.866666666666668e-06, 'epoch': 1.52}
{'loss': 0.168, 'learning_rate': 9.600000000000001e-06, 'epoch': 1.56}
{'loss': 0.1

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32


{'loss': 0.1686, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


  0%|          | 0/63 [00:00<?, ?it/s]

Saving model checkpoint to ./dual_model\checkpoint-500
Configuration saved in ./dual_model\checkpoint-500\config.json


{'eval_loss': 0.1861504763364792, 'eval_accuracy': 0.784, 'eval_f1': 0.740072202166065, 'eval_runtime': 14.2756, 'eval_samples_per_second': 140.099, 'eval_steps_per_second': 4.413, 'epoch': 2.0}


Model weights saved in ./dual_model\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./dual_model\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./dual_model\checkpoint-500\special_tokens_map.json


{'loss': 0.1442, 'learning_rate': 6.4000000000000006e-06, 'epoch': 2.04}
{'loss': 0.123, 'learning_rate': 6.133333333333334e-06, 'epoch': 2.08}
{'loss': 0.1237, 'learning_rate': 5.8666666666666675e-06, 'epoch': 2.12}
{'loss': 0.1421, 'learning_rate': 5.600000000000001e-06, 'epoch': 2.16}
{'loss': 0.1393, 'learning_rate': 5.333333333333334e-06, 'epoch': 2.2}
{'loss': 0.129, 'learning_rate': 5.0666666666666676e-06, 'epoch': 2.24}
{'loss': 0.127, 'learning_rate': 4.800000000000001e-06, 'epoch': 2.28}
{'loss': 0.1434, 'learning_rate': 4.533333333333334e-06, 'epoch': 2.32}
{'loss': 0.1306, 'learning_rate': 4.266666666666668e-06, 'epoch': 2.36}
{'loss': 0.1309, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.4}
{'loss': 0.1362, 'learning_rate': 3.7333333333333337e-06, 'epoch': 2.44}
{'loss': 0.1222, 'learning_rate': 3.4666666666666672e-06, 'epoch': 2.48}
{'loss': 0.1386, 'learning_rate': 3.2000000000000003e-06, 'epoch': 2.52}
{'loss': 0.1244, 'learning_rate': 2.9333333333333338e-06, 'epoc

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32


{'loss': 0.1412, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/63 [00:00<?, ?it/s]

Saving model checkpoint to ./dual_model\checkpoint-750
Configuration saved in ./dual_model\checkpoint-750\config.json


{'eval_loss': 0.1821373850107193, 'eval_accuracy': 0.7845, 'eval_f1': 0.741141141141141, 'eval_runtime': 14.3323, 'eval_samples_per_second': 139.545, 'eval_steps_per_second': 4.396, 'epoch': 3.0}


Model weights saved in ./dual_model\checkpoint-750\pytorch_model.bin
tokenizer config file saved in ./dual_model\checkpoint-750\tokenizer_config.json
Special tokens file saved in ./dual_model\checkpoint-750\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./dual_model\checkpoint-250 (score: 0.7444690265486725).
  state_dict = torch.load(best_model_path, map_location="cpu")


{'train_runtime': 655.2067, 'train_samples_per_second': 36.63, 'train_steps_per_second': 1.145, 'train_loss': 0.1814866623878479, 'epoch': 3.0}


TrainOutput(global_step=750, training_loss=0.1814866623878479, metrics={'train_runtime': 655.2067, 'train_samples_per_second': 36.63, 'train_steps_per_second': 1.145, 'train_loss': 0.1814866623878479, 'epoch': 3.0})

In [19]:
trainer.evaluate(tokenized_datasets["test"])

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 32


  0%|          | 0/63 [00:00<?, ?it/s]

{'eval_loss': 0.19514742493629456,
 'eval_accuracy': 0.769,
 'eval_f1': 0.7444690265486725,
 'eval_runtime': 15.0488,
 'eval_samples_per_second': 132.901,
 'eval_steps_per_second': 4.186,
 'epoch': 3.0}

In [20]:
class SentenceSimilarityPipeline:

    def __init__(self, model, tokenizer) -> None:
        self.model = model.bert
        self.tokenizer = tokenizer
        self.device = model.device

    def preprocess(self, senA, senB):
        return self.tokenizer([senA, senB], max_length=128, truncation=True, return_tensors="pt", padding=True)

    def predict(self, inputs):
        inputs = {k: v.to(self.device) for k, v in inputs.items()}
        return self.model(**inputs)[1]  # [2, 768]

    def postprocess(self, logits):
        cos = CosineSimilarity()(logits[None, 0, :], logits[None,1, :]).squeeze().cpu().item()
        return cos

    def __call__(self, senA, senB, return_vector=False):
        inputs = self.preprocess(senA, senB)
        logits = self.predict(inputs)
        result = self.postprocess(logits)
        if return_vector:
            return result, logits
        else:
            return result

In [21]:
pipe = SentenceSimilarityPipeline(model, tokenizer)

In [22]:
pipe("我喜欢北京", "明天不行", return_vector=True)

(0.5883079767227173,
 tensor([[-0.8260, -0.9930,  0.8790,  ...,  0.7587,  0.1873, -0.4133],
         [-0.9946, -0.9996, -0.7728,  ...,  0.8266,  0.5358, -0.7195]],
        device='cuda:0', grad_fn=<TanhBackward0>))