# Text Match

In [None]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments,
)

from datasets import load_dataset
import os

# Process-wide GPU/NCCL settings:
#   CUDA_VISIBLE_DEVICES=2  -> only physical GPU index 2 is exposed to CUDA
#   NCCL_P2P_DISABLE=1      -> turn off NCCL peer-to-peer transport
#   NCCL_IB_DISABLE=1       -> turn off NCCL InfiniBand transport
os.environ.update(
    {
        "CUDA_VISIBLE_DEVICES": "2",
        "NCCL_P2P_DISABLE": "1",
        "NCCL_IB_DISABLE": "1",
    }
)

In [None]:
# Read the local JSON file of sentence pairs and expose it as a single
# "train" split (a Dataset rather than a DatasetDict).
dataset = load_dataset(
    "json",
    data_files="./train_pair_1w.json",
    split="train",
)

In [None]:
# Display the first record to sanity-check the loaded fields.
dataset[0]

In [None]:
# Hold out 20% of the records as the "test" split; the rest stays in "train".
# NOTE(review): no seed is passed, so the split presumably differs between
# runs — confirm whether reproducibility matters here.
datasets = dataset.train_test_split(
    test_size=0.2,
)

## Preprocess Data

In [None]:
import torch

# Load the tokenizer from the local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(
    "../chinese-macbert-base",
)

def process(examples):
    """Tokenize a batch of sentence pairs and attach float labels.

    Args:
        examples: Batch mapping that provides the "sentence1", "sentence2"
            and "label" columns.

    Returns:
        The tokenizer output for the paired sentences (truncated to at most
        128 tokens, not padded here) with an extra "labels" entry holding
        each label converted to ``float``.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    encoded = tokenizer(
        first_sentences,
        second_sentences,
        max_length=128,
        truncation=True,
    )
    # Float targets — presumably because the model is used with a single
    # regression-style output (num_labels=1); confirm against the model cell.
    encoded["labels"] = list(map(float, examples["label"]))
    return encoded

# Tokenize both splits in batches and drop every original column so that only
# the tokenizer outputs and "labels" remain.
tokenized_datasets = datasets.map(
    process,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

In [None]:
# Display the tokenized splits to check their columns and sizes.
tokenized_datasets

## Model Training

In [None]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model from the local checkpoint with a single
# output (num_labels=1).
model = AutoModelForSequenceClassification.from_pretrained(
    "../chinese-macbert-base",
    num_labels=1,
)

In [None]:
import evaluate

# Load both metrics from local script copies instead of by hub name.
metric_f1 = evaluate.load("../evaluate/metrics/f1/f1.py")
metric_accuracy = evaluate.load("../evaluate/metrics/accuracy/accuracy.py")

In [None]:
import numpy as np

def compute_metrics(pred):
    """Turn raw model scores into binary decisions and score them.

    Args:
        pred: A pair that unpacks into ``(predictions, labels)``.

    Returns:
        The accuracy result dict updated with the entries of the F1 result.
    """
    raw_scores, gold = pred

    # Flatten the scores and call anything strictly above 0.5 a positive (1).
    scores = np.asarray(raw_scores).flatten()
    decided = (scores > 0.5).astype(int).tolist()
    references = list(map(int, gold))

    results = metric_accuracy.compute(predictions=decided, references=references)
    results.update(metric_f1.compute(predictions=decided, references=references))
    return results

In [None]:
# Training configuration, grouped by concern.
args = TrainingArguments(
    output_dir="./models_cross",
    # Batch sizes (per device)
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    # Log every 10 steps; evaluate and checkpoint once per epoch,
    # with save_total_limit capping retained checkpoints at 3
    logging_steps=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # Select the best checkpoint by "f1" and reload it when training ends
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

In [None]:
from transformers import DataCollatorWithPadding

# Padding is applied per batch by the collator, since tokenization above did
# not pad the sequences.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
)

In [None]:
# Run fine-tuning; the returned training output is displayed as the cell result.
trainer.train()