## 文本相似模型1(Sentence Similarity)
- labels 只有 0 和 1,是二元分類的解決方案

### 載入相關套件 

In [None]:
from transformers import AutoTokenizer,AutoModelForSequenceClassification,TrainingArguments,Trainer
import evaluate
from datasets import load_dataset

In [2]:
# Hugging Face Hub id of the sentence-pair dataset used by this notebook.
DATASET_ID = 'roberthsu2003/for_Sentence_Similarity'

# Load every split of the dataset into a single DatasetDict.
datasets = load_dataset(DATASET_ID)
# Leave the DatasetDict as the last expression so the notebook shows its splits and columns.
datasets

DatasetDict({
    train: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['sentence1', 'sentence2', 'label'],
        num_rows: 2000
    })
})

### 數據集處理

In [5]:
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')


def process_function(examples):
    """Tokenize a batch of sentence pairs and attach integer labels.

    Args:
        examples: A batch handed over by ``datasets.map(..., batched=True)``; this
            function reads its ``'sentence1'``, ``'sentence2'`` and ``'label'`` columns.

    Returns:
        The tokenizer output for the sentence pairs (truncated to at most 128 tokens)
        with an additional ``'labels'`` entry containing each label cast with ``int``.
    """
    encoded = tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        truncation=True,
        max_length=128,
    )
    # Per the original author's note, examples['label'] holds strings, hence the int() cast.
    encoded['labels'] = list(map(int, examples['label']))
    return encoded


# Drop the raw text/label columns so only the model inputs and 'labels' remain.
original_columns = datasets['train'].column_names
tokenized_datasets = datasets.map(
    process_function,
    batched=True,
    remove_columns=original_columns,
)
tokenized_datasets

Map:   0%|          | 0/8000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 8000
    })
    test: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
})

In [7]:
from pprint import pprint
# Inspect the first tokenized training example to sanity-check the encoding.
first_train_example = tokenized_datasets['train'][0]
pprint(first_train_example, compact=True)


{'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1],
 'input_ids': [101, 4724, 3688, 5601, 4500, 784, 7938, 4277, 2094, 4638, 1962,
               8043, 102, 784, 7938, 4277, 2094, 4638, 4724, 3688, 5601, 1962,
               8043, 102],
 'labels': 1,
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
                    1, 1, 1, 1]}


### 建立模型

In [9]:
# Binary sentence-pair classifier: label 0 = not similar, label 1 = similar.
# Declaring the number of labels and both label mappings when the model is created
# keeps config.id2label and config.label2id consistent, and every checkpoint saved
# during training carries the readable class names instead of LABEL_0 / LABEL_1.
ID2LABEL = {0: '不相似', 1: '相似'}
model = AutoModelForSequenceClassification.from_pretrained(
    'google-bert/bert-base-chinese',
    num_labels=len(ID2LABEL),
    id2label=ID2LABEL,
    label2id={name: idx for idx, name in ID2LABEL.items()},
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 建立評估函數

In [14]:
# Metric objects are loaded once here and reused by every call to eval_metric.
acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def eval_metric(eval_predict):
    """Compute accuracy and F1 from class scores and reference labels.

    Args:
        eval_predict: A ``(predictions, labels)`` pair. ``predictions`` must support
            ``argmax(axis=-1)``; the index of the largest score along the last axis
            is taken as the predicted class.

    Returns:
        A dict that merges the results of the accuracy metric and the F1 metric
        (accuracy entries first, then F1 entries).
    """
    scores, references = eval_predict
    predicted_classes = scores.argmax(axis=-1)

    merged = {}
    for metric in (acc_metric, f1_metric):
        merged.update(metric.compute(predictions=predicted_classes, references=references))
    return merged

### 測試評估函數是否正確

In [15]:
# Sanity check for the eval_metric function using hand-made data.
import numpy as np

# Mock class scores, one row per sample: [score for class 0, score for class 1].
mock_predictions = np.array([
    [0.7, 0.3],  # 0.7 for class 0, 0.3 for class 1 -> predicted class 0
    [0.2, 0.8],
    [0.6, 0.4],
    [0.1, 0.9],
    [0.9, 0.1]
])

# Ground-truth labels; they equal the argmax of every row above.
mock_labels = np.array([0, 1, 0, 1, 0])

# Run the metric function and report the scores.
result = eval_metric((mock_predictions, mock_labels))
print("測試結果：")
print(f"準確率: {result['accuracy']:.3f}")
print(f"F1分數: {result['f1']:.3f}")

# Every mock prediction is correct, so both metrics must be perfect.
# Asserting (rather than only printing) makes a broken eval_metric stop the notebook here.
assert np.isclose(result['accuracy'], 1.0), f"unexpected accuracy: {result['accuracy']}"
assert np.isclose(result['f1'], 1.0), f"unexpected f1: {result['f1']}"

測試結果：
準確率: 1.000
F1分數: 1.000


### 建立TrainingArguments

In [None]:
train_args = TrainingArguments(
    # Directory that receives the checkpoints.
    output_dir="./sentence_similarity",
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Evaluate and checkpoint once per epoch, keeping at most 3 checkpoints on disk.
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=3,
    # When training ends, reload the checkpoint with the best "f1" score
    # (the key produced by the compute_metrics function).
    metric_for_best_model="f1",
    load_best_model_at_end=True,
    # Log every 50 steps and disable reporting to external experiment trackers.
    logging_steps=50,
    report_to="none",
)


### 建立Trainer

In [None]:
from transformers import DataCollatorWithPadding
# The dataset was tokenized without padding, so batches are padded by this collator.
batch_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire the model, hyper-parameters, data splits and metric function together.
trainer = Trainer(
    model=model,
    args=train_args,
    data_collator=batch_collator,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    compute_metrics=eval_metric,
)

### 模型訓練

In [None]:
# Fine-tune the model using the datasets, arguments and metric function given to the Trainer.
trainer.train()

### 模型預測

In [None]:
from transformers import pipeline

# Human-readable class names for the pipeline output; label2id is updated together
# with id2label so the model config is not left with mismatched mappings.
model.config.id2label = {0:'不相似', 1:'相似'}
model.config.label2id = {name: idx for idx, name in model.config.id2label.items()}
# Run inference on whatever device the model already lives on instead of hard-coding
# GPU 0, which raises an error on machines without CUDA.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=model.device)

In [None]:
# A single sentence pair: "text" is the first sentence, "text_pair" the second.
sentence_pair = {"text":"我喜歡台北", "text_pair":"台北是我喜歡的地方"}
pipe(sentence_pair)

### 不上傳,非完成版
- 原因是此模型一次只能比對一組句子(1:1),如果要拿一個句子和1000000個句子比對(1:1000000),效能會很差