In [2]:
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments,AutoModelForMultipleChoice


In [3]:
datasets = load_dataset('roberthsu2003/for_Multiple_Choice')
datasets

DatasetDict({
    train: Dataset({
        features: ['context', 'question', 'choices', 'answer'],
        num_rows: 5856
    })
    validation: Dataset({
        features: ['context', 'question', 'choices', 'answer'],
        num_rows: 1825
    })
})

## 數據集預處理

In [4]:
tokenizer = AutoTokenizer.from_pretrained('google-bert/bert-base-chinese')
tokenizer

BertTokenizerFast(name_or_path='google-bert/bert-base-chinese', vocab_size=21128, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [5]:
#只使用一筆資料的測試
from pprint import pprint
one_dataset = datasets['train'].select(range(1)) #one_dataset資料類型是Dataset
print(one_dataset)
contexts = []
question_choice = []
labels = []
ctx = one_dataset['context'][0]
question = one_dataset['question'][0]
answer = one_dataset['answer'][0]
choices = one_dataset["choices"][0]
for choice in choices:
    contexts.append(ctx)
    question_choice.append(question + " " + choice)
    
if len(choices) < 4:
    for _ in range(4 - len(choices)):
        contexts.append(ctx)
        question_choice.append(question + " " + "不知道")
labels.append(choices.index(answer))        


tokenizer_example = tokenizer(contexts, question_choice, truncation="only_first", max_length=256, padding="max_length")
tokenizer_example['labels'] = labels
tokenized_example = {k: [v[i:i+4] for i in range(0,len(v),4)] for k, v in tokenizer_example.items()}
pprint(tokenized_example,compact=True)

Dataset({
    features: ['context', 'question', 'choices', 'answer'],
    num_rows: 1
})
{'attention_mask': [[[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
                      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                      0, 0, 0, 0, 0, 0, 0,

In [8]:
#一次1筆資料使用function的測試

def process_function(examples):
    contexts = []
    question_choice = []
    labels = []
    for idx in range(len(examples["context"])):
        ctx = examples['context'][idx]
        question = examples['question'][idx]
        
        choices = examples["choices"][idx]
        for choice in choices:
            contexts.append(ctx)
            question_choice.append(question + " " + choice)
            
        if len(choices) < 4:
            for _ in range(4 - len(choices)):
                contexts.append(ctx)
                question_choice.append(question + " " + "不知道")                
        answer = examples['answer'][idx]
        labels.append(choices.index(answer))
    tokenizer_example = tokenizer(contexts, question_choice, truncation="only_first", max_length=256, padding="max_length")    
    tokenized_example = {k: [v[i:i+4] for i in range(0,len(v),4)] for k, v in tokenizer_example.items()}
    tokenizer_example['labels'] = labels
    return tokenized_example


In [9]:
#測試10筆
res = datasets["train"].select(range(10)).map(process_function, batched=True)
res

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Dataset({
    features: ['context', 'question', 'choices', 'answer', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 10
})

In [102]:
# 檢查維度
import numpy as np
np.array(res['input_ids']).shape

(10, 4, 256)

In [10]:
tokenized_c3 = datasets.map(process_function, batched=True,remove_columns=datasets['train'].column_names)
tokenized_c3

Map:   0%|          | 0/5856 [00:00<?, ? examples/s]

Map:   0%|          | 0/1825 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5856
    })
    validation: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1825
    })
})

### 建立模型

In [106]:
model = AutoModelForMultipleChoice.from_pretrained('google-bert/bert-base-chinese')

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 建立評估函數

In [None]:
import numpy as np
accuracy = evaluate.load("accuracy")

def compute_metric(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions,axis=-1)  
    return accuracy.compute(predictions=predictions, references=labels)

##測試評估函式compute_metric(pred)
# 假資料
# 假設我們有 5 個樣本，每個樣本有 4 個選項
num_examples = 5
num_choices = 4

# 建立假預測分數
# 每一行代表一個樣本，每一列代表一個選項的分數
dummy_predictions = np.random.rand(num_examples, num_choices)
print(dummy_predictions)

# 建立假標籤（正確答案）
# 每個元素是正確選項的索引（0, 1, 2 或 3）
dummy_labels = np.random.randint(0, num_choices, num_examples)
print(dummy_labels)
# 組合成預期的輸入格式
dummy_pred = (dummy_predictions, dummy_labels)

# 測試函數
result = compute_metric(dummy_pred)
print(result)

[[0.74552246 0.05501115 0.13920033 0.53322105]
 [0.73853369 0.73952506 0.93722425 0.32076591]
 [0.72793836 0.61533807 0.97880667 0.71576474]
 [0.06094998 0.49978098 0.18511978 0.56097825]
 [0.59720131 0.86107554 0.84847221 0.28036707]]
[3 3 1 1 1]
{'accuracy': 0.2}


## 配置訓練參數

In [None]:
args = TrainingArguments(
    output_dir="./muliple_choice",
    per_device_train_batch_size=16,#因為是3維,16*4=64,實際會是64
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_steps=50,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
    report_to='none'
)

## 建立訓練器

In [126]:
trainer = Trainer(
    model = model,
    args = args,
    train_dataset=tokenized_c3['train'],
    eval_dataset=tokenized_c3['validation'],
    compute_metrics=compute_metric
)

## 訓練模型


In [133]:
trainer.train()

: 

In [None]:
## 模型預測
from typing import Any
import torch

class MultipleChoicePipeline:
    def __init__(self, model, tokenizer) -> None:
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device
    
    def preprocess(self, context, question, choices):
        cs, qcs = [], []
        for choice in choices:
            cs.append(context)
            qcs.append(question + " " + choice)
        return tokenizer(cs, qcs, truncation="only_first", max_length=256, return_tensors="pt")

    def predict(self, inputs):
        inputs = {k: v.unsqueeze(0).to(self.device) for k, v in inputs.items()}
        return self.model(**inputs).logits

    def postprocess(self, logits, choices):
        predition = torch.argmax(logits, dim=-1).cpu().item()
        return choices[predition]

    def __call__(self, context, question, choices) -> Any:
        inputs = self.preprocess(context,question,choices)
        logits = self.predict(inputs)
        result = self.postprocess(logits, choices)
        return result

In [130]:
pipe = MultipleChoicePipeline(model, tokenizer)


In [132]:
pipe("國堂在台北上班","國堂在哪裏上班?",['台北','台中'])

AttributeError: 'Tensor' object has no attribute 'cup'