In [None]:
!pip install evaluate datasets transformers

## 載入相關套件

In [None]:
import evaluate
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForMultipleChoice


## 載入數據集

In [None]:
# Load the multiple-choice dataset by its repository id; later cells index the
# result by split name ('train', 'validation').
datasets = load_dataset(
    path='roberthsu2003/for_Multiple_Choice',
)

## 數據集預處理

In [None]:
# Tokenizer for the same pretrained Chinese BERT checkpoint that the model cell loads.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path='google-bert/bert-base-chinese',
)
def process_function(examples, num_choices=4, max_length=256):
    """Tokenize a batch of multiple-choice examples.

    Every example is expanded into ``num_choices`` (context, question + choice)
    text pairs, which are tokenized together and then regrouped so that each
    returned feature is nested as ``[example][choice][token]``.

    Args:
        examples: Batch mapping with the columns ``context``, ``question``,
            ``choices`` (a list of strings per example) and ``answer`` (the
            text of the correct choice).
        num_choices: Number of options every example is padded to.
        max_length: Length every tokenized pair is padded / truncated to.

    Returns:
        dict: The tokenizer's features regrouped per example, plus ``labels``
        holding the index of the correct choice within ``choices``.

    Raises:
        ValueError: If an example has more than ``num_choices`` choices (the
            fixed-size regrouping would otherwise silently misalign every
            following example), or if its answer is not among its choices.
    """
    contexts = []
    question_choice = []
    labels = []
    # Read each column once instead of re-indexing the batch for every row.
    rows = zip(examples["context"], examples["question"], examples["choices"], examples["answer"])
    for ctx, question, choices, answer in rows:
        if len(choices) > num_choices:
            raise ValueError(
                f"an example has {len(choices)} choices but at most {num_choices} are supported"
            )
        # Real choices come first so the label index stays valid; the filler
        # option pads short examples up to num_choices.
        options = list(choices) + ["不知道"] * (num_choices - len(choices))
        for option in options:
            contexts.append(ctx)
            question_choice.append(question + " " + option)
        labels.append(choices.index(answer))
    # Only the context (first sequence) is truncated, so the question and
    # choice text is always kept intact.
    tokenized_pairs = tokenizer(
        contexts,
        question_choice,
        truncation="only_first",
        max_length=max_length,
        padding="max_length",
    )
    # Regroup the flat list of pairs into chunks of num_choices per example.
    tokenized_example = {
        name: [values[i:i + num_choices] for i in range(0, len(values), num_choices)]
        for name, values in tokenized_pairs.items()
    }
    tokenized_example["labels"] = labels
    return tokenized_example

In [None]:
# Run process_function over the dataset in batches; the original columns are
# removed so only the features returned by process_function remain.
tokenized_c3 = datasets.map(
    process_function,
    remove_columns=datasets['train'].column_names,
    batched=True,
)

## 建立模型

In [None]:
# Load the pretrained Chinese BERT checkpoint through the multiple-choice model class.
model = AutoModelForMultipleChoice.from_pretrained(
    pretrained_model_name_or_path='google-bert/bert-base-chinese',
)

## 建立評估函數

In [None]:
import numpy as np
# Metric object whose .compute() method is called by compute_metric.
accuracy = evaluate.load(path="accuracy")

def compute_metric(pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        pred: A two-item ``(scores, labels)`` pair; presumably ``scores``
            holds one value per answer choice along its last axis.

    Returns:
        Whatever ``accuracy.compute`` returns for the predicted indices
        compared against ``labels``.
    """
    choice_scores, gold = pred
    # Index of the highest score along the last axis is the predicted choice.
    predicted = np.argmax(choice_scores, axis=-1)
    return accuracy.compute(references=gold, predictions=predicted)

## 配置訓練參數

In [None]:
args = TrainingArguments(
    output_dir="./for_multiple_choice",
    # Inputs are 3-D (example, choice, token): each example carries 4 sequences,
    # so a batch of 16 examples feeds 16 * 4 = 64 sequences to the encoder.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_steps=50,
    # Evaluate and checkpoint once per epoch; the two strategies must match for
    # load_best_model_at_end to work.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # fp16 mixed precision needs a CUDA device; fall back to full precision
    # elsewhere instead of failing when the arguments are built.
    fp16=torch.cuda.is_available(),
    report_to='none'
)

## 建立訓練器

In [None]:
# Bundle the model, training configuration, tokenized splits and metric
# callback into a single Trainer.
trainer = Trainer(
    args=args,
    model=model,
    compute_metrics=compute_metric,
    eval_dataset=tokenized_c3['validation'],
    train_dataset=tokenized_c3['train'],
)

## 模型訓練

In [None]:
# Run fine-tuning with the Trainer configured above; as the last expression of
# the cell, the value returned by train() is shown as the cell output.
trainer.train()

## 模型預測

In [None]:
from typing import Any
import torch

class MultipleChoicePipeline:
    """Minimal inference helper that picks one answer out of ``choices``.

    The pipeline pairs the context with every "question + choice" string,
    scores all pairs in a single forward pass and returns the choice with the
    highest logit.
    """

    def __init__(self, model, tokenizer) -> None:
        """Store the model and tokenizer and remember the model's device."""
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device

    def preprocess(self, context, question, choices):
        """Tokenize one (context, question + choice) pair per choice.

        Raises:
            ValueError: If ``choices`` is empty.
        """
        if len(choices) == 0:
            raise ValueError("choices must contain at least one option")
        cs = [context] * len(choices)
        qcs = [question + " " + choice for choice in choices]
        # Use the tokenizer handed to the pipeline (not a notebook global), and
        # pad to the longest pair so choices of different lengths can be
        # stacked into a single tensor.
        return self.tokenizer(
            cs,
            qcs,
            truncation="only_first",
            max_length=256,
            padding=True,
            return_tensors="pt",
        )

    def predict(self, inputs):
        """Run the model on the tokenized choices and return its logits."""
        # Add a leading batch axis of size 1 in front of the choice axis.
        inputs = {k: v.unsqueeze(0).to(self.device) for k, v in inputs.items()}
        # Inference only: force eval mode (no dropout) and skip gradient tracking.
        self.model.eval()
        with torch.no_grad():
            return self.model(**inputs).logits

    def postprocess(self, logits, choices):
        """Map the highest-scoring logit back to the corresponding choice."""
        prediction = torch.argmax(logits, dim=-1).cpu().item()
        return choices[prediction]

    def __call__(self, context, question, choices) -> Any:
        """Return the element of ``choices`` the model scores highest."""
        inputs = self.preprocess(context, question, choices)
        logits = self.predict(inputs)
        return self.postprocess(logits, choices)

In [None]:
# Build the inference helper from the notebook's model and tokenizer.
pipe = MultipleChoicePipeline(model=model, tokenizer=tokenizer)

In [None]:
# Example with two options; the selected choice is shown as the cell output.
pipe(
    context="國堂在台北上班",
    question="國堂在哪裏上班?",
    choices=['台北','台中'],
)

In [None]:
# Example with six options, i.e. a different number of choices than the four
# used during preprocessing for training.
pipe(
    context="國堂在台北上班",
    question="國堂在哪裏上班?",
    choices=['台北','台中','高雄','台南','基隆','宜蘭'],
)

## 上傳模型

In [None]:
# Authenticate with the Hugging Face Hub before the upload cell runs.
# login() is called without arguments, so no access token is written into the notebook.
from huggingface_hub import login
login()

In [None]:
# Hub repository that receives the tokenizer files.
model_name = "roberthsu2003/for_multiple_choice"
# The first positional parameter of Trainer.push_to_hub is the commit message,
# not a user or repository name, so pass the message explicitly by keyword.
# The target repository is derived from the training arguments
# (hub_model_id, or the output_dir name when it is unset).
trainer.push_to_hub(commit_message="End of training")
# The Trainer was created without a tokenizer, so upload the tokenizer separately.
tokenizer.push_to_hub(model_name)