In [1]:
import evaluate
from datasets import load_dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForMultipleChoice
import numpy as np

# Fetch the multiple-choice dataset by its Hub identifier
datasets = load_dataset("roberthsu2003/for_Multiple_Choice")

# Fetch the tokenizer that matches the Chinese BERT checkpoint loaded below
tokenizer = AutoTokenizer.from_pretrained(
    "google-bert/bert-base-chinese",
)

# Process function
def process_function(examples, num_choices=4, max_length=256, pad_choice="不知道"):
    """Tokenize a batch of multiple-choice examples.

    Each example is expanded into ``num_choices`` (context, question + choice)
    pairs. Examples with fewer choices are padded with ``pad_choice`` so that
    every example contributes exactly ``num_choices`` rows; the flat tokenizer
    output is then regrouped into one list of ``num_choices`` sequences per
    example.

    Args:
        examples: Batch mapping with the columns "context", "question",
            "choices" (a list of strings per example) and "answer".
        num_choices: Number of candidate answers per example after padding.
        max_length: Length every tokenized pair is padded/truncated to.
        pad_choice: Filler text used for the missing choices.

    Returns:
        A dict mapping every tokenizer output key to a list with one entry
        per example (each entry holding ``num_choices`` sequences), plus a
        "labels" list with the index of the correct choice per example.

    Raises:
        ValueError: If ``num_choices`` is smaller than 1, if an example has
            more than ``num_choices`` choices, or if its answer is not one of
            its choices.
    """
    if num_choices < 1:
        raise ValueError("num_choices must be at least 1")

    contexts = []
    question_choices = []
    labels = []
    batch = zip(
        examples["context"],
        examples["question"],
        examples["choices"],
        examples["answer"],
    )
    for ctx, question, choices, answer in batch:
        # More choices than the group size would shift every following row
        # into the wrong example when regrouping below, so fail loudly.
        if len(choices) > num_choices:
            raise ValueError(
                f"example has {len(choices)} choices but at most "
                f"{num_choices} are supported"
            )
        try:
            label = choices.index(answer)
        except ValueError:
            raise ValueError(
                f"answer {answer!r} is not one of the choices {choices!r}"
            ) from None
        labels.append(label)

        # The filler goes after the real choices, so label indices are
        # unaffected by the padding.
        padded = list(choices) + [pad_choice] * (num_choices - len(choices))
        for choice in padded:
            contexts.append(ctx)
            question_choices.append(question + " " + choice)

    # "only_first" truncates the context (the first sequence) and leaves the
    # question + choice text intact.
    tokenizer_example = tokenizer(
        contexts,
        question_choices,
        truncation="only_first",
        max_length=max_length,
        padding="max_length",
    )
    # Regroup the flat rows into chunks of num_choices, one chunk per example.
    tokenized_example = {
        key: [
            rows[start:start + num_choices]
            for start in range(0, len(rows), num_choices)
        ]
        for key, rows in tokenizer_example.items()
    }
    tokenized_example["labels"] = labels
    return tokenized_example

# Run the preprocessing in batches over the loaded dataset and drop the
# original columns so only the tokenizer outputs and labels remain
tokenized_c3 = datasets.map(
    process_function,
    batched=True,
    remove_columns=datasets["train"].column_names,
)

# Instantiate the pretrained checkpoint with a multiple-choice head
model = AutoModelForMultipleChoice.from_pretrained(
    "google-bert/bert-base-chinese",
)

# Metric object used by compute_metric during evaluation
accuracy = evaluate.load("accuracy")

def compute_metric(pred):
    """Score a (predictions, labels) pair with the module-level accuracy metric.

    The predicted class for each example is the argmax over the last axis of
    the predictions; the result of ``accuracy.compute`` is returned unchanged.
    """
    scores, gold = pred
    chosen = np.argmax(scores, axis=-1)
    return accuracy.compute(references=gold, predictions=chosen)

# Training arguments
args = TrainingArguments(
    output_dir="./multiple_choice",
    # Each example is expanded into 4 (context, question + choice) sequences,
    # so a batch of 16 examples means 16 * 4 = 64 sequences per step.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    logging_steps=50,
    # Evaluate and checkpoint on the same schedule so the best checkpoint
    # can be restored at the end of training.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Without an explicit metric the best checkpoint is chosen by eval loss;
    # select it by the accuracy reported through compute_metric instead.
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # NOTE(review): fp16 mixed precision presumably needs a CUDA-capable
    # device; set to False when running on CPU.
    fp16=True,
    report_to='none',
)

# Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_c3['train'],
    eval_dataset=tokenized_c3['validation'],
    compute_metrics=compute_metric,
)

# Train the model
#trainer.train()


Map:   0%|          | 0/5856 [00:00<?, ? examples/s]

Map:   0%|          | 0/1825 [00:00<?, ? examples/s]

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at google-bert/bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
