# Multiple Choice

In [1]:
!export HF_ENDPOINT=https://hf-mirror.com && ../hfd.sh clue/clue --dataset --include c3 --local-dir ./

[1;33mFetching repo metadata...[0m
[1;33mGenerating file list...[0m
[1;33mStarting download with aria2c to ./...
[0m
Download Results:
gid   |stat|avg speed  |path/URI
2ec64d|[1;32mOK[0m  |       0B/s|c3/test-00000-of-00001.parquet
a2a20d|[1;32mOK[0m  |       0B/s|c3/validation-00000-of-00001.parquet
d147eb|[1;32mOK[0m  |       0B/s|c3/train-00000-of-00001.parquet

Status Legend:
(OK):download completed.
[0;32mDownload completed successfully. Repo directory: /root/project/ZY/pytorch-transformer/transformers/MC
[0m

## import packages

In [2]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForMultipleChoice,
    TrainingArguments,
    Trainer,
    DefaultDataCollator,
)
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "2"
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

  from .autonotebook import tqdm as notebook_tqdm


## Load dataset

In [3]:
datasets = load_dataset("./c3")

In [4]:
datasets["train"][0]

{'id': 0,
 'context': ['男：你今天晚上有时间吗?我们一起去看电影吧?', '女：你喜欢恐怖片和爱情片，但是我喜欢喜剧片，科幻片一般。所以……'],
 'question': '女的最喜欢哪种电影?',
 'choice': ['恐怖片', '爱情片', '喜剧片', '科幻片'],
 'answer': '喜剧片'}

In [5]:
datasets.pop("test")

Dataset({
    features: ['id', 'context', 'question', 'choice', 'answer'],
    num_rows: 1625
})

In [6]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 11869
    })
    validation: Dataset({
        features: ['id', 'context', 'question', 'choice', 'answer'],
        num_rows: 3816
    })
})

## Preprocess dataset

In [7]:
tokenizer = AutoTokenizer.from_pretrained("../chinese-macbert-base")

we need to convert the data into a specific format.
like this:

|[CLS]|Context|[SEP]|Question|Choice A|[SEP]|

|[CLS]|Context|[SEP]|Question|Choice B|[SEP]|

|[CLS]|Context|[SEP]|Question|Choice C|[SEP]|

|[CLS]|Context|[SEP]|Question|Choice D|[SEP]|

In [8]:
def process(examples):
    context = []
    question_choices = []
    labels = [] # to store the index of the correct answer
    for idx in range(len(examples["context"])):
        ctx = "\n".join(examples["context"][idx])
        question = examples["question"][idx]
        choices = examples["choice"][idx]
        for choice in choices:
            context.append(ctx)
            question_choices.append(f"{question} {choice}")
        # some data may have no 4 choices
        if len(choices) < 4:
            for _ in range(4 - len(choices)):
                context.append(ctx)
                question_choices.append(f"{question} 无")
        labels.append(choices.index(examples["answer"][idx]))
    tokenized_examples = tokenizer(
        context,
        question_choices,
        truncation="only_first", # truncate the context only
        max_length=256,
        padding="max_length",
    ) # Now the input_ids is 4000 * 256
    tokenized_examples = {
        k: [v[i : i + 4]
        for i in range(0, len(v), 4)]
        for k, v in tokenized_examples.items()  # group every 4 choices together
    } # Now the input_ids is 1000 * 4 * 256
    tokenized_examples["labels"] = labels
    return tokenized_examples

In [9]:
tokenized_datasets = datasets.map(process, batched=True)

Map: 100%|██████████| 11869/11869 [00:14<00:00, 791.68 examples/s]


## Evaluate Function

In [10]:
!git clone https://github.com/huggingface/evaluate.git

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


fatal: destination path 'evaluate' already exists and is not an empty directory.


In [11]:
import numpy as np
import evaluate

accuracy = evaluate.load(
    "./evaluate/metrics/accuracy/accuracy.py"
)

def compute_metrics(pred):
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

## Train Model

In [12]:
model = AutoModelForMultipleChoice.from_pretrained("../chinese-macbert-base")

args = TrainingArguments(
    output_dir="./models_mc",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50,
    load_best_model_at_end=True,
    fp16=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)

Some weights of BertForMultipleChoice were not initialized from the model checkpoint at ../chinese-macbert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,1.0066,0.956741,0.594602
2,0.71,0.996975,0.626834
3,0.3268,1.34709,0.634958


TrainOutput(global_step=2226, training_loss=0.7106486932715101, metrics={'train_runtime': 516.7814, 'train_samples_per_second': 68.901, 'train_steps_per_second': 4.307, 'total_flos': 1.873702246273229e+16, 'train_loss': 0.7106486932715101, 'epoch': 3.0})

## Pipeline for Predition

In [14]:
import torch

class MultiChoicePipeline:
    def __init__(self, model, tokenizer):
        self.model = model
        self.tokenizer = tokenizer
        self.device = model.device
    
    def __call__(self, context, question, choices):
        inputs = self.preprocess(context, question, choices)
        logits = self.predict(inputs)
        result = self.postprocess(logits, choices)
        return result

    def preprocess(self, context, question, choices):
        ctx, qcs = [], []
        for choice in choices:
            ctx.append(context)
            qcs.append(f"{question} {choice}")
        
        inputs = self.tokenizer(
            ctx,
            qcs,
            truncation="only_first",
            max_length=256,
            return_tensors="pt",
        )
    
        return inputs
    
    def predict(self, inputs):
        inputs = {k: v.unsqueeze(0).to(self.device) for k, v in inputs.items()}
        return self.model(**inputs).logits

    def postprocess(self, logits, choices):
        prediction = torch.argmax(logits, dim=1).item()
        return choices[prediction]

In [15]:
pipe = MultiChoicePipeline(model, tokenizer)

In [16]:
pipe(
    "小明在北京上班",
    "小明在哪里上班？",
    ["北京", "上海", "河北", "海南", "河北", "海南"],
)

'北京'