In [None]:
#Environment Variables
import os
import pandas as pd
from datasets import load_dataset, load_metric
import pdb
import numpy as np
import codecs


# Placeholder value only; nothing in this cell exports it to the environment.
WANDB_API_KEY = '\n'

# Process-wide switches, set before any training library reads them:
# WANDB_MODE is set to 'offline' and TOKENIZERS_PARALLELISM to 'false'.
os.environ.update({
    'WANDB_MODE': 'offline',
    "TOKENIZERS_PARALLELISM": "false",
})

# Pretrained checkpoint identifier and the per-device batch size reused by later cells.
model_checkpoint = "nghuyong/ernie-1.0"
batch_size = 16

In [None]:
# Peek at the first rows of the training CSV to sanity-check the columns.
train_df = pd.read_csv('../input/proriddles/train_new_df.csv')
print(train_df.head(4))

# Load the local dataset: pointing load_dataset at the local CSV paths is enough.
data_files = {
    'train': '../input/proriddles/train_new_df.csv',
    'val': '../input/proriddles/val_new_df.csv',
    'test': '../input/proriddles/test_new_df.csv',
}
datasets = load_dataset('csv', data_files=data_files)
print(datasets)

**数据预处理**

In [None]:
from transformers import AutoTokenizer
    
# Load the tokenizer that belongs to the checkpoint named in `model_checkpoint`.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)  # the `use_fast=True` argument is left commented out

In [None]:
# Column names holding the candidate answers; their count defines the number of choices.
choice_names = ["choice0", "choice1", "choice2", "choice3", "choice4"]


def preprocess_function(examples):
    """Tokenize a batch of examples into one (context, candidate) pair per choice.

    Args:
        examples: Batch in column form (mapping of column name -> list of values).
            Reads the "sent1" column (context), the "sent2" column (question
            header) and every column listed in ``choice_names``.

    Returns:
        dict mapping each key produced by the tokenizer to a list with one entry
        per example; each entry is a list of ``len(choice_names)`` encodings,
        ordered like ``choice_names``.

    Raises:
        TypeError: if a "sent1" value is not a string (for example a missing
            CSV cell). This replaces the former ``pdb.set_trace()`` hook, which
            blocks forever when the function runs non-interactively.
    """
    num_choices = len(choice_names)
    contexts = examples["sent1"]

    # Fail fast with a message that names the offending row. The candidate
    # strings below are built with an f-string and are therefore always `str`,
    # so only the contexts need checking.
    for row, context in enumerate(contexts):
        if not isinstance(context, str):
            raise TypeError(
                f"examples['sent1'][{row}] must be a str, got {type(context).__name__}: {context!r}"
            )

    # Build both sides already flattened (example-major, choice-minor) instead
    # of nesting and then flattening with the quadratic `sum(lists, [])`.
    first_sentences = [context for context in contexts for _ in range(num_choices)]
    # NOTE(review): a missing choice value is rendered as text by the f-string
    # (e.g. "None") rather than rejected — confirm the CSVs have no empty choices.
    second_sentences = [
        f"{header} {examples[choice][i]}"
        for i, header in enumerate(examples["sent2"])
        for choice in choice_names
    ]

    # Tokenize
    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Un-flatten: regroup every run of `num_choices` encodings under its example.
    return {
        k: [v[i:i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

In [None]:
# Smoke-test the preprocessing on five training rows (indices 120-124).
examples = datasets["train"][120:125]
features = preprocess_function(examples)
encoded_ids = features["input_ids"]
# Prints: rows in the slice, choices in the first row, token count of each of those choices.
print(len(encoded_ids), len(encoded_ids[0]), [len(ids) for ids in encoded_ids[0]])

# Decode the five candidates of one row so the (context, candidate) pairs can be checked by eye.
idx = 1
[tokenizer.decode(encoded_ids[idx][choice]) for choice in range(5)]

In [None]:
# Tokenize every split in batches, then drop the raw text columns that the
# preprocessing has already consumed (plus the unused 'riddle' column).
encoded_datasets = (
    datasets
    .map(preprocess_function, batched=True)
    .remove_columns([
        'riddle',
        'sent1',
        'sent2',
        'choice0',
        'choice1',
        'choice2',
        'choice3',
        'choice4',
    ])
)

In [None]:
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

# Instantiate a multiple-choice model from the same checkpoint the tokenizer was loaded from.
model = AutoModelForMultipleChoice.from_pretrained(model_checkpoint)

In [None]:
# Training configuration, grouped by concern.
# NOTE(review): `metric_for_best_model` is not set, so per the TrainingArguments
# documentation the checkpoint restored by `load_best_model_at_end` is picked by
# evaluation loss, not by the accuracy reported from compute_metrics — confirm
# that this is intended.
args = TrainingArguments(
    output_dir='./results',

    # Optimisation
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_steps=1000,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,

    # Logging
    logging_strategy='steps',
    logging_steps=250,

    # Evaluation and checkpointing, both once per epoch
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,
    load_best_model_at_end=True,
)

In [None]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union
import torch

@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that dynamically pads multiple-choice inputs.

    Each incoming feature holds, per tokenizer key, one sequence per choice.
    The collator flattens all choices of all features, pads them together with
    ``tokenizer.pad`` and reshapes every tensor to
    ``(batch_size, num_choices, -1)``.

    The input feature dicts are NOT modified: labels are read, not popped, so
    the same features can be collated more than once. When the features carry
    neither a "label" nor a "labels" entry, the returned batch simply has no
    "labels" key.
    """

    # Tokenizer whose `pad` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # Forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    # Forwarded unchanged to `tokenizer.pad`.
    max_length: Optional[int] = None
    # Forwarded unchanged to `tokenizer.pad`.
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """Collate a list of per-example feature dicts into one padded batch.

        Args:
            features: list of dicts; every non-label value is indexable by
                choice position, and "input_ids" determines the number of choices.

        Returns:
            dict of tensors shaped (batch_size, num_choices, -1), plus an int64
            "labels" tensor when the features contain labels.
        """
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read labels without popping so the caller's dicts stay intact.
        has_labels = label_name in features[0]
        labels = [feature[label_name] for feature in features] if has_labels else None

        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One dict per (example, choice), example-major, with the label left out.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        # Add back labels
        if labels is not None:
            batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

In [None]:
def compute_metrics(eval_predictions):
    """Compute accuracy from an unpackable pair (predictions, label_ids).

    The predicted class of each row is the argmax of `predictions` along
    axis 1; accuracy is the mean of the element-wise matches with `label_ids`.

    Returns:
        dict with a single "accuracy" entry holding a Python float.
    """
    scores, gold = eval_predictions
    hits = np.argmax(scores, axis=1) == gold
    return {"accuracy": hits.astype(np.float32).mean().item()}

In [None]:
# Wire the model, training configuration, train/validation splits, the
# multiple-choice collator and the accuracy metric into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=encoded_datasets["train"],
    eval_dataset=encoded_datasets["val"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the training loop configured in `args` on the train split.
trainer.train()

In [None]:
# Persist the trained model; no path is passed here.
# NOTE(review): presumably this writes to `args.output_dir` ('./results') — confirm.
trainer.save_model()


In [None]:
# Predict on the validation split and dump one predicted choice index per line,
# followed by the metrics dict, to ./results/valpredict.txt.
predictions, label_ids, metrics = trainer.predict(test_dataset=encoded_datasets['val'])

# Make sure the target directory exists even if no checkpoint was written there.
os.makedirs("./results", exist_ok=True)
# `with` guarantees the file is closed even if a write fails part-way.
with codecs.open("./results/valpredict.txt", 'w', encoding='utf-8') as writer:
    # argmax over the last axis gives the predicted choice for every row at once.
    for predict_id in np.argmax(predictions, axis=-1):
        writer.write(str(predict_id) + '\n')
    writer.write(str(metrics))
print(metrics)

In [None]:
# Predict on the test split and dump one predicted choice index per line,
# followed by the metrics dict, to ./results/testpredict.txt.
# `predictions` is reassigned here, so later cells see the test-split scores.
predictions, label_ids, metrics = trainer.predict(test_dataset=encoded_datasets['test'])

# Make sure the target directory exists even if no checkpoint was written there.
os.makedirs("./results", exist_ok=True)
# `with` guarantees the file is closed even if a write fails part-way.
with codecs.open("./results/testpredict.txt", 'w', encoding='utf-8') as writer:
    # argmax over the last axis gives the predicted choice for every row at once.
    for predict_id in np.argmax(predictions, axis=-1):
        writer.write(str(predict_id) + '\n')
    writer.write(str(metrics))
print(metrics)

In [None]:
def normalize(v):
    """Scale every entry along the first axis of `v` to unit Euclidean norm.

    Entries whose norm is zero are returned unchanged. The input is never
    modified: a new array is returned. Integer and boolean inputs are promoted
    to float so the division is not truncated; floating inputs keep their dtype.

    Args:
        v: array-like; each `v[i]` is normalized independently.

    Returns:
        numpy.ndarray with the same shape as `v`.
    """
    arr = np.asarray(v)
    # Work on a floating copy so neither the caller's array nor an integer
    # dtype gets in the way of the division.
    out = arr.copy() if np.issubdtype(arr.dtype, np.inexact) else arr.astype(float)

    # Norm of each v[i], taken over every axis except the first.
    reduce_axes = tuple(range(1, out.ndim))
    norms = np.sqrt(np.sum(np.abs(out) ** 2, axis=reduce_axes, keepdims=True))

    # Divide zero-norm entries by 1 so they pass through untouched.
    safe_norms = np.where(norms == 0, 1, norms)
    return out / safe_norms

# Normalize the most recent `predictions`, persist them, then read the file
# back to confirm the round trip.
normalized_predictions = normalize(predictions)
np.save('predictions.npy', normalized_predictions)
new = np.load('predictions.npy')
new

In [None]:
# Show the shape of the array reloaded from predictions.npy.
print(new.shape)