先导入数据集

In [None]:
from datasets import load_dataset

# Kaggle input CSVs, one per dataset split.
trainpath = "/kaggle/input/littlesoldierriddle/train_judge_4.csv"
validpath = "/kaggle/input/littlesoldierriddle/valid_judge.csv"
testpath = "/kaggle/input/littlesoldierriddle/test_judge.csv"
data_files = {"train": trainpath, "valid": validpath, "test": testpath}

# The CSVs are read with an explicit GB18030 encoding instead of the loader's
# default (presumably because the files are not UTF-8 -- confirm).
riddle_dataset = load_dataset("csv", data_files=data_files, encoding="gb18030")

# Train on a reproducibly shuffled subset of at most 32000 rows. The size is
# capped at the split length so that a smaller train file does not make
# `select` fail with an out-of-range index.
train_subset_size = min(32000, len(riddle_dataset["train"]))
riddle_shuffle = (
    riddle_dataset["train"].shuffle(seed=42).select(range(train_subset_size))
)

# Quick sanity checks: subset summary, one validation row, column schema.
print(riddle_shuffle)
print(riddle_dataset["valid"][0])
print(riddle_shuffle.features)

接下来我们预处理一下数据集

In [None]:
from transformers import AutoTokenizer

# Pretrained checkpoint identifier; reused below when the classification model
# is loaded so tokenizer and model come from the same checkpoint.
checkpoint = "nghuyong/ernie-1.0"
# Tokenizer matching the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """Tokenize the "riddle" / "choice" columns of ``example`` as a text pair.

    Written to be passed to ``Dataset.map``; returns whatever the module-level
    ``tokenizer`` produces for the pair, truncated to at most 64 tokens.
    """
    riddle_text = example["riddle"]
    choice_text = example["choice"]
    encoded = tokenizer(riddle_text, choice_text, truncation=True, max_length=64)
    return encoded

# Tokenize every split in batched mode. The original columns are kept next to
# the tokenizer outputs. Key order (valid, test, train) is preserved so the
# printed summary is unchanged; the train split is the shuffled subset, not
# the full training set.
tokenized_datasets = {
    "valid": riddle_dataset["valid"].map(tokenize_function, batched=True),
    "test": riddle_dataset["test"].map(tokenize_function, batched=True),
    "train": riddle_shuffle.map(tokenize_function, batched=True),
}

print(tokenized_datasets)


In [None]:
from datasets import load_metric
import numpy as np

# Lazily created GLUE/MRPC metric object, shared by every evaluation pass.
_GLUE_MRPC_METRIC = None


def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE/MRPC metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair; the predicted class for each
            row is the argmax of ``logits`` over the last axis.

    Returns:
        The result of the metric's ``compute`` for those predictions against
        ``labels``.
    """
    global _GLUE_MRPC_METRIC
    # Load the metric on first use only; previously it was re-loaded on every
    # evaluation call (once per epoch with the training setup in this file).
    if _GLUE_MRPC_METRIC is None:
        _GLUE_MRPC_METRIC = load_metric("glue", "mrpc")

    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return _GLUE_MRPC_METRIC.compute(predictions=predictions, references=labels)


In [None]:

from transformers import TrainingArguments
from transformers import Trainer
from transformers import AutoModelForSequenceClassification, DataCollatorWithPadding

# Binary sequence classifier initialised from the same checkpoint as the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Pads each batch when it is collated, using the tokenizer's padding settings.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Evaluation and checkpoint saving both run once per epoch, and
# load_best_model_at_end is enabled.
# NOTE(review): eval_steps presumably has no effect while evaluation_strategy
# is "epoch" -- confirm against the TrainingArguments documentation.
training_args = TrainingArguments(
    output_dir="results",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    eval_steps=500,
    num_train_epochs=10,
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()


In [None]:
import numpy as np

# Run the trainer on the tokenized test split.
predictions = trainer.predict(tokenized_datasets["test"])

# Shape checks on the raw model outputs and the label array.
# NOTE(review): assumes the test split has labels, otherwise label_ids would
# have no .shape -- confirm.
print(predictions.predictions.shape, predictions.label_ids.shape)
print(predictions.predictions[:1].shape, predictions.label_ids.shape)

# Predicted class index per row (argmax over the last axis).
preds = predictions.predictions.argmax(axis=-1)

# The file stores the raw model outputs, not the argmax class ids in `preds`.
np.save("preds.npy", predictions.predictions)
print("successfully saved")