数据预处理

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from datasets import Dataset

# 加载数据
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# 提取文本和标签
train_df = train_df[['text', 'target']]
test_df = test_df[['text']]

# 创建训练和验证集
train_data, val_data = train_test_split(train_df, test_size=0.2, random_state=42)

# 创建 Hugging Face Dataset 对象
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_df)

# 初始化 BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# 定义数据预处理函数
def preprocess_function(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length')

# 对数据进行预处理
train_dataset = train_dataset.map(preprocess_function, batched=True)
val_dataset = val_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# 将标签转换为数字
train_dataset = train_dataset.rename_column('target', 'labels')
val_dataset = val_dataset.rename_column('target', 'labels')


Map:   0%|          | 0/6090 [00:00<?, ? examples/s]

Map:   0%|          | 0/1523 [00:00<?, ? examples/s]

Map:   0%|          | 0/3263 [00:00<?, ? examples/s]

模型定义

In [2]:
from transformers import BertForSequenceClassification, TrainingArguments, Trainer

# 定义模型
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# 定义训练参数
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# 定义 Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


训练与评估

In [3]:
# 训练模型
trainer.train()

# 验证模型
eval_results = trainer.evaluate()
print(eval_results)


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss
1,0.4646,0.38339
2,0.3393,0.56257
3,0.2512,0.627109


{'eval_loss': 0.6271092295646667, 'eval_runtime': 23.4998, 'eval_samples_per_second': 64.809, 'eval_steps_per_second': 8.128, 'epoch': 3.0}


预测与提交

In [6]:
# 对测试集进行预测
predictions = trainer.predict(test_dataset)
pred_labels = predictions.predictions.argmax(axis=-1)

# 创建提交文件
submission = pd.DataFrame({'target': pred_labels})
submission.to_csv('submission.csv', index=False)


In [5]:
print(test_df.columns)


Index(['text'], dtype='object')
