In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
train_df=pd.read_csv('data/train.csv')
test_df=pd.read_csv('data/test.csv')
print(train_df.head())
train_df['text']=train_df['text'].str.lower()
train_df['text']=train_df['text'].str.replace(r'[^\w\s]','')
train_texts,val_texts,train_labels,val_labels=train_test_split(train_df['text'],train_df['target'],test_size=0.2,random_state=42)

   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [10]:
from transformers import RobertaTokenizer

# 加载 RoBERTa tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# 对文本进行 tokenization 和 padding
def tokenize_function(texts):
    return tokenizer(texts.tolist(), padding=True, truncation=True, max_length=128)

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# 转换为 torch tensors
import torch
class DisasterTweetsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# 创建训练和验证数据集
train_dataset = DisasterTweetsDataset(train_encodings, train_labels)
val_dataset = DisasterTweetsDataset(val_encodings, val_labels)


In [3]:

from transformers import RobertaForSequenceClassification, RobertaTokenizer

# 使用 RoBERTa 替代 BERT
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)



Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import f1_score
import numpy as np
training_args = TrainingArguments(
    load_best_model_at_end=True,
    learning_rate=3e-5,
    lr_scheduler_type='cosine',
    output_dir='./result',
    num_train_epochs=5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    eval_strategy='epoch',
    save_strategy='epoch',
)
def compute_metrics(p):
    predictions, labels = p
    preds = np.argmax(predictions, axis=1)  # 获取预测标签
    f1 = f1_score(labels, preds)
    return {"f1": f1}
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

print(training_args.device)  # 输出: cuda 代表使用的是 GPU

cuda:0


In [5]:
trainer.train()
# 输出评估结果



  attn_output = torch.nn.functional.scaled_dot_product_attention(


Epoch,Training Loss,Validation Loss,F1
1,No log,0.392322,0.8
2,No log,0.438893,0.79056
3,0.456700,0.449904,0.783784
4,0.456700,0.501359,0.800597
5,0.456700,0.617964,0.790142


TrainOutput(global_step=955, training_loss=0.3571565238593136, metrics={'train_runtime': 309.3547, 'train_samples_per_second': 98.431, 'train_steps_per_second': 3.087, 'total_flos': 1330072634835000.0, 'train_loss': 0.3571565238593136, 'epoch': 5.0})

In [6]:
# 在验证集上评估模型
eval_results = trainer.evaluate()
print(f"Evaluation Results: {eval_results}")


Evaluation Results: {'eval_loss': 0.39232173562049866, 'eval_f1': 0.8, 'eval_runtime': 4.14, 'eval_samples_per_second': 367.872, 'eval_steps_per_second': 46.135, 'epoch': 5.0}


In [11]:
# 对测试集进行预测
test_encodings = tokenizer(list(test_df['text']), truncation=True, padding=True, max_length=128)
test_dataset = DisasterTweetsDataset(test_encodings, [0] * len(test_df))  # 这里标签设为0，实际会被预测

# 使用模型生成预测
predictions = trainer.predict(test_dataset)

# 获取预测结果
predicted_labels = predictions.predictions.argmax(axis=-1)

# 创建提交文件
submission = pd.DataFrame({'id': test_df['id'], 'target': predicted_labels})
submission.to_csv('submission.csv', index=False)
